Les événements métier (emails d'invitation, reset password, activation) bloquaient la réponse API en étant traités de manière synchrone. Ce commit route ces événements vers un transport AMQP asynchrone avec un worker dédié, garantissant des réponses API rapides et une gestion robuste des échecs. Le retry utilise une stratégie Fibonacci (1s, 1s, 2s, 3s, 5s, 8s, 13s) qui offre un bon compromis entre réactivité et protection des services externes. Les messages qui épuisent leurs tentatives arrivent dans une dead-letter queue Doctrine avec alerte email à l'admin. La commande console CreateTestActivationTokenCommand détecte désormais les comptes déjà actifs et génère un token de réinitialisation de mot de passe au lieu d'un token d'activation, évitant une erreur bloquante lors de la ré-invitation par un admin.
161 lines
6.4 KiB
YAML
161 lines
6.4 KiB
YAML
# Prometheus Alert Rules for Classeo
# NFR-OB2: Automated alerts when SLA threatened (< 5 min detection)
#
# Layout: `groups` is a list of named rule groups; each group carries a
# `rules` list whose entries are individual alert definitions.

groups:
# =============================================================================
# SLA & Performance Alerts
# =============================================================================
- name: sla_alerts
  rules:
  # NFR-P4: API response time P95 < 200ms
  # Fires when the 95th percentile of the backend request-duration histogram,
  # computed over a 5m rate window, stays above 0.2 for 2 minutes.
  - alert: HighApiLatencyP95
    expr: >-
      histogram_quantile(0.95,
      sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m]))
      by (le)) > 0.2
    for: 2m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "API P95 latency above SLA threshold"
      description: "P95 latency is {{ $value | humanizeDuration }} (threshold: 200ms)"
      runbook_url: "https://docs.classeo.local/runbooks/high-latency"

  # NFR-P5: API response time P99 < 500ms
  # Same histogram as the P95 rule, but at the 0.99 quantile with a 0.5
  # threshold, a longer hold time (5m) and critical severity.
  - alert: HighApiLatencyP99
    expr: >-
      histogram_quantile(0.99,
      sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m]))
      by (le)) > 0.5
    for: 5m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "API P99 latency critically high"
      description: "P99 latency is {{ $value | humanizePercentage }} (threshold: 500ms)"
      runbook_url: "https://docs.classeo.local/runbooks/high-latency"

  # Error rate > 1% (AC3: error rate > 1% for 2 min)
  # Ratio of requests whose `status` label matches 5xx to all requests, both
  # measured over a 2m rate window.
  - alert: HighErrorRate
    expr: >-
      sum(rate(classeo_http_requests_total{status=~"5.."}[2m]))
      / sum(rate(classeo_http_requests_total[2m])) > 0.01
    for: 2m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
      runbook_url: "https://docs.classeo.local/runbooks/high-error-rate"
# =============================================================================
# Infrastructure Alerts
# =============================================================================
- name: infrastructure_alerts
  rules:
  # Redis memory usage
  # Fires when used/max memory stays above 80% for 5 minutes.
  # The `and redis_memory_max_bytes > 0` guard drops series whose limit is 0:
  # dividing by 0 yields +Inf, which would otherwise keep the alert firing
  # permanently. NOTE(review): a limit of 0 presumably means "no maxmemory
  # configured" for the exporter in use - confirm.
  - alert: RedisHighMemoryUsage
    expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.8 and redis_memory_max_bytes > 0
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Redis memory usage above 80%"
      description: "Redis is using {{ $value | humanizePercentage }} of available memory"
      runbook_url: "https://docs.classeo.local/runbooks/redis-memory"

  # Database connection issues
  # Fires when `pg_up` has been 0 for 1 minute.
  - alert: DatabaseConnectionFailed
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "PostgreSQL connection failed"
      description: "Cannot connect to PostgreSQL database"
      runbook_url: "https://docs.classeo.local/runbooks/database-down"

  # RabbitMQ queue backlog
  # Fires when a `rabbitmq_queue_messages` series stays above 10000 for
  # 10 minutes.
  - alert: RabbitMQQueueBacklog
    expr: rabbitmq_queue_messages > 10000
    for: 10m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "RabbitMQ queue backlog growing"
      description: "Queue has {{ $value }} messages pending"
      runbook_url: "https://docs.classeo.local/runbooks/rabbitmq-backlog"
# =============================================================================
# Messenger Queue Alerts
# =============================================================================
- name: messenger_alerts
  rules:
  # Messenger queue backlog > 100 messages for 5 minutes
  # Only the series labelled transport="async" is watched.
  - alert: MessengerQueueBacklog
    expr: classeo_messenger_messages_waiting{transport="async"} > 100
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "File d'attente Messenger surchargee"
      description: "{{ $value }} messages en attente depuis 5 minutes"
      runbook_url: "https://docs.classeo.local/runbooks/messenger-backlog"
# =============================================================================
# Security Alerts
# =============================================================================
- name: security_alerts
  rules:
  # NFR-S2: Excessive login failures (potential brute force)
  # rate() returns a per-second average, so it is multiplied by 60 to compare
  # against the per-minute threshold stated in the description (10/min).
  # Without the conversion the rule only fired above 10 failures per second.
  - alert: ExcessiveLoginFailures
    expr: sum(rate(classeo_login_failures_total[5m])) * 60 > 10
    for: 2m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "Excessive login failures detected"
      description: "More than 10 failed logins per minute"
      runbook_url: "https://docs.classeo.local/runbooks/brute-force"

  # Per-tenant excessive login failures
  # Same per-second to per-minute conversion as above, aggregated per
  # `tenant_id` and compared against the stated 5/min threshold.
  - alert: TenantExcessiveLoginFailures
    expr: sum by (tenant_id) (rate(classeo_login_failures_total[5m])) * 60 > 5
    for: 5m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "Excessive login failures for tenant {{ $labels.tenant_id }}"
      description: "More than 5 failed logins per minute for single tenant"
      runbook_url: "https://docs.classeo.local/runbooks/brute-force"
# =============================================================================
# Application Health Alerts
# =============================================================================
- name: application_alerts
  rules:
  # Backend scrape target down
  # `up` is 0 when Prometheus fails to scrape the classeo-backend job; the
  # alert fires after 1 minute of failed scrapes.
  - alert: ApplicationUnhealthy
    expr: up{job="classeo-backend"} == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Backend application is down"
      description: "Cannot scrape metrics from backend - application may be crashed or unreachable"
      runbook_url: "https://docs.classeo.local/runbooks/health-check"

  # Infrastructure service unhealthy (postgres, redis, rabbitmq)
  # Fires per series when `classeo_health_check_status` has been 0 for
  # 2 minutes; the `service` label names the failing dependency in the
  # annotations.
  - alert: InfrastructureServiceUnhealthy
    expr: classeo_health_check_status == 0
    for: 2m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Infrastructure service {{ $labels.service }} is unhealthy"
      description: "Health check for {{ $labels.service }} is failing"
      runbook_url: "https://docs.classeo.local/runbooks/degraded-mode"