Files
Classeo/monitoring/prometheus/alerts.yml
Mathias STRASSER 9ccad77bf0 feat: Messaging asynchrone fiable avec retry, dead-letter et métriques
Les événements métier (emails d'invitation, reset password, activation)
bloquaient la réponse API en étant traités de manière synchrone. Ce commit
route ces événements vers un transport AMQP asynchrone avec un worker
dédié, garantissant des réponses API rapides et une gestion robuste des
échecs.

Le retry utilise une stratégie Fibonacci (1s, 1s, 2s, 3s, 5s, 8s, 13s)
qui offre un bon compromis entre réactivité et protection des services
externes. Les messages qui épuisent leurs tentatives arrivent dans une
dead-letter queue Doctrine avec alerte email à l'admin.

La commande console CreateTestActivationTokenCommand détecte désormais
les comptes déjà actifs et génère un token de réinitialisation de mot
de passe au lieu d'un token d'activation, évitant une erreur bloquante
lors de la ré-invitation par un admin.
2026-02-08 21:38:20 +01:00

161 lines
6.4 KiB
YAML

# Prometheus Alert Rules for Classeo
# NFR-OB2: Automated alerts when SLA threatened (< 5 min detection)
groups:
# =============================================================================
# SLA & Performance Alerts
# Latency quantiles and the 5xx error ratio computed from classeo_http_* metrics.
# =============================================================================
- name: sla_alerts
  rules:
  # NFR-P4: P95 of the API response time must stay below 200ms (0.2s).
  - alert: HighApiLatencyP95
    expr: >-
      histogram_quantile(0.95,
      sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m]))
      by (le)) > 0.2
    for: 2m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "API P95 latency above SLA threshold"
      description: "P95 latency is {{ $value | humanizeDuration }} (threshold: 200ms)"
      runbook_url: "https://docs.classeo.local/runbooks/high-latency"
  # NFR-P5: P99 of the API response time must stay below 500ms (0.5s).
  - alert: HighApiLatencyP99
    expr: >-
      histogram_quantile(0.99,
      sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m]))
      by (le)) > 0.5
    for: 5m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "API P99 latency critically high"
      description: "P99 latency is {{ $value | humanizePercentage }} (threshold: 500ms)"
      runbook_url: "https://docs.classeo.local/runbooks/high-latency"
  # AC3: share of 5xx responses among all requests above 1% for 2 minutes.
  - alert: HighErrorRate
    expr: >-
      sum(rate(classeo_http_requests_total{status=~"5.."}[2m]))
      / sum(rate(classeo_http_requests_total[2m])) > 0.01
    for: 2m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "High error rate detected"
      description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
      runbook_url: "https://docs.classeo.local/runbooks/high-error-rate"
# =============================================================================
# Infrastructure Alerts
# Redis memory, PostgreSQL availability and RabbitMQ queue depth.
# =============================================================================
- name: infrastructure_alerts
  rules:
  # Redis memory usage above 80% of the configured maximum.
  # The `> 0` filter drops series whose maximum is 0: dividing by 0 yields
  # +Inf, which would make this alert fire permanently.
  # NOTE(review): presumably 0 means "no maxmemory limit set" - confirm
  # against the Redis exporter in use.
  - alert: RedisHighMemoryUsage
    expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.8
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Redis memory usage above 80%"
      description: "Redis is using {{ $value | humanizePercentage }} of available memory"
      runbook_url: "https://docs.classeo.local/runbooks/redis-memory"
  # PostgreSQL reported as down (pg_up is 0) for 1 minute.
  - alert: DatabaseConnectionFailed
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "PostgreSQL connection failed"
      description: "Cannot connect to PostgreSQL database"
      runbook_url: "https://docs.classeo.local/runbooks/database-down"
  # RabbitMQ queue holding more than 10000 messages for 10 minutes.
  - alert: RabbitMQQueueBacklog
    expr: rabbitmq_queue_messages > 10000
    for: 10m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "RabbitMQ queue backlog growing"
      description: "Queue has {{ $value }} messages pending"
      runbook_url: "https://docs.classeo.local/runbooks/rabbitmq-backlog"
# =============================================================================
# Messenger Queue Alerts
# =============================================================================
- name: messenger_alerts
  rules:
  # Fires when the "async" transport keeps more than 100 waiting messages
  # for 5 minutes.
  - alert: MessengerQueueBacklog
    expr: classeo_messenger_messages_waiting{transport="async"} > 100
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "File d'attente Messenger surchargee"
      description: "{{ $value }} messages en attente depuis 5 minutes"
      runbook_url: "https://docs.classeo.local/runbooks/messenger-backlog"
# =============================================================================
# Security Alerts
# Login-failure thresholds, globally and per tenant.
# =============================================================================
- name: security_alerts
  rules:
  # NFR-S2: excessive login failures across all series (potential brute force).
  # rate() returns a per-second value; `* 60` converts it to the per-minute
  # unit that the description states (10 failures per minute).
  - alert: ExcessiveLoginFailures
    expr: sum(rate(classeo_login_failures_total[5m])) * 60 > 10
    for: 2m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "Excessive login failures detected"
      description: "More than 10 failed logins per minute"
      runbook_url: "https://docs.classeo.local/runbooks/brute-force"
  # Same check aggregated per tenant_id label (5 failures per minute).
  # `* 60` converts the per-second rate() result to per-minute.
  - alert: TenantExcessiveLoginFailures
    expr: sum by (tenant_id) (rate(classeo_login_failures_total[5m])) * 60 > 5
    for: 5m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "Excessive login failures for tenant {{ $labels.tenant_id }}"
      description: "More than 5 failed logins per minute for single tenant"
      runbook_url: "https://docs.classeo.local/runbooks/brute-force"
# =============================================================================
# Application Health Alerts
# =============================================================================
- name: application_alerts
  rules:
  # The classeo-backend scrape target has been down (up == 0) for 1 minute.
  - alert: ApplicationUnhealthy
    expr: up{job="classeo-backend"} == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "Backend application is down"
      description: "Cannot scrape metrics from backend - application may be crashed or unreachable"
      runbook_url: "https://docs.classeo.local/runbooks/health-check"
  # A health check reports status 0 for 2 minutes; the affected service
  # (postgres, redis, rabbitmq) is taken from the `service` label.
  - alert: InfrastructureServiceUnhealthy
    expr: classeo_health_check_status == 0
    for: 2m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Infrastructure service {{ $labels.service }} is unhealthy"
      description: "Health check for {{ $labels.service }} is failing"
      runbook_url: "https://docs.classeo.local/runbooks/degraded-mode"