---
# Prometheus Alert Rules for Classeo
# NFR-OB2: Automated alerts when SLA threatened (< 5 min detection)
#
# Layout: one rule group per concern (SLA, infrastructure, messenger,
# security, application health). Every rule carries a `severity` and `team`
# label plus `summary`, `description` and `runbook_url` annotations.

groups:
  # ===========================================================================
  # SLA & Performance Alerts
  # ===========================================================================
  - name: sla_alerts
    rules:
      # NFR-P4: API response time P95 < 200ms
      # The histogram buckets are in seconds, so 0.2 == 200ms.
      - alert: HighApiLatencyP95
        expr: |-
          histogram_quantile(
            0.95,
            sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m])) by (le)
          ) > 0.2
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "API P95 latency above SLA threshold"
          description: "P95 latency is {{ $value | humanizeDuration }} (threshold: 200ms)"
          runbook_url: "https://docs.classeo.local/runbooks/high-latency"

      # NFR-P5: API response time P99 < 500ms
      # The histogram buckets are in seconds, so 0.5 == 500ms.
      # NOTE(review): `for: 5m` on top of a 5m rate window means this alert
      # cannot fire within the "< 5 min detection" target stated in the file
      # header (NFR-OB2) - confirm whether that target applies to this rule.
      - alert: HighApiLatencyP99
        expr: |-
          histogram_quantile(
            0.99,
            sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m])) by (le)
          ) > 0.5
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "API P99 latency critically high"
          description: "P99 latency is {{ $value | humanizeDuration }} (threshold: 500ms)"
          runbook_url: "https://docs.classeo.local/runbooks/high-latency"

      # Error rate > 1% (AC3: error rate > 1% for 2 min)
      # Ratio of 5xx responses to all responses over a 2m window.
      - alert: HighErrorRate
        expr: |-
          sum(rate(classeo_http_requests_total{status=~"5.."}[2m]))
            /
          sum(rate(classeo_http_requests_total[2m]))
            > 0.01
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
          runbook_url: "https://docs.classeo.local/runbooks/high-error-rate"

  # ===========================================================================
  # Infrastructure Alerts
  # ===========================================================================
  - name: infrastructure_alerts
    rules:
      # Redis memory usage above 80% of the configured limit.
      # The `> 0` filter on the denominator drops instances that report a
      # limit of 0 (Redis uses maxmemory 0 to mean "no limit"); without it the
      # division yields +Inf and the alert would fire permanently.
      - alert: RedisHighMemoryUsage
        expr: redis_memory_used_bytes / (redis_memory_max_bytes > 0) > 0.8
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Redis memory usage above 80%"
          description: "Redis is using {{ $value | humanizePercentage }} of available memory"
          runbook_url: "https://docs.classeo.local/runbooks/redis-memory"

      # Database connection issues
      - alert: DatabaseConnectionFailed
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "PostgreSQL connection failed"
          description: "Cannot connect to PostgreSQL database"
          runbook_url: "https://docs.classeo.local/runbooks/database-down"

      # RabbitMQ queue backlog: more than 10000 messages for 10 minutes
      - alert: RabbitMQQueueBacklog
        expr: rabbitmq_queue_messages > 10000
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "RabbitMQ queue backlog growing"
          description: "Queue has {{ $value }} messages pending"
          runbook_url: "https://docs.classeo.local/runbooks/rabbitmq-backlog"

  # ===========================================================================
  # Messenger Queue Alerts
  # ===========================================================================
  - name: messenger_alerts
    rules:
      # Messenger queue backlog > 100 messages for 5 minutes
      # (annotation text is intentionally in French; it is user-facing)
      - alert: MessengerQueueBacklog
        expr: classeo_messenger_messages_waiting{transport="async"} > 100
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "File d'attente Messenger surchargee"
          description: "{{ $value }} messages en attente depuis 5 minutes"
          runbook_url: "https://docs.classeo.local/runbooks/messenger-backlog"

  # ===========================================================================
  # Security Alerts
  # ===========================================================================
  - name: security_alerts
    rules:
      # NFR-S2: Excessive login failures (potential brute force)
      # rate() returns a per-second value; `* 60` converts it to failures per
      # minute so the threshold matches the "10 per minute" in the description.
      - alert: ExcessiveLoginFailures
        expr: sum(rate(classeo_login_failures_total[5m])) * 60 > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Excessive login failures detected"
          description: "More than 10 failed logins per minute"
          runbook_url: "https://docs.classeo.local/runbooks/brute-force"

      # Per-tenant excessive login failures
      # Same per-second -> per-minute conversion as above, grouped by tenant.
      - alert: TenantExcessiveLoginFailures
        expr: sum by (tenant_id) (rate(classeo_login_failures_total[5m])) * 60 > 5
        for: 5m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Excessive login failures for tenant {{ $labels.tenant_id }}"
          description: "More than 5 failed logins per minute for single tenant"
          runbook_url: "https://docs.classeo.local/runbooks/brute-force"

  # ===========================================================================
  # Application Health Alerts
  # ===========================================================================
  - name: application_alerts
    rules:
      # Backend scrape target down
      - alert: ApplicationUnhealthy
        expr: up{job="classeo-backend"} == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Backend application is down"
          description: "Cannot scrape metrics from backend - application may be crashed or unreachable"
          runbook_url: "https://docs.classeo.local/runbooks/health-check"

      # Infrastructure service unhealthy (postgres, redis, rabbitmq)
      - alert: InfrastructureServiceUnhealthy
        expr: classeo_health_check_status == 0
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Infrastructure service {{ $labels.service }} is unhealthy"
          description: "Health check for {{ $labels.service }} is failing"
          runbook_url: "https://docs.classeo.local/runbooks/degraded-mode"