feat: Observabilité et monitoring complet
Implémentation complète de la stack d'observabilité pour le monitoring de la plateforme multi-tenant Classeo. ## Error Tracking (GlitchTip) - Intégration Sentry SDK avec GlitchTip auto-hébergé - Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français) - Contexte enrichi: tenant_id, user_id, correlation_id - Configuration backend (sentry.yaml) et frontend (sentry.ts) ## Metrics (Prometheus) - Endpoint /metrics avec restriction IP en production - Métriques HTTP: requests_total, request_duration_seconds (histogramme) - Métriques sécurité: login_failures_total par tenant - Métriques santé: health_check_status (postgres, redis, rabbitmq) - Storage Redis pour persistance entre requêtes ## Logs (Loki) - Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor - Détection PII: emails, téléphones FR, tokens JWT, NIR français - Labels structurés: tenant_id, correlation_id, level ## Dashboards (Grafana) - Dashboard principal: latence P50/P95/P99, error rate, RPS - Dashboard par tenant: métriques isolées par sous-domaine - Dashboard infrastructure: santé postgres/redis/rabbitmq - Datasources avec UIDs fixes pour portabilité ## Alertes (Alertmanager) - HighApiLatencyP95/P99: SLA monitoring (200ms/500ms) - HighErrorRate: error rate > 1% pendant 2 min - ExcessiveLoginFailures: détection brute force - ApplicationUnhealthy: health check failures ## Infrastructure - InfrastructureHealthChecker: service partagé (DRY) - HealthCheckController: endpoint /health pour load balancers - Pre-push hook: make ci && make e2e avant push
This commit is contained in:
143
monitoring/prometheus/alerts.yml
Normal file
143
monitoring/prometheus/alerts.yml
Normal file
@@ -0,0 +1,143 @@
|
||||
# Prometheus Alert Rules for Classeo
|
||||
# NFR-OB2: Automated alerts when SLA threatened (< 5 min detection)
|
||||
|
||||
groups:
|
||||
# -----------------------------------------------------------------------------
# SLA & Performance Alerts
# -----------------------------------------------------------------------------
- name: sla_alerts
  rules:
  # NFR-P4: the API P95 response time must stay below 200ms.
  # The quantile is estimated from the 5m rate of the request-duration
  # histogram buckets of the classeo-backend job, aggregated by `le`.
  - alert: HighApiLatencyP95
    for: 2m
    expr: >-
      histogram_quantile(0.95,
      sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m]))
      by (le)) > 0.2
    labels:
      team: platform
      severity: warning
    annotations:
      runbook_url: 'https://docs.classeo.local/runbooks/high-latency'
      summary: 'API P95 latency above SLA threshold'
      description: 'P95 latency is {{ $value | humanizeDuration }} (threshold: 200ms)'

  # NFR-P5: the API P99 response time must stay below 500ms.
  # Same histogram and window as the P95 rule, with a higher quantile,
  # a longer hold period and critical severity.
  - alert: HighApiLatencyP99
    for: 5m
    expr: >-
      histogram_quantile(0.99,
      sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m]))
      by (le)) > 0.5
    labels:
      team: platform
      severity: critical
    annotations:
      runbook_url: 'https://docs.classeo.local/runbooks/high-latency'
      summary: 'API P99 latency critically high'
      description: 'P99 latency is {{ $value | humanizeDuration }} (threshold: 500ms)'

  # AC3: error rate above 1% for 2 minutes.
  # Ratio of the 5xx request rate to the total request rate, both over 2m.
  - alert: HighErrorRate
    for: 2m
    expr: >-
      sum(rate(classeo_http_requests_total{status=~"5.."}[2m]))
      / sum(rate(classeo_http_requests_total[2m]))
      > 0.01
    labels:
      team: platform
      severity: critical
    annotations:
      runbook_url: 'https://docs.classeo.local/runbooks/high-error-rate'
      summary: 'High error rate detected'
      description: 'Error rate is {{ $value | humanizePercentage }} (threshold: 1%)'
|
||||
|
||||
# =============================================================================
# Infrastructure Alerts
# =============================================================================
- name: infrastructure_alerts
  rules:
  # Redis memory usage above 80% of the configured limit.
  # The `and redis_memory_max_bytes > 0` guard skips instances that report a
  # limit of 0 (no maxmemory configured): without it the division yields +Inf,
  # which is > 0.8, and the alert would fire permanently for those instances.
  - alert: RedisHighMemoryUsage
    expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.8 and redis_memory_max_bytes > 0
    for: 5m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "Redis memory usage above 80%"
      description: "Redis is using {{ $value | humanizePercentage }} of available memory"
      runbook_url: "https://docs.classeo.local/runbooks/redis-memory"

  # Database connection issues: pg_up has been 0 for one minute.
  - alert: DatabaseConnectionFailed
    expr: pg_up == 0
    for: 1m
    labels:
      severity: critical
      team: platform
    annotations:
      summary: "PostgreSQL connection failed"
      description: "Cannot connect to PostgreSQL database"
      runbook_url: "https://docs.classeo.local/runbooks/database-down"

  # RabbitMQ queue backlog: more than 10000 messages for ten minutes.
  # The comparison is evaluated per series of rabbitmq_queue_messages.
  - alert: RabbitMQQueueBacklog
    expr: rabbitmq_queue_messages > 10000
    for: 10m
    labels:
      severity: warning
      team: platform
    annotations:
      summary: "RabbitMQ queue backlog growing"
      description: "Queue has {{ $value }} messages pending"
      runbook_url: "https://docs.classeo.local/runbooks/rabbitmq-backlog"
|
||||
|
||||
# =============================================================================
# Security Alerts
# =============================================================================
- name: security_alerts
  rules:
  # NFR-S2: Excessive login failures (potential brute force).
  # rate() yields failures per SECOND; it is multiplied by 60 so the
  # threshold matches the "per minute" wording of the description.
  # Without the conversion the alert would need 600 failures per minute.
  - alert: ExcessiveLoginFailures
    expr: sum(rate(classeo_login_failures_total[5m])) * 60 > 10
    for: 2m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "Excessive login failures detected"
      description: "More than 10 failed logins per minute"
      runbook_url: "https://docs.classeo.local/runbooks/brute-force"

  # Per-tenant excessive login failures.
  # Same per-second to per-minute conversion as above, aggregated by
  # tenant_id so each tenant is evaluated against its own threshold.
  - alert: TenantExcessiveLoginFailures
    expr: sum by (tenant_id) (rate(classeo_login_failures_total[5m])) * 60 > 5
    for: 5m
    labels:
      severity: warning
      team: security
    annotations:
      summary: "Excessive login failures for tenant {{ $labels.tenant_id }}"
      description: "More than 5 failed logins per minute for single tenant"
      runbook_url: "https://docs.classeo.local/runbooks/brute-force"
|
||||
|
||||
# -----------------------------------------------------------------------------
# Application Health Alerts
# -----------------------------------------------------------------------------
- name: application_alerts
  rules:
  # The classeo-backend scrape target has reported up == 0 for one minute.
  - alert: ApplicationUnhealthy
    for: 1m
    expr: up{job="classeo-backend"} == 0
    labels:
      team: platform
      severity: critical
    annotations:
      runbook_url: 'https://docs.classeo.local/runbooks/health-check'
      summary: 'Backend application is down'
      description: 'Cannot scrape metrics from backend - application may be crashed or unreachable'

  # A health-check gauge (postgres, redis, rabbitmq) has been 0 for two
  # minutes; the failing service is taken from the `service` label.
  - alert: InfrastructureServiceUnhealthy
    for: 2m
    expr: classeo_health_check_status == 0
    labels:
      team: platform
      severity: warning
    annotations:
      runbook_url: 'https://docs.classeo.local/runbooks/degraded-mode'
      summary: 'Infrastructure service {{ $labels.service }} is unhealthy'
      description: 'Health check for {{ $labels.service }} is failing'
|
||||
Reference in New Issue
Block a user