feat: Observabilité et monitoring complet

Implémentation complète de la stack d'observabilité pour le monitoring
de la plateforme multi-tenant Classeo.

## Error Tracking (GlitchTip)
- Intégration Sentry SDK avec GlitchTip auto-hébergé
- Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français)
- Contexte enrichi: tenant_id, user_id, correlation_id
- Configuration backend (sentry.yaml) et frontend (sentry.ts)

## Metrics (Prometheus)
- Endpoint /metrics avec restriction IP en production
- Métriques HTTP: requests_total, request_duration_seconds (histogramme)
- Métriques sécurité: login_failures_total par tenant
- Métriques santé: health_check_status (postgres, redis, rabbitmq)
- Storage Redis pour persistance entre requêtes

## Logs (Loki)
- Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor
- Détection PII: emails, téléphones FR, tokens JWT, NIR français
- Labels structurés: tenant_id, correlation_id, level

## Dashboards (Grafana)
- Dashboard principal: latence P50/P95/P99, error rate, RPS
- Dashboard par tenant: métriques isolées par sous-domaine
- Dashboard infrastructure: santé postgres/redis/rabbitmq
- Datasources avec UIDs fixes pour portabilité

## Alertes (Alertmanager)
- HighApiLatencyP95/P99: SLA monitoring (200ms/500ms)
- HighErrorRate: error rate > 1% pendant 2 min
- ExcessiveLoginFailures: détection brute force
- ApplicationUnhealthy: health check failures

## Infrastructure
- InfrastructureHealthChecker: service partagé (DRY)
- HealthCheckController: endpoint /health pour load balancers
- Pre-push hook: make ci && make e2e avant push
This commit is contained in:
2026-02-04 11:47:01 +01:00
parent 2ed60fdcc1
commit d3c6773be5
48 changed files with 5846 additions and 32 deletions

View File

@@ -0,0 +1,143 @@
---
# Prometheus Alert Rules for Classeo
# NFR-OB2: Automated alerts when SLA threatened (< 5 min detection)
groups:
  # ===========================================================================
  # SLA & Performance Alerts
  # ===========================================================================
  - name: sla_alerts
    rules:
      # NFR-P4: API response time P95 < 200ms
      - alert: HighApiLatencyP95
        expr: histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m])) by (le)) > 0.2
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "API P95 latency above SLA threshold"
          description: "P95 latency is {{ $value | humanizeDuration }} (threshold: 200ms)"
          runbook_url: "https://docs.classeo.local/runbooks/high-latency"

      # NFR-P5: API response time P99 < 500ms
      - alert: HighApiLatencyP99
        expr: histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m])) by (le)) > 0.5
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "API P99 latency critically high"
          description: "P99 latency is {{ $value | humanizeDuration }} (threshold: 500ms)"
          runbook_url: "https://docs.classeo.local/runbooks/high-latency"

      # Error rate > 1% (AC3: error rate > 1% pendant 2 min)
      - alert: HighErrorRate
        expr: sum(rate(classeo_http_requests_total{status=~"5.."}[2m])) / sum(rate(classeo_http_requests_total[2m])) > 0.01
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
          runbook_url: "https://docs.classeo.local/runbooks/high-error-rate"

  # ===========================================================================
  # Infrastructure Alerts
  # ===========================================================================
  - name: infrastructure_alerts
    rules:
      # Redis memory usage.
      # Guard on redis_memory_max_bytes > 0: when no maxmemory is configured
      # the exporter reports 0, and the ratio would be +Inf — a permanent
      # false alert.
      - alert: RedisHighMemoryUsage
        expr: (redis_memory_used_bytes / redis_memory_max_bytes > 0.8) and (redis_memory_max_bytes > 0)
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Redis memory usage above 80%"
          description: "Redis is using {{ $value | humanizePercentage }} of available memory"
          runbook_url: "https://docs.classeo.local/runbooks/redis-memory"

      # Database connection issues
      - alert: DatabaseConnectionFailed
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "PostgreSQL connection failed"
          description: "Cannot connect to PostgreSQL database"
          runbook_url: "https://docs.classeo.local/runbooks/database-down"

      # RabbitMQ queue backlog
      - alert: RabbitMQQueueBacklog
        expr: rabbitmq_queue_messages > 10000
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "RabbitMQ queue backlog growing"
          description: "Queue has {{ $value }} messages pending"
          runbook_url: "https://docs.classeo.local/runbooks/rabbitmq-backlog"

  # ===========================================================================
  # Security Alerts
  # ===========================================================================
  - name: security_alerts
    rules:
      # NFR-S2: Excessive login failures (potential brute force).
      # rate() is per-second, so multiply by 60 to get failures/minute and
      # match the documented threshold of 10/min (a bare `rate(...) > 10`
      # would mean 600 failures/min and would essentially never fire).
      - alert: ExcessiveLoginFailures
        expr: sum(rate(classeo_login_failures_total[5m])) * 60 > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Excessive login failures detected"
          description: "More than 10 failed logins per minute"
          runbook_url: "https://docs.classeo.local/runbooks/brute-force"

      # Per-tenant excessive login failures (same per-second → per-minute
      # conversion as above, threshold 5/min per tenant).
      - alert: TenantExcessiveLoginFailures
        expr: sum by (tenant_id) (rate(classeo_login_failures_total[5m])) * 60 > 5
        for: 5m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Excessive login failures for tenant {{ $labels.tenant_id }}"
          description: "More than 5 failed logins per minute for single tenant"
          runbook_url: "https://docs.classeo.local/runbooks/brute-force"

  # ===========================================================================
  # Application Health Alerts
  # ===========================================================================
  - name: application_alerts
    rules:
      # Backend scrape target down
      - alert: ApplicationUnhealthy
        expr: up{job="classeo-backend"} == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Backend application is down"
          description: "Cannot scrape metrics from backend - application may be crashed or unreachable"
          runbook_url: "https://docs.classeo.local/runbooks/health-check"

      # Infrastructure service unhealthy (postgres, redis, rabbitmq)
      - alert: InfrastructureServiceUnhealthy
        expr: classeo_health_check_status == 0
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Infrastructure service {{ $labels.service }} is unhealthy"
          description: "Health check for {{ $labels.service }} is failing"
          runbook_url: "https://docs.classeo.local/runbooks/degraded-mode"

View File

@@ -0,0 +1,52 @@
---
# Prometheus Configuration for Classeo
# Scrapes metrics from PHP backend and other services
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    # Quoted so the templated value stays a YAML string.
    # NOTE(review): substitution in external_labels only happens when
    # Prometheus runs with --enable-feature=expand-external-labels;
    # otherwise the literal "${ENVIRONMENT:-development}" is used as the
    # label value — confirm the server flag is set.
    environment: "${ENVIRONMENT:-development}"
    project: classeo

# Alerting configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load alert rules
rule_files:
  - /etc/prometheus/alerts.yml

# Scrape configurations
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # PHP Backend metrics
  - job_name: 'classeo-backend'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['php:8000']
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: 'classeo-backend'

  # Redis metrics (via redis_exporter would be added in production)
  # For now, we rely on application-level metrics

  # PostgreSQL metrics (via postgres_exporter would be added in production)
  # For now, we rely on application-level metrics

  # RabbitMQ metrics
  - job_name: 'rabbitmq'
    static_configs:
      - targets: ['rabbitmq:15692']
    relabel_configs:
      - source_labels: [__address__]
        target_label: instance
        replacement: 'classeo-rabbitmq'