feat: Observabilité et monitoring complet
Implémentation complète de la stack d'observabilité pour le monitoring de la plateforme multi-tenant Classeo. ## Error Tracking (GlitchTip) - Intégration Sentry SDK avec GlitchTip auto-hébergé - Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français) - Contexte enrichi: tenant_id, user_id, correlation_id - Configuration backend (sentry.yaml) et frontend (sentry.ts) ## Metrics (Prometheus) - Endpoint /metrics avec restriction IP en production - Métriques HTTP: requests_total, request_duration_seconds (histogramme) - Métriques sécurité: login_failures_total par tenant - Métriques santé: health_check_status (postgres, redis, rabbitmq) - Storage Redis pour persistance entre requêtes ## Logs (Loki) - Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor - Détection PII: emails, téléphones FR, tokens JWT, NIR français - Labels structurés: tenant_id, correlation_id, level ## Dashboards (Grafana) - Dashboard principal: latence P50/P95/P99, error rate, RPS - Dashboard par tenant: métriques isolées par sous-domaine - Dashboard infrastructure: santé postgres/redis/rabbitmq - Datasources avec UIDs fixes pour portabilité ## Alertes (Alertmanager) - HighApiLatencyP95/P99: SLA monitoring (200ms/500ms) - HighErrorRate: error rate > 1% pendant 2 min - ExcessiveLoginFailures: détection brute force - ApplicationUnhealthy: health check failures ## Infrastructure - InfrastructureHealthChecker: service partagé (DRY) - HealthCheckController: endpoint /health pour load balancers - Pre-push hook: make ci && make e2e avant push
This commit is contained in:
206
compose.monitoring.yaml
Normal file
206
compose.monitoring.yaml
Normal file
@@ -0,0 +1,206 @@
|
||||
# =============================================================================
|
||||
# MONITORING & OBSERVABILITY SERVICES
|
||||
# =============================================================================
|
||||
# Usage: docker compose -f compose.yaml -f compose.monitoring.yaml up -d
|
||||
# =============================================================================
|
||||
|
||||
services:
|
||||
# =============================================================================
|
||||
# ERROR TRACKING - GlitchTip (Sentry-compatible)
|
||||
# =============================================================================
|
||||
glitchtip:
  # GlitchTip web/API container (Sentry-compatible error tracking).
  image: glitchtip/glitchtip:v4.1
  container_name: classeo_glitchtip
  restart: unless-stopped
  # Host port 8081 is mapped to port 8080 inside the container.
  ports:
    - "8081:8080"
  # Start only after the dedicated database and Redis report healthy.
  depends_on:
    glitchtip-db:
      condition: service_healthy
    glitchtip-redis:
      condition: service_healthy
  environment:
    # Backing stores (see the glitchtip-db / glitchtip-redis services).
    DATABASE_URL: "postgresql://glitchtip:glitchtip@glitchtip-db:5432/glitchtip"
    REDIS_URL: "redis://glitchtip-redis:6379/0"
    # The fallback after ":-" is a placeholder; set GLITCHTIP_SECRET_KEY
    # in the environment for any non-local deployment.
    SECRET_KEY: "${GLITCHTIP_SECRET_KEY:-change_me_in_production_very_secret_key}"
    GLITCHTIP_DOMAIN: "${GLITCHTIP_DOMAIN:-http://localhost:8081}"
    # Outgoing mail settings; each can be overridden from the environment.
    DEFAULT_FROM_EMAIL: "${DEFAULT_FROM_EMAIL:-glitchtip@classeo.local}"
    EMAIL_URL: "${EMAIL_URL:-smtp://mailpit:1025}"
    CELERY_WORKER_AUTOSCALE: "1,3"
    CELERY_WORKER_MAX_TASKS_PER_CHILD: "10000"
    # Self-service sign-up and organization creation are both enabled.
    ENABLE_ORGANIZATION_CREATION: "true"
    ENABLE_USER_REGISTRATION: "true"
  healthcheck:
    # Probes the local /_health/ endpoint with Python's urllib; a failed
    # request raises, which makes the command exit non-zero.
    test:
      - CMD-SHELL
      - >-
        python -c "import urllib.request;
        urllib.request.urlopen('http://localhost:8080/_health/')"
    interval: 30s
    timeout: 10s
    retries: 5
    start_period: 60s
|
||||
|
||||
glitchtip-worker:
  # Background worker for GlitchTip: runs Celery together with the beat
  # scheduler, using the same image as the web container.
  image: glitchtip/glitchtip:v4.1
  container_name: classeo_glitchtip_worker
  command: ./bin/run-celery-with-beat.sh
  # Start only after the dedicated database and Redis report healthy.
  depends_on:
    glitchtip-db:
      condition: service_healthy
    glitchtip-redis:
      condition: service_healthy
  environment:
    # Must stay identical to the values given to the `glitchtip` web
    # service so both containers share the same database, queue and key.
    DATABASE_URL: postgresql://glitchtip:glitchtip@glitchtip-db:5432/glitchtip
    SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change_me_in_production_very_secret_key}
    REDIS_URL: redis://glitchtip-redis:6379/0
    # Mail and domain settings mirrored from the web service. Background
    # tasks are expected to send notification e-mails and build links, so
    # they need the same values (NOTE(review): confirm against the
    # GlitchTip configuration docs).
    GLITCHTIP_DOMAIN: ${GLITCHTIP_DOMAIN:-http://localhost:8081}
    DEFAULT_FROM_EMAIL: ${DEFAULT_FROM_EMAIL:-glitchtip@classeo.local}
    EMAIL_URL: ${EMAIL_URL:-smtp://mailpit:1025}
    # Celery tuning belongs on the container that runs Celery; previously
    # it was only set on the web service.
    CELERY_WORKER_AUTOSCALE: "1,3"
    CELERY_WORKER_MAX_TASKS_PER_CHILD: "10000"
  restart: unless-stopped
|
||||
|
||||
glitchtip-db:
  # Dedicated PostgreSQL instance used only by GlitchTip.
  image: postgres:18.1-alpine
  container_name: classeo_glitchtip_db
  environment:
    POSTGRES_DB: glitchtip
    POSTGRES_USER: glitchtip
    POSTGRES_PASSWORD: glitchtip
  volumes:
    # PostgreSQL 18+ official images store data under
    # /var/lib/postgresql/<major>/docker and expect a single mount on
    # /var/lib/postgresql. Mounting the named volume on the pre-18 path
    # (/var/lib/postgresql/data) would leave the real data directory
    # outside the named volume.
    - glitchtip_postgres_data:/var/lib/postgresql
  healthcheck:
    # pg_isready exits non-zero until the server accepts connections for
    # the glitchtip user/database.
    test: ["CMD-SHELL", "pg_isready -U glitchtip -d glitchtip"]
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 10s
  restart: unless-stopped
|
||||
|
||||
glitchtip-redis:
  # Dedicated Redis instance for GlitchTip (referenced by REDIS_URL).
  image: redis:7.4-alpine
  container_name: classeo_glitchtip_redis
  restart: unless-stopped
  # Append-only persistence, capped at 128 MB with LRU eviction on all keys.
  command:
    - redis-server
    - --appendonly
    - "yes"
    - --maxmemory
    - 128mb
    - --maxmemory-policy
    - allkeys-lru
  volumes:
    - glitchtip_redis_data:/data
  healthcheck:
    test:
      - CMD
      - redis-cli
      - ping
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 5s
|
||||
|
||||
# =============================================================================
|
||||
# METRICS - Prometheus
|
||||
# =============================================================================
|
||||
prometheus:
  # Prometheus server: scrape configuration and alert rules are mounted
  # read-only from ./monitoring/prometheus.
  image: prom/prometheus:v3.2.0
  container_name: classeo_prometheus
  restart: unless-stopped
  ports:
    - "9090:9090"
  volumes:
    - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
    - ./monitoring/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
    - prometheus_data:/prometheus
  command:
    - "--config.file=/etc/prometheus/prometheus.yml"
    # TSDB lives on the prometheus_data volume and keeps 15 days of data.
    - "--storage.tsdb.path=/prometheus"
    - "--storage.tsdb.retention.time=15d"
    # Enables the HTTP lifecycle endpoints (reload/quit); note that port
    # 9090 is published on the host.
    - "--web.enable-lifecycle"
  healthcheck:
    test:
      - CMD
      - wget
      - "-q"
      - "--spider"
      - "http://localhost:9090/-/healthy"
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 10s
|
||||
|
||||
# =============================================================================
|
||||
# DASHBOARDS - Grafana
|
||||
# =============================================================================
|
||||
grafana:
  # Grafana UI; provisioning files are mounted read-only from
  # ./monitoring/grafana/provisioning.
  image: grafana/grafana:11.4.0
  container_name: classeo_grafana
  restart: unless-stopped
  # Host port 3001 is mapped to Grafana's port 3000.
  ports:
    - "3001:3000"
  # Start only after both data backends report healthy.
  depends_on:
    prometheus:
      condition: service_healthy
    loki:
      condition: service_healthy
  environment:
    # Admin credentials fall back to admin/admin when the variables are
    # unset; override GRAFANA_ADMIN_USER / GRAFANA_ADMIN_PASSWORD outside
    # local development.
    GF_SECURITY_ADMIN_USER: "${GRAFANA_ADMIN_USER:-admin}"
    GF_SECURITY_ADMIN_PASSWORD: "${GRAFANA_ADMIN_PASSWORD:-admin}"
    GF_USERS_ALLOW_SIGN_UP: "false"
    GF_SERVER_ROOT_URL: "${GRAFANA_ROOT_URL:-http://localhost:3001}"
  volumes:
    - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
    - grafana_data:/var/lib/grafana
  healthcheck:
    test:
      - CMD
      - wget
      - "-q"
      - "--spider"
      - "http://localhost:3000/api/health"
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 30s
|
||||
|
||||
# =============================================================================
|
||||
# LOGS - Loki
|
||||
# =============================================================================
|
||||
loki:
  # Loki log store; configuration is mounted read-only and data is kept
  # on the loki_data volume.
  image: grafana/loki:3.3.2
  container_name: classeo_loki
  restart: unless-stopped
  ports:
    - "3100:3100"
  command:
    - "-config.file=/etc/loki/config.yml"
  volumes:
    - ./monitoring/loki/config.yml:/etc/loki/config.yml:ro
    - loki_data:/loki
  healthcheck:
    # Probes Loki's /ready endpoint on its local port.
    test:
      - CMD
      - wget
      - "-q"
      - "--spider"
      - "http://localhost:3100/ready"
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 30s
|
||||
|
||||
# =============================================================================
|
||||
# LOG COLLECTOR - Promtail
|
||||
# =============================================================================
|
||||
promtail:
  # Promtail log collector; started only once Loki reports healthy.
  image: grafana/promtail:3.3.2
  container_name: classeo_promtail
  restart: unless-stopped
  depends_on:
    loki:
      condition: service_healthy
  command:
    - "-config.file=/etc/promtail/config.yml"
  volumes:
    - ./monitoring/promtail/config.yml:/etc/promtail/config.yml:ro
    # The Docker socket is mounted read-only, presumably for container
    # discovery in the Promtail config (verify in config.yml).
    - /var/run/docker.sock:/var/run/docker.sock:ro
|
||||
|
||||
# =============================================================================
|
||||
# ALERTING - Alertmanager
|
||||
# =============================================================================
|
||||
alertmanager:
  # Alertmanager; routing configuration is mounted read-only and state
  # is kept on the alertmanager_data volume.
  image: prom/alertmanager:v0.28.0
  container_name: classeo_alertmanager
  restart: unless-stopped
  ports:
    - "9093:9093"
  volumes:
    - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
    - alertmanager_data:/alertmanager
  command:
    - "--config.file=/etc/alertmanager/alertmanager.yml"
    - "--storage.path=/alertmanager"
  healthcheck:
    test:
      - CMD
      - wget
      - "-q"
      - "--spider"
      - "http://localhost:9093/-/healthy"
    interval: 10s
    timeout: 5s
    retries: 5
    start_period: 10s
|
||||
|
||||
# =============================================================================
|
||||
# VOLUMES PERSISTANTS MONITORING
|
||||
# =============================================================================
|
||||
# Named volumes for the monitoring stack, all with default settings.
volumes:
  # GlitchTip backing stores
  glitchtip_postgres_data: {}
  glitchtip_redis_data: {}
  # Metrics, dashboards, logs and alerting state
  prometheus_data: {}
  grafana_data: {}
  loki_data: {}
  alertmanager_data: {}
|
||||
Reference in New Issue
Block a user