Implémentation complète de la stack d'observabilité pour le monitoring de la plateforme multi-tenant Classeo. ## Error Tracking (GlitchTip) - Intégration Sentry SDK avec GlitchTip auto-hébergé - Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français) - Contexte enrichi: tenant_id, user_id, correlation_id - Configuration backend (sentry.yaml) et frontend (sentry.ts) ## Metrics (Prometheus) - Endpoint /metrics avec restriction IP en production - Métriques HTTP: requests_total, request_duration_seconds (histogramme) - Métriques sécurité: login_failures_total par tenant - Métriques santé: health_check_status (postgres, redis, rabbitmq) - Storage Redis pour persistance entre requêtes ## Logs (Loki) - Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor - Détection PII: emails, téléphones FR, tokens JWT, NIR français - Labels structurés: tenant_id, correlation_id, level ## Dashboards (Grafana) - Dashboard principal: latence P50/P95/P99, error rate, RPS - Dashboard par tenant: métriques isolées par sous-domaine - Dashboard infrastructure: santé postgres/redis/rabbitmq - Datasources avec UIDs fixes pour portabilité ## Alertes (Alertmanager) - HighApiLatencyP95/P99: SLA monitoring (200ms/500ms) - HighErrorRate: error rate > 1% pendant 2 min - ExcessiveLoginFailures: détection brute force - ApplicationUnhealthy: health check failures ## Infrastructure - InfrastructureHealthChecker: service partagé (DRY) - HealthCheckController: endpoint /health pour load balancers - Pre-push hook: make ci && make e2e avant push
207 lines
7.1 KiB
YAML
207 lines
7.1 KiB
YAML
# =============================================================================
|
|
# MONITORING & OBSERVABILITY SERVICES
|
|
# =============================================================================
|
|
# Usage: docker compose -f compose.yaml -f compose.monitoring.yaml up -d
|
|
# =============================================================================
|
|
|
|
services:
|
|
# =============================================================================
|
|
# ERROR TRACKING - GlitchTip (Sentry-compatible)
|
|
# =============================================================================
|
|
glitchtip:
|
|
image: glitchtip/glitchtip:v4.1
|
|
container_name: classeo_glitchtip
|
|
depends_on:
|
|
glitchtip-db:
|
|
condition: service_healthy
|
|
glitchtip-redis:
|
|
condition: service_healthy
|
|
environment:
|
|
DATABASE_URL: postgresql://glitchtip:glitchtip@glitchtip-db:5432/glitchtip
|
|
SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change_me_in_production_very_secret_key}
|
|
REDIS_URL: redis://glitchtip-redis:6379/0
|
|
GLITCHTIP_DOMAIN: ${GLITCHTIP_DOMAIN:-http://localhost:8081}
|
|
DEFAULT_FROM_EMAIL: ${DEFAULT_FROM_EMAIL:-glitchtip@classeo.local}
|
|
EMAIL_URL: ${EMAIL_URL:-smtp://mailpit:1025}
|
|
CELERY_WORKER_AUTOSCALE: "1,3"
|
|
CELERY_WORKER_MAX_TASKS_PER_CHILD: "10000"
|
|
ENABLE_ORGANIZATION_CREATION: "true"
|
|
ENABLE_USER_REGISTRATION: "true"
|
|
ports:
|
|
- "8081:8080"
|
|
healthcheck:
|
|
test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8080/_health/')\""]
|
|
interval: 30s
|
|
timeout: 10s
|
|
retries: 5
|
|
start_period: 60s
|
|
restart: unless-stopped
|
|
|
|
glitchtip-worker:
|
|
image: glitchtip/glitchtip:v4.1
|
|
container_name: classeo_glitchtip_worker
|
|
depends_on:
|
|
glitchtip-db:
|
|
condition: service_healthy
|
|
glitchtip-redis:
|
|
condition: service_healthy
|
|
environment:
|
|
DATABASE_URL: postgresql://glitchtip:glitchtip@glitchtip-db:5432/glitchtip
|
|
SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change_me_in_production_very_secret_key}
|
|
REDIS_URL: redis://glitchtip-redis:6379/0
|
|
command: ./bin/run-celery-with-beat.sh
|
|
restart: unless-stopped
|
|
|
|
glitchtip-db:
|
|
image: postgres:18.1-alpine
|
|
container_name: classeo_glitchtip_db
|
|
environment:
|
|
POSTGRES_DB: glitchtip
|
|
POSTGRES_USER: glitchtip
|
|
POSTGRES_PASSWORD: glitchtip
|
|
volumes:
|
|
- glitchtip_postgres_data:/var/lib/postgresql/data
|
|
healthcheck:
|
|
test: ["CMD-SHELL", "pg_isready -U glitchtip -d glitchtip"]
|
|
interval: 10s
|
|
timeout: 5s
|
|
retries: 5
|
|
start_period: 10s
|
|
restart: unless-stopped
|
|
|
|
glitchtip-redis:
|
|
image: redis:7.4-alpine
|
|
container_name: classeo_glitchtip_redis
|
|
command: redis-server --appendonly yes --maxmemory 128mb --maxmemory-policy allkeys-lru
|
|
volumes:
|
|
- glitchtip_redis_data:/data
|
|
healthcheck:
|
|
test: ["CMD", "redis-cli", "ping"]
|
|
interval: 10s
|
|
timeout: 5s
|
|
retries: 5
|
|
start_period: 5s
|
|
restart: unless-stopped
|
|
|
|
# =============================================================================
|
|
# METRICS - Prometheus
|
|
# =============================================================================
|
|
prometheus:
|
|
image: prom/prometheus:v3.2.0
|
|
container_name: classeo_prometheus
|
|
command:
|
|
- '--config.file=/etc/prometheus/prometheus.yml'
|
|
- '--storage.tsdb.path=/prometheus'
|
|
- '--storage.tsdb.retention.time=15d'
|
|
- '--web.enable-lifecycle'
|
|
volumes:
|
|
- ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
|
|
- ./monitoring/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
|
|
- prometheus_data:/prometheus
|
|
ports:
|
|
- "9090:9090"
|
|
healthcheck:
|
|
test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"]
|
|
interval: 10s
|
|
timeout: 5s
|
|
retries: 5
|
|
start_period: 10s
|
|
restart: unless-stopped
|
|
|
|
# =============================================================================
|
|
# DASHBOARDS - Grafana
|
|
# =============================================================================
|
|
grafana:
|
|
image: grafana/grafana:11.4.0
|
|
container_name: classeo_grafana
|
|
environment:
|
|
GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
|
|
GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
|
|
GF_USERS_ALLOW_SIGN_UP: "false"
|
|
GF_SERVER_ROOT_URL: ${GRAFANA_ROOT_URL:-http://localhost:3001}
|
|
volumes:
|
|
- ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
|
|
- grafana_data:/var/lib/grafana
|
|
ports:
|
|
- "3001:3000"
|
|
depends_on:
|
|
prometheus:
|
|
condition: service_healthy
|
|
loki:
|
|
condition: service_healthy
|
|
healthcheck:
|
|
test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/health"]
|
|
interval: 10s
|
|
timeout: 5s
|
|
retries: 5
|
|
start_period: 30s
|
|
restart: unless-stopped
|
|
|
|
# =============================================================================
|
|
# LOGS - Loki
|
|
# =============================================================================
|
|
loki:
|
|
image: grafana/loki:3.3.2
|
|
container_name: classeo_loki
|
|
command: -config.file=/etc/loki/config.yml
|
|
volumes:
|
|
- ./monitoring/loki/config.yml:/etc/loki/config.yml:ro
|
|
- loki_data:/loki
|
|
ports:
|
|
- "3100:3100"
|
|
healthcheck:
|
|
test: ["CMD", "wget", "-q", "--spider", "http://localhost:3100/ready"]
|
|
interval: 10s
|
|
timeout: 5s
|
|
retries: 5
|
|
start_period: 30s
|
|
restart: unless-stopped
|
|
|
|
# =============================================================================
|
|
# LOG COLLECTOR - Promtail
|
|
# =============================================================================
|
|
promtail:
|
|
image: grafana/promtail:3.3.2
|
|
container_name: classeo_promtail
|
|
command: -config.file=/etc/promtail/config.yml
|
|
volumes:
|
|
- ./monitoring/promtail/config.yml:/etc/promtail/config.yml:ro
|
|
- /var/run/docker.sock:/var/run/docker.sock:ro
|
|
depends_on:
|
|
loki:
|
|
condition: service_healthy
|
|
restart: unless-stopped
|
|
|
|
# =============================================================================
|
|
# ALERTING - Alertmanager
|
|
# =============================================================================
|
|
alertmanager:
|
|
image: prom/alertmanager:v0.28.0
|
|
container_name: classeo_alertmanager
|
|
command:
|
|
- '--config.file=/etc/alertmanager/alertmanager.yml'
|
|
- '--storage.path=/alertmanager'
|
|
volumes:
|
|
- ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
|
|
- alertmanager_data:/alertmanager
|
|
ports:
|
|
- "9093:9093"
|
|
healthcheck:
|
|
test: ["CMD", "wget", "-q", "--spider", "http://localhost:9093/-/healthy"]
|
|
interval: 10s
|
|
timeout: 5s
|
|
retries: 5
|
|
start_period: 10s
|
|
restart: unless-stopped
|
|
|
|
# =============================================================================
|
|
# VOLUMES PERSISTANTS MONITORING
|
|
# =============================================================================
|
|
volumes:
|
|
glitchtip_postgres_data:
|
|
glitchtip_redis_data:
|
|
prometheus_data:
|
|
grafana_data:
|
|
loki_data:
|
|
alertmanager_data:
|