Files
Classeo/compose.monitoring.yaml
Mathias STRASSER d3c6773be5 feat: Observabilité et monitoring complet
Implémentation complète de la stack d'observabilité pour le monitoring
de la plateforme multi-tenant Classeo.

## Error Tracking (GlitchTip)
- Intégration Sentry SDK avec GlitchTip auto-hébergé
- Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français)
- Contexte enrichi: tenant_id, user_id, correlation_id
- Configuration backend (sentry.yaml) et frontend (sentry.ts)

## Metrics (Prometheus)
- Endpoint /metrics avec restriction IP en production
- Métriques HTTP: requests_total, request_duration_seconds (histogramme)
- Métriques sécurité: login_failures_total par tenant
- Métriques santé: health_check_status (postgres, redis, rabbitmq)
- Storage Redis pour persistance entre requêtes

## Logs (Loki)
- Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor
- Détection PII: emails, téléphones FR, tokens JWT, NIR français
- Labels structurés: tenant_id, correlation_id, level

## Dashboards (Grafana)
- Dashboard principal: latence P50/P95/P99, error rate, RPS
- Dashboard par tenant: métriques isolées par sous-domaine
- Dashboard infrastructure: santé postgres/redis/rabbitmq
- Datasources avec UIDs fixes pour portabilité

## Alertes (Alertmanager)
- HighApiLatencyP95/P99: SLA monitoring (200ms/500ms)
- HighErrorRate: error rate > 1% pendant 2 min
- ExcessiveLoginFailures: détection brute force
- ApplicationUnhealthy: health check failures

## Infrastructure
- InfrastructureHealthChecker: service partagé (DRY)
- HealthCheckController: endpoint /health pour load balancers
- Pre-push hook: make ci && make e2e avant push
2026-02-04 12:59:12 +01:00

207 lines
7.1 KiB
YAML

# =============================================================================
# MONITORING & OBSERVABILITY SERVICES
# =============================================================================
# Usage: docker compose -f compose.yaml -f compose.monitoring.yaml up -d
# =============================================================================
services:
  # ===========================================================================
  # ERROR TRACKING - GlitchTip (Sentry-compatible)
  # ===========================================================================
  glitchtip:
    image: glitchtip/glitchtip:v4.1
    container_name: classeo_glitchtip
    depends_on:
      glitchtip-db:
        condition: service_healthy
      glitchtip-redis:
        condition: service_healthy
    environment:
      # NOTE(review): dev-only credentials baked into the URL — fine for local
      # compose, must come from secrets in production.
      DATABASE_URL: postgresql://glitchtip:glitchtip@glitchtip-db:5432/glitchtip
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change_me_in_production_very_secret_key}
      REDIS_URL: redis://glitchtip-redis:6379/0
      GLITCHTIP_DOMAIN: ${GLITCHTIP_DOMAIN:-http://localhost:8081}
      DEFAULT_FROM_EMAIL: ${DEFAULT_FROM_EMAIL:-glitchtip@classeo.local}
      EMAIL_URL: ${EMAIL_URL:-smtp://mailpit:1025}
      CELERY_WORKER_AUTOSCALE: "1,3"
      CELERY_WORKER_MAX_TASKS_PER_CHILD: "10000"
      ENABLE_ORGANIZATION_CREATION: "true"
      ENABLE_USER_REGISTRATION: "true"
    ports:
      - "8081:8080"
    healthcheck:
      # The image ships no curl/wget; probe the internal health endpoint
      # with the Python interpreter already present in the image.
      test: ["CMD-SHELL", "python -c \"import urllib.request; urllib.request.urlopen('http://localhost:8080/_health/')\""]
      interval: 30s
      timeout: 10s
      retries: 5
      start_period: 60s
    restart: unless-stopped

  # Celery worker + beat scheduler for GlitchTip background jobs
  # (event ingestion, digests, cleanup). Shares DB/Redis with the web service.
  glitchtip-worker:
    image: glitchtip/glitchtip:v4.1
    container_name: classeo_glitchtip_worker
    depends_on:
      glitchtip-db:
        condition: service_healthy
      glitchtip-redis:
        condition: service_healthy
    environment:
      DATABASE_URL: postgresql://glitchtip:glitchtip@glitchtip-db:5432/glitchtip
      SECRET_KEY: ${GLITCHTIP_SECRET_KEY:-change_me_in_production_very_secret_key}
      REDIS_URL: redis://glitchtip-redis:6379/0
    command: ./bin/run-celery-with-beat.sh
    restart: unless-stopped

  glitchtip-db:
    image: postgres:18.1-alpine
    container_name: classeo_glitchtip_db
    environment:
      POSTGRES_DB: glitchtip
      POSTGRES_USER: glitchtip
      POSTGRES_PASSWORD: glitchtip
    volumes:
      - glitchtip_postgres_data:/var/lib/postgresql/data
    healthcheck:
      test: ["CMD-SHELL", "pg_isready -U glitchtip -d glitchtip"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s
    restart: unless-stopped

  glitchtip-redis:
    image: redis:7.4-alpine
    container_name: classeo_glitchtip_redis
    # AOF persistence + bounded memory with LRU eviction (cache/queue workload)
    command: redis-server --appendonly yes --maxmemory 128mb --maxmemory-policy allkeys-lru
    volumes:
      - glitchtip_redis_data:/data
    healthcheck:
      test: ["CMD", "redis-cli", "ping"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 5s
    restart: unless-stopped

  # ===========================================================================
  # METRICS - Prometheus
  # ===========================================================================
  prometheus:
    image: prom/prometheus:v3.2.0
    container_name: classeo_prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--storage.tsdb.retention.time=15d'
      # NOTE(review): enables the unauthenticated /-/reload and /-/quit HTTP
      # endpoints — acceptable on a private network; do not expose publicly.
      - '--web.enable-lifecycle'
    volumes:
      - ./monitoring/prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro
      - ./monitoring/prometheus/alerts.yml:/etc/prometheus/alerts.yml:ro
      - prometheus_data:/prometheus
    ports:
      - "9090:9090"
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:9090/-/healthy"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s
    restart: unless-stopped

  # ===========================================================================
  # DASHBOARDS - Grafana
  # ===========================================================================
  grafana:
    image: grafana/grafana:11.4.0
    container_name: classeo_grafana
    environment:
      GF_SECURITY_ADMIN_USER: ${GRAFANA_ADMIN_USER:-admin}
      GF_SECURITY_ADMIN_PASSWORD: ${GRAFANA_ADMIN_PASSWORD:-admin}
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_ROOT_URL: ${GRAFANA_ROOT_URL:-http://localhost:3001}
    volumes:
      # Provisioned datasources/dashboards are read-only; runtime state persists
      # in the named volume.
      - ./monitoring/grafana/provisioning:/etc/grafana/provisioning:ro
      - grafana_data:/var/lib/grafana
    ports:
      - "3001:3000"
    depends_on:
      prometheus:
        condition: service_healthy
      loki:
        condition: service_healthy
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3000/api/health"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    restart: unless-stopped

  # ===========================================================================
  # LOGS - Loki
  # ===========================================================================
  loki:
    image: grafana/loki:3.3.2
    container_name: classeo_loki
    command: -config.file=/etc/loki/config.yml
    volumes:
      - ./monitoring/loki/config.yml:/etc/loki/config.yml:ro
      - loki_data:/loki
    ports:
      - "3100:3100"
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:3100/ready"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 30s
    restart: unless-stopped

  # ===========================================================================
  # LOG COLLECTOR - Promtail
  # ===========================================================================
  promtail:
    image: grafana/promtail:3.3.2
    container_name: classeo_promtail
    command: -config.file=/etc/promtail/config.yml
    volumes:
      - ./monitoring/promtail/config.yml:/etc/promtail/config.yml:ro
      # Read-only Docker socket so Promtail can discover container logs.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    depends_on:
      loki:
        condition: service_healthy
    restart: unless-stopped

  # ===========================================================================
  # ALERTING - Alertmanager
  # ===========================================================================
  alertmanager:
    image: prom/alertmanager:v0.28.0
    container_name: classeo_alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'
    volumes:
      - ./monitoring/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro
      - alertmanager_data:/alertmanager
    ports:
      - "9093:9093"
    healthcheck:
      test: ["CMD", "wget", "-q", "--spider", "http://localhost:9093/-/healthy"]
      interval: 10s
      timeout: 5s
      retries: 5
      start_period: 10s
    restart: unless-stopped

# =============================================================================
# PERSISTENT MONITORING VOLUMES
# =============================================================================
volumes:
  glitchtip_postgres_data:
  glitchtip_redis_data:
  prometheus_data:
  grafana_data:
  loki_data:
  alertmanager_data: