feat: Observabilité et monitoring complet
Implémentation complète de la stack d'observabilité pour le monitoring de la plateforme multi-tenant Classeo. ## Error Tracking (GlitchTip) - Intégration Sentry SDK avec GlitchTip auto-hébergé - Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français) - Contexte enrichi: tenant_id, user_id, correlation_id - Configuration backend (sentry.yaml) et frontend (sentry.ts) ## Metrics (Prometheus) - Endpoint /metrics avec restriction IP en production - Métriques HTTP: requests_total, request_duration_seconds (histogramme) - Métriques sécurité: login_failures_total par tenant - Métriques santé: health_check_status (postgres, redis, rabbitmq) - Storage Redis pour persistance entre requêtes ## Logs (Loki) - Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor - Détection PII: emails, téléphones FR, tokens JWT, NIR français - Labels structurés: tenant_id, correlation_id, level ## Dashboards (Grafana) - Dashboard principal: latence P50/P95/P99, error rate, RPS - Dashboard par tenant: métriques isolées par sous-domaine - Dashboard infrastructure: santé postgres/redis/rabbitmq - Datasources avec UIDs fixes pour portabilité ## Alertes (Alertmanager) - HighApiLatencyP95/P99: SLA monitoring (200ms/500ms) - HighErrorRate: error rate > 1% pendant 2 min - ExcessiveLoginFailures: détection brute force - ApplicationUnhealthy: health check failures ## Infrastructure - InfrastructureHealthChecker: service partagé (DRY) - HealthCheckController: endpoint /health pour load balancers - Pre-push hook: make ci && make e2e avant push
This commit is contained in:
95
monitoring/alertmanager/alertmanager.yml
Normal file
95
monitoring/alertmanager/alertmanager.yml
Normal file
@@ -0,0 +1,95 @@
|
||||
# Alertmanager Configuration for Classeo
|
||||
# NFR-OB2: Notification channels for SLA alerts
|
||||
|
||||
global:
  # Outgoing mail used by every email receiver. These values point at the
  # `mailpit` host over plain SMTP (TLS disabled); configure real SMTP
  # settings in production.
  smtp_from: 'alertmanager@classeo.local'
  smtp_smarthost: 'mailpit:1025'
  smtp_require_tls: false
  # An alert that carries no end time is declared resolved after this delay.
  resolve_timeout: 5m
|
||||
|
||||
# Templates for notification messages
|
||||
# Every *.tmpl file found in this directory is loaded as a template.
templates: ['/etc/alertmanager/templates/*.tmpl']
|
||||
|
||||
# Routing tree for alert handling
|
||||
route:
  # Fallback receiver for alerts that match no child route
  receiver: 'platform-team'
  # Batch together alerts that share the same alertname and severity
  group_by: ['alertname', 'severity']
  # Delay before the first notification of a new group
  group_wait: 30s
  # Delay before notifying about alerts added to an existing group
  group_interval: 5m
  # Delay before re-sending a notification for alerts that are still firing
  repeat_interval: 4h

  # Child routes are evaluated in order; the first match wins unless that
  # route sets `continue: true`.
  routes:
    # Critical alerts: faster first notification, more frequent reminders
    - receiver: 'platform-team-critical'
      # `matchers` replaces the deprecated `match` field (Alertmanager >= 0.22)
      matchers:
        - 'severity = "critical"'
      group_wait: 10s
      repeat_interval: 1h
      # Keep evaluating sibling routes so that an alert labelled both
      # severity=critical and team=security also reaches the security team
      # instead of stopping here.
      continue: true

    # Security alerts: route to security team
    - receiver: 'security-team'
      matchers:
        - 'team = "security"'
      group_wait: 30s
      repeat_interval: 2h
|
||||
|
||||
# Inhibition rules - suppress less severe alerts when critical alert is firing
|
||||
inhibit_rules:
  # Mute a warning while a critical alert carrying the same `alertname` and
  # `instance` label values is firing.
  # `source_matchers` / `target_matchers` replace the deprecated
  # `source_match` / `target_match` fields (Alertmanager >= 0.22).
  - source_matchers:
      - 'severity = "critical"'
    target_matchers:
      - 'severity = "warning"'
    equal: ['alertname', 'instance']
|
||||
|
||||
# Notification receivers
|
||||
receivers:
  # NOTE(review): the commented-out examples below use ${VAR} placeholders.
  # Alertmanager is not known to expand environment variables in this file;
  # render the file at deploy time or use the *_file options — confirm
  # before enabling them.

  # Default receiver, referenced by the root route
  - name: 'platform-team'
    email_configs:
      - send_resolved: true
        to: 'platform@classeo.local'
    # Slack integration (configure webhook in production)
    # slack_configs:
    #   - api_url: '${SLACK_WEBHOOK_URL}'
    #     channel: '#platform-alerts'
    #     send_resolved: true
    #     title: '{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  # Receiver for alerts routed with severity: critical
  - name: 'platform-team-critical'
    email_configs:
      - send_resolved: true
        to: 'platform-critical@classeo.local'
    # Slack integration for critical alerts
    # slack_configs:
    #   - api_url: '${SLACK_WEBHOOK_URL}'
    #     channel: '#platform-critical'
    #     send_resolved: true
    #     title: ':rotating_light: CRITICAL: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
    # PagerDuty integration (configure in production)
    # pagerduty_configs:
    #   - service_key: '${PAGERDUTY_SERVICE_KEY}'
    #     severity: critical

  # Receiver for alerts routed with team: security
  - name: 'security-team'
    email_configs:
      - send_resolved: true
        to: 'security@classeo.local'
    # Slack integration for security alerts
    # slack_configs:
    #   - api_url: '${SLACK_SECURITY_WEBHOOK_URL}'
    #     channel: '#security-alerts'
    #     send_resolved: true
    #     title: ':lock: Security Alert: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
|
||||
Reference in New Issue
Block a user