Implémentation complète de la stack d'observabilité pour le monitoring de la plateforme multi-tenant Classeo.

## Error Tracking (GlitchTip)
- Intégration Sentry SDK avec GlitchTip auto-hébergé
- Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français)
- Contexte enrichi: tenant_id, user_id, correlation_id
- Configuration backend (sentry.yaml) et frontend (sentry.ts)

## Metrics (Prometheus)
- Endpoint /metrics avec restriction IP en production
- Métriques HTTP: requests_total, request_duration_seconds (histogramme)
- Métriques sécurité: login_failures_total par tenant
- Métriques santé: health_check_status (postgres, redis, rabbitmq)
- Storage Redis pour persistance entre requêtes

## Logs (Loki)
- Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor
- Détection PII: emails, téléphones FR, tokens JWT, NIR français
- Labels structurés: tenant_id, correlation_id, level

## Dashboards (Grafana)
- Dashboard principal: latence P50/P95/P99, error rate, RPS
- Dashboard par tenant: métriques isolées par sous-domaine
- Dashboard infrastructure: santé postgres/redis/rabbitmq
- Datasources avec UIDs fixes pour portabilité

## Alertes (Alertmanager)
- HighApiLatencyP95/P99: SLA monitoring (200ms/500ms)
- HighErrorRate: error rate > 1% pendant 2 min
- ExcessiveLoginFailures: détection brute force
- ApplicationUnhealthy: health check failures

## Infrastructure
- InfrastructureHealthChecker: service partagé (DRY)
- HealthCheckController: endpoint /health pour load balancers
- Pre-push hook: make ci && make e2e avant push
96 lines
2.9 KiB
YAML
96 lines
2.9 KiB
YAML
# Alertmanager Configuration for Classeo
|
|
# NFR-OB2: Notification channels for SLA alerts
|
|
|
|
global:
  resolve_timeout: 5m

  # Outgoing mail settings used by every email_configs entry below.
  # The values here must be replaced with the real relay in production.
  # NOTE(review): 'mailpit:1025' with TLS disabled looks like a local
  # mail catcher for development; confirm before deploying.
  smtp_smarthost: 'mailpit:1025'
  smtp_from: 'alertmanager@classeo.local'
  smtp_require_tls: false
|
|
|
|
# Template files used to render notification messages
templates:
  - '/etc/alertmanager/templates/*.tmpl'
|
|
|
|
# Routing tree: decides which receiver is notified for each alert
route:
  # Receiver used when no child route matches
  receiver: platform-team

  # Alerts sharing the same alertname and severity are grouped together
  group_by:
    - alertname
    - severity

  # Wait time before sending the first notification for a group
  group_wait: 30s
  # Wait time before sending the next batch for a group
  group_interval: 5m
  # Wait time before resending the same alert
  repeat_interval: 4h

  # Child routes for specific teams
  routes:
    # Critical alerts: shorter initial wait and more frequent reminders
    - receiver: platform-team-critical
      match:
        severity: critical
      group_wait: 10s
      repeat_interval: 1h
      # Keep evaluating the sibling routes after this one matches, so that an
      # alert labelled both severity=critical and team=security is also
      # delivered to the security team. Without this, evaluation stops at
      # the first matching route and the security route is never reached.
      continue: true

    # Security alerts: route to the security team
    - receiver: security-team
      match:
        team: security
      group_wait: 30s
      repeat_interval: 2h
|
|
|
|
# Inhibition rules: while a critical alert is firing, suppress warning
# alerts that carry the same alertname and instance labels.
inhibit_rules:
  - source_match:
      severity: critical
    target_match:
      severity: warning
    equal:
      - alertname
      - instance
|
|
|
|
# Notification receivers (the names are referenced by the routing tree)
# NOTE(review): the commented-out examples below use ${...} placeholders;
# presumably the deployment tooling must render them before Alertmanager
# loads this file - confirm before enabling any of them.
receivers:
  # Platform team: default destination
  - name: platform-team
    email_configs:
      - to: 'platform@classeo.local'
        send_resolved: true
    # Slack integration (configure webhook in production)
    # slack_configs:
    #   - api_url: '${SLACK_WEBHOOK_URL}'
    #     channel: '#platform-alerts'
    #     send_resolved: true
    #     title: '{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  # Platform team, critical alerts: higher priority mailbox
  - name: platform-team-critical
    email_configs:
      - to: 'platform-critical@classeo.local'
        send_resolved: true
    # Slack integration for critical alerts
    # slack_configs:
    #   - api_url: '${SLACK_WEBHOOK_URL}'
    #     channel: '#platform-critical'
    #     send_resolved: true
    #     title: ':rotating_light: CRITICAL: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
    # PagerDuty integration (configure in production)
    # pagerduty_configs:
    #   - service_key: '${PAGERDUTY_SERVICE_KEY}'
    #     severity: critical

  # Security team
  - name: security-team
    email_configs:
      - to: 'security@classeo.local'
        send_resolved: true
    # Slack integration for security alerts
    # slack_configs:
    #   - api_url: '${SLACK_SECURITY_WEBHOOK_URL}'
    #     channel: '#security-alerts'
    #     send_resolved: true
    #     title: ':lock: Security Alert: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
|