feat: Observabilité et monitoring complet

Implémentation complète de la stack d'observabilité pour le monitoring
de la plateforme multi-tenant Classeo.

## Error Tracking (GlitchTip)
- Intégration Sentry SDK avec GlitchTip auto-hébergé
- Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français)
- Contexte enrichi: tenant_id, user_id, correlation_id
- Configuration backend (sentry.yaml) et frontend (sentry.ts)

## Metrics (Prometheus)
- Endpoint /metrics avec restriction IP en production
- Métriques HTTP: requests_total, request_duration_seconds (histogramme)
- Métriques sécurité: login_failures_total par tenant
- Métriques santé: health_check_status (postgres, redis, rabbitmq)
- Storage Redis pour persistance entre requêtes

## Logs (Loki)
- Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor
- Détection PII: emails, téléphones FR, tokens JWT, NIR français
- Labels structurés: tenant_id, correlation_id, level

## Dashboards (Grafana)
- Dashboard principal: latence P50/P95/P99, error rate, RPS
- Dashboard par tenant: métriques isolées par sous-domaine
- Dashboard infrastructure: santé postgres/redis/rabbitmq
- Datasources avec UIDs fixes pour portabilité

## Alertes (Alertmanager)
- HighApiLatencyP95/P99: SLA monitoring (200ms/500ms)
- HighErrorRate: error rate > 1% pendant 2 min
- ExcessiveLoginFailures: détection brute force
- ApplicationUnhealthy: health check failures

## Infrastructure
- InfrastructureHealthChecker: service partagé (DRY)
- HealthCheckController: endpoint /health pour load balancers
- Pre-push hook: make ci && make e2e avant push
This commit is contained in:
2026-02-04 11:47:01 +01:00
parent 2ed60fdcc1
commit d3c6773be5
48 changed files with 5846 additions and 32 deletions

View File

@@ -0,0 +1,466 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "SLA Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 150 },
{ "color": "red", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "A"
}
],
"title": "API Latency P95",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 350 },
{ "color": "red", "value": 500 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P99",
"refId": "A"
}
],
"title": "API Latency P99",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 1 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "(sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"5..\"}[5m])) or vector(0)) / sum(rate(classeo_http_requests_total{job=\"classeo-backend\"}[5m])) * 100",
"legendFormat": "Error Rate",
"refId": "A"
}
],
"title": "Error Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "reqps"
}
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\"}[5m]))",
"legendFormat": "RPS",
"refId": "A"
}
],
"title": "Requests/Second",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 6,
"panels": [],
"title": "Request Metrics",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "line" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 7,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P99",
"refId": "C"
}
],
"title": "API Latency Distribution",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" }
},
"mappings": [],
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 8,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"2..\"}[5m])) by (status)",
"legendFormat": "{{ status }}",
"refId": "A"
},
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"4..\"}[5m])) by (status)",
"legendFormat": "{{ status }}",
"refId": "B"
},
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"5..\"}[5m])) by (status)",
"legendFormat": "{{ status }}",
"refId": "C"
}
],
"title": "Requests by Status Code",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 9,
"panels": [],
"title": "Infrastructure Health",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "DOWN" }, "1": { "color": "green", "index": 0, "text": "UP" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 15 },
"id": 10,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "up{job=\"classeo-backend\"}",
"legendFormat": "Backend",
"refId": "A"
}
],
"title": "Backend",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "UNHEALTHY" }, "1": { "color": "green", "index": 0, "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 15 },
"id": 11,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "classeo_health_check_status{service=\"postgres\"}",
"legendFormat": "PostgreSQL",
"refId": "A"
}
],
"title": "PostgreSQL",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "UNHEALTHY" }, "1": { "color": "green", "index": 0, "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 15 },
"id": 12,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "classeo_health_check_status{service=\"redis\"}",
"legendFormat": "Redis",
"refId": "A"
}
],
"title": "Redis",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "UNHEALTHY" }, "1": { "color": "green", "index": 0, "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 15 },
"id": 13,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "classeo_health_check_status{service=\"rabbitmq\"}",
"legendFormat": "RabbitMQ",
"refId": "A"
}
],
"title": "RabbitMQ",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 19 },
"id": 14,
"panels": [],
"title": "Logs",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 20 },
"id": 15,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "{service=\"php\"} |= ``",
"legendFormat": "",
"refId": "A"
}
],
"title": "Backend Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["classeo", "sla", "overview"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(classeo_http_requests_total, tenant_id)",
"hide": 0,
"includeAll": true,
"label": "Tenant",
"multi": true,
"name": "tenant_id",
"options": [],
"query": { "qryType": 1, "query": "label_values(classeo_http_requests_total, tenant_id)", "refId": "PrometheusVariableQueryEditor-VariableQuery" },
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Classeo - Main Dashboard",
"uid": "classeo-main",
"version": 1,
"weekStart": ""
}