Files
Mathias STRASSER d3c6773be5 feat: Observabilité et monitoring complet
Implémentation complète de la stack d'observabilité pour le monitoring
de la plateforme multi-tenant Classeo.

## Error Tracking (GlitchTip)
- Intégration Sentry SDK avec GlitchTip auto-hébergé
- Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français)
- Contexte enrichi: tenant_id, user_id, correlation_id
- Configuration backend (sentry.yaml) et frontend (sentry.ts)

## Metrics (Prometheus)
- Endpoint /metrics avec restriction IP en production
- Métriques HTTP: requests_total, request_duration_seconds (histogramme)
- Métriques sécurité: login_failures_total par tenant
- Métriques santé: health_check_status (postgres, redis, rabbitmq)
- Storage Redis pour persistance entre requêtes

## Logs (Loki)
- Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor
- Détection PII: emails, téléphones FR, tokens JWT, NIR français
- Labels structurés: tenant_id, correlation_id, level

## Dashboards (Grafana)
- Dashboard principal: latence P50/P95/P99, error rate, RPS
- Dashboard par tenant: métriques isolées par sous-domaine
- Dashboard infrastructure: santé postgres/redis/rabbitmq
- Datasources avec UIDs fixes pour portabilité

## Alertes (Alertmanager)
- HighApiLatencyP95/P99: SLA monitoring (200ms/500ms)
- HighErrorRate: error rate > 1% pendant 2 min
- ExcessiveLoginFailures: détection brute force
- ApplicationUnhealthy: health check failures

## Infrastructure
- InfrastructureHealthChecker: service partagé (DRY)
- HealthCheckController: endpoint /health pour load balancers
- Pre-push hook: make ci && make e2e avant push
2026-02-04 12:59:12 +01:00

355 lines
10 KiB
JSON

{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": false,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "Main Dashboard",
"tooltip": "",
"type": "link",
"url": "/d/classeo-main"
}
],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "Tenant: $tenant_id",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 150 },
{ "color": "red", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "A"
}
],
"title": "Tenant P95 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 1 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "(sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\",status=~\"5..\"}[5m])) or vector(0)) / sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\"}[5m])) * 100",
"legendFormat": "Error Rate",
"refId": "A"
}
],
"title": "Tenant Error Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "reqps"
}
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\"}[5m]))",
"legendFormat": "RPS",
"refId": "A"
}
],
"title": "Tenant RPS",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "none"
}
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "(sum(rate(classeo_login_failures_total{tenant_id=\"$tenant_id\"}[5m])) or vector(0)) * 60",
"legendFormat": "Failed Logins/min",
"refId": "A"
}
],
"title": "Login Failures/min",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 6,
"panels": [],
"title": "Request Metrics",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "line" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 7,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P99",
"refId": "C"
}
],
"title": "Latency Distribution",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" }
},
"mappings": [],
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 8,
"options": {
"legend": { "calcs": ["mean", "sum"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\"}[5m])) by (route)",
"legendFormat": "{{ route }}",
"refId": "A"
}
],
"title": "Requests by Route",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 9,
"panels": [],
"title": "Logs",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 15 },
"id": 10,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "{tenant_id=\"$tenant_id\"}",
"legendFormat": "",
"refId": "A"
}
],
"title": "Tenant Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["classeo", "tenant", "multi-tenant"],
"templating": {
"list": [
{
"current": {},
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(classeo_http_requests_total, tenant_id)",
"hide": 0,
"includeAll": false,
"label": "Tenant",
"multi": false,
"name": "tenant_id",
"options": [],
"query": { "qryType": 1, "query": "label_values(classeo_http_requests_total, tenant_id)", "refId": "PrometheusVariableQueryEditor-VariableQuery" },
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Classeo - Per Tenant",
"uid": "classeo-tenant",
"version": 1,
"weekStart": ""
}