feat: Observabilité et monitoring complet

Implémentation complète de la stack d'observabilité pour le monitoring
de la plateforme multi-tenant Classeo.

## Error Tracking (GlitchTip)
- Intégration Sentry SDK avec GlitchTip auto-hébergé
- Scrubber PII avant envoi (RGPD: emails, tokens JWT, NIR français)
- Contexte enrichi: tenant_id, user_id, correlation_id
- Configuration backend (sentry.yaml) et frontend (sentry.ts)

## Metrics (Prometheus)
- Endpoint /metrics avec restriction IP en production
- Métriques HTTP: requests_total, request_duration_seconds (histogramme)
- Métriques sécurité: login_failures_total par tenant
- Métriques santé: health_check_status (postgres, redis, rabbitmq)
- Storage Redis pour persistance entre requêtes

## Logs (Loki)
- Processors Monolog: CorrelationIdLogProcessor, PiiScrubberLogProcessor
- Détection PII: emails, téléphones FR, tokens JWT, NIR français
- Labels structurés: tenant_id, correlation_id, level

## Dashboards (Grafana)
- Dashboard principal: latence P50/P95/P99, error rate, RPS
- Dashboard par tenant: métriques isolées par sous-domaine
- Dashboard infrastructure: santé postgres/redis/rabbitmq
- Datasources avec UIDs fixes pour portabilité

## Alertes (Alertmanager)
- HighApiLatencyP95/P99: SLA monitoring (200ms/500ms)
- HighErrorRate: error rate > 1% pendant 2 min
- ExcessiveLoginFailures: détection brute force
- ApplicationUnhealthy: health check failures

## Infrastructure
- InfrastructureHealthChecker: service partagé (DRY)
- HealthCheckController: endpoint /health pour load balancers
- Pre-push hook: make ci && make e2e avant push
This commit is contained in:
2026-02-04 11:47:01 +01:00
parent 2ed60fdcc1
commit d3c6773be5
48 changed files with 5846 additions and 32 deletions

View File

@@ -0,0 +1,95 @@
# Alertmanager Configuration for Classeo
# NFR-OB2: Notification channels for SLA alerts
#
# Uses the `matchers` syntax (Alertmanager >= 0.22); the older `match`,
# `source_match` and `target_match` keys are deprecated.
global:
  resolve_timeout: 5m
  # SMTP settings for email alerts (configure in production)
  smtp_smarthost: 'mailpit:1025'
  smtp_from: 'alertmanager@classeo.local'
  smtp_require_tls: false

# Templates for notification messages
templates:
  - '/etc/alertmanager/templates/*.tmpl'

# Routing tree for alert handling
route:
  # Default receiver
  receiver: 'platform-team'
  # Group alerts by alertname and severity
  group_by: ['alertname', 'severity']
  # Wait time before sending first notification
  group_wait: 30s
  # Wait time before sending next batch
  group_interval: 5m
  # Wait time before resending same alert
  repeat_interval: 4h
  # Child routes for specific teams.
  # Routes are tried in order and the first match wins (no `continue: true`),
  # so an alert labelled both severity=critical and team=security is delivered
  # to 'platform-team-critical' only.
  routes:
    # Critical alerts: immediate notification
    - receiver: 'platform-team-critical'
      matchers:
        - 'severity="critical"'
      group_wait: 10s
      repeat_interval: 1h
    # Security alerts: route to security team
    - receiver: 'security-team'
      matchers:
        - 'team="security"'
      group_wait: 30s
      repeat_interval: 2h

# Inhibition rules - suppress less severe alerts when critical alert is firing
# NOTE(review): the rule requires identical `alertname` on both alerts, but the
# alert rules shipped with this stack never use one alert name at two
# severities, so this rule currently cannot suppress anything. Confirm the
# intended `equal` label set.
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname', 'instance']

# Notification receivers
receivers:
  # Default platform team receiver
  - name: 'platform-team'
    email_configs:
      - to: 'platform@classeo.local'
        send_resolved: true
    # Slack integration (configure webhook in production)
    # slack_configs:
    #   - api_url: '${SLACK_WEBHOOK_URL}'
    #     channel: '#platform-alerts'
    #     send_resolved: true
    #     title: '{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

  # Critical alerts - higher priority
  - name: 'platform-team-critical'
    email_configs:
      - to: 'platform-critical@classeo.local'
        send_resolved: true
    # Slack integration for critical alerts
    # slack_configs:
    #   - api_url: '${SLACK_WEBHOOK_URL}'
    #     channel: '#platform-critical'
    #     send_resolved: true
    #     title: ':rotating_light: CRITICAL: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
    # PagerDuty integration (configure in production)
    # pagerduty_configs:
    #   - service_key: '${PAGERDUTY_SERVICE_KEY}'
    #     severity: critical

  # Security team receiver
  - name: 'security-team'
    email_configs:
      - to: 'security@classeo.local'
        send_resolved: true
    # Slack integration for security alerts
    # slack_configs:
    #   - api_url: '${SLACK_SECURITY_WEBHOOK_URL}'
    #     channel: '#security-alerts'
    #     send_resolved: true
    #     title: ':lock: Security Alert: {{ .CommonLabels.alertname }}'
    #     text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'

View File

@@ -0,0 +1,16 @@
# Grafana dashboard provider
# Loads every dashboard JSON file found under `options.path` into the
# "Classeo" folder and re-reads the directory every 30 seconds.
apiVersion: 1
providers:
  - name: "Classeo Dashboards"
    type: file
    orgId: 1
    folder: "Classeo"
    folderUid: "classeo"
    options:
      path: /etc/grafana/provisioning/dashboards/json
    updateIntervalSeconds: 30
    allowUiUpdates: true
    disableDeletion: false

View File

@@ -0,0 +1,466 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "SLA Overview",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 150 },
{ "color": "red", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "A"
}
],
"title": "API Latency P95",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 350 },
{ "color": "red", "value": 500 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P99",
"refId": "A"
}
],
"title": "API Latency P99",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 1 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"5..\"}[5m])) / sum(rate(classeo_http_requests_total{job=\"classeo-backend\"}[5m])) * 100",
"legendFormat": "Error Rate",
"refId": "A"
}
],
"title": "Error Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "reqps"
}
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\"}[5m]))",
"legendFormat": "RPS",
"refId": "A"
}
],
"title": "Requests/Second",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 6,
"panels": [],
"title": "Request Metrics",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "line" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 7,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{job=\"classeo-backend\"}[5m])) by (le)) * 1000",
"legendFormat": "P99",
"refId": "C"
}
],
"title": "API Latency Distribution",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" }
},
"mappings": [],
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 8,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"2..\"}[5m])) by (status)",
"legendFormat": "{{ status }}",
"refId": "A"
},
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"4..\"}[5m])) by (status)",
"legendFormat": "{{ status }}",
"refId": "B"
},
{
"expr": "sum(rate(classeo_http_requests_total{job=\"classeo-backend\",status=~\"5..\"}[5m])) by (status)",
"legendFormat": "{{ status }}",
"refId": "C"
}
],
"title": "Requests by Status Code",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 9,
"panels": [],
"title": "Infrastructure Health",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "DOWN" }, "1": { "color": "green", "index": 0, "text": "UP" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 0, "y": 15 },
"id": 10,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "up{job=\"classeo-backend\"}",
"legendFormat": "Backend",
"refId": "A"
}
],
"title": "Backend",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "UNHEALTHY" }, "1": { "color": "green", "index": 0, "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 15 },
"id": 11,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "classeo_health_check_status{service=\"postgres\"}",
"legendFormat": "PostgreSQL",
"refId": "A"
}
],
"title": "PostgreSQL",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "UNHEALTHY" }, "1": { "color": "green", "index": 0, "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 8, "y": 15 },
"id": 12,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "classeo_health_check_status{service=\"redis\"}",
"legendFormat": "Redis",
"refId": "A"
}
],
"title": "Redis",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [
{ "options": { "0": { "color": "red", "index": 1, "text": "UNHEALTHY" }, "1": { "color": "green", "index": 0, "text": "HEALTHY" } }, "type": "value" }
],
"thresholds": { "mode": "absolute", "steps": [{ "color": "red", "value": null }, { "color": "green", "value": 1 }] }
}
},
"gridPos": { "h": 4, "w": 4, "x": 12, "y": 15 },
"id": 13,
"options": { "colorMode": "background", "graphMode": "none", "justifyMode": "auto", "orientation": "auto", "reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false }, "textMode": "auto" },
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "classeo_health_check_status{service=\"rabbitmq\"}",
"legendFormat": "RabbitMQ",
"refId": "A"
}
],
"title": "RabbitMQ",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 19 },
"id": 14,
"panels": [],
"title": "Logs",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 8, "w": 24, "x": 0, "y": 20 },
"id": 15,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "{service=\"php\"} |= ``",
"legendFormat": "",
"refId": "A"
}
],
"title": "Backend Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["classeo", "sla", "overview"],
"templating": {
"list": [
{
"current": { "selected": false, "text": "All", "value": "$__all" },
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(classeo_http_requests_total, tenant_id)",
"hide": 0,
"includeAll": true,
"label": "Tenant",
"multi": true,
"name": "tenant_id",
"options": [],
"query": { "qryType": 1, "query": "label_values(classeo_http_requests_total, tenant_id)", "refId": "PrometheusVariableQueryEditor-VariableQuery" },
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Classeo - Main Dashboard",
"uid": "classeo-main",
"version": 1,
"weekStart": ""
}

View File

@@ -0,0 +1,354 @@
{
"annotations": {
"list": []
},
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 0,
"id": null,
"links": [
{
"asDropdown": false,
"icon": "external link",
"includeVars": false,
"keepTime": true,
"tags": [],
"targetBlank": true,
"title": "Main Dashboard",
"tooltip": "",
"type": "link",
"url": "/d/classeo-main"
}
],
"liveNow": false,
"panels": [
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 },
"id": 1,
"panels": [],
"title": "Tenant: $tenant_id",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 150 },
{ "color": "red", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 },
"id": 2,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "A"
}
],
"title": "Tenant P95 Latency",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 0.5 },
{ "color": "red", "value": 1 }
]
},
"unit": "percent"
}
},
"gridPos": { "h": 4, "w": 6, "x": 6, "y": 1 },
"id": 3,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\",status=~\"5..\"}[5m])) / sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\"}[5m])) * 100",
"legendFormat": "Error Rate",
"refId": "A"
}
],
"title": "Tenant Error Rate",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "reqps"
}
},
"gridPos": { "h": 4, "w": 6, "x": 12, "y": 1 },
"id": 4,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\"}[5m]))",
"legendFormat": "RPS",
"refId": "A"
}
],
"title": "Tenant RPS",
"type": "stat"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "thresholds" },
"mappings": [],
"thresholds": { "mode": "absolute", "steps": [{ "color": "green", "value": null }] },
"unit": "none"
}
},
"gridPos": { "h": 4, "w": 6, "x": 18, "y": 1 },
"id": 5,
"options": {
"colorMode": "value",
"graphMode": "area",
"justifyMode": "auto",
"orientation": "auto",
"reduceOptions": { "calcs": ["lastNotNull"], "fields": "", "values": false },
"textMode": "auto"
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_login_failures_total{tenant_id=\"$tenant_id\"}[5m])) * 60",
"legendFormat": "Failed Logins/min",
"refId": "A"
}
],
"title": "Login Failures/min",
"type": "stat"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 },
"id": 6,
"panels": [],
"title": "Request Metrics",
"type": "row"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "none" },
"thresholdsStyle": { "mode": "line" }
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{ "color": "green", "value": null },
{ "color": "yellow", "value": 200 }
]
},
"unit": "ms"
}
},
"gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 },
"id": 7,
"options": {
"legend": { "calcs": ["mean", "max"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "histogram_quantile(0.50, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P50",
"refId": "A"
},
{
"expr": "histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P95",
"refId": "B"
},
{
"expr": "histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{tenant_id=\"$tenant_id\"}[5m])) by (le)) * 1000",
"legendFormat": "P99",
"refId": "C"
}
],
"title": "Latency Distribution",
"type": "timeseries"
},
{
"datasource": { "type": "prometheus", "uid": "prometheus" },
"fieldConfig": {
"defaults": {
"color": { "mode": "palette-classic" },
"custom": {
"axisBorderShow": false,
"axisCenteredZero": false,
"axisColorMode": "text",
"axisLabel": "",
"axisPlacement": "auto",
"barAlignment": 0,
"drawStyle": "line",
"fillOpacity": 10,
"gradientMode": "none",
"hideFrom": { "legend": false, "tooltip": false, "viz": false },
"insertNulls": false,
"lineInterpolation": "linear",
"lineWidth": 1,
"pointSize": 5,
"scaleDistribution": { "type": "linear" },
"showPoints": "never",
"spanNulls": false,
"stacking": { "group": "A", "mode": "normal" }
},
"mappings": [],
"unit": "reqps"
}
},
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 },
"id": 8,
"options": {
"legend": { "calcs": ["mean", "sum"], "displayMode": "table", "placement": "bottom", "showLegend": true },
"tooltip": { "mode": "multi", "sort": "desc" }
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "sum(rate(classeo_http_requests_total{tenant_id=\"$tenant_id\"}[5m])) by (route)",
"legendFormat": "{{ route }}",
"refId": "A"
}
],
"title": "Requests by Route",
"type": "timeseries"
},
{
"collapsed": false,
"gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 },
"id": 9,
"panels": [],
"title": "Logs",
"type": "row"
},
{
"datasource": { "type": "loki", "uid": "loki" },
"gridPos": { "h": 10, "w": 24, "x": 0, "y": 15 },
"id": 10,
"options": {
"dedupStrategy": "none",
"enableLogDetails": true,
"prettifyLogMessage": false,
"showCommonLabels": false,
"showLabels": false,
"showTime": true,
"sortOrder": "Descending",
"wrapLogMessage": false
},
"pluginVersion": "11.4.0",
"targets": [
{
"expr": "{tenant_id=\"$tenant_id\"}",
"legendFormat": "",
"refId": "A"
}
],
"title": "Tenant Logs",
"type": "logs"
}
],
"refresh": "30s",
"schemaVersion": 39,
"tags": ["classeo", "tenant", "multi-tenant"],
"templating": {
"list": [
{
"current": {},
"datasource": { "type": "prometheus", "uid": "prometheus" },
"definition": "label_values(classeo_http_requests_total, tenant_id)",
"hide": 0,
"includeAll": false,
"label": "Tenant",
"multi": false,
"name": "tenant_id",
"options": [],
"query": { "qryType": 1, "query": "label_values(classeo_http_requests_total, tenant_id)", "refId": "PrometheusVariableQueryEditor-VariableQuery" },
"refresh": 2,
"regex": "",
"skipUrlSync": false,
"sort": 1,
"type": "query"
}
]
},
"time": { "from": "now-1h", "to": "now" },
"timepicker": {},
"timezone": "browser",
"title": "Classeo - Per Tenant",
"uid": "classeo-tenant",
"version": 1,
"weekStart": ""
}

View File

@@ -0,0 +1,44 @@
# Grafana Datasources Provisioning
# Auto-configures the Prometheus, Loki and Alertmanager connections.
# UIDs are fixed so that dashboards can reference datasources by uid.
apiVersion: 1
datasources:
  # Prometheus - Metrics
  - name: Prometheus
    uid: prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: false
    jsonData:
      timeInterval: "15s"
      httpMethod: POST

  # Loki - Logs
  - name: Loki
    uid: loki
    type: loki
    access: proxy
    url: http://loki:3100
    editable: false
    jsonData:
      maxLines: 1000
      derivedFields:
        # Turn the correlation_id found in a log line into a link that
        # lists every log entry sharing that correlation_id.
        - name: correlation_id
          matcherRegex: '"correlation_id":"([^"]+)"'
          # Because `datasourceUid` is set this is an internal link, and
          # Grafana interprets `url` as the query to run against that
          # datasource, not as an HTTP URL. `$$` escapes `$` so the
          # provisioning loader does not treat it as an env variable.
          url: '{correlation_id="$${__value.raw}"}'
          datasourceUid: loki
          urlDisplayLabel: "View correlated logs"

  # Alertmanager
  - name: Alertmanager
    uid: alertmanager
    type: alertmanager
    access: proxy
    url: http://alertmanager:9093
    editable: false
    jsonData:
      implementation: prometheus

View File

@@ -0,0 +1,61 @@
# Loki Configuration for Classeo
# Single-instance setup: filesystem storage, in-memory ring, no auth.
# NFR-OB4: Log retention 30 days
auth_enabled: false

server:
  log_level: info
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: /loki
  instance_addr: 127.0.0.1
  replication_factor: 1
  ring:
    kvstore:
      store: inmemory
  storage:
    filesystem:
      chunks_directory: /loki/chunks
      rules_directory: /loki/rules

# Cache query results in process memory (capped at 100 MB)
query_range:
  results_cache:
    cache:
      embedded_cache:
        enabled: true
        max_size_mb: 100

# TSDB index on the local filesystem, one index table per day
schema_config:
  configs:
    - from: 2024-01-01
      schema: v13
      store: tsdb
      object_store: filesystem
      index:
        prefix: index_
        period: 24h

ruler:
  alertmanager_url: http://alertmanager:9093

# NFR-OB4: 30 days retention
limits_config:
  retention_period: 720h  # 30 days
  # One hour more than the retention window
  max_query_length: 721h
  max_query_parallelism: 32
  max_entries_limit_per_query: 10000
  ingestion_rate_mb: 4
  ingestion_burst_size_mb: 6

# Retention is switched on here; the period itself comes from limits_config
compactor:
  working_directory: /loki/compactor
  compaction_interval: 10m
  retention_enabled: true
  retention_delete_delay: 2h
  retention_delete_worker_count: 150
  delete_request_store: filesystem

analytics:
  reporting_enabled: false

View File

@@ -0,0 +1,143 @@
# Prometheus Alert Rules for Classeo
# NFR-OB2: Automated alerts when SLA threatened (< 5 min detection)
#
# Every rule carries `severity` and `team` labels; Alertmanager routes on
# them. Latency thresholds are expressed in seconds because the histogram
# is `*_duration_seconds`.
groups:
  # ===========================================================================
  # SLA & Performance Alerts
  # ===========================================================================
  - name: sla_alerts
    rules:
      # NFR-P4: API response time P95 < 200ms
      - alert: HighApiLatencyP95
        expr: histogram_quantile(0.95, sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m])) by (le)) > 0.2
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "API P95 latency above SLA threshold"
          description: "P95 latency is {{ $value | humanizeDuration }} (threshold: 200ms)"
          runbook_url: "https://docs.classeo.local/runbooks/high-latency"

      # NFR-P5: API response time P99 < 500ms
      - alert: HighApiLatencyP99
        expr: histogram_quantile(0.99, sum(rate(classeo_http_request_duration_seconds_bucket{job="classeo-backend"}[5m])) by (le)) > 0.5
        for: 5m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "API P99 latency critically high"
          description: "P99 latency is {{ $value | humanizeDuration }} (threshold: 500ms)"
          runbook_url: "https://docs.classeo.local/runbooks/high-latency"

      # Error rate > 1% (AC3: error rate > 1% for 2 min)
      # The ratio is a 0..1 fraction, hence the 0.01 threshold.
      - alert: HighErrorRate
        expr: sum(rate(classeo_http_requests_total{status=~"5.."}[2m])) / sum(rate(classeo_http_requests_total[2m])) > 0.01
        for: 2m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "High error rate detected"
          description: "Error rate is {{ $value | humanizePercentage }} (threshold: 1%)"
          runbook_url: "https://docs.classeo.local/runbooks/high-error-rate"

  # ===========================================================================
  # Infrastructure Alerts
  # ===========================================================================
  - name: infrastructure_alerts
    rules:
      # Redis memory usage
      # `redis_memory_max_bytes` is 0 when Redis has no maxmemory limit; the
      # division then yields +Inf and the alert would fire permanently, so
      # instances without a limit are excluded.
      # NOTE(review): the Prometheus scrape config in this stack defines no
      # redis_exporter job yet, so this rule has no series to evaluate until
      # one is added.
      - alert: RedisHighMemoryUsage
        expr: redis_memory_used_bytes / redis_memory_max_bytes > 0.8 and redis_memory_max_bytes > 0
        for: 5m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Redis memory usage above 80%"
          description: "Redis is using {{ $value | humanizePercentage }} of available memory"
          runbook_url: "https://docs.classeo.local/runbooks/redis-memory"

      # Database connection issues
      # NOTE(review): `pg_up` comes from postgres_exporter, which the
      # Prometheus scrape config in this stack does not scrape yet.
      - alert: DatabaseConnectionFailed
        expr: pg_up == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "PostgreSQL connection failed"
          description: "Cannot connect to PostgreSQL database"
          runbook_url: "https://docs.classeo.local/runbooks/database-down"

      # RabbitMQ queue backlog
      - alert: RabbitMQQueueBacklog
        expr: rabbitmq_queue_messages > 10000
        for: 10m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "RabbitMQ queue backlog growing"
          description: "Queue has {{ $value }} messages pending"
          runbook_url: "https://docs.classeo.local/runbooks/rabbitmq-backlog"

  # ===========================================================================
  # Security Alerts
  # ===========================================================================
  - name: security_alerts
    rules:
      # NFR-S2: Excessive login failures (potential brute force)
      # rate() returns failures per second; multiply by 60 so the threshold
      # is the "10 per minute" stated in the description.
      - alert: ExcessiveLoginFailures
        expr: sum(rate(classeo_login_failures_total[5m])) * 60 > 10
        for: 2m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Excessive login failures detected"
          description: "More than 10 failed logins per minute"
          runbook_url: "https://docs.classeo.local/runbooks/brute-force"

      # Per-tenant excessive login failures (same per-minute conversion)
      - alert: TenantExcessiveLoginFailures
        expr: sum by (tenant_id) (rate(classeo_login_failures_total[5m])) * 60 > 5
        for: 5m
        labels:
          severity: warning
          team: security
        annotations:
          summary: "Excessive login failures for tenant {{ $labels.tenant_id }}"
          description: "More than 5 failed logins per minute for single tenant"
          runbook_url: "https://docs.classeo.local/runbooks/brute-force"

  # ===========================================================================
  # Application Health Alerts
  # ===========================================================================
  - name: application_alerts
    rules:
      # Backend scrape target down
      - alert: ApplicationUnhealthy
        expr: up{job="classeo-backend"} == 0
        for: 1m
        labels:
          severity: critical
          team: platform
        annotations:
          summary: "Backend application is down"
          description: "Cannot scrape metrics from backend - application may be crashed or unreachable"
          runbook_url: "https://docs.classeo.local/runbooks/health-check"

      # Infrastructure service unhealthy (postgres, redis, rabbitmq)
      - alert: InfrastructureServiceUnhealthy
        expr: classeo_health_check_status == 0
        for: 2m
        labels:
          severity: warning
          team: platform
        annotations:
          summary: "Infrastructure service {{ $labels.service }} is unhealthy"
          description: "Health check for {{ $labels.service }} is failing"
          runbook_url: "https://docs.classeo.local/runbooks/degraded-mode"

View File

@@ -0,0 +1,52 @@
# Prometheus Configuration for Classeo
# Scrapes metrics from PHP backend and other services
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    # Prometheus expands only plain ${VAR} / $VAR references here; the shell
    # default form ${VAR:-default} is not supported and resolves to an empty
    # value. ENVIRONMENT must therefore be set in the container environment
    # (e.g. to "development").
    # NOTE(review): on Prometheus 2.x expansion also requires
    # --enable-feature=expand-external-labels - confirm the deployed version.
    environment: '${ENVIRONMENT}'
    project: classeo

# Alerting configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

# Load alert rules
rule_files:
  - /etc/prometheus/alerts.yml

# Scrape configurations
scrape_configs:
  # Prometheus self-monitoring
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # PHP Backend metrics
  - job_name: 'classeo-backend'
    metrics_path: '/metrics'
    static_configs:
      - targets: ['php:8000']
    relabel_configs:
      # Replace the host:port instance value with a fixed, readable name
      - source_labels: [__address__]
        target_label: instance
        replacement: 'classeo-backend'

  # Redis metrics (via redis_exporter would be added in production)
  # For now, we rely on application-level metrics

  # PostgreSQL metrics (via postgres_exporter would be added in production)
  # For now, we rely on application-level metrics

  # RabbitMQ metrics
  - job_name: 'rabbitmq'
    static_configs:
      - targets: ['rabbitmq:15692']
    relabel_configs:
      # Replace the host:port instance value with a fixed, readable name
      - source_labels: [__address__]
        target_label: instance
        replacement: 'classeo-rabbitmq'

View File

@@ -0,0 +1,72 @@
# Promtail Configuration for Classeo
# Collects logs from Docker containers and ships to Loki
server:
  http_listen_port: 9080
  grpc_listen_port: 0

positions:
  # NOTE(review): /tmp is usually not persisted across container restarts;
  # if read positions must survive a restart, point this at a mounted volume.
  filename: /tmp/positions.yaml

clients:
  - url: http://loki:3100/loki/api/v1/push

scrape_configs:
  # Docker container logs via Docker socket
  - job_name: docker
    docker_sd_configs:
      - host: unix:///var/run/docker.sock
        refresh_interval: 5s
    relabel_configs:
      # Only scrape classeo containers
      - source_labels: ['__meta_docker_container_name']
        regex: '/classeo_.*'
        action: keep
      # Extract container name as label
      - source_labels: ['__meta_docker_container_name']
        regex: '/classeo_(.*)'
        target_label: service
      # Add environment label
      # The ${VAR:-default} form is only substituted when Promtail runs with
      # -config.expand-env=true.
      - source_labels: []
        target_label: environment
        replacement: '${ENVIRONMENT:-development}'
      # Add project label
      - source_labels: []
        target_label: project
        replacement: classeo
    pipeline_stages:
      # Parse JSON logs from PHP backend.
      # No `source` is set, so the stage parses the log line itself. No
      # earlier stage extracts a `log` key, so `source: log` made this stage
      # a no-op and neither labels nor the timestamp were ever populated.
      - json:
          expressions:
            level: level
            message: message
            channel: channel
            correlation_id: extra.correlation_id
            tenant_id: extra.tenant_id
            user_id: context.user_id
            timestamp: datetime
      # Extract labels from parsed JSON (an empty value means "use the
      # extracted field of the same name").
      # NOTE(review): correlation_id is effectively unique per request, which
      # creates one Loki stream per value; the Grafana derived field queries
      # this label, so review both together before changing it.
      - labels:
          level:
          channel:
          correlation_id:
          tenant_id:
      # Set timestamp from log entry
      - timestamp:
          source: timestamp
          format: "2006-01-02T15:04:05.000000Z07:00"
          fallback_formats:
            - "2006-01-02T15:04:05Z07:00"
            - RFC3339
      # Filter out health check noise
      - match:
          selector: '{service="php"}'
          stages:
            - drop:
                expression: '.*GET /health.*'
                drop_counter_reason: health_check_noise
            - drop:
                expression: '.*GET /metrics.*'
                drop_counter_reason: metrics_endpoint_noise