nanochat/helm/observability/dashboards/app-performance.json
Manmohan Sharma aa0818aae2
feat(observability): Prometheus + Grafana + Loki stack for samosaChaat (#9)
Replaces the helm/observability scaffold with a real monitoring stack
wired into the samosaChaat platform.

Helm chart (helm/observability/)
- Chart.yaml declares kube-prometheus-stack (~62.0) and loki-stack
  (~2.10) as subchart dependencies.
- values.yaml configures Prometheus (15d retention, 50Gi PVC,
  ServiceMonitor + rule selector on app.kubernetes.io/part-of:
  samosachaat), Alertmanager (10Gi PVC), Grafana (OAuth-only via
  GitHub + Google, local login disabled, Prometheus + Loki datasources,
  dashboards auto-provisioned from a ConfigMap, email + Slack contact
  points with a critical route to Slack), Loki (50Gi, 30d retention,
  tsdb schema), and Promtail (JSON pipeline that lifts level / service
  / trace_id / user_id into labels, scrape config with pod labels).
- Alert rules: HighCPU, HighMemory, DiskSpaceLow, High5xxRate,
  InferenceServiceDown, HighP99Latency.
- templates/grafana-dashboards-configmap.yaml renders every file under
  dashboards/ into a single grafana_dashboard=1 ConfigMap.
- dashboards/node-health.json, app-performance.json, inference.json -
  fully-formed Grafana dashboards with Prometheus datasource variable,
  templated app selector, thresholded gauges, and LogQL-ready labels.

Scraping (helm/samosachaat/templates/servicemonitor.yaml)
- ServiceMonitor CRs for auth / chat-api / inference that Prometheus
  picks up via the part-of=samosachaat selector; scrapes /metrics
  every 15s and replaces the app label so dashboards line up.

Application instrumentation
- services/{auth,chat-api,inference} each depend on
  prometheus-fastapi-instrumentator and expose /metrics (request count,
  latency histograms, in-progress gauges).
- services/auth/src/logging_setup.py and
  services/inference/src/logging_setup.py mirror the canonical
  chat-api implementation - structlog JSON with service, trace_id,
  user_id context injection.
- configure_logging() is called at create_app() in auth and inference;
  inference's main.py now uses structlog via get_logger() instead of
  logging.getLogger.
- log_level setting added to auth + inference config (LOG_LEVEL env).

Docs
- contracts/logging-standard.md defines the required JSON fields,
  Python (structlog) + Node.js (pino) implementations, LogQL examples
  for cross-service queries, and the x-trace-id propagation contract.

Closes #9

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-04-16 12:29:16 -07:00

215 lines
6.5 KiB
JSON

{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {"type": "grafana", "uid": "-- Grafana --"},
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": null,
  "liveNow": false,
  "panels": [
    {
      "type": "row",
      "title": "Traffic",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
      "id": 100,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "timeseries",
      "title": "Requests per second by service",
      "id": 1,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 1},
      "fieldConfig": {"defaults": {"unit": "reqps"}, "overrides": []},
      "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}},
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\"}[5m]))",
          "legendFormat": "{{app}}"
        }
      ]
    },
    {
      "type": "row",
      "title": "Latency",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 9},
      "id": 101,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "timeseries",
      "title": "p50 / p95 / p99 latency",
      "id": 2,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 10, "w": 24, "x": 0, "y": 10},
      "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []},
      "options": {"legend": {"displayMode": "table", "placement": "bottom", "calcs": ["mean", "lastNotNull"]}, "tooltip": {"mode": "multi"}},
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "histogram_quantile(0.50, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))",
          "legendFormat": "p50 {{app}}"
        },
        {
          "refId": "B",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "histogram_quantile(0.95, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))",
          "legendFormat": "p95 {{app}}"
        },
        {
          "refId": "C",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "histogram_quantile(0.99, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))",
          "legendFormat": "p99 {{app}}"
        }
      ]
    },
    {
      "type": "row",
      "title": "Errors",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 20},
      "id": 102,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "timeseries",
      "title": "Error rate by service (4xx + 5xx, stacked)",
      "id": 3,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 21},
      "fieldConfig": {
        "defaults": {
          "unit": "reqps",
          "custom": {"stacking": {"mode": "normal"}, "fillOpacity": 25}
        },
        "overrides": []
      },
      "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}},
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\",status=~\"4..\"}[5m]))",
          "legendFormat": "4xx {{app}}"
        },
        {
          "refId": "B",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\",status=~\"5..\"}[5m]))",
          "legendFormat": "5xx {{app}}"
        }
      ]
    },
    {
      "type": "row",
      "title": "Availability",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 29},
      "id": 103,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "stat",
      "title": "Uptime % (last 24h) by service",
      "description": "Mean of the Prometheus `up` series over a fixed trailing 24h window (independent of the dashboard time range), averaged across all scrape targets that share the same app label so each service is shown once.",
      "id": 4,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 6, "w": 24, "x": 0, "y": 30},
      "fieldConfig": {
        "defaults": {
          "unit": "percentunit",
          "min": 0,
          "max": 1,
          "thresholds": {
            "mode": "absolute",
            "steps": [
              {"color": "red", "value": null},
              {"color": "yellow", "value": 0.95},
              {"color": "green", "value": 0.99}
            ]
          }
        },
        "overrides": []
      },
      "options": {
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
        "orientation": "auto",
        "textMode": "auto",
        "colorMode": "value",
        "graphMode": "area"
      },
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "avg by (app) (avg_over_time(up{app=~\"$app\"}[24h]))",
          "legendFormat": "{{app}}"
        }
      ]
    }
  ],
  "refresh": "30s",
  "schemaVersion": 39,
  "style": "dark",
  "tags": ["samosachaat", "application"],
  "templating": {
    "list": [
      {
        "current": {"selected": false, "text": "Prometheus", "value": "prometheus"},
        "hide": 0,
        "includeAll": false,
        "label": "Datasource",
        "multi": false,
        "name": "DS_PROMETHEUS",
        "options": [],
        "query": "prometheus",
        "queryValue": "",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "type": "datasource"
      },
      {
        "current": {"selected": true, "text": "All", "value": "$__all"},
        "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
        "definition": "label_values(http_requests_total, app)",
        "hide": 0,
        "includeAll": true,
        "label": "app",
        "multi": true,
        "name": "app",
        "options": [],
        "query": {"qryType": 1, "query": "label_values(http_requests_total, app)", "refId": "app"},
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "sort": 1,
        "type": "query"
      }
    ]
  },
  "time": {"from": "now-6h", "to": "now"},
  "timepicker": {},
  "timezone": "",
  "title": "samosaChaat — Application Performance",
  "uid": "samosachaat-app-performance",
  "version": 1,
  "weekStart": ""
}