mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-09 01:10:10 +00:00
Replaces the helm/observability scaffold with a real monitoring stack
wired into the samosaChaat platform.
Helm chart (helm/observability/)
- Chart.yaml declares kube-prometheus-stack (~62.0) and loki-stack
(~2.10) as subchart dependencies.
- values.yaml configures Prometheus (15d retention, 50Gi PVC,
ServiceMonitor + rule selector on app.kubernetes.io/part-of:
samosachaat), Alertmanager (10Gi PVC), Grafana (OAuth-only via
GitHub + Google, local login disabled, Prometheus + Loki datasources,
dashboards auto-provisioned from a ConfigMap, email + Slack contact
points with a critical route to Slack), Loki (50Gi, 30d retention,
tsdb schema), and Promtail (JSON pipeline that lifts level / service
/ trace_id / user_id into labels, scrape config with pod labels).
- Alert rules: HighCPU, HighMemory, DiskSpaceLow, High5xxRate,
InferenceServiceDown, HighP99Latency.
- templates/grafana-dashboards-configmap.yaml renders every file under
dashboards/ into a single grafana_dashboard=1 ConfigMap.
- dashboards/node-health.json, app-performance.json, inference.json -
fully-formed Grafana dashboards with Prometheus datasource variable,
templated app selector, thresholded gauges, and LogQL-ready labels.
Scraping (helm/samosachaat/templates/servicemonitor.yaml)
- ServiceMonitor CRs for auth / chat-api / inference that Prometheus
picks up via the part-of=samosachaat selector; each one configures a
/metrics scrape every 15s and replaces the app label so dashboards line up.
Application instrumentation
- services/{auth,chat-api,inference} each depend on
prometheus-fastapi-instrumentator and expose /metrics (request count,
latency histograms, in-progress gauges).
- services/auth/src/logging_setup.py and
services/inference/src/logging_setup.py mirror the canonical
chat-api implementation - structlog JSON with service, trace_id,
user_id context injection.
- configure_logging() is called from create_app() in auth and inference;
inference's main.py now uses structlog via get_logger() instead of
logging.getLogger.
- log_level setting added to auth + inference config (LOG_LEVEL env).
Docs
- contracts/logging-standard.md defines the required JSON fields,
Python (structlog) + Node.js (pino) implementations, LogQL examples
for cross-service queries, and the x-trace-id propagation contract.
Closes #9
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
235 lines
6.9 KiB
JSON
235 lines
6.9 KiB
JSON
{
  "annotations": {
    "list": [
      {
        "builtIn": 1,
        "datasource": {"type": "grafana", "uid": "-- Grafana --"},
        "enable": true,
        "hide": true,
        "iconColor": "rgba(0, 211, 255, 1)",
        "name": "Annotations & Alerts",
        "type": "dashboard"
      }
    ]
  },
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 0,
  "id": null,
  "liveNow": false,
  "panels": [
    {
      "type": "row",
      "title": "Latency",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0},
      "id": 100,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "heatmap",
      "title": "Inference duration heatmap",
      "id": 1,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 9, "w": 18, "x": 0, "y": 1},
      "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []},
      "options": {
        "calculate": false,
        "cellGap": 1,
        "color": {"mode": "scheme", "scheme": "Oranges", "steps": 64, "exponent": 0.5},
        "yAxis": {"unit": "s"},
        "tooltip": {"show": true}
      },
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum by (le) (rate(inference_duration_seconds_bucket[5m]))",
          "format": "heatmap",
          "legendFormat": "{{le}}"
        }
      ]
    },
    {
      "type": "stat",
      "title": "Current p50 / p95",
      "id": 2,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 9, "w": 6, "x": 18, "y": 1},
      "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []},
      "options": {
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
        "orientation": "vertical",
        "textMode": "auto",
        "colorMode": "value",
        "graphMode": "area"
      },
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "histogram_quantile(0.50, sum by (le) (rate(inference_duration_seconds_bucket[5m])))",
          "legendFormat": "p50"
        },
        {
          "refId": "B",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "histogram_quantile(0.95, sum by (le) (rate(inference_duration_seconds_bucket[5m])))",
          "legendFormat": "p95"
        }
      ]
    },
    {
      "type": "row",
      "title": "Throughput",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10},
      "id": 101,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "timeseries",
      "title": "Tokens generated per second",
      "id": 3,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 8, "w": 24, "x": 0, "y": 11},
      "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []},
      "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}},
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum(rate(tokens_generated_total[1m]))",
          "legendFormat": "tokens/s"
        }
      ]
    },
    {
      "type": "row",
      "title": "Workers",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19},
      "id": 102,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "gauge",
      "title": "Worker state",
      "id": 4,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20},
      "fieldConfig": {
        "defaults": {
          "unit": "short",
          "min": 0
        },
        "overrides": []
      },
      "options": {
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
        "orientation": "auto",
        "showThresholdLabels": false,
        "showThresholdMarkers": true
      },
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum(workers_total)",
          "legendFormat": "total"
        },
        {
          "refId": "B",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum(workers_busy)",
          "legendFormat": "busy"
        },
        {
          "refId": "C",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum(workers_available)",
          "legendFormat": "available"
        }
      ]
    },
    {
      "type": "timeseries",
      "title": "Worker utilization over time (busy / total)",
      "id": 5,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20},
      "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}, "overrides": []},
      "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}},
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum(workers_busy) / clamp_min(sum(workers_total), 1)",
          "legendFormat": "utilization"
        }
      ]
    },
    {
      "type": "row",
      "title": "Active",
      "gridPos": {"h": 1, "w": 24, "x": 0, "y": 28},
      "id": 103,
      "collapsed": false,
      "panels": []
    },
    {
      "type": "stat",
      "title": "Active generations",
      "id": 6,
      "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
      "gridPos": {"h": 6, "w": 24, "x": 0, "y": 29},
      "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []},
      "options": {
        "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
        "orientation": "auto",
        "textMode": "value_and_name",
        "colorMode": "value",
        "graphMode": "area"
      },
      "targets": [
        {
          "refId": "A",
          "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"},
          "expr": "sum(active_generations)",
          "legendFormat": "active"
        }
      ]
    }
  ],
  "refresh": "15s",
  "schemaVersion": 39,
  "style": "dark",
  "tags": ["samosachaat", "inference"],
  "templating": {
    "list": [
      {
        "current": {"selected": false, "text": "Prometheus", "value": "prometheus"},
        "hide": 0,
        "includeAll": false,
        "label": "Datasource",
        "multi": false,
        "name": "DS_PROMETHEUS",
        "options": [],
        "query": "prometheus",
        "queryValue": "",
        "refresh": 1,
        "regex": "",
        "skipUrlSync": false,
        "type": "datasource"
      }
    ]
  },
  "time": {"from": "now-1h", "to": "now"},
  "timepicker": {},
  "timezone": "",
  "title": "samosaChaat — Inference Service",
  "uid": "samosachaat-inference",
  "version": 1,
  "weekStart": ""
}