From aa0818aae2b95257d3f63cc3b22f4f610613692c Mon Sep 17 00:00:00 2001 From: Manmohan Sharma Date: Thu, 16 Apr 2026 12:29:16 -0700 Subject: [PATCH] feat(observability): Prometheus + Grafana + Loki stack for samosaChaat (#9) Replaces the helm/observability scaffold with a real monitoring stack wired into the samosaChaat platform. Helm chart (helm/observability/) - Chart.yaml declares kube-prometheus-stack (~62.0) and loki-stack (~2.10) as subchart dependencies. - values.yaml configures Prometheus (15d retention, 50Gi PVC, ServiceMonitor + rule selector on app.kubernetes.io/part-of: samosachaat), Alertmanager (10Gi PVC), Grafana (OAuth-only via GitHub + Google, local login disabled, Prometheus + Loki datasources, dashboards auto-provisioned from a ConfigMap, email + Slack contact points with a critical route to Slack), Loki (50Gi, 30d retention, tsdb schema), and Promtail (JSON pipeline that lifts level / service / trace_id / user_id into labels, scrape config with pod labels). - Alert rules: HighCPU, HighMemory, DiskSpaceLow, High5xxRate, InferenceServiceDown, HighP99Latency. - templates/grafana-dashboards-configmap.yaml renders every file under dashboards/ into a single grafana_dashboard=1 ConfigMap. - dashboards/node-health.json, app-performance.json, inference.json - fully-formed Grafana dashboards with Prometheus datasource variable, templated app selector, thresholded gauges, and LogQL-ready labels. Scraping (helm/samosachaat/templates/servicemonitor.yaml) - ServiceMonitor CRs for auth / chat-api / inference that Prometheus picks up via the part-of=samosachaat selector; scrapes /metrics every 15s and replaces the app label so dashboards line up. Application instrumentation - services/{auth,chat-api,inference} each depend on prometheus-fastapi-instrumentator and expose /metrics (request count, latency histograms, in-progress gauges). 
- services/auth/src/logging_setup.py and services/inference/src/logging_setup.py mirror the canonical chat-api implementation - structlog JSON with service, trace_id, user_id context injection. - configure_logging() is called at create_app() in auth and inference; inference's main.py now uses structlog via get_logger() instead of logging.getLogger. - log_level setting added to auth + inference config (LOG_LEVEL env). Docs - contracts/logging-standard.md defines the required JSON fields, Python (structlog) + Node.js (pino) implementations, LogQL examples for cross-service queries, and the x-trace-id propagation contract. Closes #9 Co-Authored-By: Claude Opus 4.7 (1M context) --- contracts/logging-standard.md | 110 ++++++ helm/observability/Chart.yaml | 15 +- .../dashboards/app-performance.json | 214 ++++++++++++ helm/observability/dashboards/inference.json | 234 +++++++++++++ .../observability/dashboards/node-health.json | 297 ++++++++++++++++ helm/observability/templates/NOTES.txt | 24 +- .../grafana-dashboards-configmap.yaml | 21 ++ helm/observability/values.yaml | 321 +++++++++++++++++- .../samosachaat/templates/servicemonitor.yaml | 40 +++ services/auth/pyproject.toml | 2 + services/auth/src/config.py | 2 + services/auth/src/logging_setup.py | 78 +++++ services/auth/src/main.py | 5 + services/chat-api/pyproject.toml | 1 + services/chat-api/src/main.py | 3 + services/inference/pyproject.toml | 2 + services/inference/src/config.py | 1 + services/inference/src/logging_setup.py | 78 +++++ services/inference/src/main.py | 10 +- 19 files changed, 1444 insertions(+), 14 deletions(-) create mode 100644 contracts/logging-standard.md create mode 100644 helm/observability/dashboards/app-performance.json create mode 100644 helm/observability/dashboards/inference.json create mode 100644 helm/observability/dashboards/node-health.json create mode 100644 helm/observability/templates/grafana-dashboards-configmap.yaml create mode 100644 
helm/samosachaat/templates/servicemonitor.yaml create mode 100644 services/auth/src/logging_setup.py create mode 100644 services/inference/src/logging_setup.py diff --git a/contracts/logging-standard.md b/contracts/logging-standard.md new file mode 100644 index 00000000..8292fa16 --- /dev/null +++ b/contracts/logging-standard.md @@ -0,0 +1,110 @@ +# samosaChaat Logging Standard + +All services in the samosaChaat platform emit logs as **single-line JSON** +on stdout. Promtail ships them to Loki, where Grafana queries them by label +and by JSON field. Because every service shares the same schema, a single +trace_id lets you follow a request from the frontend through auth → chat-api +→ inference. + +## Required fields + +Every log line MUST include: + +| Field | Type | Source | +|-------------|---------|------------------------------------------| +| `timestamp` | ISO8601 | structlog `TimeStamper(fmt="iso")` | +| `level` | string | `debug` / `info` / `warning` / `error` | +| `service` | string | hard-coded per service (`auth`, `chat-api`, `inference`, `frontend`) | +| `message` | string | the human-readable event (`event` key in structlog) | + +Conditionally included (when present in the request context): + +| Field | When to include | +|--------------|------------------------------------------| +| `trace_id` | every request served by a backend service — propagated via the `x-trace-id` header | +| `user_id` | every request authenticated as a user | +| `inference_time_ms` | emitted by chat-api and inference around model calls | +| `error` | on exceptions — the stringified cause | + +Anything else is free-form structured context (`method`, `path`, +`status_code`, `model_tag`, …). Keep keys `snake_case`. + +## Python implementation — `structlog` + +The canonical setup lives at `services/chat-api/src/logging_setup.py`. +`services/auth/src/logging_setup.py` and `services/inference/src/logging_setup.py` +mirror it, differing only in the hard-coded `service` value. 
+ +Key pieces: + +```python +structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, # trace_id / user_id from context + structlog.processors.add_log_level, # -> level field + structlog.processors.TimeStamper(fmt="iso", utc=True), + _inject_context, # service + trace_id + user_id defaults + structlog.processors.JSONRenderer(), # final JSON line + ], + ... +) +``` + +Each service calls `configure_logging()` once at startup (inside +`create_app()`), and then uses `logger = get_logger(__name__)` everywhere. +`trace_id` is set by a FastAPI middleware that reads the incoming +`x-trace-id` header (or mints a new one via `new_trace_id()`) and propagates +it to downstream calls. + +## Node.js implementation — `pino` (frontend) + +The Next.js frontend should log JSON with the same schema. Reference config: + +```ts +// services/frontend/lib/logger.ts +import pino from "pino"; + +export const logger = pino({ + base: { service: "frontend" }, + timestamp: pino.stdTimeFunctions.isoTime, + formatters: { + level: (label) => ({ level: label }), // keep the string level + }, + messageKey: "message", +}); +``` + +When emitting, always include `trace_id` and `user_id` when known: + +```ts +logger.info({ trace_id, user_id, path: req.url }, "request_start"); +``` + +In API routes, read the incoming `x-trace-id` header and echo it back on the +response so client-side traces can join up. + +## Cross-service querying (LogQL / Grafana Explore) + +Labels Promtail applies: `namespace`, `app`, `pod`, `level`, `service`, +`container`. Everything else is a JSON field — use `| json` to extract it. 
| Goal | Query | +|-------------------------------|-----------------------------------------------------------------------| +| All errors in prod | `{namespace="samosachaat-prod"} \| json \| level="error"` | +| Trace a request across tiers | `{namespace="samosachaat-prod"} \| json \| trace_id="<trace-id>"` | +| Auth failures | `{app="auth"} \| json \| level="error"` | +| Slow inference calls | `{app="inference"} \| json \| inference_time_ms > 5000` | +| 5xx by service | `{namespace="samosachaat-prod"} \| json \| status_code >= 500` | +| Rate-limited OAuth logins | `{app="auth"} \| json \| path=~"/auth/oauth/.*" \| status_code=429` | + +## Trace propagation contract + +1. **Frontend** — mint `trace_id` on navigation (or reuse an existing one from + the current session), send it as `x-trace-id` on every `fetch` to the API. +2. **chat-api** — read `x-trace-id`, store in a context var, re-emit on the + response, and forward it on every httpx call to auth or inference. +3. **auth / inference** — read `x-trace-id` and bind it to the logger context + for the duration of the request. + +Services MUST NOT log raw secrets, JWTs, OAuth client secrets, API keys, or +full user message text. Log IDs, lengths, and booleans — not contents. diff --git a/helm/observability/Chart.yaml b/helm/observability/Chart.yaml index 071fb150..07a81f76 100644 --- a/helm/observability/Chart.yaml +++ b/helm/observability/Chart.yaml @@ -1,6 +1,17 @@ apiVersion: v2 name: observability -description: Observability chart scaffold for Grafana, Prometheus, and Loki. +description: | + Observability stack for the samosaChaat platform: kube-prometheus-stack + (Prometheus + Grafana + Alertmanager + node/kube-state exporters) and + loki-stack (Loki + Promtail) for metrics, alerts, dashboards, and logs. 
type: application version: 0.1.0 -appVersion: "0.1.0" +appVersion: "1.0.0" + +dependencies: + - name: kube-prometheus-stack + version: "~62.0" + repository: https://prometheus-community.github.io/helm-charts + - name: loki-stack + version: "~2.10" + repository: https://grafana.github.io/helm-charts diff --git a/helm/observability/dashboards/app-performance.json b/helm/observability/dashboards/app-performance.json new file mode 100644 index 00000000..6bce5e75 --- /dev/null +++ b/helm/observability/dashboards/app-performance.json @@ -0,0 +1,214 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "grafana", "uid": "-- Grafana --"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "liveNow": false, + "panels": [ + { + "type": "row", + "title": "Traffic", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "id": 100, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Requests per second by service", + "id": 1, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 1}, + "fieldConfig": {"defaults": {"unit": "reqps"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\"}[5m]))", + "legendFormat": "{{app}}" + } + ] + }, + { + "type": "row", + "title": "Latency", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 9}, + "id": 101, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "p50 / p95 / p99 latency", + "id": 2, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 10}, + 
"fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "options": {"legend": {"displayMode": "table", "placement": "bottom", "calcs": ["mean", "lastNotNull"]}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.50, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))", + "legendFormat": "p50 {{app}}" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.95, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))", + "legendFormat": "p95 {{app}}" + }, + { + "refId": "C", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.99, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))", + "legendFormat": "p99 {{app}}" + } + ] + }, + { + "type": "row", + "title": "Errors", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 20}, + "id": 102, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Error rate by service (4xx + 5xx, stacked)", + "id": 3, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 21}, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": {"stacking": {"mode": "normal"}, "fillOpacity": 25} + }, + "overrides": [] + }, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\",status=~\"4..\"}[5m]))", + "legendFormat": "4xx {{app}}" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\",status=~\"5..\"}[5m]))", + "legendFormat": "5xx {{app}}" + 
} + ] + }, + { + "type": "row", + "title": "Availability", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 29}, + "id": 103, + "collapsed": false, + "panels": [] + }, + { + "type": "stat", + "title": "Uptime % (last 24h) by service", + "id": 4, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 30}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0.95}, + {"color": "green", "value": 0.99} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "avg_over_time(up{app=~\"$app\"}[24h])", + "legendFormat": "{{app}}" + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": ["samosachaat", "application"], + "templating": { + "list": [ + { + "current": {"selected": false, "text": "Prometheus", "value": "prometheus"}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "definition": "label_values(http_requests_total, app)", + "hide": 0, + "includeAll": true, + "label": "app", + "multi": true, + "name": "app", + "options": [], + "query": {"qryType": 1, "query": "label_values(http_requests_total, app)", "refId": "app"}, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": 
{"from": "now-6h", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "samosaChaat — Application Performance", + "uid": "samosachaat-app-performance", + "version": 1, + "weekStart": "" +} diff --git a/helm/observability/dashboards/inference.json b/helm/observability/dashboards/inference.json new file mode 100644 index 00000000..ee9c5231 --- /dev/null +++ b/helm/observability/dashboards/inference.json @@ -0,0 +1,234 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "grafana", "uid": "-- Grafana --"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "liveNow": false, + "panels": [ + { + "type": "row", + "title": "Latency", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "id": 100, + "collapsed": false, + "panels": [] + }, + { + "type": "heatmap", + "title": "Inference duration heatmap", + "id": 1, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 9, "w": 18, "x": 0, "y": 1}, + "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "options": { + "calculate": false, + "cellGap": 1, + "color": {"mode": "scheme", "scheme": "Oranges", "steps": 64, "exponent": 0.5}, + "yAxis": {"unit": "s"}, + "tooltip": {"show": true} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (le) (rate(inference_duration_seconds_bucket[5m]))", + "format": "heatmap", + "legendFormat": "{{le}}" + } + ] + }, + { + "type": "stat", + "title": "Current p50 / p95", + "id": 2, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 9, "w": 6, "x": 18, "y": 1}, + "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": 
"vertical", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.50, sum by (le) (rate(inference_duration_seconds_bucket[5m])))", + "legendFormat": "p50" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.95, sum by (le) (rate(inference_duration_seconds_bucket[5m])))", + "legendFormat": "p95" + } + ] + }, + { + "type": "row", + "title": "Throughput", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "id": 101, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Tokens generated per second", + "id": 3, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 11}, + "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(rate(tokens_generated_total[1m]))", + "legendFormat": "tokens/s" + } + ] + }, + { + "type": "row", + "title": "Workers", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, + "id": 102, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "Worker state", + "id": 4, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}, + "fieldConfig": { + "defaults": { + "unit": "short", + "min": 0 + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_total)", + 
"legendFormat": "total" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_busy)", + "legendFormat": "busy" + }, + { + "refId": "C", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_available)", + "legendFormat": "available" + } + ] + }, + { + "type": "timeseries", + "title": "Worker utilization over time (busy / total)", + "id": 5, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_busy) / clamp_min(sum(workers_total), 1)", + "legendFormat": "utilization" + } + ] + }, + { + "type": "row", + "title": "Active", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 28}, + "id": 103, + "collapsed": false, + "panels": [] + }, + { + "type": "stat", + "title": "Active generations", + "id": 6, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 29}, + "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "textMode": "value_and_name", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(active_generations)", + "legendFormat": "active" + } + ] + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": ["samosachaat", "inference"], + "templating": { + "list": [ + { + "current": {"selected": false, "text": "Prometheus", "value": "prometheus"}, + "hide": 0, + "includeAll": false, + 
"label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": {"from": "now-1h", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "samosaChaat — Inference Service", + "uid": "samosachaat-inference", + "version": 1, + "weekStart": "" +} diff --git a/helm/observability/dashboards/node-health.json b/helm/observability/dashboards/node-health.json new file mode 100644 index 00000000..29ca802c --- /dev/null +++ b/helm/observability/dashboards/node-health.json @@ -0,0 +1,297 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "grafana", "uid": "-- Grafana --"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "liveNow": false, + "panels": [ + { + "type": "row", + "title": "CPU", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "id": 100, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "CPU usage by node (avg %)", + "id": 1, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.6}, + {"color": "red", "value": 0.8} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "avg by (instance) 
(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "CPU usage by node (1h)", + "id": 2, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 1}, + "fieldConfig": { + "defaults": {"unit": "percentunit", "min": 0, "max": 1}, + "overrides": [] + }, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "avg by (instance) (rate(node_cpu_seconds_total{mode!=\"idle\"}[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "title": "Memory", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 9}, + "id": 101, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "Memory usage by node (%)", + "id": 3, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 10}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.7}, + {"color": "red", "value": 0.85} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "Memory usage over time", + "id": 4, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 10}, + "fieldConfig": {"defaults": {"unit": "percentunit", 
"min": 0, "max": 1}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "title": "Disk", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}, + "id": 102, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "Filesystem usage by mountpoint", + "id": 5, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 19}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.75}, + {"color": "red", "value": 0.9} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "(1 - node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})", + "legendFormat": "{{instance}} {{mountpoint}}" + } + ] + }, + { + "type": "row", + "title": "Network", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}, + "id": 103, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Network received (bytes/s)", + "id": 6, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 28}, + "fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + 
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (instance) (rate(node_network_receive_bytes_total[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "Network transmitted (bytes/s)", + "id": 7, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 28}, + "fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (instance) (rate(node_network_transmit_bytes_total[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "title": "Pods", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 36}, + "id": 104, + "collapsed": false, + "panels": [] + }, + { + "type": "stat", + "title": "Pods per node", + "id": 8, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 37}, + "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "count by (node) (kube_pod_info)", + "legendFormat": "{{node}}" + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": ["samosachaat", "nodes"], + "templating": { + "list": [ + { + "current": {"selected": false, "text": "Prometheus", "value": "prometheus"}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": 
"datasource" + } + ] + }, + "time": {"from": "now-1h", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "samosaChaat — Node Health", + "uid": "samosachaat-node-health", + "version": 1, + "weekStart": "" +} diff --git a/helm/observability/templates/NOTES.txt b/helm/observability/templates/NOTES.txt index 9e750d08..932cb7a3 100644 --- a/helm/observability/templates/NOTES.txt +++ b/helm/observability/templates/NOTES.txt @@ -1,4 +1,22 @@ -The observability chart scaffold is in place. +samosaChaat observability stack installed. -Add concrete manifests for Grafana, Prometheus, Loki, dashboards, and scrape -configuration as the platform monitoring stack is implemented. +Components: + * kube-prometheus-stack -> Prometheus, Alertmanager, Grafana, node-exporter, kube-state-metrics + * loki-stack -> Loki (logs) + Promtail (log shipping) + +Access: + Grafana is exposed at https://grafana.samosachaat.art via the samosaChaat + app-chart ingress. Login is OAuth-only (GitHub + Google); the local login form + is disabled. + +Required secrets in namespace {{ .Release.Namespace }}: + kubectl create secret generic grafana-oauth-secrets \ + --from-literal=GITHUB_GRAFANA_CLIENT_ID=... \ + --from-literal=GITHUB_GRAFANA_CLIENT_SECRET=... \ + --from-literal=GOOGLE_GRAFANA_CLIENT_ID=... \ + --from-literal=GOOGLE_GRAFANA_CLIENT_SECRET=... \ + --from-literal=SLACK_WEBHOOK_URL=... + +Useful in-cluster URLs: + Prometheus: http://prom-prometheus.{{ .Release.Namespace }}.svc:9090 + Loki: http://loki.{{ .Release.Namespace }}.svc:3100 diff --git a/helm/observability/templates/grafana-dashboards-configmap.yaml b/helm/observability/templates/grafana-dashboards-configmap.yaml new file mode 100644 index 00000000..3e54529c --- /dev/null +++ b/helm/observability/templates/grafana-dashboards-configmap.yaml @@ -0,0 +1,21 @@ +{{/* +ConfigMap that carries every dashboard JSON from helm/observability/dashboards/ +into Grafana. 
The kube-prometheus-stack Grafana deployment mounts ConfigMaps +listed under grafana.dashboardsConfigMaps (see values.yaml) at +/var/lib/grafana/dashboards/<provider-name>/, which the matching dashboardProvider picks +up and loads. +*/}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: samosachaat-dashboards + namespace: {{ .Release.Namespace }} + labels: + grafana_dashboard: "1" + app.kubernetes.io/part-of: samosachaat + app.kubernetes.io/managed-by: {{ .Release.Service }} +data: +{{- range $path, $_ := .Files.Glob "dashboards/*.json" }} + {{ base $path }}: |- +{{ $.Files.Get $path | indent 4 }} +{{- end }} diff --git a/helm/observability/values.yaml b/helm/observability/values.yaml index 013a3646..9b651c81 100644 --- a/helm/observability/values.yaml +++ b/helm/observability/values.yaml @@ -1,8 +1,317 @@ -grafana: - enabled: true +## Values for the samosaChaat observability umbrella chart. +## +## This chart deploys kube-prometheus-stack (Prometheus, Grafana, Alertmanager, +## node-exporter, kube-state-metrics) and loki-stack (Loki + Promtail) into the +## cluster. Grafana is exposed at grafana.samosachaat.art via the main ingress +## in the samosachaat app chart (which routes to a ClusterIP service named +## "grafana" on port 3000 in the same namespace). -prometheus: - enabled: true +kube-prometheus-stack: + fullnameOverride: prom -loki: - enabled: true + ## ----------------------- Prometheus ----------------------- + prometheus: + prometheusSpec: + ## Only scrape ServiceMonitors that belong to the samosaChaat app. + serviceMonitorSelector: + matchLabels: + app.kubernetes.io/part-of: samosachaat + serviceMonitorSelectorNilUsesHelmValues: false + ## Also evaluate our custom PrometheusRules. + ruleSelector: + matchLabels: + app.kubernetes.io/part-of: samosachaat + ruleSelectorNilUsesHelmValues: false + ## Keep 15 days of metric data on a 50Gi PVC. 
+ retention: 15d + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + resources: + requests: + cpu: 200m + memory: 1Gi + limits: + cpu: "1" + memory: 2Gi + + ## ----------------------- Alertmanager ----------------------- + alertmanager: + alertmanagerSpec: + storage: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + + ## ----------------------- Grafana ----------------------- + grafana: + enabled: true + ## The main samosaChaat ingress already routes grafana.samosachaat.art + ## to a ClusterIP service named "grafana" port 3000 in the same namespace. + ## Override the generated service name so it matches. + fullnameOverride: grafana + service: + type: ClusterIP + port: 3000 + ingress: + enabled: false + + ## OAuth2 — no username/password, only GitHub + Google sign-in. + ## The referenced env vars must be provided via Grafana's extraEnvVarsSecret + ## (see grafana.envFromSecret below) or set in the cluster's secret store. + grafana.ini: + server: + root_url: https://grafana.samosachaat.art + auth: + disable_login_form: true + "auth.github": + enabled: true + allow_sign_up: true + client_id: ${GITHUB_GRAFANA_CLIENT_ID} + client_secret: ${GITHUB_GRAFANA_CLIENT_SECRET} + scopes: user:email,read:org + auth_url: https://github.com/login/oauth/authorize + token_url: https://github.com/login/oauth/access_token + api_url: https://api.github.com/user + "auth.google": + enabled: true + allow_sign_up: true + client_id: ${GOOGLE_GRAFANA_CLIENT_ID} + client_secret: ${GOOGLE_GRAFANA_CLIENT_SECRET} + scopes: openid email profile + auth_url: https://accounts.google.com/o/oauth2/auth + token_url: https://accounts.google.com/o/oauth2/token + allowed_domains: gmail.com + + ## OAuth client IDs/secrets come from a Kubernetes secret mounted as env. 
+ ## Expected keys: GITHUB_GRAFANA_CLIENT_ID, GITHUB_GRAFANA_CLIENT_SECRET, + ## GOOGLE_GRAFANA_CLIENT_ID, GOOGLE_GRAFANA_CLIENT_SECRET, SLACK_WEBHOOK_URL. + envFromSecret: grafana-oauth-secrets + + ## Datasources — Prometheus (default) + Loki for logs. + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + uid: prometheus + url: http://prom-kube-prometheus-stack-prometheus.{{ .Release.Namespace }}.svc:9090 + access: proxy + isDefault: true + - name: Loki + type: loki + uid: loki + url: http://loki.{{ .Release.Namespace }}.svc:3100 + access: proxy + jsonData: + maxLines: 1000 + + ## Auto-provision dashboards from the dashboards ConfigMap rendered by + ## templates/grafana-dashboards-configmap.yaml. + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: samosachaat + orgId: 1 + folder: samosaChaat + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/samosachaat + + dashboardsConfigMaps: + samosachaat: samosachaat-dashboards + + ## Unified alerting — contact points (email + slack). + alerting: + contactpoints.yaml: + apiVersion: 1 + contactPoints: + - orgId: 1 + name: email-alerts + receivers: + - uid: email + type: email + settings: + addresses: manmohan659@gmail.com + - orgId: 1 + name: slack-alerts + receivers: + - uid: slack + type: slack + settings: + url: ${SLACK_WEBHOOK_URL} + policies.yaml: + apiVersion: 1 + policies: + - orgId: 1 + receiver: email-alerts + group_by: [alertname, severity] + routes: + - receiver: slack-alerts + matchers: + - severity = critical + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + ## ----------------------- Alert rules ----------------------- + ## Prometheus will pick these up via the ruleSelector above (the generated + ## PrometheusRule gets labelled app.kubernetes.io/part-of: samosachaat). 
+ additionalPrometheusRulesMap: + samosachaat-alerts: + groups: + - name: node-health + rules: + - alert: HighCPU + expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[5m])) > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "CPU > 80% on {{ $labels.instance }} for 5m" + description: "Node {{ $labels.instance }} has had CPU usage above 80% for 5 minutes." + + - alert: HighMemory + expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85 + for: 5m + labels: + severity: critical + annotations: + summary: "Memory > 85% on {{ $labels.instance }}" + description: "Node {{ $labels.instance }} memory usage is above 85%." + + - alert: DiskSpaceLow + expr: (1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) > 0.9 + for: 10m + labels: + severity: critical + annotations: + summary: "Disk > 90% on {{ $labels.instance }}:{{ $labels.mountpoint }}" + description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} is above 90% full." + + - name: application + rules: + - alert: High5xxRate + expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05 + for: 2m + labels: + severity: critical + annotations: + summary: "5xx error rate > 5% across services" + description: "Aggregate 5xx error ratio across all services has exceeded 5% for 2 minutes." + + - alert: InferenceServiceDown + expr: up{app="inference"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Inference service is DOWN" + description: "Prometheus cannot scrape the inference service /metrics endpoint." + + - alert: HighP99Latency + expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, app)) > 5 + for: 3m + labels: + severity: warning + annotations: + summary: "p99 latency > 5s for {{ $labels.app }}" + description: "Service {{ $labels.app }} p99 latency has exceeded 5s for 3 minutes." 
+ +## ----------------------- Loki stack ----------------------- +loki-stack: + loki: + enabled: true + persistence: + enabled: true + size: 50Gi + config: + limits_config: + retention_period: 720h + schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: loki_index_ + period: 24h + + promtail: + enabled: true + config: + snippets: + pipelineStages: + - cri: {} + - json: + expressions: + level: level + service: service + trace_id: trace_id + user_id: user_id + - labels: + level: + service: + scrapeConfigs: | + - job_name: kubernetes-pods + pipeline_stages: + - cri: {} + - json: + expressions: + level: level + service: service + trace_id: trace_id + user_id: user_id + - labels: + level: + service: + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_name + target_label: app + - source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_component + target_label: component + - source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_part_of + target_label: part_of + - source_labels: + - __meta_kubernetes_pod_container_name + target_label: container + - action: replace + replacement: /var/log/pods/*$1/*.log + separator: / + source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + + prometheus: + enabled: false + grafana: + enabled: false diff --git a/helm/samosachaat/templates/servicemonitor.yaml b/helm/samosachaat/templates/servicemonitor.yaml new file mode 100644 index 00000000..124366c3 --- /dev/null +++ b/helm/samosachaat/templates/servicemonitor.yaml @@ -0,0 +1,40 @@ +{{/* +ServiceMonitor CRs so the Prometheus instance deployed by the observability +chart discovers and scrapes the samosaChaat backend services. 
Prometheus +selects only ServiceMonitors labelled app.kubernetes.io/part-of=samosachaat +(see helm/observability/values.yaml -> prometheus.serviceMonitorSelector). + +Each backend Python service exposes /metrics via +prometheus-fastapi-instrumentator. The frontend (Next.js) is omitted until it +grows its own metrics endpoint. +*/}} +{{- $services := dict "auth" .Values.auth "chat-api" .Values.chatApi "inference" .Values.inference -}} +{{- range $svc, $cfg := $services }} +{{- if $cfg.enabled }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ $svc }} + namespace: {{ include "samosachaat.namespace" $ }} + labels: + {{- include "samosachaat.labels" $ | nindent 4 }} + app: {{ $svc }} +spec: + namespaceSelector: + matchNames: + - {{ include "samosachaat.namespace" $ }} + selector: + matchLabels: + {{- include "samosachaat.selectorLabels" (dict "root" $ "svc" $svc) | nindent 6 }} + endpoints: + - port: http + interval: 15s + path: /metrics + scheme: http + relabelings: + - action: replace + targetLabel: app + replacement: {{ $svc }} +{{- end }} +{{- end }} diff --git a/services/auth/pyproject.toml b/services/auth/pyproject.toml index c42bcc68..9f6ac217 100644 --- a/services/auth/pyproject.toml +++ b/services/auth/pyproject.toml @@ -19,6 +19,8 @@ dependencies = [ "cryptography>=43.0.0", "slowapi>=0.1.9", "python-multipart>=0.0.9", + "structlog>=24.4.0", + "prometheus-fastapi-instrumentator>=7.0.0", ] [dependency-groups] diff --git a/services/auth/src/config.py b/services/auth/src/config.py index 636b7f39..03453bc2 100644 --- a/services/auth/src/config.py +++ b/services/auth/src/config.py @@ -37,6 +37,8 @@ class Settings(BaseSettings): cookie_secure: bool = Field(default=False) cookie_domain: str | None = Field(default=None) + log_level: str = Field(default="INFO") + @property def refresh_cookie_name(self) -> str: return "samosachaat_refresh" diff --git a/services/auth/src/logging_setup.py b/services/auth/src/logging_setup.py new file 
mode 100644 index 00000000..b7404101 --- /dev/null +++ b/services/auth/src/logging_setup.py @@ -0,0 +1,78 @@ +"""Structured JSON logging for the auth service. + +Mirrors the canonical implementation in services/chat-api/src/logging_setup.py +so every service emits the same JSON shape (see contracts/logging-standard.md). +""" +from __future__ import annotations + +import logging +import sys +import uuid +from contextvars import ContextVar + +import structlog + +from .config import get_settings + +_trace_id_ctx: ContextVar[str | None] = ContextVar("trace_id", default=None) +_user_id_ctx: ContextVar[str | None] = ContextVar("user_id", default=None) + + +def set_trace_id(trace_id: str | None) -> None: + _trace_id_ctx.set(trace_id) + + +def set_user_id(user_id: str | None) -> None: + _user_id_ctx.set(user_id) + + +def get_trace_id() -> str | None: + return _trace_id_ctx.get() + + +def get_user_id() -> str | None: + return _user_id_ctx.get() + + +def new_trace_id() -> str: + return uuid.uuid4().hex + + +def _inject_context(_logger, _method, event_dict): + event_dict.setdefault("service", "auth") + trace_id = _trace_id_ctx.get() + if trace_id is not None: + event_dict.setdefault("trace_id", trace_id) + user_id = _user_id_ctx.get() + if user_id is not None: + event_dict.setdefault("user_id", user_id) + return event_dict + + +def configure_logging() -> None: + settings = get_settings() + level = getattr(logging, settings.log_level.upper(), logging.INFO) + + logging.basicConfig( + format="%(message)s", + stream=sys.stdout, + level=level, + force=True, + ) + + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + _inject_context, + structlog.processors.JSONRenderer(), + ], + wrapper_class=structlog.make_filtering_bound_logger(level), + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def get_logger(name: str | 
None = None): + return structlog.get_logger(name) diff --git a/services/auth/src/main.py b/services/auth/src/main.py index 0d3d86c4..e62316cb 100644 --- a/services/auth/src/main.py +++ b/services/auth/src/main.py @@ -4,11 +4,13 @@ from __future__ import annotations from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse +from prometheus_fastapi_instrumentator import Instrumentator from slowapi.errors import RateLimitExceeded from slowapi.middleware import SlowAPIMiddleware from starlette.middleware.sessions import SessionMiddleware from .config import get_settings +from .logging_setup import configure_logging from .rate_limit import limiter from .routes import oauth, session, users @@ -18,6 +20,7 @@ def _rate_limit_handler(request, exc: RateLimitExceeded): def create_app() -> FastAPI: + configure_logging() settings = get_settings() app = FastAPI(title="samosaChaat Auth", version="0.1.0") @@ -44,6 +47,8 @@ def create_app() -> FastAPI: async def health(): return {"status": "ok"} + Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False) + return app diff --git a/services/chat-api/pyproject.toml b/services/chat-api/pyproject.toml index 07df9eb6..1d953516 100644 --- a/services/chat-api/pyproject.toml +++ b/services/chat-api/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "sse-starlette>=2.1.3", "structlog>=24.4.0", "cachetools>=5.5.0", + "prometheus-fastapi-instrumentator>=7.0.0", ] [dependency-groups] diff --git a/services/chat-api/src/main.py b/services/chat-api/src/main.py index bdb2608f..9bb0ab1c 100644 --- a/services/chat-api/src/main.py +++ b/services/chat-api/src/main.py @@ -6,6 +6,7 @@ from contextlib import asynccontextmanager import httpx from fastapi import FastAPI, Request, Response from fastapi.middleware.cors import CORSMiddleware +from prometheus_fastapi_instrumentator import Instrumentator from .config import get_settings from .logging_setup import ( @@ 
-78,6 +79,8 @@ def create_app() -> FastAPI: async def health(): return {"status": "ok", "ready": True, "service": "chat-api"} + Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False) + return app diff --git a/services/inference/pyproject.toml b/services/inference/pyproject.toml index 500104a3..38fed545 100644 --- a/services/inference/pyproject.toml +++ b/services/inference/pyproject.toml @@ -16,6 +16,8 @@ dependencies = [ "torch==2.9.1", "transformers>=4.57.3", "uvicorn>=0.36.0", + "structlog>=24.4.0", + "prometheus-fastapi-instrumentator>=7.0.0", ] [dependency-groups] diff --git a/services/inference/src/config.py b/services/inference/src/config.py index 694c16a7..a2b8f72c 100644 --- a/services/inference/src/config.py +++ b/services/inference/src/config.py @@ -29,6 +29,7 @@ class Settings(BaseSettings): default_temperature: float = 0.8 default_top_k: int = 50 default_max_tokens: int = 512 + log_level: str = "INFO" @property def resolved_device_type(self) -> str: diff --git a/services/inference/src/logging_setup.py b/services/inference/src/logging_setup.py new file mode 100644 index 00000000..c6bc1d40 --- /dev/null +++ b/services/inference/src/logging_setup.py @@ -0,0 +1,78 @@ +"""Structured JSON logging for the inference service. + +Mirrors the canonical implementation in services/chat-api/src/logging_setup.py +so every service emits the same JSON shape (see contracts/logging-standard.md). 
+""" +from __future__ import annotations + +import logging +import sys +import uuid +from contextvars import ContextVar + +import structlog + +from config import get_settings + +_trace_id_ctx: ContextVar[str | None] = ContextVar("trace_id", default=None) +_user_id_ctx: ContextVar[str | None] = ContextVar("user_id", default=None) + + +def set_trace_id(trace_id: str | None) -> None: + _trace_id_ctx.set(trace_id) + + +def set_user_id(user_id: str | None) -> None: + _user_id_ctx.set(user_id) + + +def get_trace_id() -> str | None: + return _trace_id_ctx.get() + + +def get_user_id() -> str | None: + return _user_id_ctx.get() + + +def new_trace_id() -> str: + return uuid.uuid4().hex + + +def _inject_context(_logger, _method, event_dict): + event_dict.setdefault("service", "inference") + trace_id = _trace_id_ctx.get() + if trace_id is not None: + event_dict.setdefault("trace_id", trace_id) + user_id = _user_id_ctx.get() + if user_id is not None: + event_dict.setdefault("user_id", user_id) + return event_dict + + +def configure_logging() -> None: + settings = get_settings() + level = getattr(logging, settings.log_level.upper(), logging.INFO) + + logging.basicConfig( + format="%(message)s", + stream=sys.stdout, + level=level, + force=True, + ) + + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + _inject_context, + structlog.processors.JSONRenderer(), + ], + wrapper_class=structlog.make_filtering_bound_logger(level), + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def get_logger(name: str | None = None): + return structlog.get_logger(name) diff --git a/services/inference/src/main.py b/services/inference/src/main.py index 186c7e10..5c3982e5 100644 --- a/services/inference/src/main.py +++ b/services/inference/src/main.py @@ -2,20 +2,21 @@ from __future__ import annotations import asyncio import json -import 
logging import random from contextlib import asynccontextmanager from typing import AsyncGenerator from fastapi import Depends, FastAPI, HTTPException, Request, status from fastapi.responses import JSONResponse, StreamingResponse +from prometheus_fastapi_instrumentator import Instrumentator from pydantic import BaseModel from config import Settings, get_settings +from logging_setup import configure_logging, get_logger from middleware.internal_auth import require_internal_api_key from services.weight_manager import WeightManager -logger = logging.getLogger(__name__) +logger = get_logger(__name__) # Abuse prevention limits MAX_MESSAGES_PER_REQUEST = 500 @@ -169,7 +170,7 @@ class InferenceRuntime: step=self.settings.default_step, ) except Exception as exc: # pragma: no cover - exercised by deployment conditions - logger.warning("Skipping startup model load: %s", exc) + logger.warning("skipping startup model load", error=str(exc)) self.worker_pool = None async def shutdown(self) -> None: @@ -236,6 +237,7 @@ def get_runtime(request: Request) -> InferenceRuntime: def create_app(settings: Settings | None = None, runtime: InferenceRuntime | None = None) -> FastAPI: + configure_logging() resolved_settings = settings or get_settings() @asynccontextmanager @@ -285,6 +287,8 @@ def create_app(settings: Settings | None = None, runtime: InferenceRuntime | Non async def stats(runtime: InferenceRuntime = Depends(get_runtime)): return runtime.stats_payload() + Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False) + return app