From aa0818aae2b95257d3f63cc3b22f4f610613692c Mon Sep 17 00:00:00 2001 From: Manmohan Sharma Date: Thu, 16 Apr 2026 12:29:16 -0700 Subject: [PATCH] feat(observability): Prometheus + Grafana + Loki stack for samosaChaat (#9) Replaces the helm/observability scaffold with a real monitoring stack wired into the samosaChaat platform. Helm chart (helm/observability/) - Chart.yaml declares kube-prometheus-stack (~62.0) and loki-stack (~2.10) as subchart dependencies. - values.yaml configures Prometheus (15d retention, 50Gi PVC, ServiceMonitor + rule selector on app.kubernetes.io/part-of: samosachaat), Alertmanager (10Gi PVC), Grafana (OAuth-only via GitHub + Google, local login disabled, Prometheus + Loki datasources, dashboards auto-provisioned from a ConfigMap, email + Slack contact points with a critical route to Slack), Loki (50Gi, 30d retention, tsdb schema), and Promtail (JSON pipeline that lifts level / service / trace_id / user_id into labels, scrape config with pod labels). - Alert rules: HighCPU, HighMemory, DiskSpaceLow, High5xxRate, InferenceServiceDown, HighP99Latency. - templates/grafana-dashboards-configmap.yaml renders every file under dashboards/ into a single grafana_dashboard=1 ConfigMap. - dashboards/node-health.json, app-performance.json, inference.json - fully-formed Grafana dashboards with Prometheus datasource variable, templated app selector, thresholded gauges, and LogQL-ready labels. Scraping (helm/samosachaat/templates/servicemonitor.yaml) - ServiceMonitor CRs for auth / chat-api / inference that Prometheus picks up via the part-of=samosachaat selector; scrapes /metrics every 15s and replaces the app label so dashboards line up. Application instrumentation - services/{auth,chat-api,inference} each depend on prometheus-fastapi-instrumentator and expose /metrics (request count, latency histograms, in-progress gauges). 
- services/auth/src/logging_setup.py and services/inference/src/logging_setup.py mirror the canonical chat-api implementation - structlog JSON with service, trace_id, user_id context injection. - configure_logging() is called at create_app() in auth and inference; inference's main.py now uses structlog via get_logger() instead of logging.getLogger. - log_level setting added to auth + inference config (LOG_LEVEL env). Docs - contracts/logging-standard.md defines the required JSON fields, Python (structlog) + Node.js (pino) implementations, LogQL examples for cross-service queries, and the x-trace-id propagation contract. Closes #9 Co-Authored-By: Claude Opus 4.7 (1M context) --- contracts/logging-standard.md | 110 ++++++ helm/observability/Chart.yaml | 15 +- .../dashboards/app-performance.json | 214 ++++++++++++ helm/observability/dashboards/inference.json | 234 +++++++++++++ .../observability/dashboards/node-health.json | 297 ++++++++++++++++ helm/observability/templates/NOTES.txt | 24 +- .../grafana-dashboards-configmap.yaml | 21 ++ helm/observability/values.yaml | 321 +++++++++++++++++- .../samosachaat/templates/servicemonitor.yaml | 40 +++ services/auth/pyproject.toml | 2 + services/auth/src/config.py | 2 + services/auth/src/logging_setup.py | 78 +++++ services/auth/src/main.py | 5 + services/chat-api/pyproject.toml | 1 + services/chat-api/src/main.py | 3 + services/inference/pyproject.toml | 2 + services/inference/src/config.py | 1 + services/inference/src/logging_setup.py | 78 +++++ services/inference/src/main.py | 10 +- 19 files changed, 1444 insertions(+), 14 deletions(-) create mode 100644 contracts/logging-standard.md create mode 100644 helm/observability/dashboards/app-performance.json create mode 100644 helm/observability/dashboards/inference.json create mode 100644 helm/observability/dashboards/node-health.json create mode 100644 helm/observability/templates/grafana-dashboards-configmap.yaml create mode 100644 
helm/samosachaat/templates/servicemonitor.yaml create mode 100644 services/auth/src/logging_setup.py create mode 100644 services/inference/src/logging_setup.py diff --git a/contracts/logging-standard.md b/contracts/logging-standard.md new file mode 100644 index 00000000..8292fa16 --- /dev/null +++ b/contracts/logging-standard.md @@ -0,0 +1,110 @@ +# samosaChaat Logging Standard + +All services in the samosaChaat platform emit logs as **single-line JSON** +on stdout. Promtail ships them to Loki, where Grafana queries them by label +and by JSON field. Because every service shares the same schema, a single +trace_id lets you follow a request from the frontend through auth → chat-api +→ inference. + +## Required fields + +Every log line MUST include: + +| Field | Type | Source | +|-------------|---------|------------------------------------------| +| `timestamp` | ISO8601 | structlog `TimeStamper(fmt="iso")` | +| `level` | string | `debug` / `info` / `warning` / `error` | +| `service` | string | hard-coded per service (`auth`, `chat-api`, `inference`, `frontend`) | +| `message` | string | the human-readable event (`event` key in structlog) | + +Conditionally included (when present in the request context): + +| Field | When to include | +|--------------|------------------------------------------| +| `trace_id` | every request served by a backend service — propagated via the `x-trace-id` header | +| `user_id` | every request authenticated as a user | +| `inference_time_ms` | emitted by chat-api and inference around model calls | +| `error` | on exceptions — the stringified cause | + +Anything else is free-form structured context (`method`, `path`, +`status_code`, `model_tag`, …). Keep keys `snake_case`. + +## Python implementation — `structlog` + +The canonical setup lives at `services/chat-api/src/logging_setup.py`. +`services/auth/src/logging_setup.py` and `services/inference/src/logging_setup.py` +mirror it, differing only in the hard-coded `service` value. 
+ +Key pieces: + +```python +structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, # trace_id / user_id from context + structlog.processors.add_log_level, # -> level field + structlog.processors.TimeStamper(fmt="iso", utc=True), + _inject_context, # service + trace_id + user_id defaults + structlog.processors.JSONRenderer(), # final JSON line + ], + ... +) +``` + +Each service calls `configure_logging()` once at startup (inside +`create_app()`), and then uses `logger = get_logger(__name__)` everywhere. +`trace_id` is set by a FastAPI middleware that reads the incoming +`x-trace-id` header (or mints a new one via `new_trace_id()`) and propagates +it to downstream calls. + +## Node.js implementation — `pino` (frontend) + +The Next.js frontend should log JSON with the same schema. Reference config: + +```ts +// services/frontend/lib/logger.ts +import pino from "pino"; + +export const logger = pino({ + base: { service: "frontend" }, + timestamp: pino.stdTimeFunctions.isoTime, + formatters: { + level: (label) => ({ level: label }), // keep the string level + }, + messageKey: "message", +}); +``` + +When emitting, always include `trace_id` and `user_id` when known: + +```ts +logger.info({ trace_id, user_id, path: req.url }, "request_start"); +``` + +In API routes, read the incoming `x-trace-id` header and echo it back on the +response so client-side traces can join up. + +## Cross-service querying (LogQL / Grafana Explore) + +Labels Promtail applies: `namespace`, `app`, `pod`, `level`, `service`, +`container`. Everything else is a JSON field — use `| json` to extract it. 
| Goal | Query | +|-------------------------------|-----------------------------------------------------------------------| +| All errors in prod | `{namespace="samosachaat-prod"} \| json \| level="error"` | +| Trace a request across tiers | `{namespace="samosachaat-prod"} \| json \| trace_id="<trace-id>"` | +| Auth failures | `{app="auth"} \| json \| level="error"` | +| Slow inference calls | `{app="inference"} \| json \| inference_time_ms > 5000` | +| 5xx by service | `{namespace="samosachaat-prod"} \| json \| status_code >= 500` | +| Rate-limited OAuth logins | `{app="auth"} \| json \| path=~"/auth/oauth/.*" \| status_code=429` | + +## Trace propagation contract + +1. **Frontend** — mint `trace_id` on navigation (or reuse an existing one from + the current session), send it as `x-trace-id` on every `fetch` to the API. +2. **chat-api** — read `x-trace-id`, store in a context var, re-emit on the + response, and forward it on every httpx call to auth or inference. +3. **auth / inference** — read `x-trace-id` and bind it to the logger context + for the duration of the request. + +Services MUST NOT log raw secrets, JWTs, OAuth client secrets, API keys, or +full user message text. Log IDs, lengths, and booleans — not contents. diff --git a/helm/observability/Chart.yaml b/helm/observability/Chart.yaml index 071fb150..07a81f76 100644 --- a/helm/observability/Chart.yaml +++ b/helm/observability/Chart.yaml @@ -1,6 +1,17 @@ apiVersion: v2 name: observability -description: Observability chart scaffold for Grafana, Prometheus, and Loki. +description: | + Observability stack for the samosaChaat platform: kube-prometheus-stack + (Prometheus + Grafana + Alertmanager + node/kube-state exporters) and + loki-stack (Loki + Promtail) for metrics, alerts, dashboards, and logs. 
type: application version: 0.1.0 -appVersion: "0.1.0" +appVersion: "1.0.0" + +dependencies: + - name: kube-prometheus-stack + version: "~62.0" + repository: https://prometheus-community.github.io/helm-charts + - name: loki-stack + version: "~2.10" + repository: https://grafana.github.io/helm-charts diff --git a/helm/observability/dashboards/app-performance.json b/helm/observability/dashboards/app-performance.json new file mode 100644 index 00000000..6bce5e75 --- /dev/null +++ b/helm/observability/dashboards/app-performance.json @@ -0,0 +1,214 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "grafana", "uid": "-- Grafana --"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "liveNow": false, + "panels": [ + { + "type": "row", + "title": "Traffic", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "id": 100, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Requests per second by service", + "id": 1, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 1}, + "fieldConfig": {"defaults": {"unit": "reqps"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\"}[5m]))", + "legendFormat": "{{app}}" + } + ] + }, + { + "type": "row", + "title": "Latency", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 9}, + "id": 101, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "p50 / p95 / p99 latency", + "id": 2, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 10, "w": 24, "x": 0, "y": 10}, + 
"fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "options": {"legend": {"displayMode": "table", "placement": "bottom", "calcs": ["mean", "lastNotNull"]}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.50, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))", + "legendFormat": "p50 {{app}}" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.95, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))", + "legendFormat": "p95 {{app}}" + }, + { + "refId": "C", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.99, sum by (le, app) (rate(http_request_duration_seconds_bucket{app=~\"$app\"}[5m])))", + "legendFormat": "p99 {{app}}" + } + ] + }, + { + "type": "row", + "title": "Errors", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 20}, + "id": 102, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Error rate by service (4xx + 5xx, stacked)", + "id": 3, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 21}, + "fieldConfig": { + "defaults": { + "unit": "reqps", + "custom": {"stacking": {"mode": "normal"}, "fillOpacity": 25} + }, + "overrides": [] + }, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\",status=~\"4..\"}[5m]))", + "legendFormat": "4xx {{app}}" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (app) (rate(http_requests_total{app=~\"$app\",status=~\"5..\"}[5m]))", + "legendFormat": "5xx {{app}}" + 
} + ] + }, + { + "type": "row", + "title": "Availability", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 29}, + "id": 103, + "collapsed": false, + "panels": [] + }, + { + "type": "stat", + "title": "Uptime % (last 24h) by service", + "id": 4, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 30}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "red", "value": null}, + {"color": "yellow", "value": 0.95}, + {"color": "green", "value": 0.99} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "avg_over_time(up{app=~\"$app\"}[24h])", + "legendFormat": "{{app}}" + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": ["samosachaat", "application"], + "templating": { + "list": [ + { + "current": {"selected": false, "text": "Prometheus", "value": "prometheus"}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": {"selected": true, "text": "All", "value": "$__all"}, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "definition": "label_values(http_requests_total, app)", + "hide": 0, + "includeAll": true, + "label": "app", + "multi": true, + "name": "app", + "options": [], + "query": {"qryType": 1, "query": "label_values(http_requests_total, app)", "refId": "app"}, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + } + ] + }, + "time": 
{"from": "now-6h", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "samosaChaat — Application Performance", + "uid": "samosachaat-app-performance", + "version": 1, + "weekStart": "" +} diff --git a/helm/observability/dashboards/inference.json b/helm/observability/dashboards/inference.json new file mode 100644 index 00000000..ee9c5231 --- /dev/null +++ b/helm/observability/dashboards/inference.json @@ -0,0 +1,234 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "grafana", "uid": "-- Grafana --"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "liveNow": false, + "panels": [ + { + "type": "row", + "title": "Latency", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "id": 100, + "collapsed": false, + "panels": [] + }, + { + "type": "heatmap", + "title": "Inference duration heatmap", + "id": 1, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 9, "w": 18, "x": 0, "y": 1}, + "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "options": { + "calculate": false, + "cellGap": 1, + "color": {"mode": "scheme", "scheme": "Oranges", "steps": 64, "exponent": 0.5}, + "yAxis": {"unit": "s"}, + "tooltip": {"show": true} + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (le) (rate(inference_duration_seconds_bucket[5m]))", + "format": "heatmap", + "legendFormat": "{{le}}" + } + ] + }, + { + "type": "stat", + "title": "Current p50 / p95", + "id": 2, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 9, "w": 6, "x": 18, "y": 1}, + "fieldConfig": {"defaults": {"unit": "s"}, "overrides": []}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": 
"vertical", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.50, sum by (le) (rate(inference_duration_seconds_bucket[5m])))", + "legendFormat": "p50" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "histogram_quantile(0.95, sum by (le) (rate(inference_duration_seconds_bucket[5m])))", + "legendFormat": "p95" + } + ] + }, + { + "type": "row", + "title": "Throughput", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 10}, + "id": 101, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Tokens generated per second", + "id": 3, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 11}, + "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(rate(tokens_generated_total[1m]))", + "legendFormat": "tokens/s" + } + ] + }, + { + "type": "row", + "title": "Workers", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 19}, + "id": 102, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "Worker state", + "id": 4, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 20}, + "fieldConfig": { + "defaults": { + "unit": "short", + "min": 0 + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_total)", + 
"legendFormat": "total" + }, + { + "refId": "B", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_busy)", + "legendFormat": "busy" + }, + { + "refId": "C", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_available)", + "legendFormat": "available" + } + ] + }, + { + "type": "timeseries", + "title": "Worker utilization over time (busy / total)", + "id": 5, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 20}, + "fieldConfig": {"defaults": {"unit": "percentunit", "min": 0, "max": 1}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(workers_busy) / clamp_min(sum(workers_total), 1)", + "legendFormat": "utilization" + } + ] + }, + { + "type": "row", + "title": "Active", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 28}, + "id": 103, + "collapsed": false, + "panels": [] + }, + { + "type": "stat", + "title": "Active generations", + "id": 6, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 29}, + "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "textMode": "value_and_name", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum(active_generations)", + "legendFormat": "active" + } + ] + } + ], + "refresh": "15s", + "schemaVersion": 39, + "style": "dark", + "tags": ["samosachaat", "inference"], + "templating": { + "list": [ + { + "current": {"selected": false, "text": "Prometheus", "value": "prometheus"}, + "hide": 0, + "includeAll": false, + 
"label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + } + ] + }, + "time": {"from": "now-1h", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "samosaChaat — Inference Service", + "uid": "samosachaat-inference", + "version": 1, + "weekStart": "" +} diff --git a/helm/observability/dashboards/node-health.json b/helm/observability/dashboards/node-health.json new file mode 100644 index 00000000..29ca802c --- /dev/null +++ b/helm/observability/dashboards/node-health.json @@ -0,0 +1,297 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": {"type": "grafana", "uid": "-- Grafana --"}, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": null, + "liveNow": false, + "panels": [ + { + "type": "row", + "title": "CPU", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 0}, + "id": 100, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "CPU usage by node (avg %)", + "id": 1, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 1}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.6}, + {"color": "red", "value": 0.8} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "avg by (instance) 
(rate(node_cpu_seconds_total{mode!=\"idle\"}[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "CPU usage by node (1h)", + "id": 2, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 1}, + "fieldConfig": { + "defaults": {"unit": "percentunit", "min": 0, "max": 1}, + "overrides": [] + }, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "avg by (instance) (rate(node_cpu_seconds_total{mode!=\"idle\"}[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "title": "Memory", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 9}, + "id": 101, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "Memory usage by node (%)", + "id": 3, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 10}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.7}, + {"color": "red", "value": 0.85} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "Memory usage over time", + "id": 4, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 10}, + "fieldConfig": {"defaults": {"unit": "percentunit", 
"min": 0, "max": 1}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "title": "Disk", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 18}, + "id": 102, + "collapsed": false, + "panels": [] + }, + { + "type": "gauge", + "title": "Filesystem usage by mountpoint", + "id": 5, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 24, "x": 0, "y": 19}, + "fieldConfig": { + "defaults": { + "unit": "percentunit", + "min": 0, + "max": 1, + "thresholds": { + "mode": "absolute", + "steps": [ + {"color": "green", "value": null}, + {"color": "yellow", "value": 0.75}, + {"color": "red", "value": 0.9} + ] + } + }, + "overrides": [] + }, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "(1 - node_filesystem_avail_bytes{fstype!=\"tmpfs\"} / node_filesystem_size_bytes{fstype!=\"tmpfs\"})", + "legendFormat": "{{instance}} {{mountpoint}}" + } + ] + }, + { + "type": "row", + "title": "Network", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 27}, + "id": 103, + "collapsed": false, + "panels": [] + }, + { + "type": "timeseries", + "title": "Network received (bytes/s)", + "id": 6, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 28}, + "fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + 
"datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (instance) (rate(node_network_receive_bytes_total[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "timeseries", + "title": "Network transmitted (bytes/s)", + "id": 7, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 8, "w": 12, "x": 12, "y": 28}, + "fieldConfig": {"defaults": {"unit": "Bps"}, "overrides": []}, + "options": {"legend": {"displayMode": "list", "placement": "bottom"}, "tooltip": {"mode": "multi"}}, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "sum by (instance) (rate(node_network_transmit_bytes_total[5m]))", + "legendFormat": "{{instance}}" + } + ] + }, + { + "type": "row", + "title": "Pods", + "gridPos": {"h": 1, "w": 24, "x": 0, "y": 36}, + "id": 104, + "collapsed": false, + "panels": [] + }, + { + "type": "stat", + "title": "Pods per node", + "id": 8, + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "gridPos": {"h": 6, "w": 24, "x": 0, "y": 37}, + "fieldConfig": {"defaults": {"unit": "short"}, "overrides": []}, + "options": { + "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false}, + "orientation": "auto", + "textMode": "auto", + "colorMode": "value", + "graphMode": "area" + }, + "targets": [ + { + "refId": "A", + "datasource": {"type": "prometheus", "uid": "${DS_PROMETHEUS}"}, + "expr": "count by (node) (kube_pod_info)", + "legendFormat": "{{node}}" + } + ] + } + ], + "refresh": "30s", + "schemaVersion": 39, + "style": "dark", + "tags": ["samosachaat", "nodes"], + "templating": { + "list": [ + { + "current": {"selected": false, "text": "Prometheus", "value": "prometheus"}, + "hide": 0, + "includeAll": false, + "label": "Datasource", + "multi": false, + "name": "DS_PROMETHEUS", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": 
"datasource" + } + ] + }, + "time": {"from": "now-1h", "to": "now"}, + "timepicker": {}, + "timezone": "", + "title": "samosaChaat — Node Health", + "uid": "samosachaat-node-health", + "version": 1, + "weekStart": "" +} diff --git a/helm/observability/templates/NOTES.txt b/helm/observability/templates/NOTES.txt index 9e750d08..932cb7a3 100644 --- a/helm/observability/templates/NOTES.txt +++ b/helm/observability/templates/NOTES.txt @@ -1,4 +1,22 @@ -The observability chart scaffold is in place. +samosaChaat observability stack installed. -Add concrete manifests for Grafana, Prometheus, Loki, dashboards, and scrape -configuration as the platform monitoring stack is implemented. +Components: + * kube-prometheus-stack -> Prometheus, Alertmanager, Grafana, node-exporter, kube-state-metrics + * loki-stack -> Loki (logs) + Promtail (log shipping) + +Access: + Grafana is exposed at https://grafana.samosachaat.art via the samosaChaat + app-chart ingress. Login is OAuth-only (GitHub + Google); the local login form + is disabled. + +Required secrets in namespace {{ .Release.Namespace }}: + kubectl create secret generic grafana-oauth-secrets \ + --from-literal=GITHUB_GRAFANA_CLIENT_ID=... \ + --from-literal=GITHUB_GRAFANA_CLIENT_SECRET=... \ + --from-literal=GOOGLE_GRAFANA_CLIENT_ID=... \ + --from-literal=GOOGLE_GRAFANA_CLIENT_SECRET=... \ + --from-literal=SLACK_WEBHOOK_URL=... + +Useful in-cluster URLs: + Prometheus: http://prom-prometheus.{{ .Release.Namespace }}.svc:9090 + Loki: http://loki.{{ .Release.Namespace }}.svc:3100 diff --git a/helm/observability/templates/grafana-dashboards-configmap.yaml b/helm/observability/templates/grafana-dashboards-configmap.yaml new file mode 100644 index 00000000..3e54529c --- /dev/null +++ b/helm/observability/templates/grafana-dashboards-configmap.yaml @@ -0,0 +1,21 @@ +{{/* +ConfigMap that carries every dashboard JSON from helm/observability/dashboards/ +into Grafana. 
The kube-prometheus-stack Grafana deployment mounts ConfigMaps +listed under grafana.dashboardsConfigMaps (see values.yaml) at +/var/lib/grafana/dashboards/<provider-name>/, which the matching dashboardProvider picks +up and loads. +*/}} +apiVersion: v1 +kind: ConfigMap +metadata: + name: samosachaat-dashboards + namespace: {{ .Release.Namespace }} + labels: + grafana_dashboard: "1" + app.kubernetes.io/part-of: samosachaat + app.kubernetes.io/managed-by: {{ .Release.Service }} +data: +{{- range $path, $_ := .Files.Glob "dashboards/*.json" }} + {{ base $path }}: |- +{{ $.Files.Get $path | indent 4 }} +{{- end }} diff --git a/helm/observability/values.yaml b/helm/observability/values.yaml index 013a3646..9b651c81 100644 --- a/helm/observability/values.yaml +++ b/helm/observability/values.yaml @@ -1,8 +1,317 @@ -grafana: - enabled: true +## Values for the samosaChaat observability umbrella chart. +## +## This chart deploys kube-prometheus-stack (Prometheus, Grafana, Alertmanager, +## node-exporter, kube-state-metrics) and loki-stack (Loki + Promtail) into the +## cluster. Grafana is exposed at grafana.samosachaat.art via the main ingress +## in the samosachaat app chart (which routes to a ClusterIP service named +## "grafana" on port 3000 in the same namespace). -prometheus: - enabled: true +kube-prometheus-stack: + fullnameOverride: prom -loki: - enabled: true + ## ----------------------- Prometheus ----------------------- + prometheus: + prometheusSpec: + ## Only scrape ServiceMonitors that belong to the samosaChaat app. + serviceMonitorSelector: + matchLabels: + app.kubernetes.io/part-of: samosachaat + serviceMonitorSelectorNilUsesHelmValues: false + ## Also evaluate our custom PrometheusRules. + ruleSelector: + matchLabels: + app.kubernetes.io/part-of: samosachaat + ruleSelectorNilUsesHelmValues: false + ## Keep 15 days of metric data on a 50Gi PVC. 
+ retention: 15d + storageSpec: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 50Gi + resources: + requests: + cpu: 200m + memory: 1Gi + limits: + cpu: "1" + memory: 2Gi + + ## ----------------------- Alertmanager ----------------------- + alertmanager: + alertmanagerSpec: + storage: + volumeClaimTemplate: + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 10Gi + + ## ----------------------- Grafana ----------------------- + grafana: + enabled: true + ## The main samosaChaat ingress already routes grafana.samosachaat.art + ## to a ClusterIP service named "grafana" port 3000 in the same namespace. + ## Override the generated service name so it matches. + fullnameOverride: grafana + service: + type: ClusterIP + port: 3000 + ingress: + enabled: false + + ## OAuth2 — no username/password, only GitHub + Google sign-in. + ## The referenced env vars must be provided via Grafana's extraEnvVarsSecret + ## (see grafana.envFromSecret below) or set in the cluster's secret store. + grafana.ini: + server: + root_url: https://grafana.samosachaat.art + auth: + disable_login_form: true + "auth.github": + enabled: true + allow_sign_up: true + client_id: ${GITHUB_GRAFANA_CLIENT_ID} + client_secret: ${GITHUB_GRAFANA_CLIENT_SECRET} + scopes: user:email,read:org + auth_url: https://github.com/login/oauth/authorize + token_url: https://github.com/login/oauth/access_token + api_url: https://api.github.com/user + "auth.google": + enabled: true + allow_sign_up: true + client_id: ${GOOGLE_GRAFANA_CLIENT_ID} + client_secret: ${GOOGLE_GRAFANA_CLIENT_SECRET} + scopes: openid email profile + auth_url: https://accounts.google.com/o/oauth2/auth + token_url: https://accounts.google.com/o/oauth2/token + allowed_domains: gmail.com + + ## OAuth client IDs/secrets come from a Kubernetes secret mounted as env. 
+ ## Expected keys: GITHUB_GRAFANA_CLIENT_ID, GITHUB_GRAFANA_CLIENT_SECRET, + ## GOOGLE_GRAFANA_CLIENT_ID, GOOGLE_GRAFANA_CLIENT_SECRET, SLACK_WEBHOOK_URL. + envFromSecret: grafana-oauth-secrets + + ## Datasources — Prometheus (default) + Loki for logs. + datasources: + datasources.yaml: + apiVersion: 1 + datasources: + - name: Prometheus + type: prometheus + uid: prometheus + url: http://prom-kube-prometheus-stack-prometheus.{{ .Release.Namespace }}.svc:9090 + access: proxy + isDefault: true + - name: Loki + type: loki + uid: loki + url: http://loki.{{ .Release.Namespace }}.svc:3100 + access: proxy + jsonData: + maxLines: 1000 + + ## Auto-provision dashboards from the dashboards ConfigMap rendered by + ## templates/grafana-dashboards-configmap.yaml. + dashboardProviders: + dashboardproviders.yaml: + apiVersion: 1 + providers: + - name: samosachaat + orgId: 1 + folder: samosaChaat + type: file + disableDeletion: false + editable: true + options: + path: /var/lib/grafana/dashboards/samosachaat + + dashboardsConfigMaps: + samosachaat: samosachaat-dashboards + + ## Unified alerting — contact points (email + slack). + alerting: + contactpoints.yaml: + apiVersion: 1 + contactPoints: + - orgId: 1 + name: email-alerts + receivers: + - uid: email + type: email + settings: + addresses: manmohan659@gmail.com + - orgId: 1 + name: slack-alerts + receivers: + - uid: slack + type: slack + settings: + url: ${SLACK_WEBHOOK_URL} + policies.yaml: + apiVersion: 1 + policies: + - orgId: 1 + receiver: email-alerts + group_by: [alertname, severity] + routes: + - receiver: slack-alerts + matchers: + - severity = critical + + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: 500m + memory: 512Mi + + ## ----------------------- Alert rules ----------------------- + ## Prometheus will pick these up via the ruleSelector above (the generated + ## PrometheusRule gets labelled app.kubernetes.io/part-of: samosachaat). 
+ additionalPrometheusRulesMap: + samosachaat-alerts: + groups: + - name: node-health + rules: + - alert: HighCPU + expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[5m])) > 0.8 + for: 5m + labels: + severity: warning + annotations: + summary: "CPU > 80% on {{ $labels.instance }} for 5m" + description: "Node {{ $labels.instance }} has had CPU usage above 80% for 5 minutes." + + - alert: HighMemory + expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85 + for: 5m + labels: + severity: critical + annotations: + summary: "Memory > 85% on {{ $labels.instance }}" + description: "Node {{ $labels.instance }} memory usage is above 85%." + + - alert: DiskSpaceLow + expr: (1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) > 0.9 + for: 10m + labels: + severity: critical + annotations: + summary: "Disk > 90% on {{ $labels.instance }}:{{ $labels.mountpoint }}" + description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} is above 90% full." + + - name: application + rules: + - alert: High5xxRate + expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05 + for: 2m + labels: + severity: critical + annotations: + summary: "5xx error rate > 5% across services" + description: "Aggregate 5xx error ratio across all services has exceeded 5% for 2 minutes." + + - alert: InferenceServiceDown + expr: up{app="inference"} == 0 + for: 1m + labels: + severity: critical + annotations: + summary: "Inference service is DOWN" + description: "Prometheus cannot scrape the inference service /metrics endpoint." + + - alert: HighP99Latency + expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, app)) > 5 + for: 3m + labels: + severity: warning + annotations: + summary: "p99 latency > 5s for {{ $labels.app }}" + description: "Service {{ $labels.app }} p99 latency has exceeded 5s for 3 minutes." 
+ +## ----------------------- Loki stack ----------------------- +loki-stack: + loki: + enabled: true + persistence: + enabled: true + size: 50Gi + config: + limits_config: + retention_period: 720h + schema_config: + configs: + - from: "2024-01-01" + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: loki_index_ + period: 24h + + promtail: + enabled: true + config: + snippets: + pipelineStages: + - cri: {} + - json: + expressions: + level: level + service: service + trace_id: trace_id + user_id: user_id + - labels: + level: + service: + scrapeConfigs: | + - job_name: kubernetes-pods + pipeline_stages: + - cri: {} + - json: + expressions: + level: level + service: service + trace_id: trace_id + user_id: user_id + - labels: + level: + service: + kubernetes_sd_configs: + - role: pod + relabel_configs: + - source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_name + target_label: app + - source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_component + target_label: component + - source_labels: + - __meta_kubernetes_pod_label_app_kubernetes_io_part_of + target_label: part_of + - source_labels: + - __meta_kubernetes_pod_container_name + target_label: container + - action: replace + replacement: /var/log/pods/*$1/*.log + separator: / + source_labels: + - __meta_kubernetes_pod_uid + - __meta_kubernetes_pod_container_name + target_label: __path__ + + prometheus: + enabled: false + grafana: + enabled: false diff --git a/helm/samosachaat/templates/servicemonitor.yaml b/helm/samosachaat/templates/servicemonitor.yaml new file mode 100644 index 00000000..124366c3 --- /dev/null +++ b/helm/samosachaat/templates/servicemonitor.yaml @@ -0,0 +1,40 @@ +{{/* +ServiceMonitor CRs so the Prometheus instance deployed by the observability +chart discovers and scrapes the samosaChaat backend services. 
Prometheus +selects only ServiceMonitors labelled app.kubernetes.io/part-of=samosachaat +(see helm/observability/values.yaml -> prometheus.serviceMonitorSelector). + +Each backend Python service exposes /metrics via +prometheus-fastapi-instrumentator. The frontend (Next.js) is omitted until it +grows its own metrics endpoint. +*/}} +{{- $services := dict "auth" .Values.auth "chat-api" .Values.chatApi "inference" .Values.inference -}} +{{- range $svc, $cfg := $services }} +{{- if $cfg.enabled }} +--- +apiVersion: monitoring.coreos.com/v1 +kind: ServiceMonitor +metadata: + name: {{ $svc }} + namespace: {{ include "samosachaat.namespace" $ }} + labels: + {{- include "samosachaat.labels" $ | nindent 4 }} + app: {{ $svc }} +spec: + namespaceSelector: + matchNames: + - {{ include "samosachaat.namespace" $ }} + selector: + matchLabels: + {{- include "samosachaat.selectorLabels" (dict "root" $ "svc" $svc) | nindent 6 }} + endpoints: + - port: http + interval: 15s + path: /metrics + scheme: http + relabelings: + - action: replace + targetLabel: app + replacement: {{ $svc }} +{{- end }} +{{- end }} diff --git a/services/auth/pyproject.toml b/services/auth/pyproject.toml index c42bcc68..9f6ac217 100644 --- a/services/auth/pyproject.toml +++ b/services/auth/pyproject.toml @@ -19,6 +19,8 @@ dependencies = [ "cryptography>=43.0.0", "slowapi>=0.1.9", "python-multipart>=0.0.9", + "structlog>=24.4.0", + "prometheus-fastapi-instrumentator>=7.0.0", ] [dependency-groups] diff --git a/services/auth/src/config.py b/services/auth/src/config.py index 636b7f39..03453bc2 100644 --- a/services/auth/src/config.py +++ b/services/auth/src/config.py @@ -37,6 +37,8 @@ class Settings(BaseSettings): cookie_secure: bool = Field(default=False) cookie_domain: str | None = Field(default=None) + log_level: str = Field(default="INFO") + @property def refresh_cookie_name(self) -> str: return "samosachaat_refresh" diff --git a/services/auth/src/logging_setup.py b/services/auth/src/logging_setup.py new file 
mode 100644 index 00000000..b7404101 --- /dev/null +++ b/services/auth/src/logging_setup.py @@ -0,0 +1,78 @@ +"""Structured JSON logging for the auth service. + +Mirrors the canonical implementation in services/chat-api/src/logging_setup.py +so every service emits the same JSON shape (see contracts/logging-standard.md). +""" +from __future__ import annotations + +import logging +import sys +import uuid +from contextvars import ContextVar + +import structlog + +from .config import get_settings + +_trace_id_ctx: ContextVar[str | None] = ContextVar("trace_id", default=None) +_user_id_ctx: ContextVar[str | None] = ContextVar("user_id", default=None) + + +def set_trace_id(trace_id: str | None) -> None: + _trace_id_ctx.set(trace_id) + + +def set_user_id(user_id: str | None) -> None: + _user_id_ctx.set(user_id) + + +def get_trace_id() -> str | None: + return _trace_id_ctx.get() + + +def get_user_id() -> str | None: + return _user_id_ctx.get() + + +def new_trace_id() -> str: + return uuid.uuid4().hex + + +def _inject_context(_logger, _method, event_dict): + event_dict.setdefault("service", "auth") + trace_id = _trace_id_ctx.get() + if trace_id is not None: + event_dict.setdefault("trace_id", trace_id) + user_id = _user_id_ctx.get() + if user_id is not None: + event_dict.setdefault("user_id", user_id) + return event_dict + + +def configure_logging() -> None: + settings = get_settings() + level = getattr(logging, settings.log_level.upper(), logging.INFO) + + logging.basicConfig( + format="%(message)s", + stream=sys.stdout, + level=level, + force=True, + ) + + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + _inject_context, + structlog.processors.JSONRenderer(), + ], + wrapper_class=structlog.make_filtering_bound_logger(level), + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def get_logger(name: str | 
None = None): + return structlog.get_logger(name) diff --git a/services/auth/src/main.py b/services/auth/src/main.py index 0d3d86c4..e62316cb 100644 --- a/services/auth/src/main.py +++ b/services/auth/src/main.py @@ -4,11 +4,13 @@ from __future__ import annotations from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import JSONResponse +from prometheus_fastapi_instrumentator import Instrumentator from slowapi.errors import RateLimitExceeded from slowapi.middleware import SlowAPIMiddleware from starlette.middleware.sessions import SessionMiddleware from .config import get_settings +from .logging_setup import configure_logging from .rate_limit import limiter from .routes import oauth, session, users @@ -18,6 +20,7 @@ def _rate_limit_handler(request, exc: RateLimitExceeded): def create_app() -> FastAPI: + configure_logging() settings = get_settings() app = FastAPI(title="samosaChaat Auth", version="0.1.0") @@ -44,6 +47,8 @@ def create_app() -> FastAPI: async def health(): return {"status": "ok"} + Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False) + return app diff --git a/services/chat-api/pyproject.toml b/services/chat-api/pyproject.toml index 07df9eb6..1d953516 100644 --- a/services/chat-api/pyproject.toml +++ b/services/chat-api/pyproject.toml @@ -15,6 +15,7 @@ dependencies = [ "sse-starlette>=2.1.3", "structlog>=24.4.0", "cachetools>=5.5.0", + "prometheus-fastapi-instrumentator>=7.0.0", ] [dependency-groups] diff --git a/services/chat-api/src/main.py b/services/chat-api/src/main.py index bdb2608f..9bb0ab1c 100644 --- a/services/chat-api/src/main.py +++ b/services/chat-api/src/main.py @@ -6,6 +6,7 @@ from contextlib import asynccontextmanager import httpx from fastapi import FastAPI, Request, Response from fastapi.middleware.cors import CORSMiddleware +from prometheus_fastapi_instrumentator import Instrumentator from .config import get_settings from .logging_setup import ( @@ 
-78,6 +79,8 @@ def create_app() -> FastAPI: async def health(): return {"status": "ok", "ready": True, "service": "chat-api"} + Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False) + return app diff --git a/services/inference/pyproject.toml b/services/inference/pyproject.toml index 500104a3..38fed545 100644 --- a/services/inference/pyproject.toml +++ b/services/inference/pyproject.toml @@ -16,6 +16,8 @@ dependencies = [ "torch==2.9.1", "transformers>=4.57.3", "uvicorn>=0.36.0", + "structlog>=24.4.0", + "prometheus-fastapi-instrumentator>=7.0.0", ] [dependency-groups] diff --git a/services/inference/src/config.py b/services/inference/src/config.py index 694c16a7..a2b8f72c 100644 --- a/services/inference/src/config.py +++ b/services/inference/src/config.py @@ -29,6 +29,7 @@ class Settings(BaseSettings): default_temperature: float = 0.8 default_top_k: int = 50 default_max_tokens: int = 512 + log_level: str = "INFO" @property def resolved_device_type(self) -> str: diff --git a/services/inference/src/logging_setup.py b/services/inference/src/logging_setup.py new file mode 100644 index 00000000..c6bc1d40 --- /dev/null +++ b/services/inference/src/logging_setup.py @@ -0,0 +1,78 @@ +"""Structured JSON logging for the inference service. + +Mirrors the canonical implementation in services/chat-api/src/logging_setup.py +so every service emits the same JSON shape (see contracts/logging-standard.md). 
+""" +from __future__ import annotations + +import logging +import sys +import uuid +from contextvars import ContextVar + +import structlog + +from config import get_settings + +_trace_id_ctx: ContextVar[str | None] = ContextVar("trace_id", default=None) +_user_id_ctx: ContextVar[str | None] = ContextVar("user_id", default=None) + + +def set_trace_id(trace_id: str | None) -> None: + _trace_id_ctx.set(trace_id) + + +def set_user_id(user_id: str | None) -> None: + _user_id_ctx.set(user_id) + + +def get_trace_id() -> str | None: + return _trace_id_ctx.get() + + +def get_user_id() -> str | None: + return _user_id_ctx.get() + + +def new_trace_id() -> str: + return uuid.uuid4().hex + + +def _inject_context(_logger, _method, event_dict): + event_dict.setdefault("service", "inference") + trace_id = _trace_id_ctx.get() + if trace_id is not None: + event_dict.setdefault("trace_id", trace_id) + user_id = _user_id_ctx.get() + if user_id is not None: + event_dict.setdefault("user_id", user_id) + return event_dict + + +def configure_logging() -> None: + settings = get_settings() + level = getattr(logging, settings.log_level.upper(), logging.INFO) + + logging.basicConfig( + format="%(message)s", + stream=sys.stdout, + level=level, + force=True, + ) + + structlog.configure( + processors=[ + structlog.contextvars.merge_contextvars, + structlog.processors.add_log_level, + structlog.processors.TimeStamper(fmt="iso", utc=True), + _inject_context, + structlog.processors.JSONRenderer(), + ], + wrapper_class=structlog.make_filtering_bound_logger(level), + logger_factory=structlog.stdlib.LoggerFactory(), + cache_logger_on_first_use=True, + ) + + +def get_logger(name: str | None = None): + return structlog.get_logger(name) diff --git a/services/inference/src/main.py b/services/inference/src/main.py index 186c7e10..5c3982e5 100644 --- a/services/inference/src/main.py +++ b/services/inference/src/main.py @@ -2,20 +2,21 @@ from __future__ import annotations import asyncio import json -import 
logging import random from contextlib import asynccontextmanager from typing import AsyncGenerator from fastapi import Depends, FastAPI, HTTPException, Request, status from fastapi.responses import JSONResponse, StreamingResponse +from prometheus_fastapi_instrumentator import Instrumentator from pydantic import BaseModel from config import Settings, get_settings +from logging_setup import configure_logging, get_logger from middleware.internal_auth import require_internal_api_key from services.weight_manager import WeightManager -logger = logging.getLogger(__name__) +logger = get_logger(__name__) # Abuse prevention limits MAX_MESSAGES_PER_REQUEST = 500 @@ -169,7 +170,7 @@ class InferenceRuntime: step=self.settings.default_step, ) except Exception as exc: # pragma: no cover - exercised by deployment conditions - logger.warning("Skipping startup model load: %s", exc) + logger.warning("skipping startup model load", error=str(exc)) self.worker_pool = None async def shutdown(self) -> None: @@ -236,6 +237,7 @@ def get_runtime(request: Request) -> InferenceRuntime: def create_app(settings: Settings | None = None, runtime: InferenceRuntime | None = None) -> FastAPI: + configure_logging() resolved_settings = settings or get_settings() @asynccontextmanager @@ -285,6 +287,8 @@ def create_app(settings: Settings | None = None, runtime: InferenceRuntime | Non async def stats(runtime: InferenceRuntime = Depends(get_runtime)): return runtime.stats_payload() + Instrumentator().instrument(app).expose(app, endpoint="/metrics", include_in_schema=False) + return app