mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-15 12:17:33 +00:00
Replaces the helm/observability scaffold with a real monitoring stack
wired into the samosaChaat platform.
Helm chart (helm/observability/)
- Chart.yaml declares kube-prometheus-stack (~62.0) and loki-stack
(~2.10) as subchart dependencies.
- values.yaml configures Prometheus (15d retention, 50Gi PVC,
ServiceMonitor + rule selector on app.kubernetes.io/part-of:
samosachaat), Alertmanager (10Gi PVC), Grafana (OAuth-only via
GitHub + Google, local login disabled, Prometheus + Loki datasources,
dashboards auto-provisioned from a ConfigMap, email + Slack contact
points with a critical route to Slack), Loki (50Gi, 30d retention,
tsdb schema), and Promtail (JSON pipeline that lifts level / service
/ trace_id / user_id into labels, scrape config with pod labels).
- Alert rules: HighCPU, HighMemory, DiskSpaceLow, High5xxRate,
InferenceServiceDown, HighP99Latency.
- templates/grafana-dashboards-configmap.yaml renders every file under
dashboards/ into a single grafana_dashboard=1 ConfigMap.
- dashboards/node-health.json, app-performance.json, inference.json -
fully-formed Grafana dashboards with Prometheus datasource variable,
templated app selector, thresholded gauges, and LogQL-ready labels.
Scraping (helm/samosachaat/templates/servicemonitor.yaml)
- ServiceMonitor CRs for auth / chat-api / inference that Prometheus
picks up via the part-of=samosachaat selector; scrapes /metrics
every 15s and replaces the app label so dashboards line up.
Application instrumentation
- services/{auth,chat-api,inference} each depend on
prometheus-fastapi-instrumentator and expose /metrics (request count,
latency histograms, in-progress gauges).
- services/auth/src/logging_setup.py and
services/inference/src/logging_setup.py mirror the canonical
chat-api implementation - structlog JSON with service, trace_id,
user_id context injection.
- configure_logging() is called at create_app() in auth and inference;
inference's main.py now uses structlog via get_logger() instead of
logging.getLogger.
- log_level setting added to auth + inference config (LOG_LEVEL env).
Docs
- contracts/logging-standard.md defines the required JSON fields,
Python (structlog) + Node.js (pino) implementations, LogQL examples
for cross-service queries, and the x-trace-id propagation contract.
Closes #9
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
79 lines
2.0 KiB
Python
79 lines
2.0 KiB
Python
"""Structured JSON logging for the inference service.
|
|
|
|
Mirrors the canonical implementation in services/chat-api/src/logging_setup.py
so every service emits the same JSON shape (see contracts/logging-standard.md).
|
|
"""
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import sys
|
|
import uuid
|
|
from contextvars import ContextVar
|
|
|
|
import structlog
|
|
|
|
from config import get_settings
|
|
|
|
# Request-scoped metadata that gets attached to log events. ContextVars keep
# each value local to the current execution context; both default to None,
# meaning "nothing bound yet".
_trace_id_ctx: ContextVar[str | None] = ContextVar("trace_id", default=None)
_user_id_ctx: ContextVar[str | None] = ContextVar("user_id", default=None)
|
|
|
|
|
|
def set_trace_id(trace_id: str | None) -> None:
    """Bind *trace_id* to the current context.

    Args:
        trace_id: Value to store in the trace-id context variable. ``None``
            is stored as-is, which makes ``get_trace_id()`` return ``None``.
    """
    # The Token returned by ContextVar.set() is intentionally discarded; this
    # API offers no "reset to previous value" operation.
    _trace_id_ctx.set(trace_id)
|
|
|
|
|
|
def set_user_id(user_id: str | None) -> None:
    """Bind *user_id* to the current context.

    Args:
        user_id: Value to store in the user-id context variable. ``None``
            is stored as-is, which makes ``get_user_id()`` return ``None``.
    """
    # The Token returned by ContextVar.set() is intentionally discarded; this
    # API offers no "reset to previous value" operation.
    _user_id_ctx.set(user_id)
|
|
|
|
|
|
def get_trace_id() -> str | None:
    """Return the trace id bound to the current context, or ``None`` if unset."""
    return _trace_id_ctx.get()
|
|
|
|
|
|
def get_user_id() -> str | None:
    """Return the user id bound to the current context, or ``None`` if unset."""
    return _user_id_ctx.get()
|
|
|
|
|
|
def new_trace_id() -> str:
    """Return a fresh random trace id.

    Returns:
        A UUID4 rendered as 32 lowercase hexadecimal characters (no dashes).
    """
    return uuid.uuid4().hex
|
|
|
|
|
|
def _inject_context(_logger, _method, event_dict):
    """structlog processor that stamps service/trace/user fields on an event.

    Args:
        _logger: Unused; present to satisfy the processor call signature.
        _method: Unused; present to satisfy the processor call signature.
        event_dict: The event being built; mutated in place.

    Returns:
        The same ``event_dict``, with ``service`` always present and
        ``trace_id`` / ``user_id`` added when a value is bound in the
        current context.
    """
    # setdefault everywhere: a value supplied explicitly by the caller (or by
    # an earlier processor) wins over the ambient context value.
    event_dict.setdefault("service", "inference")

    trace_id = _trace_id_ctx.get()
    if trace_id is not None:
        event_dict.setdefault("trace_id", trace_id)

    user_id = _user_id_ctx.get()
    if user_id is not None:
        event_dict.setdefault("user_id", user_id)

    return event_dict
|
|
|
|
|
|
def configure_logging() -> None:
    """Configure stdlib logging and structlog to emit JSON lines on stdout.

    Reads ``log_level`` from ``get_settings()``, resets the root stdlib
    logger to write bare messages to stdout at that level, and installs the
    structlog processor chain that renders each event as JSON.

    An unrecognised ``log_level`` falls back to ``INFO``.
    """
    settings = get_settings()

    # Map the configured name (e.g. "debug") to the stdlib numeric level.
    # getattr alone is not sufficient: the logging module also exposes
    # upper-case attributes that are not levels (BASIC_FORMAT is a str), so
    # anything that does not resolve to an int is treated as unknown.
    level = getattr(logging, settings.log_level.upper(), logging.INFO)
    if not isinstance(level, int):
        level = logging.INFO

    logging.basicConfig(
        # structlog renders the full JSON line; stdlib must not decorate it.
        format="%(message)s",
        stream=sys.stdout,
        level=level,
        # Replace any handlers already attached to the root logger.
        force=True,
    )

    structlog.configure(
        processors=[
            structlog.contextvars.merge_contextvars,
            structlog.processors.add_log_level,
            structlog.processors.TimeStamper(fmt="iso", utc=True),
            _inject_context,
            # Must stay last: turns the event dict into the final JSON string.
            structlog.processors.JSONRenderer(),
        ],
        wrapper_class=structlog.make_filtering_bound_logger(level),
        logger_factory=structlog.stdlib.LoggerFactory(),
        cache_logger_on_first_use=True,
    )
|
|
|
|
|
|
def get_logger(name: str | None = None):
    """Return a structlog logger.

    Args:
        name: Forwarded positionally to ``structlog.get_logger``; defaults
            to ``None``.

    Returns:
        Whatever ``structlog.get_logger(name)`` returns under the active
        structlog configuration.
    """
    return structlog.get_logger(name)
|