Replaces the helm/observability scaffold with a real monitoring stack
wired into the samosaChaat platform.
Helm chart (helm/observability/)
- Chart.yaml declares kube-prometheus-stack (~62.0) and loki-stack
(~2.10) as subchart dependencies.
- values.yaml configures each component:
  - Prometheus: 15d retention, 50Gi PVC, ServiceMonitor + rule
    selectors matching app.kubernetes.io/part-of: samosachaat
  - Alertmanager: 10Gi PVC
  - Grafana: OAuth-only sign-in via GitHub + Google (local login
    disabled), Prometheus + Loki datasources, dashboards
    auto-provisioned from a ConfigMap, email + Slack contact points
    with a route that sends critical alerts to Slack
  - Loki: 50Gi PVC, 30d retention, tsdb schema
  - Promtail: JSON pipeline that lifts level / service / trace_id /
    user_id into labels, plus a pod scrape config that maps Kubernetes
    pod labels onto log streams
- Alert rules: HighCPU, HighMemory, DiskSpaceLow, High5xxRate,
InferenceServiceDown, HighP99Latency.
- templates/grafana-dashboards-configmap.yaml renders every file under
  dashboards/ into a single grafana_dashboard=1 ConfigMap (a minimal
  sketch follows this list).
- dashboards/node-health.json, app-performance.json, inference.json -
fully-formed Grafana dashboards with Prometheus datasource variable,
templated app selector, thresholded gauges, and LogQL-ready labels.
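
A minimal sketch of the dashboards ConfigMap template, assuming the
standard Helm Files.Glob idiom (the real template may differ; the
samosachaat-dashboards name matches what values.yaml expects):

    apiVersion: v1
    kind: ConfigMap
    metadata:
      name: samosachaat-dashboards
      labels:
        # The label Grafana's dashboard provisioning keys off.
        grafana_dashboard: "1"
    data:
      # Inline every dashboard JSON under dashboards/ as one entry per file.
    {{ (.Files.Glob "dashboards/*.json").AsConfig | indent 2 }}
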
Scraping (helm/samosachaat/templates/servicemonitor.yaml)
- ServiceMonitor CRs for auth / chat-api / inference that Prometheus
  picks up via the part-of=samosachaat selector; each one scrapes
  /metrics every 15s and replaces the app label so dashboard queries
  line up (sketched below).
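
Each ServiceMonitor looks roughly like this (a sketch for chat-api;
the port name and exact selector labels are assumptions):

    apiVersion: monitoring.coreos.com/v1
    kind: ServiceMonitor
    metadata:
      name: chat-api
      labels:
        # Matched by the Prometheus serviceMonitorSelector in values.yaml.
        app.kubernetes.io/part-of: samosachaat
    spec:
      selector:
        matchLabels:
          app.kubernetes.io/name: chat-api
      endpoints:
        - port: http          # assumed service port name
          path: /metrics
          interval: 15s
          relabelings:
            # Replace the app label so dashboard queries line up.
            - sourceLabels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
              targetLabel: app
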
Application instrumentation
- services/{auth,chat-api,inference} each depend on
prometheus-fastapi-instrumentator and expose /metrics (request count,
latency histograms, in-progress gauges).
- services/auth/src/logging_setup.py and
services/inference/src/logging_setup.py mirror the canonical
chat-api implementation - structlog JSON with service, trace_id,
user_id context injection.
- configure_logging() is called from create_app() in auth and inference;
  inference's main.py now uses structlog via get_logger() instead of
  logging.getLogger (see the sketch after this list).
- log_level setting added to auth + inference config (LOG_LEVEL env).
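
Taken together, the per-service pattern looks roughly like this (a
sketch, not the literal modules; the structlog processor list is an
assumption and configure_logging's real signature may differ):

    import structlog
    from fastapi import FastAPI
    from prometheus_fastapi_instrumentator import Instrumentator

    def configure_logging() -> None:
        # Emit JSON logs; contextvars carry service / trace_id / user_id.
        structlog.configure(
            processors=[
                structlog.contextvars.merge_contextvars,
                structlog.processors.add_log_level,
                structlog.processors.TimeStamper(fmt="iso"),
                structlog.processors.JSONRenderer(),
            ],
        )

    def create_app() -> FastAPI:
        configure_logging()
        app = FastAPI()
        # Mounts /metrics: request counts, latency histograms,
        # in-progress gauges.
        Instrumentator().instrument(app).expose(app)
        return app

    log = structlog.get_logger()  # instead of logging.getLogger(__name__)
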
Docs
- contracts/logging-standard.md defines the required JSON fields,
Python (structlog) + Node.js (pino) implementations, LogQL examples
for cross-service queries, and the x-trace-id propagation contract.
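
For instance, with level and service lifted into labels by Promtail,
cross-service queries along these lines become possible (the trace id
value is a placeholder):

    # All error-level logs from chat-api.
    {service="chat-api", level="error"}

    # Follow one request across every samosaChaat service.
    {part_of="samosachaat"} | json | trace_id="<trace-id>"
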
Closes #9
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
helm/observability/values.yaml

## Values for the samosaChaat observability umbrella chart.
##
## This chart deploys kube-prometheus-stack (Prometheus, Grafana, Alertmanager,
## node-exporter, kube-state-metrics) and loki-stack (Loki + Promtail) into the
## cluster. Grafana is exposed at grafana.samosachaat.art via the main ingress
## in the samosachaat app chart (which routes to a ClusterIP service named
## "grafana" on port 3000 in the same namespace).

kube-prometheus-stack:
  fullnameOverride: prom

  ## ----------------------- Prometheus -----------------------
  prometheus:
    prometheusSpec:
      ## Only scrape ServiceMonitors that belong to the samosaChaat app.
      serviceMonitorSelector:
        matchLabels:
          app.kubernetes.io/part-of: samosachaat
      serviceMonitorSelectorNilUsesHelmValues: false
      ## Also evaluate our custom PrometheusRules.
      ruleSelector:
        matchLabels:
          app.kubernetes.io/part-of: samosachaat
      ruleSelectorNilUsesHelmValues: false
      ## Keep 15 days of metric data on a 50Gi PVC.
      retention: 15d
      storageSpec:
        volumeClaimTemplate:
          spec:
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 50Gi
      resources:
        requests:
          cpu: 200m
          memory: 1Gi
        limits:
          cpu: "1"
          memory: 2Gi

  ## ----------------------- Alertmanager -----------------------
  alertmanager:
    alertmanagerSpec:
      storage:
        volumeClaimTemplate:
          spec:
            accessModes: ["ReadWriteOnce"]
            resources:
              requests:
                storage: 10Gi

  ## ----------------------- Grafana -----------------------
  grafana:
    enabled: true
    ## The main samosaChaat ingress already routes grafana.samosachaat.art
    ## to a ClusterIP service named "grafana" port 3000 in the same namespace.
    ## Override the generated service name so it matches.
    fullnameOverride: grafana
    service:
      type: ClusterIP
      port: 3000
    ingress:
      enabled: false

    ## OAuth2 — no username/password, only GitHub + Google sign-in.
    ## The referenced env vars must be provided by the Kubernetes secret
    ## named in envFromSecret below (or set in the cluster's secret store).
    grafana.ini:
      server:
        root_url: https://grafana.samosachaat.art
      auth:
        disable_login_form: true
      "auth.github":
        enabled: true
        allow_sign_up: true
        client_id: ${GITHUB_GRAFANA_CLIENT_ID}
        client_secret: ${GITHUB_GRAFANA_CLIENT_SECRET}
        scopes: user:email,read:org
        auth_url: https://github.com/login/oauth/authorize
        token_url: https://github.com/login/oauth/access_token
        api_url: https://api.github.com/user
      "auth.google":
        enabled: true
        allow_sign_up: true
        client_id: ${GOOGLE_GRAFANA_CLIENT_ID}
        client_secret: ${GOOGLE_GRAFANA_CLIENT_SECRET}
        scopes: openid email profile
        auth_url: https://accounts.google.com/o/oauth2/auth
        token_url: https://accounts.google.com/o/oauth2/token
        allowed_domains: gmail.com

    ## OAuth client IDs/secrets come from a Kubernetes secret mounted as env.
    ## Expected keys: GITHUB_GRAFANA_CLIENT_ID, GITHUB_GRAFANA_CLIENT_SECRET,
    ## GOOGLE_GRAFANA_CLIENT_ID, GOOGLE_GRAFANA_CLIENT_SECRET, SLACK_WEBHOOK_URL.
    envFromSecret: grafana-oauth-secrets

    ## Datasources — Prometheus (default) + Loki for logs.
    datasources:
      datasources.yaml:
        apiVersion: 1
        datasources:
          - name: Prometheus
            type: prometheus
            uid: prometheus
            ## fullnameOverride: prom shortens the generated service name.
            url: http://prom-prometheus.{{ .Release.Namespace }}.svc:9090
            access: proxy
            isDefault: true
          - name: Loki
            type: loki
            uid: loki
            url: http://loki.{{ .Release.Namespace }}.svc:3100
            access: proxy
            jsonData:
              maxLines: 1000

    ## Auto-provision dashboards from the dashboards ConfigMap rendered by
    ## templates/grafana-dashboards-configmap.yaml.
    dashboardProviders:
      dashboardproviders.yaml:
        apiVersion: 1
        providers:
          - name: samosachaat
            orgId: 1
            folder: samosaChaat
            type: file
            disableDeletion: false
            editable: true
            options:
              path: /var/lib/grafana/dashboards/samosachaat

    dashboardsConfigMaps:
      samosachaat: samosachaat-dashboards

    ## Unified alerting — contact points (email + Slack).
    alerting:
      contactpoints.yaml:
        apiVersion: 1
        contactPoints:
          - orgId: 1
            name: email-alerts
            receivers:
              - uid: email
                type: email
                settings:
                  addresses: manmohan659@gmail.com
          - orgId: 1
            name: slack-alerts
            receivers:
              - uid: slack
                type: slack
                settings:
                  url: ${SLACK_WEBHOOK_URL}
      policies.yaml:
        apiVersion: 1
        policies:
          - orgId: 1
            receiver: email-alerts
            group_by: [alertname, severity]
            routes:
              - receiver: slack-alerts
                matchers:
                  - severity = critical

    resources:
      requests:
        cpu: 100m
        memory: 256Mi
      limits:
        cpu: 500m
        memory: 512Mi

  ## ----------------------- Alert rules -----------------------
  ## Prometheus will pick these up via the ruleSelector above (the generated
  ## PrometheusRule gets labelled app.kubernetes.io/part-of: samosachaat).
  additionalPrometheusRulesMap:
    samosachaat-alerts:
      groups:
        - name: node-health
          rules:
            - alert: HighCPU
              expr: avg by (instance) (rate(node_cpu_seconds_total{mode!="idle"}[5m])) > 0.8
              for: 5m
              labels:
                severity: warning
              annotations:
                summary: "CPU > 80% on {{ $labels.instance }} for 5m"
                description: "Node {{ $labels.instance }} has had CPU usage above 80% for 5 minutes."

            - alert: HighMemory
              expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) > 0.85
              for: 5m
              labels:
                severity: critical
              annotations:
                summary: "Memory > 85% on {{ $labels.instance }}"
                description: "Node {{ $labels.instance }} memory usage is above 85%."

            - alert: DiskSpaceLow
              expr: (1 - node_filesystem_avail_bytes{fstype!="tmpfs"} / node_filesystem_size_bytes{fstype!="tmpfs"}) > 0.9
              for: 10m
              labels:
                severity: critical
              annotations:
                summary: "Disk > 90% on {{ $labels.instance }}:{{ $labels.mountpoint }}"
                description: "Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} is above 90% full."

        - name: application
          rules:
            - alert: High5xxRate
              expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) > 0.05
              for: 2m
              labels:
                severity: critical
              annotations:
                summary: "5xx error rate > 5% across services"
                description: "Aggregate 5xx error ratio across all services has exceeded 5% for 2 minutes."

            - alert: InferenceServiceDown
              expr: up{app="inference"} == 0
              for: 1m
              labels:
                severity: critical
              annotations:
                summary: "Inference service is DOWN"
                description: "Prometheus cannot scrape the inference service /metrics endpoint."

            - alert: HighP99Latency
              expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le, app)) > 5
              for: 3m
              labels:
                severity: warning
              annotations:
                summary: "p99 latency > 5s for {{ $labels.app }}"
                description: "Service {{ $labels.app }} p99 latency has exceeded 5s for 3 minutes."

## ----------------------- Loki stack -----------------------
loki-stack:
  loki:
    enabled: true
    persistence:
      enabled: true
      size: 50Gi
    config:
      limits_config:
        retention_period: 720h
      schema_config:
        configs:
          - from: "2024-01-01"
            store: tsdb
            object_store: filesystem
            schema: v13
            index:
              prefix: loki_index_
              period: 24h

  promtail:
    enabled: true
    config:
      snippets:
        pipelineStages:
          - cri: {}
          - json:
              expressions:
                level: level
                service: service
                trace_id: trace_id
                user_id: user_id
          - labels:
              level:
              service:
        scrapeConfigs: |
          - job_name: kubernetes-pods
            pipeline_stages:
              - cri: {}
              - json:
                  expressions:
                    level: level
                    service: service
                    trace_id: trace_id
                    user_id: user_id
              - labels:
                  level:
                  service:
            kubernetes_sd_configs:
              - role: pod
            relabel_configs:
              - source_labels:
                  - __meta_kubernetes_namespace
                target_label: namespace
              - source_labels:
                  - __meta_kubernetes_pod_name
                target_label: pod
              - source_labels:
                  - __meta_kubernetes_pod_label_app_kubernetes_io_name
                target_label: app
              - source_labels:
                  - __meta_kubernetes_pod_label_app_kubernetes_io_component
                target_label: component
              - source_labels:
                  - __meta_kubernetes_pod_label_app_kubernetes_io_part_of
                target_label: part_of
              - source_labels:
                  - __meta_kubernetes_pod_container_name
                target_label: container
              - action: replace
                replacement: /var/log/pods/*$1/*.log
                separator: /
                source_labels:
                  - __meta_kubernetes_pod_uid
                  - __meta_kubernetes_pod_container_name
                target_label: __path__

  ## Disable loki-stack's bundled Prometheus and Grafana; kube-prometheus-stack
  ## provides both.
  prometheus:
    enabled: false
  grafana:
    enabled: false
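
Rolling the chart out is then the usual umbrella-chart sequence; a
sketch, with the release and namespace names as assumptions (the
Grafana service must land in the namespace the samosachaat ingress
points at):

    # Pull the kube-prometheus-stack and loki-stack subcharts, then install.
    helm dependency update helm/observability
    helm upgrade --install observability helm/observability \
      --namespace samosachaat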