mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-08 08:49:53 +00:00
Adds tooling and documentation for Day 2 cluster operations:

- scripts/rotate-nodes.sh: interactive node-rotation driver that applies terraform to pick up the latest SSM-resolved EKS AMI and watches the rolling replacement.
- scripts/demo-schema-change.sh: end-to-end demo of the zero-downtime is_favorited column migration via helm upgrade + migration hook.
- scripts/verify-deployment.sh: post-deploy health check across pods, per-service HTTP health endpoints, rollout status, and PDBs.
- docs/chaos-runbook.md: failure-mode playbook with simulate / Grafana / Loki / recovery steps for six scenarios (pod kill, node failure, DB pool exhaustion, inference OOM, high latency, SSL issues) plus a Loki quick-reference.
- terraform/modules/eks: expose current_node_ami_id output, add update_config.max_unavailable_percentage (configurable, default 33) so node-group rolls are controlled.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
53 lines
1.8 KiB
Bash
Executable File
53 lines
1.8 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Post-deployment verification for the samosaChaat services.
#
# Usage: ./scripts/verify-deployment.sh <namespace>
#
# Exits non-zero when the namespace argument is missing or when any
# recorded check fails.

# Strict mode: stop on errors, on unset variables, and on failures in any
# stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Kubernetes namespace to inspect; a missing argument aborts with the usage text.
NAMESPACE="${1:?Usage: verify-deployment.sh <namespace>}"

echo "=== samosaChaat Deployment Verification — $NAMESPACE ==="

# Running tallies of successful and failed checks.
PASS=0
FAIL=0
|
|
|
|
# Run one verification command and record whether it succeeded.
#
# Globals:
#   PASS  incremented when the command exits 0
#   FAIL  incremented when the command exits non-zero
# Arguments:
#   $1  label printed next to the result mark
#   $2  command line as a single string; it is executed with eval
# Outputs:
#   One " ✓ <label>" or " ✗ <label>" line on stdout. The command's own
#   stdout and stderr are discarded.
# Returns:
#   0 in both cases, so a failed check never trips `set -e` in the caller.
check() {
  local name="$1" cmd="$2"

  # NOTE(review): eval re-parses $cmd, so shell metacharacters interpolated
  # into it by callers are interpreted. Only pass trusted values.
  if eval "$cmd" > /dev/null 2>&1; then
    echo " ✓ $name"
    # Use an assignment instead of ((PASS++)): the post-increment form
    # evaluates to the old value, so it returns status 1 while the counter
    # is still 0 and would abort the whole script under `set -e`.
    PASS=$((PASS + 1))
  else
    echo " ✗ $name"
    FAIL=$((FAIL + 1))
  fi
}
|
|
|
|
# Show every pod in the namespace, one indented line per pod.
# This listing is informational; it does not touch the PASS/FAIL tallies.
echo ""
echo "Pods:"
# read -r keeps any backslashes in kubectl's output intact.
kubectl get pods -n "$NAMESPACE" --no-headers | while read -r line; do
  echo " $line"
done
|
|
|
|
# Probe each service's HTTP health endpoint from inside its own deployment.
echo "Health checks:"

# The frontend probe runs wget inside the container.
check "Frontend" "kubectl exec -n $NAMESPACE deploy/frontend -- wget -qO- http://localhost:3000/api/health"

# The remaining probes run a Python one-liner (urllib.request.urlopen).
# Each entry is "<label>|<deployment>|<health URL>".
py_probes=(
  "Auth|auth|http://localhost:8001/auth/health"
  "Chat API|chat-api|http://localhost:8002/api/health"
  "Inference|inference|http://localhost:8003/health"
)
for probe in "${py_probes[@]}"; do
  IFS='|' read -r probe_label probe_deploy probe_url <<< "$probe"
  check "$probe_label" "kubectl exec -n $NAMESPACE deploy/$probe_deploy -- python -c \"import urllib.request; urllib.request.urlopen('$probe_url')\""
done
|
|
|
|
# Confirm that each deployment's rollout reports success.
echo ""
echo "Deployments:"

# Each entry is "<label>|<deployment>"; kubectl is given a 10s timeout per
# deployment.
rollouts=(
  "Frontend|frontend"
  "Auth|auth"
  "Chat API|chat-api"
  "Inference|inference"
)
for entry in "${rollouts[@]}"; do
  check "${entry%%|*} available" "kubectl rollout status deploy/${entry##*|} -n $NAMESPACE --timeout=10s"
done
|
|
|
|
# List the PodDisruptionBudgets in the namespace, one indented line each.
# This listing is informational; it does not touch the PASS/FAIL tallies.
echo ""
echo "PDBs:"
# kubectl's stderr is suppressed, so a failure here must be handled
# explicitly: with `set -e` and `pipefail` an unguarded failing pipeline
# would end the script silently before the result summary is printed.
if ! kubectl get pdb -n "$NAMESPACE" --no-headers 2> /dev/null |
  while read -r line; do
    echo " $line"
  done
then
  echo " (could not list PDBs)" >&2
fi
|
|
|
|
# Print the tally and set the exit status: non-zero when any check failed.
echo ""
echo "Result: $PASS passed, $FAIL failed"
# A real if/else instead of `A && B || C`, where the failure branch would
# also run if the success branch's echo itself returned non-zero.
if [[ "$FAIL" -eq 0 ]]; then
  echo "All checks passed!"
else
  echo "SOME CHECKS FAILED"
  exit 1
fi
|