mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-08 00:39:50 +00:00
Adds tooling and documentation for Day 2 cluster operations:

- scripts/rotate-nodes.sh: interactive node-rotation driver that applies terraform to pick up the latest SSM-resolved EKS AMI and watches the rolling replacement.
- scripts/demo-schema-change.sh: end-to-end demo of the zero-downtime is_favorited column migration via helm upgrade + migration hook.
- scripts/verify-deployment.sh: post-deploy health check across pods, per-service HTTP health endpoints, rollout status, and PDBs.
- docs/chaos-runbook.md: failure-mode playbook with simulate / Grafana / Loki / recovery steps for six scenarios (pod kill, node failure, DB pool exhaustion, inference OOM, high latency, SSL issues) plus a Loki quick-reference.
- terraform/modules/eks: expose current_node_ami_id output, add update_config.max_unavailable_percentage (configurable, default 33) so node-group rolls are controlled.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
44 lines
1.4 KiB
Bash
Executable File
44 lines
1.4 KiB
Bash
Executable File
#!/usr/bin/env bash
#
# Rotate EKS managed node group to latest AMI with zero downtime.
#
# Usage:   ./scripts/rotate-nodes.sh <environment>
# Example: ./scripts/rotate-nodes.sh dev
#
# Flow:
#   1. cd into terraform/environments/<environment> (relative to this script).
#   2. Ask for confirmation, then run `terraform apply -auto-approve`.
#   3. Point kubectl at the cluster (best effort) and watch the nodes.
#
# Exit status: 0 when the operator declines or the watch ends normally,
# non-zero on a missing argument/directory/tool or a failed command.
set -euo pipefail

# Print an error message to stderr and abort with a non-zero status.
die() {
  printf 'rotate-nodes: %s\n' "$*" >&2
  exit 1
}

ENVIRONMENT="${1:?Usage: rotate-nodes.sh <environment> (dev|uat|prod)}"
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
TF_DIR="$SCRIPT_DIR/../terraform/environments/$ENVIRONMENT"

# Region passed to `aws eks update-kubeconfig`.
readonly CLUSTER_REGION="us-west-2"

# Fail early with a readable message instead of a bare `cd` error when the
# environment name is mistyped or has no Terraform root.
[[ -d "$TF_DIR" ]] \
  || die "no Terraform environment directory for '$ENVIRONMENT': $TF_DIR"

# Terraform is mandatory for every remaining step; check before prompting.
command -v terraform >/dev/null \
  || die "terraform not found in PATH"

echo "=== samosaChaat Node Rotation — $ENVIRONMENT ==="

echo ""
echo "Step 1: Check current AMI vs latest available"
# NOTE(review): this step only changes into the Terraform directory; no AMI
# comparison is actually performed here. Consider showing a plan or the
# relevant output once it is confirmed which outputs this environment exposes.
cd "$TF_DIR" || die "cannot cd to $TF_DIR"

echo ""
echo "Step 2: Apply Terraform to update launch template with latest AMI"
echo "This triggers EKS managed node group rolling update."
echo "EKS will:"
echo " 1. Launch new nodes with patched AMI"
echo " 2. Cordon old nodes (stop scheduling new pods)"
echo " 3. Drain pods from old nodes (respecting PodDisruptionBudgets)"
echo " 4. Terminate old nodes"
echo ""
echo "PDBs ensure minAvailable: 1 for each service = zero downtime."
echo ""

# -n 1: return after a single keypress; -r: keep backslashes literal.
# A closed stdin makes `read` fail; report it instead of exiting silently.
read -p "Proceed with terraform apply? [y/N] " -n 1 -r \
  || die "no confirmation received (stdin closed); nothing was applied"
echo ""
if [[ $REPLY =~ ^[Yy]$ ]]; then
  # The prompt above is the only confirmation; -auto-approve skips Terraform's.
  terraform apply -auto-approve
else
  echo "Aborted."
  exit 0
fi

echo ""
echo "Step 3: Monitor node rotation"

# Prefer the cluster name from Terraform state; fall back to the naming
# convention when the output is missing or unreadable.
CLUSTER_NAME=$(terraform output -raw eks_cluster_name 2>/dev/null || echo "samosachaat-$ENVIRONMENT")

# Best effort: a kubeconfig refresh failure must not abort the run, but it
# must be visible, because kubectl would otherwise silently watch whatever
# cluster the current context points at.
if ! aws eks update-kubeconfig --name "$CLUSTER_NAME" --region "$CLUSTER_REGION"; then
  printf 'rotate-nodes: warning: could not update kubeconfig for %s (%s); kubectl will use the current context, which may be a different cluster\n' \
    "$CLUSTER_NAME" "$CLUSTER_REGION" >&2
fi

echo "Watching nodes (Ctrl+C to stop):"
kubectl get nodes -w