mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-09 01:10:10 +00:00
Adds tooling and documentation for Day 2 cluster operations: - scripts/rotate-nodes.sh: interactive node-rotation driver that applies terraform to pick up the latest SSM-resolved EKS AMI and watches the rolling replacement. - scripts/demo-schema-change.sh: end-to-end demo of the zero-downtime is_favorited column migration via helm upgrade + migration hook. - scripts/verify-deployment.sh: post-deploy health check across pods, per-service HTTP health endpoints, rollout status, and PDBs. - docs/chaos-runbook.md: failure-mode playbook with simulate / Grafana / Loki / recovery steps for six scenarios (pod kill, node failure, DB pool exhaustion, inference OOM, high latency, SSL issues) plus a Loki quick-reference. - terraform/modules/eks: expose current_node_ami_id output, add update_config.max_unavailable_percentage (configurable, default 33) so node-group rolls are controlled. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
65 lines
1.5 KiB
HCL
65 lines
1.5 KiB
HCL
# Toolchain constraints for this module: the minimum Terraform CLI version
# and the AWS provider source/version range it is written against.
terraform {
  required_providers {
    aws = {
      version = ">= 5.0"
      source  = "hashicorp/aws"
    }
  }

  required_version = ">= 1.5.0"
}
|
|
|
|
# Reads the SSM parameter whose path is derived from var.cluster_version and
# ends in ".../amazon-linux-2/recommended/image_id". The resulting value is
# used as the node-group AMI ID by the "eks" module in this file, so a change
# in the parameter's value changes the AMI on the next apply.
#
# NOTE(review): the path is pinned to the Amazon Linux 2 AMI family; confirm
# that this parameter exists for every cluster_version this module is expected
# to support.
data "aws_ssm_parameter" "eks_ami_id" {
  name = "/aws/service/eks/optimized-ami/${var.cluster_version}/amazon-linux-2/recommended/image_id"
}
|
|
|
|
# EKS cluster plus one managed node group ("default"), built with the
# terraform-aws-modules/eks/aws registry module (20.x series).
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 20.0"

  # ---------------------------------------------------------------------------
  # Cluster identity
  # ---------------------------------------------------------------------------
  cluster_version = var.cluster_version
  cluster_name    = var.cluster_name

  # ---------------------------------------------------------------------------
  # Networking: both the control plane and the nodes use the private subnets.
  # ---------------------------------------------------------------------------
  vpc_id                   = var.vpc_id
  control_plane_subnet_ids = var.private_subnet_ids
  subnet_ids               = var.private_subnet_ids

  # The API endpoint is enabled for both private and public access.
  # NOTE(review): no public-access CIDR restriction is set in this block;
  # confirm that is intentional.
  cluster_endpoint_private_access = true
  cluster_endpoint_public_access  = true

  # IAM Roles for Service Accounts support is switched on.
  enable_irsa = true

  # ---------------------------------------------------------------------------
  # Cluster add-ons, each requested with most_recent = true.
  # ---------------------------------------------------------------------------
  cluster_addons = {
    vpc-cni = {
      most_recent = true
    }
    kube-proxy = {
      most_recent = true
    }
    coredns = {
      most_recent = true
    }
    aws-efs-csi-driver = {
      most_recent = true
    }
    aws-ebs-csi-driver = {
      most_recent = true
    }
  }

  # ---------------------------------------------------------------------------
  # Managed node groups
  # ---------------------------------------------------------------------------

  # Defaults shared by every managed node group: the AMI ID comes from the SSM
  # lookup (data.aws_ssm_parameter.eks_ami_id) rather than being hard-coded.
  # NOTE(review): enable_bootstrap_user_data is presumably needed because an
  # explicit ami_id is supplied; confirm against the module documentation.
  eks_managed_node_group_defaults = {
    enable_bootstrap_user_data = true
    ami_id                     = data.aws_ssm_parameter.eks_ami_id.value
  }

  eks_managed_node_groups = {
    default = {
      # Instance selection
      capacity_type  = "ON_DEMAND"
      instance_types = [var.node_instance_type]

      # Scaling bounds
      desired_size = var.node_desired_size
      min_size     = var.node_min_size
      max_size     = var.node_max_size

      # Limits how much of the group may be unavailable at once during a
      # node-group update, expressed as a percentage.
      update_config = {
        max_unavailable_percentage = var.node_max_unavailable_percentage
      }

      labels = {
        role = "general"
      }
    }
  }

  tags = var.tags
}
|