mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-08 16:59:59 +00:00
Adds tooling and documentation for Day 2 cluster operations: - scripts/rotate-nodes.sh: interactive node-rotation driver that applies terraform to pick up the latest SSM-resolved EKS AMI and watches the rolling replacement. - scripts/demo-schema-change.sh: end-to-end demo of the zero-downtime is_favorited column migration via helm upgrade + migration hook. - scripts/verify-deployment.sh: post-deploy health check across pods, per-service HTTP health endpoints, rollout status, and PDBs. - docs/chaos-runbook.md: failure-mode playbook with simulate / Grafana / Loki / recovery steps for six scenarios (pod kill, node failure, DB pool exhaustion, inference OOM, high latency, SSL issues) plus a Loki quick-reference. - terraform/modules/eks: expose current_node_ami_id output, add update_config.max_unavailable_percentage (configurable, default 33) so node-group rolls are controlled. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
57 lines
1.2 KiB
HCL
57 lines
1.2 KiB
HCL
# Name of the EKS cluster. Required: no default is provided.
variable "cluster_name" {
  type        = string
  description = "EKS cluster name."
}
|
|
|
|
# Kubernetes version of the control plane. Kept as a string so that a value
# such as "1.30" is not collapsed to the number 1.3.
variable "cluster_version" {
  type        = string
  default     = "1.29"
  description = "Kubernetes version for the EKS control plane."
}
|
|
|
|
# ID of the VPC that hosts the cluster. Required: no default is provided.
variable "vpc_id" {
  type        = string
  description = "VPC the cluster lives in."
}
|
|
|
|
# Private subnet IDs used for worker nodes and control-plane ENIs.
# Required: no default is provided.
variable "private_subnet_ids" {
  description = "Private subnets for nodes and control-plane ENIs."
  type        = list(string)

  # EKS requires subnets in at least two Availability Zones for the control
  # plane, so fewer than two IDs can never produce a working cluster. Catch it
  # at plan time instead of waiting for the API to reject the request.
  # NOTE(review): this only checks the count; whether the subnets sit in
  # different AZs cannot be verified from the variable alone.
  validation {
    condition     = length(var.private_subnet_ids) >= 2
    error_message = "The private_subnet_ids list must contain at least two subnet IDs."
  }
}
|
|
|
|
# EC2 instance type used by the managed node group; defaults to t3.large.
variable "node_instance_type" {
  type        = string
  default     = "t3.large"
  description = "EC2 instance type for the managed node group."
}
|
|
|
|
# Lower bound on the number of nodes in the managed node group.
variable "node_min_size" {
  description = "Minimum nodes in the managed node group."
  type        = number
  default     = 2

  # A node count must be a non-negative whole number; `number` alone would
  # also accept values such as -1 or 2.5. The relationship to
  # node_max_size / node_desired_size is not checked here so that the rule
  # only references this variable and works on older Terraform releases.
  validation {
    condition     = var.node_min_size >= 0 && floor(var.node_min_size) == var.node_min_size
    error_message = "The node_min_size value must be a whole number greater than or equal to 0."
  }
}
|
|
|
|
# Upper bound on the number of nodes in the managed node group.
variable "node_max_size" {
  description = "Maximum nodes in the managed node group."
  type        = number
  default     = 4

  # The maximum must be a whole number of at least one node; `number` alone
  # would also accept 0, negatives, or fractions. The relationship to
  # node_min_size / node_desired_size is not checked here so that the rule
  # only references this variable and works on older Terraform releases.
  validation {
    condition     = var.node_max_size >= 1 && floor(var.node_max_size) == var.node_max_size
    error_message = "The node_max_size value must be a whole number greater than or equal to 1."
  }
}
|
|
|
|
# Number of nodes the managed node group should initially aim for.
variable "node_desired_size" {
  description = "Desired nodes in the managed node group."
  type        = number
  default     = 2

  # A node count must be a non-negative whole number; `number` alone would
  # also accept values such as -1 or 2.5. Whether the value lies between
  # node_min_size and node_max_size is not checked here so that the rule only
  # references this variable and works on older Terraform releases.
  validation {
    condition     = var.node_desired_size >= 0 && floor(var.node_desired_size) == var.node_desired_size
    error_message = "The node_desired_size value must be a whole number greater than or equal to 0."
  }
}
|
|
|
|
# Share of nodes that may be unavailable at once while the node group is
# being rolled. Defaults to 33 (roughly one node in three).
# NOTE(review): presumably wired to the node group's
# update_config.max_unavailable_percentage — confirm in the module's main.tf.
variable "node_max_unavailable_percentage" {
  description = "Max percentage of nodes unavailable during rolling update."
  type        = number
  default     = 33

  # A percentage outside 1-100, or a fractional one, is meaningless for a
  # rolling update; reject it at plan time rather than at apply time.
  validation {
    condition = (
      var.node_max_unavailable_percentage >= 1 &&
      var.node_max_unavailable_percentage <= 100 &&
      floor(var.node_max_unavailable_percentage) == var.node_max_unavailable_percentage
    )
    error_message = "The node_max_unavailable_percentage value must be a whole number between 1 and 100."
  }
}
|
|
|
|
# Common tags for every resource; defaults to an empty map (no extra tags).
variable "tags" {
  type        = map(string)
  default     = {}
  description = "Tags applied to every resource."
}
|