Add minimal smoke runner

2026-05-16 04:37:34 +00:00 · 2026-04-25 22:39:45 -04:00 · 2026-04-25 22:39:45 -04:00 · 86cf10c693
commit 86cf10c693
parent d7325f9306
2 changed files with 104 additions and 2 deletions
--- a/runs/runpod/kickoff.sh
+++ b/runs/runpod/kickoff.sh
@ -17,7 +17,7 @@
 # Optional env overrides:
 #   GPU_ID         default: "NVIDIA H100 80GB HBM3"
 #   GPU_COUNT      default: 8
-#   CLOUD_TYPE     default: COMMUNITY     (SECURE for guaranteed availability)
+#   CLOUD_TYPE     default: SECURE        (COMMUNITY when capacity available, cheaper)
 #   DISK_GB        default: 200
 #   NANOCHAT_REPO  default: Team-XSA/nanochat
 #   NANOCHAT_REF   default: dev
@ -44,7 +44,7 @@ RUNNER_URL="${RUNNER_URL:-https://raw.githubusercontent.com/${NANOCHAT_REPO}/${N

 GPU_ID="${GPU_ID:-NVIDIA H100 80GB HBM3}"
 GPU_COUNT="${GPU_COUNT:-8}"
-CLOUD_TYPE="${CLOUD_TYPE:-COMMUNITY}"
+CLOUD_TYPE="${CLOUD_TYPE:-SECURE}"
 DISK_GB="${DISK_GB:-200}"
 POD_NAME="${POD_NAME:-${RUNNER}-$(date +%Y%m%d-%H%M)}"

--- a/runs/runpod/smoke.sh
+++ b/runs/runpod/smoke.sh
@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Minimal smoke test. Runs INSIDE a RunPod pod.
+# Validates: pod boot, env-var injection, git clone, uv sync, GPU torch,
+# tokenizer + base_train code paths, HF upload, runpodctl self-delete.
+# Does NOT test: multi-GPU, FP8, full training horizon, SFT, eval.
+#
+# Sized for a 1-GPU pod, completes in ~3-4 min wall clock.
+# Kick off with: GPU_COUNT=1 bash runs/runpod/kickoff.sh smoke
+#
+# Required env: HF_TOKEN, WANDB_API_KEY
+# Auto-set by RunPod: RUNPOD_POD_ID, RUNPOD_API_KEY
+
+set -euo pipefail
+
+NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}"
+NANOCHAT_REF="${NANOCHAT_REF:-dev}"
+HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}"
+WANDB_RUN="${WANDB_RUN:-smoke}"
+
+TS=$(date -u +%Y%m%dT%H%M%SZ)
+HF_PATH_PREFIX="_smoke/${TS}"
+
+WORKDIR="/workspace/nanochat"
+LOG_FILE="/workspace/runner.log"
+NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
+
+mkdir -p /workspace
+exec > >(tee -a "$LOG_FILE") 2>&1
+
+echo "[smoke] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID"
+
+cleanup() {
+  local rc=$?
+  set +e
+  echo "[smoke] cleanup: exit code $rc"
+
+  # Always upload the runner log (success or failure) so we can see what happened
+  mkdir -p /tmp/smoke-out
+  cp /workspace/*.log /tmp/smoke-out/ 2>/dev/null || true
+  echo "rc=$rc ts=$TS pod=$RUNPOD_POD_ID" > /tmp/smoke-out/result.txt
+  [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/smoke-out/git-head.txt || true)
+  huggingface-cli upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \
+    --repo-type model --commit-message "smoke rc=$rc $TS" || \
+    echo "[smoke] WARN: HF upload failed"
+
+  echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX"
+  echo "[smoke] self-deleting pod $RUNPOD_POD_ID"
+  runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \
+    curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
+      "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID"
+  exit "$rc"
+}
+trap cleanup EXIT
+
+: "${HF_TOKEN:?HF_TOKEN must be set}"
+: "${WANDB_API_KEY:?WANDB_API_KEY must be set}"
+: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}"
+
+# Clone fork
+rm -rf "$WORKDIR"
+git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
+cd "$WORKDIR"
+git checkout "$NANOCHAT_REF"
+echo "[smoke] HEAD = $(git rev-parse HEAD)"
+
+# Env + uv
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR
+mkdir -p "$NANOCHAT_BASE_DIR"
+command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+[ -d ".venv" ] || uv venv
+uv sync --extra gpu
+source .venv/bin/activate
+pip install --quiet --upgrade huggingface_hub
+
+# GPU sanity
+python -c "import torch; print('[smoke] torch', torch.__version__, 'cuda', torch.cuda.is_available(), 'devices', torch.cuda.device_count())"
+
+# Minimum dataset + tokenizer (1 shard, 50M chars — enough for the tokenizer
+# to train on AND for base_train to consume 20 iterations of tokens)
+python -m nanochat.dataset -n 1
+python -m scripts.tok_train --max-chars=50000000
+
+# Tiny base_train. Params from base_train.py docstring (the CPU smoke), adjusted
+# slightly for GPU. depth=4, 20 iterations. Should finish in ~30s.
+NPROC=$(nvidia-smi -L | wc -l)
+echo "[smoke] training on $NPROC GPU(s)"
+torchrun --standalone --nproc_per_node="$NPROC" -m scripts.base_train -- \
+    --depth=4 \
+    --max-seq-len=512 \
+    --device-batch-size=1 \
+    --total-batch-size=512 \
+    --num-iterations=20 \
+    --eval-every=10 \
+    --eval-tokens=512 \
+    --core-metric-every=-1 \
+    --sample-every=-1 \
+    --save-every=-1 \
+    --run="$WANDB_RUN"
+
+echo "[smoke] $(date -Iseconds) base_train complete — smoke passed"
+# trap cleanup handles HF upload + self-delete