diff --git a/runs/runpod/kickoff.sh b/runs/runpod/kickoff.sh
index a0e4e2fb..bb918171 100755
--- a/runs/runpod/kickoff.sh
+++ b/runs/runpod/kickoff.sh
@@ -17,7 +17,7 @@
 # Optional env overrides:
 # GPU_ID default: "NVIDIA H100 80GB HBM3"
 # GPU_COUNT default: 8
-# CLOUD_TYPE default: COMMUNITY (SECURE for guaranteed availability)
+# CLOUD_TYPE default: SECURE (COMMUNITY when capacity available, cheaper)
 # DISK_GB default: 200
 # NANOCHAT_REPO default: Team-XSA/nanochat
 # NANOCHAT_REF default: dev
@@ -44,7 +44,7 @@ RUNNER_URL="${RUNNER_URL:-https://raw.githubusercontent.com/${NANOCHAT_REPO}/${N
 
 GPU_ID="${GPU_ID:-NVIDIA H100 80GB HBM3}"
 GPU_COUNT="${GPU_COUNT:-8}"
-CLOUD_TYPE="${CLOUD_TYPE:-COMMUNITY}"
+CLOUD_TYPE="${CLOUD_TYPE:-SECURE}"
 DISK_GB="${DISK_GB:-200}"
 POD_NAME="${POD_NAME:-${RUNNER}-$(date +%Y%m%d-%H%M)}"
 
diff --git a/runs/runpod/smoke.sh b/runs/runpod/smoke.sh
new file mode 100755
index 00000000..d023a764
--- /dev/null
+++ b/runs/runpod/smoke.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Minimal smoke test. Runs INSIDE a RunPod pod.
+# Validates: pod boot, env-var injection, git clone, uv sync, GPU torch,
+# tokenizer + base_train code paths, HF upload, runpodctl self-delete.
+# Does NOT test: multi-GPU, FP8, full training horizon, SFT, eval.
+#
+# Sized for a 1-GPU pod, completes in ~3-4 min wall clock.
+# Kick off with: GPU_COUNT=1 bash runs/runpod/kickoff.sh smoke
+#
+# Required env: HF_TOKEN, WANDB_API_KEY
+# Auto-set by RunPod: RUNPOD_POD_ID, RUNPOD_API_KEY
+
+set -euo pipefail
+
+NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}"
+NANOCHAT_REF="${NANOCHAT_REF:-dev}"
+HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}"
+WANDB_RUN="${WANDB_RUN:-smoke}"
+
+TS=$(date -u +%Y%m%dT%H%M%SZ)
+HF_PATH_PREFIX="_smoke/${TS}"
+
+WORKDIR="/workspace/nanochat"
+LOG_FILE="/workspace/runner.log"
+NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
+
+mkdir -p /workspace
+exec > >(tee -a "$LOG_FILE") 2>&1
+
+echo "[smoke] $(date -Iseconds) starting on pod=${RUNPOD_POD_ID:-unknown}"
+
+cleanup() {
+  local rc=$?
+  set +e
+  echo "[smoke] cleanup: exit code $rc"
+
+  # Always upload the runner log (success or failure) so we can see what happened
+  mkdir -p /tmp/smoke-out
+  cp /workspace/*.log /tmp/smoke-out/ 2>/dev/null || true
+  echo "rc=$rc ts=$TS pod=${RUNPOD_POD_ID:-unknown}" > /tmp/smoke-out/result.txt
+  [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/smoke-out/git-head.txt || true)
+  huggingface-cli upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \
+    --repo-type model --commit-message "smoke rc=$rc $TS" || \
+    echo "[smoke] WARN: HF upload failed"
+
+  echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX"
+  echo "[smoke] self-deleting pod ${RUNPOD_POD_ID:-unknown}"
+  runpodctl pod delete "${RUNPOD_POD_ID:-}" 2>&1 || \
+    curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
+      "https://rest.runpod.io/v1/pods/${RUNPOD_POD_ID:-}"
+  exit "$rc"
+}
+trap cleanup EXIT
+
+: "${HF_TOKEN:?HF_TOKEN must be set}"
+: "${WANDB_API_KEY:?WANDB_API_KEY must be set}"
+: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}"
+
+# Clone fork
+rm -rf "$WORKDIR"
+git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
+cd "$WORKDIR"
+git checkout "$NANOCHAT_REF"
+echo "[smoke] HEAD = $(git rev-parse HEAD)"
+
+# Env + uv
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR
+mkdir -p "$NANOCHAT_BASE_DIR"
+command -v uv &> /dev/null || { curl -LsSf https://astral.sh/uv/install.sh | sh; export PATH="$HOME/.local/bin:$PATH"; }
+[ -d ".venv" ] || uv venv
+uv sync --extra gpu
+source .venv/bin/activate
+pip install --quiet --upgrade huggingface_hub
+
+# GPU sanity
+python -c "import torch; print('[smoke] torch', torch.__version__, 'cuda', torch.cuda.is_available(), 'devices', torch.cuda.device_count())"
+
+# Minimum dataset + tokenizer (1 shard, 50M chars — enough for the tokenizer
+# to train on AND for base_train to consume 20 iterations of tokens)
+python -m nanochat.dataset -n 1
+python -m scripts.tok_train --max-chars=50000000
+
+# Tiny base_train. Params from base_train.py docstring (the CPU smoke), adjusted
+# slightly for GPU. depth=4, 20 iterations. Should finish in ~30s.
+NPROC=$(nvidia-smi -L | wc -l)
+echo "[smoke] training on $NPROC GPU(s)"
+torchrun --standalone --nproc_per_node="$NPROC" -m scripts.base_train -- \
+  --depth=4 \
+  --max-seq-len=512 \
+  --device-batch-size=1 \
+  --total-batch-size=512 \
+  --num-iterations=20 \
+  --eval-every=10 \
+  --eval-tokens=512 \
+  --core-metric-every=-1 \
+  --sample-every=-1 \
+  --save-every=-1 \
+  --run="$WANDB_RUN"
+
+echo "[smoke] $(date -Iseconds) base_train complete — smoke passed"
+# trap cleanup handles HF upload + self-delete