mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-08 08:49:53 +00:00
Add generic RunPod runner harness + d12 baseline
This commit is contained in:
parent
c5e8ce370c
commit
d7325f9306
119
runs/runpod/d12.sh
Executable file
119
runs/runpod/d12.sh
Executable file
|
|
@ -0,0 +1,119 @@
|
|||
#!/usr/bin/env bash
|
||||
# d12 baseline runner. Runs INSIDE a RunPod pod.
|
||||
# Pipeline: tokenizer -> base_train -> base_eval -> SFT -> chat_eval -> report.
|
||||
# On exit:
|
||||
# success -> upload final cache to HF, self-delete pod
|
||||
# failure -> upload logs + report dir to HF under _failures/, self-delete pod
|
||||
# (set UPLOAD_FAILURE_CACHE=1 to also dump partial cache for offline debug)
|
||||
#
|
||||
# Required env (passed via runpodctl --env at pod-create):
|
||||
# HF_TOKEN, WANDB_API_KEY
|
||||
# Optional env:
|
||||
# WANDB_RUN default: d12
|
||||
# NANOCHAT_REPO default: Team-XSA/nanochat
|
||||
# NANOCHAT_REF default: dev
|
||||
# HF_REPO default: haydenfree/nanochat-d12-baseline
|
||||
# BACKUP_INTERVAL default: 300 (seconds between background HF uploads)
|
||||
# UPLOAD_FAILURE_CACHE default: 0
|
||||
# Auto-set by RunPod:
|
||||
# RUNPOD_POD_ID, RUNPOD_API_KEY (pod-scoped)
|
||||
|
||||
# Strict mode: abort on command failure, on use of an unset variable, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Caller-overridable settings. `:=` assigns the default only when the variable
# is unset or empty, so values passed in through the environment win.
: "${NANOCHAT_REPO:=Team-XSA/nanochat}"
: "${NANOCHAT_REF:=dev}"
: "${HF_REPO:=haydenfree/nanochat-d12-baseline}"
: "${WANDB_RUN:=d12}"
: "${BACKUP_INTERVAL:=300}"
: "${UPLOAD_FAILURE_CACHE:=0}"

# Fixed locations used by the rest of the runner.
WORKDIR=/workspace/nanochat
LOG_FILE=/workspace/runner.log
NANOCHAT_BASE_DIR="${HOME}/.cache/nanochat"

# PID of the background backup loop; stays empty until that loop is started.
BACKUP_PID=
|
||||
|
||||
mkdir -p /workspace

# Send stdout and stderr to both the console and $LOG_FILE.
# If the launcher already pointed stdout at $LOG_FILE (for example
# `curl ... | bash >> /workspace/runner.log 2>&1`), piping through `tee -a`
# would write every line to that file twice. In that case only fold stderr
# into stdout and let the existing redirection do the logging.
if [ -e "$LOG_FILE" ] && [ /dev/fd/1 -ef "$LOG_FILE" ]; then
  exec 2>&1
else
  exec > >(tee -a "$LOG_FILE") 2>&1
fi

# RUNPOD_POD_ID has not been validated at this point. Expand it with a
# fallback so that nounset cannot abort the script here with a bare
# "unbound variable" message before the explicit check runs.
echo "[runner] $(date -Iseconds) starting on pod=${RUNPOD_POD_ID:-<unset>}"
echo "[runner] repo=$NANOCHAT_REPO ref=$NANOCHAT_REF hf_repo=$HF_REPO wandb_run=$WANDB_RUN"
|
||||
|
||||
#######################################
# Exit handler: publish artifacts to Hugging Face, then delete the pod.
# Globals (read):
#   BACKUP_PID, HF_REPO, NANOCHAT_BASE_DIR, WORKDIR, UPLOAD_FAILURE_CACHE,
#   RUNPOD_POD_ID, RUNPOD_API_KEY
# Behavior:
#   exit status 0  -> upload $NANOCHAT_BASE_DIR to the root of $HF_REPO
#   exit status !0 -> upload /workspace/*.log, the report dir and the git HEAD
#                     to _failures/<timestamp>-rc<rc>/logs; also upload the
#                     whole cache when UPLOAD_FAILURE_CACHE=1
# Upload failures are reported as warnings and never block pod deletion.
# Exits with the status the script was already exiting with.
#######################################
cleanup() {
  # Must be the first statement: captures the status the script is exiting with.
  local rc=$?
  # Best-effort from here on; one failing step must not stop the others.
  set +e
  echo "[runner] cleanup: exit code $rc at $(date -Iseconds)"

  # Stop the periodic backup loop if it was started and is still alive.
  if [ -n "${BACKUP_PID:-}" ] && kill -0 "$BACKUP_PID" 2>/dev/null; then
    kill "$BACKUP_PID" 2>/dev/null || true
  fi

  local TS
  TS=$(date -u +%Y%m%dT%H%M%SZ)

  if [ "$rc" -eq 0 ]; then
    echo "[runner] success — final upload to $HF_REPO"
    if [ -d "$NANOCHAT_BASE_DIR" ]; then
      if ! huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
        --repo-type model --commit-message "final rc=0 $TS"; then
        echo "[runner] WARN: final upload failed"
      fi
    fi
  else
    echo "[runner] failure rc=$rc — dumping logs to HF for offline debug"
    mkdir -p /tmp/failure
    # Any of these sources may be missing depending on how far the run got.
    cp /workspace/*.log /tmp/failure/ 2>/dev/null
    if [ -d "$NANOCHAT_BASE_DIR/report" ]; then
      cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null
    fi
    if [ -d "$WORKDIR" ]; then
      (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt)
    fi

    if ! huggingface-cli upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \
      --repo-type model --commit-message "failure rc=$rc logs $TS"; then
      echo "[runner] WARN: log upload failed"
    fi

    if [ "$UPLOAD_FAILURE_CACHE" = "1" ] && [ -d "$NANOCHAT_BASE_DIR" ]; then
      echo "[runner] UPLOAD_FAILURE_CACHE=1 — also dumping partial cache (may be slow)"
      huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \
        --repo-type model --commit-message "failure rc=$rc cache $TS" || true
    fi
    echo "[runner] failure artifacts: https://huggingface.co/$HF_REPO/tree/main/_failures/${TS}-rc${rc}"
  fi

  # The handler also runs when the RUNPOD_POD_ID check itself failed. Read the
  # variable with a fallback so nounset does not abort the handler midway.
  local pod_id="${RUNPOD_POD_ID:-}"
  if [ -z "$pod_id" ]; then
    echo "[runner] WARN: RUNPOD_POD_ID is not set — cannot self-delete pod"
  else
    echo "[runner] self-deleting pod $pod_id"
    # Prefer the CLI; fall back to the REST endpoint if the CLI call fails.
    if ! runpodctl pod delete "$pod_id" 2>&1; then
      curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
        "https://rest.runpod.io/v1/pods/$pod_id"
    fi
  fi
  exit "$rc"
}
|
||||
# From here on every exit path (success or failure) goes through cleanup.
trap cleanup EXIT

# Fail fast when required settings are missing. Because the trap is already
# installed, a failure here still runs cleanup.
: "${HF_TOKEN:?HF_TOKEN must be set}"
: "${WANDB_API_KEY:?WANDB_API_KEY must be set}"
: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}"

# Fresh checkout of the requested ref. `${WORKDIR:?}` refuses to run `rm -rf`
# with an empty path.
rm -rf -- "${WORKDIR:?}"
git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
cd "$WORKDIR"
git checkout "$NANOCHAT_REF"
echo "[runner] HEAD = $(git rev-parse HEAD)"

# Patch speedrun.sh in place: depth 24 -> 12, and drop the explicit
# --target-param-data-ratio=8 flag.
sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh
sed -i 's/ --target-param-data-ratio=8//' runs/speedrun.sh
echo "[runner] speedrun.sh edits applied:"
grep -n 'depth\|target-param' runs/speedrun.sh || true

# sed exits 0 even when nothing matched. Without this check a changed
# speedrun.sh would run at its own depth while being reported as d12.
if ! grep -q -- '--depth=12' runs/speedrun.sh; then
  echo "[runner] ERROR: runs/speedrun.sh does not contain --depth=12 after patching; refusing to run" >&2
  exit 1
fi
|
||||
|
||||
pip install --quiet --upgrade huggingface_hub

# Periodic best-effort backup: every $BACKUP_INTERVAL seconds, upload the cache
# directory (once it exists) to the root of $HF_REPO. Output is appended to
# /workspace/backup.log and upload failures are ignored so the loop keeps going.
backup_loop() {
  while :; do
    sleep "$BACKUP_INTERVAL"
    [ -d "$NANOCHAT_BASE_DIR" ] || continue
    huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
      --repo-type model \
      --commit-message "checkpoint $(date -Iseconds)" \
      >> /workspace/backup.log 2>&1 || true
  done
}

# Run the loop in the background and remember its PID for later shutdown.
backup_loop &
BACKUP_PID=$!
echo "[runner] backup loop pid=$BACKUP_PID interval=${BACKUP_INTERVAL}s"
|
||||
|
||||
# Exporting is enough for the child to see WANDB_RUN; no per-command
# assignment prefix is needed.
export WANDB_RUN

# stdin comes from /dev/null. When this runner is itself piped into bash
# (`curl ... | bash`), child processes would otherwise inherit the pipe that
# carries the rest of this script, and any child reading stdin would consume it.
bash runs/speedrun.sh </dev/null

echo "[runner] $(date -Iseconds) pipeline complete"
|
||||
92
runs/runpod/kickoff.sh
Executable file
92
runs/runpod/kickoff.sh
Executable file
|
|
@ -0,0 +1,92 @@
|
|||
#!/usr/bin/env bash
|
||||
# Generic local kickoff for RunPod runs.
|
||||
# Picks a runner script in this repo (runs/runpod/<RUNNER>.sh) and spins up a pod.
|
||||
#
|
||||
# Prereqs:
|
||||
# 1. ~/.config/team-xsa/runpod.env exports HF_TOKEN, WANDB_API_KEY, RUNPOD_TEMPLATE_ID
|
||||
# 2. The template referenced by RUNPOD_TEMPLATE_ID has docker-start-cmd:
|
||||
# bash,-lc,curl -fsSL "$RUNNER_URL" | bash >> /workspace/runner.log 2>&1
|
||||
# 3. The runner script for this experiment has been pushed to Team-XSA/nanochat
|
||||
#
|
||||
# Usage:
|
||||
# source ~/.config/team-xsa/runpod.env
|
||||
# bash runs/runpod/kickoff.sh d12 # uses runs/runpod/d12.sh
|
||||
# bash runs/runpod/kickoff.sh d24 # uses runs/runpod/d24.sh
|
||||
# bash runs/runpod/kickoff.sh xsa_d12 # uses runs/runpod/xsa_d12.sh
|
||||
#
|
||||
# Optional env overrides:
|
||||
# GPU_ID default: "NVIDIA H100 80GB HBM3"
|
||||
# GPU_COUNT default: 8
|
||||
# CLOUD_TYPE default: COMMUNITY (SECURE for guaranteed availability)
|
||||
# DISK_GB default: 200
|
||||
# NANOCHAT_REPO default: Team-XSA/nanochat
|
||||
# NANOCHAT_REF default: dev
|
||||
# WANDB_RUN default: <RUNNER>
|
||||
# POD_NAME default: <RUNNER>-<timestamp>
|
||||
|
||||
set -euo pipefail

# First positional argument selects the runner script runs/runpod/<RUNNER>.sh.
RUNNER="${1:-}"
if [ -z "$RUNNER" ]; then
  # Usage is a diagnostic, so it goes to stderr; exit status stays 1.
  {
    echo "Usage: bash runs/runpod/kickoff.sh <runner-name>"
    echo " e.g. bash runs/runpod/kickoff.sh d12"
  } >&2
  exit 1
fi

# Required settings; `:?` aborts with the given message when unset or empty.
: "${HF_TOKEN:?HF_TOKEN not set — source ~/.config/team-xsa/runpod.env}"
: "${WANDB_API_KEY:?WANDB_API_KEY not set — source ~/.config/team-xsa/runpod.env}"
: "${RUNPOD_TEMPLATE_ID:?RUNPOD_TEMPLATE_ID not set — create the template once and add it to ~/.config/team-xsa/runpod.env}"
|
||||
|
||||
# Source location of the runner script and the W&B run name. Each value keeps
# whatever the caller exported and otherwise takes the default shown here.
: "${NANOCHAT_REPO:=Team-XSA/nanochat}"
: "${NANOCHAT_REF:=dev}"
: "${WANDB_RUN:=$RUNNER}"
: "${RUNNER_URL:=https://raw.githubusercontent.com/${NANOCHAT_REPO}/${NANOCHAT_REF}/runs/runpod/${RUNNER}.sh}"

# Pod hardware and naming.
: "${GPU_ID:=NVIDIA H100 80GB HBM3}"
: "${GPU_COUNT:=8}"
: "${CLOUD_TYPE:=COMMUNITY}"
: "${DISK_GB:=200}"
: "${POD_NAME:=${RUNNER}-$(date +%Y%m%d-%H%M)}"
|
||||
|
||||
# Pre-flight: issue a HEAD request for the runner script so a missing or
# private script is caught before a pod is created.
echo "Verifying runner URL is reachable: $RUNNER_URL"
if ! curl -sfI "$RUNNER_URL" >/dev/null; then
  # Diagnostics go to stderr, matching the `:?` checks used for required env.
  {
    echo "ERROR: runner not reachable at $RUNNER_URL"
    echo " - Did you push runs/runpod/${RUNNER}.sh to ${NANOCHAT_REPO}@${NANOCHAT_REF}?"
    echo " - Is the repo public?"
  } >&2
  exit 1
fi
|
||||
|
||||
# Mark the pod's settings for export so the Python helper below can read them
# from the environment. The second line lists optional runner settings; marking
# an unset variable for export does not create it, so they are forwarded only
# when the caller actually set them.
export HF_TOKEN WANDB_API_KEY WANDB_RUN RUNNER_URL NANOCHAT_REPO NANOCHAT_REF
export HF_REPO BACKUP_INTERVAL UPLOAD_FAILURE_CACHE

#######################################
# Print a JSON object holding every forwarded variable that is present in the
# environment. json.dumps handles quoting of the values.
# Outputs: one line of JSON on stdout.
#######################################
build_env_json() {
  python3 - <<'PY'
import json, os
keys = [
    "HF_TOKEN", "WANDB_API_KEY", "WANDB_RUN", "RUNNER_URL",
    "NANOCHAT_REPO", "NANOCHAT_REF",
    "HF_REPO", "BACKUP_INTERVAL", "UPLOAD_FAILURE_CACHE",
]
print(json.dumps({k: os.environ[k] for k in keys if k in os.environ}))
PY
}
ENV_JSON=$(build_env_json)

echo "Creating pod:"
echo " name = $POD_NAME"
echo " template = $RUNPOD_TEMPLATE_ID"
echo " runner = $RUNNER_URL"
echo " gpu = $GPU_COUNT × $GPU_ID"
echo " cloud = $CLOUD_TYPE"
echo " disk = ${DISK_GB} GB"

# NOTE(review): ENV_JSON contains HF_TOKEN and WANDB_API_KEY and is passed on
# the command line, so it is visible in the local process list while
# runpodctl runs.
runpodctl pod create \
  --name "$POD_NAME" \
  --template-id "$RUNPOD_TEMPLATE_ID" \
  --gpu-id "$GPU_ID" \
  --gpu-count "$GPU_COUNT" \
  --cloud-type "$CLOUD_TYPE" \
  --container-disk-in-gb "$DISK_GB" \
  --env "$ENV_JSON"

echo
echo "Logs (after pod boots):"
echo " POD_ID=\$(runpodctl pod list --name '$POD_NAME' -o json | jq -r '.[0].id')"
echo " runpodctl ssh info \$POD_ID"
echo " ssh <user>@<host> 'tail -f /workspace/runner.log'"
echo
echo "Wandb: project=nanochat / nanochat-sft, run name: $WANDB_RUN"

# Show the repo the pod was actually told to use. Without an HF_REPO override
# this falls back to the previously hard-coded d12 repo.
# NOTE(review): other runners may default to a different repo — confirm.
hf_repo="${HF_REPO:-haydenfree/nanochat-d12-baseline}"
echo "HF (success): https://huggingface.co/$hf_repo"
echo "HF (failure): https://huggingface.co/$hf_repo/tree/main/_failures"
|
||||
Loading…
Reference in New Issue
Block a user