From c5e8ce370ce2face6feba26b0be076dca1fe5ae0 Mon Sep 17 00:00:00 2001
From: zolopgh
Date: Sat, 25 Apr 2026 12:53:11 -0400
Subject: [PATCH 1/2] batch baseline speedrun.sh scripts

---
 runs/pace_stage1_tokenizer.sh | 42 ++++++++++++++++++
 runs/pace_stage2a_pretrain.sh | 43 +++++++++++++++++++
 runs/pace_stage2b_pretrain.sh | 68 +++++++++++++++++++++++++++++
 runs/pace_stage2c_pretrain.sh | 68 +++++++++++++++++++++++++++++
 runs/pace_stage2d_pretrain.sh | 68 +++++++++++++++++++++++++++++
 runs/pace_stage3_sft.sh       | 55 ++++++++++++++++++++++++
 runs/pace_submit.sh           | 81 +++++++++++++++++++++++++++++++++++
 7 files changed, 425 insertions(+)
 create mode 100644 runs/pace_stage1_tokenizer.sh
 create mode 100644 runs/pace_stage2a_pretrain.sh
 create mode 100644 runs/pace_stage2b_pretrain.sh
 create mode 100644 runs/pace_stage2c_pretrain.sh
 create mode 100644 runs/pace_stage2d_pretrain.sh
 create mode 100644 runs/pace_stage3_sft.sh
 create mode 100644 runs/pace_submit.sh

diff --git a/runs/pace_stage1_tokenizer.sh b/runs/pace_stage1_tokenizer.sh
new file mode 100644
index 00000000..8caa6214
--- /dev/null
+++ b/runs/pace_stage1_tokenizer.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=24
+#SBATCH --mem=64G
+#SBATCH -t 2:00:00
+#SBATCH -J nanochat-stage1-tokenizer
+#SBATCH -o runs/logs/stage1_%j.out
+#SBATCH -e runs/logs/stage1_%j.err
+
+# Stage 1 (CPU job, no GPU requested): set up the uv venv, download the dataset, train + evaluate the tokenizer.
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p "$NANOCHAT_BASE_DIR"
+mkdir -p runs/logs
+
+echo "=== Stage 1: Tokenizer ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "Started: $(date)"
+
+command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+export PATH="$HOME/.local/bin:$PATH"
+[ -d ".venv" ] || uv venv
+uv sync --extra gpu
+source .venv/bin/activate
+python -m nanochat.report reset
+python -m nanochat.dataset -n 8
+python -m nanochat.dataset -n 170 &  # backgrounded: the larger download overlaps with tokenizer training below
+DATASET_DOWNLOAD_PID=$!
+
+python -m scripts.tok_train
+python -m scripts.tok_eval
+
+echo "Waiting for full dataset download..."
+wait "$DATASET_DOWNLOAD_PID"  # under set -e a failed download fails the job (and blocks the afterok dependency)
+
+echo "=== Stage 1 complete: $(date) ==="
+echo "Dataset and tokenizer ready in $NANOCHAT_BASE_DIR"
diff --git a/runs/pace_stage2a_pretrain.sh b/runs/pace_stage2a_pretrain.sh
new file mode 100644
index 00000000..bee6d11f
--- /dev/null
+++ b/runs/pace_stage2a_pretrain.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH -p ice-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage2a
+#SBATCH -o runs/logs/stage2a_%j.out
+#SBATCH -e runs/logs/stage2a_%j.err
+
+# Stage 2a (2 GPUs): first pretraining chunk, started without --resume-from-step; the done marker is written only if torchrun exits 0.
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
+DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
+
+echo "=== Stage 2a: Pretraining (chunk 1) ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "WANDB_RUN: $WANDB_RUN"
+echo "Started: $(date)"
+
+source .venv/bin/activate
+
+torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+    --depth=24 \
+    --target-param-data-ratio=8 \
+    --device-batch-size=16 \
+    --save-every=200 \
+    --run="$WANDB_RUN"
+
+mkdir -p "$CHECKPOINT_DIR"
+touch "$DONE_MARKER"
+echo "=== Stage 2a complete: $(date) ==="
diff --git a/runs/pace_stage2b_pretrain.sh b/runs/pace_stage2b_pretrain.sh
new file mode 100644
index 00000000..7b8ea806
--- /dev/null
+++ b/runs/pace_stage2b_pretrain.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH -p ice-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage2b
+#SBATCH -o runs/logs/stage2b_%j.out
+#SBATCH -e runs/logs/stage2b_%j.err
+
+# Stage 2b: no-op if the done marker exists; otherwise resume from the highest-numbered model_*.pt (or start fresh if none).
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
+DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
+
+echo "=== Stage 2b: Pretraining (chunk 2 / auto-resume) ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "Started: $(date)"
+
+if [ -f "$DONE_MARKER" ]; then
+    echo "Training already complete (marker: $DONE_MARKER). Nothing to do."
+    echo "=== Stage 2b skipped: $(date) ==="
+    exit 0
+fi
+
+source .venv/bin/activate
+# The checkpoint dir is passed as argv (not spliced into the Python source) so quotes/glob chars in the path are safe.
+LAST_STEP=$(python -c "
+import glob, os, sys
+files = glob.glob(os.path.join(glob.escape(sys.argv[1]), 'model_*.pt'))
+if not files:
+    print(0); sys.exit(0)
+print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files))
+" "$CHECKPOINT_DIR")
+
+if [ "$LAST_STEP" -eq 0 ]; then
+    echo "No checkpoint found — starting from scratch"
+    torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+        --depth=24 \
+        --target-param-data-ratio=8 \
+        --device-batch-size=16 \
+        --save-every=200 \
+        --run="$WANDB_RUN"
+else
+    echo "Resuming from step $LAST_STEP"
+    torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+        --depth=24 \
+        --target-param-data-ratio=8 \
+        --device-batch-size=16 \
+        --save-every=200 \
+        --resume-from-step="$LAST_STEP" \
+        --run="$WANDB_RUN"
+fi
+
+mkdir -p "$CHECKPOINT_DIR"
+touch "$DONE_MARKER"
+echo "=== Stage 2b complete: $(date) ==="
diff --git a/runs/pace_stage2c_pretrain.sh b/runs/pace_stage2c_pretrain.sh
new file mode 100644
index 00000000..92c748e9
--- /dev/null
+++ b/runs/pace_stage2c_pretrain.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH -p ice-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage2c
+#SBATCH -o runs/logs/stage2c_%j.out
+#SBATCH -e runs/logs/stage2c_%j.err
+
+# Stage 2c: no-op if the done marker exists; otherwise resume from the highest-numbered model_*.pt (or start fresh if none).
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
+DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
+
+echo "=== Stage 2c: Pretraining (chunk 3 / auto-resume) ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "Started: $(date)"
+
+if [ -f "$DONE_MARKER" ]; then
+    echo "Training already complete (marker: $DONE_MARKER). Nothing to do."
+    echo "=== Stage 2c skipped: $(date) ==="
+    exit 0
+fi
+
+source .venv/bin/activate
+# The checkpoint dir is passed as argv (not spliced into the Python source) so quotes/glob chars in the path are safe.
+LAST_STEP=$(python -c "
+import glob, os, sys
+files = glob.glob(os.path.join(glob.escape(sys.argv[1]), 'model_*.pt'))
+if not files:
+    print(0); sys.exit(0)
+print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files))
+" "$CHECKPOINT_DIR")
+
+if [ "$LAST_STEP" -eq 0 ]; then
+    echo "No checkpoint found — starting from scratch"
+    torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+        --depth=24 \
+        --target-param-data-ratio=8 \
+        --device-batch-size=16 \
+        --save-every=200 \
+        --run="$WANDB_RUN"
+else
+    echo "Resuming from step $LAST_STEP"
+    torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+        --depth=24 \
+        --target-param-data-ratio=8 \
+        --device-batch-size=16 \
+        --save-every=200 \
+        --resume-from-step="$LAST_STEP" \
+        --run="$WANDB_RUN"
+fi
+
+mkdir -p "$CHECKPOINT_DIR"
+touch "$DONE_MARKER"
+echo "=== Stage 2c complete: $(date) ==="
diff --git a/runs/pace_stage2d_pretrain.sh b/runs/pace_stage2d_pretrain.sh
new file mode 100644
index 00000000..70030001
--- /dev/null
+++ b/runs/pace_stage2d_pretrain.sh
@@ -0,0 +1,68 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH -p ice-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage2d
+#SBATCH -o runs/logs/stage2d_%j.out
+#SBATCH -e runs/logs/stage2d_%j.err
+
+# Stage 2d: no-op if the done marker exists; otherwise resume from the highest-numbered model_*.pt (or start fresh if none).
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
+DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
+
+echo "=== Stage 2d: Pretraining (chunk 4 / auto-resume) ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "Started: $(date)"
+
+if [ -f "$DONE_MARKER" ]; then
+    echo "Training already complete (marker: $DONE_MARKER). Nothing to do."
+    echo "=== Stage 2d skipped: $(date) ==="
+    exit 0
+fi
+
+source .venv/bin/activate
+# The checkpoint dir is passed as argv (not spliced into the Python source) so quotes/glob chars in the path are safe.
+LAST_STEP=$(python -c "
+import glob, os, sys
+files = glob.glob(os.path.join(glob.escape(sys.argv[1]), 'model_*.pt'))
+if not files:
+    print(0); sys.exit(0)
+print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files))
+" "$CHECKPOINT_DIR")
+
+if [ "$LAST_STEP" -eq 0 ]; then
+    echo "No checkpoint found — starting from scratch"
+    torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+        --depth=24 \
+        --target-param-data-ratio=8 \
+        --device-batch-size=16 \
+        --save-every=200 \
+        --run="$WANDB_RUN"
+else
+    echo "Resuming from step $LAST_STEP"
+    torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+        --depth=24 \
+        --target-param-data-ratio=8 \
+        --device-batch-size=16 \
+        --save-every=200 \
+        --resume-from-step="$LAST_STEP" \
+        --run="$WANDB_RUN"
+fi
+
+mkdir -p "$CHECKPOINT_DIR"
+touch "$DONE_MARKER"
+echo "=== Stage 2d complete: $(date) ==="
diff --git a/runs/pace_stage3_sft.sh b/runs/pace_stage3_sft.sh
new file mode 100644
index 00000000..f354f7c4
--- /dev/null
+++ b/runs/pace_stage3_sft.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH -p ice-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage3-sft
+#SBATCH -o runs/logs/stage3_%j.out
+#SBATCH -e runs/logs/stage3_%j.err
+
+# Stage 3: refuses to run without the pretraining done marker, then runs base eval, SFT, chat eval and the report.
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+
+echo "=== Stage 3: Eval + SFT ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "WANDB_RUN: $WANDB_RUN"
+echo "Started: $(date)"
+
+CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
+DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
+if [ ! -f "$DONE_MARKER" ]; then
+    echo "ERROR: pretraining did not finish — missing $DONE_MARKER"
+    echo "Re-run pretrain chunks 2a–2d until the marker is created before running stage 3."
+    exit 1
+fi
+
+source .venv/bin/activate
+
+torchrun --standalone --nproc_per_node=2 -m scripts.base_eval -- \
+    --device-batch-size=16
+# -f: fail on an HTTP error instead of saving the error body as identity_conversations.jsonl.
+curl -fL -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" \
+    https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+
+torchrun --standalone --nproc_per_node=2 -m scripts.chat_sft -- \
+    --device-batch-size=16 \
+    --run="$WANDB_RUN"
+
+torchrun --standalone --nproc_per_node=2 -m scripts.chat_eval -- -i sft
+
+python -m nanochat.report generate
+
+echo "=== Stage 3 complete: $(date) ==="
+
diff --git a/runs/pace_submit.sh b/runs/pace_submit.sh
new file mode 100644
index 00000000..951d8849
--- /dev/null
+++ b/runs/pace_submit.sh
@@ -0,0 +1,81 @@
+#!/bin/bash
+
+# Pipeline (each stage is a separate sbatch job chained with --dependency):
+#   Stage 1  — CPU: tokenizer + dataset
+#   Stage 2a — GPU: pretraining chunk 1
+#   Stage 2b — GPU: auto-resume chunk 2
+#   Stage 2c — GPU: auto-resume chunk 3
+#   Stage 2d — GPU: auto-resume chunk 4
+#   Stage 3  — GPU: base eval + SFT + chat eval + report
+#
+# Usage (from the repo root, which must be checked out at $HOME/scratch/nanochat):
+#   bash runs/pace_submit.sh
+#
+# Optional W&B logging:
+#   WANDB_RUN=my-run bash runs/pace_submit.sh
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+export WANDB_RUN
+
+echo "Submitting nanochat full pipeline..."
+echo "WANDB_RUN=$WANDB_RUN"
+echo ""
+
+# Stage 1
+JOB1=$(sbatch --parsable \
+    --export="ALL,WANDB_RUN=$WANDB_RUN" \
+    runs/pace_stage1_tokenizer.sh)
+echo "Stage 1 submitted: job $JOB1 (tokenizer + dataset)"
+
+# Stage 2a (afterok: only runs if stage 1 succeeded)
+JOB2A=$(sbatch --parsable \
+    --dependency="afterok:$JOB1" \
+    --export="ALL,WANDB_RUN=$WANDB_RUN" \
+    runs/pace_stage2a_pretrain.sh)
+echo "Stage 2a submitted: job $JOB2A (pretrain chunk 1, depends on $JOB1)"
+
+# Stage 2b (afterany: runs even if the previous chunk timed out or failed, so it can resume)
+JOB2B=$(sbatch --parsable \
+    --dependency="afterany:$JOB2A" \
+    --export="ALL,WANDB_RUN=$WANDB_RUN" \
+    runs/pace_stage2b_pretrain.sh)
+echo "Stage 2b submitted: job $JOB2B (pretrain chunk 2, depends on $JOB2A)"
+
+# Stage 2c (afterany, as above)
+JOB2C=$(sbatch --parsable \
+    --dependency="afterany:$JOB2B" \
+    --export="ALL,WANDB_RUN=$WANDB_RUN" \
+    runs/pace_stage2c_pretrain.sh)
+echo "Stage 2c submitted: job $JOB2C (pretrain chunk 3, depends on $JOB2B)"
+
+# Stage 2d (afterany, as above)
+JOB2D=$(sbatch --parsable \
+    --dependency="afterany:$JOB2C" \
+    --export="ALL,WANDB_RUN=$WANDB_RUN" \
+    runs/pace_stage2d_pretrain.sh)
+echo "Stage 2d submitted: job $JOB2D (pretrain chunk 4, depends on $JOB2C)"
+
+# Stage 3 (afterok: only runs if chunk 2d exited 0)
+JOB3=$(sbatch --parsable \
+    --dependency="afterok:$JOB2D" \
+    --export="ALL,WANDB_RUN=$WANDB_RUN" \
+    runs/pace_stage3_sft.sh)
+echo "Stage 3 submitted: job $JOB3 (eval + SFT, depends on $JOB2D)"
+
+echo ""
+echo "All jobs queued. Monitor with:"
+echo " squeue -u $USER"
+echo " tail -f runs/logs/stage1_${JOB1}.out"
+echo " tail -f runs/logs/stage2a_${JOB2A}.out"
+echo " tail -f runs/logs/stage2b_${JOB2B}.out"
+echo " tail -f runs/logs/stage2c_${JOB2C}.out"
+echo " tail -f runs/logs/stage2d_${JOB2D}.out"
+echo " tail -f runs/logs/stage3_${JOB3}.out"
+echo ""
+echo "To cancel everything:"
+echo " scancel $JOB1 $JOB2A $JOB2B $JOB2C $JOB2D $JOB3"

From d7325f930683be629d9179dc07e6a0b9545feffe Mon Sep 17 00:00:00 2001
From: Hayden Free
Date: Sat, 25 Apr 2026 22:23:45 -0400
Subject: [PATCH 2/2] Add generic RunPod runner harness + d12 baseline

---
 runs/runpod/d12.sh     | 119 +++++++++++++++++++++++++++++++++++++++++
 runs/runpod/kickoff.sh |  92 +++++++++++++++++++++++++++++++
 2 files changed, 211 insertions(+)
 create mode 100755 runs/runpod/d12.sh
 create mode 100755 runs/runpod/kickoff.sh

diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh
new file mode 100755
index 00000000..9119c87d
--- /dev/null
+++ b/runs/runpod/d12.sh
@@ -0,0 +1,119 @@
+#!/usr/bin/env bash
+# d12 baseline runner. Runs INSIDE a RunPod pod.
+# Pipeline: tokenizer -> base_train -> base_eval -> SFT -> chat_eval -> report.
+# On exit:
+#   success -> upload final cache to HF, self-delete pod
+#   failure -> upload logs + report dir to HF under _failures/, self-delete pod
+#              (set UPLOAD_FAILURE_CACHE=1 to also dump partial cache for offline debug)
+#
+# Required env (passed via runpodctl --env at pod-create):
+#   HF_TOKEN, WANDB_API_KEY
+# Optional env:
+#   WANDB_RUN             default: d12
+#   NANOCHAT_REPO         default: Team-XSA/nanochat
+#   NANOCHAT_REF          default: dev
+#   HF_REPO               default: haydenfree/nanochat-d12-baseline
+#   BACKUP_INTERVAL       default: 300 (seconds between background HF uploads)
+#   UPLOAD_FAILURE_CACHE  default: 0
+# Auto-set by RunPod:
+#   RUNPOD_POD_ID, RUNPOD_API_KEY (pod-scoped)
+
+set -euo pipefail
+
+NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}"
+NANOCHAT_REF="${NANOCHAT_REF:-dev}"
+HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}"
+WANDB_RUN="${WANDB_RUN:-d12}"
+BACKUP_INTERVAL="${BACKUP_INTERVAL:-300}"
+UPLOAD_FAILURE_CACHE="${UPLOAD_FAILURE_CACHE:-0}"
+
+WORKDIR="/workspace/nanochat"
+LOG_FILE="/workspace/runner.log"
+NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"  # NOTE(review): not exported — presumably the same dir runs/speedrun.sh writes to; confirm
+BACKUP_PID=""
+# Tee into $LOG_FILE unless stdout already points at it (launcher redirect); otherwise every log line is written twice.
+mkdir -p /workspace
+[ /proc/self/fd/1 -ef "$LOG_FILE" ] || exec > >(tee -a "$LOG_FILE") 2>&1
+
+echo "[runner] $(date -Iseconds) starting on pod=${RUNPOD_POD_ID:-unknown}"
+echo "[runner] repo=$NANOCHAT_REPO ref=$NANOCHAT_REF hf_repo=$HF_REPO wandb_run=$WANDB_RUN"
+
+cleanup() {
+  local rc=$?
+  set +e
+  echo "[runner] cleanup: exit code $rc at $(date -Iseconds)"
+  if [ -n "$BACKUP_PID" ] && kill -0 "$BACKUP_PID" 2>/dev/null; then
+    kill "$BACKUP_PID" 2>/dev/null || true
+  fi
+
+  local TS
+  TS=$(date -u +%Y%m%dT%H%M%SZ)
+
+  if [ "$rc" -eq 0 ]; then
+    echo "[runner] success — final upload to $HF_REPO"
+    if [ -d "$NANOCHAT_BASE_DIR" ]; then
+      huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
+        --repo-type model --commit-message "final rc=0 $TS" || \
+        echo "[runner] WARN: final upload failed"
+    fi
+  else
+    echo "[runner] failure rc=$rc — dumping logs to HF for offline debug"
+    mkdir -p /tmp/failure
+    cp /workspace/*.log /tmp/failure/ 2>/dev/null || true
+    [ -d "$NANOCHAT_BASE_DIR/report" ] && cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true
+    [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true)
+
+    huggingface-cli upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \
+      --repo-type model --commit-message "failure rc=$rc logs $TS" || \
+      echo "[runner] WARN: log upload failed"
+
+    if [ "$UPLOAD_FAILURE_CACHE" = "1" ] && [ -d "$NANOCHAT_BASE_DIR" ]; then
+      echo "[runner] UPLOAD_FAILURE_CACHE=1 — also dumping partial cache (may be slow)"
+      huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \
+        --repo-type model --commit-message "failure rc=$rc cache $TS" || true
+    fi
+    echo "[runner] failure artifacts: https://huggingface.co/$HF_REPO/tree/main/_failures/${TS}-rc${rc}"
+  fi
+
+  echo "[runner] self-deleting pod ${RUNPOD_POD_ID:-unknown}"
+  [ -z "${RUNPOD_POD_ID:-}" ] || runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \
+    curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
+      "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID"
+  exit "$rc"
+}
+trap cleanup EXIT
+# Validated after the trap is installed, so a missing variable still goes through cleanup (pod self-delete).
+: "${HF_TOKEN:?HF_TOKEN must be set}"
+: "${WANDB_API_KEY:?WANDB_API_KEY must be set}"
+: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}"
+
+rm -rf "${WORKDIR:?}"
+git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
+cd "$WORKDIR"
+git checkout "$NANOCHAT_REF"
+echo "[runner] HEAD = $(git rev-parse HEAD)"
+# Patch the stock speedrun in place: depth 24 -> 12 and drop the explicit --target-param-data-ratio=8 flag.
+sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh
+sed -i 's/ --target-param-data-ratio=8//' runs/speedrun.sh
+echo "[runner] speedrun.sh edits applied:"
+grep -n 'depth\|target-param' runs/speedrun.sh || true
+
+pip install --quiet --upgrade huggingface_hub
+# Background loop: best-effort upload of the cache dir every BACKUP_INTERVAL seconds; failures only go to backup.log.
+(
+  while true; do
+    sleep "$BACKUP_INTERVAL"
+    if [ -d "$NANOCHAT_BASE_DIR" ]; then
+      huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \
+        --repo-type model \
+        --commit-message "checkpoint $(date -Iseconds)" >> /workspace/backup.log 2>&1 || true
+    fi
+  done
+) &
+BACKUP_PID=$!
+echo "[runner] backup loop pid=$BACKUP_PID interval=${BACKUP_INTERVAL}s"
+
+export WANDB_RUN
+bash runs/speedrun.sh
+
+echo "[runner] $(date -Iseconds) pipeline complete"
diff --git a/runs/runpod/kickoff.sh b/runs/runpod/kickoff.sh
new file mode 100755
index 00000000..a0e4e2fb
--- /dev/null
+++ b/runs/runpod/kickoff.sh
@@ -0,0 +1,92 @@
+#!/usr/bin/env bash
+# Generic local kickoff for RunPod runs.
+# Picks a runner script in this repo (runs/runpod/<runner>.sh) and spins up a pod.
+#
+# Prereqs:
+#   1. ~/.config/team-xsa/runpod.env exports HF_TOKEN, WANDB_API_KEY, RUNPOD_TEMPLATE_ID
+#   2. The template referenced by RUNPOD_TEMPLATE_ID has docker-start-cmd:
+#        bash,-lc,curl -fsSL "$RUNNER_URL" | bash >> /workspace/runner.log 2>&1
+#   3. The runner script for this experiment has been pushed to Team-XSA/nanochat
+#
+# Usage:
+#   source ~/.config/team-xsa/runpod.env
+#   bash runs/runpod/kickoff.sh d12       # uses runs/runpod/d12.sh
+#   bash runs/runpod/kickoff.sh d24       # uses runs/runpod/d24.sh
+#   bash runs/runpod/kickoff.sh xsa_d12   # uses runs/runpod/xsa_d12.sh
+#
+# Optional env overrides:
+#   GPU_ID         default: "NVIDIA H100 80GB HBM3"
+#   GPU_COUNT      default: 8
+#   CLOUD_TYPE     default: COMMUNITY (SECURE for guaranteed availability)
+#   DISK_GB        default: 200
+#   NANOCHAT_REPO / NANOCHAT_REF   defaults: Team-XSA/nanochat / dev
+#   HF_REPO        default: unset (the runner script picks its own, e.g. d12.sh); forwarded to the pod when set
+#   WANDB_RUN      default: <runner>
+#   POD_NAME       default: <runner>-<YYYYmmdd-HHMM>
+
+set -euo pipefail
+
+RUNNER="${1:-}"
+if [ -z "$RUNNER" ]; then
+  echo "Usage: bash runs/runpod/kickoff.sh <runner>" >&2
+  echo " e.g. bash runs/runpod/kickoff.sh d12" >&2
+  exit 1
+fi
+
+: "${HF_TOKEN:?HF_TOKEN not set — source ~/.config/team-xsa/runpod.env}"
+: "${WANDB_API_KEY:?WANDB_API_KEY not set — source ~/.config/team-xsa/runpod.env}"
+: "${RUNPOD_TEMPLATE_ID:?RUNPOD_TEMPLATE_ID not set — create the template once and add it to ~/.config/team-xsa/runpod.env}"
+
+NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}"
+NANOCHAT_REF="${NANOCHAT_REF:-dev}"
+WANDB_RUN="${WANDB_RUN:-$RUNNER}"
+RUNNER_URL="${RUNNER_URL:-https://raw.githubusercontent.com/${NANOCHAT_REPO}/${NANOCHAT_REF}/runs/runpod/${RUNNER}.sh}"
+
+GPU_ID="${GPU_ID:-NVIDIA H100 80GB HBM3}"
+GPU_COUNT="${GPU_COUNT:-8}"
+CLOUD_TYPE="${CLOUD_TYPE:-COMMUNITY}"
+DISK_GB="${DISK_GB:-200}"
+POD_NAME="${POD_NAME:-${RUNNER}-$(date +%Y%m%d-%H%M)}"
+
+echo "Verifying runner URL is reachable: $RUNNER_URL"
+if ! curl -sfI "$RUNNER_URL" >/dev/null; then
+  echo "ERROR: runner not reachable at $RUNNER_URL" >&2
+  echo " - Did you push runs/runpod/${RUNNER}.sh to ${NANOCHAT_REPO}@${NANOCHAT_REF}?" >&2
+  echo " - Is the repo public?" >&2
+  exit 1
+fi
+# HF_REPO is export-marked too; if it is unset it never reaches the environment, so the Python filter below skips it.
+export HF_TOKEN WANDB_API_KEY WANDB_RUN RUNNER_URL NANOCHAT_REPO NANOCHAT_REF HF_REPO
+ENV_JSON=$(python3 - <<'PY'
+import json, os
+keys = ["HF_TOKEN","WANDB_API_KEY","WANDB_RUN","RUNNER_URL","NANOCHAT_REPO","NANOCHAT_REF","HF_REPO"]
+print(json.dumps({k: os.environ[k] for k in keys if k in os.environ}))
+PY
+)
+
+echo "Creating pod:"
+echo " name = $POD_NAME"
+echo " template = $RUNPOD_TEMPLATE_ID"
+echo " runner = $RUNNER_URL"
+echo " gpu = $GPU_COUNT × $GPU_ID"
+echo " cloud = $CLOUD_TYPE"
+echo " disk = ${DISK_GB} GB"
+
+runpodctl pod create \
+  --name "$POD_NAME" \
+  --template-id "$RUNPOD_TEMPLATE_ID" \
+  --gpu-id "$GPU_ID" \
+  --gpu-count "$GPU_COUNT" \
+  --cloud-type "$CLOUD_TYPE" \
+  --container-disk-in-gb "$DISK_GB" \
+  --env "$ENV_JSON"
+
+echo
+echo "Logs (after pod boots):"
+echo " POD_ID=\$(runpodctl pod list --name '$POD_NAME' -o json | jq -r '.[0].id')"
+echo " runpodctl ssh info \$POD_ID"
+echo " ssh <user>@<host> 'tail -f /workspace/runner.log'"
+echo
+echo "Wandb: project=nanochat / nanochat-sft, run name: $WANDB_RUN"
+echo "HF (success): https://huggingface.co/${HF_REPO:-haydenfree/nanochat-d12-baseline}"
+echo "HF (failure): https://huggingface.co/${HF_REPO:-haydenfree/nanochat-d12-baseline}/tree/main/_failures"