From c5e8ce370ce2face6feba26b0be076dca1fe5ae0 Mon Sep 17 00:00:00 2001 From: zolopgh Date: Sat, 25 Apr 2026 12:53:11 -0400 Subject: [PATCH 01/11] batch baseline speedrun.sh scripts --- runs/pace_stage1_tokenizer.sh | 42 ++++++++++++++++++ runs/pace_stage2a_pretrain.sh | 43 +++++++++++++++++++ runs/pace_stage2b_pretrain.sh | 68 +++++++++++++++++++++++++++++ runs/pace_stage2c_pretrain.sh | 68 +++++++++++++++++++++++++++++ runs/pace_stage2d_pretrain.sh | 68 +++++++++++++++++++++++++++++ runs/pace_stage3_sft.sh | 55 ++++++++++++++++++++++++ runs/pace_submit.sh | 81 +++++++++++++++++++++++++++++++++++ 7 files changed, 425 insertions(+) create mode 100644 runs/pace_stage1_tokenizer.sh create mode 100644 runs/pace_stage2a_pretrain.sh create mode 100644 runs/pace_stage2b_pretrain.sh create mode 100644 runs/pace_stage2c_pretrain.sh create mode 100644 runs/pace_stage2d_pretrain.sh create mode 100644 runs/pace_stage3_sft.sh create mode 100644 runs/pace_submit.sh diff --git a/runs/pace_stage1_tokenizer.sh b/runs/pace_stage1_tokenizer.sh new file mode 100644 index 00000000..8caa6214 --- /dev/null +++ b/runs/pace_stage1_tokenizer.sh @@ -0,0 +1,42 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --mem=64G +#SBATCH -t 2:00:00 +#SBATCH -J nanochat-stage1-tokenizer +#SBATCH -o runs/logs/stage1_%j.out +#SBATCH -e runs/logs/stage1_%j.err + +# Stage 1 + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p "$NANOCHAT_BASE_DIR" +mkdir -p runs/logs + +echo "=== Stage 1: Tokenizer ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "Started: $(date)" + +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +export PATH="$HOME/.local/bin:$PATH" +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate +python -m nanochat.report reset +python -m nanochat.dataset -n 8 +python -m nanochat.dataset -n 170 & +DATASET_DOWNLOAD_PID=$! + +python -m scripts.tok_train +python -m scripts.tok_eval + +echo "Waiting for full dataset download..." +wait $DATASET_DOWNLOAD_PID + +echo "=== Stage 1 complete: $(date) ===" +echo "Dataset and tokenizer ready in $NANOCHAT_BASE_DIR" diff --git a/runs/pace_stage2a_pretrain.sh b/runs/pace_stage2a_pretrain.sh new file mode 100644 index 00000000..bee6d11f --- /dev/null +++ b/runs/pace_stage2a_pretrain.sh @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 +#SBATCH --constraint="gpu-h100|gpu-h200" +#SBATCH --mem-per-gpu=48G +#SBATCH -t 3:55:00 +#SBATCH -J nanochat-stage2a +#SBATCH -o runs/logs/stage2a_%j.out +#SBATCH -e runs/logs/stage2a_%j.err + +# Stage 2a + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" + +echo "=== Stage 2a: Pretraining (chunk 1) ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "WANDB_RUN: $WANDB_RUN" +echo "Started: $(date)" + +source .venv/bin/activate + +torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --run=$WANDB_RUN + +mkdir -p "$CHECKPOINT_DIR" +touch "$DONE_MARKER" +echo "=== Stage 2a complete: $(date) ===" diff --git a/runs/pace_stage2b_pretrain.sh b/runs/pace_stage2b_pretrain.sh new file mode 100644 index 00000000..7b8ea806 --- /dev/null +++ b/runs/pace_stage2b_pretrain.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 +#SBATCH --constraint="gpu-h100|gpu-h200" +#SBATCH --mem-per-gpu=48G +#SBATCH -t 3:55:00 +#SBATCH -J nanochat-stage2b +#SBATCH -o runs/logs/stage2b_%j.out +#SBATCH -e runs/logs/stage2b_%j.err + +# Stage 2b + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" + +echo "=== Stage 2b: Pretraining (chunk 2 / auto-resume) ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "Started: $(date)" + +if [ -f "$DONE_MARKER" ]; then + echo "Training already complete (marker: $DONE_MARKER). Nothing to do." + echo "=== Stage 2b skipped: $(date) ===" + exit 0 +fi + +source .venv/bin/activate + +LAST_STEP=$(python -c " +import glob, os, sys +files = glob.glob('${CHECKPOINT_DIR}/model_*.pt') +if not files: + print(0); sys.exit(0) +print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files)) +") + +if [ "$LAST_STEP" -eq 0 ]; then + echo "No checkpoint found — starting from scratch" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --run=$WANDB_RUN +else + echo "Resuming from step $LAST_STEP" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --resume-from-step=$LAST_STEP \ + --run=$WANDB_RUN +fi + +mkdir -p "$CHECKPOINT_DIR" +touch "$DONE_MARKER" +echo "=== Stage 2b complete: $(date) ===" diff --git a/runs/pace_stage2c_pretrain.sh b/runs/pace_stage2c_pretrain.sh new file mode 100644 index 00000000..92c748e9 --- /dev/null +++ b/runs/pace_stage2c_pretrain.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 +#SBATCH --constraint="gpu-h100|gpu-h200" +#SBATCH --mem-per-gpu=48G +#SBATCH -t 3:55:00 +#SBATCH -J nanochat-stage2c +#SBATCH -o runs/logs/stage2c_%j.out +#SBATCH -e runs/logs/stage2c_%j.err + +# Stage 2c + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" + +echo "=== Stage 2c: Pretraining (chunk 3 / auto-resume) ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "Started: $(date)" + +if [ -f "$DONE_MARKER" ]; then + echo "Training already complete (marker: $DONE_MARKER). Nothing to do." + echo "=== Stage 2c skipped: $(date) ===" + exit 0 +fi + +source .venv/bin/activate + +LAST_STEP=$(python -c " +import glob, os, sys +files = glob.glob('${CHECKPOINT_DIR}/model_*.pt') +if not files: + print(0); sys.exit(0) +print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files)) +") + +if [ "$LAST_STEP" -eq 0 ]; then + echo "No checkpoint found — starting from scratch" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --run=$WANDB_RUN +else + echo "Resuming from step $LAST_STEP" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --resume-from-step=$LAST_STEP \ + --run=$WANDB_RUN +fi + +mkdir -p "$CHECKPOINT_DIR" +touch "$DONE_MARKER" +echo "=== Stage 2c complete: $(date) ===" diff --git a/runs/pace_stage2d_pretrain.sh b/runs/pace_stage2d_pretrain.sh new file mode 100644 index 00000000..70030001 --- /dev/null +++ b/runs/pace_stage2d_pretrain.sh @@ -0,0 +1,68 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 +#SBATCH --constraint="gpu-h100|gpu-h200" +#SBATCH --mem-per-gpu=48G +#SBATCH -t 3:55:00 +#SBATCH -J nanochat-stage2d +#SBATCH -o runs/logs/stage2d_%j.out +#SBATCH -e runs/logs/stage2d_%j.err + +# Stage 2d + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" + +echo "=== Stage 2d: Pretraining (chunk 4 / auto-resume) ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "Started: $(date)" + +if [ -f "$DONE_MARKER" ]; then + echo "Training already complete (marker: $DONE_MARKER). Nothing to do." + echo "=== Stage 2d skipped: $(date) ===" + exit 0 +fi + +source .venv/bin/activate + +LAST_STEP=$(python -c " +import glob, os, sys +files = glob.glob('${CHECKPOINT_DIR}/model_*.pt') +if not files: + print(0); sys.exit(0) +print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files)) +") + +if [ "$LAST_STEP" -eq 0 ]; then + echo "No checkpoint found — starting from scratch" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --run=$WANDB_RUN +else + echo "Resuming from step $LAST_STEP" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --resume-from-step=$LAST_STEP \ + --run=$WANDB_RUN +fi + +mkdir -p "$CHECKPOINT_DIR" +touch "$DONE_MARKER" +echo "=== Stage 2d complete: $(date) ===" diff --git a/runs/pace_stage3_sft.sh b/runs/pace_stage3_sft.sh new file mode 100644 index 00000000..f354f7c4 --- /dev/null +++ b/runs/pace_stage3_sft.sh @@ -0,0 +1,55 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 +#SBATCH --constraint="gpu-h100|gpu-h200" +#SBATCH --mem-per-gpu=48G +#SBATCH -t 3:55:00 +#SBATCH -J nanochat-stage3-sft +#SBATCH -o runs/logs/stage3_%j.out +#SBATCH -e runs/logs/stage3_%j.err + +# Stage 3 + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" + +echo "=== Stage 3: Eval + SFT ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "WANDB_RUN: $WANDB_RUN" +echo "Started: $(date)" + +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" +if [ ! -f "$DONE_MARKER" ]; then + echo "ERROR: pretraining did not finish — missing $DONE_MARKER" + echo "Re-run pretrain chunks 2a–2d until the marker is created before running stage 3." + exit 1 +fi + +source .venv/bin/activate + +torchrun --standalone --nproc_per_node=2 -m scripts.base_eval -- \ + --device-batch-size=16 + +curl -L -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" \ + https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl + +torchrun --standalone --nproc_per_node=2 -m scripts.chat_sft -- \ + --device-batch-size=16 \ + --run=$WANDB_RUN + +torchrun --standalone --nproc_per_node=2 -m scripts.chat_eval -- -i sft + +python -m nanochat.report generate + +echo "=== Stage 3 complete: $(date) ===" + diff --git a/runs/pace_submit.sh b/runs/pace_submit.sh new file mode 100644 index 00000000..951d8849 --- /dev/null +++ b/runs/pace_submit.sh @@ -0,0 +1,81 @@ +#!/bin/bash + +# Pipeline: +# Stage 1 — CPU: tokenizer + dataset +# Stage 2a — GPU: pretraining chunk 1 +# Stage 2b — GPU: auto-resume chunk 2 +# Stage 2c — GPU: auto-resume chunk 3 +# Stage 2d — GPU: auto-resume chunk 4 +# Stage 3 — GPU: base eval + SFT + chat eval + report +# +# Usage (from repo root): +# bash runs/pace_submit.sh +# +# Optional W&B logging: +# WANDB_RUN=my-run bash runs/pace_submit.sh + +set -e +cd "$HOME/scratch/nanochat" + +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +export WANDB_RUN + +echo "Submitting nanochat full pipeline..." +echo "WANDB_RUN=$WANDB_RUN" +echo "" + +# Stage 1 +JOB1=$(sbatch --parsable \ + --export=ALL,WANDB_RUN=$WANDB_RUN \ + runs/pace_stage1_tokenizer.sh) +echo "Stage 1 submitted: job $JOB1 (tokenizer + dataset)" + +# Stage 2a +JOB2A=$(sbatch --parsable \ + --dependency=afterok:$JOB1 \ + --export=ALL,WANDB_RUN=$WANDB_RUN \ + runs/pace_stage2a_pretrain.sh) +echo "Stage 2a submitted: job $JOB2A (pretrain chunk 1, depends on $JOB1)" + +# Stage 2b +JOB2B=$(sbatch --parsable \ + --dependency=afterany:$JOB2A \ + --export=ALL,WANDB_RUN=$WANDB_RUN \ + runs/pace_stage2b_pretrain.sh) +echo "Stage 2b submitted: job $JOB2B (pretrain chunk 2, depends on $JOB2A)" + +# Stage 2c +JOB2C=$(sbatch --parsable \ + --dependency=afterany:$JOB2B \ + --export=ALL,WANDB_RUN=$WANDB_RUN \ + runs/pace_stage2c_pretrain.sh) +echo "Stage 2c submitted: job $JOB2C (pretrain chunk 3, depends on $JOB2B)" + +# Stage 2d +JOB2D=$(sbatch --parsable \ + --dependency=afterany:$JOB2C \ + --export=ALL,WANDB_RUN=$WANDB_RUN \ + runs/pace_stage2d_pretrain.sh) +echo "Stage 2d submitted: job $JOB2D (pretrain chunk 4, depends on $JOB2C)" + +# Stage 3 +JOB3=$(sbatch --parsable \ + --dependency=afterok:$JOB2D \ + --export=ALL,WANDB_RUN=$WANDB_RUN \ + runs/pace_stage3_sft.sh) +echo "Stage 3 submitted: job $JOB3 (eval + SFT, depends on $JOB2D)" + +echo "" +echo "All jobs queued. Monitor with:" +echo " squeue -u $USER" +echo " tail -f runs/logs/stage1_${JOB1}.out" +echo " tail -f runs/logs/stage2a_${JOB2A}.out" +echo " tail -f runs/logs/stage2b_${JOB2B}.out" +echo " tail -f runs/logs/stage2c_${JOB2C}.out" +echo " tail -f runs/logs/stage2d_${JOB2D}.out" +echo " tail -f runs/logs/stage3_${JOB3}.out" +echo "" +echo "To cancel everything:" +echo " scancel $JOB1 $JOB2A $JOB2B $JOB2C $JOB2D $JOB3" From d7325f930683be629d9179dc07e6a0b9545feffe Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sat, 25 Apr 2026 22:23:45 -0400 Subject: [PATCH 02/11] Add generic RunPod runner harness + d12 baseline --- runs/runpod/d12.sh | 119 +++++++++++++++++++++++++++++++++++++++++ runs/runpod/kickoff.sh | 92 +++++++++++++++++++++++++++++++ 2 files changed, 211 insertions(+) create mode 100755 runs/runpod/d12.sh create mode 100755 runs/runpod/kickoff.sh diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh new file mode 100755 index 00000000..9119c87d --- /dev/null +++ b/runs/runpod/d12.sh @@ -0,0 +1,119 @@ +#!/usr/bin/env bash +# d12 baseline runner. Runs INSIDE a RunPod pod. +# Pipeline: tokenizer -> base_train -> base_eval -> SFT -> chat_eval -> report. +# On exit: +# success -> upload final cache to HF, self-delete pod +# failure -> upload logs + report dir to HF under _failures/, self-delete pod +# (set UPLOAD_FAILURE_CACHE=1 to also dump partial cache for offline debug) +# +# Required env (passed via runpodctl --env at pod-create): +# HF_TOKEN, WANDB_API_KEY +# Optional env: +# WANDB_RUN default: d12 +# NANOCHAT_REPO default: Team-XSA/nanochat +# NANOCHAT_REF default: dev +# HF_REPO default: haydenfree/nanochat-d12-baseline +# BACKUP_INTERVAL default: 300 (seconds between background HF uploads) +# UPLOAD_FAILURE_CACHE default: 0 +# Auto-set by RunPod: +# RUNPOD_POD_ID, RUNPOD_API_KEY (pod-scoped) + +set -euo pipefail + +NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}" +NANOCHAT_REF="${NANOCHAT_REF:-dev}" +HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}" +WANDB_RUN="${WANDB_RUN:-d12}" +BACKUP_INTERVAL="${BACKUP_INTERVAL:-300}" +UPLOAD_FAILURE_CACHE="${UPLOAD_FAILURE_CACHE:-0}" + +WORKDIR="/workspace/nanochat" +LOG_FILE="/workspace/runner.log" +NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" +BACKUP_PID="" + +mkdir -p /workspace +exec > >(tee -a "$LOG_FILE") 2>&1 + +echo "[runner] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" +echo "[runner] repo=$NANOCHAT_REPO ref=$NANOCHAT_REF hf_repo=$HF_REPO wandb_run=$WANDB_RUN" + +cleanup() { + local rc=$? + set +e + echo "[runner] cleanup: exit code $rc at $(date -Iseconds)" + if [ -n "$BACKUP_PID" ] && kill -0 "$BACKUP_PID" 2>/dev/null; then + kill "$BACKUP_PID" 2>/dev/null || true + fi + + local TS + TS=$(date -u +%Y%m%dT%H%M%SZ) + + if [ "$rc" -eq 0 ]; then + echo "[runner] success — final upload to $HF_REPO" + if [ -d "$NANOCHAT_BASE_DIR" ]; then + huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ + --repo-type model --commit-message "final rc=0 $TS" || \ + echo "[runner] WARN: final upload failed" + fi + else + echo "[runner] failure rc=$rc — dumping logs to HF for offline debug" + mkdir -p /tmp/failure + cp /workspace/*.log /tmp/failure/ 2>/dev/null || true + [ -d "$NANOCHAT_BASE_DIR/report" ] && cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true + [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true) + + huggingface-cli upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \ + --repo-type model --commit-message "failure rc=$rc logs $TS" || \ + echo "[runner] WARN: log upload failed" + + if [ "$UPLOAD_FAILURE_CACHE" = "1" ] && [ -d "$NANOCHAT_BASE_DIR" ]; then + echo "[runner] UPLOAD_FAILURE_CACHE=1 — also dumping partial cache (may be slow)" + huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \ + --repo-type model --commit-message "failure rc=$rc cache $TS" || true + fi + echo "[runner] failure artifacts: https://huggingface.co/$HF_REPO/tree/main/_failures/${TS}-rc${rc}" + fi + + echo "[runner] self-deleting pod $RUNPOD_POD_ID" + runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ + curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" + exit "$rc" +} +trap cleanup EXIT + +: "${HF_TOKEN:?HF_TOKEN must be set}" +: "${WANDB_API_KEY:?WANDB_API_KEY must be set}" +: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}" + +rm -rf "$WORKDIR" +git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" +cd "$WORKDIR" +git checkout "$NANOCHAT_REF" +echo "[runner] HEAD = $(git rev-parse HEAD)" + +sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh +sed -i 's/ --target-param-data-ratio=8//' runs/speedrun.sh +echo "[runner] speedrun.sh edits applied:" +grep -n 'depth\|target-param' runs/speedrun.sh || true + +pip install --quiet --upgrade huggingface_hub + +( + while true; do + sleep "$BACKUP_INTERVAL" + if [ -d "$NANOCHAT_BASE_DIR" ]; then + huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ + --repo-type model \ + --commit-message "checkpoint $(date -Iseconds)" >> /workspace/backup.log 2>&1 || true + fi + done +) & +BACKUP_PID=$! +echo "[runner] backup loop pid=$BACKUP_PID interval=${BACKUP_INTERVAL}s" + +export WANDB_RUN +WANDB_RUN="$WANDB_RUN" bash runs/speedrun.sh + +echo "[runner] $(date -Iseconds) pipeline complete" diff --git a/runs/runpod/kickoff.sh b/runs/runpod/kickoff.sh new file mode 100755 index 00000000..a0e4e2fb --- /dev/null +++ b/runs/runpod/kickoff.sh @@ -0,0 +1,92 @@ +#!/usr/bin/env bash +# Generic local kickoff for RunPod runs. +# Picks a runner script in this repo (runs/runpod/.sh) and spins up a pod. +# +# Prereqs: +# 1. ~/.config/team-xsa/runpod.env exports HF_TOKEN, WANDB_API_KEY, RUNPOD_TEMPLATE_ID +# 2. The template referenced by RUNPOD_TEMPLATE_ID has docker-start-cmd: +# bash,-lc,curl -fsSL "$RUNNER_URL" | bash >> /workspace/runner.log 2>&1 +# 3. The runner script for this experiment has been pushed to Team-XSA/nanochat +# +# Usage: +# source ~/.config/team-xsa/runpod.env +# bash runs/runpod/kickoff.sh d12 # uses runs/runpod/d12.sh +# bash runs/runpod/kickoff.sh d24 # uses runs/runpod/d24.sh +# bash runs/runpod/kickoff.sh xsa_d12 # uses runs/runpod/xsa_d12.sh +# +# Optional env overrides: +# GPU_ID default: "NVIDIA H100 80GB HBM3" +# GPU_COUNT default: 8 +# CLOUD_TYPE default: COMMUNITY (SECURE for guaranteed availability) +# DISK_GB default: 200 +# NANOCHAT_REPO default: Team-XSA/nanochat +# NANOCHAT_REF default: dev +# WANDB_RUN default: +# POD_NAME default: - + +set -euo pipefail + +RUNNER="${1:-}" +if [ -z "$RUNNER" ]; then + echo "Usage: bash runs/runpod/kickoff.sh " + echo " e.g. bash runs/runpod/kickoff.sh d12" + exit 1 +fi + +: "${HF_TOKEN:?HF_TOKEN not set — source ~/.config/team-xsa/runpod.env}" +: "${WANDB_API_KEY:?WANDB_API_KEY not set — source ~/.config/team-xsa/runpod.env}" +: "${RUNPOD_TEMPLATE_ID:?RUNPOD_TEMPLATE_ID not set — create the template once and add it to ~/.config/team-xsa/runpod.env}" + +NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}" +NANOCHAT_REF="${NANOCHAT_REF:-dev}" +WANDB_RUN="${WANDB_RUN:-$RUNNER}" +RUNNER_URL="${RUNNER_URL:-https://raw.githubusercontent.com/${NANOCHAT_REPO}/${NANOCHAT_REF}/runs/runpod/${RUNNER}.sh}" + +GPU_ID="${GPU_ID:-NVIDIA H100 80GB HBM3}" +GPU_COUNT="${GPU_COUNT:-8}" +CLOUD_TYPE="${CLOUD_TYPE:-COMMUNITY}" +DISK_GB="${DISK_GB:-200}" +POD_NAME="${POD_NAME:-${RUNNER}-$(date +%Y%m%d-%H%M)}" + +echo "Verifying runner URL is reachable: $RUNNER_URL" +if ! curl -sfI "$RUNNER_URL" >/dev/null; then + echo "ERROR: runner not reachable at $RUNNER_URL" + echo " - Did you push runs/runpod/${RUNNER}.sh to ${NANOCHAT_REPO}@${NANOCHAT_REF}?" + echo " - Is the repo public?" + exit 1 +fi + +export HF_TOKEN WANDB_API_KEY WANDB_RUN RUNNER_URL NANOCHAT_REPO NANOCHAT_REF +ENV_JSON=$(python3 - <<'PY' +import json, os +keys = ["HF_TOKEN","WANDB_API_KEY","WANDB_RUN","RUNNER_URL","NANOCHAT_REPO","NANOCHAT_REF"] +print(json.dumps({k: os.environ[k] for k in keys if k in os.environ})) +PY +) + +echo "Creating pod:" +echo " name = $POD_NAME" +echo " template = $RUNPOD_TEMPLATE_ID" +echo " runner = $RUNNER_URL" +echo " gpu = $GPU_COUNT × $GPU_ID" +echo " cloud = $CLOUD_TYPE" +echo " disk = ${DISK_GB} GB" + +runpodctl pod create \ + --name "$POD_NAME" \ + --template-id "$RUNPOD_TEMPLATE_ID" \ + --gpu-id "$GPU_ID" \ + --gpu-count "$GPU_COUNT" \ + --cloud-type "$CLOUD_TYPE" \ + --container-disk-in-gb "$DISK_GB" \ + --env "$ENV_JSON" + +echo +echo "Logs (after pod boots):" +echo " POD_ID=\$(runpodctl pod list --name '$POD_NAME' -o json | jq -r '.[0].id')" +echo " runpodctl ssh info \$POD_ID" +echo " ssh @ 'tail -f /workspace/runner.log'" +echo +echo "Wandb: project=nanochat / nanochat-sft, run name: $WANDB_RUN" +echo "HF (success): https://huggingface.co/haydenfree/nanochat-d12-baseline" +echo "HF (failure): https://huggingface.co/haydenfree/nanochat-d12-baseline/tree/main/_failures" From 86cf10c693fd08c5e0160116ed8c8cd5150db612 Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sat, 25 Apr 2026 22:39:45 -0400 Subject: [PATCH 03/11] Add minimal smoke runner --- runs/runpod/kickoff.sh | 4 +- runs/runpod/smoke.sh | 102 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 104 insertions(+), 2 deletions(-) create mode 100755 runs/runpod/smoke.sh diff --git a/runs/runpod/kickoff.sh b/runs/runpod/kickoff.sh index a0e4e2fb..bb918171 100755 --- a/runs/runpod/kickoff.sh +++ b/runs/runpod/kickoff.sh @@ -17,7 +17,7 @@ # Optional env overrides: # GPU_ID default: "NVIDIA H100 80GB HBM3" # GPU_COUNT default: 8 -# CLOUD_TYPE default: COMMUNITY (SECURE for guaranteed availability) +# CLOUD_TYPE default: SECURE (COMMUNITY when capacity available, cheaper) # DISK_GB default: 200 # NANOCHAT_REPO default: Team-XSA/nanochat # NANOCHAT_REF default: dev @@ -44,7 +44,7 @@ RUNNER_URL="${RUNNER_URL:-https://raw.githubusercontent.com/${NANOCHAT_REPO}/${N GPU_ID="${GPU_ID:-NVIDIA H100 80GB HBM3}" GPU_COUNT="${GPU_COUNT:-8}" -CLOUD_TYPE="${CLOUD_TYPE:-COMMUNITY}" +CLOUD_TYPE="${CLOUD_TYPE:-SECURE}" DISK_GB="${DISK_GB:-200}" POD_NAME="${POD_NAME:-${RUNNER}-$(date +%Y%m%d-%H%M)}" diff --git a/runs/runpod/smoke.sh b/runs/runpod/smoke.sh new file mode 100755 index 00000000..d023a764 --- /dev/null +++ b/runs/runpod/smoke.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +# Minimal smoke test. Runs INSIDE a RunPod pod. +# Validates: pod boot, env-var injection, git clone, uv sync, GPU torch, +# tokenizer + base_train code paths, HF upload, runpodctl self-delete. +# Does NOT test: multi-GPU, FP8, full training horizon, SFT, eval. +# +# Sized for a 1-GPU pod, completes in ~3-4 min wall clock. +# Kick off with: GPU_COUNT=1 bash runs/runpod/kickoff.sh smoke +# +# Required env: HF_TOKEN, WANDB_API_KEY +# Auto-set by RunPod: RUNPOD_POD_ID, RUNPOD_API_KEY + +set -euo pipefail + +NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}" +NANOCHAT_REF="${NANOCHAT_REF:-dev}" +HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}" +WANDB_RUN="${WANDB_RUN:-smoke}" + +TS=$(date -u +%Y%m%dT%H%M%SZ) +HF_PATH_PREFIX="_smoke/${TS}" + +WORKDIR="/workspace/nanochat" +LOG_FILE="/workspace/runner.log" +NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" + +mkdir -p /workspace +exec > >(tee -a "$LOG_FILE") 2>&1 + +echo "[smoke] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" + +cleanup() { + local rc=$? + set +e + echo "[smoke] cleanup: exit code $rc" + + # Always upload the runner log (success or failure) so we can see what happened + mkdir -p /tmp/smoke-out + cp /workspace/*.log /tmp/smoke-out/ 2>/dev/null || true + echo "rc=$rc ts=$TS pod=$RUNPOD_POD_ID" > /tmp/smoke-out/result.txt + [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/smoke-out/git-head.txt || true) + huggingface-cli upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \ + --repo-type model --commit-message "smoke rc=$rc $TS" || \ + echo "[smoke] WARN: HF upload failed" + + echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX" + echo "[smoke] self-deleting pod $RUNPOD_POD_ID" + runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ + curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" + exit "$rc" +} +trap cleanup EXIT + +: "${HF_TOKEN:?HF_TOKEN must be set}" +: "${WANDB_API_KEY:?WANDB_API_KEY must be set}" +: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}" + +# Clone fork +rm -rf "$WORKDIR" +git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" +cd "$WORKDIR" +git checkout "$NANOCHAT_REF" +echo "[smoke] HEAD = $(git rev-parse HEAD)" + +# Env + uv +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR +mkdir -p "$NANOCHAT_BASE_DIR" +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate +pip install --quiet --upgrade huggingface_hub + +# GPU sanity +python -c "import torch; print('[smoke] torch', torch.__version__, 'cuda', torch.cuda.is_available(), 'devices', torch.cuda.device_count())" + +# Minimum dataset + tokenizer (1 shard, 50M chars — enough for the tokenizer +# to train on AND for base_train to consume 20 iterations of tokens) +python -m nanochat.dataset -n 1 +python -m scripts.tok_train --max-chars=50000000 + +# Tiny base_train. Params from base_train.py docstring (the CPU smoke), adjusted +# slightly for GPU. depth=4, 20 iterations. Should finish in ~30s. +NPROC=$(nvidia-smi -L | wc -l) +echo "[smoke] training on $NPROC GPU(s)" +torchrun --standalone --nproc_per_node="$NPROC" -m scripts.base_train -- \ + --depth=4 \ + --max-seq-len=512 \ + --device-batch-size=1 \ + --total-batch-size=512 \ + --num-iterations=20 \ + --eval-every=10 \ + --eval-tokens=512 \ + --core-metric-every=-1 \ + --sample-every=-1 \ + --save-every=-1 \ + --run="$WANDB_RUN" + +echo "[smoke] $(date -Iseconds) base_train complete — smoke passed" +# trap cleanup handles HF upload + self-delete From 6ee799ef00755f2ff063e8f01fc19fef2664553b Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sat, 25 Apr 2026 23:10:37 -0400 Subject: [PATCH 04/11] Fix smoke/d12 runner: dedupe logs, disambiguate git ref, robust HF + self-delete on early failure --- runs/runpod/d12.sh | 28 +++++++++++++++++++++++----- runs/runpod/smoke.sh | 28 +++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 10 deletions(-) diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh index 9119c87d..2203916f 100755 --- a/runs/runpod/d12.sh +++ b/runs/runpod/d12.sh @@ -33,11 +33,18 @@ NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" BACKUP_PID="" mkdir -p /workspace -exec > >(tee -a "$LOG_FILE") 2>&1 +# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE. +# Don't add a second tee here — would write every line twice. echo "[runner] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" echo "[runner] repo=$NANOCHAT_REPO ref=$NANOCHAT_REF hf_repo=$HF_REPO wandb_run=$WANDB_RUN" +# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs +# even if we fail before the venv is activated. +{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + echo "[runner] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true + cleanup() { local rc=$? set +e @@ -76,9 +83,19 @@ cleanup() { fi echo "[runner] self-deleting pod $RUNPOD_POD_ID" - runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ - curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ - "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" + # Preinstalled runpodctl may be older (legacy 'remove pod') or newer ('pod delete'). + # Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors. + if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then + : + elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then + : + else + echo "[runner] runpodctl delete failed via both syntaxes, using REST API" + curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \ + echo "[runner] WARN: REST delete also failed — pod may need manual cleanup" + fi exit "$rc" } trap cleanup EXIT @@ -90,7 +107,8 @@ trap cleanup EXIT rm -rf "$WORKDIR" git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" cd "$WORKDIR" -git checkout "$NANOCHAT_REF" +# `--` disambiguates ref-vs-file (some images create a `dev` file in HOME) +git checkout "$NANOCHAT_REF" -- echo "[runner] HEAD = $(git rev-parse HEAD)" sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh diff --git a/runs/runpod/smoke.sh b/runs/runpod/smoke.sh index d023a764..0f60bc58 100755 --- a/runs/runpod/smoke.sh +++ b/runs/runpod/smoke.sh @@ -25,10 +25,17 @@ LOG_FILE="/workspace/runner.log" NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" mkdir -p /workspace -exec > >(tee -a "$LOG_FILE") 2>&1 +# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE. +# Don't add a second tee here — would write every line twice. echo "[smoke] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" +# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs +# even if we fail before the venv is activated. Try pip3, then python3 -m pip. +{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + echo "[smoke] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true + cleanup() { local rc=$? set +e @@ -45,9 +52,19 @@ cleanup() { echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX" echo "[smoke] self-deleting pod $RUNPOD_POD_ID" - runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ - curl -sS -X DELETE -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ - "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" + # The preinstalled runpodctl may be older (legacy 'remove pod' syntax) or newer ('pod delete'). + # Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors. + if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then + : + elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then + : + else + echo "[smoke] runpodctl delete failed via both syntaxes, using REST API" + curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \ + echo "[smoke] WARN: REST delete also failed — pod may need manual cleanup" + fi exit "$rc" } trap cleanup EXIT @@ -60,7 +77,8 @@ trap cleanup EXIT rm -rf "$WORKDIR" git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" cd "$WORKDIR" -git checkout "$NANOCHAT_REF" +# `--` disambiguates ref-vs-file (some images create a `dev` file in HOME) +git checkout "$NANOCHAT_REF" -- echo "[smoke] HEAD = $(git rev-parse HEAD)" # Env + uv From e26bf8719bc8b71ff9c0146ac6bd938c67ad68a8 Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sat, 25 Apr 2026 23:27:56 -0400 Subject: [PATCH 05/11] Add FA3 probe + kernels upgrade for smoke and d12 runners --- runs/runpod/d12.sh | 25 +++++ runs/runpod/probe_fa3.py | 216 +++++++++++++++++++++++++++++++++++++++ runs/runpod/smoke.sh | 15 +++ 3 files changed, 256 insertions(+) create mode 100644 runs/runpod/probe_fa3.py diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh index 2203916f..61fda5f4 100755 --- a/runs/runpod/d12.sh +++ b/runs/runpod/d12.sh @@ -116,8 +116,33 @@ sed -i 's/ --target-param-data-ratio=8//' runs/speedrun.sh echo "[runner] speedrun.sh edits applied:" grep -n 'depth\|target-param' runs/speedrun.sh || true +# Explicit venv setup BEFORE speedrun.sh so we can run diagnostic probes +# inside the venv. speedrun.sh's uv sync is idempotent (no-op the second time). +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR +mkdir -p "$NANOCHAT_BASE_DIR" +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate pip install --quiet --upgrade huggingface_hub +# Ensure HF token flows to the kernels lib (some libs read HF_HUB_TOKEN, not HF_TOKEN) +export HF_HUB_TOKEN="${HF_TOKEN}" + +# Bump kernels to latest — pyproject pins >=0.11.7 and uv often picks exactly that; +# 0.11.x had kernel-resolution bugs that affect FA3 loading silently. +echo "[runner] upgrading kernels lib for FA3 reliability" +uv pip install --quiet --upgrade 'kernels>=0.13.0' 2>&1 || \ + echo "[runner] WARN: kernels upgrade failed (continuing)" + +# FA3 diagnostic probe — surfaces real errors (nanochat silently swallows them). +# Non-fatal: SDPA fallback is automatic. We want this output in the log +# regardless of outcome so we can decide what to do about FA3. +echo "[runner] === FA3 PROBE BEGIN ===" +python "$WORKDIR/runs/runpod/probe_fa3.py" || echo "[runner] FA3 probe reported issues (non-fatal — continuing with SDPA fallback)" +echo "[runner] === FA3 PROBE END ===" + ( while true; do sleep "$BACKUP_INTERVAL" diff --git a/runs/runpod/probe_fa3.py b/runs/runpod/probe_fa3.py new file mode 100644 index 00000000..dc2575c5 --- /dev/null +++ b/runs/runpod/probe_fa3.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python3 +# pyright: reportMissingImports=false +""" +Comprehensive FA3 / kernels diagnostic probe. + +nanochat/flash_attention.py:_load_flash_attention_3 swallows ALL exceptions silently +and falls back to SDPA. This script runs the same code path with full tracebacks +so we can see why FA3 isn't loading on the pod. + +Run inside the pod (after uv sync, with venv active): + python runs/runpod/probe_fa3.py + +Exits 0 if FA3 is fully usable, 1 if any check fails. + +Note: the pyright pragma above is intentional — torch/huggingface_hub/kernels +are only present at pod runtime; the local IDE will flag them as unresolved. +""" +import os +import sys +import traceback +import platform +import subprocess + +OK = "\033[32mOK\033[0m" +FAIL = "\033[31mFAIL\033[0m" +WARN = "\033[33mWARN\033[0m" + + +def section(n, name): + print() + print("=" * 80) + print(f"[{n}] {name}") + print("=" * 80) + + +def fmt_token(tok): + if not tok: + return "NOT SET" + return f"SET (len={len(tok)}, prefix={tok[:7]}…)" + + +passed_all = True + + +def fail(msg): + global passed_all + passed_all = False + print(f" {FAIL} {msg}") + + +def warn(msg): + print(f" {WARN} {msg}") + + +def ok(msg): + print(f" {OK} {msg}") + + +# --------------------------------------------------------------------------- +section(1, "Environment") +print(f" python : {sys.version.split()[0]}") +print(f" platform : {platform.platform()}") +print(f" cwd : {os.getcwd()}") +hf_tok = os.environ.get("HF_TOKEN", "") +hf_hub_tok = os.environ.get("HF_HUB_TOKEN", "") +print(f" HF_TOKEN : {fmt_token(hf_tok)}") +print(f" HF_HUB_TOKEN : {fmt_token(hf_hub_tok)}") +print(f" HF_HOME : {os.environ.get('HF_HOME', '(default ~/.cache/huggingface)')}") +print(f" HUGGINGFACE_HUB_CACHE : {os.environ.get('HUGGINGFACE_HUB_CACHE', '(unset)')}") +print(f" WANDB_API_KEY: {fmt_token(os.environ.get('WANDB_API_KEY',''))}") + +if not hf_tok: + fail("HF_TOKEN env var is empty — kernels lib will fall back to anonymous and may rate-limit") + +# --------------------------------------------------------------------------- +section(2, "Network connectivity") +for url, label in [ + ("https://huggingface.co", "huggingface.co"), + ("https://cdn-lfs.huggingface.co", "cdn-lfs.huggingface.co"), + ("https://github.com", "github.com"), +]: + try: + rc = subprocess.run( + ["curl", "-sfI", "--max-time", "10", url], + capture_output=True, text=True, timeout=15, + ).returncode + if rc == 0: + ok(f"{label} reachable") + else: + fail(f"{label} unreachable (curl rc={rc})") + except Exception as e: + fail(f"{label}: {type(e).__name__}: {e}") + +# --------------------------------------------------------------------------- +section(3, "huggingface_hub auth (does the token actually work?)") +try: + from huggingface_hub import whoami + info = whoami(token=hf_tok or None) + ok(f"authenticated as: {info.get('name','?')} (type={info.get('type','?')})") + print(f" orgs: {[o.get('name') for o in info.get('orgs', [])]}") + print(f" access token role: {info.get('auth',{}).get('accessToken',{}).get('role','?')}") +except Exception as e: + fail(f"whoami failed: {type(e).__name__}: {e}") + traceback.print_exc() + +# --------------------------------------------------------------------------- +section(4, "PyTorch / CUDA / GPU") +try: + import torch + print(f" torch : {torch.__version__}") + print(f" cuda available : {torch.cuda.is_available()}") + print(f" cuda version : {torch.version.cuda}") + if torch.cuda.is_available(): + print(f" device count : {torch.cuda.device_count()}") + for i in range(torch.cuda.device_count()): + major, minor = torch.cuda.get_device_capability(i) + name = torch.cuda.get_device_name(i) + mark = OK if major == 9 else WARN + print(f" device {i} : {name} sm{major}{minor} [{mark}]") + major, _ = torch.cuda.get_device_capability(0) + if major != 9: + fail(f"FA3 requires sm90 (Hopper); device 0 is sm{major}{_}") + else: + fail("CUDA not available") +except Exception as e: + fail(f"torch import failed: {type(e).__name__}: {e}") + traceback.print_exc() + +# --------------------------------------------------------------------------- +section(5, "kernels library") +try: + import kernels + ver = getattr(kernels, "__version__", "?") + print(f" kernels : {ver}") + if ver != "?": + major_minor = tuple(int(x) for x in ver.split(".")[:2]) + if major_minor < (0, 13): + warn(f"kernels {ver} < 0.13 — older versions have known kernel-resolution bugs; consider 'uv pip install --upgrade kernels'") + else: + ok(f"kernels {ver} is recent") + print(f" kernels path : {kernels.__file__}") +except Exception as e: + fail(f"kernels not importable: {type(e).__name__}: {e}") + traceback.print_exc() + sys.exit(1) + +# --------------------------------------------------------------------------- +section(6, "Fetch varunneal/flash-attention-3 (THE actual nanochat code path)") +os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1") +try: + from kernels import get_kernel + print(" calling get_kernel('varunneal/flash-attention-3') …") + k = get_kernel("varunneal/flash-attention-3") + ok(f"get_kernel returned: {type(k).__name__}") + print(f" module path: {getattr(k, '__file__', '(no __file__)')}") + iface = k.flash_attn_interface + ok(f"flash_attn_interface: {type(iface).__name__}") + fn = iface.flash_attn_func + ok(f"flash_attn_func: callable={callable(fn)}") + print("\n >>> FA3 binary is usable on this pod. <<<") +except Exception as e: + fail(f"FA3 fetch failed: {type(e).__name__}: {e}") + print() + traceback.print_exc() + print() + print(" Likely causes:") + print(" 1. Network/DNS issue (HF Hub unreachable from this DC)") + print(" 2. Old kernels version with resolver bugs (try kernels>=0.13)") + print(" 3. HF token not flowing — try `export HF_HUB_TOKEN=$HF_TOKEN`") + print(" 4. No prebuilt binary for this torch/cuda combo (we have torch 2.9 + cu128 — should be supported)") + +# --------------------------------------------------------------------------- +section(7, "HF Hub cache state") +import pathlib +cache_root = pathlib.Path(os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))) +print(f" cache root : {cache_root}") +if cache_root.exists(): + try: + size = sum(p.stat().st_size for p in cache_root.rglob("*") if p.is_file()) + print(f" size on disk : {size / 1024 / 1024:.1f} MB") + except Exception as e: + print(f" (could not size cache: {e})") + fa3_marker = list(cache_root.rglob("*flash*attention*3*")) + if fa3_marker: + ok(f"found FA3-related cache entries: {len(fa3_marker)}") + for p in fa3_marker[:5]: + print(f" {p}") + else: + warn("no flash-attention-3 entries in cache yet") +else: + print(" (cache directory does not exist)") + +# --------------------------------------------------------------------------- +section(8, "Replicate nanochat.flash_attention detection") +try: + from nanochat.flash_attention import HAS_FA3, USE_FA3, _fa3 + if HAS_FA3: + ok("nanochat.flash_attention.HAS_FA3 = True") + else: + fail("nanochat.flash_attention.HAS_FA3 = False (despite section 6 results)") + print(f" USE_FA3 = {USE_FA3}") + print(f" _fa3 object: {_fa3}") +except Exception as e: + fail(f"import nanochat.flash_attention failed: {type(e).__name__}: {e}") + traceback.print_exc() + +# --------------------------------------------------------------------------- +section(9, "Verdict") +if passed_all: + print(f" {OK} all checks passed — FA3 is wired up and base_train should use it") + sys.exit(0) +else: + print(f" {FAIL} one or more checks failed — see above. Run will fall back to SDPA (slower, possibly much).") + print() + print(" Continuing the training run anyway is safe; FA3 fallback to SDPA is automatic.") + sys.exit(1) diff --git a/runs/runpod/smoke.sh b/runs/runpod/smoke.sh index 0f60bc58..504544cc 100755 --- a/runs/runpod/smoke.sh +++ b/runs/runpod/smoke.sh @@ -91,9 +91,24 @@ uv sync --extra gpu source .venv/bin/activate pip install --quiet --upgrade huggingface_hub +# Ensure HF token flows to the kernels lib (some libs read HF_HUB_TOKEN, not HF_TOKEN) +export HF_HUB_TOKEN="${HF_TOKEN}" + +# Bump kernels to latest — pyproject pins >=0.11.7, uv often picks exactly that; +# 0.11.x had kernel-resolution bugs that affect FA3 loading silently. +echo "[smoke] upgrading kernels lib for FA3 reliability" +uv pip install --quiet --upgrade 'kernels>=0.13.0' 2>&1 || \ + echo "[smoke] WARN: kernels upgrade failed (continuing with whatever uv installed)" + # GPU sanity python -c "import torch; print('[smoke] torch', torch.__version__, 'cuda', torch.cuda.is_available(), 'devices', torch.cuda.device_count())" +# FA3 diagnostic probe — surfaces the real error if FA3 won't load (nanochat +# silently swallows it). Non-fatal: SDPA fallback is automatic if probe fails. +echo "[smoke] === FA3 PROBE BEGIN ===" +python "$WORKDIR/runs/runpod/probe_fa3.py" || echo "[smoke] FA3 probe reported issues (non-fatal — continuing with SDPA fallback)" +echo "[smoke] === FA3 PROBE END ===" + # Minimum dataset + tokenizer (1 shard, 50M chars — enough for the tokenizer # to train on AND for base_train to consume 20 iterations of tokens) python -m nanochat.dataset -n 1 From 66b92b108cf5442481268125f41a25228ef5cf3b Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sat, 25 Apr 2026 23:46:35 -0400 Subject: [PATCH 06/11] Use 'hf' (huggingface-cli is deprecated); REST-first self-delete --- runs/runpod/d12.sh | 29 ++++++++++++++--------------- runs/runpod/smoke.sh | 23 +++++++++++------------ 2 files changed, 25 insertions(+), 27 deletions(-) diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh index 61fda5f4..aea99d9a 100755 --- a/runs/runpod/d12.sh +++ b/runs/runpod/d12.sh @@ -59,7 +59,7 @@ cleanup() { if [ "$rc" -eq 0 ]; then echo "[runner] success — final upload to $HF_REPO" if [ -d "$NANOCHAT_BASE_DIR" ]; then - huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ --repo-type model --commit-message "final rc=0 $TS" || \ echo "[runner] WARN: final upload failed" fi @@ -70,31 +70,30 @@ cleanup() { [ -d "$NANOCHAT_BASE_DIR/report" ] && cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true) - huggingface-cli upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \ + hf upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \ --repo-type model --commit-message "failure rc=$rc logs $TS" || \ echo "[runner] WARN: log upload failed" if [ "$UPLOAD_FAILURE_CACHE" = "1" ] && [ -d "$NANOCHAT_BASE_DIR" ]; then echo "[runner] UPLOAD_FAILURE_CACHE=1 — also dumping partial cache (may be slow)" - huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \ + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \ --repo-type model --commit-message "failure rc=$rc cache $TS" || true fi echo "[runner] failure artifacts: https://huggingface.co/$HF_REPO/tree/main/_failures/${TS}-rc${rc}" fi echo "[runner] self-deleting pod $RUNPOD_POD_ID" - # Preinstalled runpodctl may be older (legacy 'remove pod') or newer ('pod delete'). - # Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors. - if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then - : - elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then - : + # REST API first — pod-scoped key has delete permission and the API is reliable. + # The pod's preinstalled runpodctl is unreliable (often missing config or 'pod' subcommand). + if curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1; then + echo "[runner] REST delete request accepted" else - echo "[runner] runpodctl delete failed via both syntaxes, using REST API" - curl -fsS -X DELETE \ - -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ - "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \ - echo "[runner] WARN: REST delete also failed — pod may need manual cleanup" + echo "[runner] REST delete failed, trying runpodctl as fallback" + runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ + runpodctl remove pod "$RUNPOD_POD_ID" 2>&1 || \ + echo "[runner] WARN: all delete methods failed — pod may need manual cleanup" fi exit "$rc" } @@ -147,7 +146,7 @@ echo "[runner] === FA3 PROBE END ===" while true; do sleep "$BACKUP_INTERVAL" if [ -d "$NANOCHAT_BASE_DIR" ]; then - huggingface-cli upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ --repo-type model \ --commit-message "checkpoint $(date -Iseconds)" >> /workspace/backup.log 2>&1 || true fi diff --git a/runs/runpod/smoke.sh b/runs/runpod/smoke.sh index 504544cc..86df17a3 100755 --- a/runs/runpod/smoke.sh +++ b/runs/runpod/smoke.sh @@ -46,24 +46,23 @@ cleanup() { cp /workspace/*.log /tmp/smoke-out/ 2>/dev/null || true echo "rc=$rc ts=$TS pod=$RUNPOD_POD_ID" > /tmp/smoke-out/result.txt [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/smoke-out/git-head.txt || true) - huggingface-cli upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \ + hf upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \ --repo-type model --commit-message "smoke rc=$rc $TS" || \ echo "[smoke] WARN: HF upload failed" echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX" echo "[smoke] self-deleting pod $RUNPOD_POD_ID" - # The preinstalled runpodctl may be older (legacy 'remove pod' syntax) or newer ('pod delete'). - # Try new, then legacy, then REST API. -fsS makes curl fail loudly on HTTP errors. - if runpodctl pod delete "$RUNPOD_POD_ID" 2>&1; then - : - elif runpodctl remove pod "$RUNPOD_POD_ID" 2>&1; then - : + # REST API first — pod-scoped key has delete permission and the API is reliable. + # The pod's preinstalled runpodctl is unreliable (often missing config or 'pod' subcommand). + if curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1; then + echo "[smoke] REST delete request accepted" else - echo "[smoke] runpodctl delete failed via both syntaxes, using REST API" - curl -fsS -X DELETE \ - -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ - "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1 || \ - echo "[smoke] WARN: REST delete also failed — pod may need manual cleanup" + echo "[smoke] REST delete failed, trying runpodctl as fallback" + runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ + runpodctl remove pod "$RUNPOD_POD_ID" 2>&1 || \ + echo "[smoke] WARN: all delete methods failed — pod may need manual cleanup" fi exit "$rc" } From 21183f01a39c77aec79ddd41d926e6cfa47b532d Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sat, 25 Apr 2026 23:49:14 -0400 Subject: [PATCH 07/11] fix probe file --- runs/runpod/probe_fa3.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/runs/runpod/probe_fa3.py b/runs/runpod/probe_fa3.py index dc2575c5..392f1d81 100644 --- a/runs/runpod/probe_fa3.py +++ b/runs/runpod/probe_fa3.py @@ -21,6 +21,12 @@ import traceback import platform import subprocess +# This probe lives at /runs/runpod/probe_fa3.py — add repo root to +# sys.path so we can `import nanochat.*` regardless of cwd or how we're invoked. +_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +if _REPO_ROOT not in sys.path: + sys.path.insert(0, _REPO_ROOT) + OK = "\033[32mOK\033[0m" FAIL = "\033[31mFAIL\033[0m" WARN = "\033[33mWARN\033[0m" From 39cee6fc76598005b42fafb93c1fba6b374d80d0 Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sun, 26 Apr 2026 00:02:19 -0400 Subject: [PATCH 08/11] Exclude dataset shards from HF uploads --- runs/runpod/d12.sh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh index aea99d9a..2962701a 100755 --- a/runs/runpod/d12.sh +++ b/runs/runpod/d12.sh @@ -59,8 +59,10 @@ cleanup() { if [ "$rc" -eq 0 ]; then echo "[runner] success — final upload to $HF_REPO" if [ -d "$NANOCHAT_BASE_DIR" ]; then + # Skip the climbmix dataset shards (~2GB of public data, not model artifacts) hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ - --repo-type model --commit-message "final rc=0 $TS" || \ + --repo-type model --commit-message "final rc=0 $TS" \ + --exclude "base_data_climbmix/**" --exclude "wandb/**" || \ echo "[runner] WARN: final upload failed" fi else @@ -77,7 +79,8 @@ cleanup() { if [ "$UPLOAD_FAILURE_CACHE" = "1" ] && [ -d "$NANOCHAT_BASE_DIR" ]; then echo "[runner] UPLOAD_FAILURE_CACHE=1 — also dumping partial cache (may be slow)" hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \ - --repo-type model --commit-message "failure rc=$rc cache $TS" || true + --repo-type model --commit-message "failure rc=$rc cache $TS" \ + --exclude "base_data_climbmix/**" --exclude "wandb/**" || true fi echo "[runner] failure artifacts: https://huggingface.co/$HF_REPO/tree/main/_failures/${TS}-rc${rc}" fi @@ -148,7 +151,9 @@ echo "[runner] === FA3 PROBE END ===" if [ -d "$NANOCHAT_BASE_DIR" ]; then hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ --repo-type model \ - --commit-message "checkpoint $(date -Iseconds)" >> /workspace/backup.log 2>&1 || true + --commit-message "checkpoint $(date -Iseconds)" \ + --exclude "base_data_climbmix/**" --exclude "wandb/**" \ + >> /workspace/backup.log 2>&1 || true fi done ) & From 22eeed326c1bddc1b5aa63266cb354a372dc7809 Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sun, 26 Apr 2026 00:31:29 -0400 Subject: [PATCH 09/11] Add SFT-only resume runner; install hf_transfer in d12 runner --- runs/runpod/d12.sh | 32 +++++++- runs/runpod/d12_sft_only.sh | 149 ++++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+), 1 deletion(-) create mode 100755 runs/runpod/d12_sft_only.sh diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh index 2962701a..dbf4aa5b 100755 --- a/runs/runpod/d12.sh +++ b/runs/runpod/d12.sh @@ -65,6 +65,12 @@ cleanup() { --exclude "base_data_climbmix/**" --exclude "wandb/**" || \ echo "[runner] WARN: final upload failed" fi + # Also upload the runner log so we have a permanent record of this successful run. + if [ -f "$LOG_FILE" ]; then + hf upload "$HF_REPO" "$LOG_FILE" "_runs/${TS}/runner.log" \ + --repo-type model --commit-message "runner log $TS" || \ + echo "[runner] WARN: runner.log upload failed" + fi else echo "[runner] failure rc=$rc — dumping logs to HF for offline debug" mkdir -p /tmp/failure @@ -115,8 +121,11 @@ echo "[runner] HEAD = $(git rev-parse HEAD)" sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh sed -i 's/ --target-param-data-ratio=8//' runs/speedrun.sh +# Inject `set -euo pipefail` so a mid-pipeline failure (e.g. chat_sft) propagates +# as rc!=0 instead of being silently swallowed by the next command. +sed -i '1a set -euo pipefail' runs/speedrun.sh echo "[runner] speedrun.sh edits applied:" -grep -n 'depth\|target-param' runs/speedrun.sh || true +grep -n 'depth\|target-param\|set -e' runs/speedrun.sh || true # Explicit venv setup BEFORE speedrun.sh so we can run diagnostic probes # inside the venv. speedrun.sh's uv sync is idempotent (no-op the second time). @@ -138,6 +147,12 @@ echo "[runner] upgrading kernels lib for FA3 reliability" uv pip install --quiet --upgrade 'kernels>=0.13.0' 2>&1 || \ echo "[runner] WARN: kernels upgrade failed (continuing)" +# Install hf_transfer — runpod base image sets HF_HUB_ENABLE_HF_TRANSFER=1, which +# makes huggingface_hub raise ValueError if the package is missing. chat_sft loads +# HuggingFaceTB/smol-smoltalk via datasets and crashes without this. +echo "[runner] installing hf_transfer for SFT dataset download" +uv pip install --quiet hf_transfer 2>&1 || echo "[runner] WARN: hf_transfer install failed" + # FA3 diagnostic probe — surfaces real errors (nanochat silently swallows them). # Non-fatal: SDPA fallback is automatic. We want this output in the log # regardless of outcome so we can decide what to do about FA3. @@ -163,4 +178,19 @@ echo "[runner] backup loop pid=$BACKUP_PID interval=${BACKUP_INTERVAL}s" export WANDB_RUN WANDB_RUN="$WANDB_RUN" bash runs/speedrun.sh +# Verify expected pipeline outputs — speedrun.sh historically didn't `set -e`; +# we patched it above, but double-check the artifacts that matter for the d12 baseline. +echo "[runner] verifying pipeline outputs" +missing=() +for required in base_checkpoints/d12 chatsft_checkpoints/d12 tokenizer report; do + if [ ! -d "$NANOCHAT_BASE_DIR/$required" ]; then + missing+=("$required") + fi +done +if [ ${#missing[@]} -gt 0 ]; then + echo "[runner] FAIL: pipeline finished but missing expected artifacts: ${missing[*]}" + exit 1 +fi +echo "[runner] all expected artifacts present" + echo "[runner] $(date -Iseconds) pipeline complete" diff --git a/runs/runpod/d12_sft_only.sh b/runs/runpod/d12_sft_only.sh new file mode 100755 index 00000000..42fc3567 --- /dev/null +++ b/runs/runpod/d12_sft_only.sh @@ -0,0 +1,149 @@ +#!/usr/bin/env bash +# d12 SFT-only resume runner. Runs INSIDE a RunPod pod. +# +# Use case: the d12 base_train + base_eval already succeeded and uploaded to HF, +# but chat_sft failed (e.g., missing hf_transfer package). Instead of re-running +# the whole pipeline, this runner: +# 1. Downloads base_checkpoints/d12/ + tokenizer/ from HF +# 2. Installs hf_transfer (the actual SFT bug fix) +# 3. Runs chat_sft + chat_eval directly (skips speedrun.sh) +# 4. Uploads chatsft_checkpoints/ + chat_eval results + report to HF +# 5. Self-deletes +# +# Required env: HF_TOKEN, WANDB_API_KEY +# Optional env: +# WANDB_RUN default: d12-sft +# NANOCHAT_REPO default: Team-XSA/nanochat +# NANOCHAT_REF default: dev +# HF_REPO default: haydenfree/nanochat-d12-baseline (where the base lives) + +set -euo pipefail + +NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}" +NANOCHAT_REF="${NANOCHAT_REF:-dev}" +HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}" +WANDB_RUN="${WANDB_RUN:-d12-sft}" + +WORKDIR="/workspace/nanochat" +LOG_FILE="/workspace/runner.log" +NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" + +mkdir -p /workspace +echo "[sft] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" +echo "[sft] resuming from base checkpoint at $HF_REPO" + +# Bootstrap huggingface_hub system-wide so cleanup can upload logs even on early failure. +{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + echo "[sft] WARN: could not pre-install huggingface_hub"; } || true + +cleanup() { + local rc=$? + set +e + echo "[sft] cleanup: exit code $rc at $(date -Iseconds)" + + local TS + TS=$(date -u +%Y%m%dT%H%M%SZ) + + if [ "$rc" -eq 0 ]; then + echo "[sft] success — uploading chatsft_checkpoints + report + log" + # Only upload the SFT-specific subdirs so we don't re-upload base. + for subdir in chatsft_checkpoints report; do + if [ -d "$NANOCHAT_BASE_DIR/$subdir" ]; then + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR/$subdir" "$subdir" \ + --repo-type model --commit-message "$subdir SFT-resume rc=0 $TS" || \ + echo "[sft] WARN: $subdir upload failed" + fi + done + if [ -f "$LOG_FILE" ]; then + hf upload "$HF_REPO" "$LOG_FILE" "_runs/${TS}-sft/runner.log" \ + --repo-type model --commit-message "SFT runner log $TS" || \ + echo "[sft] WARN: runner.log upload failed" + fi + else + echo "[sft] failure rc=$rc — dumping logs" + mkdir -p /tmp/failure + cp /workspace/*.log /tmp/failure/ 2>/dev/null || true + [ -d "$NANOCHAT_BASE_DIR/report" ] && cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true + [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true) + hf upload "$HF_REPO" /tmp/failure "_failures/${TS}-sft-rc${rc}/logs" \ + --repo-type model --commit-message "SFT-resume failure rc=$rc $TS" || \ + echo "[sft] WARN: log upload failed" + fi + + echo "[sft] self-deleting pod $RUNPOD_POD_ID" + if curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1; then + echo "[sft] REST delete request accepted" + else + echo "[sft] REST delete failed, trying runpodctl as fallback" + runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ + runpodctl remove pod "$RUNPOD_POD_ID" 2>&1 || \ + echo "[sft] WARN: all delete methods failed — pod may need manual cleanup" + fi + exit "$rc" +} +trap cleanup EXIT + +: "${HF_TOKEN:?HF_TOKEN must be set}" +: "${WANDB_API_KEY:?WANDB_API_KEY must be set}" +: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}" + +# Clone fork +rm -rf "$WORKDIR" +git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" +cd "$WORKDIR" +git checkout "$NANOCHAT_REF" -- +echo "[sft] HEAD = $(git rev-parse HEAD)" + +# Env + uv +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR +mkdir -p "$NANOCHAT_BASE_DIR" +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate +pip install --quiet --upgrade huggingface_hub +export HF_HUB_TOKEN="${HF_TOKEN}" + +# Install hf_transfer — THE actual fix for the previous SFT failure. +echo "[sft] installing hf_transfer (the bug from last run)" +uv pip install --quiet hf_transfer + +# Pull tokenizer + base checkpoint from HF — skip base_train entirely +echo "[sft] downloading tokenizer and base_checkpoints/d12 from $HF_REPO" +hf download "$HF_REPO" \ + --include "tokenizer/**" \ + --include "base_checkpoints/d12/**" \ + --local-dir "$NANOCHAT_BASE_DIR" \ + --repo-type model + +ls -la "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" || true +ls -la "$NANOCHAT_BASE_DIR/tokenizer/" || true + +# Also need identity_conversations.jsonl for SFT (speedrun.sh normally fetches it) +echo "[sft] fetching identity_conversations.jsonl" +curl -L -fsS -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" \ + https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl + +# Run only SFT + chat_eval + report. NOT speedrun.sh (which would re-do base_train). +NPROC=$(nvidia-smi -L | wc -l) +echo "[sft] running chat_sft on $NPROC GPUs" +torchrun --standalone --nproc_per_node="$NPROC" -m scripts.chat_sft -- \ + --device-batch-size=16 --run="$WANDB_RUN" + +echo "[sft] running chat_eval" +torchrun --standalone --nproc_per_node="$NPROC" -m scripts.chat_eval -- -i sft + +echo "[sft] regenerating report (will include new SFT sections)" +python -m nanochat.report generate || true + +# Verify SFT artifacts exist before declaring success +if [ ! -d "$NANOCHAT_BASE_DIR/chatsft_checkpoints" ]; then + echo "[sft] FAIL: chatsft_checkpoints/ missing after chat_sft" + exit 1 +fi + +echo "[sft] $(date -Iseconds) SFT pipeline complete" From eb66bbd4e246d759374f7010b4132c1f5869b8dc Mon Sep 17 00:00:00 2001 From: Hayden Free Date: Sun, 26 Apr 2026 00:54:23 -0400 Subject: [PATCH 10/11] =?UTF-8?q?Split=20tokenizer=20+=20base=20downloads?= =?UTF-8?q?=20=E2=80=94=20hf=20download=20--include=20only=20honors=20last?= =?UTF-8?q?=20value?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- runs/runpod/d12_sft_only.sh | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/runs/runpod/d12_sft_only.sh b/runs/runpod/d12_sft_only.sh index 42fc3567..46f2fd94 100755 --- a/runs/runpod/d12_sft_only.sh +++ b/runs/runpod/d12_sft_only.sh @@ -112,16 +112,28 @@ export HF_HUB_TOKEN="${HF_TOKEN}" echo "[sft] installing hf_transfer (the bug from last run)" uv pip install --quiet hf_transfer -# Pull tokenizer + base checkpoint from HF — skip base_train entirely -echo "[sft] downloading tokenizer and base_checkpoints/d12 from $HF_REPO" +# Pull tokenizer + base checkpoint from HF in TWO separate calls. +# `hf download` only honors the LAST --include when specified multiple times +# (multi-include works for upload, not download — verified the hard way). +echo "[sft] downloading tokenizer from $HF_REPO" hf download "$HF_REPO" \ --include "tokenizer/**" \ + --local-dir "$NANOCHAT_BASE_DIR" \ + --repo-type model + +echo "[sft] downloading base_checkpoints/d12 from $HF_REPO" +hf download "$HF_REPO" \ --include "base_checkpoints/d12/**" \ --local-dir "$NANOCHAT_BASE_DIR" \ --repo-type model -ls -la "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" || true -ls -la "$NANOCHAT_BASE_DIR/tokenizer/" || true +# Verify both pieces actually landed before invoking chat_sft. +echo "[sft] verifying downloads" +ls -la "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" 2>&1 || true +ls -la "$NANOCHAT_BASE_DIR/tokenizer/" 2>&1 || true +[ -f "$NANOCHAT_BASE_DIR/tokenizer/tokenizer.pkl" ] || { echo "[sft] FAIL: tokenizer.pkl missing after download"; exit 1; } +[ -n "$(ls -A "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" 2>/dev/null)" ] || { echo "[sft] FAIL: base_checkpoints/d12 is empty"; exit 1; } +echo "[sft] downloads verified" # Also need identity_conversations.jsonl for SFT (speedrun.sh normally fetches it) echo "[sft] fetching identity_conversations.jsonl" From a305bd9612028a281daee7a9bda9e64079c3c1b8 Mon Sep 17 00:00:00 2001 From: zolopgh Date: Sun, 26 Apr 2026 01:30:26 -0400 Subject: [PATCH 11/11] Implemented XSA, added to pace scripts --- nanochat/checkpoint_manager.py | 3 +++ nanochat/gpt.py | 13 +++++++++++++ runs/pace_stage1_tokenizer.sh | 1 + runs/pace_stage2a_pretrain.sh | 5 +++++ runs/pace_stage2b_pretrain.sh | 6 ++++++ runs/pace_stage2c_pretrain.sh | 6 ++++++ runs/pace_stage2d_pretrain.sh | 6 ++++++ runs/pace_stage3_sft.sh | 2 +- runs/pace_submit.sh | 18 ++++++++++++------ scripts/base_train.py | 5 ++++- scripts/chat_sft.py | 1 + 11 files changed, 58 insertions(+), 8 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index f71524ed..852139f7 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -26,6 +26,9 @@ def _patch_missing_config_keys(model_config_kwargs): if "window_pattern" not in model_config_kwargs: model_config_kwargs["window_pattern"] = "L" log0(f"Patching missing window_pattern in model config to 'L'") + if "use_xsa" not in model_config_kwargs: + model_config_kwargs["use_xsa"] = False + log0(f"Patching missing use_xsa in model config to False") def _patch_missing_keys(model_data, model_config): """Add default values for new parameters that may be missing in old checkpoints.""" diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 07a1eae8..20db8b8a 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -37,6 +37,7 @@ class GPTConfig: # Characters: L=long (full context), S=short (quarter context) # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long window_pattern: str = "SSSL" + use_xsa: bool = False def norm(x): @@ -70,6 +71,8 @@ class CausalSelfAttention(nn.Module): self.n_kv_head = config.n_kv_head self.n_embd = config.n_embd self.head_dim = self.n_embd // self.n_head + self.use_xsa = config.use_xsa + self.xsa = ExclusiveSelfAttention() assert self.n_embd % self.n_head == 0 assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0 self.c_q = Linear(self.n_embd, self.n_head * self.head_dim, bias=False) @@ -120,12 +123,22 @@ class CausalSelfAttention(nn.Module): if self.layer_idx == kv_cache.n_layers - 1: kv_cache.advance(T) + if self.use_xsa: + y = self.xsa.XSA(y, v) + # Re-assemble the heads and project back to residual stream y = y.contiguous().view(B, T, -1) y = self.c_proj(y) return y +class ExclusiveSelfAttention(nn.Module): + def XSA(self, y, v): + Vn = F.normalize(v, dim=-1) + Z = y - (y * Vn).sum(dim=-1, keepdim=True) * Vn + return Z + + class MLP(nn.Module): def __init__(self, config): super().__init__() diff --git a/runs/pace_stage1_tokenizer.sh b/runs/pace_stage1_tokenizer.sh index 8caa6214..95573de1 100644 --- a/runs/pace_stage1_tokenizer.sh +++ b/runs/pace_stage1_tokenizer.sh @@ -20,6 +20,7 @@ mkdir -p runs/logs echo "=== Stage 1: Tokenizer ===" echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: ${XSA:-FALSE}" echo "Started: $(date)" command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh diff --git a/runs/pace_stage2a_pretrain.sh b/runs/pace_stage2a_pretrain.sh index bee6d11f..732099d8 100644 --- a/runs/pace_stage2a_pretrain.sh +++ b/runs/pace_stage2a_pretrain.sh @@ -21,12 +21,16 @@ export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" mkdir -p runs/logs WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +XSA_ARG="" +[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa" CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" DONE_MARKER="$CHECKPOINT_DIR/.training_complete" echo "=== Stage 2a: Pretraining (chunk 1) ===" echo "Base dir: $NANOCHAT_BASE_DIR" echo "WANDB_RUN: $WANDB_RUN" +echo "XSA: $XSA" echo "Started: $(date)" source .venv/bin/activate @@ -36,6 +40,7 @@ torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ --target-param-data-ratio=8 \ --device-batch-size=16 \ --save-every=200 \ + $XSA_ARG \ --run=$WANDB_RUN mkdir -p "$CHECKPOINT_DIR" diff --git a/runs/pace_stage2b_pretrain.sh b/runs/pace_stage2b_pretrain.sh index 7b8ea806..7e7ad3be 100644 --- a/runs/pace_stage2b_pretrain.sh +++ b/runs/pace_stage2b_pretrain.sh @@ -21,11 +21,15 @@ export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" mkdir -p runs/logs WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +XSA_ARG="" +[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa" CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" DONE_MARKER="$CHECKPOINT_DIR/.training_complete" echo "=== Stage 2b: Pretraining (chunk 2 / auto-resume) ===" echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: $XSA" echo "Started: $(date)" if [ -f "$DONE_MARKER" ]; then @@ -51,6 +55,7 @@ if [ "$LAST_STEP" -eq 0 ]; then --target-param-data-ratio=8 \ --device-batch-size=16 \ --save-every=200 \ + $XSA_ARG \ --run=$WANDB_RUN else echo "Resuming from step $LAST_STEP" @@ -60,6 +65,7 @@ else --device-batch-size=16 \ --save-every=200 \ --resume-from-step=$LAST_STEP \ + $XSA_ARG \ --run=$WANDB_RUN fi diff --git a/runs/pace_stage2c_pretrain.sh b/runs/pace_stage2c_pretrain.sh index 92c748e9..5089a444 100644 --- a/runs/pace_stage2c_pretrain.sh +++ b/runs/pace_stage2c_pretrain.sh @@ -21,11 +21,15 @@ export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" mkdir -p runs/logs WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +XSA_ARG="" +[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa" CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" DONE_MARKER="$CHECKPOINT_DIR/.training_complete" echo "=== Stage 2c: Pretraining (chunk 3 / auto-resume) ===" echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: $XSA" echo "Started: $(date)" if [ -f "$DONE_MARKER" ]; then @@ -51,6 +55,7 @@ if [ "$LAST_STEP" -eq 0 ]; then --target-param-data-ratio=8 \ --device-batch-size=16 \ --save-every=200 \ + $XSA_ARG \ --run=$WANDB_RUN else echo "Resuming from step $LAST_STEP" @@ -60,6 +65,7 @@ else --device-batch-size=16 \ --save-every=200 \ --resume-from-step=$LAST_STEP \ + $XSA_ARG \ --run=$WANDB_RUN fi diff --git a/runs/pace_stage2d_pretrain.sh b/runs/pace_stage2d_pretrain.sh index 70030001..ffe190d6 100644 --- a/runs/pace_stage2d_pretrain.sh +++ b/runs/pace_stage2d_pretrain.sh @@ -21,11 +21,15 @@ export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" mkdir -p runs/logs WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +XSA_ARG="" +[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa" CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" DONE_MARKER="$CHECKPOINT_DIR/.training_complete" echo "=== Stage 2d: Pretraining (chunk 4 / auto-resume) ===" echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: $XSA" echo "Started: $(date)" if [ -f "$DONE_MARKER" ]; then @@ -51,6 +55,7 @@ if [ "$LAST_STEP" -eq 0 ]; then --target-param-data-ratio=8 \ --device-batch-size=16 \ --save-every=200 \ + $XSA_ARG \ --run=$WANDB_RUN else echo "Resuming from step $LAST_STEP" @@ -60,6 +65,7 @@ else --device-batch-size=16 \ --save-every=200 \ --resume-from-step=$LAST_STEP \ + $XSA_ARG \ --run=$WANDB_RUN fi diff --git a/runs/pace_stage3_sft.sh b/runs/pace_stage3_sft.sh index f354f7c4..532c6fa9 100644 --- a/runs/pace_stage3_sft.sh +++ b/runs/pace_stage3_sft.sh @@ -25,6 +25,7 @@ WANDB_RUN="${WANDB_RUN:-dummy}" echo "=== Stage 3: Eval + SFT ===" echo "Base dir: $NANOCHAT_BASE_DIR" echo "WANDB_RUN: $WANDB_RUN" +echo "XSA: ${XSA:-FALSE}" echo "Started: $(date)" CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" @@ -52,4 +53,3 @@ torchrun --standalone --nproc_per_node=2 -m scripts.chat_eval -- -i sft python -m nanochat.report generate echo "=== Stage 3 complete: $(date) ===" - diff --git a/runs/pace_submit.sh b/runs/pace_submit.sh index 951d8849..63cc3de2 100644 --- a/runs/pace_submit.sh +++ b/runs/pace_submit.sh @@ -13,6 +13,9 @@ # # Optional W&B logging: # WANDB_RUN=my-run bash runs/pace_submit.sh +# +# Optional XSA attention: +# XSA=TRUE bash runs/pace_submit.sh set -e cd "$HOME/scratch/nanochat" @@ -20,50 +23,53 @@ cd "$HOME/scratch/nanochat" mkdir -p runs/logs WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" export WANDB_RUN +export XSA echo "Submitting nanochat full pipeline..." echo "WANDB_RUN=$WANDB_RUN" +echo "XSA=$XSA" echo "" # Stage 1 JOB1=$(sbatch --parsable \ - --export=ALL,WANDB_RUN=$WANDB_RUN \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ runs/pace_stage1_tokenizer.sh) echo "Stage 1 submitted: job $JOB1 (tokenizer + dataset)" # Stage 2a JOB2A=$(sbatch --parsable \ --dependency=afterok:$JOB1 \ - --export=ALL,WANDB_RUN=$WANDB_RUN \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ runs/pace_stage2a_pretrain.sh) echo "Stage 2a submitted: job $JOB2A (pretrain chunk 1, depends on $JOB1)" # Stage 2b JOB2B=$(sbatch --parsable \ --dependency=afterany:$JOB2A \ - --export=ALL,WANDB_RUN=$WANDB_RUN \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ runs/pace_stage2b_pretrain.sh) echo "Stage 2b submitted: job $JOB2B (pretrain chunk 2, depends on $JOB2A)" # Stage 2c JOB2C=$(sbatch --parsable \ --dependency=afterany:$JOB2B \ - --export=ALL,WANDB_RUN=$WANDB_RUN \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ runs/pace_stage2c_pretrain.sh) echo "Stage 2c submitted: job $JOB2C (pretrain chunk 3, depends on $JOB2B)" # Stage 2d JOB2D=$(sbatch --parsable \ --dependency=afterany:$JOB2C \ - --export=ALL,WANDB_RUN=$WANDB_RUN \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ runs/pace_stage2d_pretrain.sh) echo "Stage 2d submitted: job $JOB2D (pretrain chunk 4, depends on $JOB2C)" # Stage 3 JOB3=$(sbatch --parsable \ --dependency=afterok:$JOB2D \ - --export=ALL,WANDB_RUN=$WANDB_RUN \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ runs/pace_stage3_sft.sh) echo "Stage 3 submitted: job $JOB3 (eval + SFT, depends on $JOB2D)" diff --git a/scripts/base_train.py b/scripts/base_train.py index a161c477..ec037bbb 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -52,6 +52,7 @@ parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = de parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention") parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length") parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')") +parser.add_argument("--xsa", action="store_true", help="enable XSA") # Training horizon (only one used, in order of precedence) parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") @@ -102,6 +103,8 @@ wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", # Flash Attention status from nanochat.flash_attention import USE_FA3 using_fa3 = USE_FA3 +if args.xsa: + print0("XSA enabled") if using_fa3: print0("✓ Using Flash Attention 3 (Hopper GPU detected), efficient, new and awesome.") else: @@ -136,7 +139,7 @@ def build_model_meta(depth): config = GPTConfig( sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=depth, n_head=num_heads, n_kv_head=num_heads, n_embd=model_dim, - window_pattern=args.window_pattern, + window_pattern=args.window_pattern, use_xsa=args.xsa, ) with torch.device("meta"): model_meta = GPT(config) diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index b46dd817..e085b119 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -415,6 +415,7 @@ while True: "n_kv_head": model.config.n_kv_head, "n_embd": model.config.n_embd, "window_pattern": model.config.window_pattern, + "use_xsa": model.config.use_xsa, }, "user_config": user_config, # inputs to the training script },