diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index f71524ed..852139f7 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -26,6 +26,9 @@ def _patch_missing_config_keys(model_config_kwargs):
     if "window_pattern" not in model_config_kwargs:
         model_config_kwargs["window_pattern"] = "L"
         log0(f"Patching missing window_pattern in model config to 'L'")
+    if "use_xsa" not in model_config_kwargs:
+        model_config_kwargs["use_xsa"] = False
+        log0(f"Patching missing use_xsa in model config to False")
 
 def _patch_missing_keys(model_data, model_config):
     """Add default values for new parameters that may be missing in old checkpoints."""
diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index 07a1eae8..20db8b8a 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -37,6 +37,7 @@ class GPTConfig:
     # Characters: L=long (full context), S=short (quarter context)
     # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long
     window_pattern: str = "SSSL"
+    use_xsa: bool = False
 
 
 def norm(x):
@@ -70,6 +71,8 @@ class CausalSelfAttention(nn.Module):
         self.n_kv_head = config.n_kv_head
         self.n_embd = config.n_embd
         self.head_dim = self.n_embd // self.n_head
+        self.use_xsa = config.use_xsa
+        self.xsa = ExclusiveSelfAttention()
         assert self.n_embd % self.n_head == 0
         assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0
         self.c_q = Linear(self.n_embd, self.n_head * self.head_dim, bias=False)
@@ -120,12 +123,33 @@ class CausalSelfAttention(nn.Module):
             if self.layer_idx == kv_cache.n_layers - 1:
                 kv_cache.advance(T)
 
+        if self.use_xsa:
+            y = self.xsa.XSA(y, v)
+
         # Re-assemble the heads and project back to residual stream
         y = y.contiguous().view(B, T, -1)
         y = self.c_proj(y)
         return y
 
 
+class ExclusiveSelfAttention(nn.Module):
+    """Exclusive self-attention (XSA): remove from the attention output its component along the value vector."""
+
+    def XSA(self, y, v):
+        """Return ``y`` minus its projection onto the unit-normalized ``v``.
+
+        NOTE(review): presumably y is (B, T, n_head, head_dim) and v is
+        (B, T, n_kv_head, head_dim) -- confirm against the caller.
+        """
+        # Under grouped-query attention v carries fewer heads than y; repeat each KV
+        # head across its query-head group so the elementwise product lines up.
+        if v.size(-2) != y.size(-2):
+            v = v.repeat_interleave(y.size(-2) // v.size(-2), dim=-2)
+        Vn = F.normalize(v, dim=-1)
+        Z = y - (y * Vn).sum(dim=-1, keepdim=True) * Vn
+        return Z
+
+
 class MLP(nn.Module):
     def __init__(self, config):
         super().__init__()
diff --git a/runs/pace_stage1_tokenizer.sh b/runs/pace_stage1_tokenizer.sh new file mode 100644 index 00000000..95573de1 --- /dev/null +++ b/runs/pace_stage1_tokenizer.sh @@ -0,0 +1,43 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=24 +#SBATCH --mem=64G +#SBATCH -t 2:00:00 +#SBATCH -J nanochat-stage1-tokenizer +#SBATCH -o runs/logs/stage1_%j.out +#SBATCH -e runs/logs/stage1_%j.err + +# Stage 1 + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p "$NANOCHAT_BASE_DIR" +mkdir -p runs/logs + +echo "=== Stage 1: Tokenizer ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: ${XSA:-FALSE}" +echo "Started: $(date)" + +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +export PATH="$HOME/.local/bin:$PATH" +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate +python -m nanochat.report reset +python -m nanochat.dataset -n 8 +python -m nanochat.dataset -n 170 & +DATASET_DOWNLOAD_PID=$! + +python -m scripts.tok_train +python -m scripts.tok_eval + +echo "Waiting for full dataset download..." 
+wait $DATASET_DOWNLOAD_PID
+
+echo "=== Stage 1 complete: $(date) ==="
+echo "Dataset and tokenizer ready in $NANOCHAT_BASE_DIR"
diff --git a/runs/pace_stage2a_pretrain.sh b/runs/pace_stage2a_pretrain.sh
new file mode 100644
index 00000000..732099d8
--- /dev/null
+++ b/runs/pace_stage2a_pretrain.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH -p ice-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage2a
+#SBATCH -o runs/logs/stage2a_%j.out
+#SBATCH -e runs/logs/stage2a_%j.err
+
+# Stage 2a
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+XSA="${XSA:-FALSE}"
+XSA_ARG=""
+[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa"
+CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
+DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
+# Chunk 1 always trains from scratch, so a completion marker left by an earlier
+# pipeline run is stale; drop it so chunks 2b-2d and stage 3 do not trust it.
+rm -f "$DONE_MARKER"
+
+echo "=== Stage 2a: Pretraining (chunk 1) ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "WANDB_RUN: $WANDB_RUN"
+echo "XSA: $XSA"
+echo "Started: $(date)"
+
+source .venv/bin/activate
+
+torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
+    --depth=24 \
+    --target-param-data-ratio=8 \
+    --device-batch-size=16 \
+    --save-every=200 \
+    $XSA_ARG \
+    --run=$WANDB_RUN
+
+mkdir -p "$CHECKPOINT_DIR"
+touch "$DONE_MARKER"
+echo "=== Stage 2a complete: $(date) ==="
diff --git a/runs/pace_stage2b_pretrain.sh b/runs/pace_stage2b_pretrain.sh
new file mode 100644
index 00000000..7e7ad3be
--- /dev/null
+++ b/runs/pace_stage2b_pretrain.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+#SBATCH -N 1
+#SBATCH -p ice-gpu
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage2b
+#SBATCH -o runs/logs/stage2b_%j.out
+#SBATCH -e runs/logs/stage2b_%j.err
+
+# 
Stage 2b + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +XSA_ARG="" +[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa" +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" + +echo "=== Stage 2b: Pretraining (chunk 2 / auto-resume) ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: $XSA" +echo "Started: $(date)" + +if [ -f "$DONE_MARKER" ]; then + echo "Training already complete (marker: $DONE_MARKER). Nothing to do." + echo "=== Stage 2b skipped: $(date) ===" + exit 0 +fi + +source .venv/bin/activate + +LAST_STEP=$(python -c " +import glob, os, sys +files = glob.glob('${CHECKPOINT_DIR}/model_*.pt') +if not files: + print(0); sys.exit(0) +print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files)) +") + +if [ "$LAST_STEP" -eq 0 ]; then + echo "No checkpoint found — starting from scratch" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + $XSA_ARG \ + --run=$WANDB_RUN +else + echo "Resuming from step $LAST_STEP" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --resume-from-step=$LAST_STEP \ + $XSA_ARG \ + --run=$WANDB_RUN +fi + +mkdir -p "$CHECKPOINT_DIR" +touch "$DONE_MARKER" +echo "=== Stage 2b complete: $(date) ===" diff --git a/runs/pace_stage2c_pretrain.sh b/runs/pace_stage2c_pretrain.sh new file mode 100644 index 00000000..5089a444 --- /dev/null +++ b/runs/pace_stage2c_pretrain.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 +#SBATCH --constraint="gpu-h100|gpu-h200" +#SBATCH --mem-per-gpu=48G +#SBATCH -t 3:55:00 +#SBATCH -J 
nanochat-stage2c +#SBATCH -o runs/logs/stage2c_%j.out +#SBATCH -e runs/logs/stage2c_%j.err + +# Stage 2c + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +XSA_ARG="" +[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa" +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" + +echo "=== Stage 2c: Pretraining (chunk 3 / auto-resume) ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: $XSA" +echo "Started: $(date)" + +if [ -f "$DONE_MARKER" ]; then + echo "Training already complete (marker: $DONE_MARKER). Nothing to do." + echo "=== Stage 2c skipped: $(date) ===" + exit 0 +fi + +source .venv/bin/activate + +LAST_STEP=$(python -c " +import glob, os, sys +files = glob.glob('${CHECKPOINT_DIR}/model_*.pt') +if not files: + print(0); sys.exit(0) +print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files)) +") + +if [ "$LAST_STEP" -eq 0 ]; then + echo "No checkpoint found — starting from scratch" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + $XSA_ARG \ + --run=$WANDB_RUN +else + echo "Resuming from step $LAST_STEP" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --resume-from-step=$LAST_STEP \ + $XSA_ARG \ + --run=$WANDB_RUN +fi + +mkdir -p "$CHECKPOINT_DIR" +touch "$DONE_MARKER" +echo "=== Stage 2c complete: $(date) ===" diff --git a/runs/pace_stage2d_pretrain.sh b/runs/pace_stage2d_pretrain.sh new file mode 100644 index 00000000..ffe190d6 --- /dev/null +++ b/runs/pace_stage2d_pretrain.sh @@ -0,0 +1,74 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu +#SBATCH --ntasks-per-node=1 +#SBATCH --cpus-per-task=8 +#SBATCH --gres=gpu:2 
+#SBATCH --constraint="gpu-h100|gpu-h200" +#SBATCH --mem-per-gpu=48G +#SBATCH -t 3:55:00 +#SBATCH -J nanochat-stage2d +#SBATCH -o runs/logs/stage2d_%j.out +#SBATCH -e runs/logs/stage2d_%j.err + +# Stage 2d + +set -e +cd "$HOME/scratch/nanochat" + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat" +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +XSA_ARG="" +[ "$XSA" = "TRUE" ] && XSA_ARG="--xsa" +CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24" +DONE_MARKER="$CHECKPOINT_DIR/.training_complete" + +echo "=== Stage 2d: Pretraining (chunk 4 / auto-resume) ===" +echo "Base dir: $NANOCHAT_BASE_DIR" +echo "XSA: $XSA" +echo "Started: $(date)" + +if [ -f "$DONE_MARKER" ]; then + echo "Training already complete (marker: $DONE_MARKER). Nothing to do." + echo "=== Stage 2d skipped: $(date) ===" + exit 0 +fi + +source .venv/bin/activate + +LAST_STEP=$(python -c " +import glob, os, sys +files = glob.glob('${CHECKPOINT_DIR}/model_*.pt') +if not files: + print(0); sys.exit(0) +print(max(int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files)) +") + +if [ "$LAST_STEP" -eq 0 ]; then + echo "No checkpoint found — starting from scratch" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + $XSA_ARG \ + --run=$WANDB_RUN +else + echo "Resuming from step $LAST_STEP" + torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \ + --depth=24 \ + --target-param-data-ratio=8 \ + --device-batch-size=16 \ + --save-every=200 \ + --resume-from-step=$LAST_STEP \ + $XSA_ARG \ + --run=$WANDB_RUN +fi + +mkdir -p "$CHECKPOINT_DIR" +touch "$DONE_MARKER" +echo "=== Stage 2d complete: $(date) ===" diff --git a/runs/pace_stage3_sft.sh b/runs/pace_stage3_sft.sh new file mode 100644 index 00000000..532c6fa9 --- /dev/null +++ b/runs/pace_stage3_sft.sh @@ -0,0 +1,55 @@ +#!/bin/bash +#SBATCH -N 1 +#SBATCH -p ice-gpu 
+#SBATCH --ntasks-per-node=1
+#SBATCH --cpus-per-task=8
+#SBATCH --gres=gpu:2
+#SBATCH --constraint="gpu-h100|gpu-h200"
+#SBATCH --mem-per-gpu=48G
+#SBATCH -t 3:55:00
+#SBATCH -J nanochat-stage3-sft
+#SBATCH -o runs/logs/stage3_%j.out
+#SBATCH -e runs/logs/stage3_%j.err
+
+# Stage 3
+
+set -e
+cd "$HOME/scratch/nanochat"
+
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
+mkdir -p runs/logs
+
+WANDB_RUN="${WANDB_RUN:-dummy}"
+
+echo "=== Stage 3: Eval + SFT ==="
+echo "Base dir: $NANOCHAT_BASE_DIR"
+echo "WANDB_RUN: $WANDB_RUN"
+echo "XSA: ${XSA:-FALSE}"
+echo "Started: $(date)"
+
+CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
+DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
+if [ ! -f "$DONE_MARKER" ]; then
+    echo "ERROR: pretraining did not finish — missing $DONE_MARKER"
+    echo "Re-run pretrain chunks 2a–2d until the marker is created before running stage 3."
+    exit 1
+fi
+
+source .venv/bin/activate
+
+torchrun --standalone --nproc_per_node=2 -m scripts.base_eval -- \
+    --device-batch-size=16
+
+curl -L -fsS -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" \
+    https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
+
+torchrun --standalone --nproc_per_node=2 -m scripts.chat_sft -- \
+    --device-batch-size=16 \
+    --run=$WANDB_RUN
+
+torchrun --standalone --nproc_per_node=2 -m scripts.chat_eval -- -i sft
+
+python -m nanochat.report generate
+
+echo "=== Stage 3 complete: $(date) ==="
diff --git a/runs/pace_submit.sh b/runs/pace_submit.sh
new file mode 100644
index 00000000..63cc3de2
--- /dev/null
+++ b/runs/pace_submit.sh
@@ -0,0 +1,87 @@
+#!/bin/bash
+
+# Pipeline:
+# Stage 1 — CPU: tokenizer + dataset
+# Stage 2a — GPU: pretraining chunk 1
+# Stage 2b — GPU: auto-resume chunk 2
+# Stage 2c — GPU: auto-resume chunk 3
+# Stage 2d — GPU: auto-resume chunk 4
+# Stage 3 — GPU: base eval + SFT + chat eval + report
+#
+# Usage (from repo root):
+# bash runs/pace_submit.sh
+#
+# Optional W&B logging:
+# 
WANDB_RUN=my-run bash runs/pace_submit.sh +# +# Optional XSA attention: +# XSA=TRUE bash runs/pace_submit.sh + +set -e +cd "$HOME/scratch/nanochat" + +mkdir -p runs/logs + +WANDB_RUN="${WANDB_RUN:-dummy}" +XSA="${XSA:-FALSE}" +export WANDB_RUN +export XSA + +echo "Submitting nanochat full pipeline..." +echo "WANDB_RUN=$WANDB_RUN" +echo "XSA=$XSA" +echo "" + +# Stage 1 +JOB1=$(sbatch --parsable \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ + runs/pace_stage1_tokenizer.sh) +echo "Stage 1 submitted: job $JOB1 (tokenizer + dataset)" + +# Stage 2a +JOB2A=$(sbatch --parsable \ + --dependency=afterok:$JOB1 \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ + runs/pace_stage2a_pretrain.sh) +echo "Stage 2a submitted: job $JOB2A (pretrain chunk 1, depends on $JOB1)" + +# Stage 2b +JOB2B=$(sbatch --parsable \ + --dependency=afterany:$JOB2A \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ + runs/pace_stage2b_pretrain.sh) +echo "Stage 2b submitted: job $JOB2B (pretrain chunk 2, depends on $JOB2A)" + +# Stage 2c +JOB2C=$(sbatch --parsable \ + --dependency=afterany:$JOB2B \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ + runs/pace_stage2c_pretrain.sh) +echo "Stage 2c submitted: job $JOB2C (pretrain chunk 3, depends on $JOB2B)" + +# Stage 2d +JOB2D=$(sbatch --parsable \ + --dependency=afterany:$JOB2C \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ + runs/pace_stage2d_pretrain.sh) +echo "Stage 2d submitted: job $JOB2D (pretrain chunk 4, depends on $JOB2C)" + +# Stage 3 +JOB3=$(sbatch --parsable \ + --dependency=afterok:$JOB2D \ + --export=ALL,WANDB_RUN=$WANDB_RUN,XSA=$XSA \ + runs/pace_stage3_sft.sh) +echo "Stage 3 submitted: job $JOB3 (eval + SFT, depends on $JOB2D)" + +echo "" +echo "All jobs queued. 
Monitor with:" +echo " squeue -u $USER" +echo " tail -f runs/logs/stage1_${JOB1}.out" +echo " tail -f runs/logs/stage2a_${JOB2A}.out" +echo " tail -f runs/logs/stage2b_${JOB2B}.out" +echo " tail -f runs/logs/stage2c_${JOB2C}.out" +echo " tail -f runs/logs/stage2d_${JOB2D}.out" +echo " tail -f runs/logs/stage3_${JOB3}.out" +echo "" +echo "To cancel everything:" +echo " scancel $JOB1 $JOB2A $JOB2B $JOB2C $JOB2D $JOB3" diff --git a/runs/runpod/d12.sh b/runs/runpod/d12.sh new file mode 100755 index 00000000..dbf4aa5b --- /dev/null +++ b/runs/runpod/d12.sh @@ -0,0 +1,196 @@ +#!/usr/bin/env bash +# d12 baseline runner. Runs INSIDE a RunPod pod. +# Pipeline: tokenizer -> base_train -> base_eval -> SFT -> chat_eval -> report. +# On exit: +# success -> upload final cache to HF, self-delete pod +# failure -> upload logs + report dir to HF under _failures/, self-delete pod +# (set UPLOAD_FAILURE_CACHE=1 to also dump partial cache for offline debug) +# +# Required env (passed via runpodctl --env at pod-create): +# HF_TOKEN, WANDB_API_KEY +# Optional env: +# WANDB_RUN default: d12 +# NANOCHAT_REPO default: Team-XSA/nanochat +# NANOCHAT_REF default: dev +# HF_REPO default: haydenfree/nanochat-d12-baseline +# BACKUP_INTERVAL default: 300 (seconds between background HF uploads) +# UPLOAD_FAILURE_CACHE default: 0 +# Auto-set by RunPod: +# RUNPOD_POD_ID, RUNPOD_API_KEY (pod-scoped) + +set -euo pipefail + +NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}" +NANOCHAT_REF="${NANOCHAT_REF:-dev}" +HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}" +WANDB_RUN="${WANDB_RUN:-d12}" +BACKUP_INTERVAL="${BACKUP_INTERVAL:-300}" +UPLOAD_FAILURE_CACHE="${UPLOAD_FAILURE_CACHE:-0}" + +WORKDIR="/workspace/nanochat" +LOG_FILE="/workspace/runner.log" +NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" +BACKUP_PID="" + +mkdir -p /workspace +# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE. +# Don't add a second tee here — would write every line twice. 
+ +echo "[runner] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" +echo "[runner] repo=$NANOCHAT_REPO ref=$NANOCHAT_REF hf_repo=$HF_REPO wandb_run=$WANDB_RUN" + +# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs +# even if we fail before the venv is activated. +{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + echo "[runner] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true + +cleanup() { + local rc=$? + set +e + echo "[runner] cleanup: exit code $rc at $(date -Iseconds)" + if [ -n "$BACKUP_PID" ] && kill -0 "$BACKUP_PID" 2>/dev/null; then + kill "$BACKUP_PID" 2>/dev/null || true + fi + + local TS + TS=$(date -u +%Y%m%dT%H%M%SZ) + + if [ "$rc" -eq 0 ]; then + echo "[runner] success — final upload to $HF_REPO" + if [ -d "$NANOCHAT_BASE_DIR" ]; then + # Skip the climbmix dataset shards (~2GB of public data, not model artifacts) + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ + --repo-type model --commit-message "final rc=0 $TS" \ + --exclude "base_data_climbmix/**" --exclude "wandb/**" || \ + echo "[runner] WARN: final upload failed" + fi + # Also upload the runner log so we have a permanent record of this successful run. 
+ if [ -f "$LOG_FILE" ]; then + hf upload "$HF_REPO" "$LOG_FILE" "_runs/${TS}/runner.log" \ + --repo-type model --commit-message "runner log $TS" || \ + echo "[runner] WARN: runner.log upload failed" + fi + else + echo "[runner] failure rc=$rc — dumping logs to HF for offline debug" + mkdir -p /tmp/failure + cp /workspace/*.log /tmp/failure/ 2>/dev/null || true + [ -d "$NANOCHAT_BASE_DIR/report" ] && cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true + [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true) + + hf upload "$HF_REPO" /tmp/failure "_failures/${TS}-rc${rc}/logs" \ + --repo-type model --commit-message "failure rc=$rc logs $TS" || \ + echo "[runner] WARN: log upload failed" + + if [ "$UPLOAD_FAILURE_CACHE" = "1" ] && [ -d "$NANOCHAT_BASE_DIR" ]; then + echo "[runner] UPLOAD_FAILURE_CACHE=1 — also dumping partial cache (may be slow)" + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" "_failures/${TS}-rc${rc}/cache" \ + --repo-type model --commit-message "failure rc=$rc cache $TS" \ + --exclude "base_data_climbmix/**" --exclude "wandb/**" || true + fi + echo "[runner] failure artifacts: https://huggingface.co/$HF_REPO/tree/main/_failures/${TS}-rc${rc}" + fi + + echo "[runner] self-deleting pod $RUNPOD_POD_ID" + # REST API first — pod-scoped key has delete permission and the API is reliable. + # The pod's preinstalled runpodctl is unreliable (often missing config or 'pod' subcommand). 
+ if curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1; then + echo "[runner] REST delete request accepted" + else + echo "[runner] REST delete failed, trying runpodctl as fallback" + runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ + runpodctl remove pod "$RUNPOD_POD_ID" 2>&1 || \ + echo "[runner] WARN: all delete methods failed — pod may need manual cleanup" + fi + exit "$rc" +} +trap cleanup EXIT + +: "${HF_TOKEN:?HF_TOKEN must be set}" +: "${WANDB_API_KEY:?WANDB_API_KEY must be set}" +: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}" + +rm -rf "$WORKDIR" +git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" +cd "$WORKDIR" +# `--` disambiguates ref-vs-file (some images create a `dev` file in HOME) +git checkout "$NANOCHAT_REF" -- +echo "[runner] HEAD = $(git rev-parse HEAD)" + +sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh +sed -i 's/ --target-param-data-ratio=8//' runs/speedrun.sh +# Inject `set -euo pipefail` so a mid-pipeline failure (e.g. chat_sft) propagates +# as rc!=0 instead of being silently swallowed by the next command. +sed -i '1a set -euo pipefail' runs/speedrun.sh +echo "[runner] speedrun.sh edits applied:" +grep -n 'depth\|target-param\|set -e' runs/speedrun.sh || true + +# Explicit venv setup BEFORE speedrun.sh so we can run diagnostic probes +# inside the venv. speedrun.sh's uv sync is idempotent (no-op the second time). 
+export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR +mkdir -p "$NANOCHAT_BASE_DIR" +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate +pip install --quiet --upgrade huggingface_hub + +# Ensure HF token flows to the kernels lib (some libs read HF_HUB_TOKEN, not HF_TOKEN) +export HF_HUB_TOKEN="${HF_TOKEN}" + +# Bump kernels to latest — pyproject pins >=0.11.7 and uv often picks exactly that; +# 0.11.x had kernel-resolution bugs that affect FA3 loading silently. +echo "[runner] upgrading kernels lib for FA3 reliability" +uv pip install --quiet --upgrade 'kernels>=0.13.0' 2>&1 || \ + echo "[runner] WARN: kernels upgrade failed (continuing)" + +# Install hf_transfer — runpod base image sets HF_HUB_ENABLE_HF_TRANSFER=1, which +# makes huggingface_hub raise ValueError if the package is missing. chat_sft loads +# HuggingFaceTB/smol-smoltalk via datasets and crashes without this. +echo "[runner] installing hf_transfer for SFT dataset download" +uv pip install --quiet hf_transfer 2>&1 || echo "[runner] WARN: hf_transfer install failed" + +# FA3 diagnostic probe — surfaces real errors (nanochat silently swallows them). +# Non-fatal: SDPA fallback is automatic. We want this output in the log +# regardless of outcome so we can decide what to do about FA3. +echo "[runner] === FA3 PROBE BEGIN ===" +python "$WORKDIR/runs/runpod/probe_fa3.py" || echo "[runner] FA3 probe reported issues (non-fatal — continuing with SDPA fallback)" +echo "[runner] === FA3 PROBE END ===" + +( + while true; do + sleep "$BACKUP_INTERVAL" + if [ -d "$NANOCHAT_BASE_DIR" ]; then + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR" . \ + --repo-type model \ + --commit-message "checkpoint $(date -Iseconds)" \ + --exclude "base_data_climbmix/**" --exclude "wandb/**" \ + >> /workspace/backup.log 2>&1 || true + fi + done +) & +BACKUP_PID=$! 
+echo "[runner] backup loop pid=$BACKUP_PID interval=${BACKUP_INTERVAL}s" + +export WANDB_RUN +WANDB_RUN="$WANDB_RUN" bash runs/speedrun.sh + +# Verify expected pipeline outputs — speedrun.sh historically didn't `set -e`; +# we patched it above, but double-check the artifacts that matter for the d12 baseline. +echo "[runner] verifying pipeline outputs" +missing=() +for required in base_checkpoints/d12 chatsft_checkpoints/d12 tokenizer report; do + if [ ! -d "$NANOCHAT_BASE_DIR/$required" ]; then + missing+=("$required") + fi +done +if [ ${#missing[@]} -gt 0 ]; then + echo "[runner] FAIL: pipeline finished but missing expected artifacts: ${missing[*]}" + exit 1 +fi +echo "[runner] all expected artifacts present" + +echo "[runner] $(date -Iseconds) pipeline complete" diff --git a/runs/runpod/d12_sft_only.sh b/runs/runpod/d12_sft_only.sh new file mode 100755 index 00000000..46f2fd94 --- /dev/null +++ b/runs/runpod/d12_sft_only.sh @@ -0,0 +1,161 @@ +#!/usr/bin/env bash +# d12 SFT-only resume runner. Runs INSIDE a RunPod pod. +# +# Use case: the d12 base_train + base_eval already succeeded and uploaded to HF, +# but chat_sft failed (e.g., missing hf_transfer package). Instead of re-running +# the whole pipeline, this runner: +# 1. Downloads base_checkpoints/d12/ + tokenizer/ from HF +# 2. Installs hf_transfer (the actual SFT bug fix) +# 3. Runs chat_sft + chat_eval directly (skips speedrun.sh) +# 4. Uploads chatsft_checkpoints/ + chat_eval results + report to HF +# 5. 
Self-deletes +# +# Required env: HF_TOKEN, WANDB_API_KEY +# Optional env: +# WANDB_RUN default: d12-sft +# NANOCHAT_REPO default: Team-XSA/nanochat +# NANOCHAT_REF default: dev +# HF_REPO default: haydenfree/nanochat-d12-baseline (where the base lives) + +set -euo pipefail + +NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}" +NANOCHAT_REF="${NANOCHAT_REF:-dev}" +HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}" +WANDB_RUN="${WANDB_RUN:-d12-sft}" + +WORKDIR="/workspace/nanochat" +LOG_FILE="/workspace/runner.log" +NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" + +mkdir -p /workspace +echo "[sft] $(date -Iseconds) starting on pod=$RUNPOD_POD_ID" +echo "[sft] resuming from base checkpoint at $HF_REPO" + +# Bootstrap huggingface_hub system-wide so cleanup can upload logs even on early failure. +{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \ + echo "[sft] WARN: could not pre-install huggingface_hub"; } || true + +cleanup() { + local rc=$? + set +e + echo "[sft] cleanup: exit code $rc at $(date -Iseconds)" + + local TS + TS=$(date -u +%Y%m%dT%H%M%SZ) + + if [ "$rc" -eq 0 ]; then + echo "[sft] success — uploading chatsft_checkpoints + report + log" + # Only upload the SFT-specific subdirs so we don't re-upload base. 
+ for subdir in chatsft_checkpoints report; do + if [ -d "$NANOCHAT_BASE_DIR/$subdir" ]; then + hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR/$subdir" "$subdir" \ + --repo-type model --commit-message "$subdir SFT-resume rc=0 $TS" || \ + echo "[sft] WARN: $subdir upload failed" + fi + done + if [ -f "$LOG_FILE" ]; then + hf upload "$HF_REPO" "$LOG_FILE" "_runs/${TS}-sft/runner.log" \ + --repo-type model --commit-message "SFT runner log $TS" || \ + echo "[sft] WARN: runner.log upload failed" + fi + else + echo "[sft] failure rc=$rc — dumping logs" + mkdir -p /tmp/failure + cp /workspace/*.log /tmp/failure/ 2>/dev/null || true + [ -d "$NANOCHAT_BASE_DIR/report" ] && cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true + [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true) + hf upload "$HF_REPO" /tmp/failure "_failures/${TS}-sft-rc${rc}/logs" \ + --repo-type model --commit-message "SFT-resume failure rc=$rc $TS" || \ + echo "[sft] WARN: log upload failed" + fi + + echo "[sft] self-deleting pod $RUNPOD_POD_ID" + if curl -fsS -X DELETE \ + -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \ + "https://rest.runpod.io/v1/pods/$RUNPOD_POD_ID" 2>&1; then + echo "[sft] REST delete request accepted" + else + echo "[sft] REST delete failed, trying runpodctl as fallback" + runpodctl pod delete "$RUNPOD_POD_ID" 2>&1 || \ + runpodctl remove pod "$RUNPOD_POD_ID" 2>&1 || \ + echo "[sft] WARN: all delete methods failed — pod may need manual cleanup" + fi + exit "$rc" +} +trap cleanup EXIT + +: "${HF_TOKEN:?HF_TOKEN must be set}" +: "${WANDB_API_KEY:?WANDB_API_KEY must be set}" +: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}" + +# Clone fork +rm -rf "$WORKDIR" +git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR" +cd "$WORKDIR" +git checkout "$NANOCHAT_REF" -- +echo "[sft] HEAD = $(git rev-parse HEAD)" + +# Env + uv +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR +mkdir -p "$NANOCHAT_BASE_DIR" 
+command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate +pip install --quiet --upgrade huggingface_hub +export HF_HUB_TOKEN="${HF_TOKEN}" + +# Install hf_transfer — THE actual fix for the previous SFT failure. +echo "[sft] installing hf_transfer (the bug from last run)" +uv pip install --quiet hf_transfer + +# Pull tokenizer + base checkpoint from HF in TWO separate calls. +# `hf download` only honors the LAST --include when specified multiple times +# (multi-include works for upload, not download — verified the hard way). +echo "[sft] downloading tokenizer from $HF_REPO" +hf download "$HF_REPO" \ + --include "tokenizer/**" \ + --local-dir "$NANOCHAT_BASE_DIR" \ + --repo-type model + +echo "[sft] downloading base_checkpoints/d12 from $HF_REPO" +hf download "$HF_REPO" \ + --include "base_checkpoints/d12/**" \ + --local-dir "$NANOCHAT_BASE_DIR" \ + --repo-type model + +# Verify both pieces actually landed before invoking chat_sft. +echo "[sft] verifying downloads" +ls -la "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" 2>&1 || true +ls -la "$NANOCHAT_BASE_DIR/tokenizer/" 2>&1 || true +[ -f "$NANOCHAT_BASE_DIR/tokenizer/tokenizer.pkl" ] || { echo "[sft] FAIL: tokenizer.pkl missing after download"; exit 1; } +[ -n "$(ls -A "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" 2>/dev/null)" ] || { echo "[sft] FAIL: base_checkpoints/d12 is empty"; exit 1; } +echo "[sft] downloads verified" + +# Also need identity_conversations.jsonl for SFT (speedrun.sh normally fetches it) +echo "[sft] fetching identity_conversations.jsonl" +curl -L -fsS -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" \ + https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl + +# Run only SFT + chat_eval + report. NOT speedrun.sh (which would re-do base_train). 
+NPROC=$(nvidia-smi -L | wc -l)
+echo "[sft] running chat_sft on $NPROC GPUs"
+torchrun --standalone --nproc_per_node="$NPROC" -m scripts.chat_sft -- \
+    --device-batch-size=16 --run="$WANDB_RUN"
+
+echo "[sft] running chat_eval"
+torchrun --standalone --nproc_per_node="$NPROC" -m scripts.chat_eval -- -i sft
+
+echo "[sft] regenerating report (will include new SFT sections)"
+python -m nanochat.report generate || true
+
+# Verify SFT artifacts exist before declaring success
+if [ ! -d "$NANOCHAT_BASE_DIR/chatsft_checkpoints" ]; then
+    echo "[sft] FAIL: chatsft_checkpoints/ missing after chat_sft"
+    exit 1
+fi
+
+echo "[sft] $(date -Iseconds) SFT pipeline complete"
diff --git a/runs/runpod/kickoff.sh b/runs/runpod/kickoff.sh
new file mode 100755
index 00000000..bb918171
--- /dev/null
+++ b/runs/runpod/kickoff.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# Generic local kickoff for RunPod runs.
+# Picks a runner script in this repo (runs/runpod/<runner>.sh) and spins up a pod.
+#
+# Prereqs:
+#   1. ~/.config/team-xsa/runpod.env exports HF_TOKEN, WANDB_API_KEY, RUNPOD_TEMPLATE_ID
+#   2. The template referenced by RUNPOD_TEMPLATE_ID has docker-start-cmd:
+#        bash,-lc,curl -fsSL "$RUNNER_URL" | bash >> /workspace/runner.log 2>&1
+#   3. The runner script for this experiment has been pushed to Team-XSA/nanochat
+#
+# Usage:
+#   source ~/.config/team-xsa/runpod.env
+#   bash runs/runpod/kickoff.sh d12       # uses runs/runpod/d12.sh
+#   bash runs/runpod/kickoff.sh d24       # uses runs/runpod/d24.sh
+#   bash runs/runpod/kickoff.sh xsa_d12   # uses runs/runpod/xsa_d12.sh
+#
+# Optional env overrides:
+#   GPU_ID         default: "NVIDIA H100 80GB HBM3"
+#   GPU_COUNT      default: 8
+#   CLOUD_TYPE     default: SECURE (COMMUNITY when capacity available, cheaper)
+#   DISK_GB        default: 200
+#   NANOCHAT_REPO  default: Team-XSA/nanochat
+#   NANOCHAT_REF   default: dev
+#   RUNNER_URL     default: raw.githubusercontent.com URL of runs/runpod/<runner>.sh at NANOCHAT_REPO@NANOCHAT_REF
+#   WANDB_RUN      default: <runner>
+#   POD_NAME       default: <runner>-<YYYYmmdd-HHMM>
+
+set -euo pipefail
+
+RUNNER="${1:-}"
+if [ -z "$RUNNER" ]; then
+    echo "Usage: bash runs/runpod/kickoff.sh <runner>" >&2
+    echo " e.g. bash runs/runpod/kickoff.sh d12" >&2
+    exit 1
+fi
+
+# Fail fast when a required secret or the template id is missing.
+: "${HF_TOKEN:?HF_TOKEN not set — source ~/.config/team-xsa/runpod.env}"
+: "${WANDB_API_KEY:?WANDB_API_KEY not set — source ~/.config/team-xsa/runpod.env}"
+: "${RUNPOD_TEMPLATE_ID:?RUNPOD_TEMPLATE_ID not set — create the template once and add it to ~/.config/team-xsa/runpod.env}"
+
+NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}"
+NANOCHAT_REF="${NANOCHAT_REF:-dev}"
+WANDB_RUN="${WANDB_RUN:-$RUNNER}"
+RUNNER_URL="${RUNNER_URL:-https://raw.githubusercontent.com/${NANOCHAT_REPO}/${NANOCHAT_REF}/runs/runpod/${RUNNER}.sh}"
+
+GPU_ID="${GPU_ID:-NVIDIA H100 80GB HBM3}"
+GPU_COUNT="${GPU_COUNT:-8}"
+CLOUD_TYPE="${CLOUD_TYPE:-SECURE}"
+DISK_GB="${DISK_GB:-200}"
+POD_NAME="${POD_NAME:-${RUNNER}-$(date +%Y%m%d-%H%M)}"
+
+# The template's start command curls RUNNER_URL, so check it is reachable before creating the pod.
+echo "Verifying runner URL is reachable: $RUNNER_URL"
+if ! curl -sfI "$RUNNER_URL" >/dev/null; then
+    echo "ERROR: runner not reachable at $RUNNER_URL" >&2
+    echo " - Did you push runs/runpod/${RUNNER}.sh to ${NANOCHAT_REPO}@${NANOCHAT_REF}?" >&2
+    echo " - Is the repo public?" >&2
+    exit 1
+fi
+
+# Serialize the variables handed to the pod as one JSON object; python does the
+# quoting, and variables missing from the environment are left out.
+export HF_TOKEN WANDB_API_KEY WANDB_RUN RUNNER_URL NANOCHAT_REPO NANOCHAT_REF
+ENV_JSON=$(python3 - <<'PY'
+import json, os
+keys = ["HF_TOKEN","WANDB_API_KEY","WANDB_RUN","RUNNER_URL","NANOCHAT_REPO","NANOCHAT_REF"]
+print(json.dumps({k: os.environ[k] for k in keys if k in os.environ}))
+PY
+)
+
+echo "Creating pod:"
+echo " name = $POD_NAME"
+echo " template = $RUNPOD_TEMPLATE_ID"
+echo " runner = $RUNNER_URL"
+echo " gpu = $GPU_COUNT × $GPU_ID"
+echo " cloud = $CLOUD_TYPE"
+echo " disk = ${DISK_GB} GB"
+
+# NOTE(review): ENV_JSON carries HF_TOKEN / WANDB_API_KEY and is passed as a
+# command-line argument, so it is visible in the local process list while this runs.
+runpodctl pod create \
+    --name "$POD_NAME" \
+    --template-id "$RUNPOD_TEMPLATE_ID" \
+    --gpu-id "$GPU_ID" \
+    --gpu-count "$GPU_COUNT" \
+    --cloud-type "$CLOUD_TYPE" \
+    --container-disk-in-gb "$DISK_GB" \
+    --env "$ENV_JSON"
+
+echo
+echo "Logs (after pod boots):"
+echo " POD_ID=\$(runpodctl pod list --name '$POD_NAME' -o json | jq -r '.[0].id')"
+echo " runpodctl ssh info \$POD_ID"
+echo " ssh <user>@<host> 'tail -f /workspace/runner.log'"
+echo
+# NOTE(review): the links below are hard-coded to the d12 baseline repo and do
+# not vary with the selected runner.
+echo "Wandb: project=nanochat / nanochat-sft, run name: $WANDB_RUN"
+echo "HF (success): https://huggingface.co/haydenfree/nanochat-d12-baseline"
+echo "HF (failure): https://huggingface.co/haydenfree/nanochat-d12-baseline/tree/main/_failures"
diff --git a/runs/runpod/probe_fa3.py b/runs/runpod/probe_fa3.py
new file mode 100644
index 00000000..392f1d81
--- /dev/null
+++ b/runs/runpod/probe_fa3.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python3
+# pyright: reportMissingImports=false
+"""
+Comprehensive FA3 / kernels diagnostic probe.
+
+nanochat/flash_attention.py:_load_flash_attention_3 swallows ALL exceptions silently
+and falls back to SDPA. This script runs the same code path with full tracebacks
+so we can see why FA3 isn't loading on the pod.
+
+Run inside the pod (after uv sync, with venv active):
+    python runs/runpod/probe_fa3.py
+
+Exits 0 if FA3 is fully usable, 1 if any check fails.
+
+Note: the pyright pragma above is intentional — torch/huggingface_hub/kernels
+are only present at pod runtime; the local IDE will flag them as unresolved.
+"""
+import os
+import pathlib
+import platform
+import subprocess
+import sys
+import traceback
+
+# This probe lives at <repo>/runs/runpod/probe_fa3.py — add repo root to
+# sys.path so we can `import nanochat.*` regardless of cwd or how we're invoked.
+_REPO_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+if _REPO_ROOT not in sys.path:
+    sys.path.insert(0, _REPO_ROOT)
+
+OK = "\033[32mOK\033[0m"
+FAIL = "\033[31mFAIL\033[0m"
+WARN = "\033[33mWARN\033[0m"
+
+
+def section(n, name):
+    """Print a numbered banner that separates one check from the next."""
+    print()
+    print("=" * 80)
+    print(f"[{n}] {name}")
+    print("=" * 80)
+
+
+def fmt_token(tok):
+    """Describe a secret for the log (set or not, length, short prefix)."""
+    # NOTE(review): the first 7 characters of the secret end up in the log; confirm that
+    # is acceptable wherever this output is stored.
+    if not tok:
+        return "NOT SET"
+    return f"SET (len={len(tok)}, prefix={tok[:7]}…)"
+
+
+# Flipped to False by fail(); decides the exit code in the verdict section.
+passed_all = True
+
+
+def fail(msg):
+    """Print a FAIL line and mark the whole probe as failed."""
+    global passed_all
+    passed_all = False
+    print(f" {FAIL} {msg}")
+
+
+def warn(msg):
+    """Print a WARN line; does not affect the exit code."""
+    print(f" {WARN} {msg}")
+
+
+def ok(msg):
+    """Print an OK line."""
+    print(f" {OK} {msg}")
+
+
+# ---------------------------------------------------------------------------
+section(1, "Environment")
+print(f" python : {sys.version.split()[0]}")
+print(f" platform : {platform.platform()}")
+print(f" cwd : {os.getcwd()}")
+hf_tok = os.environ.get("HF_TOKEN", "")
+hf_hub_tok = os.environ.get("HF_HUB_TOKEN", "")
+print(f" HF_TOKEN : {fmt_token(hf_tok)}")
+print(f" HF_HUB_TOKEN : {fmt_token(hf_hub_tok)}")
+print(f" HF_HOME : {os.environ.get('HF_HOME', '(default ~/.cache/huggingface)')}")
+print(f" HUGGINGFACE_HUB_CACHE : {os.environ.get('HUGGINGFACE_HUB_CACHE', '(unset)')}")
+print(f" WANDB_API_KEY: {fmt_token(os.environ.get('WANDB_API_KEY',''))}")
+
+if not hf_tok:
+    fail("HF_TOKEN env var is empty — kernels lib will fall back to anonymous and may rate-limit")
+
+# ---------------------------------------------------------------------------
+section(2, "Network connectivity")
+for url, label in [
+    ("https://huggingface.co", "huggingface.co"),
+    ("https://cdn-lfs.huggingface.co", "cdn-lfs.huggingface.co"),
+    ("https://github.com", "github.com"),
+]:
+    try:
+        rc = subprocess.run(
+            ["curl", "-sfI", "--max-time", "10", url],
+            capture_output=True, text=True, timeout=15,
+        ).returncode
+        if rc == 0:
+            ok(f"{label} reachable")
+        else:
+            fail(f"{label} unreachable (curl rc={rc})")
+    except Exception as e:
+        fail(f"{label}: {type(e).__name__}: {e}")
+
+# ---------------------------------------------------------------------------
+section(3, "huggingface_hub auth (does the token actually work?)")
+try:
+    from huggingface_hub import whoami
+    info = whoami(token=hf_tok or None)
+    ok(f"authenticated as: {info.get('name','?')} (type={info.get('type','?')})")
+    print(f" orgs: {[o.get('name') for o in info.get('orgs', [])]}")
+    print(f" access token role: {info.get('auth',{}).get('accessToken',{}).get('role','?')}")
+except Exception as e:
+    fail(f"whoami failed: {type(e).__name__}: {e}")
+    traceback.print_exc()
+
+# ---------------------------------------------------------------------------
+section(4, "PyTorch / CUDA / GPU")
+try:
+    import torch
+    print(f" torch : {torch.__version__}")
+    print(f" cuda available : {torch.cuda.is_available()}")
+    print(f" cuda version : {torch.version.cuda}")
+    if torch.cuda.is_available():
+        print(f" device count : {torch.cuda.device_count()}")
+        for i in range(torch.cuda.device_count()):
+            major, minor = torch.cuda.get_device_capability(i)
+            name = torch.cuda.get_device_name(i)
+            mark = OK if major == 9 else WARN
+            print(f" device {i} : {name} sm{major}{minor} [{mark}]")
+        major, minor = torch.cuda.get_device_capability(0)
+        if major != 9:
+            fail(f"FA3 requires sm90 (Hopper); device 0 is sm{major}{minor}")
+    else:
+        fail("CUDA not available")
+except Exception as e:
+    fail(f"torch import failed: {type(e).__name__}: {e}")
+    traceback.print_exc()
+
+# ---------------------------------------------------------------------------
+section(5, "kernels library")
+try:
+    import kernels
+    ver = getattr(kernels, "__version__", "?")
+    print(f" kernels : {ver}")
+    if ver != "?":
+        try:
+            major_minor = tuple(int(x) for x in ver.split(".")[:2])
+        except ValueError:
+            # e.g. "0.13rc1": an unparseable version string is not an import failure
+            warn(f"could not parse kernels version {ver!r}; skipping version check")
+        else:
+            if major_minor < (0, 13):
+                warn(f"kernels {ver} < 0.13 — older versions have known kernel-resolution bugs; consider 'uv pip install --upgrade kernels'")
+            else:
+                ok(f"kernels {ver} is recent")
+    print(f" kernels path : {kernels.__file__}")
+except Exception as e:
+    fail(f"kernels not importable: {type(e).__name__}: {e}")
+    traceback.print_exc()
+    sys.exit(1)
+
+# ---------------------------------------------------------------------------
+section(6, "Fetch varunneal/flash-attention-3 (THE actual nanochat code path)")
+os.environ.setdefault("HF_HUB_DISABLE_PROGRESS_BARS", "1")
+try:
+    from kernels import get_kernel
+    print(" calling get_kernel('varunneal/flash-attention-3') …")
+    k = get_kernel("varunneal/flash-attention-3")
+    ok(f"get_kernel returned: {type(k).__name__}")
+    print(f" module path: {getattr(k, '__file__', '(no __file__)')}")
+    iface = k.flash_attn_interface
+    ok(f"flash_attn_interface: {type(iface).__name__}")
+    fn = iface.flash_attn_func
+    ok(f"flash_attn_func: callable={callable(fn)}")
+    print("\n >>> FA3 binary is usable on this pod. <<<")
+except Exception as e:
+    fail(f"FA3 fetch failed: {type(e).__name__}: {e}")
+    print()
+    traceback.print_exc()
+    print()
+    print(" Likely causes:")
+    print(" 1. Network/DNS issue (HF Hub unreachable from this DC)")
+    print(" 2. Old kernels version with resolver bugs (try kernels>=0.13)")
+    print(" 3. HF token not flowing — try `export HF_HUB_TOKEN=$HF_TOKEN`")
+    print(" 4. No prebuilt binary for this torch/cuda combo (we have torch 2.9 + cu128 — should be supported)")
+
+# ---------------------------------------------------------------------------
+section(7, "HF Hub cache state")
+# NOTE(review): only HF_HOME is consulted; HUGGINGFACE_HUB_CACHE (printed in section 1) is ignored here.
+cache_root = pathlib.Path(os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface")))
+print(f" cache root : {cache_root}")
+if cache_root.exists():
+    try:
+        size = sum(p.stat().st_size for p in cache_root.rglob("*") if p.is_file())
+        print(f" size on disk : {size / 1024 / 1024:.1f} MB")
+    except Exception as e:
+        print(f" (could not size cache: {e})")
+    fa3_marker = list(cache_root.rglob("*flash*attention*3*"))
+    if fa3_marker:
+        ok(f"found FA3-related cache entries: {len(fa3_marker)}")
+        for p in fa3_marker[:5]:
+            print(f" {p}")
+    else:
+        warn("no flash-attention-3 entries in cache yet")
+else:
+    print(" (cache directory does not exist)")
+
+# ---------------------------------------------------------------------------
+section(8, "Replicate nanochat.flash_attention detection")
+try:
+    from nanochat.flash_attention import HAS_FA3, USE_FA3, _fa3
+    if HAS_FA3:
+        ok("nanochat.flash_attention.HAS_FA3 = True")
+    else:
+        fail("nanochat.flash_attention.HAS_FA3 = False (despite section 6 results)")
+    print(f" USE_FA3 = {USE_FA3}")
+    print(f" _fa3 object: {_fa3}")
+except Exception as e:
+    fail(f"import nanochat.flash_attention failed: {type(e).__name__}: {e}")
+    traceback.print_exc()
+
+# ---------------------------------------------------------------------------
+section(9, "Verdict")
+if passed_all:
+    print(f" {OK} all checks passed — FA3 is wired up and base_train should use it")
+    sys.exit(0)
+else:
+    print(f" {FAIL} one or more checks failed — see above. Run will fall back to SDPA (slower, possibly much).")
+    print()
+    print(" Continuing the training run anyway is safe; FA3 fallback to SDPA is automatic.")
+    sys.exit(1)
diff --git a/runs/runpod/smoke.sh b/runs/runpod/smoke.sh
new file mode 100755
index 00000000..86df17a3
--- /dev/null
+++ b/runs/runpod/smoke.sh
@@ -0,0 +1,146 @@
+#!/usr/bin/env bash
+# Minimal smoke test. Runs INSIDE a RunPod pod.
+# Validates: pod boot, env-var injection, git clone, uv sync, GPU torch,
+# tokenizer + base_train code paths, HF upload, runpodctl self-delete.
+# Does NOT test: multi-GPU, FP8, full training horizon, SFT, eval.
+#
+# Sized for a 1-GPU pod, completes in ~3-4 min wall clock.
+# Kick off with: GPU_COUNT=1 bash runs/runpod/kickoff.sh smoke
+#
+# Required env: HF_TOKEN, WANDB_API_KEY
+# Auto-set by RunPod: RUNPOD_POD_ID, RUNPOD_API_KEY
+
+set -euo pipefail
+
+NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}"
+NANOCHAT_REF="${NANOCHAT_REF:-dev}"
+HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}"
+WANDB_RUN="${WANDB_RUN:-smoke}"
+
+TS=$(date -u +%Y%m%dT%H%M%SZ)
+HF_PATH_PREFIX="_smoke/${TS}"
+
+WORKDIR="/workspace/nanochat"
+LOG_FILE="/workspace/runner.log"
+NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
+
+mkdir -p /workspace
+# NOTE: dockerStartCmd already redirects stdout/stderr to $LOG_FILE.
+# Don't add a second tee here — would write every line twice.
+
+# RUNPOD_POD_ID is validated further down (after the trap is installed), so don't
+# let `set -u` abort here if it is missing.
+echo "[smoke] $(date -Iseconds) starting on pod=${RUNPOD_POD_ID:-<unset>}"
+
+# Bootstrap huggingface_hub system-wide so the cleanup trap can upload logs
+# even if we fail before the venv is activated. Try pip3, then python3 -m pip.
+{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
+    python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
+    echo "[smoke] WARN: could not pre-install huggingface_hub; cleanup uploads may fail"; } || true
+
+# EXIT trap: upload logs to HF, delete this pod, then exit with the script's original status.
+cleanup() {
+    local rc=$?
+    # Best-effort from here on: a failed step or an unset variable must not abort the cleanup.
+    set +eu
+    local pod_id="${RUNPOD_POD_ID:-}"
+    echo "[smoke] cleanup: exit code $rc"
+
+    # Always upload the runner log (success or failure) so we can see what happened
+    mkdir -p /tmp/smoke-out
+    cp /workspace/*.log /tmp/smoke-out/ 2>/dev/null || true
+    echo "rc=$rc ts=$TS pod=$pod_id" > /tmp/smoke-out/result.txt
+    [ -d "$WORKDIR" ] && (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/smoke-out/git-head.txt || true)
+    hf upload "$HF_REPO" /tmp/smoke-out "$HF_PATH_PREFIX" \
+        --repo-type model --commit-message "smoke rc=$rc $TS" || \
+        echo "[smoke] WARN: HF upload failed"
+
+    echo "[smoke] artifacts: https://huggingface.co/$HF_REPO/tree/main/$HF_PATH_PREFIX"
+    if [ -z "$pod_id" ]; then
+        echo "[smoke] WARN: RUNPOD_POD_ID is not set — skipping self-delete"
+        exit "$rc"
+    fi
+    echo "[smoke] self-deleting pod $pod_id"
+    # REST API first — pod-scoped key has delete permission and the API is reliable.
+    # The pod's preinstalled runpodctl is unreliable (often missing config or 'pod' subcommand).
+    if curl -fsS -X DELETE \
+        -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
+        "https://rest.runpod.io/v1/pods/$pod_id" 2>&1; then
+        echo "[smoke] REST delete request accepted"
+    else
+        echo "[smoke] REST delete failed, trying runpodctl as fallback"
+        runpodctl pod delete "$pod_id" 2>&1 || \
+            runpodctl remove pod "$pod_id" 2>&1 || \
+            echo "[smoke] WARN: all delete methods failed — pod may need manual cleanup"
+    fi
+    exit "$rc"
+}
+trap cleanup EXIT
+
+: "${HF_TOKEN:?HF_TOKEN must be set}"
+: "${WANDB_API_KEY:?WANDB_API_KEY must be set}"
+: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}"
+
+# Clone fork
+rm -rf "$WORKDIR"
+git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
+cd "$WORKDIR"
+# `--` disambiguates ref-vs-file (some images create a `dev` file in HOME)
+git checkout "$NANOCHAT_REF" --
+echo "[smoke] HEAD = $(git rev-parse HEAD)"
+
+# Env + uv
+export OMP_NUM_THREADS=1
+export NANOCHAT_BASE_DIR
+mkdir -p "$NANOCHAT_BASE_DIR"
+command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
+# A freshly installed uv lands in ~/.local/bin, which is not necessarily on PATH yet.
+export PATH="$HOME/.local/bin:$PATH"
+[ -d ".venv" ] || uv venv
+uv sync --extra gpu
+source .venv/bin/activate
+# uv-created venvs ship without pip, so a bare `pip` would hit the system interpreter instead.
+uv pip install --quiet --upgrade huggingface_hub
+
+# Ensure HF token flows to the kernels lib (some libs read HF_HUB_TOKEN, not HF_TOKEN)
+export HF_HUB_TOKEN="${HF_TOKEN}"
+
+# Bump kernels to latest — pyproject pins >=0.11.7, uv often picks exactly that;
+# 0.11.x had kernel-resolution bugs that affect FA3 loading silently.
+echo "[smoke] upgrading kernels lib for FA3 reliability"
+uv pip install --quiet --upgrade 'kernels>=0.13.0' 2>&1 || \
+    echo "[smoke] WARN: kernels upgrade failed (continuing with whatever uv installed)"
+
+# GPU sanity
+python -c "import torch; print('[smoke] torch', torch.__version__, 'cuda', torch.cuda.is_available(), 'devices', torch.cuda.device_count())"
+
+# FA3 diagnostic probe — surfaces the real error if FA3 won't load (nanochat
+# silently swallows it). Non-fatal: SDPA fallback is automatic if probe fails.
+echo "[smoke] === FA3 PROBE BEGIN ==="
+python "$WORKDIR/runs/runpod/probe_fa3.py" || echo "[smoke] FA3 probe reported issues (non-fatal — continuing with SDPA fallback)"
+echo "[smoke] === FA3 PROBE END ==="
+
+# Minimum dataset + tokenizer (1 shard, 50M chars — enough for the tokenizer
+# to train on AND for base_train to consume 20 iterations of tokens)
+python -m nanochat.dataset -n 1
+python -m scripts.tok_train --max-chars=50000000
+
+# Tiny base_train. Params from base_train.py docstring (the CPU smoke), adjusted
+# slightly for GPU. depth=4, 20 iterations. Should finish in ~30s.
+NPROC=$(nvidia-smi -L | wc -l)
+echo "[smoke] training on $NPROC GPU(s)"
+torchrun --standalone --nproc_per_node="$NPROC" -m scripts.base_train -- \
+    --depth=4 \
+    --max-seq-len=512 \
+    --device-batch-size=1 \
+    --total-batch-size=512 \
+    --num-iterations=20 \
+    --eval-every=10 \
+    --eval-tokens=512 \
+    --core-metric-every=-1 \
+    --sample-every=-1 \
+    --save-every=-1 \
+    --run="$WANDB_RUN"
+
+echo "[smoke] $(date -Iseconds) base_train complete — smoke passed"
+# trap cleanup handles HF upload + self-delete
diff --git a/scripts/base_train.py b/scripts/base_train.py
index a161c477..ec037bbb 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -52,6 +52,7 @@ parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = de
 parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention")
 parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length")
 parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g.
'SSL')") +parser.add_argument("--xsa", action="store_true", help="enable XSA") # Training horizon (only one used, in order of precedence) parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") @@ -102,6 +103,8 @@ wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", # Flash Attention status from nanochat.flash_attention import USE_FA3 using_fa3 = USE_FA3 +if args.xsa: + print0("XSA enabled") if using_fa3: print0("✓ Using Flash Attention 3 (Hopper GPU detected), efficient, new and awesome.") else: @@ -136,7 +139,7 @@ def build_model_meta(depth): config = GPTConfig( sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=depth, n_head=num_heads, n_kv_head=num_heads, n_embd=model_dim, - window_pattern=args.window_pattern, + window_pattern=args.window_pattern, use_xsa=args.xsa, ) with torch.device("meta"): model_meta = GPT(config) diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index b46dd817..e085b119 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -415,6 +415,7 @@ while True: "n_kv_head": model.config.n_kv_head, "n_embd": model.config.n_embd, "window_pattern": model.config.window_pattern, + "use_xsa": model.config.use_xsa, }, "user_config": user_config, # inputs to the training script },