mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-13 19:30:23 +00:00
Add SFT-only resume runner; install hf_transfer in d12 runner
This commit is contained in:
parent
39cee6fc76
commit
22eeed326c
|
|
@ -65,6 +65,12 @@ cleanup() {
|
|||
--exclude "base_data_climbmix/**" --exclude "wandb/**" || \
|
||||
echo "[runner] WARN: final upload failed"
|
||||
fi
|
||||
# Also upload the runner log so we have a permanent record of this successful run.
|
||||
if [ -f "$LOG_FILE" ]; then
|
||||
hf upload "$HF_REPO" "$LOG_FILE" "_runs/${TS}/runner.log" \
|
||||
--repo-type model --commit-message "runner log $TS" || \
|
||||
echo "[runner] WARN: runner.log upload failed"
|
||||
fi
|
||||
else
|
||||
echo "[runner] failure rc=$rc — dumping logs to HF for offline debug"
|
||||
mkdir -p /tmp/failure
|
||||
|
|
@ -115,8 +121,11 @@ echo "[runner] HEAD = $(git rev-parse HEAD)"
|
|||
|
||||
sed -i 's/--depth=24/--depth=12/' runs/speedrun.sh
|
||||
sed -i 's/ --target-param-data-ratio=8//' runs/speedrun.sh
|
||||
# Inject `set -euo pipefail` so a mid-pipeline failure (e.g. chat_sft) propagates
|
||||
# as rc!=0 instead of being silently swallowed by the next command.
|
||||
sed -i '1a set -euo pipefail' runs/speedrun.sh
|
||||
echo "[runner] speedrun.sh edits applied:"
|
||||
grep -n 'depth\|target-param' runs/speedrun.sh || true
|
||||
grep -n 'depth\|target-param\|set -e' runs/speedrun.sh || true
|
||||
|
||||
# Explicit venv setup BEFORE speedrun.sh so we can run diagnostic probes
|
||||
# inside the venv. speedrun.sh's uv sync is idempotent (no-op the second time).
|
||||
|
|
@ -138,6 +147,12 @@ echo "[runner] upgrading kernels lib for FA3 reliability"
|
|||
uv pip install --quiet --upgrade 'kernels>=0.13.0' 2>&1 || \
|
||||
echo "[runner] WARN: kernels upgrade failed (continuing)"
|
||||
|
||||
# Install hf_transfer — runpod base image sets HF_HUB_ENABLE_HF_TRANSFER=1, which
|
||||
# makes huggingface_hub raise ValueError if the package is missing. chat_sft loads
|
||||
# HuggingFaceTB/smol-smoltalk via datasets and crashes without this.
|
||||
echo "[runner] installing hf_transfer for SFT dataset download"
|
||||
uv pip install --quiet hf_transfer 2>&1 || echo "[runner] WARN: hf_transfer install failed"
|
||||
|
||||
# FA3 diagnostic probe — surfaces real errors (nanochat silently swallows them).
|
||||
# Non-fatal: SDPA fallback is automatic. We want this output in the log
|
||||
# regardless of outcome so we can decide what to do about FA3.
|
||||
|
|
@ -163,4 +178,19 @@ echo "[runner] backup loop pid=$BACKUP_PID interval=${BACKUP_INTERVAL}s"
|
|||
export WANDB_RUN
|
||||
WANDB_RUN="$WANDB_RUN" bash runs/speedrun.sh
|
||||
|
||||
# Verify expected pipeline outputs — speedrun.sh historically didn't `set -e`;
|
||||
# we patched it above, but double-check the artifacts that matter for the d12 baseline.
|
||||
echo "[runner] verifying pipeline outputs"
|
||||
missing=()
|
||||
for required in base_checkpoints/d12 chatsft_checkpoints/d12 tokenizer report; do
|
||||
if [ ! -d "$NANOCHAT_BASE_DIR/$required" ]; then
|
||||
missing+=("$required")
|
||||
fi
|
||||
done
|
||||
if [ ${#missing[@]} -gt 0 ]; then
|
||||
echo "[runner] FAIL: pipeline finished but missing expected artifacts: ${missing[*]}"
|
||||
exit 1
|
||||
fi
|
||||
echo "[runner] all expected artifacts present"
|
||||
|
||||
echo "[runner] $(date -Iseconds) pipeline complete"
|
||||
|
|
|
|||
149
runs/runpod/d12_sft_only.sh
Executable file
149
runs/runpod/d12_sft_only.sh
Executable file
|
|
@ -0,0 +1,149 @@
|
|||
#!/usr/bin/env bash
|
||||
# d12 SFT-only resume runner. Runs INSIDE a RunPod pod.
|
||||
#
|
||||
# Use case: the d12 base_train + base_eval already succeeded and uploaded to HF,
|
||||
# but chat_sft failed (e.g., missing hf_transfer package). Instead of re-running
|
||||
# the whole pipeline, this runner:
|
||||
# 1. Downloads base_checkpoints/d12/ + tokenizer/ from HF
|
||||
# 2. Installs hf_transfer (the actual SFT bug fix)
|
||||
# 3. Runs chat_sft + chat_eval directly (skips speedrun.sh)
|
||||
# 4. Uploads chatsft_checkpoints/ + chat_eval results + report to HF
|
||||
# 5. Self-deletes
|
||||
#
|
||||
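# Required env: HF_TOKEN, WANDB_API_KEY, RUNPOD_POD_ID (set automatically by RunPod)
# Used if set:  RUNPOD_API_KEY (bearer token for the REST self-delete call)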
|
||||
# Optional env:
|
||||
# WANDB_RUN default: d12-sft
|
||||
# NANOCHAT_REPO default: Team-XSA/nanochat
|
||||
# NANOCHAT_REF default: dev
|
||||
# HF_REPO default: haydenfree/nanochat-d12-baseline (where the base lives)
|
||||
|
||||
# Strict mode: abort on command failure, on unset variables, and on a failure
# in any stage of a pipeline.
set -euo pipefail

# --- Configuration: each value may be overridden from the environment --------
NANOCHAT_REPO="${NANOCHAT_REPO:-Team-XSA/nanochat}"   # GitHub "owner/name" to clone
NANOCHAT_REF="${NANOCHAT_REF:-dev}"                   # ref checked out after cloning
HF_REPO="${HF_REPO:-haydenfree/nanochat-d12-baseline}"
WANDB_RUN="${WANDB_RUN:-d12-sft}"

# --- Fixed paths ---------------------------------------------------------------
WORKDIR="/workspace/nanochat"
# NOTE(review): nothing in this script writes to LOG_FILE; it is only uploaded
# if it exists. Presumably the launcher redirects this script's output there —
# confirm.
LOG_FILE="/workspace/runner.log"
NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"

mkdir -p /workspace
# RUNPOD_POD_ID is validated further down, after the EXIT trap is installed.
# Expand it defensively here: under `set -u` a bare $RUNPOD_POD_ID would abort
# the script on this line with "unbound variable", before that validation (and
# its clearer message) is ever reached.
echo "[sft] $(date -Iseconds) starting on pod=${RUNPOD_POD_ID:-<unset>}"
echo "[sft] resuming from base checkpoint at $HF_REPO"

# Bootstrap huggingface_hub system-wide so cleanup can upload logs even on early failure.
# Tries pip3, then `python3 -m pip`; if both fail only a warning is printed and
# the script carries on (the trailing `|| true` keeps `set -e` from aborting).
{ pip3 install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
  python3 -m pip install --break-system-packages --quiet --upgrade huggingface_hub 2>&1 || \
  echo "[sft] WARN: could not pre-install huggingface_hub"; } || true
|
||||
|
||||
# EXIT-trap handler. On success it uploads the SFT artifacts and the runner log
# to the HF repo; on failure it uploads a bundle of logs instead. In both cases
# it then asks RunPod to delete this pod and exits with the original status.
#
# Globals read: HF_REPO, NANOCHAT_BASE_DIR, LOG_FILE, WORKDIR,
#               RUNPOD_POD_ID, RUNPOD_API_KEY (both may be unset)
# Exits:        with the value $? had when the handler was entered
cleanup() {
  local rc=$?
  # Everything below is best-effort: one failed upload must not stop the rest.
  set +e
  echo "[sft] cleanup: exit code $rc at $(date -Iseconds)"

  local TS subdir pod_id
  TS=$(date -u +%Y%m%dT%H%M%SZ)

  if [ "$rc" -eq 0 ]; then
    echo "[sft] success — uploading chatsft_checkpoints + report + log"
    # Only upload the SFT-specific subdirs so we don't re-upload base.
    for subdir in chatsft_checkpoints report; do
      if [ -d "$NANOCHAT_BASE_DIR/$subdir" ]; then
        hf upload "$HF_REPO" "$NANOCHAT_BASE_DIR/$subdir" "$subdir" \
          --repo-type model --commit-message "$subdir SFT-resume rc=0 $TS" || \
          echo "[sft] WARN: $subdir upload failed"
      fi
    done
    if [ -f "$LOG_FILE" ]; then
      hf upload "$HF_REPO" "$LOG_FILE" "_runs/${TS}-sft/runner.log" \
        --repo-type model --commit-message "SFT runner log $TS" || \
        echo "[sft] WARN: runner.log upload failed"
    fi
  else
    echo "[sft] failure rc=$rc — dumping logs"
    mkdir -p /tmp/failure
    # Each piece of the bundle is optional; a missing one is silently skipped.
    cp /workspace/*.log /tmp/failure/ 2>/dev/null || true
    if [ -d "$NANOCHAT_BASE_DIR/report" ]; then
      cp -r "$NANOCHAT_BASE_DIR/report" /tmp/failure/ 2>/dev/null || true
    fi
    if [ -d "$WORKDIR" ]; then
      (cd "$WORKDIR" && git rev-parse HEAD 2>/dev/null > /tmp/failure/git-head.txt || true)
    fi
    hf upload "$HF_REPO" /tmp/failure "_failures/${TS}-sft-rc${rc}/logs" \
      --repo-type model --commit-message "SFT-resume failure rc=$rc $TS" || \
      echo "[sft] WARN: log upload failed"
  fi

  # `set +e` above does not turn off nounset, so a bare $RUNPOD_POD_ID would
  # abort this handler (and replace the exit status) when the variable is
  # missing. Without an id there is nothing to delete, so warn and stop here.
  pod_id="${RUNPOD_POD_ID:-}"
  if [ -z "$pod_id" ]; then
    echo "[sft] WARN: RUNPOD_POD_ID is not set — skipping self-delete; pod may need manual cleanup"
    exit "$rc"
  fi

  echo "[sft] self-deleting pod $pod_id"
  if curl -fsS -X DELETE \
      -H "Authorization: Bearer ${RUNPOD_API_KEY:-}" \
      "https://rest.runpod.io/v1/pods/$pod_id" 2>&1; then
    echo "[sft] REST delete request accepted"
  else
    echo "[sft] REST delete failed, trying runpodctl as fallback"
    # Two runpodctl spellings are tried in turn before giving up.
    runpodctl pod delete "$pod_id" 2>&1 || \
      runpodctl remove pod "$pod_id" 2>&1 || \
      echo "[sft] WARN: all delete methods failed — pod may need manual cleanup"
  fi
  exit "$rc"
}
|
||||
# Installed before the env checks below, so a failed check still runs cleanup.
trap cleanup EXIT

# Fail fast with a clear message if a required variable is unset or empty.
: "${HF_TOKEN:?HF_TOKEN must be set}"
: "${WANDB_API_KEY:?WANDB_API_KEY must be set}"
: "${RUNPOD_POD_ID:?RUNPOD_POD_ID must be set (auto by RunPod)}"

# Clone fork (always from scratch; `:?` refuses to run rm -rf on an empty path)
rm -rf "${WORKDIR:?}"
git clone "https://github.com/${NANOCHAT_REPO}.git" "$WORKDIR"
cd "$WORKDIR"
git checkout "$NANOCHAT_REF" --
echo "[sft] HEAD = $(git rev-parse HEAD)"

# Env + uv
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR
mkdir -p "$NANOCHAT_BASE_DIR"
if ! command -v uv &> /dev/null; then
  curl -LsSf https://astral.sh/uv/install.sh | sh
  # The installer does not change this shell's PATH; add its default target
  # directory so the `uv` calls below resolve.
  export PATH="$HOME/.local/bin:$PATH"
fi
[ -d ".venv" ] || uv venv
uv sync --extra gpu
source .venv/bin/activate
# Use `uv pip`, as the hf_transfer install below does: a venv created by
# `uv venv` has no pip of its own, so a bare `pip` would resolve to one
# outside the venv.
uv pip install --quiet --upgrade huggingface_hub
# HF_TOKEN is the variable huggingface_hub reads; make sure child processes
# inherit it. HF_HUB_TOKEN is kept as the original script exported it.
# NOTE(review): HF_HUB_TOKEN is not a documented huggingface_hub variable —
# confirm whether anything consumes it.
export HF_TOKEN
export HF_HUB_TOKEN="${HF_TOKEN}"

# Install hf_transfer — THE actual fix for the previous SFT failure.
echo "[sft] installing hf_transfer (the bug from last run)"
uv pip install --quiet hf_transfer
|
||||
|
||||
# Pull tokenizer + base checkpoint from HF — skip base_train entirely
echo "[sft] downloading tokenizer and base_checkpoints/d12 from $HF_REPO"
hf download "$HF_REPO" \
  --include "tokenizer/**" \
  --include "base_checkpoints/d12/**" \
  --local-dir "$NANOCHAT_BASE_DIR" \
  --repo-type model

# Directory listings are for the log only, hence `|| true`.
ls -la "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" || true
ls -la "$NANOCHAT_BASE_DIR/tokenizer/" || true

# Stop here if the download produced nothing usable, instead of letting
# chat_sft fail later on a missing checkpoint or tokenizer.
for required in base_checkpoints/d12 tokenizer; do
  if [ ! -d "$NANOCHAT_BASE_DIR/$required" ]; then
    echo "[sft] FAIL: $required missing after download from $HF_REPO"
    exit 1
  fi
done

# Also need identity_conversations.jsonl for SFT (speedrun.sh normally fetches it)
echo "[sft] fetching identity_conversations.jsonl"
curl -L -fsS -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" \
  https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl

# Run only SFT + chat_eval + report. NOT speedrun.sh (which would re-do base_train).
# Count only the "GPU N: ..." lines of `nvidia-smi -L` so any other output line
# is not mistaken for a device. `|| true` keeps a zero count (grep exits 1) from
# tripping `set -e` before the explicit check below can report it.
NPROC=$(nvidia-smi -L | grep -c '^GPU ' || true)
if [ "${NPROC:-0}" -lt 1 ]; then
  echo "[sft] FAIL: nvidia-smi reported no GPUs"
  exit 1
fi
echo "[sft] running chat_sft on $NPROC GPUs"
torchrun --standalone --nproc_per_node="$NPROC" -m scripts.chat_sft -- \
  --device-batch-size=16 --run="$WANDB_RUN"

echo "[sft] running chat_eval"
torchrun --standalone --nproc_per_node="$NPROC" -m scripts.chat_eval -- -i sft

echo "[sft] regenerating report (will include new SFT sections)"
# Report generation is best-effort; a failure here does not fail the run.
python -m nanochat.report generate || true

# Verify SFT artifacts exist before declaring success
if [ ! -d "$NANOCHAT_BASE_DIR/chatsft_checkpoints" ]; then
  echo "[sft] FAIL: chatsft_checkpoints/ missing after chat_sft"
  exit 1
fi

echo "[sft] $(date -Iseconds) SFT pipeline complete"
|
||||
Loading…
Reference in New Issue
Block a user