From eb66bbd4e246d759374f7010b4132c1f5869b8dc Mon Sep 17 00:00:00 2001
From: Hayden Free
Date: Sun, 26 Apr 2026 00:54:23 -0400
Subject: [PATCH] =?UTF-8?q?Split=20tokenizer=20+=20base=20downloads=20?=
 =?UTF-8?q?=E2=80=94=20hf=20download=20--include=20only=20honors=20last=20?=
 =?UTF-8?q?value?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
Notes:
  - The tokenizer and base_checkpoints/d12 are fetched with two separate
    `hf download` invocations, each carrying a single --include pattern.
  - After the downloads the script lists both directories (best effort,
    never fatal) and then hard-fails with exit status 1 if
    tokenizer/tokenizer.pkl is absent or base_checkpoints/d12 has no entries.
  - Failure diagnostics are written to stderr so they are not lost when
    stdout is redirected or piped.

 runs/runpod/d12_sft_only.sh | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/runs/runpod/d12_sft_only.sh b/runs/runpod/d12_sft_only.sh
index 42fc3567..46f2fd94 100755
--- a/runs/runpod/d12_sft_only.sh
+++ b/runs/runpod/d12_sft_only.sh
@@ -112,16 +112,28 @@ export HF_HUB_TOKEN="${HF_TOKEN}"
 echo "[sft] installing hf_transfer (the bug from last run)"
 uv pip install --quiet hf_transfer
 
-# Pull tokenizer + base checkpoint from HF — skip base_train entirely
-echo "[sft] downloading tokenizer and base_checkpoints/d12 from $HF_REPO"
+# Pull tokenizer + base checkpoint from HF in TWO separate calls.
+# `hf download` only honors the LAST --include when specified multiple times
+# (multi-include works for upload, not download — verified the hard way).
+echo "[sft] downloading tokenizer from $HF_REPO"
 hf download "$HF_REPO" \
   --include "tokenizer/**" \
+  --local-dir "$NANOCHAT_BASE_DIR" \
+  --repo-type model
+
+echo "[sft] downloading base_checkpoints/d12 from $HF_REPO"
+hf download "$HF_REPO" \
   --include "base_checkpoints/d12/**" \
   --local-dir "$NANOCHAT_BASE_DIR" \
   --repo-type model
 
-ls -la "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" || true
-ls -la "$NANOCHAT_BASE_DIR/tokenizer/" || true
+# Verify both pieces actually landed before invoking chat_sft.
+echo "[sft] verifying downloads"
+ls -la "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" 2>&1 || true
+ls -la "$NANOCHAT_BASE_DIR/tokenizer/" 2>&1 || true
+[ -f "$NANOCHAT_BASE_DIR/tokenizer/tokenizer.pkl" ] || { echo "[sft] FAIL: tokenizer.pkl missing after download" >&2; exit 1; }
+[ -n "$(ls -A "$NANOCHAT_BASE_DIR/base_checkpoints/d12/" 2>/dev/null)" ] || { echo "[sft] FAIL: base_checkpoints/d12 is missing or empty" >&2; exit 1; }
+echo "[sft] downloads verified"
 
 # Also need identity_conversations.jsonl for SFT (speedrun.sh normally fetches it)
 echo "[sft] fetching identity_conversations.jsonl"