From 861414d500cfa74ff81d7610aa814fb511124894 Mon Sep 17 00:00:00 2001 From: Matt Van Horn Date: Mon, 9 Mar 2026 07:52:48 -0700 Subject: [PATCH] add explicit --model-tag to run scripts Without --model-tag, chat_sft/chat_eval/chat_cli/chat_web/base_eval can pick the wrong model when multiple models exist in the cache. Add explicit --model-tag=d6 (runcpu) and --model-tag=d24 (speedrun) matching the depth used in each script's base_train call. Co-Authored-By: Claude Opus 4.6 --- runs/runcpu.sh | 7 ++++--- runs/speedrun.sh | 10 +++++----- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/runs/runcpu.sh b/runs/runcpu.sh index 853fa1f..eb236cd 100755 --- a/runs/runcpu.sh +++ b/runs/runcpu.sh @@ -42,11 +42,12 @@ python -m scripts.base_train \ --sample-every=100 \ --num-iterations=5000 \ --run=$WANDB_RUN -python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16 +python -m scripts.base_eval --model-tag=d6 --device-batch-size=1 --split-tokens=16384 --max-per-task=16 # SFT (~10 minutes on my MacBook Pro M3 Max) curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl python -m scripts.chat_sft \ + --model-tag=d6 \ --max-seq-len=512 \ --device-batch-size=32 \ --total-batch-size=16384 \ @@ -59,7 +60,7 @@ python -m scripts.chat_sft \ # The model should be able to say that it is Paris. # It might even know that the color of the sky is blue. # Sometimes the model likes it if you first say Hi before you ask it questions. -# python -m scripts.chat_cli -p "What is the capital of France?" +# python -m scripts.chat_cli --model-tag=d6 -p "What is the capital of France?" 
# Chat with the model over a pretty WebUI ChatGPT style -# python -m scripts.chat_web +# python -m scripts.chat_web --model-tag=d6 diff --git a/runs/speedrun.sh b/runs/speedrun.sh index fa50694..36ee142 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -72,7 +72,7 @@ wait $DATASET_DOWNLOAD_PID # d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 9.5) torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=9.5 --device-batch-size=16 --fp8 --run=$WANDB_RUN # evaluate the model: CORE metric, BPB on train/val, and draw samples -torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16 +torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --model-tag=d24 --device-batch-size=16 # ----------------------------------------------------------------------------- # SFT (teach the model conversation special tokens, tool use, multiple choice) @@ -82,14 +82,14 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch- curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl # run SFT and eval the model -torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --device-batch-size=16 --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft +torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --model-tag=d24 --device-batch-size=16 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- --model-tag=d24 -i sft # chat with the model over CLI! Leave out the -p to chat interactively -# python -m scripts.chat_cli -p "Why is the sky blue?" +# python -m scripts.chat_cli --model-tag=d24 -p "Why is the sky blue?" 
# even better, chat with your model over a pretty WebUI ChatGPT style -# python -m scripts.chat_web +# python -m scripts.chat_web --model-tag=d24 # ----------------------------------------------------------------------------- # Generate the full report by putting together all the sections