Merge 861414d500 into 4e1694cc95

2026-06-15 10:39:08 +00:00 · 2026-03-25 11:04:13 +02:00 · 2026-03-25 11:04:13 +02:00 · 3d9856061d
commit 3d9856061d
parent 4e1694cc95 861414d500
2 changed files with 9 additions and 8 deletions
--- a/runs/runcpu.sh
+++ b/runs/runcpu.sh
@@ -42,11 +42,12 @@ python -m scripts.base_train \
    --sample-every=100 \
    --num-iterations=5000 \
    --run=$WANDB_RUN
-python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16
+python -m scripts.base_eval --model-tag=d6 --device-batch-size=1 --split-tokens=16384 --max-per-task=16

 # SFT (~10 minutes on my MacBook Pro M3 Max)
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 python -m scripts.chat_sft \
+    --model-tag=d6 \
    --max-seq-len=512 \
    --device-batch-size=32 \
    --total-batch-size=16384 \
@@ -59,7 +60,7 @@ python -m scripts.chat_sft \
 # The model should be able to say that it is Paris.
 # It might even know that the color of the sky is blue.
 # Sometimes the model likes it if you first say Hi before you ask it questions.
-# python -m scripts.chat_cli -p "What is the capital of France?"
+# python -m scripts.chat_cli --model-tag=d6 -p "What is the capital of France?"

 # Chat with the model over a pretty WebUI ChatGPT style
-# python -m scripts.chat_web
+# python -m scripts.chat_web --model-tag=d6
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -72,7 +72,7 @@ wait $DATASET_DOWNLOAD_PID
 # d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8)
 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=8 --device-batch-size=16 --fp8 --run=$WANDB_RUN
 # evaluate the model: CORE metric, BPB on train/val, and draw samples
-torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16
+torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --model-tag=d24 --device-batch-size=16

 # -----------------------------------------------------------------------------
 # SFT (teach the model conversation special tokens, tool use, multiple choice)
@@ -82,14 +82,14 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl

 # run SFT and eval the model
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --device-batch-size=16 --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
+torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --model-tag=d24 --device-batch-size=16 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- --model-tag=d24 -i sft

 # chat with the model over CLI! Leave out the -p to chat interactively
-# python -m scripts.chat_cli -p "Why is the sky blue?"
+# python -m scripts.chat_cli --model-tag=d24 -p "Why is the sky blue?"

 # even better, chat with your model over a pretty WebUI ChatGPT style
-# python -m scripts.chat_web
+# python -m scripts.chat_web --model-tag=d24

 # -----------------------------------------------------------------------------
 # Generate the full report by putting together all the sections