Set speedrun default to d22 bigram recipe

2026-07-06 12:59:14 +00:00 · 2026-05-07 09:02:55 +00:00 · 2026-05-07 09:02:55 +00:00 · 0de3a39910
commit 0de3a39910
parent e014abacc6
2 changed files with 20 additions and 6 deletions
--- a/dev/bigram_minimal_pr_changes.md
+++ b/dev/bigram_minimal_pr_changes.md
@@ -6,12 +6,17 @@ best-performing speedrun recipe:

 ```bash
 --fp8
+--depth=22
+--num-iterations=11600
+--total-batch-size=524288
 --bigram-embed-factor=5
 --muon-plus
 --muon-eq=row
 --scalar-lr=0.3
 --train-log-every=50
 --compile-mode=max-autotune-no-cudagraphs
+--eval-every=250
+--core-metric-every=5800
 ```

 It does not include the experimental branches that were tested and rejected:
@@ -167,13 +172,17 @@ step-0 validation pass when it is not needed for a speedrun submission.
 Updates the default speedrun command to use the winning recipe flags:

 - FP8
- total batch size `1048576`
+- depth `22`
+- fixed `11600` optimizer steps
+- total batch size `524288`
 - Muon+
 - row equilibration
 - bigram factor 5
 - scalar LR `0.3`
 - log every 50 training steps
 - `max-autotune-no-cudagraphs` compile mode
+- validation every 250 steps
+- one CORE metric pass halfway through at step 5800

 This script is the intended entry point for reproducing the submitted run.

--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -69,12 +69,15 @@ python -m scripts.tok_eval
 echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID

-# d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8)
+# d22 Muon+/row-eq + hashed bigram recipe.
+# This is the submission default: fixed 11,600 optimizer steps, eval every 250,
+# and one in-training CORE pass halfway through.
 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \
-    --depth=24 \
-    --target-param-data-ratio=8 \
-    --device-batch-size=16 \
-    --total-batch-size=1048576 \
+    --depth=22 \
+    --num-iterations=11600 \
+    --target-param-data-ratio=11 \
+    --device-batch-size=32 \
+    --total-batch-size=524288 \
    --fp8 \
    --compile-mode=max-autotune-no-cudagraphs \
    --muon-plus \
@@ -82,6 +85,8 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \
    --bigram-embed-factor=5 \
    --scalar-lr=0.3 \
    --train-log-every=50 \
+    --eval-every=250 \
+    --core-metric-every=5800 \
    --run=$WANDB_RUN
 # evaluate the model: CORE metric, BPB on train/val, and draw samples
 torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16