diff --git a/dev/bigram_minimal_pr_changes.md b/dev/bigram_minimal_pr_changes.md
index 24d8ecf1..bdc11ff2 100644
--- a/dev/bigram_minimal_pr_changes.md
+++ b/dev/bigram_minimal_pr_changes.md
@@ -6,12 +6,17 @@ best-performing speedrun recipe:
 
 ```bash
 --fp8
+--depth=22
+--num-iterations=11600
+--total-batch-size=524288
 --bigram-embed-factor=5
 --muon-plus
 --muon-eq=row
 --scalar-lr=0.3
 --train-log-every=50
 --compile-mode=max-autotune-no-cudagraphs
+--eval-every=250
+--core-metric-every=5800
 ```
 
 It does not include the experimental branches that were tested and rejected:
@@ -167,13 +172,17 @@ step-0 validation pass when it is not needed for a speedrun submission.
 Updates the default speedrun command to use the winning recipe flags:
 
 - FP8
-- total batch size `1048576`
+- depth `22`
+- fixed `11600` optimizer steps
+- total batch size `524288`
 - Muon+
 - row equilibration
 - bigram factor 5
 - scalar LR `0.3`
 - log every 50 training steps
 - `max-autotune-no-cudagraphs` compile mode
+- validation every 250 steps
+- one CORE metric pass halfway through at step 5800
 
 This script is the intended entry point for reproducing the submitted run.
 
diff --git a/runs/speedrun.sh b/runs/speedrun.sh
index 9f780faf..8dab8cf0 100644
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -69,12 +69,15 @@ python -m scripts.tok_eval
 echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID
 
-# d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8)
+# d22 Muon+/row-eq + hashed bigram recipe.
+# This is the submission default: fixed 11,600 optimizer steps, eval every 250,
+# and one in-training CORE pass halfway through.
 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \
-    --depth=24 \
-    --target-param-data-ratio=8 \
-    --device-batch-size=16 \
-    --total-batch-size=1048576 \
+    --depth=22 \
+    --num-iterations=11600 \
+    --target-param-data-ratio=11 \
+    --device-batch-size=32 \
+    --total-batch-size=524288 \
     --fp8 \
     --compile-mode=max-autotune-no-cudagraphs \
     --muon-plus \
@@ -82,6 +85,8 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \
     --bigram-embed-factor=5 \
     --scalar-lr=0.3 \
     --train-log-every=50 \
+    --eval-every=250 \
+    --core-metric-every=5800 \
     --run=$WANDB_RUN
 # evaluate the model: CORE metric, BPB on train/val, and draw samples
 torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16