new optimal ratio for d26 training

2026-05-14 03:38:02 +00:00 · 2026-02-06 19:21:27 +00:00 · 2026-02-06 19:21:27 +00:00 · 685271dc8d
commit 685271dc8d
parent e527521a3f
1 changed file with 1 addition and 1 deletion
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -70,7 +70,7 @@ echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID

 # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12)
-torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.5 --device-batch-size=16 --fp8 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.25 --device-batch-size=16 --fp8 --run=$WANDB_RUN
 # evaluate the model: CORE metric, BPB on train/val, and draw samples
 torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16