From ae0bf525299633d973d39ecf996edcb48e1fa6f5 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 5 Jan 2026 18:57:46 +0000 Subject: [PATCH] tune hyperparameters based on overnight sweeps. warmdown_ratio is the biggest free win, increasing 0.2 -> 0.4, and embedding lr can be larger bumping 0.2 -> 0.3 --- scripts/base_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index 2390b68..c8345e0 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -47,13 +47,13 @@ parser.add_argument("--target_param_data_ratio", type=int, default=20, help="cal # Optimization parser.add_argument("--device_batch_size", type=int, default=32, help="per-device batch size") parser.add_argument("--total_batch_size", type=int, default=524288, help="total batch size in tokens") -parser.add_argument("--embedding_lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") +parser.add_argument("--embedding_lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)") parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") parser.add_argument("--grad_clip", type=float, default=1.0, help="gradient clipping value (0.0 = disabled)") parser.add_argument("--warmup_ratio", type=float, default=0.0, help="ratio of iterations for LR warmup") -parser.add_argument("--warmdown_ratio", type=float, default=0.2, help="ratio of iterations for LR warmdown") +parser.add_argument("--warmdown_ratio", type=float, default=0.4, help="ratio of iterations for LR warmdown") parser.add_argument("--final_lr_frac", type=float, default=0.0, help="final LR as fraction of initial LR") parser.add_argument("--resume_from_step", type=int, default=-1, help="resume training from this step (-1 = disable)") # Evaluation