From eaf49a33c8e85c6066878bbacc45edcbc4f6ee83 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sun, 1 Feb 2026 20:15:19 +0000
Subject: [PATCH] fix path which i think was modified during the refactor and this is a bug introduced by claude i believe

---
 scripts/chat_sft.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py
index 91300b6..cad0d81 100644
--- a/scripts/chat_sft.py
+++ b/scripts/chat_sft.py
@@ -48,7 +48,7 @@ parser.add_argument("--max-seq-len", type=int, default=2048, help="max context l
 parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
 parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens")
 # Optimization
-parser.add_argument("--embedding-lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)")
+parser.add_argument("--embedding-lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)")
 parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)")
 parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)")
 parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)")
@@ -285,7 +285,7 @@ while True:
     # save checkpoint at the end of the run (only on master process)
     if master_process and last_step and not args.dry_run:
         output_dirname = args.model_tag if args.model_tag else f"d{depth}" # e.g. d12
-        checkpoint_dir = os.path.join(base_dir, "sft_checkpoints", output_dirname)
+        checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", output_dirname)
         save_checkpoint(
             checkpoint_dir,
             step,