From b19b4f3e4917cc4436099a89dfc517ee7e821a85 Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Mon, 2 Feb 2026 15:50:14 +0000
Subject: [PATCH] fix bug in speedrun script, batch size that doesn't OOM on 8XH100 for d24 is 16

---
 runs/speedrun.sh | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/runs/speedrun.sh b/runs/speedrun.sh
index a709462..d390c6d 100644
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -69,13 +69,10 @@ python -m scripts.tok_eval
 echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID
 
-# Number of processes/GPUs to use
-NPROC_PER_NODE=8
-
 # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12)
-torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --device-batch-size=16 --run=$WANDB_RUN
 # evaluate the model: CORE metric, BPB on train/val, and draw samples
-torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
+torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16
 
 # -----------------------------------------------------------------------------
 # SFT (teach the model conversation special tokens, tool use, multiple choice)
@@ -85,8 +82,8 @@ torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 
 # run SFT and eval the model
-torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
+torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --device-batch-size=16 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
 
 # chat with the model over CLI! Leave out the -p to chat interactively
 # python -m scripts.chat_cli -p "Why is the sky blue?"