From c16db281ffe816966e8a4e1ef79b00d4b627228a Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Tue, 24 Mar 2026 19:25:34 +0000
Subject: [PATCH] fix small bug with params logging and batch size

---
 runs/scaling_laws.sh | 17 ++++++++++++++---
 1 file changed, 14 insertions(+), 3 deletions(-)

diff --git a/runs/scaling_laws.sh b/runs/scaling_laws.sh
index 212e675..0e0b600 100644
--- a/runs/scaling_laws.sh
+++ b/runs/scaling_laws.sh
@@ -8,7 +8,7 @@ FLOPS_BUDGETS=(
     4.64e18
     1e19
 )
-DEPTHS=(8 10 12 14 16 18 20)
+DEPTHS=(10 12 14 16 18 20)
 
 NPROC_PER_NODE="${NPROC_PER_NODE:-8}"
 WANDB_RUN="${WANDB_RUN:-scaling_${LABEL}}"
@@ -60,6 +60,15 @@ for flops in "${FLOPS_BUDGETS[@]}"; do
         # Unique tag for this run
         TAG="scaling_${flops}_d${d}"
 
+        # Reduce --device-batch-size to avoid OOM at larger depths
+        if [ $d -ge 28 ]; then
+            DEVICE_BATCH_SIZE_ARG="--device-batch-size=8"
+        elif [ $d -ge 20 ]; then
+            DEVICE_BATCH_SIZE_ARG="--device-batch-size=16"
+        else
+            DEVICE_BATCH_SIZE_ARG="--device-batch-size=32"
+        fi
+
         # Record start time
         START_TIME=$(date +%s)
 
@@ -77,6 +86,7 @@ for flops in "${FLOPS_BUDGETS[@]}"; do
             --core-metric-max-per-task=-1 \
             --sample-every=-1 \
             --save-every=-1 \
+            $DEVICE_BATCH_SIZE_ARG \
             2>&1 | tee "$RESULTS_DIR/${TAG}_train.log"
 
         END_TIME=$(date +%s)
@@ -96,8 +106,9 @@ for flops in "${FLOPS_BUDGETS[@]}"; do
         PARAMS_TOTAL=$(grep "^total " "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
         NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',')
 
-        # Calculate tokens trained (iterations * batch_size, default 524288)
-        TOKENS_TRAINED=$((NUM_ITERS * 524288))
+        # Extract actual batch size from log (auto-computed, varies by model size)
+        BATCH_SIZE=$(grep "Total batch size" "$LOG_FILE" | tail -1 | grep -oP 'Total batch size \K[\d,]+' | tr -d ',')
+        TOKENS_TRAINED=$((NUM_ITERS * BATCH_SIZE))
         # Model dim
         MODEL_DIM=$((d * 64))
         # Val BPB from final eval