From fb2be07e17393a8106fa53d7acff2f20042bbca8 Mon Sep 17 00:00:00 2001
From: geopti <gpaptsis@gmail.com>
Date: Sat, 28 Feb 2026 16:37:04 +0000
Subject: [PATCH 1/2] fix: correct CSV extraction in scaling_laws.sh

Two bugs silently corrupted the results CSV: every parameter column was
empty, and tokens_trained could be wrong:

1. Parameter grep patterns did not account for the padded key format.
   base_train.py prints parameters as `{key:24s}: {value:,}`, e.g.
   `wte                     : 33,554,432`, so patterns like `grep "wte:"`
   never matched. Fixed by using `grep -P "wte\s+:"`, which matches
   the padding spaces before the colon.

2. tokens_trained was hardcoded as `NUM_ITERS * 524288`, but the batch
   size is auto-computed by base_train.py and may differ from 524288
   depending on the FLOPs budget and model size. Fixed by extracting the
   actual value from the log line "Total number of training tokens: X".
---
 runs/scaling_laws.sh | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/runs/scaling_laws.sh b/runs/scaling_laws.sh
index f1e2fd4..b9b7f9b 100644
--- a/runs/scaling_laws.sh
+++ b/runs/scaling_laws.sh
@@ -86,17 +86,17 @@ for flops in "${FLOPS_BUDGETS[@]}"; do
         LOG_FILE="$RESULTS_DIR/${TAG}_train.log"
 
         # Extract detailed parameter counts (for scaling law analysis with different conventions)
-        PARAMS_WTE=$(grep "wte:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
-        PARAMS_BIGRAM=$(grep "bigram_embed:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
-        PARAMS_VE=$(grep "value_embeds:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
-        PARAMS_LM=$(grep "lm_head:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
-        PARAMS_TRANSFORMER=$(grep "transformer_matrices:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
-        PARAMS_SCALARS=$(grep "scalars:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
-        PARAMS_TOTAL=$(grep "total:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
+        PARAMS_WTE=$(grep -P "wte\s+:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
+        PARAMS_BIGRAM=$(grep -P "bigram_embed\s+:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
+        PARAMS_VE=$(grep -P "value_embeds\s+:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
+        PARAMS_LM=$(grep -P "lm_head\s+:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
+        PARAMS_TRANSFORMER=$(grep -P "transformer_matrices\s+:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
+        PARAMS_SCALARS=$(grep -P "scalars\s+:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
+        PARAMS_TOTAL=$(grep -P "total\s+:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
 
         NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',')
-        # Calculate tokens trained (iterations * batch_size, default 524288)
-        TOKENS_TRAINED=$((NUM_ITERS * 524288))
+        # Extract actual tokens trained from log (batch size is auto-computed, may differ from 524288)
+        TOKENS_TRAINED=$(grep "Total number of training tokens:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
         # Model dim
         MODEL_DIM=$((d * 64))
         # Val BPB from final eval

From 16755495bce2e1b3da7ee27c201cba104481b9fd Mon Sep 17 00:00:00 2001
From: geopti <gpaptsis@gmail.com>
Date: Sat, 28 Feb 2026 20:43:34 +0000
Subject: [PATCH 2/2] fix(miniseries): extract tokens_trained from log instead
 of hardcoding batch size

Same bug as in scaling_laws.sh: TOKENS_TRAINED was computed as
NUM_ITERS * 524288, hardcoding the default total batch size. When
base_train.py auto-computes a different batch size, that value is wrong.
Fix by reading the "Total number of training tokens:" line directly
from the training log.
---
 runs/miniseries.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runs/miniseries.sh b/runs/miniseries.sh
index 01c4459..726bee0 100644
--- a/runs/miniseries.sh
+++ b/runs/miniseries.sh
@@ -85,7 +85,7 @@ for d in "${DEPTHS[@]}"; do
     NUM_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | head -1 | tr -d ',')
     NUM_SCALING_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ',')
     NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',')
-    TOKENS_TRAINED=$((NUM_ITERS * 524288))
+    TOKENS_TRAINED=$(grep "Total number of training tokens:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',')
     PARAM_DATA_RATIO=$(python -c "print(f'{$TOKENS_TRAINED / $NUM_SCALING_PARAMS:.2f}')")
     MODEL_DIM=$((d * 64))
     VAL_BPB=$(grep "Validation bpb:" "$LOG_FILE" | tail -1 | grep -oP '[\d.]+$')