From 2ae28292aae2c19c17f8d14ff511d61d87981add Mon Sep 17 00:00:00 2001
From: Max Kruijs Voorberge
Date: Sun, 8 Feb 2026 17:55:38 +0100
Subject: [PATCH] Fix train_loss to average all steps instead of keeping only
 the last

---
 scripts/base_train.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index ccf35e6..72051fa 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -485,13 +485,15 @@ while True:
     # evaluate the gradient
     synchronize()
     t0 = time.time()
+    train_loss = 0.0
     for micro_step in range(grad_accum_steps):
         with autocast_ctx:
             loss = model(x, y)
-        train_loss = loss.detach() # for logging
+        train_loss += loss.detach() # accumulate for logging
         loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
         loss.backward()
         x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
+    train_loss = train_loss / grad_accum_steps # average across micro steps
     # step the optimizer
     lrm = get_lr_multiplier(step)
     muon_momentum = get_muon_momentum(step)