From 2ae28292aae2c19c17f8d14ff511d61d87981add Mon Sep 17 00:00:00 2001
From: Max Kruijs Voorberge
Date: Sun, 8 Feb 2026 17:55:38 +0100
Subject: [PATCH] Fix train_loss to average all steps instead of keeping only
 the last

---
 scripts/base_train.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index ccf35e6..72051fa 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -485,13 +485,15 @@ while True:
     # evaluate the gradient
     synchronize()
     t0 = time.time()
+    train_loss = 0.0
     for micro_step in range(grad_accum_steps):
         with autocast_ctx:
             loss = model(x, y)
-        train_loss = loss.detach() # for logging
+        train_loss += loss.detach() # accumulate for logging
         loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
         loss.backward()
         x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
+    train_loss = train_loss / grad_accum_steps # average across micro steps
     # step the optimizer
     lrm = get_lr_multiplier(step)
     muon_momentum = get_muon_momentum(step)