Mirror of https://github.com/karpathy/nanochat.git, synced 2025-12-06 04:12:13 +00:00
Fix tok/sec metrics for base_train and mid_train when gradient accumulation is not 1
parent c75fe54aa7
commit a9de4b1038
base_train:
@@ -294,7 +294,7 @@ for step in range(num_iterations + 1):
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * step / num_iterations
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
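For intuition, here is a minimal, self-contained sketch of why total_batch_size is the right numerator. The variable names mirror the diff; all concrete numbers are assumed for illustration. dt times one full optimizer step, which under gradient accumulation spans several forward/backward micro-steps, so dividing by the tokens of a single micro-step under-reports throughput by exactly the accumulation factor:

# Hypothetical configuration; names follow the training scripts above.
device_batch_size = 16      # sequences per device per forward/backward (assumed)
sequence_len = 2048         # tokens per sequence (assumed)
ddp_world_size = 8          # number of data-parallel ranks (assumed)
total_batch_size = 2**19    # tokens per optimizer step (assumed)

# Tokens processed by ONE forward/backward across all ranks.
world_tokens_per_fwdbwd = device_batch_size * sequence_len * ddp_world_size  # 262144

# Micro-steps accumulated before each optimizer step.
grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd  # 2

dt = 1.0  # measured wall-clock seconds for one FULL optimizer step (assumed)

old_tok_per_sec = int(world_tokens_per_fwdbwd / dt)  # 262144 -- under-reports
new_tok_per_sec = int(total_batch_size / dt)         # 524288 -- actual throughput

# The old metric is low by exactly the gradient-accumulation factor.
assert new_tok_per_sec == old_tok_per_sec * grad_accum_steps

With grad_accum_steps == 1 the two formulas coincide, which is why the bug only showed up when gradient accumulation was enabled.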
mid_train:
@@ -268,7 +268,7 @@ while True:
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * progress
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
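Note that flops_per_sec already used total_batch_size in both files, so the MFU figure was correct before this change; only the tok/sec readout was off. For completeness, a self-contained worked example of that MFU estimate, with every value assumed for illustration:

num_flops_per_token = 6 * 560e6  # ~6*N FLOPs/token rule of thumb, assuming a 560M-param model
total_batch_size = 2**19         # tokens per optimizer step (assumed)
dt = 1.0                         # seconds per optimizer step (assumed)
ddp_world_size = 8               # number of GPUs (assumed)

flops_per_sec = num_flops_per_token * total_batch_size / dt  # ~1.76e15 achieved FLOP/s
promised_flops_per_sec_h100 = 989e12 * ddp_world_size        # peak dense bf16 across 8x H100 SXM
mfu = 100 * flops_per_sec / promised_flops_per_sec_h100      # ~22.3% utilization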