From a9de4b103858223646e0e8ba29ed32b8516aad8f Mon Sep 17 00:00:00 2001
From: water-vapor
Date: Sun, 26 Oct 2025 01:43:49 -0500
Subject: [PATCH] Fix tok/sec metrics for base_train and mid_train when
 gradient accumulation is not 1

---
 scripts/base_train.py | 2 +-
 scripts/mid_train.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index 3725805..47ecba4 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -294,7 +294,7 @@ for step in range(num_iterations + 1):
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * step / num_iterations
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
diff --git a/scripts/mid_train.py b/scripts/mid_train.py
index eedb262..6c2b82f 100644
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -268,7 +268,7 @@ while True:
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * progress
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
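
Note: world_tokens_per_fwdbwd counts only the tokens of a single
forward/backward pass, while dt times a full optimizer step, which spans
several such passes when gradient accumulation is used; the old metric
therefore under-reports throughput by the accumulation factor. Below is a
minimal Python sketch of the accounting, assuming grad_accum_steps is derived
from total_batch_size as is typical in such training scripts; the variable
names mirror the diff context, but the numeric values are illustrative and
not taken from the repo:

# Tokens moved by one forward/backward pass across all DDP ranks.
device_batch_size = 16   # sequences per GPU per fwd/bwd (illustrative)
sequence_len = 2048      # tokens per sequence (illustrative)
ddp_world_size = 8       # number of GPUs (illustrative)
world_tokens_per_fwdbwd = device_batch_size * sequence_len * ddp_world_size  # 262144

# Tokens per optimizer step, as configured for training.
total_batch_size = 524288

# Gradient accumulation bridges the gap: one optimizer step runs this many
# fwd/bwd passes before updating the weights (assumed derivation).
grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd  # 2

# dt is measured per optimizer step, so it covers all accumulation passes.
dt = 1.0  # seconds per step (illustrative)
tok_per_sec_old = int(world_tokens_per_fwdbwd / dt)  # 262144, low by 2x
tok_per_sec_new = int(total_batch_size / dt)         # 524288, matches tokens consumed

When grad_accum_steps == 1 the two expressions agree, which is why the bug
only shows up once accumulation is enabled. Note that flops_per_sec on the
line below the fix already used total_batch_size, so only tok/sec was off.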