make promised_flops_per_sec_per_gpu a global, configurable variable

2026-06-19 20:49:09 +00:00 · 2025-11-08 14:45:18 +05:30 · 2025-11-08 14:45:18 +05:30 · 4322d06058
commit 4322d06058
parent c6b7ab7440
1 changed files with 4 additions and 2 deletions
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@ -58,6 +58,8 @@ eval_tokens = 20*524288 # number of tokens to evaluate val loss on
 core_metric_every = 2000 # every how many steps to evaluate the core metric (-1 = disable)
 core_metric_max_per_task = 500 # examples per task in estimating the core metric
 sample_every = 2000 # every how many steps to sample from the model
+# Logging
+promised_flops_per_sec_per_gpu = 989e12 # Note: only used for logging `mfu` (model flops utilization). The default is the bfloat16 peak of an H100 SXM without 2:4 sparsity; set it to the bfloat16 peak FLOPS (without sparsity) of your GPU so the logged `mfu` is correct.
 # Output
 model_tag = "" # optionally override the model tag for the output checkpoint directory name
 # now allow CLI to override the settings via the configurator lol
@ -298,8 +300,8 @@ for step in range(num_iterations + 1):
    pct_done = 100 * step / num_iterations
    tok_per_sec = int(total_batch_size / dt)
    flops_per_sec = num_flops_per_token * total_batch_size / dt
-    promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
-    mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
+    promised_flops_per_sec_all_gpus = promised_flops_per_sec_per_gpu * ddp_world_size 
+    mfu = 100 * flops_per_sec / promised_flops_per_sec_all_gpus # in %
    if step > 10:
        total_training_time += dt # only count the time after the first 10 steps
    print_grad_norm = f" grad norm: {grad_norm:.4f} |" if grad_clip_enabled else ""