Mirror of https://github.com/karpathy/nanochat.git (synced 2025-12-06 04:12:13 +00:00)
Add guard against division by zero in chat_sft when num_tokens is 0
parent f5001141ec
commit 42b05eea7e
@@ -221,10 +221,16 @@ for step in range(num_iterations):
         dist.all_reduce(total_loss_sum, op=dist.ReduceOp.SUM)
         dist.all_reduce(num_tokens, op=dist.ReduceOp.SUM) # sum over ranks
 
-    # Scale gradients by total number of tokens
+    # scale gradients by total number of tokens
+    num_tokens_item = num_tokens.item()
+    if num_tokens_item == 0:
+        print0(f"Warning: the number of valid tokens in train targets is 0 at step {step}, skipping model update")
+        model.zero_grad(set_to_none=True)
+        continue
+
     for param in model.parameters():
         if param.grad is not None:
-            param.grad.div_(num_tokens.item())
+            param.grad.div_(num_tokens_item)
 
     # learning rate scheduler
     lrm = get_lr_multiplier(step)
@@ -238,7 +244,6 @@ for step in range(num_iterations):
     model.zero_grad(set_to_none=True)
 
     # logging
-    num_tokens_item = num_tokens.item()
     train_loss_item = total_loss_sum.item() / num_tokens_item
     print0(f"Step {step:05d}/{num_iterations:05d} | Training loss: {train_loss_item:.6f}| lrm: {lrm:.6f}| num_tokens: {num_tokens_item:,}")
     wandb_run.log({
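For context on what the guard prevents: if every target token in a batch is masked out, the sum-reduced loss is 0 and all gradients are exactly 0, so param.grad.div_(0) silently produces NaN (0/0) on float tensors rather than raising an error, and a single such step poisons the optimizer state. Below is a minimal, self-contained sketch of the same pattern outside nanochat; the toy model, the IGNORE_INDEX masking, and the single fully-masked batch are hypothetical stand-ins, not the repository's actual training code.

import torch
import torch.nn as nn
import torch.nn.functional as F

model = nn.Linear(8, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
IGNORE_INDEX = -1  # targets equal to this value contribute no loss

# a deliberately fully-masked batch to trigger the zero-token case
batches = [(torch.randn(2, 8), torch.full((2,), IGNORE_INDEX, dtype=torch.long))]

for step, (x, y) in enumerate(batches):
    logits = model(x)
    # sum-reduced loss, normalized later by the total token count
    loss = F.cross_entropy(logits, y, ignore_index=IGNORE_INDEX, reduction="sum")
    loss.backward()
    num_tokens_item = int((y != IGNORE_INDEX).sum().item())
    if num_tokens_item == 0:
        # without this guard, div_(0) below would turn the all-zero
        # gradients into NaN (0/0) and corrupt the next optimizer step
        print(f"Warning: 0 valid tokens at step {step}, skipping model update")
        model.zero_grad(set_to_none=True)
        continue
    for param in model.parameters():
        if param.grad is not None:
            param.grad.div_(num_tokens_item)  # per-token mean gradient
    optimizer.step()
    model.zero_grad(set_to_none=True)

In the distributed version shown in the diff, num_tokens is all-reduced across ranks before the check, so the guard only fires when no rank saw a valid target token; skipping the update on every rank keeps them in lockstep.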