fix(model): apply float32 cast before logits softcapping

This change ensures that the logits softcapping operation (tanh) is performed in float32 precision rather than bfloat16. Previously, the code cast to float32 only after the tanh, so the non-linearity itself was computed in bfloat16. The inference path also gains a float32 cast before the softcap, which it previously lacked.
Author: spjosyula
Date:   2025-11-23 20:12:09 +05:30
parent 4a87a0d19f
commit 16788eed3c


@@ -265,13 +265,14 @@ class GPT(nn.Module):
             # training mode: compute and return the loss
             # TODO: experiment with Liger Kernels / chunked cross-entropy etc.
             logits = self.lm_head(x)
-            logits = softcap * torch.tanh(logits / softcap) # logits softcap
             logits = logits.float() # use tf32/fp32 for logits
+            logits = softcap * torch.tanh(logits / softcap) # logits softcap
             loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction)
             return loss
         else:
             # inference mode: compute and return the logits
             logits = self.lm_head(x)
+            logits = logits.float() # use tf32/fp32 for logits
             logits = softcap * torch.tanh(logits / softcap) # logits softcap
             return logits
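
For intuition, here is a minimal standalone sketch (not part of the repository) comparing the two orderings. The softcap value of 15.0 and the vocabulary size are assumed purely for illustration, not taken from this codebase:

import torch

# Assumed illustrative values: softcap=15.0 and a ~50k-entry vocabulary
# are placeholders that just make the example concrete.
softcap = 15.0
logits_bf16 = torch.randn(4, 50304, dtype=torch.bfloat16)

# Old order: tanh computed in bfloat16, float32 cast applied afterwards.
old = (softcap * torch.tanh(logits_bf16 / softcap)).float()

# New order (this commit): cast to float32 first, then apply the softcap,
# so the tanh non-linearity runs in full precision.
new = softcap * torch.tanh(logits_bf16.float() / softcap)

# The two results differ because bfloat16 carries only about 8 bits of
# significand; the gap is the precision recovered by casting before the tanh.
print((old - new).abs().max())

Since the cross-entropy loss is computed from these logits, performing the softcap in float32 removes one source of low-precision noise from the loss without changing the memory cost of the matmul itself, which still runs in the autocast dtype.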