call reshape instead of view in case the tensors are not contiguous

2026-06-18 20:19:08 +00:00 · 2026-02-27 01:50:37 +01:00 · 2026-02-27 01:50:37 +01:00 · 83de1b18b1
commit 83de1b18b1
parent c7ba252142
2 changed files with 2 additions and 2 deletions
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -416,7 +416,7 @@ class GPT(nn.Module):
        if targets is not None:
            # training: given the targets, compute and return the loss
            # TODO experiment with chunked cross-entropy?
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction)
+            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1), ignore_index=-1, reduction=loss_reduction)
            return loss
        else:
            # inference: just return the logits directly
--- a/nanochat/loss_eval.py
+++ b/nanochat/loss_eval.py
@@ -32,7 +32,7 @@ def evaluate_bpb(model, batches, steps, token_bytes):
        x, y = next(batch_iter)
        loss2d = model(x, y, loss_reduction='none') # (B, T)
        loss2d = loss2d.view(-1) # flatten
-        y = y.view(-1) # flatten
+        y = y.reshape(-1) # flatten
        if (y.int() < 0).any(): # mps does not currently have kernel for < 0 for int64, only int32
            # slightly more complex code path if some target tokens are ignore_index (e.g. -1)
            # any target token < 0 is to be ignored: do NOT index token_bytes with negatives