Merge 8be3907514 into 1144d186ed

2026-06-19 04:29:09 +00:00 · 2026-02-05 10:43:32 +08:00 · 2026-02-05 10:43:32 +08:00 · fb9a32a9f3
commit fb9a32a9f3
parent 1144d186ed 8be3907514
1 changed file with 1 addition and 0 deletions
--- a/nanochat/optim.py
+++ b/nanochat/optim.py
@@ -247,6 +247,7 @@ class MuonAdamW(torch.optim.Optimizer):
        momentum_buffer = state["momentum_buffer"]

        # Second momentum buffer is factored, either per-row or per-column
+        # from NorMuon: https://arxiv.org/abs/2510.05491
        if "second_momentum_buffer" not in state:
            state_shape = (num_params, shape[-2], 1) if shape[-2] >= shape[-1] else (num_params, 1, shape[-1])
            state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)