From 8be390751433a8f07bdd2ea9789b06bc0ae80b53 Mon Sep 17 00:00:00 2001
From: zichongli5 <53316944+zichongli5@users.noreply.github.com>
Date: Tue, 3 Feb 2026 21:53:43 -0500
Subject: [PATCH] Add NorMuon paper link

---
 nanochat/optim.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/nanochat/optim.py b/nanochat/optim.py
index 190a1ed..8cf9ecc 100644
--- a/nanochat/optim.py
+++ b/nanochat/optim.py
@@ -243,6 +243,7 @@ class MuonAdamW(torch.optim.Optimizer):
         momentum_buffer = state["momentum_buffer"]
 
         # Second momentum buffer is factored, either per-row or per-column
+        # (this factored second momentum buffer follows NorMuon: https://arxiv.org/abs/2510.05491)
         if "second_momentum_buffer" not in state:
             state_shape = (num_params, shape[-2], 1) if shape[-2] >= shape[-1] else (num_params, 1, shape[-1])
             state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)