From 9b9ef3ef388acd6d9dbe6f422a2de419fac6b6ff Mon Sep 17 00:00:00 2001 From: Chris McCormick Date: Fri, 30 Jan 2026 18:03:20 -0800 Subject: [PATCH] Pass p as tensor to fused adam We can avoid a couple recompiles by passing the underlying tensor for a parameter instead of the parameter object. --- nanochat/optim.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/optim.py b/nanochat/optim.py index 190a1ed..ea623fa 100644 --- a/nanochat/optim.py +++ b/nanochat/optim.py @@ -217,7 +217,7 @@ class MuonAdamW(torch.optim.Optimizer): # Fused update: weight_decay -> momentum -> bias_correction -> param_update adamw_step_fused( - p, grad, exp_avg, exp_avg_sq, + p.data, grad, exp_avg, exp_avg_sq, self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t, self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t, )