From d1595fb2d170c3cf1faf4aa58dd65c79a51a86a3 Mon Sep 17 00:00:00 2001 From: Chris McCormick Date: Mon, 26 Jan 2026 12:03:22 -0800 Subject: [PATCH] Allow torchrun with 1 device --- nanochat/gpt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 12f6d82..3397db6 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -350,8 +350,8 @@ class GPT(nn.Module): dict(params=x0_params, lr=scalar_lr), ] - # MuonAdamW for single-GPU, DistMuonAdamW for multi-GPU (with communication overlap) - OptimizerClass = DistMuonAdamW if ddp else MuonAdamW + # MuonAdamW for single-GPU, DistMuonAdamW for multi-GPU + OptimizerClass = DistMuonAdamW if (ddp and world_size > 1) else MuonAdamW optimizer = OptimizerClass( adamw_groups=adam_groups, muon_params=matrix_params,