From 2dc85662c3f0dbc960ff875b87a72d2dfed694d8 Mon Sep 17 00:00:00 2001
From: Dipesh Babu
Date: Wed, 5 Nov 2025 21:22:35 -0500
Subject: [PATCH 1/2] fix: safe DDP cleanup (check initialized PG, not just env)

---
 nanochat/common.py | 28 ++++++++++++++++++++--------
 1 file changed, 20 insertions(+), 8 deletions(-)

diff --git a/nanochat/common.py b/nanochat/common.py
index d4a9828..7efd60d 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -113,12 +113,24 @@ def print_banner():
     """
     print0(banner)

-def is_ddp():
-    # TODO is there a proper way
-    return int(os.environ.get('RANK', -1)) != -1
+def is_ddp_requested() -> bool:
+    """
+    True if launched by torchrun (env present), even before init.
+    Used to decide whether we *should* initialize a PG.
+    """
+    return all(k in os.environ for k in ("RANK", "LOCAL_RANK", "WORLD_SIZE"))
+
+def is_ddp_initialized() -> bool:
+    """
+    True if torch.distributed is available and the process group is initialized.
+    Used at cleanup to avoid destroying a non-existent PG.
+    """
+    return dist.is_available() and dist.is_initialized()

 def get_dist_info():
-    if is_ddp():
+    if is_ddp_requested():
+        # We rely on torchrun's env to decide if we SHOULD init.
+        # (Initialization itself happens in compute_init.)
         assert all(var in os.environ for var in ['RANK', 'LOCAL_RANK', 'WORLD_SIZE'])
         ddp_rank = int(os.environ['RANK'])
         ddp_local_rank = int(os.environ['LOCAL_RANK'])
@@ -159,8 +171,8 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
     torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls

     # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
-    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
-    if ddp and device_type == "cuda":
+    is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
+    if is_ddp_requested and device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
         dist.init_process_group(backend="nccl", device_id=device)
@@ -171,11 +183,11 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
     if ddp_rank == 0:
         logger.info(f"Distributed world size: {ddp_world_size}")

-    return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device
+    return is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size, device

 def compute_cleanup():
     """Companion function to compute_init, to clean things up before script exit"""
-    if is_ddp():
+    if is_ddp_initialized():
         dist.destroy_process_group()

 class DummyWandb:

From beb34ac43cdc6d331111eeda0ec85f1a0ac20754 Mon Sep 17 00:00:00 2001
From: Dipesh Babu
Date: Sat, 31 Jan 2026 19:18:48 -0500
Subject: [PATCH 2/2] fix: correct LR warmdown step range

---
 scripts/base_train.py | 29 +++++++++++++++++++++--------
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index 7ed6330..cc858d4 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -238,15 +238,28 @@ x, y, dataloader_state_dict = next(train_loader) # kick off load of the very fir

 # Learning rate scheduler
 def get_lr_multiplier(it):
-    warmup_iters = round(args.warmup_ratio * num_iterations)
-    warmdown_iters = round(args.warmdown_ratio * num_iterations)
-    if it < warmup_iters:
+    # Note: optimizer steps run for it in [0, num_iterations-1]
+    warmup_iters = int(round(args.warmup_ratio * num_iterations))
+    warmdown_iters = int(round(args.warmdown_ratio * num_iterations))
+
+    # Warmup (avoid division by zero when warmup_iters == 0)
+    if warmup_iters > 0 and it < warmup_iters:
         return (it + 1) / warmup_iters
-    elif it <= num_iterations - warmdown_iters:
-        return 1.0
-    else:
-        progress = (num_iterations - it) / warmdown_iters
-        return progress * 1.0 + (1 - progress) * args.final_lr_frac
+
+    # Warmdown should cover the last `warmdown_iters` optimizer steps:
+    # it in [num_iterations - warmdown_iters, num_iterations - 1]
+    if warmdown_iters > 0:
+        warmdown_start = num_iterations - warmdown_iters
+        # Ensure warmdown doesn't start before warmup ends (prevents overlap weirdness)
+        warmdown_start = max(warmdown_start, warmup_iters)
+
+        if it >= warmdown_start:
+            # progress: 1.0 at warmdown_start, 0.0 at last optimizer step (num_iterations - 1)
+            span = max(1, (num_iterations - 1) - warmdown_start) # denom >= 1
+            progress = (num_iterations - 1 - it) / span
+            return progress * 1.0 + (1.0 - progress) * args.final_lr_frac
+
+    return 1.0

 # Momentum scheduler for Muon optimizer
 def get_muon_momentum(it):
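
Reviewer note on PATCH 1/2 (not part of the patch itself): a minimal sketch of how I read the intended pairing of compute_init() and compute_cleanup() after this change. Only the function names, the device_type parameter, and the five-value return come from the diff; the import path and the try/finally call pattern are assumptions for illustration. The point is that compute_cleanup() now destroys the process group only if one was actually initialized, so it is safe for plain single-process runs and for CPU/MPS runs where init_process_group() is never called.

    from nanochat.common import compute_init, compute_cleanup

    def main():
        # compute_init returns (is_ddp_requested, rank, local_rank, world_size, device)
        ddp, rank, local_rank, world_size, device = compute_init(device_type="cpu")
        try:
            pass  # training / eval work would go here
        finally:
            # With this patch, cleanup is a no-op unless a process group exists,
            # instead of keying off the RANK env var alone.
            compute_cleanup()

    if __name__ == "__main__":
        main()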
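Reviewer note on PATCH 2/2 (not part of the patch itself): a standalone sanity check of the corrected warmdown range. The helper below re-implements the patched get_lr_multiplier() with plain parameters standing in for the script's args and num_iterations globals; the function name and the numbers are hypothetical. With the old code, warmdown began one step late and the multiplier never reached final_lr_frac on the last optimizer step; with the patch, step num_iterations - 1 lands exactly on final_lr_frac.

    # Standalone copy of the patched schedule (illustrative names and values).
    def lr_multiplier(it, num_iterations, warmup_ratio, warmdown_ratio, final_lr_frac):
        warmup_iters = int(round(warmup_ratio * num_iterations))
        warmdown_iters = int(round(warmdown_ratio * num_iterations))
        if warmup_iters > 0 and it < warmup_iters:
            return (it + 1) / warmup_iters
        if warmdown_iters > 0:
            warmdown_start = max(num_iterations - warmdown_iters, warmup_iters)
            if it >= warmdown_start:
                span = max(1, (num_iterations - 1) - warmdown_start)
                progress = (num_iterations - 1 - it) / span
                return progress + (1.0 - progress) * final_lr_frac
        return 1.0

    # 1000 optimizer steps, 1% warmup, 20% warmdown, decay to 10% of the base LR
    N = 1000
    assert lr_multiplier(0,   N, 0.01, 0.2, 0.1) == 0.1   # first warmup step
    assert lr_multiplier(9,   N, 0.01, 0.2, 0.1) == 1.0   # warmup finishes at full LR
    assert lr_multiplier(500, N, 0.01, 0.2, 0.1) == 1.0   # constant in the middle
    assert lr_multiplier(800, N, 0.01, 0.2, 0.1) == 1.0   # warmdown starts from 1.0
    assert lr_multiplier(999, N, 0.01, 0.2, 0.1) == 0.1   # last step hits final_lr_frac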