Use gloo backend for DDP on AMD ROCm to avoid NCCL crashes

On consumer AMD hardware (like APUs or gaming GPUs) running ROCm, the default `nccl` backend (which wraps RCCL) often fails with `invalid device function` due to architecture mismatches or kernel issues.

This change detects the presence of `torch.version.hip` and forces the `gloo` backend for `torch.distributed.init_process_group`. While `gloo` is slower for data transfer, it is CPU-based and significantly more robust for these setups, ensuring the training script can run without crashing.
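
In isolation, the detection boils down to checking `torch.version.hip`, roughly as in this minimal sketch (the helper name `pick_ddp_backend` is illustrative, not something the codebase defines):

```python
import torch

def pick_ddp_backend() -> str:
    # torch.version.hip holds a HIP version string on ROCm builds of PyTorch
    # and is None (or absent on very old builds) otherwise.
    is_rocm = bool(getattr(torch.version, "hip", None))
    return "gloo" if is_rocm else "nccl"

print(pick_ddp_backend())  # "gloo" on a ROCm build, "nccl" elsewhere
```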
google-labs-jules[bot] 2025-11-23 06:49:07 +00:00
parent 962deeefb6
commit 1f9b734358


@@ -173,7 +173,16 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
     if device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
-        dist.init_process_group(backend="nccl", device_id=device)
+        # On AMD ROCm (especially consumer/APU hardware), NCCL/RCCL can be unstable or mismatched.
+        # Fall back to gloo if HIP is detected to avoid "invalid device function" crashes.
+        # While slower than NCCL, it ensures functionality on a wider range of AMD hardware.
+        is_rocm = hasattr(torch.version, "hip") and torch.version.hip
+        backend = "gloo" if is_rocm else "nccl"
+        # The gloo backend does not accept a 'device_id' argument.
+        if backend == "gloo":
+            dist.init_process_group(backend=backend)
+        else:
+            dist.init_process_group(backend=backend, device_id=device)
         dist.barrier()
     elif device_type == "cpu":
         device = torch.device("cpu")