Use gloo backend for DDP on AMD ROCm to avoid NCCL crashes
On consumer AMD hardware running ROCm (e.g. APUs or gaming GPUs), the default `nccl` backend (which wraps RCCL) often fails with an `invalid device function` error due to GPU architecture mismatches or kernel issues. This change checks for `torch.version.hip` and, if present, forces the `gloo` backend in `torch.distributed.init_process_group`. `gloo` is slower since it is CPU-based, but it is significantly more robust on these setups, ensuring the training script can run without crashing.
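As a quick sanity check of the detection described above, a minimal sketch (the example version strings are illustrative, not from this commit):

```python
# ROCm builds of PyTorch report a HIP version string in torch.version.hip;
# CUDA builds set it to None, so truthiness distinguishes the two.
import torch

print(torch.version.hip)   # e.g. "6.2.41133" on a ROCm build, None on a CUDA build
print(torch.version.cuda)  # None on a ROCm build, e.g. "12.4" on a CUDA build
is_rocm = hasattr(torch.version, "hip") and torch.version.hip
print("would use backend:", "gloo" if is_rocm else "nccl")
```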
parent 962deeefb6
commit 1f9b734358
```diff
@@ -173,7 +173,16 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
     if device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
-        dist.init_process_group(backend="nccl", device_id=device)
+        # On AMD ROCm (especially consumer/APU hardware), NCCL/RCCL can be unstable or mismatched.
+        # Fallback to gloo if HIP is detected to avoid "invalid device function" crashes.
+        # While slower than NCCL, it ensures functionality on a wider range of AMD hardware.
+        is_rocm = hasattr(torch.version, "hip") and torch.version.hip
+        backend = "gloo" if is_rocm else "nccl"
+        # gloo backend does not accept 'device_id' argument
+        if backend == "gloo":
+            dist.init_process_group(backend=backend)
+        else:
+            dist.init_process_group(backend=backend, device_id=device)
         dist.barrier()
     elif device_type == "cpu":
         device = torch.device("cpu")
```
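To exercise the fallback path outside the full training script, a standalone sketch under stated assumptions (the file name `ddp_backend_check.py`, the `LOCAL_RANK` lookup, and the all-reduce smoke test are illustrative additions, not part of this commit):

```python
# ddp_backend_check.py -- hypothetical repro script, run with:
#   torchrun --nproc_per_node=2 ddp_backend_check.py
import os
import torch
import torch.distributed as dist

def main():
    # torchrun exports LOCAL_RANK (plus RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT)
    local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    # same detection as the patch: HIP version string present => ROCm build
    is_rocm = hasattr(torch.version, "hip") and torch.version.hip
    backend = "gloo" if is_rocm else "nccl"
    if backend == "gloo":
        # gloo backend does not accept the device_id argument
        dist.init_process_group(backend=backend)
    else:
        device = torch.device("cuda", local_rank)
        torch.cuda.set_device(device)
        dist.init_process_group(backend=backend, device_id=device)
    # smoke test: sum a tensor across ranks; keep it on CPU under gloo,
    # on the local GPU under nccl
    t = torch.ones(1) if backend == "gloo" else torch.ones(1, device="cuda")
    dist.all_reduce(t)
    print(f"rank {dist.get_rank()}/{dist.get_world_size()}: all_reduce -> {t.item()}")
    dist.destroy_process_group()

if __name__ == "__main__":
    main()
```

If this completes under `gloo` but crashes with `invalid device function` under `nccl` on the same machine, that reproduces the failure mode this commit works around.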