Use gloo backend for DDP on AMD ROCm to avoid NCCL crashes

On consumer AMD hardware (like APUs or gaming GPUs) running ROCm, the default `nccl` backend (which wraps RCCL) often fails with `invalid device function` due to architecture mismatches or kernel issues.

This change detects the presence of `torch.version.hip` and forces the `gloo` backend for `torch.distributed.init_process_group`. While `gloo` is slower for data transfer, it is CPU-based and significantly more robust for these setups, ensuring the training script can run without crashing.
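
In isolation, the detection boils down to checking `torch.version.hip`, roughly as in this minimal sketch (the helper name `pick_ddp_backend` is illustrative, not something the codebase defines):

```python
import torch

def pick_ddp_backend() -> str:
    # torch.version.hip holds a HIP version string on ROCm builds of PyTorch
    # and is None (or absent on very old builds) otherwise.
    is_rocm = bool(getattr(torch.version, "hip", None))
    return "gloo" if is_rocm else "nccl"

print(pick_ddp_backend())  # "gloo" on a ROCm build, "nccl" elsewhere
```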
google-labs-jules[bot] 2025-11-23 06:49:07 +00:00
parent 962deeefb6
commit 1f9b734358


@@ -173,7 +173,16 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
     if device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
-        dist.init_process_group(backend="nccl", device_id=device)
+        # On AMD ROCm (especially consumer/APU hardware), NCCL/RCCL can be unstable or mismatched.
+        # Fall back to gloo if HIP is detected to avoid "invalid device function" crashes.
+        # While slower than NCCL, it ensures functionality on a wider range of AMD hardware.
+        is_rocm = hasattr(torch.version, "hip") and torch.version.hip
+        backend = "gloo" if is_rocm else "nccl"
+        # The gloo backend does not accept a 'device_id' argument.
+        if backend == "gloo":
+            dist.init_process_group(backend=backend)
+        else:
+            dist.init_process_group(backend=backend, device_id=device)
         dist.barrier()
     elif device_type == "cpu":
         device = torch.device("cpu")