Mirror of https://github.com/karpathy/nanochat.git, synced 2026-01-04 18:52:36 +00:00
Merge pull request #21 from LokiMetaSmith/fix-amd-triton-reinstall
Use gloo backend for DDP on AMD ROCm to avoid NCCL crashes
commit da035bf408
@@ -173,7 +173,16 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
     if device_type == "cuda":
         device = torch.device("cuda", ddp_local_rank)
         torch.cuda.set_device(device) # make "cuda" default to this device
-        dist.init_process_group(backend="nccl", device_id=device)
+        # On AMD ROCm (especially consumer/APU hardware), NCCL/RCCL can be unstable or mismatched.
+        # Fall back to gloo if HIP is detected to avoid "invalid device function" crashes.
+        # While slower than NCCL, it ensures functionality on a wider range of AMD hardware.
+        is_rocm = hasattr(torch.version, "hip") and torch.version.hip
+        backend = "gloo" if is_rocm else "nccl"
+        # the gloo backend does not accept the 'device_id' argument
+        if backend == "gloo":
+            dist.init_process_group(backend=backend)
+        else:
+            dist.init_process_group(backend=backend, device_id=device)
         dist.barrier()
     elif device_type == "cpu":
         device = torch.device("cpu")
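For context outside the diff, a minimal standalone sketch of the same detect-and-fall-back pattern might look like the following. The helper names pick_ddp_backend and init_ddp are illustrative, not part of nanochat, and the sketch assumes the LOCAL_RANK environment variable that torchrun exports for each process it launches.

import os
import torch
import torch.distributed as dist

def pick_ddp_backend():
    # torch.version.hip is a version string on ROCm builds of PyTorch
    # and None on CUDA builds, so it doubles as a ROCm detector.
    is_rocm = hasattr(torch.version, "hip") and torch.version.hip
    return "gloo" if is_rocm else "nccl"

def init_ddp():
    # torchrun sets LOCAL_RANK (and MASTER_ADDR/MASTER_PORT, used by
    # the default env:// rendezvous) for every process it launches.
    ddp_local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    device = torch.device("cuda", ddp_local_rank)
    torch.cuda.set_device(device)  # make "cuda" default to this device
    backend = pick_ddp_backend()
    if backend == "gloo":
        # gloo does not accept the 'device_id' argument
        dist.init_process_group(backend=backend)
    else:
        dist.init_process_group(backend=backend, device_id=device)
    dist.barrier()
    return device

Launched with, say, torchrun --nproc_per_node=2 train.py, every rank evaluates the same is_rocm check against its local PyTorch build, so all ranks agree on the backend and init_process_group completes consistently on both CUDA and ROCm machines.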