mirror of https://github.com/karpathy/nanochat.git (synced 2025-12-06 04:12:13 +00:00)

Merge pull request #11 from LokiMetaSmith/fix-cpu-ddp-init

Fix CPU DDP crashes: Init Gloo backend, prevent OOM by reducing NPROC…

This commit is contained in commit 8009354739.
In scripts/base_train.py (identified by the usage line from the script's docstring in the hunk header), the CUDA allocator setting moves behind a CUDA-availability guard so CPU-only runs never set it:

@@ -12,12 +12,13 @@ python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 -
 """
 
 import os
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 import time
 from contextlib import nullcontext
 
 import wandb
 import torch
+if torch.cuda.is_available():
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
 from nanochat.gpt import GPT, GPTConfig
 from nanochat.dataloader import tokenizing_distributed_data_loader, tokenizing_distributed_data_loader_with_state
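The PR title also mentions initializing the Gloo backend for CPU DDP; that part of the change is not visible in the hunk above. For context, runtime backend selection in PyTorch typically looks like the sketch below. The function name and structure are illustrative, not nanochat's actual code. (Moving the PYTORCH_CUDA_ALLOC_CONF assignment below `import torch` should still take effect, since the CUDA caching allocator reads the variable when it first initializes, not at import time.)

import os
import torch
import torch.distributed as dist

def init_distributed():
    # NCCL requires CUDA; Gloo runs on plain CPUs. Choosing the backend at
    # runtime keeps one code path for GPU nodes and CPU-only machines.
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend)
    local_rank = int(os.environ.get("LOCAL_RANK", 0))  # set by torchrun
    if backend == "nccl":
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")
    return dist.get_rank(), dist.get_world_size(), device

Launched under torchrun (as in the speedrun script below), RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT are already in the environment, so init_process_group needs no arguments beyond the backend.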
The remaining hunks are to the speedrun shell script (speedrun.sh upstream). First, make the script fail fast on any error:

@@ -1,4 +1,5 @@
 #!/bin/bash
+set -e
 
 # This script is the "Best ChatGPT clone that $100 can buy",
 # It is designed to run in ~4 hours on 8XH100 node at $3/GPU/hour.
Then auto-detect GPUs before choosing how many processes torchrun launches:

@@ -83,7 +84,15 @@ echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID
 
 # Number of processes/GPUs to use
+# Auto-detect if we have GPUs
+if python -c "import torch; exit(0) if torch.cuda.is_available() else exit(1)"; then
 NPROC_PER_NODE=8
+else
+echo "No GPU detected. Defaulting to NPROC_PER_NODE=1 to avoid OOM and using multi-threading."
+NPROC_PER_NODE=1
+# If running on CPU, let PyTorch use all available cores for the single process
+unset OMP_NUM_THREADS
+fi
 
 # pretrain the d20 model
 torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --run=$WANDB_RUN
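A note on the `unset OMP_NUM_THREADS` line: as of recent PyTorch releases, torchrun forces OMP_NUM_THREADS=1 (with a warning) only when it launches more than one process per node and the variable is unset; with a single process it leaves the variable alone, so on the CPU path the lone worker can use every available core for intra-op parallelism. A quick illustrative check, not part of the PR, of what PyTorch will actually use:

import os
import torch

# With OMP_NUM_THREADS unset and a single process, the intra-op thread
# count typically defaults to the number of cores available to the process.
print("OMP_NUM_THREADS:", os.environ.get("OMP_NUM_THREADS", "<unset>"))
print("intra-op threads:", torch.get_num_threads())
print("inter-op threads:", torch.get_num_interop_threads())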