mirror of https://github.com/karpathy/nanochat.git (synced 2025-12-06 04:12:13 +00:00)

Merge pull request #11 from LokiMetaSmith/fix-cpu-ddp-init

Fix CPU DDP crashes: Init Gloo backend, prevent OOM by reducing NPROC…

This commit is contained in commit 8009354739.
In scripts/base_train.py (identified by the usage line from the script's docstring in the hunk header), the CUDA allocator setting moves behind a CUDA-availability guard so CPU-only runs never set it:

@@ -12,12 +12,13 @@ python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 -
 """
 
 import os
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 import time
 from contextlib import nullcontext
 
 import wandb
 import torch
+if torch.cuda.is_available():
+    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
 
 from nanochat.gpt import GPT, GPTConfig
 from nanochat.dataloader import tokenizing_distributed_data_loader, tokenizing_distributed_data_loader_with_state
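The PR title also mentions initializing the Gloo backend for CPU DDP; that part of the change is not visible in the hunk above. For context, runtime backend selection in PyTorch typically looks like the sketch below. The function name and structure are illustrative, not nanochat's actual code. (Moving the PYTORCH_CUDA_ALLOC_CONF assignment below `import torch` should still take effect, since the CUDA caching allocator reads the variable when it first initializes, not at import time.)

import os
import torch
import torch.distributed as dist

def init_distributed():
    # NCCL requires CUDA; Gloo runs on plain CPUs. Choosing the backend at
    # runtime keeps one code path for GPU nodes and CPU-only machines.
    backend = "nccl" if torch.cuda.is_available() else "gloo"
    dist.init_process_group(backend=backend)
    local_rank = int(os.environ.get("LOCAL_RANK", 0))  # set by torchrun
    if backend == "nccl":
        device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(device)
    else:
        device = torch.device("cpu")
    return dist.get_rank(), dist.get_world_size(), device

Launched under torchrun (as in the speedrun script below), RANK, WORLD_SIZE, MASTER_ADDR, and MASTER_PORT are already in the environment, so init_process_group needs no arguments beyond the backend.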
The remaining hunks are to the speedrun shell script (speedrun.sh upstream). First, make the script fail fast on any error:

@@ -1,4 +1,5 @@
 #!/bin/bash
+set -e
 
 # This script is the "Best ChatGPT clone that $100 can buy",
 # It is designed to run in ~4 hours on 8XH100 node at $3/GPU/hour.
Then auto-detect GPUs before choosing how many processes torchrun launches:

@@ -83,7 +84,15 @@ echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID
 
 # Number of processes/GPUs to use
+# Auto-detect if we have GPUs
+if python -c "import torch; exit(0) if torch.cuda.is_available() else exit(1)"; then
 NPROC_PER_NODE=8
+else
+echo "No GPU detected. Defaulting to NPROC_PER_NODE=1 to avoid OOM and using multi-threading."
+NPROC_PER_NODE=1
+# If running on CPU, let PyTorch use all available cores for the single process
+unset OMP_NUM_THREADS
+fi
 
 # pretrain the d20 model
 torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --run=$WANDB_RUN
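A note on the `unset OMP_NUM_THREADS` line: as of recent PyTorch releases, torchrun forces OMP_NUM_THREADS=1 (with a warning) only when it launches more than one process per node and the variable is unset; with a single process it leaves the variable alone, so on the CPU path the lone worker can use every available core for intra-op parallelism. A quick illustrative check, not part of the PR, of what PyTorch will actually use:

import os
import torch

# With OMP_NUM_THREADS unset and a single process, the intra-op thread
# count typically defaults to the number of cores available to the process.
print("OMP_NUM_THREADS:", os.environ.get("OMP_NUM_THREADS", "<unset>"))
print("intra-op threads:", torch.get_num_threads())
print("inter-op threads:", torch.get_num_interop_threads())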