#!/bin/bash
#
# Test 8 & 9: DDP Discovery Tests
# Tests auto-discovery in distributed (multi-GPU) settings
#
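# Note: run this script from the repository root so that the relative
# tests/results/ paths below resolve correctly.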

set -e
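# Note: `set -o pipefail` is (presumably) left off on purpose: with it, a
# failing torchrun would abort the script before the PIPESTATUS checks
# below could print their descriptive error messages.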
echo "=========================================="
|
|
echo "DDP Auto-Discovery Tests"
|
|
echo "=========================================="
|
|
|
|

# Check GPU availability
NUM_GPUS=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -1)
echo "Detected $NUM_GPUS GPUs"
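# (`nvidia-smi --query-gpu=count` prints the total GPU count once per
# attached GPU, so `head -1` keeps a single copy.)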
if [ "$NUM_GPUS" -lt 2 ]; then
|
|
echo "SKIP: Need at least 2 GPUs for DDP tests"
|
|
exit 0
|
|
fi
|
|
|
|
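# Exiting 0 here (presumably) lets CI report a machine without enough GPUs
# as a skip rather than a failure.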

DEPTH=12
MAX_ITERATIONS=10
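# A shallow model and a handful of iterations keep this a quick smoke test;
# the point is to exercise the discovery/broadcast path, not to train.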

# Test with 2 GPUs
echo ""
echo "Test 8: DDP Discovery (2 GPUs)"
echo "----------------------------------------"
LOG_2GPU="tests/results/test_ddp_2gpu.log"
mkdir -p tests/results

torchrun --standalone --nproc_per_node=2 -m scripts.base_train \
    -- \
    --depth=$DEPTH \
    --num_iterations=$MAX_ITERATIONS \
    2>&1 | tee "$LOG_2GPU"
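# `tee` streams torchrun's output to the console while capturing it in the
# log, but it also masks torchrun's exit status; the PIPESTATUS check below
# recovers it. (A comment here is safe: only commands reset PIPESTATUS.)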
if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "ERROR: 2-GPU DDP test failed"
    exit 1
fi

# Verify rank 0 ran discovery
if ! grep -q "Running auto-discovery on rank 0" "$LOG_2GPU"; then
    echo "ERROR: No evidence of rank 0 running discovery"
    exit 1
fi

# Verify rank 1 received the batch size
if ! grep -q "Received batch size from rank 0\|device_batch_size=" "$LOG_2GPU"; then
    echo "ERROR: No evidence of rank 1 receiving batch size"
    exit 1
fi
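# Note: `\|` alternation in a basic regex is a GNU grep extension; the
# portable spelling is `grep -Eq 'Received batch size from rank 0|device_batch_size='`.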

# Extract the discovered batch size (first occurrence; ranks may log it separately)
BATCH_SIZE=$(grep "Auto-discovery found device_batch_size=" "$LOG_2GPU" | grep -oP 'device_batch_size=\K\d+' | head -1)

if [ -z "$BATCH_SIZE" ]; then
    echo "ERROR: Could not extract batch size"
    exit 1
fi
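# Note: `grep -oP` relies on GNU grep's PCRE support (`\K`); a more portable
# sketch of the same extraction would be:
#   sed -n 's/.*device_batch_size=\([0-9][0-9]*\).*/\1/p' "$LOG_2GPU" | head -1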

echo "✓ 2-GPU test passed! Discovered batch size: $BATCH_SIZE"

# Test with 4 GPUs if available
if [ "$NUM_GPUS" -ge 4 ]; then
    echo ""
    echo "Test 9: DDP Discovery (4 GPUs)"
    echo "----------------------------------------"
    LOG_4GPU="tests/results/test_ddp_4gpu.log"

    torchrun --standalone --nproc_per_node=4 -m scripts.base_train \
        -- \
        --depth=$DEPTH \
        --num_iterations=$MAX_ITERATIONS \
        2>&1 | tee "$LOG_4GPU"

    if [ "${PIPESTATUS[0]}" -ne 0 ]; then
        echo "ERROR: 4-GPU DDP test failed"
        exit 1
    fi

    # Verify discovery happened
    if ! grep -q "Auto-discovery found device_batch_size=" "$LOG_4GPU"; then
        echo "ERROR: No discovery message in 4-GPU log"
        exit 1
    fi

    BATCH_SIZE_4GPU=$(grep "Auto-discovery found device_batch_size=" "$LOG_4GPU" | grep -oP 'device_batch_size=\K\d+' | head -1)

    echo "✓ 4-GPU test passed! Discovered batch size: $BATCH_SIZE_4GPU"
else
    echo ""
    echo "SKIP: Test 9 (4 GPUs not available)"
fi
echo ""
|
|
echo "✓ All DDP tests passed!"
|
|
echo " - All ranks completed successfully"
|
|
echo " - No deadlocks or synchronization errors"
|
|
echo " - Batch size properly broadcast across ranks"
|