nanochat/tests/integration/test_ddp_discovery.sh

102 lines
2.8 KiB
Bash

#!/bin/bash
#
# Test 8 & 9: DDP Discovery Tests
# Tests auto-discovery in distributed (multi-GPU) settings
#
# Abort on the first unhandled command failure. pipefail is intentionally NOT
# enabled: the rest of this script inspects PIPESTATUS after `... | tee` and
# uses grep pipelines whose "no match" status must not terminate the run.
set -e

echo "=========================================="
echo "DDP Auto-Discovery Tests"
echo "=========================================="

# Check GPU availability.
#
# Without pipefail, a missing or failing nvidia-smi is invisible to `set -e`
# (the pipeline's status is that of its last stage), which would leave NUM_GPUS
# empty or holding an error message. A non-numeric value makes `[ ... -lt 2 ]`
# itself fail, so the skip branch would not be taken and the DDP tests would be
# launched on a machine with no usable GPUs. Anything that is not a plain
# non-negative integer is therefore treated as "0 GPUs".
NUM_GPUS=""
if command -v nvidia-smi >/dev/null 2>&1; then
  # The query prints one line per GPU; only the first line is needed.
  NUM_GPUS=$(nvidia-smi --query-gpu=count --format=csv,noheader | head -1 | tr -d '[:space:]')
fi
if ! [[ "$NUM_GPUS" =~ ^[0-9]+$ ]]; then
  echo "WARNING: could not determine GPU count from nvidia-smi; assuming 0" >&2
  NUM_GPUS=0
fi
echo "Detected $NUM_GPUS GPUs"

if [ "$NUM_GPUS" -lt 2 ]; then
  echo "SKIP: Need at least 2 GPUs for DDP tests"
  exit 0
fi
# Values forwarded to scripts.base_train as --depth and --num_iterations by
# every torchrun invocation in this script.
DEPTH=12
MAX_ITERATIONS=10

# ---------------------------------------------------------------------------
# Test 8: launch training with two processes and inspect the captured log.
# ---------------------------------------------------------------------------
echo ""
echo "Test 8: DDP Discovery (2 GPUs)"
echo "----------------------------------------"

mkdir -p tests/results
LOG_2GPU="tests/results/test_ddp_2gpu.log"

# stdout and stderr are merged, shown on the console and saved to the log.
torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
  --depth="$DEPTH" --num_iterations="$MAX_ITERATIONS" 2>&1 | tee "$LOG_2GPU"
# The pipeline's own status is tee's; PIPESTATUS[0] carries torchrun's.
[[ "${PIPESTATUS[0]}" -eq 0 ]] || {
  echo "ERROR: 2-GPU DDP test failed"
  exit 1
}

# The log must show that rank 0 performed the discovery step.
grep -q "Running auto-discovery on rank 0" "$LOG_2GPU" || {
  echo "ERROR: No evidence of rank 0 running discovery"
  exit 1
}

# The log must show the batch size reaching rank 1. The check is satisfied by
# either an explicit "Received batch size from rank 0" line or by any
# occurrence of "device_batch_size=" in the log.
grep -q "Received batch size from rank 0\|device_batch_size=" "$LOG_2GPU" || {
  echo "ERROR: No evidence of rank 1 receiving batch size"
  exit 1
}

# Take the first numeric value reported on an "Auto-discovery found" line.
BATCH_SIZE=$(
  grep "Auto-discovery found device_batch_size=" "$LOG_2GPU" \
    | grep -oP 'device_batch_size=\K\d+' \
    | head -1
)
[[ -n "$BATCH_SIZE" ]] || {
  echo "ERROR: Could not extract batch size"
  exit 1
}

echo "✓ 2-GPU test passed! Discovered batch size: $BATCH_SIZE"
#######################################
# Print the first device_batch_size value reported by auto-discovery.
# Arguments: $1 - path to a captured training log
# Outputs:   the numeric batch size on stdout; nothing if none is present
#######################################
extract_discovered_batch_size() {
  grep "Auto-discovery found device_batch_size=" "$1" \
    | grep -oP 'device_batch_size=\K\d+' \
    | head -1
}

# Test with 4 GPUs if available.
# NUM_GPUS is validated before the numeric comparison so that an empty or
# non-numeric value skips Test 9 cleanly instead of making `[` itself error.
if [[ "${NUM_GPUS:-}" =~ ^[0-9]+$ ]] && [ "$NUM_GPUS" -ge 4 ]; then
  echo ""
  echo "Test 9: DDP Discovery (4 GPUs)"
  echo "----------------------------------------"

  LOG_4GPU="tests/results/test_ddp_4gpu.log"
  # Do not depend on an earlier step having created the results directory.
  mkdir -p tests/results

  # stdout and stderr are merged, shown on the console and saved to the log.
  torchrun --standalone --nproc_per_node=4 -m scripts.base_train \
    -- \
    --depth="$DEPTH" \
    --num_iterations="$MAX_ITERATIONS" \
    2>&1 | tee "$LOG_4GPU"
  # The pipeline's own status is tee's; PIPESTATUS[0] carries torchrun's.
  if [ "${PIPESTATUS[0]}" -ne 0 ]; then
    echo "ERROR: 4-GPU DDP test failed"
    exit 1
  fi

  # Verify discovery happened
  if ! grep -q "Auto-discovery found device_batch_size=" "$LOG_4GPU"; then
    echo "ERROR: No discovery message in 4-GPU log"
    exit 1
  fi

  # The discovery line can be present without a numeric value after '='.
  # Fail in that case, as the 2-GPU test does, rather than reporting a pass
  # with a blank batch size.
  BATCH_SIZE_4GPU=$(extract_discovered_batch_size "$LOG_4GPU")
  if [ -z "$BATCH_SIZE_4GPU" ]; then
    echo "ERROR: Could not extract batch size"
    exit 1
  fi
  echo "✓ 4-GPU test passed! Discovered batch size: $BATCH_SIZE_4GPU"
else
  echo ""
  echo "SKIP: Test 9 (4 GPUs not available)"
fi

echo ""
echo "✓ All DDP tests passed!"
echo " - All ranks completed successfully"
echo " - No deadlocks or synchronization errors"
echo " - Batch size properly broadcast across ranks"