mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 04:12:13 +00:00
71 lines
2.0 KiB
Bash
#!/bin/bash
#
# Test 6: Basic Discovery Run
# Tests that auto-discovery completes successfully on a single GPU
#
# Steps:
#   1. Run scripts.base_train under torchrun (one process) with a time limit,
#      mirroring its combined stdout/stderr into LOG_FILE.
#   2. Fail if the training command itself failed or timed out.
#   3. Fail if the log lacks the discovery message or contains an OOM marker.
#   4. Fail if the discovered batch size is missing or out of range.
#
# Exit status: 0 when every check passes, 1 otherwise.
#

set -eu # Exit on error and on use of an unset variable

echo "=========================================="
echo "Test 6: Basic Discovery Run (Single GPU)"
echo "=========================================="

# Configuration
DEPTH=12
MAX_ITERATIONS=10
TIMEOUT=30 # seconds
MIN_BATCH_SIZE=1
MAX_BATCH_SIZE=128

# Log line prefix that the checks below look for.
DISCOVERY_MARKER="Auto-discovery found device_batch_size="

# Output log file
LOG_FILE="tests/results/test_single_gpu_discovery.log"
mkdir -p "${LOG_FILE%/*}"

# Run the training script with auto-discovery.
# The command is built once as an array so the "Running:" line always shows
# exactly what is executed. (Previously the message advertised
# --auto_batch_size=True and --max_iterations, neither of which was passed.)
# NOTE(review): if auto-discovery must be enabled explicitly, add the flag to
# TRAIN_CMD -- confirm against scripts.base_train.
TRAIN_CMD=(
    torchrun --standalone --nproc_per_node=1 -m scripts.base_train
    --
    "--depth=$DEPTH"
    "--num_iterations=$MAX_ITERATIONS"
)
echo "Running: ${TRAIN_CMD[*]}"

# pipefail is intentionally off: the pipeline's own status is tee's, so
# `set -e` does not abort here and the checks below can report the failure.
timeout "$TIMEOUT" "${TRAIN_CMD[@]}" 2>&1 | tee "$LOG_FILE"

# Check exit code of the training command (first pipeline stage), not tee's.
# PIPESTATUS must be read immediately, before any other command runs.
EXIT_CODE=${PIPESTATUS[0]}
if [ "$EXIT_CODE" -eq 124 ]; then
    # 124 is the status `timeout` returns when the time limit was hit.
    echo "ERROR: Training script timed out after ${TIMEOUT}s"
    exit 1
fi
if [ "$EXIT_CODE" -ne 0 ]; then
    echo "ERROR: Training script failed with exit code $EXIT_CODE"
    exit 1
fi

# Verify log contains discovery message
if ! grep -qF -- "$DISCOVERY_MARKER" "$LOG_FILE"; then
    echo "ERROR: Log does not contain '$DISCOVERY_MARKER'"
    echo "This suggests auto-discovery was not triggered"
    exit 1
fi

# Verify no OOM errors.
# NOTE(review): this is a case-insensitive substring match, so any word
# containing "oom" (e.g. "room") also trips it -- tighten if that shows up.
if grep -qiE 'out of memory|OOM' "$LOG_FILE"; then
    echo "ERROR: Found OOM error in log"
    exit 1
fi

# Extract discovered batch size (digits following the marker, first match).
BATCH_SIZE=$(grep -F -- "$DISCOVERY_MARKER" "$LOG_FILE" \
    | grep -oP 'device_batch_size=\K\d+' \
    | head -n 1)
echo "Discovered batch size: $BATCH_SIZE"

# Verify batch size is reasonable
if [ -z "$BATCH_SIZE" ]; then
    echo "ERROR: Could not extract batch size from log"
    exit 1
fi

# `[ ... ]` compares as decimal, so a value with a leading zero is not
# misread as octal the way it would be inside (( )).
if [ "$BATCH_SIZE" -lt "$MIN_BATCH_SIZE" ] || [ "$BATCH_SIZE" -gt "$MAX_BATCH_SIZE" ]; then
    echo "ERROR: Batch size $BATCH_SIZE is outside reasonable range [$MIN_BATCH_SIZE, $MAX_BATCH_SIZE]"
    exit 1
fi

echo "✓ Test passed!"
echo " - Discovery completed successfully"
echo " - Found batch size: $BATCH_SIZE"
echo " - No OOM errors"
echo " - Training completed $MAX_ITERATIONS iterations"