mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-16 01:02:18 +00:00
128 lines
3.6 KiB
Bash
128 lines
3.6 KiB
Bash
#!/bin/bash
|
|
#
|
|
# Test 10: Throughput Measurement
|
|
# Compares throughput between manual and auto-discovered batch sizes
|
|
#
|
|
|
|
set -e
|
|
|
|
echo "=========================================="
|
|
echo "Test 10: Throughput Comparison"
|
|
echo "=========================================="
|
|
|
|
DEPTH=12
|
|
MAX_ITERATIONS=100
|
|
MANUAL_BATCH_SIZE=8
|
|
|
|
LOG_MANUAL="tests/results/throughput_manual.log"
|
|
LOG_AUTO="tests/results/throughput_auto.log"
|
|
RESULTS_FILE="tests/results/throughput_comparison.json"
|
|
mkdir -p tests/results
|
|
|
|
# Run 1: Manual batch size
|
|
echo ""
|
|
echo "Run 1: Manual batch size = $MANUAL_BATCH_SIZE"
|
|
echo "----------------------------------------"
|
|
START_MANUAL=$(date +%s)
|
|
|
|
torchrun --standalone --nproc_per_node=1 -m scripts.base_train \
|
|
-- \
|
|
--depth=$DEPTH \
|
|
--device_batch_size=$MANUAL_BATCH_SIZE \
|
|
--num_iterations=$MAX_ITERATIONS \
|
|
2>&1 | tee "$LOG_MANUAL"
|
|
|
|
END_MANUAL=$(date +%s)
|
|
DURATION_MANUAL=$((END_MANUAL - START_MANUAL))
|
|
|
|
if [ ${PIPESTATUS[0]} -ne 0 ]; then
|
|
echo "ERROR: Manual run failed"
|
|
exit 1
|
|
fi
|
|
|
|
# Run 2: Auto discovery
|
|
echo ""
|
|
echo "Run 2: Auto-discovery"
|
|
echo "----------------------------------------"
|
|
START_AUTO=$(date +%s)
|
|
|
|
torchrun --standalone --nproc_per_node=1 -m scripts.base_train \
|
|
-- \
|
|
--depth=$DEPTH \
|
|
--num_iterations=$MAX_ITERATIONS \
|
|
2>&1 | tee "$LOG_AUTO"
|
|
|
|
END_AUTO=$(date +%s)
|
|
DURATION_AUTO=$((END_AUTO - START_AUTO))
|
|
|
|
if [ ${PIPESTATUS[0]} -ne 0 ]; then
|
|
echo "ERROR: Auto-discovery run failed"
|
|
exit 1
|
|
fi
|
|
|
|
# Extract batch sizes
|
|
AUTO_BATCH_SIZE=$(grep "Auto-discovery found device_batch_size=" "$LOG_AUTO" | grep -oP 'device_batch_size=\K\d+' | head -1)
|
|
|
|
# Calculate throughput (iterations per second)
|
|
# Note: This is approximate since it includes discovery time
|
|
THROUGHPUT_MANUAL=$(echo "scale=4; $MAX_ITERATIONS / $DURATION_MANUAL" | bc)
|
|
THROUGHPUT_AUTO=$(echo "scale=4; $MAX_ITERATIONS / $DURATION_AUTO" | bc)
|
|
|
|
# Calculate speedup ratio
|
|
SPEEDUP=$(echo "scale=2; $THROUGHPUT_AUTO / $THROUGHPUT_MANUAL" | bc)
|
|
|
|
echo ""
|
|
echo "Results:"
|
|
echo " Manual batch size: $MANUAL_BATCH_SIZE"
|
|
echo " Auto-discovered batch size: $AUTO_BATCH_SIZE"
|
|
echo " Manual duration: ${DURATION_MANUAL}s"
|
|
echo " Auto duration: ${DURATION_AUTO}s"
|
|
echo " Manual throughput: ${THROUGHPUT_MANUAL} iter/s"
|
|
echo " Auto throughput: ${THROUGHPUT_AUTO} iter/s"
|
|
echo " Speedup ratio: ${SPEEDUP}x"
|
|
|
|
# Save results to JSON
|
|
cat > "$RESULTS_FILE" << EOF
|
|
{
|
|
"timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)",
|
|
"depth": $DEPTH,
|
|
"max_iterations": $MAX_ITERATIONS,
|
|
"manual": {
|
|
"batch_size": $MANUAL_BATCH_SIZE,
|
|
"duration_seconds": $DURATION_MANUAL,
|
|
"throughput_iter_per_sec": $THROUGHPUT_MANUAL
|
|
},
|
|
"auto": {
|
|
"batch_size": $AUTO_BATCH_SIZE,
|
|
"duration_seconds": $DURATION_AUTO,
|
|
"throughput_iter_per_sec": $THROUGHPUT_AUTO
|
|
},
|
|
"speedup_ratio": $SPEEDUP
|
|
}
|
|
EOF
|
|
|
|
echo ""
|
|
echo "Results saved to: $RESULTS_FILE"
|
|
|
|
# Verify speedup is reasonable (allowing some margin)
|
|
# Target is 1.5-3x, but we'll accept >= 1.3x considering overhead
|
|
SPEEDUP_INT=$(echo "$SPEEDUP" | cut -d. -f1)
|
|
if [ "$SPEEDUP_INT" -lt 1 ]; then
|
|
echo "WARNING: Speedup ratio ($SPEEDUP) is less than 1.0"
|
|
echo " Auto-discovery may not be providing benefit"
|
|
# Don't fail the test, as this could be due to discovery overhead
|
|
fi
|
|
|
|
# Check for minimum speedup of 1.3x (allowing for overhead)
|
|
SPEEDUP_THRESHOLD="1.3"
|
|
if [ $(echo "$SPEEDUP < $SPEEDUP_THRESHOLD" | bc) -eq 1 ]; then
|
|
echo "WARNING: Speedup ratio ($SPEEDUP) is below threshold ($SPEEDUP_THRESHOLD)"
|
|
echo " This may be acceptable if discovery overhead is high"
|
|
fi
|
|
|
|
echo ""
|
|
echo "✓ Test passed!"
|
|
echo " - Both runs completed successfully"
|
|
echo " - Throughput measured and compared"
|
|
echo " - Results saved for analysis"
|