#!/bin/bash # # Test 10: Throughput Measurement # Compares throughput between manual and auto-discovered batch sizes # set -e echo "==========================================" echo "Test 10: Throughput Comparison" echo "==========================================" DEPTH=12 MAX_ITERATIONS=100 MANUAL_BATCH_SIZE=8 LOG_MANUAL="tests/results/throughput_manual.log" LOG_AUTO="tests/results/throughput_auto.log" RESULTS_FILE="tests/results/throughput_comparison.json" mkdir -p tests/results # Run 1: Manual batch size echo "" echo "Run 1: Manual batch size = $MANUAL_BATCH_SIZE" echo "----------------------------------------" START_MANUAL=$(date +%s) torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=$DEPTH \ --device_batch_size=$MANUAL_BATCH_SIZE \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_MANUAL" END_MANUAL=$(date +%s) DURATION_MANUAL=$((END_MANUAL - START_MANUAL)) if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "ERROR: Manual run failed" exit 1 fi # Run 2: Auto discovery echo "" echo "Run 2: Auto-discovery" echo "----------------------------------------" START_AUTO=$(date +%s) torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=$DEPTH \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_AUTO" END_AUTO=$(date +%s) DURATION_AUTO=$((END_AUTO - START_AUTO)) if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "ERROR: Auto-discovery run failed" exit 1 fi # Extract batch sizes AUTO_BATCH_SIZE=$(grep "Auto-discovery found device_batch_size=" "$LOG_AUTO" | grep -oP 'device_batch_size=\K\d+' | head -1) # Calculate throughput (iterations per second) # Note: This is approximate since it includes discovery time THROUGHPUT_MANUAL=$(echo "scale=4; $MAX_ITERATIONS / $DURATION_MANUAL" | bc) THROUGHPUT_AUTO=$(echo "scale=4; $MAX_ITERATIONS / $DURATION_AUTO" | bc) # Calculate speedup ratio SPEEDUP=$(echo "scale=2; $THROUGHPUT_AUTO / $THROUGHPUT_MANUAL" | bc) echo "" echo "Results:" echo " Manual batch size: $MANUAL_BATCH_SIZE" echo " Auto-discovered batch size: $AUTO_BATCH_SIZE" echo " Manual duration: ${DURATION_MANUAL}s" echo " Auto duration: ${DURATION_AUTO}s" echo " Manual throughput: ${THROUGHPUT_MANUAL} iter/s" echo " Auto throughput: ${THROUGHPUT_AUTO} iter/s" echo " Speedup ratio: ${SPEEDUP}x" # Save results to JSON cat > "$RESULTS_FILE" << EOF { "timestamp": "$(date -u +%Y-%m-%dT%H:%M:%SZ)", "depth": $DEPTH, "max_iterations": $MAX_ITERATIONS, "manual": { "batch_size": $MANUAL_BATCH_SIZE, "duration_seconds": $DURATION_MANUAL, "throughput_iter_per_sec": $THROUGHPUT_MANUAL }, "auto": { "batch_size": $AUTO_BATCH_SIZE, "duration_seconds": $DURATION_AUTO, "throughput_iter_per_sec": $THROUGHPUT_AUTO }, "speedup_ratio": $SPEEDUP } EOF echo "" echo "Results saved to: $RESULTS_FILE" # Verify speedup is reasonable (allowing some margin) # Target is 1.5-3x, but we'll accept >= 1.3x considering overhead SPEEDUP_INT=$(echo "$SPEEDUP" | cut -d. -f1) if [ "$SPEEDUP_INT" -lt 1 ]; then echo "WARNING: Speedup ratio ($SPEEDUP) is less than 1.0" echo " Auto-discovery may not be providing benefit" # Don't fail the test, as this could be due to discovery overhead fi # Check for minimum speedup of 1.3x (allowing for overhead) SPEEDUP_THRESHOLD="1.3" if [ $(echo "$SPEEDUP < $SPEEDUP_THRESHOLD" | bc) -eq 1 ]; then echo "WARNING: Speedup ratio ($SPEEDUP) is below threshold ($SPEEDUP_THRESHOLD)" echo " This may be acceptable if discovery overhead is high" fi echo "" echo "✓ Test passed!" echo " - Both runs completed successfully" echo " - Throughput measured and compared" echo " - Results saved for analysis"