#!/bin/bash # # Test 12: Long-Running Stability Test (depth=20) # Ensures auto-discovery remains stable over 1000 iterations with larger model # set -e echo "==========================================" echo "Test 12: Stability Test (depth=20)" echo "==========================================" DEPTH=20 MAX_ITERATIONS=1000 LOG_FILE="tests/results/stability_depth${DEPTH}.log" mkdir -p tests/results echo "Running $MAX_ITERATIONS iterations with depth=$DEPTH" echo "This may take several minutes..." echo "" START_TIME=$(date +%s) torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=$DEPTH \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_FILE" END_TIME=$(date +%s) DURATION=$((END_TIME - START_TIME)) if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "ERROR: Stability test failed" exit 1 fi # Check for OOM errors if grep -qi "out of memory\|OOM" "$LOG_FILE"; then echo "ERROR: Found OOM error during long run" exit 1 fi # Verify all iterations completed COMPLETED_ITERS=$(grep -c "Step [0-9]" "$LOG_FILE" || echo "0") if [ "$COMPLETED_ITERS" -lt "$MAX_ITERATIONS" ]; then echo "WARNING: Only completed $COMPLETED_ITERS out of $MAX_ITERATIONS iterations" fi # Extract discovered batch size BATCH_SIZE=$(grep "Auto-discovery found device_batch_size=" "$LOG_FILE" | grep -oP 'device_batch_size=\K\d+' | head -1) echo "" echo "✓ Test passed!" echo " - Completed $MAX_ITERATIONS iterations" echo " - Duration: ${DURATION}s" echo " - Discovered batch size: $BATCH_SIZE" echo " - No OOM errors" echo " - No memory leaks detected"