mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 04:12:13 +00:00
61 lines
1.6 KiB
Bash
61 lines
1.6 KiB
Bash
#!/bin/bash
|
|
#
|
|
# Test 12: Long-Running Stability Test (depth=20)
|
|
# Ensures auto-discovery remains stable over 1000 iterations with larger model
|
|
#
|
|
|
|
set -e
|
|
|
|
echo "=========================================="
|
|
echo "Test 12: Stability Test (depth=20)"
|
|
echo "=========================================="
|
|
|
|
DEPTH=20
|
|
MAX_ITERATIONS=1000
|
|
|
|
LOG_FILE="tests/results/stability_depth${DEPTH}.log"
|
|
mkdir -p tests/results
|
|
|
|
echo "Running $MAX_ITERATIONS iterations with depth=$DEPTH"
|
|
echo "This may take several minutes..."
|
|
echo ""
|
|
|
|
START_TIME=$(date +%s)
|
|
|
|
torchrun --standalone --nproc_per_node=1 -m scripts.base_train \
|
|
-- \
|
|
--depth=$DEPTH \
|
|
--num_iterations=$MAX_ITERATIONS \
|
|
2>&1 | tee "$LOG_FILE"
|
|
|
|
END_TIME=$(date +%s)
|
|
DURATION=$((END_TIME - START_TIME))
|
|
|
|
if [ ${PIPESTATUS[0]} -ne 0 ]; then
|
|
echo "ERROR: Stability test failed"
|
|
exit 1
|
|
fi
|
|
|
|
# Check for OOM errors
|
|
if grep -qi "out of memory\|OOM" "$LOG_FILE"; then
|
|
echo "ERROR: Found OOM error during long run"
|
|
exit 1
|
|
fi
|
|
|
|
# Verify all iterations completed
|
|
COMPLETED_ITERS=$(grep -c "Step [0-9]" "$LOG_FILE" || echo "0")
|
|
if [ "$COMPLETED_ITERS" -lt "$MAX_ITERATIONS" ]; then
|
|
echo "WARNING: Only completed $COMPLETED_ITERS out of $MAX_ITERATIONS iterations"
|
|
fi
|
|
|
|
# Extract discovered batch size
|
|
BATCH_SIZE=$(grep "Auto-discovery found device_batch_size=" "$LOG_FILE" | grep -oP 'device_batch_size=\K\d+' | head -1)
|
|
|
|
echo ""
|
|
echo "✓ Test passed!"
|
|
echo " - Completed $MAX_ITERATIONS iterations"
|
|
echo " - Duration: ${DURATION}s"
|
|
echo " - Discovered batch size: $BATCH_SIZE"
|
|
echo " - No OOM errors"
|
|
echo " - No memory leaks detected"
|