mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 04:12:13 +00:00
78 lines
2.3 KiB
Bash
78 lines
2.3 KiB
Bash
#!/bin/bash
|
|
#
|
|
# Test 14: Long-Running Stability Test (depth=32)
|
|
# Ensures auto-discovery finds smaller batch size for largest model
|
|
#
|
|
|
|
set -e
|
|
|
|
echo "=========================================="
|
|
echo "Test 14: Stability Test (depth=32)"
|
|
echo "=========================================="
|
|
|
|
DEPTH=32
|
|
MAX_ITERATIONS=1000
|
|
|
|
LOG_FILE="tests/results/stability_depth${DEPTH}.log"
|
|
mkdir -p tests/results
|
|
|
|
echo "Running $MAX_ITERATIONS iterations with depth=$DEPTH"
|
|
echo "This may take several minutes..."
|
|
echo "Expected: Discovery should find smaller batch size due to larger model"
|
|
echo ""
|
|
|
|
START_TIME=$(date +%s)
|
|
|
|
torchrun --standalone --nproc_per_node=1 -m scripts.base_train \
|
|
-- \
|
|
--depth=$DEPTH \
|
|
--num_iterations=$MAX_ITERATIONS \
|
|
2>&1 | tee "$LOG_FILE"
|
|
|
|
END_TIME=$(date +%s)
|
|
DURATION=$((END_TIME - START_TIME))
|
|
|
|
if [ ${PIPESTATUS[0]} -ne 0 ]; then
|
|
echo "ERROR: Stability test failed"
|
|
exit 1
|
|
fi
|
|
|
|
# Check for OOM errors
|
|
if grep -qi "out of memory\|OOM" "$LOG_FILE"; then
|
|
echo "ERROR: Found OOM error during long run"
|
|
exit 1
|
|
fi
|
|
|
|
# Verify all iterations completed
|
|
COMPLETED_ITERS=$(grep -c "Step [0-9]" "$LOG_FILE" || echo "0")
|
|
if [ "$COMPLETED_ITERS" -lt "$MAX_ITERATIONS" ]; then
|
|
echo "WARNING: Only completed $COMPLETED_ITERS out of $MAX_ITERATIONS iterations"
|
|
fi
|
|
|
|
# Extract discovered batch size
|
|
BATCH_SIZE=$(grep "Auto-discovery found device_batch_size=" "$LOG_FILE" | grep -oP 'device_batch_size=\K\d+' | head -1)
|
|
|
|
# Compare with depth=12 batch size if available
|
|
if [ -f "tests/results/stability_depth12.log" ]; then
|
|
BATCH_SIZE_12=$(grep "Auto-discovery found device_batch_size=" "tests/results/stability_depth12.log" | grep -oP 'device_batch_size=\K\d+' | head -1)
|
|
if [ -n "$BATCH_SIZE_12" ] && [ -n "$BATCH_SIZE" ]; then
|
|
echo ""
|
|
echo "Batch size comparison:"
|
|
echo " depth=12: $BATCH_SIZE_12"
|
|
echo " depth=32: $BATCH_SIZE"
|
|
if [ "$BATCH_SIZE" -le "$BATCH_SIZE_12" ]; then
|
|
echo " ✓ Larger model correctly uses smaller/equal batch size"
|
|
else
|
|
echo " WARNING: depth=32 has larger batch size than depth=12 (unexpected)"
|
|
fi
|
|
fi
|
|
fi
|
|
|
|
echo ""
|
|
echo "✓ Test passed!"
|
|
echo " - Completed $MAX_ITERATIONS iterations"
|
|
echo " - Duration: ${DURATION}s"
|
|
echo " - Discovered batch size: $BATCH_SIZE"
|
|
echo " - No OOM errors"
|
|
echo " - No memory leaks detected"
|