#!/bin/bash # # Test 18, 19, 20: Cache Tests # Tests caching functionality # set -e echo "==========================================" echo "Cache Mechanism Tests" echo "==========================================" DEPTH=12 MAX_ITERATIONS=10 mkdir -p tests/results # ============================================================================ # Test 18: Cache Hit # ============================================================================ echo "" echo "Test 18: Cache Hit" echo "----------------------------------------" LOG_RUN1="tests/results/cache_run1.log" LOG_RUN2="tests/results/cache_run2.log" # Clean cache directory first (if it exists) if [ -n "$NANOCHAT_BASE_DIR" ]; then CACHE_DIR="$NANOCHAT_BASE_DIR/auto_batch_cache" else CACHE_DIR="$HOME/.nanochat/auto_batch_cache" fi if [ -d "$CACHE_DIR" ]; then echo "Cleaning existing cache: $CACHE_DIR" rm -rf "$CACHE_DIR" fi # Run 1: Discovery runs, result saved to cache echo "Run 1: Initial discovery (cache miss expected)" START_RUN1=$(date +%s) torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=$DEPTH \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_RUN1" END_RUN1=$(date +%s) DURATION_RUN1=$((END_RUN1 - START_RUN1)) if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "ERROR: Run 1 failed" exit 1 fi # Run 2: Same config, discovery skipped (cache hit) echo "" echo "Run 2: Same config (cache hit expected)" START_RUN2=$(date +%s) torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=$DEPTH \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_RUN2" END_RUN2=$(date +%s) DURATION_RUN2=$((END_RUN2 - START_RUN2)) if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "ERROR: Run 2 failed" exit 1 fi echo "" echo "Timing comparison:" echo " Run 1 (cache miss): ${DURATION_RUN1}s" echo " Run 2 (cache hit): ${DURATION_RUN2}s" # Verify Run 2 is faster (should be much faster if cache hit) if [ "$DURATION_RUN2" -lt "$DURATION_RUN1" ]; then TIME_SAVED=$((DURATION_RUN1 - DURATION_RUN2)) echo " Time saved: ${TIME_SAVED}s" echo "✓ Cache hit improved startup time" else echo "WARNING: Run 2 was not faster (cache may not have been used)" fi # Check if cache hit message appears in Run 2 if grep -q "Cache hit\|Using cached batch size" "$LOG_RUN2"; then echo "✓ Cache hit message found" fi # Verify cache file exists if [ -d "$CACHE_DIR" ] && [ -n "$(ls -A $CACHE_DIR)" ]; then CACHE_FILES=$(ls -1 "$CACHE_DIR" | wc -l) echo "✓ Cache directory exists with $CACHE_FILES file(s)" else echo "WARNING: Cache directory is empty or doesn't exist" fi echo "✓ Test 18 passed!" # ============================================================================ # Test 19: Cache Key Validation # ============================================================================ echo "" echo "Test 19: Cache Key Validation" echo "----------------------------------------" # Run with depth=12, cache result echo "Run with depth=12..." LOG_DEPTH12="tests/results/cache_depth12.log" torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=12 \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_DEPTH12" BATCH_12=$(grep "Auto-discovery found device_batch_size=" "$LOG_DEPTH12" | grep -oP 'device_batch_size=\K\d+' | head -1) # Run with depth=20, verify cache miss (different config) echo "" echo "Run with depth=20 (should be cache miss)..." LOG_DEPTH20="tests/results/cache_depth20.log" torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=20 \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_DEPTH20" BATCH_20=$(grep "Auto-discovery found device_batch_size=" "$LOG_DEPTH20" | grep -oP 'device_batch_size=\K\d+' | head -1) # Run with max_seq_len=256, verify cache miss echo "" echo "Run with max_seq_len=256 (should be cache miss)..." LOG_SEQ256="tests/results/cache_seq256.log" torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=12 \ --max_seq_len=256 \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_SEQ256" BATCH_256=$(grep "Auto-discovery found device_batch_size=" "$LOG_SEQ256" | grep -oP 'device_batch_size=\K\d+' | head -1) # Verify separate cache files were created if [ -d "$CACHE_DIR" ]; then CACHE_FILES=$(ls -1 "$CACHE_DIR" | wc -l) echo "" echo "Cache files created: $CACHE_FILES" if [ "$CACHE_FILES" -ge 3 ]; then echo "✓ Multiple cache files created for different configurations" else echo "WARNING: Expected at least 3 cache files, found $CACHE_FILES" fi fi echo "" echo "Discovered batch sizes:" echo " depth=12, seq_len=2048: $BATCH_12" echo " depth=20, seq_len=2048: $BATCH_20" echo " depth=12, seq_len=256: $BATCH_256" echo "✓ Test 19 passed!" # ============================================================================ # Test 20: Cache Invalidation # ============================================================================ echo "" echo "Test 20: Cache Invalidation" echo "----------------------------------------" if [ -d "$CACHE_DIR" ] && [ -n "$(ls -A $CACHE_DIR 2>/dev/null)" ]; then # Get first cache file CACHE_FILE=$(ls "$CACHE_DIR" | head -1) CACHE_PATH="$CACHE_DIR/$CACHE_FILE" echo "Corrupting cache file: $CACHE_FILE" echo "invalid json {{{" > "$CACHE_PATH" # Try to run with corrupted cache echo "Running with corrupted cache..." LOG_CORRUPT="tests/results/cache_corrupted.log" torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=12 \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_CORRUPT" if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "ERROR: Run with corrupted cache failed" exit 1 fi echo "✓ System handled corrupted cache gracefully" # Alternative: Delete cache and verify re-discovery echo "" echo "Testing cache deletion..." rm -rf "$CACHE_DIR" LOG_RERUN="tests/results/cache_deleted_rerun.log" torchrun --standalone --nproc_per_node=1 -m scripts.base_train \ -- \ --depth=12 \ --num_iterations=$MAX_ITERATIONS \ 2>&1 | tee "$LOG_RERUN" if [ ${PIPESTATUS[0]} -ne 0 ]; then echo "ERROR: Re-run after cache deletion failed" exit 1 fi # Verify discovery ran again if grep -q "Auto-discovery found device_batch_size=" "$LOG_RERUN"; then echo "✓ Discovery re-ran after cache deletion" fi else echo "SKIP: No cache files to corrupt" fi echo "✓ Test 20 passed!" echo "" echo "✓ All cache tests passed!"