nanochat/benchmark_before_after.py

#!/usr/bin/env python3
"""
Benchmark script to measure the actual speedup from optimizations.
Compares your current optimized version against baseline metrics.

Usage:
  python benchmark_before_after.py

This will test:
  1. Inference speed (tokens/sec) - KV-cache impact
  2. Training throughput estimation - Auto batch size + torch.compile
"""

import torch
import time
import os
import sys

print("=" * 80)
print("OPTIMIZATION BENCHMARK - Measuring Actual Speedup")
print("=" * 80)

# Test 1: KV-Cache Inference Speed
print("\n[TEST 1] Inference Speed (KV-Cache Optimization)")
print("-" * 80)

try:
    from nanochat.gpt import GPT, GPTConfig
    from nanochat.tokenizer import get_tokenizer

    # Create a small test model
    print("Creating test model (d12 - small for quick testing)...")
    config = GPTConfig(
        n_layer=12,
        n_head=12,
        n_embd=768,
        vocab_size=65536,
        sequence_len=2048
    )

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = GPT(config).to(device)
    model.eval()

    print(f"Model created on {device}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

    # Test generation speed
    prompt_tokens = list(range(100))  # 100 token prompt
    max_new_tokens = 100

    print(f"\nGenerating {max_new_tokens} tokens with {len(prompt_tokens)} token prompt...")

    # Warmup
    list(model.generate(prompt_tokens[:10], max_tokens=5))

    # Actual benchmark
    torch.cuda.synchronize() if torch.cuda.is_available() else None
    start = time.time()

    tokens_generated = 0
    for token in model.generate(prompt_tokens, max_tokens=max_new_tokens):
        tokens_generated += 1

    torch.cuda.synchronize() if torch.cuda.is_available() else None
    elapsed = time.time() - start

    tokens_per_sec = tokens_generated / elapsed

    print(f"\n✅ Generated {tokens_generated} tokens in {elapsed:.2f}s")
    print(f"✅ Speed: {tokens_per_sec:.1f} tokens/second")
    print(f"\nExpected speedup from KV-cache: 5-10×")
    print(f"  - Without KV-cache (baseline): ~10-20 tok/s")
    print(f"  - With KV-cache (optimized): ~50-200 tok/s")

    if tokens_per_sec > 30:
        print(f"✅ Your implementation: {tokens_per_sec:.1f} tok/s - KV-cache is working!")
    else:
        print(f"⚠️  Your implementation: {tokens_per_sec:.1f} tok/s - might not be optimal")

except Exception as e:
    print(f"❌ Inference test failed: {e}")
    import traceback
    traceback.print_exc()

# Test 2: Auto Batch Size Discovery
print("\n" + "=" * 80)
print("[TEST 2] Auto Batch Size Discovery")
print("-" * 80)

try:
    from nanochat.auto_batch_size import find_optimal_device_batch_size
    from nanochat.gpt import GPT, GPTConfig

    print("Testing auto batch size discovery...")

    # Create a test model
    config = GPTConfig(n_layer=12, n_head=12, n_embd=768, vocab_size=65536)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = GPT(config).to(device)

    # Define sample data function
    def data_sample_fn(batch_size):
        return (
            torch.randint(0, 65536, (batch_size, 512), device=device),
            torch.randint(0, 65536, (batch_size, 512), device=device)
        )

    print("\nRunning discovery (this may take 30-60 seconds)...")
    discovered_bs = find_optimal_device_batch_size(
        model=model,
        max_seq_len=512,
        total_batch_size=256,
        ddp_world_size=1,
        data_sample_fn=data_sample_fn,
        safety_margin=0.85,
        enable_cache=False,
        ddp_rank=0
    )

    print(f"\n✅ Discovered optimal batch size: {discovered_bs}")
    print(f"\nExpected improvement:")
    print(f"  - Manual tuning (baseline): Usually conservative, ~40-60% GPU utilization")
    print(f"  - Auto-discovery (optimized): ~90-95% GPU utilization")
    print(f"  - Expected speedup: 2-3×")

    if discovered_bs >= 8:
        print(f"✅ Batch size {discovered_bs} looks good for this GPU!")
    else:
        print(f"⚠️  Batch size {discovered_bs} seems low - might be an issue")

except Exception as e:
    print(f"❌ Auto batch size test failed: {e}")
    import traceback
    traceback.print_exc()

# Test 3: torch.compile status check
print("\n" + "=" * 80)
print("[TEST 3] torch.compile Configuration")
print("-" * 80)

try:
    with open('scripts/chat_sft.py', 'r') as f:
        sft_content = f.read()

    if 'torch.compile(model, dynamic=False)' in sft_content:
        print("✅ torch.compile is enabled with dynamic=False")
        print("✅ Expected speedup: 1.5× for SFT training")
    elif 'torch.compile' in sft_content and '# model = torch.compile' not in sft_content:
        print("✅ torch.compile is enabled")
        print("⚠️  But dynamic=False might not be set")
    else:
        print("❌ torch.compile appears to be disabled")

    if 'ncols = max_seq_len - 1' in sft_content:
        print("✅ Fixed-length padding enabled (required for torch.compile)")
    else:
        print("❌ Fixed-length padding not found")

except Exception as e:
    print(f"❌ Could not check torch.compile: {e}")

# Summary
print("\n" + "=" * 80)
print("BENCHMARK SUMMARY")
print("=" * 80)
print("""
To measure full improvement on actual training:

1. BEFORE (your previous 4-GPU run):
   - Note: Training time, tokens/sec from logs

2. AFTER (this optimized run):
   - Run: speedrun_4gpu.sh on same 4 GPUs
   - Compare: Training time, tokens/sec

Expected combined improvements:
  ✓ Training: 3-4.5× faster (auto batch size + torch.compile)
  ✓ Inference: 5-10× faster (KV-cache)
  ✓ Quality: Better diversity (token broadcasting fix)

Key metrics to track in logs:
  - "tokens/sec" during base_train
  - "step/sec" or "it/s" during training
  - Total wall clock time at the end
  - Inference speed during chat/generation
""")
print("=" * 80)