mirror of https://github.com/karpathy/nanochat.git
synced 2025-12-06 04:12:13 +00:00

benchmark for optimisations

This commit is contained in:
parent a6efa53b92
commit 4528ecc97f
@@ -1,7 +1,8 @@
 {
   "permissions": {
     "allow": [
-      "Bash(python:*)"
+      "Bash(python:*)",
+      "Bash(rm:*)"
     ],
     "deny": [],
     "ask": []
188 benchmark_before_after.py Normal file
@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Benchmark script to measure the actual speedup from optimizations.
Compares your current optimized version against baseline metrics.

Usage:
    python benchmark_before_after.py

This will test:
1. Inference speed (tokens/sec) - KV-cache impact
2. Training throughput estimation - Auto batch size + torch.compile
"""

import torch
import time
import os
import sys

print("=" * 80)
print("OPTIMIZATION BENCHMARK - Measuring Actual Speedup")
print("=" * 80)

# Test 1: KV-Cache Inference Speed
print("\n[TEST 1] Inference Speed (KV-Cache Optimization)")
print("-" * 80)

try:
    from nanochat.gpt import GPT, GPTConfig
    from nanochat.tokenizer import get_tokenizer

    # Create a small test model
    print("Creating test model (d12 - small for quick testing)...")
    config = GPTConfig(
        n_layer=12,
        n_head=12,
        n_embd=768,
        vocab_size=65536,
        sequence_len=2048
    )

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = GPT(config).to(device)
    model.eval()

    print(f"Model created on {device}")
    print(f"Parameters: {sum(p.numel() for p in model.parameters()) / 1e6:.1f}M")

    # Test generation speed
    prompt_tokens = list(range(100))  # 100 token prompt
    max_new_tokens = 100

    print(f"\nGenerating {max_new_tokens} tokens with {len(prompt_tokens)} token prompt...")

    # Warmup
    list(model.generate(prompt_tokens[:10], max_tokens=5))

    # Actual benchmark
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    start = time.time()

    tokens_generated = 0
    for token in model.generate(prompt_tokens, max_tokens=max_new_tokens):
        tokens_generated += 1

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    elapsed = time.time() - start

    tokens_per_sec = tokens_generated / elapsed

    print(f"\n✅ Generated {tokens_generated} tokens in {elapsed:.2f}s")
    print(f"✅ Speed: {tokens_per_sec:.1f} tokens/second")
    print("\nExpected speedup from KV-cache: 5-10×")
    print("  - Without KV-cache (baseline): ~10-20 tok/s")
    print("  - With KV-cache (optimized): ~50-200 tok/s")

    if tokens_per_sec > 30:
        print(f"✅ Your implementation: {tokens_per_sec:.1f} tok/s - KV-cache is working!")
    else:
        print(f"⚠️ Your implementation: {tokens_per_sec:.1f} tok/s - might not be optimal")

except Exception as e:
    print(f"❌ Inference test failed: {e}")
    import traceback
    traceback.print_exc()

# Test 2: Auto Batch Size Discovery
print("\n" + "=" * 80)
print("[TEST 2] Auto Batch Size Discovery")
print("-" * 80)

try:
    from nanochat.auto_batch_size import find_optimal_device_batch_size
    from nanochat.gpt import GPT, GPTConfig

    print("Testing auto batch size discovery...")

    # Create a test model
    config = GPTConfig(n_layer=12, n_head=12, n_embd=768, vocab_size=65536)
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = GPT(config).to(device)

    # Define sample data function
    def data_sample_fn(batch_size):
        return (
            torch.randint(0, 65536, (batch_size, 512), device=device),
            torch.randint(0, 65536, (batch_size, 512), device=device)
        )

    print("\nRunning discovery (this may take 30-60 seconds)...")
    discovered_bs = find_optimal_device_batch_size(
        model=model,
        max_seq_len=512,
        total_batch_size=256,
        ddp_world_size=1,
        data_sample_fn=data_sample_fn,
        safety_margin=0.85,
        enable_cache=False,
        ddp_rank=0
    )

    print(f"\n✅ Discovered optimal batch size: {discovered_bs}")
    print("\nExpected improvement:")
    print("  - Manual tuning (baseline): Usually conservative, ~40-60% GPU utilization")
    print("  - Auto-discovery (optimized): ~90-95% GPU utilization")
    print("  - Expected speedup: 2-3×")

    if discovered_bs >= 8:
        print(f"✅ Batch size {discovered_bs} looks good for this GPU!")
    else:
        print(f"⚠️ Batch size {discovered_bs} seems low - might be an issue")

except Exception as e:
    print(f"❌ Auto batch size test failed: {e}")
    import traceback
    traceback.print_exc()

# Test 3: torch.compile status check
print("\n" + "=" * 80)
print("[TEST 3] torch.compile Configuration")
print("-" * 80)

try:
    with open('scripts/chat_sft.py', 'r') as f:
        sft_content = f.read()

    if 'torch.compile(model, dynamic=False)' in sft_content:
        print("✅ torch.compile is enabled with dynamic=False")
        print("✅ Expected speedup: 1.5× for SFT training")
    elif 'torch.compile' in sft_content and '# model = torch.compile' not in sft_content:
        print("✅ torch.compile is enabled")
        print("⚠️ But dynamic=False might not be set")
    else:
        print("❌ torch.compile appears to be disabled")

    if 'ncols = max_seq_len - 1' in sft_content:
        print("✅ Fixed-length padding enabled (required for torch.compile)")
    else:
        print("❌ Fixed-length padding not found")

except Exception as e:
    print(f"❌ Could not check torch.compile: {e}")

# Summary
print("\n" + "=" * 80)
print("BENCHMARK SUMMARY")
print("=" * 80)
print("""
To measure full improvement on actual training:

1. BEFORE (your previous 4-GPU run):
   - Note: Training time, tokens/sec from logs

2. AFTER (this optimized run):
   - Run: speedrun_4gpu.sh on same 4 GPUs
   - Compare: Training time, tokens/sec

Expected combined improvements:
✓ Training: 3-4.5× faster (auto batch size + torch.compile)
✓ Inference: 5-10× faster (KV-cache)
✓ Quality: Better diversity (token broadcasting fix)

Key metrics to track in logs:
- "tokens/sec" during base_train
- "step/sec" or "it/s" during training
- Total wall clock time at the end
- Inference speed during chat/generation
""")
print("=" * 80)
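
Test 2 above imports find_optimal_device_batch_size from nanochat.auto_batch_size, a module whose body does not appear in this diff. As context, here is a minimal sketch of how such a discovery routine typically works: grow the per-device batch size geometrically until a training step runs out of GPU memory, back off by a safety margin, then round down so it divides the per-device share of the total batch. The helper name step_fn and the exact back-off policy are assumptions for illustration, not the real module's API.

import torch

def discover_device_batch_size(step_fn, total_batch_size, ddp_world_size=1,
                               safety_margin=0.85, start=1):
    # step_fn(batch_size) runs one forward+backward pass at that batch size
    # and raises torch.cuda.OutOfMemoryError if it does not fit on the device.
    max_per_device = max(1, total_batch_size // ddp_world_size)
    largest_ok = 0
    candidate = start
    while candidate <= max_per_device:
        try:
            step_fn(candidate)
            largest_ok = candidate
            candidate *= 2  # grow geometrically until we hit the memory wall
        except torch.cuda.OutOfMemoryError:
            torch.cuda.empty_cache()
            break
    # back off by a safety margin so memory spikes later in training still fit
    chosen = max(1, int(largest_ok * safety_margin))
    # shrink until it divides the per-device share evenly, so gradient
    # accumulation comes out to a whole number of micro-steps
    while chosen > 1 and max_per_device % chosen != 0:
        chosen -= 1
    return chosen

With a real model, step_fn would build a batch (for example via the data_sample_fn defined in Test 2) and run the loss backward; the returned value then plays the role of --device_batch_size in the training scripts.
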
94 run1000.sh
@@ -1,94 +0,0 @@
# The $1000 tier of nanochat
# Designed to run end-to-end for $1000/24 ~= 41.6 hours on an 8XH100 node
# A bit sparser on comments, see speedrun.sh for more detail

# all the setup stuff
export OMP_NUM_THREADS=1
NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p $NANOCHAT_BASE_DIR
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
[ -d ".venv" ] || uv venv
uv sync
source .venv/bin/activate
if [ -z "$WANDB_RUN" ]; then
    WANDB_RUN=dummy
fi
python -m nanochat.report reset
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
    unzip -q eval_bundle.zip
    rm eval_bundle.zip
    mv eval_bundle $NANOCHAT_BASE_DIR
fi

# train tokenizer on ~4B characters and kick off download of the rest for pretraining
python -m nanochat.dataset -n 16
# start downloading the rest of the shards for a total of 800 (see below why 800)
python -m nanochat.dataset -n 800 &
# todo: download the rest of it
python -m scripts.tok_train --max_chars=4000000000
python -m scripts.tok_eval

# Documenting my process for determining the hyperparameters for this run1000.sh script:
# We want a budget of approx. $1000 ~= 41.6 hours of 8XH100 compute
# 1) I guessed the model size for this to be about depth=32
# 2) Determine the device_batch_size that fits:
# Running the base_train.py script with --depth=32, I saw that --device_batch_size=16
# runs out of memory, but --device_batch_size=8 fits. Inspecting `nvidia-smi` during training,
# I saw all GPUs were at about 78/80GB VRAM, so it just barely fits and we have good MFU at ~50%.
# So the training script was running ok and showed:
# Vocab size: 65,536
# num_layers: 32
# model_dim: 2048
# num_heads: 16
# num_kv_heads: 16
# Tokens / micro-batch / rank: 8 x 2048 = 16,384
# Tokens / micro-batch: 131,072
# Total batch size 524,288 => gradient accumulation steps: 4
# Number of parameters: 1,879,048,192
# Estimated FLOPs per token: 1.207960e+10
# Calculated number of iterations from target data:param ratio: 71,680
# Total number of training tokens: 37,580,963,840
# Tokens : Params ratio: 20.00
# Total training FLOPs estimate: 4.539628e+20
# step 00004/71680 (0.01%) | loss: 8.813754 | lrm: 1.00 | dt: 1571.88ms | tok/sec: 83,385 | mfu: 50.92 | total time: 0.00m
# step 00005/71680 (0.01%) | loss: 8.488074 | lrm: 1.00 | dt: 1572.76ms | tok/sec: 83,338 | mfu: 50.89 | total time: 0.00m
# ...
# 3) validate that the runtime fits our budget:
# The training script uses the Chinchilla scaling law to compute-optimally set #tokens = 20 * #params. In particular:
# The script shows that we will be training for 71,680 steps, and each step takes 1.574s so:
# estimated time to train: 71,680 * 1.574s / 60 / 60 = 31.3 hours.
# This is OK, fits our budget, and leaves ~10 hours for midtraining and SFT and evals and maybe RL.
# It's possible that we might even fit depth=33 or depth=34, but for now let's go along with this.
# 4) The last thing to pay attention to is the amount of training data required for the run.
# The script above calculated that "Total number of training tokens: 37,580,963,840"
# The tok_eval.py script reports about ~4.8 chars/token on average for the default tokenizer settings.
# So ~38B tokens * ~4.8 chars/token ~= ~185B chars.
# Each data shard is ~250M chars, so we need ~185B / 250M ~= 740 shards.
# For safety, I bumped that up to 800 shards, and that's why up above I used -n 800 when pre-downloading dataset shards.
# If we didn't have enough data, the training script would loop around and do multiple epochs over the same data,
# which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd
# start to overfit hard.
# 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script.
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=32 --device_batch_size=8
torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
torchrun --standalone --nproc_per_node=8 -m scripts.base_eval

# midtrain
# NOTE: ensure that we use the same device_batch_size here as the base training script.
torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN
torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid

# sft
torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN
torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft

# generate final report
python -m nanochat.report generate

# talk to it
python -m scripts.chat_web
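
The back-of-the-envelope numbers in the run1000.sh comments above (the ~31.3 hour pretraining estimate and the ~740 shards of data) follow directly from quantities the training script prints. A small worked version of that arithmetic, using only figures quoted in the comments:

# Figures quoted in the run1000.sh comments above
num_iterations = 71_680          # steps reported by base_train.py
seconds_per_step = 1.574         # dt from the step logs (~1572 ms)
total_tokens = 37_580_963_840    # "Total number of training tokens"
chars_per_token = 4.8            # reported by tok_eval.py
chars_per_shard = 250e6          # ~250M chars per data shard

train_hours = num_iterations * seconds_per_step / 3600
shards_needed = total_tokens * chars_per_token / chars_per_shard

print(f"estimated pretraining time: {train_hours:.1f} hours")  # ~31.3 hours
print(f"data shards needed: {shards_needed:.0f}")              # ~722 shards

The ~740 figure in the comment comes from rounding the character count up to ~185B; either way, pre-downloading 800 shards leaves comfortable headroom.
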
@@ -1,192 +0,0 @@
"""
Benchmark script for measuring inference performance (tokens/second) of model generation.
Enables before/after comparison of KV-cache optimizations.

Example usage:
    python -m scripts.benchmark_optimizations --output v1_baseline --model-source sft --model-tag d20 --step 650
    python -m scripts.benchmark_optimizations --output v2_kvcache_fixed --model-source sft
"""
import argparse
import time
import torch
import numpy as np
from nanochat.common import compute_init
from nanochat.engine import Engine
from nanochat.checkpoint_manager import load_model

# Parse command-line arguments
parser = argparse.ArgumentParser(description='Benchmark model inference performance')
parser.add_argument('--output', type=str, required=True, help='Version label (e.g., "v1_baseline", "v2_kvcache_fixed")')
parser.add_argument('--model-source', type=str, required=True, choices=['sft', 'mid', 'rl', 'base'], help='Model type: sft, mid, rl, or base')
parser.add_argument('--model-tag', type=str, default=None, help='Model variant (e.g., "d20") - optional')
parser.add_argument('--step', type=int, default=None, help='Checkpoint step number - optional')
parser.add_argument('--num-iterations', type=int, default=5, help='Number of generation iterations for statistical stability (default: 5)')
parser.add_argument('--max-tokens', type=int, default=150, help='Number of tokens to generate per iteration (default: 150)')
parser.add_argument('--temperature', type=float, default=0.6, help='Temperature for generation (default: 0.6)')
parser.add_argument('--top-k', type=int, default=50, help='Top-k sampling parameter (default: 50)')
args = parser.parse_args()

def main():
    print("=" * 80)
    print(f"BENCHMARK: {args.output}")
    print("=" * 80)

    try:
        # Initialize device
        print("\n[1/6] Initializing device...")
        ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
        print(f"  ✓ Device: {device}")
        print(f"  ✓ DDP: {ddp} (rank {ddp_rank}/{ddp_world_size})")

        # Setup autocast context
        autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16)

        # Load model
        print("\n[2/6] Loading model...")
        print(f"  - Source: {args.model_source}")
        print(f"  - Model Tag: {args.model_tag if args.model_tag else 'auto-detect'}")
        print(f"  - Step: {args.step if args.step else 'latest'}")

        model, tokenizer, meta = load_model(
            args.model_source,
            device,
            phase="eval",
            model_tag=args.model_tag,
            step=args.step
        )
        print("  ✓ Model loaded successfully")
        print(f"  ✓ Config: {meta.get('model_config', {})}")

        # Create Engine for efficient generation
        engine = Engine(model, tokenizer)

        # Define test prompt
        print("\n[3/6] Preparing test prompt...")
        test_prompt = (
            "Write a detailed explanation of how neural networks learn through backpropagation. "
            "Include the key concepts of forward pass, loss calculation, and gradient descent."
        )

        # Tokenize the prompt
        bos = tokenizer.get_bos_token_id()
        prompt_tokens = [bos] + tokenizer.encode(test_prompt)
        print(f"  ✓ Test prompt: \"{test_prompt[:80]}...\"")
        print(f"  ✓ Prompt length: {len(prompt_tokens)} tokens")

        # Warmup run (not counted in statistics)
        print("\n[4/6] Running warmup iteration...")
        torch.cuda.reset_peak_memory_stats(device)
        with autocast_ctx:
            warmup_tokens = []
            for token_column, token_masks in engine.generate(
                prompt_tokens,
                num_samples=1,
                max_tokens=50,  # Short warmup
                temperature=args.temperature,
                top_k=args.top_k
            ):
                warmup_tokens.append(token_column[0])
        print(f"  ✓ Warmup complete ({len(warmup_tokens)} tokens generated)")

        # Performance measurement
        print(f"\n[5/6] Running benchmark ({args.num_iterations} iterations, {args.max_tokens} tokens each)...")
        iteration_times = []
        iteration_tokens_per_sec = []
        sample_output = None

        for i in range(args.num_iterations):
            # Reset memory stats for this iteration
            torch.cuda.reset_peak_memory_stats(device)

            # Start timing
            start_time = time.perf_counter()

            generated_tokens = []
            with autocast_ctx:
                for token_column, token_masks in engine.generate(
                    prompt_tokens,
                    num_samples=1,
                    max_tokens=args.max_tokens,
                    temperature=args.temperature,
                    top_k=args.top_k,
                    seed=42 + i  # Different seed per iteration
                ):
                    token = token_column[0]  # Extract from batch dimension
                    generated_tokens.append(token)

            # End timing
            end_time = time.perf_counter()
            elapsed_time = end_time - start_time

            # Calculate tokens per second
            num_tokens = len(generated_tokens)
            tokens_per_sec = num_tokens / elapsed_time if elapsed_time > 0 else 0

            iteration_times.append(elapsed_time)
            iteration_tokens_per_sec.append(tokens_per_sec)

            print(f"  Iteration {i+1}/{args.num_iterations}: {num_tokens} tokens in {elapsed_time:.3f}s = {tokens_per_sec:.2f} tok/s")

            # Save first iteration output for coherence check
            if i == 0:
                sample_output = tokenizer.decode(generated_tokens)

        # Measure peak GPU memory (after all iterations)
        peak_memory_bytes = torch.cuda.max_memory_allocated(device)
        peak_memory_gb = peak_memory_bytes / (1024 ** 3)

        # Calculate statistics
        mean_time = np.mean(iteration_times)
        std_time = np.std(iteration_times)
        mean_tokens_per_sec = np.mean(iteration_tokens_per_sec)
        std_tokens_per_sec = np.std(iteration_tokens_per_sec)

        # Print results
        print("\n[6/6] Results Summary")
        print("=" * 80)
        print(f"Version: {args.output}")
        print(f"Model Source: {args.model_source}")
        print(f"Model Tag: {meta.get('model_tag', args.model_tag)}")
        print(f"Model Step: {meta.get('step', args.step)}")
        print("-" * 80)
        print("Performance Metrics:")
        print(f"  Average Tokens/Second: {mean_tokens_per_sec:.2f} ± {std_tokens_per_sec:.2f}")
        print(f"  Average Time/Iteration: {mean_time:.3f}s ± {std_time:.3f}s")
        print(f"  Peak GPU Memory: {peak_memory_gb:.3f} GB")
        print("-" * 80)
        print("Individual Iteration Results:")
        for i, (t, tps) in enumerate(zip(iteration_times, iteration_tokens_per_sec)):
            print(f"  Iteration {i+1}: {t:.3f}s, {tps:.2f} tok/s")
        print("-" * 80)
        print("Sample Output (first 200 chars):")
        print(f"  \"{sample_output[:200]}...\"")
        print("=" * 80)

        # Success message
        print("\n✓ Benchmark completed successfully!")
        print(f"  Version: {args.output}")
        print(f"  Performance: {mean_tokens_per_sec:.2f} ± {std_tokens_per_sec:.2f} tok/s")

    except FileNotFoundError as e:
        print("\n✗ Error: Model checkpoint not found")
        print(f"  {e}")
        print("  Please check that the model exists and NANOCHAT_BASE_DIR is set correctly.")
        return 1

    except torch.cuda.OutOfMemoryError as e:
        print("\n✗ Error: GPU out of memory")
        print(f"  {e}")
        print("  Try reducing --max-tokens or use a smaller model.")
        return 1

    except Exception as e:
        print("\n✗ Error: Benchmark failed")
        print(f"  {type(e).__name__}: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0

if __name__ == "__main__":
    exit(main())
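
The KV-cache optimization this benchmark is meant to quantify comes down to how much work each decoding step does: without a cache, every new token re-projects keys and values for the whole prefix; with a cache, only the newest token is projected and written into preallocated buffers. The sketch below contrasts the two patterns for a single attention head; it is schematic only and is not the Engine or GPT code from this repo.

import torch

torch.manual_seed(0)
d = 64                                        # head dimension
Wq, Wk, Wv = (torch.randn(d, d) for _ in range(3))

def attend(q, K, V):
    # single-head scaled dot-product attention for one query token
    w = torch.softmax(q @ K.T / d ** 0.5, dim=-1)
    return w @ V

# --- naive decoding: recompute K and V for the whole prefix every step ---
def naive_step(prefix):                       # prefix: (t, d) token embeddings
    K, V = prefix @ Wk, prefix @ Wv           # O(t) projections per generated token
    q = prefix[-1] @ Wq
    return attend(q, K, V)

# --- KV-cache decoding: preallocate buffers, write only the newest token ---
def cached_step(new_token, cache):            # new_token: (d,)
    t = cache["len"]
    cache["K"][t] = new_token @ Wk            # O(1) projection per generated token
    cache["V"][t] = new_token @ Wv
    cache["len"] = t + 1
    q = new_token @ Wq
    return attend(q, cache["K"][: t + 1], cache["V"][: t + 1])

T = 5
prefix = torch.randn(T, d)
cache = {"K": torch.zeros(16, d), "V": torch.zeros(16, d), "len": 0}
for tok in prefix[:-1]:                       # prefill the cache with the prompt
    cached_step(tok, cache)
out_cached = cached_step(prefix[-1], cache)
out_naive = naive_step(prefix)
print(torch.allclose(out_naive, out_cached, atol=1e-5))  # True: same output, far less recompute
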
@@ -1,57 +0,0 @@
"""Quick script to inspect checkpoint structure."""
import torch
import sys

if len(sys.argv) < 2:
    print("Usage: python check_checkpoint.py <path_to_checkpoint>")
    sys.exit(1)

checkpoint_path = sys.argv[1]
print(f"Loading checkpoint: {checkpoint_path}")

checkpoint = torch.load(checkpoint_path, map_location='cpu')

print("\n📦 Checkpoint keys:")
for key in checkpoint.keys():
    value = checkpoint[key]
    if isinstance(value, dict):
        print(f"  - {key}: dict with {len(value)} items")
    elif isinstance(value, torch.Tensor):
        print(f"  - {key}: Tensor {value.shape}")
    else:
        print(f"  - {key}: {type(value).__name__}")

# Check for model weights
if 'model' in checkpoint:
    print("\n✅ Model weights found")
    model_dict = checkpoint['model']
    print(f"  Number of parameter tensors: {len(model_dict)}")
    print(f"  Sample keys: {list(model_dict.keys())[:5]}")
else:
    print("\n⚠️ No 'model' key found!")
    print("  Available keys:", list(checkpoint.keys()))

# Check for config
if 'config' in checkpoint:
    print("\n✅ Config found")
    config = checkpoint['config']
    print(f"  Type: {type(config)}")
    if hasattr(config, '__dict__'):
        print(f"  Attributes: {list(vars(config).keys())[:10]}")
else:
    print("\n⚠️ No 'config' key found!")

print("\n" + "="*60)
print("RECOMMENDATION:")
if 'model' not in checkpoint or 'config' not in checkpoint:
    print("Your checkpoint is missing required keys.")
    print("Please check how the model was saved during training.")
    print("\nExpected checkpoint structure:")
    print("  checkpoint = {")
    print("      'model': model.state_dict(),")
    print("      'config': model.config,")
    print("      'optimizer': optimizer.state_dict(),  # optional")
    print("      'step': current_step,  # optional")
    print("  }")
else:
    print("✅ Checkpoint looks good!")
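
The "expected checkpoint structure" printed above corresponds to a save call of the following shape on the training side. This is an illustrative sketch only: the save_checkpoint/load_checkpoint helpers and the model.config attribute are assumptions taken from the script's printed recommendation, not code from this repo's training scripts.

import torch

def save_checkpoint(model, optimizer, step, path):
    # Matches what check_checkpoint.py looks for: 'model' and 'config' are
    # required, 'optimizer' and 'step' are optional extras.
    checkpoint = {
        "model": model.state_dict(),
        "config": model.config,
        "optimizer": optimizer.state_dict(),  # optional
        "step": step,                         # optional
    }
    torch.save(checkpoint, path)

def load_checkpoint(path, device="cpu"):
    checkpoint = torch.load(path, map_location=device)
    return checkpoint["model"], checkpoint["config"], checkpoint.get("step")
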
@@ -1,66 +0,0 @@
"""Quick checkpoint structure check."""
import torch
import sys

checkpoint_path = "/raid/diana/nanochat_cache/chatsft_checkpoints/d20/model_000650.pt"
print(f"Loading: {checkpoint_path}")

try:
    checkpoint = torch.load(checkpoint_path, map_location='cpu')

    print("\n" + "="*60)
    print("CHECKPOINT STRUCTURE")
    print("="*60)

    print(f"\nTop-level keys: {list(checkpoint.keys())}\n")

    for key in checkpoint.keys():
        value = checkpoint[key]
        if isinstance(value, dict):
            print(f"'{key}': dict with {len(value)} items")
            # Show a few sub-keys if it's a dict
            sub_keys = list(value.keys())[:3]
            print(f"  Sample keys: {sub_keys}")
        elif isinstance(value, torch.Tensor):
            print(f"'{key}': Tensor {value.shape}, dtype={value.dtype}")
        else:
            print(f"'{key}': {type(value).__name__} = {value}")

    print("\n" + "="*60)
    print("DIAGNOSIS")
    print("="*60)

    # Check what we need
    has_model = 'model' in checkpoint
    has_config = 'config' in checkpoint
    has_state_dict = 'state_dict' in checkpoint
    has_model_state_dict = 'model_state_dict' in checkpoint

    print(f"\n✓ Has 'model' key: {has_model}")
    print(f"✓ Has 'config' key: {has_config}")
    print(f"✓ Has 'state_dict' key: {has_state_dict}")
    print(f"✓ Has 'model_state_dict' key: {has_model_state_dict}")

    # Try to infer the structure
    print("\n" + "="*60)
    print("RECOMMENDATION")
    print("="*60)

    if has_model and has_config:
        print("\n✅ Checkpoint has expected structure!")
        print("   No changes needed to benchmark_optimizations.py")
    elif has_state_dict:
        print("\n⚠️ Checkpoint uses 'state_dict' instead of 'model'")
        print("   Need to update benchmark to use checkpoint['state_dict']")
    elif has_model_state_dict:
        print("\n⚠️ Checkpoint uses 'model_state_dict' instead of 'model'")
        print("   Need to update benchmark to use checkpoint['model_state_dict']")
    else:
        print("\n❌ Checkpoint has unexpected structure!")
        print("   Available keys:", list(checkpoint.keys()))
        print("   You may need to check how the model was saved during training")

except Exception as e:
    print(f"\n❌ Error loading checkpoint: {e}")
    import traceback
    traceback.print_exc()
133 speedrun.sh
@@ -1,133 +0,0 @@
#!/bin/bash

# This script is the "Best ChatGPT clone that $100 can buy".
# It is designed to run in ~4 hours on an 8XH100 node at $3/GPU/hour.

# 1) Example launch (simplest):
# bash speedrun.sh
# 2) Example launch in a screen session (because the run takes ~4 hours):
# screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh
# 3) Example launch with wandb logging, but see below for setting up wandb first:
# WANDB_RUN=speedrun screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh

# Default intermediate artifacts directory is in ~/.cache/nanochat
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p $NANOCHAT_BASE_DIR

# -----------------------------------------------------------------------------
# Python venv setup with uv

# install uv (if not already installed)
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
# create a .venv local virtual environment (if it doesn't exist)
[ -d ".venv" ] || uv venv
# install the repo dependencies
uv sync
# activate venv so that `python` uses the project's venv instead of system python
source .venv/bin/activate

# -----------------------------------------------------------------------------
# wandb setup
# If you wish to use wandb for logging (it's nice, and recommended):
# 1) Make sure to first log in to wandb, e.g. run:
# `wandb login`
# 2) Set the WANDB_RUN environment variable when running this script, e.g.:
# `WANDB_RUN=d26 bash speedrun.sh`
if [ -z "$WANDB_RUN" ]; then
    # by default use "dummy" : it's handled as a special case, skips logging to wandb
    WANDB_RUN=dummy
fi

# -----------------------------------------------------------------------------
# During the course of the run, we will be writing markdown reports to the report/
# directory in the base dir. This command clears it out and writes a header section
# with a bunch of system info and a timestamp that marks the start of the run.
python -m nanochat.report reset

# -----------------------------------------------------------------------------
# Tokenizer

# Install Rust / Cargo
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"

# Build the rustbpe Tokenizer
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml

# Download the first ~2B characters of pretraining dataset
# look at dev/repackage_data_reference.py for details on how this data was prepared
# each data shard is ~250M chars
# so we download 2e9 / 250e6 = 8 data shards at this point
# each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
python -m nanochat.dataset -n 8
# Immediately also kick off downloading more shards in the background while tokenizer trains
# See comment below for why 240 is the right number here
python -m nanochat.dataset -n 240 &
DATASET_DOWNLOAD_PID=$!
# train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data
python -m scripts.tok_train --max_chars=2000000000
# evaluate the tokenizer (report compression ratio etc.)
python -m scripts.tok_eval

# -----------------------------------------------------------------------------
# Base model (pretraining)

# Download the eval_bundle from s3 to evaluate CORE metric during training (~162MB)
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
    unzip -q eval_bundle.zip
    rm eval_bundle.zip
    mv eval_bundle $NANOCHAT_BASE_DIR
fi

# The d20 model is 561M parameters.
# Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens.
# Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars.
# At 250M chars/shard, this is 54B / 250M ~= 216 shards needed for pretraining.
# Round up to 240 for safety. At ~100MB/shard, this downloads ~24GB of data to disk.
# (The total number of shards available in the entire dataset is 1822.)
echo "Waiting for dataset download to complete..."
wait $DATASET_DOWNLOAD_PID

# pretrain the d20 model
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=20 --run=$WANDB_RUN
# evaluate the model on a larger chunk of train/val data and draw some samples
torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
# evaluate the model on CORE tasks
torchrun --standalone --nproc_per_node=8 -m scripts.base_eval

# -----------------------------------------------------------------------------
# Midtraining (teach the model conversation special tokens, tool use, multiple choice)

# run midtraining and eval the model
torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --run=$WANDB_RUN
torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid

# -----------------------------------------------------------------------------
# Supervised Finetuning (domain adaptation to each sequence all by itself per row)

# train sft and re-eval right away (should see a small bump)
torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN
torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft

# chat with the model over CLI! Leave out the -p to chat interactively
# python -m scripts.chat_cli -p "Why is the sky blue?"

# even better, chat with your model over a pretty WebUI ChatGPT style
# python -m scripts.chat_web

# -----------------------------------------------------------------------------
# Reinforcement Learning. Optional, and currently only on GSM8K
# (optional)

# run reinforcement learning
# torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN
# eval the RL model only on GSM8K
# torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K

# -----------------------------------------------------------------------------
# Generate the full report by putting together all the sections
# report.md is the output and will be copied to current directory for convenience
python -m nanochat.report generate
@@ -1,18 +1,20 @@
 #!/bin/bash
 
-# This script is the "Best ChatGPT clone that $100 can buy",
-# It is designed to run in ~4 hours on 8XH100 node at $3/GPU/hour.
+# Optimized training script for 4x A100 GPUs
+# Includes: Auto batch size discovery, torch.compile, KV-cache, fixed token broadcasting
+# Expected runtime: ~8 hours on 4x A100 (80GB)
 
 # 1) Example launch (simplest):
-# bash speedrun.sh
-# 2) Example launch in a screen session (because the run takes ~4 hours):
-# screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh
-# 3) Example launch with wandb logging, but see below for setting up wandb first:
-# WANDB_RUN=speedrun screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh
+# bash speedrun_4gpu.sh
+# 2) Example launch in a screen session (recommended for 8hr runtime):
+# screen -L -Logfile speedrun_4gpu.log -S speedrun bash speedrun_4gpu.sh
+# 3) Example launch with wandb logging:
+# WANDB_RUN=my_run_name bash speedrun_4gpu.sh
 
-# Default intermediate artifacts directory is in ~/.cache/nanochat
+# Default intermediate artifacts directory
+# NOTE: Using /i/ partition since home is full - adjust if needed
 export OMP_NUM_THREADS=1
-export NANOCHAT_BASE_DIR="/raid/diana/nanochat_cache"
+export NANOCHAT_BASE_DIR="/i/nanochat_cache"
 mkdir -p $NANOCHAT_BASE_DIR
 
 # -----------------------------------------------------------------------------
156 verify_optimizations.py Normal file
@@ -0,0 +1,156 @@
#!/usr/bin/env python3
"""
Quick verification script to ensure all 4 optimizations are working.
Run this before your full training to verify everything is correct.

Usage: python verify_optimizations.py
"""

import torch
import sys
import os

print("=" * 80)
print("NANOCHAT OPTIMIZATIONS VERIFICATION")
print("=" * 80)

# Test 1: Check GPU availability
print("\n[1/5] GPU Availability Check...")
if not torch.cuda.is_available():
    print("❌ CUDA not available!")
    sys.exit(1)

gpu_count = torch.cuda.device_count()
print(f"✅ Found {gpu_count} GPUs")
for i in range(gpu_count):
    props = torch.cuda.get_device_properties(i)
    print(f"  GPU {i}: {props.name} ({props.total_memory / 1e9:.1f} GB)")

# Test 2: Verify auto_batch_size module exists and has correct function
print("\n[2/5] Auto Batch Size Discovery Check...")
try:
    from nanochat.auto_batch_size import find_optimal_device_batch_size
    print("✅ auto_batch_size.py found")
    print("✅ find_optimal_device_batch_size() function exists")

    # Check if it has the right signature
    import inspect
    sig = inspect.signature(find_optimal_device_batch_size)
    params = list(sig.parameters.keys())
    required_params = ['model', 'max_seq_len', 'total_batch_size', 'ddp_world_size', 'data_sample_fn']
    if all(p in params for p in required_params):
        print("✅ Function signature is correct")
    else:
        print(f"⚠️ Function signature might be wrong. Params: {params}")
except ImportError as e:
    print(f"❌ auto_batch_size module not found: {e}")
except AttributeError as e:
    print(f"❌ find_optimal_device_batch_size function not found: {e}")

# Test 3: Verify KV-Cache implementation in GPT.generate()
print("\n[3/5] KV-Cache Implementation Check...")
try:
    from nanochat.gpt import GPT
    from nanochat.engine import KVCache
    import inspect

    # Check if generate() method exists
    if hasattr(GPT, 'generate'):
        print("✅ GPT.generate() method exists")

        # Check source code for KV-cache usage
        source = inspect.getsource(GPT.generate)
        if 'KVCache' in source and 'kv_cache' in source:
            print("✅ KV-Cache is used in generate()")
            if 'torch.cat' not in source:
                print("✅ No torch.cat() pattern (good - using incremental decode)")
            else:
                print("⚠️ torch.cat() found - might still be using old pattern")
        else:
            print("❌ KV-Cache not found in generate() method")
    else:
        print("❌ GPT.generate() method not found")
except Exception as e:
    print(f"❌ Error checking GPT: {e}")

# Test 4: Verify token broadcasting fix in engine.py
print("\n[4/5] Token Broadcasting Fix Check...")
try:
    from nanochat.engine import Engine
    import inspect

    source = inspect.getsource(Engine.generate)

    # Check if the bug pattern is removed
    if '[sampled_tokens[0]] * num_samples' in source:
        print("❌ Token broadcasting BUG still present!")
        print("   Found: [sampled_tokens[0]] * num_samples")
    else:
        print("✅ Token broadcasting bug is fixed")

    # Verify independent sampling exists
    if 'logits.repeat(num_samples' in source or 'logits_repeated' in source:
        print("✅ Independent token sampling implementation found")
    else:
        print("⚠️ Independent sampling might not be implemented")

except Exception as e:
    print(f"❌ Error checking Engine: {e}")

# Test 5: Check torch.compile in chat_sft.py
print("\n[5/5] torch.compile Configuration Check...")
try:
    # Read chat_sft.py
    with open('scripts/chat_sft.py', 'r') as f:
        sft_source = f.read()

    # Check if max_seq_len is defined
    if 'max_seq_len = 2048' in sft_source or 'max_seq_len=2048' in sft_source:
        print("✅ max_seq_len = 2048 configured")
    else:
        print("⚠️ max_seq_len might not be set to 2048")

    # Check if torch.compile is enabled (not commented)
    compile_lines = [line for line in sft_source.split('\n') if 'torch.compile' in line]
    enabled_compile = [line for line in compile_lines if not line.strip().startswith('#')]

    if enabled_compile:
        print("✅ torch.compile is enabled")
        if 'dynamic=False' in sft_source:
            print("✅ dynamic=False is set (correct for fixed padding)")
        else:
            print("⚠️ dynamic=False might not be set")
    else:
        print("❌ torch.compile is commented out or not found")

    # Check fixed padding
    if 'ncols = max_seq_len - 1' in sft_source:
        print("✅ Fixed-length padding is configured")
    elif 'ncols = max(len(ids)' in sft_source:
        print("❌ Still using dynamic padding!")
    else:
        print("⚠️ Padding configuration unclear")

except Exception as e:
    print(f"❌ Error checking chat_sft.py: {e}")

# Summary
print("\n" + "=" * 80)
print("VERIFICATION SUMMARY")
print("=" * 80)
print("""
If all checks show ✅, your optimizations are correctly implemented!

Expected improvements:
- Auto Batch Size Discovery: 2-3× training throughput
- torch.compile (SFT only): 1.5× faster SFT training
- KV-Cache: 5-10× faster inference
- Token Broadcasting Fix: Better multi-sample diversity

To measure improvements, compare:
1. Tokens/second during training (watch the logs)
2. Total training time
3. Inference speed (tokens/second during generation)
""")
print("=" * 80)
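
The "token broadcasting" check in Test 4 refers to the difference between reusing one sampled token for every sample versus drawing each sample independently from its own logits row. In schematic form (an illustration of the two patterns the script greps for, not the actual Engine.generate code):

import torch

torch.manual_seed(0)
num_samples, vocab_size = 4, 16
logits = torch.randn(num_samples, vocab_size)   # one logits row per sample
probs = torch.softmax(logits, dim=-1)

# Buggy "broadcasting" pattern: sample once, copy to every row.
# All samples are forced to continue with the same token, killing diversity.
first = torch.multinomial(probs[0], num_samples=1).item()
broadcast_tokens = [first] * num_samples

# Fixed pattern: one independent draw per row of the logits.
independent_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1).tolist()

print("broadcast:  ", broadcast_tokens)    # e.g. [7, 7, 7, 7]
print("independent:", independent_tokens)  # rows can differ from each other
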