"""
|
|
Tests for the GPT model architecture.
|
|
|
|
Run with:
|
|
python -m pytest tests/test_gpt.py -v -s --timeout=60
|
|
"""
|
|
|
|
import torch
import pytest

from nanochat.gpt import GPT, GPTConfig, norm, apply_rotary_emb, repeat_kv


@pytest.fixture
def small_config():
    """A small GPT config for fast testing."""
    return GPTConfig(
        sequence_len=128,
        vocab_size=256,
        n_layer=2,
        n_head=4,
        n_kv_head=2,  # Test MQA with 2:1 ratio
        n_embd=64,
    )


@pytest.fixture
def tiny_config():
    """An even tinier config for quick tests."""
    return GPTConfig(
        sequence_len=32,
        vocab_size=128,
        n_layer=1,
        n_head=2,
        n_kv_head=1,  # Test MQA with 2:1 ratio
        n_embd=32,
    )


def prepare_model_for_testing(model):
    """Prepare model for CPU testing by converting to float32 but keeping rotary embeddings in bfloat16."""
    model = model.float()
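    # Note (assumption, based on the dtype checks later in this file): model.float()
    # above casts every parameter and buffer to float32, so the rotary cos/sin buffers
    # are re-cast to bfloat16 here to match the dtype the model registers them with
    # (see test_gpt_rotary_embeddings_dtype).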
    model.cos = model.cos.bfloat16()
    model.sin = model.sin.bfloat16()
    return model


def test_gpt_config():
    """Test that GPTConfig initializes with correct defaults."""
    config = GPTConfig()
    assert config.sequence_len == 1024
    assert config.vocab_size == 50304
    assert config.n_layer == 12
    assert config.n_head == 6
    assert config.n_kv_head == 6
    assert config.n_embd == 768


def test_norm_function():
    """Test the RMSNorm function."""
    x = torch.randn(2, 4, 8)
    y = norm(x)
    # Check shape is preserved
    assert y.shape == x.shape
    # Check it's actually normalized (approximately)
    # RMSNorm: y = x / rms(x) where rms = sqrt(mean(x^2))
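    # Worked example of the formula above (ignoring any epsilon term the
    # implementation may add): for a row x = [3, 4], rms = sqrt((9 + 16) / 2)
    # = sqrt(12.5) ≈ 3.536, so norm(x) ≈ [0.849, 1.131].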
    expected_rms = torch.sqrt(torch.mean(x ** 2, dim=-1, keepdim=True))
    expected_y = x / expected_rms
    torch.testing.assert_close(y, expected_y, rtol=1e-4, atol=1e-4)


def test_apply_rotary_emb():
    """Test rotary embeddings application."""
    batch_size, num_heads, seq_len, head_dim = 2, 4, 8, 16
    x = torch.randn(batch_size, seq_len, num_heads, head_dim)

    # Create simple cos/sin for testing
    cos = torch.ones(1, seq_len, 1, head_dim // 2)
    sin = torch.zeros(1, seq_len, 1, head_dim // 2)

    y = apply_rotary_emb(x, cos, sin)

    # Check shape is preserved
    assert y.shape == x.shape
    # Check dtype is preserved
    assert y.dtype == x.dtype
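    # cos=1 and sin=0 corresponds to a zero rotation angle, so a standard rotary
    # implementation would return x unchanged up to dtype casting; only shape and
    # dtype are asserted here, and an exact value check against x is left as an
    # assumption rather than added.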


def test_repeat_kv():
    """Test the repeat_kv function for MQA."""
    bs, n_kv_heads, slen, head_dim = 2, 2, 8, 16
    n_rep = 3  # Repeat each KV head 3 times

    x = torch.randn(bs, n_kv_heads, slen, head_dim)
    y = repeat_kv(x, n_rep)

    # Check output shape
    assert y.shape == (bs, n_kv_heads * n_rep, slen, head_dim)

    # Check that heads are repeated correctly
    for i in range(n_kv_heads):
        for j in range(n_rep):
            torch.testing.assert_close(y[:, i * n_rep + j], x[:, i])
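    # In other words, with n_kv_heads=2 and n_rep=3 the expected output head order is
    # [kv0, kv0, kv0, kv1, kv1, kv1]: each KV head expands into a contiguous block,
    # which is exactly what the nested loop above checks.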

    # Test n_rep=1 (no-op case)
    y_no_rep = repeat_kv(x, 1)
    torch.testing.assert_close(y_no_rep, x)


def test_gpt_initialization(small_config):
    """Test that GPT model initializes correctly."""
    model = GPT(small_config)

    # Check model has the right components
    assert hasattr(model, 'transformer')
    assert hasattr(model, 'lm_head')
    assert len(model.transformer.h) == small_config.n_layer

    # Check parameter count is reasonable
    num_params = sum(p.numel() for p in model.parameters())
    assert num_params > 0
    print(f"Small model has {num_params:,} parameters")


def test_gpt_forward_shape(tiny_config):
    """Test that forward pass produces correct output shapes."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.eval()

    batch_size = 2
    seq_len = 16

    # Create input tokens
    idx = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))

    # Forward pass without targets (inference mode)
    with torch.no_grad():
        logits = model(idx)

    # Check output shape
    assert logits.shape == (batch_size, seq_len, tiny_config.vocab_size)


def test_gpt_forward_with_targets(tiny_config):
    """Test forward pass with targets (training mode)."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.train()

    batch_size = 2
    seq_len = 16

    # Create input tokens and targets
    idx = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))
    targets = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))

    # Forward pass with targets
    loss = model(idx, targets=targets)

    # Check loss is a scalar
    assert loss.shape == ()
    assert loss.item() > 0  # Cross-entropy loss should be positive
    assert not torch.isnan(loss)
    assert not torch.isinf(loss)


def test_gpt_backward(tiny_config):
    """Test that backward pass works and gradients flow."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.train()

    batch_size = 2
    seq_len = 8

    idx = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))
    targets = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))

    # Forward and backward
    loss = model(idx, targets=targets)
    loss.backward()

    # Check that gradients are computed for all parameters
    for name, param in model.named_parameters():
        if param.requires_grad:
            assert param.grad is not None, f"No gradient for {name}"
            assert not torch.isnan(param.grad).any(), f"NaN gradient in {name}"
            assert not torch.isinf(param.grad).any(), f"Inf gradient in {name}"


def test_gpt_generate(tiny_config):
    """Test autoregressive generation."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.eval()

    # Start with a few tokens
    initial_tokens = [1, 2, 3]
    max_new_tokens = 10

    # Generate tokens
    generated = []
    for token in model.generate(initial_tokens, max_tokens=max_new_tokens, temperature=1.0, seed=42):
        generated.append(token)

    # Check we generated the right number of tokens
    assert len(generated) == max_new_tokens

    # Check all tokens are valid
    for token in generated:
        assert 0 <= token < tiny_config.vocab_size


def test_gpt_generate_deterministic(tiny_config):
    """Test that generation is deterministic with the same seed."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.eval()

    initial_tokens = [1, 2, 3]
    max_new_tokens = 5

    # Generate twice with the same seed
    gen1 = list(model.generate(initial_tokens, max_tokens=max_new_tokens, temperature=1.0, seed=42))
    gen2 = list(model.generate(initial_tokens, max_tokens=max_new_tokens, temperature=1.0, seed=42))

    assert gen1 == gen2, "Generation should be deterministic with the same seed"


def test_gpt_generate_greedy(tiny_config):
    """Test greedy decoding (temperature=0)."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.eval()

    initial_tokens = [1, 2, 3]
    max_new_tokens = 5

    # Greedy decoding
    generated = list(model.generate(initial_tokens, max_tokens=max_new_tokens, temperature=0.0))
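    # With temperature=0.0 the sampler presumably falls back to greedy argmax
    # decoding, which is why no seed is passed here; only the count and validity of
    # the generated tokens are checked.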

    assert len(generated) == max_new_tokens
    for token in generated:
        assert 0 <= token < tiny_config.vocab_size


def test_gpt_estimate_flops(small_config):
    """Test FLOP estimation."""
    model = GPT(small_config)
    flops = model.estimate_flops()

    # FLOPs should be positive
    assert flops > 0
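    # A common back-of-the-envelope figure (not asserted here) is roughly
    # 2 * num_params FLOPs per token for a forward pass, or about 6 * num_params
    # including the backward pass, plus attention terms that grow with sequence
    # length; the test only requires a positive value.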
    print(f"Estimated FLOPs per token: {flops:,}")


def test_gpt_setup_optimizers(tiny_config):
    """Test optimizer setup."""
    model = GPT(tiny_config)
    model.init_weights()

    # Setup optimizers
    optimizers = model.setup_optimizers()

    # Should return a list of 2 optimizers
    assert len(optimizers) == 2
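    # The two optimizers are Muon and AdamW (per the repo's description); presumably
    # Muon handles the 2D transformer matrices and AdamW the remaining parameters,
    # though only the count and non-empty param groups are verified here.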

    # Check they have parameter groups
    for opt in optimizers:
        assert len(opt.param_groups) > 0


def test_gpt_mqa_shapes():
    """Test that Multi-Query Attention produces correct shapes."""
    # Use a config where n_head and n_kv_head differ
    config = GPTConfig(
        sequence_len=32,
        vocab_size=128,
        n_layer=1,
        n_head=8,
        n_kv_head=2,  # 4:1 ratio
        n_embd=64,
    )
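    # With n_head=8 and n_kv_head=2, each KV head is shared by 4 query heads
    # (grouped-query attention); repeat_kv, exercised above, is presumably what
    # expands the 2 KV heads to match the 8 query heads inside the attention layer.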

    model = GPT(config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.eval()

    batch_size = 2
    seq_len = 16
    idx = torch.randint(0, config.vocab_size, (batch_size, seq_len))

    with torch.no_grad():
        logits = model(idx)

    assert logits.shape == (batch_size, seq_len, config.vocab_size)


def test_gpt_long_sequence(small_config):
    """Test with sequences up to max length."""
    model = GPT(small_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.eval()

    batch_size = 1
    seq_len = small_config.sequence_len  # Full sequence length

    idx = torch.randint(0, small_config.vocab_size, (batch_size, seq_len))

    with torch.no_grad():
        logits = model(idx)

    assert logits.shape == (batch_size, seq_len, small_config.vocab_size)


def test_gpt_embedding_dtype():
    """Test that embeddings are cast to bfloat16."""
    config = GPTConfig(
        sequence_len=32,
        vocab_size=128,
        n_layer=1,
        n_head=2,
        n_kv_head=2,
        n_embd=32,
    )

    model = GPT(config)

    # Check that embeddings are in bfloat16
    assert model.transformer.wte.weight.dtype == torch.bfloat16


def test_gpt_rotary_embeddings_dtype(tiny_config):
    """Test that rotary embeddings are in bfloat16."""
    model = GPT(tiny_config)
    model.init_weights()

    assert model.cos.dtype == torch.bfloat16
    assert model.sin.dtype == torch.bfloat16


def test_gpt_loss_reduction_modes(tiny_config):
    """Test different loss reduction modes."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.train()

    batch_size = 2
    seq_len = 8

    idx = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))
    targets = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))

    # Test 'mean' reduction (default)
    loss_mean = model(idx, targets=targets, loss_reduction='mean')
    assert loss_mean.shape == ()

    # Test 'none' reduction
    loss_none = model(idx, targets=targets, loss_reduction='none')
    assert loss_none.shape == (batch_size * seq_len,)

    # The mean of loss_none should be close to loss_mean
    torch.testing.assert_close(loss_none.mean(), loss_mean, rtol=1e-4, atol=1e-4)
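    # Note: this equality holds because no target here equals the ignore index (-1).
    # Assuming cross_entropy-style semantics, 'mean' averages over valid tokens only,
    # so with ignored positions a plain .mean() over the 'none' vector would no
    # longer match.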


def test_gpt_with_ignore_index(tiny_config):
    """Test that -1 targets are ignored in loss."""
    model = GPT(tiny_config)
    model.init_weights()
    model = prepare_model_for_testing(model)
    model.train()

    batch_size = 2
    seq_len = 8

    idx = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))
    targets = torch.randint(0, tiny_config.vocab_size, (batch_size, seq_len))

    # Compute loss with all valid targets
    loss_all = model(idx, targets=targets)

    # Mask out ALL targets except first token
    targets_masked = targets.clone()
    targets_masked[:, 1:] = -1
    loss_masked = model(idx, targets=targets_masked)

    # Both should be valid losses
    assert loss_all.item() > 0
    assert loss_masked.item() > 0
    # Loss should be finite
    assert not torch.isnan(loss_masked)
    assert not torch.isinf(loss_masked)

    # Test that all -1 targets produce no loss computation error
    targets_all_masked = torch.full_like(targets, -1)
    # This should still work (loss over 0 tokens)
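    # Hedged note: assuming cross_entropy-style ignore_index semantics, a 'mean' loss
    # over zero valid tokens typically yields NaN rather than raising. The broad
    # except below also catches AssertionError, so this final block acts as a smoke
    # test that the forward pass completes rather than a strict value check.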
    try:
        loss_all_masked = model(idx, targets=targets_all_masked)
        # If it works, loss should be finite or zero
        assert not torch.isnan(loss_all_masked) or loss_all_masked.item() == 0
    except Exception:
        # It's okay if this raises an error - that's one way to handle no valid targets
        pass