Merge 412e9a1cbc into dc54a1a307

2026-06-11 17:38:40 +00:00 · 2026-05-07 23:06:38 +10:00 · 2026-05-07 23:06:38 +10:00 · 95b366783a
commit 95b366783a
parent dc54a1a307 412e9a1cbc
6 changed files with 168 additions and 13 deletions
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@ -102,11 +102,14 @@ class KVCache:
        self.cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device)
        # Previous token's normalized embedding for smear (set by model forward pass)
        self.prev_embedding = None
+        # Previous token id for hashed bigram embeddings (set by model forward pass)
+        self.prev_token = None

    def reset(self):
        """Reset cache to empty state."""
        self.cache_seqlens.zero_()
        self.prev_embedding = None
+        self.prev_token = None

    def get_pos(self):
        """Get current position (assumes all batch elements at same position)."""
@ -135,6 +138,8 @@ class KVCache:
        # Copy smear state: expand batch=1 prev_embedding to num_samples
        if other.prev_embedding is not None:
            self.prev_embedding = other.prev_embedding.expand(self.batch_size, -1, -1).clone()
+        if other.prev_token is not None:
+            self.prev_token = other.prev_token.expand(self.batch_size, -1).clone()

 # -----------------------------------------------------------------------------
@torch.inference_mode()
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@ -37,6 +37,8 @@ class GPTConfig:
    # Characters: L=long (full context), S=short (quarter context)
    # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long
    window_pattern: str = "SSSL"
+    bigram_embed_factor: int = 0
+    bigram_lambda_init: float = 0.05


 def norm(x):
@ -172,6 +174,8 @@ class GPT(nn.Module):
            "wte": nn.Embedding(padded_vocab_size, config.n_embd),
            "h": nn.ModuleList([Block(config, layer_idx) for layer_idx in range(config.n_layer)]),
        })
+        self.bigram_vocab_size = int(config.vocab_size * max(0, int(config.bigram_embed_factor)))
+        self.bigram_embed = nn.Embedding(self.bigram_vocab_size, config.n_embd) if self.bigram_vocab_size > 0 else None
        self.lm_head = Linear(config.n_embd, padded_vocab_size, bias=False)
        # Per-layer learnable scalars (inspired by modded-nanogpt)
        # resid_lambdas: scales the residual stream at each layer (init 1.0 = neutral)
@ -179,6 +183,10 @@ class GPT(nn.Module):
        # Separate parameters so they can have different optimizer treatment
        self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer))   # fake init, real init in init_weights()
        self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer))     # fake init, real init in init_weights()
+        if self.bigram_embed is not None:
+            self.bigram_lambdas = nn.Parameter(torch.zeros(config.n_layer))
+        else:
+            self.register_buffer("bigram_lambdas", torch.zeros(0), persistent=False)
        # Smear: mix previous token's embedding into current token (cheap bigram-like info)
        self.smear_gate = Linear(24, 1, bias=False)
        self.smear_lambda = nn.Parameter(torch.zeros(1))
@ -216,6 +224,8 @@ class GPT(nn.Module):

        # Embedding and unembedding
        torch.nn.init.normal_(self.transformer.wte.weight, mean=0.0, std=0.8)
+        if self.bigram_embed is not None:
+            torch.nn.init.zeros_(self.bigram_embed.weight)
        torch.nn.init.normal_(self.lm_head.weight, mean=0.0, std=0.001)

        # Transformer blocks: uniform init with bound = sqrt(3) * std (same standard deviation as normal)
@ -237,6 +247,8 @@ class GPT(nn.Module):
        # Decaying x0 init: earlier layers get more input embedding blending
        for i in range(n_layer):
            self.x0_lambdas.data[i] = 0.20 - (0.15 * i / max(n_layer - 1, 1))
+        if self.bigram_embed is not None:
+            torch.nn.init.constant_(self.bigram_lambdas, self.config.bigram_lambda_init)

        # Smear/backout scalars and smear gate must be explicitly initialized 
        torch.nn.init.zeros_(self.smear_lambda)
@ -262,9 +274,25 @@ class GPT(nn.Module):
        # because GradScaler cannot unscale fp16 gradients.
        if COMPUTE_DTYPE != torch.float16:
            self.transformer.wte.to(dtype=COMPUTE_DTYPE)
+            if self.bigram_embed is not None:
+                self.bigram_embed.to(dtype=COMPUTE_DTYPE)
            for ve in self.value_embeds.values():
                ve.to(dtype=COMPUTE_DTYPE)

+    def _bigram_hash(self, idx, prev_idx=None):  # map (current id, previous id) pairs to row indices of self.bigram_embed
+        mod = self.bigram_vocab_size - 1  # hashes are reduced modulo `mod`; index `mod` itself is only written for the first position below
+        if mod <= 0:  # NOTE(review): also fires when bigram_vocab_size == 1, not only when the table is disabled (size 0)
+            raise RuntimeError("bigram hash requested with disabled bigram embedding")
+        idx_i32 = idx.to(torch.int32)  # NOTE(review): 36313 * id exceeds the int32 range once an id reaches 59139 - confirm wrap-around is acceptable for the vocab sizes in use
+        out = torch.empty_like(idx_i32)  # uninitialised buffer; both branches below assign every element
+        if prev_idx is None:  # no explicit previous ids: pair each position with its left neighbour inside idx
+            out[:, :1].fill_(mod)  # the first position has no left neighbour, so it gets the index `mod`
+            out[:, 1:] = torch.bitwise_xor(36313 * idx_i32[:, 1:], 27191 * idx_i32[:, :-1]) % mod  # position t hashes (idx[:, t], idx[:, t-1])
+        else:  # previous ids supplied by the caller (forward passes kv_cache.prev_token here)
+            prev_i32 = prev_idx.to(torch.int32)
+            out[:] = torch.bitwise_xor(36313 * idx_i32, 27191 * prev_i32) % mod  # elementwise pairing of idx with prev_idx
+        return out.to(torch.long)  # indices are returned as torch.long
+
    def _precompute_rotary_embeddings(self, seq_len, head_dim, base=100000, device=None):
        # TODO: bump base theta more? e.g. 100K is more common more recently
        # autodetect the device from model embeddings
@ -329,8 +357,9 @@ class GPT(nn.Module):
        nparams = sum(p.numel() for p in self.parameters())
        # Exclude non-matmul params: embeddings and per-layer scalars
        value_embeds_numel = sum(ve.weight.numel() for ve in self.value_embeds.values())
-        nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel +
-                          self.resid_lambdas.numel() + self.x0_lambdas.numel() +
+        bigram_embed_numel = self.bigram_embed.weight.numel() if self.bigram_embed is not None else 0
+        nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel + bigram_embed_numel +
+                          self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.bigram_lambdas.numel() +
                          self.smear_gate.weight.numel() + self.smear_lambda.numel() + self.backout_lambda.numel())
        h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len
        # Sum attention FLOPs per layer, accounting for sliding window
@ -356,14 +385,17 @@ class GPT(nn.Module):
        """
        # Count each group separately (mirrors the grouping in setup_optimizers)
        wte = sum(p.numel() for p in self.transformer.wte.parameters())
+        bigram_embed = self.bigram_embed.weight.numel() if self.bigram_embed is not None else 0
        value_embeds = sum(p.numel() for p in self.value_embeds.parameters())
        lm_head = sum(p.numel() for p in self.lm_head.parameters())
        transformer_matrices = sum(p.numel() for p in self.transformer.h.parameters())
-        scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.smear_gate.weight.numel() + self.smear_lambda.numel() + self.backout_lambda.numel()
-        total = wte + value_embeds + lm_head + transformer_matrices + scalars
+        bigram_lambdas = self.bigram_lambdas.numel() if isinstance(self.bigram_lambdas, nn.Parameter) else 0
+        scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel() + bigram_lambdas + self.smear_gate.weight.numel() + self.smear_lambda.numel() + self.backout_lambda.numel()
+        total = wte + bigram_embed + value_embeds + lm_head + transformer_matrices + scalars
        assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch"
        return {
            'wte': wte,
+            'bigram_embed': bigram_embed,
            'value_embeds': value_embeds,
            'lm_head': lm_head,
            'transformer_matrices': transformer_matrices,
@ -371,40 +403,60 @@ class GPT(nn.Module):
            'total': total,
        }

-    def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, scalar_lr=0.5):
+    def setup_optimizer(
+        self,
+        unembedding_lr=0.004,
+        embedding_lr=0.2,
+        bigram_embedding_lr_mult=1.0,
+        bigram_lambda_lr=0.004,
+        matrix_lr=0.02,
+        weight_decay=0.0,
+        scalar_lr=0.5,
+        muon_plus=False,
+        muon_eq_axis=0,
+    ):
        model_dim = self.config.n_embd
        ddp, rank, local_rank, world_size = get_dist_info()

        # Separate out all parameters into groups
        matrix_params = list(self.transformer.h.parameters())
        value_embeds_params = list(self.value_embeds.parameters())
+        bigram_embed_params = list(self.bigram_embed.parameters()) if self.bigram_embed is not None else []
        embedding_params = list(self.transformer.wte.parameters())
        lm_head_params = list(self.lm_head.parameters())
        resid_params = [self.resid_lambdas]
        x0_params = [self.x0_lambdas]
+        bigram_lambda_params = [self.bigram_lambdas] if isinstance(self.bigram_lambdas, nn.Parameter) else []
        smear_params = [self.smear_gate.weight, self.smear_lambda, self.backout_lambda]
-        assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) + len(smear_params)
+        assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(bigram_embed_params) + len(resid_params) + len(x0_params) + len(bigram_lambda_params) + len(smear_params)

        # Scale the LR for the AdamW parameters by ∝1/√dmodel (tuned for 768 dim model)
        dmodel_lr_scale = (model_dim / 768) ** -0.5
        print0(f"Scaling the LR for the AdamW parameters ∝1/√({model_dim}/768) = {dmodel_lr_scale:.6f}")

        # Build param_groups with all required fields explicit
+        # AdamW groups (embeddings, lm_head, scalars)
        param_groups = [
-            # AdamW groups (embeddings, lm_head, scalars)
            dict(kind='adamw', params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale, betas=(0.8, 0.96), eps=1e-10, weight_decay=0.01),
            dict(kind='adamw', params=embedding_params, lr=embedding_lr * dmodel_lr_scale, betas=(0.8, 0.995), eps=1e-10, weight_decay=0.001),
            dict(kind='adamw', params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale * 0.5, betas=(0.8, 0.995), eps=1e-10, weight_decay=0.01),
+        ]
+        if bigram_embed_params:
+            param_groups.append(dict(kind='adamw', params=bigram_embed_params, lr=embedding_lr * dmodel_lr_scale * bigram_embedding_lr_mult, betas=(0.75, 0.95), eps=1e-10, weight_decay=0.01))
+        param_groups.extend([
            dict(kind='adamw', params=resid_params, lr=scalar_lr * 0.01, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.05),
            dict(kind='adamw', params=x0_params, lr=scalar_lr, betas=(0.96, 0.95), eps=1e-10, weight_decay=0.0),  # higher beta1 for x0
-            dict(kind='adamw', params=smear_params, lr=0.2, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.0),
-        ]
+        ])
+        if bigram_embed_params:
+            param_groups.append(dict(kind='adamw', params=bigram_lambda_params, lr=bigram_lambda_lr * dmodel_lr_scale, betas=(0.9, 0.95), eps=1e-10, weight_decay=0.0))
+        param_groups.append(dict(kind='adamw', params=smear_params, lr=0.2, betas=(0.8, 0.95), eps=1e-10, weight_decay=0.0))
        # Muon groups (matrix params, grouped by shape for stacking)
        for shape in sorted({p.shape for p in matrix_params}):
            group_params = [p for p in matrix_params if p.shape == shape]
            param_groups.append(dict(
                kind='muon', params=group_params, lr=matrix_lr,
                momentum=0.95, ns_steps=5, beta2=0.9, weight_decay=weight_decay,
+                muon_plus=muon_plus, muon_eq_axis=muon_eq_axis,
            ))

        Factory = DistMuonAdamW if ddp else MuonAdamW
@ -448,6 +500,19 @@ class GPT(nn.Module):
                gate = self.smear_lambda.to(x.dtype) * torch.sigmoid(self.smear_gate(x[:, :, :24]))
                x = x + gate * x_pre_smear

+        # Optional hashed bigram embedding residual. In single-token KV-cache decoding (T == 1) the
+        # previous token is not part of idx, so its id is read from kv_cache.prev_token instead.
+        if self.bigram_embed is not None:
+            if kv_cache is None or T > 1:
+                bigram_idx = self._bigram_hash(idx)
+            else:
+                bigram_idx = self._bigram_hash(idx, kv_cache.prev_token)
+            x0_bigram = self.bigram_embed(bigram_idx).to(x.dtype)
+        else:
+            x0_bigram = None
+        if kv_cache is not None:
+            kv_cache.prev_token = idx[:, -1:].clone()
+
        # Forward the trunk of the Transformer
        x0 = x  # save initial normalized embedding for x0 residual
        n_layer = self.config.n_layer
@ -455,6 +520,8 @@ class GPT(nn.Module):
        x_backout = None
        for i, block in enumerate(self.transformer.h):
            x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0
+            if x0_bigram is not None:
+                x = x + self.bigram_lambdas[i].to(x.dtype) * x0_bigram
            ve = self.value_embeds[str(i)](idx).to(x.dtype) if str(i) in self.value_embeds else None
            x = block(x, ve, cos_sin, self.window_sizes[i], kv_cache)
            if i == backout_layer:
--- a/nanochat/optim.py
+++ b/nanochat/optim.py
@ -100,6 +100,8 @@ def muon_step_fused(
    beta2_t: Tensor,                # () - 0-D CPU tensor, beta2 for second moment
    ns_steps: int,                  # 5 - number of Newton-Schulz/Polar Express iterations
    red_dim: int,                   # -1 or -2 - reduction dimension for variance
+    muon_plus: bool,                # add one Frobenius renormalization after orthogonalization
+    muon_eq_axis: int,              # 0 none, 1 row, 2 column equilibration before orthogonalization
 ) -> None:
    """
    Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update
@ -115,6 +117,14 @@ def muon_step_fused(
    # Polar express
    # Cast to bf16 for speed when available; skip cast otherwise (fp16 is unstable here due to limited exponent range)
    X = g.bfloat16() if COMPUTE_DTYPE == torch.bfloat16 else g
+    if muon_eq_axis == 1:
+        target = X.float().norm(dim=(-2, -1), keepdim=True) / (X.size(-2) ** 0.5)
+        row_norm = X.float().norm(dim=-1, keepdim=True).clamp_min(1e-6)
+        X = X * (target / row_norm).to(X.dtype)
+    elif muon_eq_axis == 2:
+        target = X.float().norm(dim=(-2, -1), keepdim=True) / (X.size(-1) ** 0.5)
+        col_norm = X.float().norm(dim=-2, keepdim=True).clamp_min(1e-6)
+        X = X * (target / col_norm).to(X.dtype)
    X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.01 + 1e-6)
    if g.size(-2) > g.size(-1): # Tall matrix
        for a, b, c in polar_express_coeffs[:ns_steps]:
@ -127,6 +137,10 @@ def muon_step_fused(
            B = b * A + c * (A @ A)
            X = a * X + B @ X
    g = X
+    if muon_plus:
+        target_norm = min(g.size(-2), g.size(-1)) ** 0.5
+        current_norm = g.float().norm(dim=(-2, -1), keepdim=True).clamp_min(1e-6)
+        g = g * (target_norm / current_norm).to(g.dtype)

    # Variance reduction
    beta2 = beta2_t.to(g.dtype)
@ -277,6 +291,8 @@ class MuonAdamW(torch.optim.Optimizer):
            self._muon_beta2_t,
            group["ns_steps"],
            red_dim,
+            group.get("muon_plus", False),
+            group.get("muon_eq_axis", 0),
        )

        # Copy back to original params
@ -486,7 +502,7 @@ class DistMuonAdamW(torch.optim.Optimizer):
                grad_chunk[:num_owned], stacked_owned,
                state["momentum_buffer"][:num_owned], state["second_momentum_buffer"][:num_owned],
                self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t, self._muon_beta2_t,
-                group["ns_steps"], red_dim,
+                group["ns_steps"], red_dim, group.get("muon_plus", False), group.get("muon_eq_axis", 0),
            )
            updated_params[:num_owned].copy_(stacked_owned)

--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@ -69,8 +69,20 @@ python -m scripts.tok_eval
 echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID

-# d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8)
-torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=8 --device-batch-size=16 --fp8 --run=$WANDB_RUN
+# d22 Muon+/row-eq + hashed bigram recipe.
+torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \
+    --run=$WANDB_RUN \
+    --fp8 \
+    --depth=22 \
+    --num-iterations=11600 \
+    --target-param-data-ratio=11 \
+    --total-batch-size=524288 \
+    --scalar-lr=0.3 \
+    --bigram-embed-factor=5 \
+    --muon-plus \
+    --muon-eq=row \
+    --core-metric-every=5800 \
+    --sample-every=-1
 # evaluate the model: CORE metric, BPB on train/val, and draw samples
 torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16

--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@ -52,6 +52,7 @@ parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = de
 parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention")
 parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length")
 parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')")
+parser.add_argument("--bigram-embed-factor", type=int, default=0, help="if >0, add a hashed bigram embedding residual")
 # Training horizon (only one used, in order of precedence)
 parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)")
 parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)")
@ -64,6 +65,8 @@ parser.add_argument("--unembedding-lr", type=float, default=0.008, help="learnin
 parser.add_argument("--weight-decay", type=float, default=0.28, help="cautious weight decay for the Muon optimizer (for weights)")
 parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)")
 parser.add_argument("--scalar-lr", type=float, default=0.5, help="learning rate for scalars (resid_lambdas, x0_lambdas)")
+parser.add_argument("--muon-plus", action="store_true", help="apply Muon+ style post-orthogonalization Frobenius renormalization")
+parser.add_argument("--muon-eq", type=str, default="none", choices=["none", "row", "col"], help="apply MuonEq-style row/column equilibration before orthogonalization")
 parser.add_argument("--warmup-steps", type=int, default=40, help="number of steps for LR warmup")
 parser.add_argument("--warmdown-ratio", type=float, default=0.65, help="ratio of iterations for LR warmdown")
 parser.add_argument("--final-lr-frac", type=float, default=0.05, help="final LR as fraction of initial LR")
@ -79,6 +82,8 @@ parser.add_argument("--save-every", type=int, default=-1, help="save checkpoints
 parser.add_argument("--model-tag", type=str, default=None, help="override model tag for checkpoint directory name")
 args = parser.parse_args()
 user_config = vars(args).copy()  # for logging
+if args.bigram_embed_factor < 0:
+    parser.error("--bigram-embed-factor must be non-negative")
 # -----------------------------------------------------------------------------
 # Compute init and wandb logging

@ -137,6 +142,7 @@ def build_model_meta(depth):
        sequence_len=args.max_seq_len, vocab_size=vocab_size,
        n_layer=depth, n_head=num_heads, n_kv_head=num_heads, n_embd=model_dim,
        window_pattern=args.window_pattern,
+        bigram_embed_factor=args.bigram_embed_factor,
    )
    with torch.device("meta"):
        model_meta = GPT(config)
@ -243,7 +249,7 @@ def disable_fp8(model):
 # Compile the model

 orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape)
-model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe
+model = torch.compile(model, dynamic=False, mode="max-autotune-no-cudagraphs") # the inputs to model will never change shape so dynamic=False is safe

 # -----------------------------------------------------------------------------
 # Scaling laws and muP extrapolations to determine the optimal training horizon, batch size, learning rates, weight decay.
@ -305,14 +311,19 @@ if weight_decay_scaled != args.weight_decay:

 # -----------------------------------------------------------------------------
 # Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest)
+muon_eq_axis = {"none": 0, "row": 1, "col": 2}[args.muon_eq]
+print0(f"Muon options: muon_plus={args.muon_plus}, muon_eq={args.muon_eq}")
 optimizer = model.setup_optimizer(
    # AdamW hyperparameters
    unembedding_lr=args.unembedding_lr * batch_lr_scale,
    embedding_lr=args.embedding_lr * batch_lr_scale,
+    bigram_lambda_lr=0.004 * batch_lr_scale,
    scalar_lr=args.scalar_lr * batch_lr_scale,
    # Muon hyperparameters
    matrix_lr=args.matrix_lr * batch_lr_scale,
    weight_decay=weight_decay_scaled,
+    muon_plus=args.muon_plus,
+    muon_eq_axis=muon_eq_axis,
 )

 if resuming:
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@ -47,6 +47,25 @@ class MockModel:
        return logits


+class BigramStateModel(MockModel):
+    """Mock model whose greedy next token is (current id + previous id + 1) % 256; the previous id is read from kv_cache.prev_token on single-token cached calls."""
+    def forward(self, ids, kv_cache=None):
+        B, T = ids.shape  # ids must be 2-D; B and T are its two dimensions
+        if kv_cache is None:
+            prev = torch.cat([torch.zeros(B, 1, dtype=ids.dtype), ids[:, :-1]], dim=1)  # uncached call: ids shifted right by one, 0 in the first column
+        else:
+            if T > 1 or kv_cache.prev_token is None:  # multi-token call, or nothing stored yet: derive prev from ids as in the uncached path
+                prev = torch.cat([torch.zeros(B, 1, dtype=ids.dtype), ids[:, :-1]], dim=1)
+            else:
+                prev = kv_cache.prev_token  # single-token call: the previous id comes from the cache
+            kv_cache.prev_token = ids[:, -1:].clone()  # remember the last id of this call for the next one
+            kv_cache.advance(T)
+        next_token = ((ids + prev + 1) % 256).long()  # the predicted id depends on both the current and the previous id
+        logits = torch.full((B, T, self.vocab_size), -1000.0)  # -1000 everywhere ...
+        logits.scatter_(2, next_token.unsqueeze(-1), 1000.0)  # ... except 1000 at next_token, so argmax over the last dim returns next_token
+        return logits
+
+
 class ByteTokenizer:
    """
    Simple byte-level tokenizer for testing.
@ -114,6 +133,7 @@ def test_kv_cache_basic():
    # Test reset
    kv_cache.reset()
    assert kv_cache.get_pos() == 0
+    assert kv_cache.prev_token is None

    # Test get_layer_cache returns correct views
    k_layer0, v_layer0 = kv_cache.get_layer_cache(0)
@ -136,6 +156,7 @@ def test_kv_cache_prefill():
    # Write some data to source cache
    src_cache.k_cache[0, 0, :16, :, :] = 1.0
    src_cache.v_cache[0, 0, :16, :, :] = 2.0
+    src_cache.prev_token = torch.tensor([[123]])
    src_cache.advance(16)

    # Create destination cache with larger seq_len
@ -153,6 +174,29 @@ def test_kv_cache_prefill():
    # Check data was copied
    assert (dst_cache.k_cache[0, 0, :16, :, :] == 1.0).all()
    assert (dst_cache.v_cache[0, 0, :16, :, :] == 2.0).all()
+    assert dst_cache.prev_token.tolist() == [[123]]
+
+
+def test_engine_preserves_bigram_prev_token_state():
+    """Engine KV-cache generation should match naive generation for previous-token state."""
+    model = BigramStateModel()
+    tokenizer = ByteTokenizer()
+    engine = Engine(model, tokenizer)
+    prompt = [261, 17, 23, 42]  # NOTE(review): 261 is outside the byte range 0-255 - presumably a ByteTokenizer special token; confirm
+    max_tokens = 8
+
+    def naive_generate(tokens):  # reference path: re-run the model on the whole sequence each step, without a KV cache
+        ids = torch.tensor([tokens], dtype=torch.long)
+        out = []
+        for _ in range(max_tokens):
+            logits = model.forward(ids)
+            next_id = int(logits[:, -1, :].argmax(dim=-1).item())  # greedy pick at the last position
+            out.append(next_id)
+            ids = torch.cat([ids, torch.tensor([[next_id]], dtype=torch.long)], dim=1)  # append the pick and feed the longer sequence back in
+        return tokens + out  # prompt followed by the generated ids
+
+    results, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=max_tokens)  # temperature=0.0 is expected to make Engine decode greedily - confirm; second return value unused
+    assert results[0] == naive_generate(prompt)  # the first result row must equal prompt + naively generated ids


 def test_multi_sample_first_token_diversity():