From dd1f606c526cc43f5a892e2e21210c6f125688dd Mon Sep 17 00:00:00 2001
From: Artemis Git Integration
Date: Mon, 3 Nov 2025 10:00:19 +0000
Subject: [PATCH] feat(gpt): initialize KVCache for efficient generation with
 MQA support

Add KVCache pre-allocation in generate() method to enable efficient
key-value caching during token generation, avoiding dynamic reallocation
overhead
---
 nanochat/gpt.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index 82f13b6..2280de6 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -305,6 +305,15 @@ class GPT(nn.Module):
         if temperature > 0:
             rng = torch.Generator(device=device)
             rng.manual_seed(seed)
+        # Initialize KV cache for efficient generation
+        kv_length_hint = len(tokens) + max_tokens
+        kv_cache = KVCache(
+            batch_size=1,
+            num_heads=self.config.n_kv_head,
+            seq_len=kv_length_hint,
+            head_dim=self.config.n_embd // self.config.n_head,
+            num_layers=self.config.n_layer
+        )
         ids = torch.tensor([tokens], dtype=torch.long, device=device) # add batch dim
         for _ in range(max_tokens):
             logits = self.forward(ids) # (B, T, vocab_size)