From b70da6d9071b316ce361c316cbe72bb0923ad75c Mon Sep 17 00:00:00 2001
From: SyedaAnshrahGillani
Date: Tue, 14 Oct 2025 16:10:48 +0500
Subject: [PATCH] refactor: Pre-allocate larger KVCache to improve performance

The `KVCache.insert_kv` function was dynamically resizing the cache
tensor whenever it ran out of space. This can be inefficient and lead
to performance degradation, especially during long generation
sequences.

This commit improves performance by pre-allocating a larger cache
upfront, reducing the need for frequent resizing and minimizing memory
reallocation overhead.
---
 nanochat/engine.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/nanochat/engine.py b/nanochat/engine.py
index f9c5d9a..53ced30 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -106,7 +106,12 @@ class KVCache:
     def insert_kv(self, layer_idx, k, v):
         # Lazy initialize the cache here because we need to know the dtype/device
         if self.kv_cache is None:
+            # Pre-allocate a larger cache to avoid frequent resizing
+            self.kv_shape = list(self.kv_shape)
+            self.kv_shape[4] *= 2 # Double the sequence length for pre-allocation
             self.kv_cache = torch.empty(self.kv_shape, dtype=k.dtype, device=k.device)
+            self.kv_shape[4] //= 2 # a bit of a hack to restore the original shape for future checks
+
         # Insert new keys/values to the cache and return the full cache so far
         B, H, T_add, D = k.size()
         t0, t1 = self.pos, self.pos + T_add