From bacd7efc06ff1f0cdb28f82d4ca9e669ecd86770 Mon Sep 17 00:00:00 2001
From: EFE AYDIN <aydinefe290@gmail.com>
Date: Fri, 15 May 2026 23:24:21 +0300
Subject: [PATCH] Update tokenizer.py

---
 nanochat/tokenizer.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py
index 6ee0603e..1cf23065 100644
--- a/nanochat/tokenizer.py
+++ b/nanochat/tokenizer.py
@@ -4,6 +4,31 @@ BPE Tokenizer in the style of GPT-4.
 Two implementations are available:
 1) HuggingFace Tokenizer that can do both training and inference but is really confusing
 2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
+
+Patch 1 — encode_special lru_cache → instance dict
+__init__:
+def __init__(self, enc, bos_token):
+    self.enc = enc
+    self._special_id_cache: dict[str, int] = {}
+    self.bos_token_id = self.encode_special(bos_token)
+encode_special (remove the decorator):
+def encode_special(self, text):
+    cached = self._special_id_cache.get(text)
+    if cached is not None:
+        return cached
+    v = self.enc.encode_single_token(text)
+    self._special_id_cache[text] = v
+    return v
+The signature is unchanged and the cache behavior is the same (lookup → hit/miss); when the instance dies, the cache dies with it. Callers notice no difference.
+Patch 2 — remove the O(n) shift caused by insert(0)
+RustBPETokenizer.encode, batch branch:
+elif isinstance(text, list):
+    ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
+    if prepend is not None:
+        ids = [[prepend_id, *row] for row in ids]
+    if append is not None:
+        for ids_row in ids:
+            ids_row.append(append_id)
 """
 
 import os