From bacd7efc06ff1f0cdb28f82d4ca9e669ecd86770 Mon Sep 17 00:00:00 2001
From: EFE AYDIN
Date: Fri, 15 May 2026 23:24:21 +0300
Subject: [PATCH] Update tokenizer.py

---
 nanochat/tokenizer.py | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py
index 6ee0603e..1cf23065 100644
--- a/nanochat/tokenizer.py
+++ b/nanochat/tokenizer.py
@@ -4,6 +4,31 @@ BPE Tokenizer in the style of GPT-4.
 Two implementations are available:
 1) HuggingFace Tokenizer that can do both training and inference but is really confusing
 2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
+
+Patch 1 — encode_special: lru_cache → instance dict
+__init__:
+def __init__(self, enc, bos_token):
+    self.enc = enc
+    self._special_id_cache: dict[str, int] = {}
+    self.bos_token_id = self.encode_special(bos_token)
+encode_special (remove the decorator):
+def encode_special(self, text):
+    cached = self._special_id_cache.get(text)
+    if cached is not None:
+        return cached
+    v = self.enc.encode_single_token(text)
+    self._special_id_cache[text] = v
+    return v
+The signature is unchanged and the cache behavior is the same (lookup → hit/miss); the cache now dies together with the instance. Callers see no difference.
+Patch 2 — remove the O(n) shift caused by insert(0)
+RustBPETokenizer.encode, batch branch:
+elif isinstance(text, list):
+    ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
+    if prepend is not None:
+        ids = [[prepend_id, *row] for row in ids]
+    if append is not None:
+        for ids_row in ids:
+            ids_row.append(append_id)
 """
 
 import os