mirror of
https://github.com/karpathy/nanochat.git
synced 2026-06-15 02:29:09 +00:00
Update tokenizer.py
This commit is contained in:
parent
990a26332c
commit
bacd7efc06
|
|
@ -4,6 +4,31 @@ BPE Tokenizer in the style of GPT-4.
|
|||
Two implementations are available:
|
||||
1) HuggingFace Tokenizer that can do both training and inference but is really confusing
|
||||
2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
|
||||
|
||||
Patch 1 — encode_special lru_cache → instance dict
|
||||
__init__:
|
||||
def __init__(self, enc, bos_token):
|
||||
self.enc = enc
|
||||
self._special_id_cache: dict[str, int] = {}
|
||||
self.bos_token_id = self.encode_special(bos_token)
|
||||
encode_special (remove the lru_cache decorator):
|
||||
def encode_special(self, text):
|
||||
cached = self._special_id_cache.get(text)
|
||||
if cached is not None:
|
||||
return cached
|
||||
v = self.enc.encode_single_token(text)
|
||||
self._special_id_cache[text] = v
|
||||
return v
|
||||
The signature is unchanged and the cache behaves the same (lookup → hit/miss); the cache now dies together with the instance. Callers notice no difference.
|
||||
Patch 2 — remove the O(n) shift caused by insert(0)
|
||||
RustBPETokenizer.encode, batch branch:
|
||||
elif isinstance(text, list):
|
||||
ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
|
||||
if prepend is not None:
|
||||
ids = [[prepend_id, *row] for row in ids]
|
||||
if append is not None:
|
||||
for ids_row in ids:
|
||||
ids_row.append(append_id)
|
||||
"""
|
||||
|
||||
import os
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user