From d28d69f3eae46f921ab878c75a7713067fa8e157 Mon Sep 17 00:00:00 2001 From: Kian Kyars Date: Sun, 23 Nov 2025 08:07:11 -0700 Subject: [PATCH 1/3] reduce list redundancy --- nanochat/tokenizer.py | 133 +++++------------------------------------- scripts/tok_train.py | 3 +- 2 files changed, 14 insertions(+), 122 deletions(-) diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py index 880f854..bbd419b 100644 --- a/nanochat/tokenizer.py +++ b/nanochat/tokenizer.py @@ -3,7 +3,7 @@ BPE Tokenizer in the style of GPT-4. Two implementations are available: 1) HuggingFace Tokenizer that can do both training and inference but is really confusing -2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference +2) Our own PythonBPE Tokenizer for training and tiktoken for efficient inference """ import os @@ -29,124 +29,6 @@ SPECIAL_TOKENS = [ # I haven't validated that this is actually a good idea, TODO. SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" -# ----------------------------------------------------------------------------- -# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer -from tokenizers import Tokenizer as HFTokenizer -from tokenizers import pre_tokenizers, decoders, Regex -from tokenizers.models import BPE -from tokenizers.trainers import BpeTrainer - -class HuggingFaceTokenizer: - """Light wrapper around HuggingFace Tokenizer for some utilities""" - - def __init__(self, tokenizer): - self.tokenizer = tokenizer - - @classmethod - def from_pretrained(cls, hf_path): - # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") - tokenizer = HFTokenizer.from_pretrained(hf_path) - return cls(tokenizer) - - @classmethod - def from_directory(cls, tokenizer_dir): - # init from a local directory on disk (e.g. "out/tokenizer") - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - tokenizer = HFTokenizer.from_file(tokenizer_path) - return cls(tokenizer) - - @classmethod - def train_from_iterator(cls, text_iterator, vocab_size): - # train from an iterator of text - # Configure the HuggingFace Tokenizer - tokenizer = HFTokenizer(BPE( - byte_fallback=True, # needed! - unk_token=None, - fuse_unk=False, - )) - # Normalizer: None - tokenizer.normalizer = None - # Pre-tokenizer: GPT-4 style - # the regex pattern used by GPT-4 to split text into groups before BPE - # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to - # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. - # (but I haven't validated this! TODO) - gpt4_split_regex = Regex(SPLIT_PATTERN) # huggingface demands that you wrap it in Regex!! - tokenizer.pre_tokenizer = pre_tokenizers.Sequence([ - pre_tokenizers.Split(pattern=gpt4_split_regex, behavior="isolated", invert=False), - pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False) - ]) - # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) - tokenizer.decoder = decoders.ByteLevel() - # Post-processor: None - tokenizer.post_processor = None - # Trainer: BPE - trainer = BpeTrainer( - vocab_size=vocab_size, - show_progress=True, - min_frequency=0, # no minimum frequency - initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), - special_tokens=SPECIAL_TOKENS, - ) - # Kick off the training - tokenizer.train_from_iterator(text_iterator, trainer) - return cls(tokenizer) - - def get_vocab_size(self): - return self.tokenizer.get_vocab_size() - - def get_special_tokens(self): - special_tokens_map = self.tokenizer.get_added_tokens_decoder() - special_tokens = [w.content for w in special_tokens_map.values()] - return special_tokens - - def id_to_token(self, id): - return self.tokenizer.id_to_token(id) - - def _encode_one(self, text, prepend=None, append=None): - # encode a single string - # prepend/append can be either a string of a special token or a token id directly. - assert isinstance(text, str) - ids = [] - if prepend is not None: - prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend) - ids.append(prepend_id) - ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) - if append is not None: - append_id = append if isinstance(append, int) else self.encode_special(append) - ids.append(append_id) - return ids - - def encode_special(self, text): - # encode a single special token via exact match - return self.tokenizer.token_to_id(text) - - def get_bos_token_id(self): - bos = self.encode_special("<|bos|>") - return bos - - def encode(self, text, *args, **kwargs): - if isinstance(text, str): - return self._encode_one(text, *args, **kwargs) - elif isinstance(text, list): - return [self._encode_one(t, *args, **kwargs) for t in text] - else: - raise ValueError(f"Invalid input type: {type(text)}") - - def __call__(self, *args, **kwargs): - return self.encode(*args, **kwargs) - - def decode(self, ids): - return self.tokenizer.decode(ids, skip_special_tokens=False) - - def save(self, tokenizer_dir): - # save the tokenizer to disk - os.makedirs(tokenizer_dir, exist_ok=True) - tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") - self.tokenizer.save(tokenizer_path) - print(f"Saved tokenizer to {tokenizer_path}") - -# ----------------------------------------------------------------------------- # Tokenizer based on rustbpe + tiktoken combo import pickle import rustbpe @@ -296,6 +178,7 @@ class RustBPETokenizer: # some sanity checking here around assumptions, to prevent footguns must_be_from = "user" if i % 2 == 0 else "assistant" + # check user vs assistant assert message["role"] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}" # content can be either a simple string or a list of parts (e.g. containing tool calls) @@ -303,37 +186,47 @@ class RustBPETokenizer: if message["role"] == "user": assert isinstance(content, str), "User messages are simply expected to be strings" - value_ids = self.encode(content) add_tokens(user_start, 0) + value_ids = self.encode(content) add_tokens(value_ids, 0) add_tokens(user_end, 0) + # assitant elif message["role"] == "assistant": + # add assistant start tokens add_tokens(assistant_start, 0) if isinstance(content, str): # simple string => simply add the tokens value_ids = self.encode(content) add_tokens(value_ids, 1) + # then we will go straight to add_tokens for assitant end, unless we have unknown content type + # these are the more nuanced cases elif isinstance(content, list): for part in content: + # for element in list value_ids = self.encode(part["text"]) + # encode each element if part["type"] == "text": # string part => simply add the tokens add_tokens(value_ids, 1) + # if it was text, we add without any other special tokens elif part["type"] == "python": # python tool call => add the tokens inside <|python_start|> and <|python_end|> add_tokens(python_start, 1) + # add python special tokens in this case add_tokens(value_ids, 1) add_tokens(python_end, 1) elif part["type"] == "python_output": # python output => add the tokens inside <|output_start|> and <|output_end|> # none of these tokens are supervised because the tokens come from Python at test time add_tokens(output_start, 0) + # python output, looks like this is the python output of the python generated by the llm add_tokens(value_ids, 0) add_tokens(output_end, 0) else: raise ValueError(f"Unknown part type: {part['type']}") else: raise ValueError(f"Unknown content type: {type(content)}") + # add assitant end tokens add_tokens(assistant_end, 1) # truncate to max_tokens tokens MAX (helps prevent OOMs) diff --git a/scripts/tok_train.py b/scripts/tok_train.py index c2faf17..2fd8fea 100644 --- a/scripts/tok_train.py +++ b/scripts/tok_train.py @@ -77,8 +77,7 @@ vocab_size = tokenizer.get_vocab_size() special_set = set(tokenizer.get_special_tokens()) token_strings = [tokenizer.decode([token_id]) for token_id in range(vocab_size)] token_bytes = [] -for token_id in range(vocab_size): - token_str = token_strings[token_id] # the Python string representation of this token +for token_str in token_strings: if token_str in special_set: token_bytes.append(0) # special characters are not counted else: From 1d719a7c94c35a07885b1044a5275b82585282f1 Mon Sep 17 00:00:00 2001 From: Kian Kyars Date: Sun, 23 Nov 2025 08:07:52 -0700 Subject: [PATCH 2/3] add back hugging face tokenizer --- nanochat/tokenizer.py | 133 +++++++++++++++++++++++++++++++++++++----- 1 file changed, 120 insertions(+), 13 deletions(-) diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py index bbd419b..880f854 100644 --- a/nanochat/tokenizer.py +++ b/nanochat/tokenizer.py @@ -3,7 +3,7 @@ BPE Tokenizer in the style of GPT-4. Two implementations are available: 1) HuggingFace Tokenizer that can do both training and inference but is really confusing -2) Our own PythonBPE Tokenizer for training and tiktoken for efficient inference +2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference """ import os @@ -29,6 +29,124 @@ SPECIAL_TOKENS = [ # I haven't validated that this is actually a good idea, TODO. SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" +# ----------------------------------------------------------------------------- +# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer +from tokenizers import Tokenizer as HFTokenizer +from tokenizers import pre_tokenizers, decoders, Regex +from tokenizers.models import BPE +from tokenizers.trainers import BpeTrainer + +class HuggingFaceTokenizer: + """Light wrapper around HuggingFace Tokenizer for some utilities""" + + def __init__(self, tokenizer): + self.tokenizer = tokenizer + + @classmethod + def from_pretrained(cls, hf_path): + # init from a HuggingFace pretrained tokenizer (e.g. "gpt2") + tokenizer = HFTokenizer.from_pretrained(hf_path) + return cls(tokenizer) + + @classmethod + def from_directory(cls, tokenizer_dir): + # init from a local directory on disk (e.g. "out/tokenizer") + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + tokenizer = HFTokenizer.from_file(tokenizer_path) + return cls(tokenizer) + + @classmethod + def train_from_iterator(cls, text_iterator, vocab_size): + # train from an iterator of text + # Configure the HuggingFace Tokenizer + tokenizer = HFTokenizer(BPE( + byte_fallback=True, # needed! + unk_token=None, + fuse_unk=False, + )) + # Normalizer: None + tokenizer.normalizer = None + # Pre-tokenizer: GPT-4 style + # the regex pattern used by GPT-4 to split text into groups before BPE + # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to + # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space. + # (but I haven't validated this! TODO) + gpt4_split_regex = Regex(SPLIT_PATTERN) # huggingface demands that you wrap it in Regex!! + tokenizer.pre_tokenizer = pre_tokenizers.Sequence([ + pre_tokenizers.Split(pattern=gpt4_split_regex, behavior="isolated", invert=False), + pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False) + ]) + # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer) + tokenizer.decoder = decoders.ByteLevel() + # Post-processor: None + tokenizer.post_processor = None + # Trainer: BPE + trainer = BpeTrainer( + vocab_size=vocab_size, + show_progress=True, + min_frequency=0, # no minimum frequency + initial_alphabet=pre_tokenizers.ByteLevel.alphabet(), + special_tokens=SPECIAL_TOKENS, + ) + # Kick off the training + tokenizer.train_from_iterator(text_iterator, trainer) + return cls(tokenizer) + + def get_vocab_size(self): + return self.tokenizer.get_vocab_size() + + def get_special_tokens(self): + special_tokens_map = self.tokenizer.get_added_tokens_decoder() + special_tokens = [w.content for w in special_tokens_map.values()] + return special_tokens + + def id_to_token(self, id): + return self.tokenizer.id_to_token(id) + + def _encode_one(self, text, prepend=None, append=None): + # encode a single string + # prepend/append can be either a string of a special token or a token id directly. + assert isinstance(text, str) + ids = [] + if prepend is not None: + prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend) + ids.append(prepend_id) + ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids) + if append is not None: + append_id = append if isinstance(append, int) else self.encode_special(append) + ids.append(append_id) + return ids + + def encode_special(self, text): + # encode a single special token via exact match + return self.tokenizer.token_to_id(text) + + def get_bos_token_id(self): + bos = self.encode_special("<|bos|>") + return bos + + def encode(self, text, *args, **kwargs): + if isinstance(text, str): + return self._encode_one(text, *args, **kwargs) + elif isinstance(text, list): + return [self._encode_one(t, *args, **kwargs) for t in text] + else: + raise ValueError(f"Invalid input type: {type(text)}") + + def __call__(self, *args, **kwargs): + return self.encode(*args, **kwargs) + + def decode(self, ids): + return self.tokenizer.decode(ids, skip_special_tokens=False) + + def save(self, tokenizer_dir): + # save the tokenizer to disk + os.makedirs(tokenizer_dir, exist_ok=True) + tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json") + self.tokenizer.save(tokenizer_path) + print(f"Saved tokenizer to {tokenizer_path}") + +# ----------------------------------------------------------------------------- # Tokenizer based on rustbpe + tiktoken combo import pickle import rustbpe @@ -178,7 +296,6 @@ class RustBPETokenizer: # some sanity checking here around assumptions, to prevent footguns must_be_from = "user" if i % 2 == 0 else "assistant" - # check user vs assistant assert message["role"] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}" # content can be either a simple string or a list of parts (e.g. containing tool calls) @@ -186,47 +303,37 @@ class RustBPETokenizer: if message["role"] == "user": assert isinstance(content, str), "User messages are simply expected to be strings" - add_tokens(user_start, 0) value_ids = self.encode(content) + add_tokens(user_start, 0) add_tokens(value_ids, 0) add_tokens(user_end, 0) - # assitant elif message["role"] == "assistant": - # add assistant start tokens add_tokens(assistant_start, 0) if isinstance(content, str): # simple string => simply add the tokens value_ids = self.encode(content) add_tokens(value_ids, 1) - # then we will go straight to add_tokens for assitant end, unless we have unknown content type - # these are the more nuanced cases elif isinstance(content, list): for part in content: - # for element in list value_ids = self.encode(part["text"]) - # encode each element if part["type"] == "text": # string part => simply add the tokens add_tokens(value_ids, 1) - # if it was text, we add without any other special tokens elif part["type"] == "python": # python tool call => add the tokens inside <|python_start|> and <|python_end|> add_tokens(python_start, 1) - # add python special tokens in this case add_tokens(value_ids, 1) add_tokens(python_end, 1) elif part["type"] == "python_output": # python output => add the tokens inside <|output_start|> and <|output_end|> # none of these tokens are supervised because the tokens come from Python at test time add_tokens(output_start, 0) - # python output, looks like this is the python output of the python generated by the llm add_tokens(value_ids, 0) add_tokens(output_end, 0) else: raise ValueError(f"Unknown part type: {part['type']}") else: raise ValueError(f"Unknown content type: {type(content)}") - # add assitant end tokens add_tokens(assistant_end, 1) # truncate to max_tokens tokens MAX (helps prevent OOMs) From 2f4f20862d22157f410916a1cf081f9576c1c88b Mon Sep 17 00:00:00 2001 From: Kian Kyars Date: Sun, 23 Nov 2025 08:09:28 -0700 Subject: [PATCH 3/3] add back comment --- scripts/tok_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tok_train.py b/scripts/tok_train.py index 2fd8fea..1bc9249 100644 --- a/scripts/tok_train.py +++ b/scripts/tok_train.py @@ -77,7 +77,7 @@ vocab_size = tokenizer.get_vocab_size() special_set = set(tokenizer.get_special_tokens()) token_strings = [tokenizer.decode([token_id]) for token_id in range(vocab_size)] token_bytes = [] -for token_str in token_strings: +for token_str in token_strings: # the Python string representation of this token if token_str in special_set: token_bytes.append(0) # special characters are not counted else: