Mirror of https://github.com/karpathy/nanochat.git (synced 2025-12-07 12:52:16 +00:00)

Commit d28d69f3ea ("reduce list redundancy"), parent 4a87a0d19f
@@ -3,7 +3,7 @@ BPE Tokenizer in the style of GPT-4.
 
 Two implementations are available:
 1) HuggingFace Tokenizer that can do both training and inference but is really confusing
 2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
 """
 
 import os
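For context on the "RustBPE for training, tiktoken for efficient inference" split mentioned in the docstring: once training has produced a bytes-to-rank merge table, it can be handed to tiktoken directly. A minimal sketch, using a made-up toy rank table and token ids rather than anything from this commit:

```python
import tiktoken

# Toy merge table (bytes -> rank). A real tokenizer has tens of thousands of
# entries produced by BPE training; here we only include the 256 raw bytes
# plus the merges needed to encode the demo string "hello".
mergeable_ranks = {bytes([i]): i for i in range(256)}
mergeable_ranks.update({b"he": 256, b"ll": 257, b"hell": 258, b"hello": 259})

enc = tiktoken.Encoding(
    name="demo_bpe",  # arbitrary name for this toy encoding
    pat_str=r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""",
    mergeable_ranks=mergeable_ranks,   # lowest ranks are merged first
    special_tokens={"<|bos|>": 260},   # specials sit above the regular vocab
)

ids = enc.encode("hello")
print(ids)              # [259] with this toy table
print(enc.decode(ids))  # hello
```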
@@ -29,124 +29,6 @@ SPECIAL_TOKENS = [
 # I haven't validated that this is actually a good idea, TODO.
 SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
 
-# -----------------------------------------------------------------------------
-# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
-from tokenizers import Tokenizer as HFTokenizer
-from tokenizers import pre_tokenizers, decoders, Regex
-from tokenizers.models import BPE
-from tokenizers.trainers import BpeTrainer
-
-class HuggingFaceTokenizer:
-    """Light wrapper around HuggingFace Tokenizer for some utilities"""
-
-    def __init__(self, tokenizer):
-        self.tokenizer = tokenizer
-
-    @classmethod
-    def from_pretrained(cls, hf_path):
-        # init from a HuggingFace pretrained tokenizer (e.g. "gpt2")
-        tokenizer = HFTokenizer.from_pretrained(hf_path)
-        return cls(tokenizer)
-
-    @classmethod
-    def from_directory(cls, tokenizer_dir):
-        # init from a local directory on disk (e.g. "out/tokenizer")
-        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
-        tokenizer = HFTokenizer.from_file(tokenizer_path)
-        return cls(tokenizer)
-
-    @classmethod
-    def train_from_iterator(cls, text_iterator, vocab_size):
-        # train from an iterator of text
-        # Configure the HuggingFace Tokenizer
-        tokenizer = HFTokenizer(BPE(
-            byte_fallback=True, # needed!
-            unk_token=None,
-            fuse_unk=False,
-        ))
-        # Normalizer: None
-        tokenizer.normalizer = None
-        # Pre-tokenizer: GPT-4 style
-        # the regex pattern used by GPT-4 to split text into groups before BPE
-        # NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
-        # very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
-        # (but I haven't validated this! TODO)
-        gpt4_split_regex = Regex(SPLIT_PATTERN) # huggingface demands that you wrap it in Regex!!
-        tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
-            pre_tokenizers.Split(pattern=gpt4_split_regex, behavior="isolated", invert=False),
-            pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
-        ])
-        # Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
-        tokenizer.decoder = decoders.ByteLevel()
-        # Post-processor: None
-        tokenizer.post_processor = None
-        # Trainer: BPE
-        trainer = BpeTrainer(
-            vocab_size=vocab_size,
-            show_progress=True,
-            min_frequency=0, # no minimum frequency
-            initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
-            special_tokens=SPECIAL_TOKENS,
-        )
-        # Kick off the training
-        tokenizer.train_from_iterator(text_iterator, trainer)
-        return cls(tokenizer)
-
-    def get_vocab_size(self):
-        return self.tokenizer.get_vocab_size()
-
-    def get_special_tokens(self):
-        special_tokens_map = self.tokenizer.get_added_tokens_decoder()
-        special_tokens = [w.content for w in special_tokens_map.values()]
-        return special_tokens
-
-    def id_to_token(self, id):
-        return self.tokenizer.id_to_token(id)
-
-    def _encode_one(self, text, prepend=None, append=None):
-        # encode a single string
-        # prepend/append can be either a string of a special token or a token id directly.
-        assert isinstance(text, str)
-        ids = []
-        if prepend is not None:
-            prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
-            ids.append(prepend_id)
-        ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids)
-        if append is not None:
-            append_id = append if isinstance(append, int) else self.encode_special(append)
-            ids.append(append_id)
-        return ids
-
-    def encode_special(self, text):
-        # encode a single special token via exact match
-        return self.tokenizer.token_to_id(text)
-
-    def get_bos_token_id(self):
-        bos = self.encode_special("<|bos|>")
-        return bos
-
-    def encode(self, text, *args, **kwargs):
-        if isinstance(text, str):
-            return self._encode_one(text, *args, **kwargs)
-        elif isinstance(text, list):
-            return [self._encode_one(t, *args, **kwargs) for t in text]
-        else:
-            raise ValueError(f"Invalid input type: {type(text)}")
-
-    def __call__(self, *args, **kwargs):
-        return self.encode(*args, **kwargs)
-
-    def decode(self, ids):
-        return self.tokenizer.decode(ids, skip_special_tokens=False)
-
-    def save(self, tokenizer_dir):
-        # save the tokenizer to disk
-        os.makedirs(tokenizer_dir, exist_ok=True)
-        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
-        self.tokenizer.save(tokenizer_path)
-        print(f"Saved tokenizer to {tokenizer_path}")
-
-# -----------------------------------------------------------------------------
 # Tokenizer based on rustbpe + tiktoken combo
 import pickle
 import rustbpe
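The SPLIT_PATTERN kept above is the GPT-4 split regex with digit runs capped at two (\p{N}{1,2} instead of \p{N}{1,3}), as discussed in the deleted NOTE comment. A quick way to see what this pre-tokenization does, as an illustration only: the pattern's possessive quantifiers (?+, ++) are not supported by Python's built-in re, so this uses the third-party regex package.

```python
import regex  # pip install regex; supports \p{...} classes and possessive quantifiers

SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

chunks = regex.findall(SPLIT_PATTERN, "you've got 12345 reasons!")
print(chunks)
# ['you', "'ve", ' got', ' ', '12', '34', '5', ' reasons', '!']
# digit runs are chunked into groups of at most two digits before BPE ever
# sees them, which is the {1,3} -> {1,2} change the deleted NOTE talks about
```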
@@ -296,6 +178,7 @@ class RustBPETokenizer:
 
             # some sanity checking here around assumptions, to prevent footguns
             must_be_from = "user" if i % 2 == 0 else "assistant"
+            # check user vs assistant
             assert message["role"] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}"
 
             # content can be either a simple string or a list of parts (e.g. containing tool calls)
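The sanity check in this hunk enforces strict user/assistant alternation starting with a user turn. Standalone, with a made-up message list, the same check looks like this:

```python
# Hypothetical conversation; the shape (alternating roles, user first) is what
# the assertion in the hunk above assumes.
messages = [
    {"role": "user", "content": "What is 2 + 2?"},
    {"role": "assistant", "content": "4"},
]

for i, message in enumerate(messages):
    must_be_from = "user" if i % 2 == 0 else "assistant"
    assert message["role"] == must_be_from, \
        f"Message {i} is from {message['role']} but should be from {must_be_from}"
```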
@@ -303,37 +186,47 @@ class RustBPETokenizer:
 
             if message["role"] == "user":
                 assert isinstance(content, str), "User messages are simply expected to be strings"
-                value_ids = self.encode(content)
                 add_tokens(user_start, 0)
+                value_ids = self.encode(content)
                 add_tokens(value_ids, 0)
                 add_tokens(user_end, 0)
+            # assistant
             elif message["role"] == "assistant":
+                # add assistant start tokens
                 add_tokens(assistant_start, 0)
                 if isinstance(content, str):
                     # simple string => simply add the tokens
                     value_ids = self.encode(content)
                     add_tokens(value_ids, 1)
+                    # then we go straight to add_tokens for assistant end, unless we have an unknown content type
+                # these are the more nuanced cases
                 elif isinstance(content, list):
                     for part in content:
+                        # for each element in the list
                         value_ids = self.encode(part["text"])
+                        # encode each element
                         if part["type"] == "text":
                             # string part => simply add the tokens
                             add_tokens(value_ids, 1)
+                            # if it was text, we add it without any other special tokens
                         elif part["type"] == "python":
                             # python tool call => add the tokens inside <|python_start|> and <|python_end|>
                             add_tokens(python_start, 1)
+                            # add the python special tokens in this case
                             add_tokens(value_ids, 1)
                             add_tokens(python_end, 1)
                         elif part["type"] == "python_output":
                             # python output => add the tokens inside <|output_start|> and <|output_end|>
                             # none of these tokens are supervised because the tokens come from Python at test time
                             add_tokens(output_start, 0)
+                            # i.e. the output of running the python that the llm generated
                             add_tokens(value_ids, 0)
                             add_tokens(output_end, 0)
                         else:
                             raise ValueError(f"Unknown part type: {part['type']}")
                 else:
                     raise ValueError(f"Unknown content type: {type(content)}")
+                # add assistant end tokens
                 add_tokens(assistant_end, 1)
 
         # truncate to max_tokens tokens MAX (helps prevent OOMs)
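To make the 0/1 arguments in this hunk concrete: every add_tokens(ids, m) call appends token ids plus a parallel supervision mask (1 = the model is trained on these tokens, 0 = it is not). Below is a minimal, self-contained sketch of that bookkeeping for the plain-string case; the special-token ids and the encode stub are placeholders, not the tokenizer's real values:

```python
# Placeholder special-token ids, for illustration only.
user_start, user_end = 100, 101
assistant_start, assistant_end = 102, 103

def encode(text):
    # stand-in for the real BPE encode: one fake id per character
    return [ord(c) for c in text]

def render_conversation(messages):
    ids, mask = [], []

    def add_tokens(token_ids, m):
        token_ids = [token_ids] if isinstance(token_ids, int) else token_ids
        ids.extend(token_ids)
        mask.extend([m] * len(token_ids))

    for message in messages:
        if message["role"] == "user":
            add_tokens(user_start, 0)                  # user turns are not supervised
            add_tokens(encode(message["content"]), 0)
            add_tokens(user_end, 0)
        else:
            add_tokens(assistant_start, 0)             # the start token is not predicted by the model
            add_tokens(encode(message["content"]), 1)  # assistant content is supervised
            add_tokens(assistant_end, 1)               # ...and so is the end-of-turn token
    return ids, mask

ids, mask = render_conversation([
    {"role": "user", "content": "hi"},
    {"role": "assistant", "content": "yo"},
])
print(ids)   # [100, 104, 105, 101, 102, 121, 111, 103]
print(mask)  # [0, 0, 0, 0, 0, 1, 1, 1]
```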
@@ -77,8 +77,7 @@ vocab_size = tokenizer.get_vocab_size()
 special_set = set(tokenizer.get_special_tokens())
 token_strings = [tokenizer.decode([token_id]) for token_id in range(vocab_size)]
 token_bytes = []
-for token_id in range(vocab_size):
-    token_str = token_strings[token_id] # the Python string representation of this token
+for token_str in token_strings:
     if token_str in special_set:
         token_bytes.append(0) # special characters are not counted
     else:
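This last hunk is the change the commit message refers to: iterate over token_strings directly instead of re-indexing it by position. The else branch is cut off in the diff; below is a sketch of the whole loop, assuming (not shown in this commit) that it counts UTF-8 bytes per non-special token:

```python
# Stand-in data so the loop runs on its own; the real script builds these from
# the trained tokenizer (see the context lines above).
special_set = {"<|bos|>"}
token_strings = ["<|bos|>", "hello", " world"]

token_bytes = []
for token_str in token_strings:  # no index needed, hence "reduce list redundancy"
    if token_str in special_set:
        token_bytes.append(0)    # special tokens are not counted
    else:
        token_bytes.append(len(token_str.encode("utf-8")))  # assumed else branch

print(token_bytes)  # [0, 5, 6]
```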