From 23985413aaa30393802f1dbad67c80e698e9bb5a Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Tue, 13 Jan 2026 17:50:39 +0000
Subject: [PATCH] adjust the comment on the regex pattern per recent experiment

see dev/LOG.md
---
 nanochat/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py
index e8ccafa..a2146c2 100644
--- a/nanochat/tokenizer.py
+++ b/nanochat/tokenizer.py
@@ -26,7 +26,7 @@ SPECIAL_TOKENS = [
 
 # NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
 # I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
-# I haven't validated that this is actually a good idea, TODO.
+# I verified that 2 is the sweet spot for vocab size of 32K. 1 is a bit worse, 3 was worse still.
 SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
 # -----------------------------------------------------------------------------
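
For reference, a minimal sketch (not part of the patch) of what the \p{N}{1,2} rule does to digit runs, assuming the third-party `regex` module, which supports the \p{...} Unicode properties and possessive quantifiers that SPLIT_PATTERN relies on (the stdlib `re` does not):

    import regex

    SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

    # \p{N}{1,2} pre-splits digit runs into chunks of at most 2 digits,
    # so BPE can never learn a merged numeric token longer than 2 digits:
    print(regex.findall(SPLIT_PATTERN, "year 2026, pi=3.14159"))
    # -> ['year', ' ', '20', '26', ',', ' pi', '=', '3', '.', '14', '15', '9']

With \p{N}{1,3} (the GPT-4 choice), "2026" would instead split as ['202', '6'] and "14159" as ['141', '59'], leaving up-to-3-digit chunks for BPE to spend vocabulary slots on.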