adjust the comment on the regex pattern per recent experiment, see dev/LOG.md

Andrej Karpathy 2026-01-13 17:50:39 +00:00
parent 64b48d0e5c
commit 23985413aa


@@ -26,7 +26,7 @@ SPECIAL_TOKENS = [
# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
-# I haven't validated that this is actually a good idea, TODO.
+# I verified that 2 is the sweet spot for vocab size of 32K. 1 is a bit worse, 3 was worse still.
SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
# -----------------------------------------------------------------------------
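For context, here is a minimal sketch (not part of this commit) of how the split pattern chunks text before BPE. It assumes the third-party `regex` package is installed, since the `\p{L}`/`\p{N}` Unicode property escapes used in the pattern are not supported by the standard-library `re` module.

```python
# Minimal sketch: apply the split pattern to a sample string and inspect the chunks.
import regex as re  # third-party `regex` package, not the stdlib `re`

SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

chunks = re.findall(SPLIT_PATTERN, "In 2026 we'll train 12345 tokens.")
print(chunks)
# expected: ['In', ' ', '20', '26', ' we', "'ll", ' train', ' ', '12', '34', '5', ' tokens', '.']
# Runs of digits are broken into chunks of at most 2 characters, per \p{N}{1,2}.
```

The difference from GPT-4's pattern is only the digit alternative: with `\p{N}{1,3}` the same input would split numbers into up to 3-digit chunks ('202', '6' and '123', '45'), which spends more of a small vocabulary's merges on number tokens.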