From 23985413aaa30393802f1dbad67c80e698e9bb5a Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Tue, 13 Jan 2026 17:50:39 +0000
Subject: [PATCH] adjust the comment on the regex pattern per recent experiment

see dev/LOG.md
---
 nanochat/tokenizer.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py
index e8ccafa..a2146c2 100644
--- a/nanochat/tokenizer.py
+++ b/nanochat/tokenizer.py
@@ -26,7 +26,7 @@ SPECIAL_TOKENS = [
 
 # NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
 # I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
-# I haven't validated that this is actually a good idea, TODO.
+# I verified that 2 is the sweet spot for vocab size of 32K. 1 is a bit worse, 3 was worse still.
 SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
 # -----------------------------------------------------------------------------
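
For reference, a minimal sketch (not part of the patch) of what the \p{N}{1,2} rule does to digit runs, assuming the third-party `regex` module, which supports the \p{...} Unicode properties and possessive quantifiers that SPLIT_PATTERN relies on (the stdlib `re` does not):

    import regex

    SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""

    # \p{N}{1,2} pre-splits digit runs into chunks of at most 2 digits,
    # so BPE can never learn a merged numeric token longer than 2 digits:
    print(regex.findall(SPLIT_PATTERN, "year 2026, pi=3.14159"))
    # -> ['year', ' ', '20', '26', ',', ' pi', '=', '3', '.', '14', '15', '9']

With \p{N}{1,3} (the GPT-4 choice), "2026" would instead split as ['202', '6'] and "14159" as ['141', '59'], leaving up-to-3-digit chunks for BPE to spend vocabulary slots on.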