From 48aaa4b3dfae600b18a9828f9b278afa70fb843e Mon Sep 17 00:00:00 2001
From: Daniel Aioanei
Date: Fri, 9 Jan 2026 22:37:53 +0100
Subject: [PATCH] Download the minimum number of parquet shards to train the
 tokenizer reproducibly

---
 dev/runcpu.sh | 2 +-
 run1000.sh    | 2 +-
 speedrun.sh   | 9 +++++----
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/dev/runcpu.sh b/dev/runcpu.sh
index c4a719e..a58bfbc 100755
--- a/dev/runcpu.sh
+++ b/dev/runcpu.sh
@@ -24,7 +24,7 @@ fi
 python -m nanochat.report reset
 
 # train tokenizer on ~1B characters
-python -m nanochat.dataset -n 4
+python -m nanochat.dataset -n 6
 python -m scripts.tok_train --max_chars=1000000000
 python -m scripts.tok_eval
 
diff --git a/run1000.sh b/run1000.sh
index a7a3716..669b279 100644
--- a/run1000.sh
+++ b/run1000.sh
@@ -19,7 +19,7 @@ python -m nanochat.report reset
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 
 # train tokenizer on ~4B characters and kick off download of the rest for pretraining
-python -m nanochat.dataset -n 16
+python -m nanochat.dataset -n 21
 # start downloading the rest of the shards for a total of 800 (see below why 800)
 python -m nanochat.dataset -n 800 &
 # todo: download the rest of it
diff --git a/speedrun.sh b/speedrun.sh
index f9be227..9f445b3 100644
--- a/speedrun.sh
+++ b/speedrun.sh
@@ -50,10 +50,11 @@ python -m nanochat.report reset
 
 # Download the first ~2B characters of pretraining dataset
 # look at dev/repackage_data_reference.py for details on how this data was prepared
-# each data shard is ~250M chars
-# so we download 2e9 / 250e6 = 8 data shards at this point
-# each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
-python -m nanochat.dataset -n 8
+# each data shard is ~250M chars, but due to the `doc_cap` only ~200M chars are used.
+# The last shard is considered to be eval, so we download 2e9 / 200e6 + 1 = 11 data
+# shards at this point
+# each shard is ~90MB of text (compressed), so this is about ~1GB of data on disk
+python -m nanochat.dataset -n 11
 # Immediately also kick off downloading more shards in the background while tokenizer trains
 # See comment below for why 240 is the right number here
 python -m nanochat.dataset -n 240 &
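
The new shard counts all follow the arithmetic spelled out in the updated speedrun.sh comment: roughly 200M usable characters per shard once the doc cap is applied, plus one extra shard held out for eval. Below is a minimal sanity-check sketch of that calculation; the helper name and the constant are illustrative only, not part of the nanochat codebase.

import math

# Assumptions taken from the patch comments: each parquet shard carries ~250M chars,
# of which only ~200M are usable after the doc cap, and one shard is held out for eval.
USABLE_CHARS_PER_SHARD = 200_000_000  # estimate from the comment, not a real nanochat constant

def shards_to_download(target_chars: int, eval_shards: int = 1) -> int:
    """Minimum number of shards to fetch to cover target_chars of tokenizer training text."""
    return math.ceil(target_chars / USABLE_CHARS_PER_SHARD) + eval_shards

print(shards_to_download(1_000_000_000))  # 6  -> dev/runcpu.sh (~1B chars)
print(shards_to_download(2_000_000_000))  # 11 -> speedrun.sh   (~2B chars)
print(shards_to_download(4_000_000_000))  # 21 -> run1000.sh    (~4B chars)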