diff --git a/runs/runcpu.sh b/runs/runcpu.sh
index 853fa1f..ea52df5 100755
--- a/runs/runcpu.sh
+++ b/runs/runcpu.sh
@@ -22,7 +22,7 @@ if [ -z "$WANDB_RUN" ]; then
 fi
 
 # train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
-python -m nanochat.dataset -n 8
+python -m nanochat.dataset -n 11
 python -m scripts.tok_train --max-chars=2000000000
 python -m scripts.tok_eval
 
diff --git a/runs/speedrun.sh b/runs/speedrun.sh
index 62466c7..274dcbf 100644
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -49,9 +49,9 @@ python -m nanochat.report reset
 
 # Tokenizer
 # Download the first ~2B characters of pretraining dataset
-# each data shard is ~250M chars
-# so we download 2e9 / 250e6 = 8 data shards at this point
-# each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
+# each data shard is ~250M chars, but due to the `doc_cap` only ~200M chars are used.
+# so we download 2e9 / 200e6 + 1 = 11 data shards at this point
+# each shard is ~90MB of text (compressed), so this is about ~1GB of data on disk
 # look at dev/repackage_data_reference.py for details on how this data was prepared
-python -m nanochat.dataset -n 8
+python -m nanochat.dataset -n 11
 # Immediately also kick off downloading more shards in the background while tokenizer trains