aioaneid 2026-02-13 14:22:46 +01:00 committed by GitHub
commit f60cd0eb36
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 4 additions and 4 deletions


@@ -22,7 +22,7 @@ if [ -z "$WANDB_RUN" ]; then
 fi
 # train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
-python -m nanochat.dataset -n 8
+python -m nanochat.dataset -n 11
 python -m scripts.tok_train --max-chars=2000000000
 python -m scripts.tok_eval


@@ -49,9 +49,9 @@ python -m nanochat.report reset
 # Tokenizer
 # Download the first ~2B characters of pretraining dataset
-# each data shard is ~250M chars
-# so we download 2e9 / 250e6 = 8 data shards at this point
-# each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
+# each data shard is ~250M chars, but due to the `doc_cap` only ~200M chars are used.
+# so we download 2e9 / 200e6 + 1 = 11 data shards at this point
+# each shard is ~90MB of text (compressed), so this is about ~1GB of data on disk
 # look at dev/repackage_data_reference.py for details on how this data was prepared
 python -m nanochat.dataset -n 8
 # Immediately also kick off downloading more shards in the background while tokenizer trains
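The shard-count arithmetic in the updated comments can be sanity-checked with a short sketch. This is only an illustration of the numbers quoted in the diff (~200M usable chars per shard after `doc_cap`, ~90MB compressed per shard), not code from the repository:

```python
# Figures taken from the diff's comments (assumptions, not measured here):
target_chars = 2_000_000_000        # want ~2B characters for tokenizer training
usable_chars_per_shard = 200_000_000  # ~250M chars per shard, ~200M usable after doc_cap
shard_mb = 90                       # ~90MB of compressed text per shard

# The comment's formula: 2e9 / 200e6 + 1 = 11 (one extra shard as headroom,
# since "usable chars per shard" is only approximate)
num_shards = target_chars // usable_chars_per_shard + 1
disk_mb = num_shards * shard_mb

print(num_shards)  # 11
print(disk_mb)     # 990, i.e. about ~1GB on disk
```

This also explains why the old values (8 shards, ~800MB) were stale: at ~250M chars per shard the old formula 2e9 / 250e6 = 8 held, but with only ~200M usable chars per shard, 8 shards no longer cover 2B characters.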