diff --git a/runs/runcpu.sh b/runs/runcpu.sh
index 853fa1f..ea52df5 100755
--- a/runs/runcpu.sh
+++ b/runs/runcpu.sh
@@ -22,7 +22,7 @@ if [ -z "$WANDB_RUN" ]; then
 fi
 
 # train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
-python -m nanochat.dataset -n 8
+python -m nanochat.dataset -n 11
 python -m scripts.tok_train --max-chars=2000000000
 python -m scripts.tok_eval
 
diff --git a/runs/speedrun.sh b/runs/speedrun.sh
index 62466c7..274dcbf 100644
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -49,9 +49,9 @@ python -m nanochat.report reset
 
 # Tokenizer
 # Download the first ~2B characters of pretraining dataset
-# each data shard is ~250M chars
-# so we download 2e9 / 250e6 = 8 data shards at this point
-# each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
+# each data shard is ~250M chars, but due to the `doc_cap` only ~200M chars are used.
+# so we download 2e9 / 200e6 + 1 = 11 data shards at this point
+# each shard is ~90MB of text (compressed), so this is about ~1GB of data on disk
 # look at dev/repackage_data_reference.py for details on how this data was prepared
-python -m nanochat.dataset -n 8
+python -m nanochat.dataset -n 11
 # Immediately also kick off downloading more shards in the background while tokenizer trains