diff --git a/runs/run1000.sh b/runs/run1000.sh
index 5d0b7dc..c04583b 100644
--- a/runs/run1000.sh
+++ b/runs/run1000.sh
@@ -19,7 +19,7 @@ python -m nanochat.report reset
 
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 # train tokenizer on ~4B characters and kick off download of the rest for pretraining
-python -m nanochat.dataset -n 16
+python -m nanochat.dataset -n 21
 # start downloading the rest of the shards for a total of 1200 (see below why 1200)
 python -m nanochat.dataset -n 1200 &
 # todo: download the rest of it
diff --git a/runs/runcpu.sh b/runs/runcpu.sh
index da8f6d1..abe3f6c 100755
--- a/runs/runcpu.sh
+++ b/runs/runcpu.sh
@@ -24,7 +24,7 @@ if [ -z "$WANDB_RUN" ]; then
 fi
 
 # train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
-python -m nanochat.dataset -n 8
+python -m nanochat.dataset -n 11
 python -m scripts.tok_train --max-chars=2000000000
 python -m scripts.tok_eval
 
diff --git a/runs/speedrun.sh b/runs/speedrun.sh
index ef4fa00..3e2e21e 100644
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@@ -50,10 +50,11 @@ python -m nanochat.report reset
 
 # Download the first ~2B characters of pretraining dataset
 # look at dev/repackage_data_reference.py for details on how this data was prepared
-# each data shard is ~250M chars
-# so we download 2e9 / 250e6 = 8 data shards at this point
-# each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
-python -m nanochat.dataset -n 8
+# each data shard is ~250M chars, but due to the `doc_cap` only ~200M chars are used.
+# The last shard is considered to be eval, so we download 2e9 / 200e6 + 1 = 11 data
+# shards at this point
+# each shard is ~90MB of text (compressed), so this is about ~1GB of data on disk
+python -m nanochat.dataset -n 11
 # Immediately also kick off downloading more shards in the background while tokenizer trains
 # See comment below for why 370 is the right number here
 python -m nanochat.dataset -n 370 &
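
For reference, a minimal sketch of the shard arithmetic behind the new -n values (not part of the patch). The constants below are assumptions taken from the comments added in runs/speedrun.sh: ~200M usable chars per shard after the doc_cap, ~90MB of compressed text per shard, and one last shard held out as eval.

import math

# Assumed figures from the speedrun.sh comments in this patch (not measured here)
USABLE_CHARS_PER_SHARD = 200e6   # ~250M raw chars per shard, ~200M usable after doc_cap
SHARD_SIZE_MB = 90               # ~90MB of compressed text per shard
EVAL_SHARDS = 1                  # the last downloaded shard is treated as eval

def shards_needed(target_chars: float) -> int:
    """Shards to download: enough usable chars for tokenizer training, plus the eval shard."""
    return math.ceil(target_chars / USABLE_CHARS_PER_SHARD) + EVAL_SHARDS

print(shards_needed(2e9))                  # 11 -> runcpu.sh / speedrun.sh use `-n 11`
print(shards_needed(4e9))                  # 21 -> run1000.sh uses `-n 21`
print(shards_needed(2e9) * SHARD_SIZE_MB)  # 990 (MB) -> roughly 1GB on disk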