mirror of
https://github.com/karpathy/nanochat.git
synced 2026-04-04 06:35:23 +00:00
Merge c655043092 into 2f09686724
This commit is contained in:
commit
f60cd0eb36
|
|
@ -22,7 +22,7 @@ if [ -z "$WANDB_RUN" ]; then
|
|||
fi
|
||||
|
||||
# train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
|
||||
python -m nanochat.dataset -n 8
|
||||
python -m nanochat.dataset -n 11
|
||||
python -m scripts.tok_train --max-chars=2000000000
|
||||
python -m scripts.tok_eval
|
||||
|
||||
|
|
|
|||
|
|
@ -49,9 +49,9 @@ python -m nanochat.report reset
|
|||
# Tokenizer
|
||||
|
||||
# Download the first ~2B characters of pretraining dataset
|
||||
# each data shard is ~250M chars
|
||||
# so we download 2e9 / 250e6 = 8 data shards at this point
|
||||
# each shard is ~100MB of text (compressed), so this is ~800MB of data on disk
|
||||
# each data shard is ~250M chars, but due to the `doc_cap` only ~200M chars are used.
|
||||
# so we download 2e9 / 200e6 + 1 = 11 data shards at this point
|
||||
# each shard is ~90MB of text (compressed), so this is ~1GB of data on disk
|
||||
# look at dev/repackage_data_reference.py for details on how this data was prepared
|
||||
python -m nanochat.dataset -n 8
|
||||
# Immediately also kick off downloading more shards in the background while tokenizer trains
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user