Mirror of https://github.com/karpathy/nanochat.git (synced 2025-12-06 04:12:13 +00:00)
Fix OOM in Japanese tokenizer training by reducing max_chars
Reduce --max_chars from 2B to 500M characters to prevent OOM during BPE training. Japanese text generates significantly more unique sequences than English (92M+ unique sequences observed), causing memory exhaustion during heap construction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
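For intuition, here is a minimal sketch of what capping --max_chars controls. The helper names and shard format below are assumptions for illustration, not the actual scripts.tok_train code: the point is only that the BPE trainer's memory grows with the number of unique pre-token sequences it has to track, so reading fewer characters bounds that table and the merge-candidate heap built from it.

```python
# Hedged sketch (assumed helpers, not nanochat's implementation) of why a
# character cap bounds BPE training memory.
from collections import Counter

def read_capped(shard_paths, max_chars):
    """Yield text from plain-text shards until max_chars characters are consumed."""
    budget = max_chars
    for path in shard_paths:
        if budget <= 0:
            break
        with open(path, encoding="utf-8") as f:
            piece = f.read()[:budget]
        budget -= len(piece)
        yield piece

def unique_sequence_count(texts):
    """Rough proxy for the trainer's working set: count distinct whitespace-
    separated chunks. Japanese has sparse whitespace, so a real trainer
    pre-splits with a regex instead, but the memory intuition is the same:
    more diverse text -> more unique sequences -> more RAM."""
    counts = Counter()
    for text in texts:
        counts.update(text.split())
    return len(counts)
```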
This commit is contained in: parent 86a6cf6668, commit 55f8d4acf2
@@ -8,6 +8,7 @@ DEPTH=20
 DEVICE_BATCH_SIZE=16
 DATA_SHARDS=30
 NUM_ITERATIONS=1000
 #NUM_ITERATIONS=10
 CACHE_DIR="$HOME/.cache/nanochat"
 # ========================
@@ -34,7 +35,7 @@ echo "== 1) 日本語データ準備 =="
 python -m nanochat.dataset -n "${DATA_SHARDS}" --lang ja
 
 echo "== 2) 日本語トークナイザ学習 =="
-python -m scripts.tok_train --max_chars=2000000000
+python -m scripts.tok_train --max_chars=500000000
 python -m scripts.tok_eval || true
 ls -l "${CACHE_DIR}/tokenizer" || true
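As a back-of-envelope check (the per-entry cost below is an assumption, not a measurement from nanochat), the 92M+ unique sequences reported in the commit message are already enough to exhaust typical RAM before merging even starts, which is why cutting the input by 4x resolves the OOM:

```python
# Back-of-envelope arithmetic using the 92M+ figure from the commit message.
# bytes_per_entry is an assumed rough per-sequence cost (key bytes + hash
# slot + count + heap candidate), not a value measured from the trainer.
unique_sequences = 92_000_000
bytes_per_entry = 100
print(f"~{unique_sequences * bytes_per_entry / 1e9:.1f} GB")  # -> ~9.2 GB of bookkeeping alone
```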