mirror of https://github.com/karpathy/nanochat.git
Fix OOM in Japanese tokenizer training by reducing max_chars
Reduce --max_chars from 2B to 500M characters to prevent OOM during BPE training. Japanese text generates significantly more unique sequences than English (92M+ unique sequences observed), causing memory exhaustion during heap construction.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
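To put the 92M+ number in perspective, a rough estimate of the bookkeeping cost follows; only the sequence count comes from the commit message, and the per-entry byte size is an assumption for illustration, not a value measured from nanochat or its tokenizer trainer.

```python
# Back-of-envelope estimate of the memory held for unique sequences
# during Japanese BPE training. The 92_000_000 figure is from the
# commit message; the per-entry size is an assumed round number.
unique_sequences = 92_000_000
assumed_bytes_per_entry = 64           # assumed: ids, count, heap slot, overhead
approx_gib = unique_sequences * assumed_bytes_per_entry / 2**30
print(f"~{approx_gib:.1f} GiB of bookkeeping alone")  # ~5.5 GiB under this assumption
```

Even under such conservative per-entry assumptions the bookkeeping alone runs to several GiB before any merging happens, which is consistent with hitting OOM during heap construction and with cutting the character budget by 4x as the fix.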
parent 86a6cf6668
commit 55f8d4acf2
@@ -8,6 +8,7 @@ DEPTH=20
 DEVICE_BATCH_SIZE=16
 DATA_SHARDS=30
 NUM_ITERATIONS=1000
+#NUM_ITERATIONS=10
 CACHE_DIR="$HOME/.cache/nanochat"
 # ========================
 
@@ -34,7 +35,7 @@ echo "== 1) 日本語データ準備 =="
 python -m nanochat.dataset -n "${DATA_SHARDS}" --lang ja
 
 echo "== 2) 日本語トークナイザ学習 =="
-python -m scripts.tok_train --max_chars=2000000000
+python -m scripts.tok_train --max_chars=500000000
 python -m scripts.tok_eval || true
 ls -l "${CACHE_DIR}/tokenizer" || true
 
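For intuition on what the changed flag controls, here is a minimal sketch of what a --max_chars style cap typically does ahead of BPE training: it truncates the stream of training documents at a fixed character budget, so the number of distinct sequences the trainer must store and heapify cannot grow past what that budget admits. The helper name and streaming logic below are assumptions for illustration; nanochat's actual scripts/tok_train.py may implement the cap differently.

```python
# Hypothetical sketch of a --max_chars style cap (not nanochat's code):
# stop consuming training documents once the character budget is spent,
# so downstream BPE bookkeeping (unique sequences, merge heap) stays bounded.
from typing import Iterable, Iterator

def cap_chars(docs: Iterable[str], max_chars: int) -> Iterator[str]:
    """Yield documents, truncating the last one, until max_chars is reached."""
    remaining = max_chars
    for doc in docs:
        if remaining <= 0:
            break
        piece = doc[:remaining]
        remaining -= len(piece)
        yield piece

# Example: with max_chars=500_000_000 (this commit) the trainer sees at
# most 500M characters, a quarter of the previous 2B-character budget.
docs = ["こんにちは、世界。" * 3, "hello world. " * 3]
capped = "".join(cap_chars(docs, max_chars=20))
assert len(capped) <= 20
```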