mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 04:12:13 +00:00
Introduces automatic memory detection and batch size optimization for Apple Silicon Macs in the runcpu.sh and runmac_overnight.sh scripts. Adds a comprehensive README_MACOS.md with usage instructions, performance profiles, environment variable overrides, troubleshooting, and expected training times. Updates the scripts to allow manual overrides and improve usability across Mac configurations. Also switches Python to an arm64 build for a 2-3x improvement.
130 lines
4.3 KiB
Bash
Executable File
130 lines
4.3 KiB
Bash
Executable File
#!/bin/bash

# Showing an example run for exercising some of the code paths on the CPU (or MPS on Macbooks)
# Run as:
# bash dev/runcpu.sh

# NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your Macbook.
# Think of this run as educational/fun demo, not something you should expect to work well.
# This is also why I hide this script away in dev/

# all the setup stuff
# Exported so that every child process started by this script inherits it.
export OMP_NUM_THREADS=1
# Directory that later receives the downloaded eval bundle.
# NOTE(review): assigned without `export` -- confirm whether child processes
# are expected to see this variable.
NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
# Quoted so that a $HOME containing spaces is still passed as one path.
mkdir -p "$NANOCHAT_BASE_DIR"
# Memory-based configuration for macOS
# Detect system memory (in whole GB) or allow manual override: a non-empty
# MEMORY_SIZE coming from the caller's environment is left untouched.
if [ -z "$MEMORY_SIZE" ]; then
  if [[ "$OSTYPE" == "darwin"* ]]; then
    # `sysctl -n` prints only the value (a byte count); awk truncates it to
    # whole GB. If sysctl fails, awk sees no input and prints nothing.
    MEMORY_SIZE=$(sysctl -n hw.memsize | awk '{print int($1/1024/1024/1024)}')
    if [ -n "$MEMORY_SIZE" ]; then
      echo "Auto-detected macOS memory: ${MEMORY_SIZE}GB"
    else
      # Detection produced nothing: use the same conservative value as the
      # non-macOS branch instead of continuing with an empty MEMORY_SIZE.
      MEMORY_SIZE=16
      echo "Could not detect macOS memory, using conservative: ${MEMORY_SIZE}GB" >&2
    fi
  else
    # Linux fallback - assume conservative
    MEMORY_SIZE=16
    echo "Non-macOS system, using conservative: ${MEMORY_SIZE}GB"
  fi
fi
# Calculate optimal batch sizes based on available memory
# Note: total_batch_size must be divisible by (device_batch_size * max_seq_len)
# With max_seq_len=1024: device_batch_size * 1024 must divide total_batch_size

#######################################
# Pick the batch-size profile for a given amount of memory.
# Globals:   DEVICE_BATCH_SIZE, TOTAL_BATCH_SIZE, EVAL_TOKENS, SPLIT_TOKENS (written)
# Arguments: $1 - memory size in whole GB
# Outputs:   the selected profile name on stdout; a warning on stderr when
#            $1 is empty or not a non-negative integer
#######################################
select_batch_profile() {
  local mem_gb=$1

  # An empty or non-numeric value (e.g. MEMORY_SIZE=64GB) would make the
  # integer comparisons below fail with "integer expression expected".
  # Treat it as 0 so the conservative profile is selected explicitly.
  case "$mem_gb" in
    '' | *[!0-9]*)
      echo "Warning: MEMORY_SIZE='$mem_gb' is not a whole number of GB; using the conservative profile" >&2
      mem_gb=0
      ;;
  esac

  # `[ ... -ge ... ]` compares in base 10, so a value with a leading zero
  # such as 064 is not read as octal.
  if [ "$mem_gb" -ge 128 ]; then
    DEVICE_BATCH_SIZE=16
    TOTAL_BATCH_SIZE=16384 # 16 * 1024 = 16384
    EVAL_TOKENS=16384
    SPLIT_TOKENS=16384
    echo "Memory profile: 128GB+ (High performance)"
  elif [ "$mem_gb" -ge 64 ]; then
    DEVICE_BATCH_SIZE=8
    TOTAL_BATCH_SIZE=8192 # 8 * 1024 = 8192
    EVAL_TOKENS=8192
    SPLIT_TOKENS=8192
    echo "Memory profile: 64GB (Good performance)"
  elif [ "$mem_gb" -ge 32 ]; then
    DEVICE_BATCH_SIZE=4
    TOTAL_BATCH_SIZE=4096 # 4 * 1024 = 4096
    EVAL_TOKENS=4096
    SPLIT_TOKENS=4096
    echo "Memory profile: 32GB (Moderate performance)"
  else
    DEVICE_BATCH_SIZE=1
    TOTAL_BATCH_SIZE=1024 # 1 * 1024 = 1024
    EVAL_TOKENS=2048
    SPLIT_TOKENS=2048
    echo "Memory profile: <32GB (Conservative)"
  fi
}

select_batch_profile "$MEMORY_SIZE"

echo "Using: device_batch_size=$DEVICE_BATCH_SIZE, total_batch_size=$TOTAL_BATCH_SIZE"
echo ""
# Install uv only when it is not already on PATH, then make sure a local
# virtualenv exists, sync the project dependencies into it and activate it.
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
[ -d ".venv" ] || uv venv
uv sync
source .venv/bin/activate

# Default WANDB_RUN to "dummy" when the caller did not provide one.
# NOTE(review): WANDB_RUN is not referenced again in the visible part of this
# script and is not exported here -- confirm it is still needed.
if [ -z "$WANDB_RUN" ]; then
  WANDB_RUN=dummy
fi

# Install the Rust toolchain via rustup (non-interactive) and build the
# rustbpe extension in release mode.
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml

# Download the eval bundle once; skipped when it is already in the base dir.
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
  curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
  unzip -q eval_bundle.zip
  rm eval_bundle.zip
  # Quoted: an unquoted destination is split into several arguments when
  # $HOME contains spaces, which makes mv fail.
  mv eval_bundle "$NANOCHAT_BASE_DIR"
fi
# wipe the report
python -m nanochat.report reset

# train tokenizer on ~1B characters
# (dataset step runs first with -n 4, then tokenizer training and evaluation)
python -m nanochat.dataset -n 4
python -m scripts.tok_train --max_chars=1000000000
python -m scripts.tok_eval
# train a very small 4 layer model on the CPU/MPS
# batch sizes are now optimized based on available memory
# (DEVICE_BATCH_SIZE, TOTAL_BATCH_SIZE, EVAL_TOKENS and SPLIT_TOKENS are set earlier in this script)
# we only run 50 steps of optimization (bump this to get better results)
python -m scripts.base_train \
  --depth=4 \
  --max_seq_len=1024 \
  --device_batch_size="$DEVICE_BATCH_SIZE" \
  --total_batch_size="$TOTAL_BATCH_SIZE" \
  --eval_every=50 \
  --eval_tokens="$EVAL_TOKENS" \
  --core_metric_every=50 \
  --core_metric_max_per_task=12 \
  --sample_every=50 \
  --num_iterations=50
python -m scripts.base_loss --device_batch_size="$DEVICE_BATCH_SIZE" --split_tokens="$SPLIT_TOKENS"
python -m scripts.base_eval --max-per-task=5
# midtraining
# (reuses the memory-based batch settings chosen earlier in this script)
python -m scripts.mid_train \
  --max_seq_len=1024 \
  --device_batch_size="$DEVICE_BATCH_SIZE" \
  --eval_every=50 \
  --eval_tokens="$EVAL_TOKENS" \
  --total_batch_size="$TOTAL_BATCH_SIZE" \
  --num_iterations=100
# eval results will be terrible, this is just to execute the code paths.
# NOTE(review): this spot used to say the execution memory limit is lowered to
# 1MB to avoid warnings on smaller systems, but no such option is passed on
# the command below -- confirm whether that happens inside chat_eval itself.
python -m scripts.chat_eval -i mid --max-new-tokens=128 --max-problems=20
# SFT
# target_examples_per_step is derived as twice the per-device batch size.
python -m scripts.chat_sft \
  --device_batch_size="$DEVICE_BATCH_SIZE" \
  --target_examples_per_step="$((DEVICE_BATCH_SIZE * 2))" \
  --num_iterations=100 \
  --eval_steps=4 \
  --eval_metrics_max_problems=16
# Chat CLI (left commented out; uncomment to run it by hand)
# python -m scripts.chat_cli -p "Why is the sky blue?"

# Chat Web (left commented out; uncomment to run it by hand)
# python -m scripts.chat_web

# Generate the report (the counterpart of the `report reset` call earlier in this script).
python -m nanochat.report generate