#!/bin/bash
# Showing an example run for exercising some of the code paths on the CPU (or MPS on MacBooks)
# Run as:
# bash dev/runcpu.sh
# NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your MacBook.
# Think of this run as an educational/fun demo, not something you should expect to work well.
# This is also why I hide this script away in dev/
# all the setup stuff
export OMP_NUM_THREADS=1
NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p "$NANOCHAT_BASE_DIR"
# Memory-based configuration for macOS
# Detect system memory (in GB) or allow manual override
if [ -z "$MEMORY_SIZE" ]; then
    if [[ "$OSTYPE" == "darwin"* ]]; then
        MEMORY_SIZE=$(sysctl hw.memsize | awk '{print int($2/1024/1024/1024)}')
        echo "Auto-detected macOS memory: ${MEMORY_SIZE}GB"
    else
        # Linux fallback - assume a conservative default
        MEMORY_SIZE=16
        echo "Non-macOS system, using conservative default: ${MEMORY_SIZE}GB"
    fi
fi
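# Example of the manual override mentioned above, e.g. to force the 64GB profile:
#   MEMORY_SIZE=64 bash dev/runcpu.sh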
# Calculate optimal batch sizes based on available memory
# Note: total_batch_size must be divisible by (device_batch_size * max_seq_len)
# With max_seq_len=1024: device_batch_size * 1024 must divide total_batch_size
if [ "$MEMORY_SIZE" -ge 128 ]; then
    DEVICE_BATCH_SIZE=16
    TOTAL_BATCH_SIZE=16384 # 16 * 1024 = 16384
    EVAL_TOKENS=16384
    SPLIT_TOKENS=16384
    echo "Memory profile: 128GB+ (High performance)"
elif [ "$MEMORY_SIZE" -ge 64 ]; then
    DEVICE_BATCH_SIZE=8
    TOTAL_BATCH_SIZE=8192 # 8 * 1024 = 8192
    EVAL_TOKENS=8192
    SPLIT_TOKENS=8192
    echo "Memory profile: 64GB (Good performance)"
elif [ "$MEMORY_SIZE" -ge 32 ]; then
    DEVICE_BATCH_SIZE=4
    TOTAL_BATCH_SIZE=4096 # 4 * 1024 = 4096
    EVAL_TOKENS=4096
    SPLIT_TOKENS=4096
    echo "Memory profile: 32GB (Moderate performance)"
else
    DEVICE_BATCH_SIZE=1
    TOTAL_BATCH_SIZE=1024 # 1 * 1024 = 1024
    EVAL_TOKENS=2048
    SPLIT_TOKENS=2048
    echo "Memory profile: <32GB (Conservative)"
fi
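# Optional sanity check (not in the original flow): fail fast if the divisibility
# constraint noted above is violated; assumes max_seq_len stays at 1024 as passed below.
if (( TOTAL_BATCH_SIZE % (DEVICE_BATCH_SIZE * 1024) != 0 )); then
    echo "total_batch_size=$TOTAL_BATCH_SIZE not divisible by device_batch_size*max_seq_len" >&2
    exit 1
fi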
echo "Using: device_batch_size=$DEVICE_BATCH_SIZE, total_batch_size=$TOTAL_BATCH_SIZE"
echo ""
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
[ -d ".venv" ] || uv venv
uv sync
source .venv/bin/activate
# default the wandb run name if not already provided
WANDB_RUN="${WANDB_RUN:-dummy}"
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
    curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
    unzip -q eval_bundle.zip
    rm eval_bundle.zip
    mv eval_bundle "$NANOCHAT_BASE_DIR"
fi
# wipe the report
python -m nanochat.report reset
# train tokenizer on ~1B characters
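# (first download 4 shards of the pretraining dataset for the tokenizer to train on)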
python -m nanochat.dataset -n 4
python -m scripts.tok_train --max_chars=1000000000
python -m scripts.tok_eval
# train a very small 4 layer model on the CPU/MPS
# batch sizes are now optimized based on available memory
# we only run 50 steps of optimization (bump this to get better results)
python -m scripts.base_train \
    --depth=4 \
    --max_seq_len=1024 \
    --device_batch_size=$DEVICE_BATCH_SIZE \
    --total_batch_size=$TOTAL_BATCH_SIZE \
    --eval_every=50 \
    --eval_tokens=$EVAL_TOKENS \
    --core_metric_every=50 \
    --core_metric_max_per_task=12 \
    --sample_every=50 \
    --num_iterations=50
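# evaluate the base model: loss on the train/val splits, then a small capped CORE eval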
python -m scripts.base_loss --device_batch_size=$DEVICE_BATCH_SIZE --split_tokens=$SPLIT_TOKENS
python -m scripts.base_eval --max-per-task=5
# midtraining
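# (adapts the base model to the chat special tokens / conversational format before SFT)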
python -m scripts.mid_train \
    --max_seq_len=1024 \
    --device_batch_size=$DEVICE_BATCH_SIZE \
    --eval_every=50 \
    --eval_tokens=$EVAL_TOKENS \
    --total_batch_size=$TOTAL_BATCH_SIZE \
    --num_iterations=100
# eval results will be terrible, this is just to execute the code paths.
# note that we lower the execution memory limit to 1MB to avoid warnings on smaller systems
python -m scripts.chat_eval -i mid --max-new-tokens=128 --max-problems=20
# SFT
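# target_examples_per_step is scaled with the device batch size, so each optimizer
# step accumulates over just two device-sized micro-batches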
python -m scripts.chat_sft \
    --device_batch_size=$DEVICE_BATCH_SIZE \
    --target_examples_per_step=$((DEVICE_BATCH_SIZE * 2)) \
    --num_iterations=100 \
    --eval_steps=4 \
    --eval_metrics_max_problems=16
# Chat CLI
# python -m scripts.chat_cli -p "Why is the sky blue?"
# Chat Web
# python -m scripts.chat_web
python -m nanochat.report generate