mirror of https://github.com/karpathy/nanochat.git
synced 2026-01-20 18:34:14 +00:00
71 lines · 2.5 KiB · Bash · Executable File
#!/bin/bash

# Showing an example run for exercising some of the code paths on the CPU (or MPS on Macbooks)
# This script was last updated/tuned on Jan 17, 2026.

# Run as:
# bash dev/cpu_demo_run.sh

# NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your Macbook.
# Think of this run as an educational/fun demo, not something you should expect to work well.
# (This is why I hide this script away in dev/)
# You may also want to run this script manually and one by one, copy pasting commands into your terminal.

# all the setup stuff
export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
# Quote the path: $HOME may contain spaces (e.g. macOS usernames).
mkdir -p "$NANOCHAT_BASE_DIR"
# Install uv only if it is not already on PATH.
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
# Create the virtualenv on first run, then sync CPU-only dependencies.
[ -d ".venv" ] || uv venv
uv sync --extra cpu
source .venv/bin/activate
# Default the wandb run name if unset or empty, so every --run flag below has a value.
WANDB_RUN="${WANDB_RUN:-dummy}"
# train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
# Download 8 shards of the dataset, train the tokenizer, then report its compression stats.
python -m nanochat.dataset -n 8
python -m scripts.tok_train --max-chars=2000000000
python -m scripts.tok_eval
# train a small 6-layer model (see --depth below)
# I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max.
# To get better results, try increasing num_iterations, or get other ideas from your favorite LLM.
python -m scripts.base_train \
    --depth=6 \
    --head-dim=64 \
    --window-pattern=L \
    --max-seq-len=512 \
    --device-batch-size=32 \
    --total-batch-size=16384 \
    --eval-every=100 \
    --eval-tokens=524288 \
    --core-metric-every=-1 \
    --sample-every=100 \
    --num-iterations=5000 \
    --run="$WANDB_RUN"
# Evaluate loss on a small token budget, then run the (truncated) task evals.
python -m scripts.base_loss --device-batch-size=1 --split-tokens=16384
python -m scripts.base_eval --max-per-task=16
# midtraining (~10 minutes on my MacBook Pro M3 Max)
# Fetch the identity conversations used to teach the model who it is.
curl -L -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
python -m scripts.mid_train \
    --max-seq-len=512 \
    --device-batch-size=32 \
    --total-batch-size=16384 \
    --eval-every=200 \
    --eval-tokens=524288 \
    --num-iterations=1500 \
    --run="$WANDB_RUN"
# (it's ~ok to skip SFT)

# Chat with the model over CLI
# The model should be able to say that the capital of France is Paris.
# It might even know that the color of the sky is blue.
# Sometimes the model likes it if you first say Hi before you ask it questions.
# python -m scripts.chat_cli -i mid -p "What is the capital of France?"

# Chat with the model over a pretty WebUI ChatGPT style
# python -m scripts.chat_web -i mid