#!/bin/bash

#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
#                                                                              #
#                        CPU Demonstration and Test Run                        #
#                                                                              #
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#

# This script provides an example run for exercising the project's code paths
# on a CPU or MPS (for Apple Silicon Macs).
#
# To run this script:
# bash dev/runcpu.sh
#
# IMPORTANT NOTE:
# Training Large Language Models (LLMs) is computationally intensive and requires
# significant GPU resources and budget. This script is intended as an educational
# tool and a demonstration. It allows you to verify that the code runs, but it
# will not produce a high-quality model. It is intentionally placed in the `dev/`
# directory to emphasize its development and testing purpose.
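
# To keep a log of the whole run for later inspection, one option (purely
# illustrative, not required) is to tee stdout/stderr to a file:
# bash dev/runcpu.sh 2>&1 | tee runcpu.log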

# --- Environment Setup ---
# Restrict OpenMP to a single thread so it does not oversubscribe the CPU
# alongside PyTorch's own thread pool.
export OMP_NUM_THREADS=1
# Define and create the base directory for nanochat data and caches.
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p "$NANOCHAT_BASE_DIR"

# Install 'uv', a fast Python package installer, if it's not already present.
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
# Create a virtual environment if it doesn't exist.
[ -d ".venv" ] || uv venv
# Sync dependencies, including the 'cpu' extra for CPU-only environments.
uv sync --extra cpu
# Activate the virtual environment.
source .venv/bin/activate
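
# Optional sanity check (illustrative only, not part of the pipeline): confirm
# that the CPU build of PyTorch imports inside the venv, and whether MPS is
# available on this machine. Which device nanochat actually uses is decided by
# its own code, not by this probe.
# python -c "import torch; print(torch.__version__, 'mps' if torch.backends.mps.is_available() else 'cpu')"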

# Set a dummy Weights & Biases run name if not already defined.
if [ -z "$WANDB_RUN" ]; then
    WANDB_RUN=dummy
fi

# Install Rust if it's not already installed (guarded just like 'uv' above).
command -v cargo &> /dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Add Rust's package manager, Cargo, to the PATH.
[ -f "$HOME/.cargo/env" ] && source "$HOME/.cargo/env"
# Build the Rust-based BPE tokenizer.
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
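
# Optional check (illustrative): verify that the compiled extension imports.
# The module name 'rustbpe' is assumed from the manifest path above; adjust it
# if the crate exposes a different name.
# python -c "import rustbpe; print('rustbpe import OK')"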

# --- Training and Evaluation Pipeline ---
# Reset any previous reports to start fresh.
python -m nanochat.report reset

# --- Tokenizer Training ---
# Download a small slice of the dataset (-n 4 limits how much is fetched);
# this is the text the tokenizer is trained on below.
python -m nanochat.dataset -n 4
# Train the tokenizer on approximately 1 billion characters of text.
python -m scripts.tok_train --max_chars=1000000000
# Evaluate the trained tokenizer.
python -m scripts.tok_eval
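
# For an even quicker smoke test of the tokenizer path, training on far fewer
# characters should also work (untested variant; tokenizer quality will drop):
# python -m scripts.tok_train --max_chars=10000000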

# --- Base Model Training ---
# Train a very small, 4-layer model on the CPU.
# Note: This is a minimal run for demonstration purposes.
# - Each optimization step processes a single sequence of 1024 tokens
#   (device_batch_size=1 at max_seq_len=1024 already fills the 1024-token
#   total batch, so there is no gradient accumulation).
# - The run consists of only 50 optimization steps, i.e. roughly
#   50 * 1024 = 51,200 tokens seen in total.
python -m scripts.base_train \
    --depth=4 \
    --max_seq_len=1024 \
    --device_batch_size=1 \
    --total_batch_size=1024 \
    --eval_every=50 \
    --eval_tokens=4096 \
    --core_metric_every=50 \
    --core_metric_max_per_task=12 \
    --sample_every=50 \
    --num_iterations=50
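
# Artifacts from the run (downloaded data, tokenizer, checkpoints) accumulate
# under $NANOCHAT_BASE_DIR; a quick, purely illustrative way to look around:
# find "$NANOCHAT_BASE_DIR" -maxdepth 2 | head -n 20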

# Evaluate the loss of the base model.
python -m scripts.base_loss --device_batch_size=1 --split_tokens=4096
# Perform a more comprehensive evaluation of the base model.
python -m scripts.base_eval --max-per-task=16

# --- Mid-training ---
# Continue training the model on a mixture of tasks.
python -m scripts.mid_train \
    --max_seq_len=1024 \
    --device_batch_size=1 \
    --eval_every=50 \
    --eval_tokens=4096 \
    --total_batch_size=1024 \
    --num_iterations=100
# Evaluate the mid-trained model. Results are expected to be poor due to the
# minimal training.
python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20

# --- Supervised Fine-Tuning (SFT) ---
# Fine-tune the model on a dataset of conversations.
python -m scripts.chat_sft \
    --device_batch_size=1 \
    --target_examples_per_step=4 \
    --num_iterations=100 \
    --eval_steps=4 \
    --eval_metrics_max_problems=16
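
# Note on the batch settings above: with device_batch_size=1 and
# target_examples_per_step=4, each optimization step presumably accumulates
# gradients over 4 forward/backward passes of one example each
# (4 examples/step / 1 example/pass = 4 micro-batches per step). This is an
# inference from the flag names; see scripts/chat_sft.py for the exact logic.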

# --- Interactive Chat (Optional) ---
# Uncomment the following lines to interact with the model via the command
# line or a web interface.
#
# # Command-Line Interface
# python -m scripts.chat_cli -p "Why is the sky blue?"
#
# # Web-based Interface
# python -m scripts.chat_web
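
# The -p flag above appears to supply a one-shot prompt; running chat_cli
# without it presumably drops into an interactive loop (an assumption based on
# the flag usage above; check scripts/chat_cli.py to confirm):
# python -m scripts.chat_cli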

# --- Reporting ---
# Generate a final report summarizing the run.
python -m nanochat.report generate