nanochat/dev/runcpu.sh

#!/bin/bash
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
#                                                                              #
#                        CPU Demonstration and Test Run                        #
#                                                                              #
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
# This script provides an example run for exercising the project's code paths
# on a CPU, or on MPS (the Metal GPU backend on Apple Silicon Macs).
#
# To run this script:
# bash dev/runcpu.sh
#
# IMPORTANT NOTE:
# Training Large Language Models (LLMs) is computationally intensive and requires
# significant GPU resources and budget. This script is intended as an educational
# tool and a demonstration. It allows you to verify that the code runs, but it
# will not produce a high-quality model. It is intentionally placed in the `dev/`
# directory to emphasize its development and testing purpose.
# --- Environment Setup ---
# Restrict OpenMP-backed libraries to a single thread to avoid thread contention on small CPU runs.
export OMP_NUM_THREADS=1
# Define and create the base directory for nanochat data and caches.
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
mkdir -p "$NANOCHAT_BASE_DIR"
# Install 'uv', a fast Python package installer, if it's not already present.
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
# Create a virtual environment if it doesn't exist.
[ -d ".venv" ] || uv venv
# Sync dependencies, including the 'cpu' extra for CPU-only environments.
uv sync --extra cpu
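# (The 'cpu' extra is expected to pull in CPU-only builds of the heavy
# dependencies such as PyTorch, so no CUDA toolkit is needed; see
# pyproject.toml for the exact dependency groups.)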
# Activate the virtual environment.
source .venv/bin/activate
# Set a dummy Weights & Biases run name if not already defined.
if [ -z "$WANDB_RUN" ]; then
    WANDB_RUN=dummy
fi
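# For example, assuming the downstream training scripts pick up $WANDB_RUN as
# their Weights & Biases run name, you could label this run explicitly:
#   WANDB_RUN=cpu_smoke_test bash dev/runcpu.sh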
# Install Rust if it's not already installed.
command -v rustup &> /dev/null || curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
# Add Rust's package manager, Cargo, to the PATH.
source "$HOME/.cargo/env"
# Build the Rust-based BPE tokenizer.
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
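# Optional sanity check (a sketch; it assumes the built extension is importable
# under the name 'rustbpe'):
#   uv run python -c "import rustbpe; print('rustbpe built OK')"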
# Download and set up the evaluation bundle if it's not already cached.
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
    curl -L -o eval_bundle.zip "$EVAL_BUNDLE_URL"
    unzip -q eval_bundle.zip
    rm eval_bundle.zip
    mv eval_bundle "$NANOCHAT_BASE_DIR"
fi
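# Note: the guard above only checks for the eval_bundle directory, so if a
# previous attempt left a broken copy under $NANOCHAT_BASE_DIR, delete it to
# force a fresh download on the next run.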
# --- Training and Evaluation Pipeline ---
# Reset any previous reports to start fresh.
python -m nanochat.report reset
# --- Tokenizer Training ---
# Download and prepare the dataset for tokenizer training.
python -m nanochat.dataset -n 4
# Train the tokenizer on approximately 1 billion characters of text.
python -m scripts.tok_train --max_chars=1000000000
# Evaluate the trained tokenizer.
python -m scripts.tok_eval
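# (tok_eval reports how well the newly trained tokenizer compresses sample
# text; the exact metrics are defined in scripts/tok_eval.py.)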
# --- Base Model Training ---
# Train a very small, 4-layer model on the CPU.
# Note: This is a minimal run for demonstration purposes.
# - Each optimization step processes a single sequence of 1024 tokens.
# - The run consists of only 50 optimization steps.
python -m scripts.base_train \
    --depth=4 \
    --max_seq_len=1024 \
    --device_batch_size=1 \
    --total_batch_size=1024 \
    --eval_every=50 \
    --eval_tokens=4096 \
    --core_metric_every=50 \
    --core_metric_max_per_task=12 \
    --sample_every=50 \
    --num_iterations=50
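# Back-of-the-envelope: 50 steps * 1024 tokens per step is only ~51,200
# training tokens in total, which is why this produces a smoke-test model
# rather than a usable one.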
# Evaluate the loss of the base model.
python -m scripts.base_loss --device_batch_size=1 --split_tokens=4096
# Perform a more comprehensive evaluation of the base model.
python -m scripts.base_eval --max-per-task=16
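# (--max-per-task caps each evaluation task at 16 examples so the CPU run
# finishes quickly; scores at this scale are not meaningful.)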
# --- Mid-training ---
# Continue training the model on a mixture of tasks.
python -m scripts.mid_train \
    --max_seq_len=1024 \
    --device_batch_size=1 \
    --eval_every=50 \
    --eval_tokens=4096 \
    --total_batch_size=1024 \
    --num_iterations=100
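# As with base training, the token budget is tiny: roughly 100 steps * 1024
# tokens = 102,400 tokens of mid-training data.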
# Evaluate the mid-trained model. Results are expected to be poor due to the minimal training.
python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20
# --- Supervised Fine-Tuning (SFT) ---
# Fine-tune the model on a dataset of conversations.
python -m scripts.chat_sft \
    --device_batch_size=1 \
    --target_examples_per_step=4 \
    --num_iterations=100 \
    --eval_steps=4 \
    --eval_metrics_max_problems=16
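# (Reader's note: with device_batch_size=1 and target_examples_per_step=4, the
# script presumably reaches 4 examples per optimizer step via gradient
# accumulation; see scripts/chat_sft.py for the exact behavior.)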
# --- Interactive Chat (Optional) ---
# Uncomment the following lines to interact with the model via the command line or a web interface.
#
# # Command-Line Interface
# python -m scripts.chat_cli -p "Why is the sky blue?"
#
# # Web-based Interface
# python -m scripts.chat_web
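# # (chat_web starts a local web server; check its console output for the
# # address and port to open in a browser.)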
# --- Reporting ---
# Generate a final report summarizing the run.
python -m nanochat.report generate
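# The generated report is a markdown summary of all the stages above; it is
# written under $NANOCHAT_BASE_DIR (see nanochat/report.py for the exact path).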