"""
|
|
This script evaluates the loss of a trained base model and generates samples from it.
|
|
It serves as a quick sanity check to ensure the model has learned sensible
|
|
representations.
|
|
|
|
The script performs two main functions:
|
|
1. **Loss Evaluation:** It calculates the bits-per-byte (BPB) metric on both the
|
|
training and validation splits of the dataset.
|
|
2. **Sampling:** The master process generates text samples from a set of predefined
|
|
prompts to provide a qualitative assessment of the model's capabilities.
|
|
|
|
Usage:
|
|
- To run on a single GPU: `python scripts/base_loss.py`
|
|
- For distributed evaluation: `torchrun --nproc_per_node=<gpus> scripts/base_loss.py`
|
|
"""
|
|
import os
from contextlib import nullcontext
import torch
from nanochat.checkpoint_manager import load_model
from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type
from nanochat.dataloader import tokenizing_distributed_data_loader
from nanochat.tokenizer import get_token_bytes
from nanochat.loss_eval import evaluate_bpb
from nanochat.engine import Engine

# Configuration
device_batch_size = 32
split_tokens = 20*524288 # number of tokens to evaluate per split
model_tag = None # optional model tag for the output directory name
model_step = None # optional model step for the output directory name
device_type = "" # cuda|cpu|mps (empty => autodetect)
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
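# Note: configurator.py is assumed to behave like nanoGPT's "poor man's configurator":
# it parses `--key=value` command line flags (e.g. `--device_batch_size=16`) and
# overrides the module-level globals defined above before the script proceeds.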

# Load the base model and the tokenizer
device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=model_tag, step=model_step)
sequence_len = meta["model_config"]["sequence_len"] # could be arbitrary really
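# Mixed precision: on CUDA, evaluation runs inside a bfloat16 autocast context; on
# cpu/mps the nullcontext() fallback leaves the model in its default precision.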
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()

# Evaluate the loss on each split
tokens_per_step = device_batch_size * sequence_len * ddp_world_size
assert split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step"
steps = split_tokens // tokens_per_step
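# Worked example (hypothetical numbers): with the default device_batch_size=32, a
# sequence_len of 2048 and 8 GPUs, tokens_per_step = 32 * 2048 * 8 = 524,288, so the
# default split_tokens = 20 * 524288 yields exactly 20 evaluation steps per split.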
token_bytes = get_token_bytes(device=device)
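# token_bytes maps each token id to the number of bytes of text it decodes to;
# evaluate_bpb uses it to normalize the loss into bits per byte, a tokenizer-independent
# metric (roughly: total loss in nats / (ln 2 * total bytes)).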
bpb_results = {}
for split_name in ["train", "val"]:
    loader = tokenizing_distributed_data_loader(device_batch_size, sequence_len, split_name, device=device)
    with autocast_ctx:
        bpb = evaluate_bpb(model, loader, steps, token_bytes)
    print0(f"{split_name} bpb: {bpb:.4f}")
    bpb_results[split_name] = bpb

# Master process also samples from the model
samples = []
if ddp_rank == 0:
    prompts = [
        "The capital of France is",
        "The chemical symbol of gold is",
        "If yesterday was Friday, then tomorrow will be",
        "The opposite of hot is",
        "The planets of the solar system are:",
        "My favorite color is",
        "If 5*x + 3 = 13, then x is",
    ]
    engine = Engine(model, tokenizer)
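    # Each prompt is completed deterministically: temperature=0 amounts to greedy
    # (argmax) decoding, and generation is capped at 16 new tokens per prompt.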
    for prompt in prompts:
        tokens = tokenizer(prompt, prepend="<|bos|>")
        with autocast_ctx:
            sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
        sample_str = tokenizer.decode(sample[0])
        print0(sample_str)
        samples.append(sample_str)

# Log to report
from nanochat.report import get_report
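# (The report module is assumed to collect these sections into the run's final summary
# report; here we record the train/val bpb numbers and the generated samples.)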
get_report().log(section="Base model loss", data=[
    {
        "train bpb": bpb_results["train"],
        "val bpb": bpb_results["val"],
    },
    {f"sample {i}": sample for i, sample in enumerate(samples)},
])

# Cleanup
compute_cleanup()