upd aligned version

commit 62e339f3a9 (parent 08f3255ff5)
Peiqi Duan, 2025-12-30 00:43:23 +08:00, committed by Muheng
29 changed files with 7244 additions and 0 deletions

nanochat-align/adamw.py (new file, 76 lines)
@@ -0,0 +1,76 @@
"""
Borrowed from modded-nanogpt. By Keller, @vagrawal, et al.
Not a general optimizer! But works for our specific use.
"""
import torch
import torch.distributed as dist
from torch import Tensor
class DistAdamW(torch.optim.Optimizer):
"""
Distributed AdamW optimizer.
In the style of ZeRO-2, i.e. sharded optimizer states and gradient reduction
"""
def __init__(self, param_groups, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
super().__init__(param_groups, defaults)
@torch.compile
@torch.no_grad()
def step(self):
rank = dist.get_rank()
world_size = dist.get_world_size()
reduce_scatter_futures: list[torch.Future] = []
all_reduce_futures: list[torch.Future] = []
grad_slices = []
for group in self.param_groups:
params: list[Tensor] = group["params"]
for base_i in range(len(params)):
grad = params[base_i].grad
rank_size = grad.shape[0] // world_size
grad_slice = torch.empty_like(grad[:rank_size])
reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
grad_slices.append(grad_slice)
idx = 0
for group in self.param_groups:
beta1, beta2 = group['betas']
eps = group['eps']
wd = group['weight_decay']
params = group['params']
for base in range(len(params)):
reduce_scatter_futures[idx].wait()
p = params[base]
rank_size = p.shape[0] // world_size
p_slice = p[rank * rank_size:(rank + 1) * rank_size]
lr = group['lr'] * getattr(p, "lr_mul", 1.0)
state = self.state[p]
g_slice = grad_slices[idx]
# State init
if not state:
state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device)
state['exp_avg'] = torch.zeros_like(p_slice)
state['exp_avg_sq'] = torch.zeros_like(p_slice)
exp_avg = state['exp_avg']
exp_avg_sq = state['exp_avg_sq']
state['step'] += 1
t = state['step']
# weight decay
if wd != 0:
eff_weight_decay = lr * wd * getattr(p, "wd_mul", 1.0)
p_slice.mul_(1 - eff_weight_decay)
# update running averages
exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
# bias corrections
bias1 = 1 - beta1 ** t
bias2 = 1 - beta2 ** t
# compute step
denom = exp_avg_sq.sqrt().add_(eps)
step_size = lr * (torch.sqrt(bias2) / bias1)
update = exp_avg.div(denom).mul_(step_size)
p_slice.add_(other=update, alpha=-1.0)
idx += 1
all_reduce_futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future())
torch.futures.collect_all(all_reduce_futures).wait()
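
A minimal usage sketch of DistAdamW above, assuming the file is importable as `adamw`; the toy linear model, sizes, and hyperparameters are illustrative, and the script is meant to be launched with torchrun (one GPU per rank), since step() needs an initialized process group and parameters whose leading dimension divides evenly across ranks.

import os
import torch
import torch.distributed as dist
import torch.nn as nn
from adamw import DistAdamW  # assumed import of the file above

def main():
    local_rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend="nccl")
    world_size = dist.get_world_size()
    # dim 0 of every parameter must be divisible by world_size, since step()
    # shards gradients and optimizer state along dim 0
    model = nn.Linear(32, 8 * world_size, bias=False).cuda()
    opt = DistAdamW([{"params": list(model.parameters())}], lr=1e-3, weight_decay=0.0)
    loss = model(torch.randn(4, 32, device="cuda")).square().mean()
    loss.backward()  # each rank holds its own, un-reduced gradients
    opt.step()       # reduce-scatter grads, update the local shard, all-gather params
    opt.zero_grad(set_to_none=True)
    dist.destroy_process_group()

if __name__ == "__main__":
    main()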

nanochat-align/checkpoint_manager.py (new file, 151 lines)
@@ -0,0 +1,151 @@
"""
Utilities for saving and loading model/optim/state checkpoints.
"""
import os
import re
import glob
import json
import logging
import torch
from nanochat.common import get_base_dir
from nanochat.gpt import GPT, GPTConfig
from nanochat.tokenizer import get_tokenizer
from nanochat.common import setup_default_logging
# Set up logging
setup_default_logging()
logger = logging.getLogger(__name__)
def log0(message):
if int(os.environ.get('RANK', 0)) == 0:
logger.info(message)
def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
if rank == 0:
os.makedirs(checkpoint_dir, exist_ok=True)
# Save the model state parameters
model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
torch.save(model_data, model_path)
logger.info(f"Saved model parameters to: {model_path}")
# Save the metadata dict as json
meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
with open(meta_path, "w", encoding="utf-8") as f:
json.dump(meta_data, f, indent=2)
logger.info(f"Saved metadata to: {meta_path}")
# Note that optimizer state is sharded across ranks, so each rank must save its own.
if optimizer_data is not None:
optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
torch.save(optimizer_data, optimizer_path)
logger.info(f"Saved optimizer state to: {optimizer_path}")
def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, rank=0):
# Load the model state
model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
model_data = torch.load(model_path, map_location=device)
# Load the optimizer state if requested
optimizer_data = None
if load_optimizer:
optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
optimizer_data = torch.load(optimizer_path, map_location=device)
# Load the metadata
meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
with open(meta_path, "r", encoding="utf-8") as f:
meta_data = json.load(f)
return model_data, optimizer_data, meta_data
def build_model(checkpoint_dir, step, device, phase):
"""
A bunch of repetitive code to build a model from a given checkpoint.
Returns:
- base model - uncompiled, not wrapped in DDP
- tokenizer
- meta data saved during base model training
"""
assert phase in ["train", "eval"], f"Invalid phase: {phase}"
model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False)
if device.type in {"cpu", "mps"}:
# Convert bfloat16 tensors to float for CPU inference
model_data = {
k: v.float() if v.dtype == torch.bfloat16 else v
for k, v in model_data.items()
}
# Hack: fix torch compile issue, which prepends all keys with _orig_mod.
model_data = {k.removeprefix("_orig_mod."): v for k, v in model_data.items()}
model_config_kwargs = meta_data["model_config"]
log0(f"Building model with config: {model_config_kwargs}")
model_config = GPTConfig(**model_config_kwargs)
with torch.device("meta"):
model = GPT(model_config)
# Load the model state
model.to_empty(device=device)
# Weights are already initialized in GPT.__init__ via self.apply(self._init_weights)
model.load_state_dict(model_data, strict=True, assign=True)
# Put the model in the right training phase / mode
if phase == "eval":
model.eval()
else:
model.train()
# Load the Tokenizer
tokenizer = get_tokenizer()
# Sanity check: compatibility between model and tokenizer
assert tokenizer.get_vocab_size() == model_config_kwargs["vocab_size"]
return model, tokenizer, meta_data
def find_largest_model(checkpoint_dir):
# attempt to guess the model tag: take the biggest model available
model_tags = [f for f in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, f))]
if not model_tags:
raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
# 1) normally all model tags are of the form d<number>, try that first:
candidates = []
for model_tag in model_tags:
match = re.match(r"d(\d+)", model_tag)
if match:
model_depth = int(match.group(1))
candidates.append((model_depth, model_tag))
if candidates:
candidates.sort(key=lambda x: x[0], reverse=True)
return candidates[0][1]
# 2) if that failed, take the most recently updated model:
model_tags.sort(key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True)
return model_tags[0]
def find_last_step(checkpoint_dir):
# Look into checkpoint_dir and find model_<step>.pt with the highest step
checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "model_*.pt"))
if not checkpoint_files:
raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
last_step = int(max(os.path.basename(f).split("_")[-1].split(".")[0] for f in checkpoint_files))
return last_step
# -----------------------------------------------------------------------------
# convenience functions that take into account nanochat's directory structure
def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=None):
if model_tag is None:
# guess the model tag by defaulting to the largest model
model_tag = find_largest_model(checkpoints_dir)
log0(f"No model tag provided, guessing model tag: {model_tag}")
checkpoint_dir = os.path.join(checkpoints_dir, model_tag)
if step is None:
# guess the step by defaulting to the last step
step = find_last_step(checkpoint_dir)
assert step is not None, f"No checkpoints found in {checkpoint_dir}"
# build the model
log0(f"Loading model from {checkpoint_dir} with step {step}")
model, tokenizer, meta_data = build_model(checkpoint_dir, step, device, phase)
return model, tokenizer, meta_data
def load_model(source, *args, **kwargs):
model_dir = {
"base": "base_checkpoints",
"mid": "mid_checkpoints",
"sft": "chatsft_checkpoints",
"rl": "chatrl_checkpoints",
}[source]
base_dir = get_base_dir()
checkpoints_dir = os.path.join(base_dir, model_dir)
return load_model_from_dir(checkpoints_dir, *args, **kwargs)
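
A small round-trip sketch of save_checkpoint / load_checkpoint above; the directory, step number, and dummy state dicts are illustrative, and the import path assumes this file ships as nanochat/checkpoint_manager.py.

import torch
from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint  # assumed module path

checkpoint_dir = "/tmp/nanochat_demo/d4"         # hypothetical checkpoint directory
step = 100
model_data = {"wte.weight": torch.zeros(8, 4)}   # stand-in for model.state_dict()
optimizer_data = {"exp_avg": torch.zeros(8, 4)}  # this rank's shard of optimizer state
meta_data = {"model_config": {"vocab_size": 8}}  # must be JSON-serializable

save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0)
m, o, meta = load_checkpoint(checkpoint_dir, step, device="cpu", load_optimizer=True, rank=0)
assert meta["model_config"]["vocab_size"] == 8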

nanochat-align/common.py (new file, 190 lines)
@@ -0,0 +1,190 @@
"""
Common utilities for nanochat.
"""
import os
import re
import logging
import urllib.request
import torch
import torch.distributed as dist
from filelock import FileLock
class ColoredFormatter(logging.Formatter):
"""Custom formatter that adds colors to log messages."""
# ANSI color codes
COLORS = {
'DEBUG': '\033[36m', # Cyan
'INFO': '\033[32m', # Green
'WARNING': '\033[33m', # Yellow
'ERROR': '\033[31m', # Red
'CRITICAL': '\033[35m', # Magenta
}
RESET = '\033[0m'
BOLD = '\033[1m'
def format(self, record):
# Add color to the level name
levelname = record.levelname
if levelname in self.COLORS:
record.levelname = f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"
# Format the message
message = super().format(record)
# Add color to specific parts of the message
if levelname == 'INFO':
# Highlight numbers and percentages
message = re.sub(r'(\d+\.?\d*\s*(?:GB|MB|%|docs))', rf'{self.BOLD}\1{self.RESET}', message)
message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)
return message
def setup_default_logging():
handler = logging.StreamHandler()
handler.setFormatter(ColoredFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
logging.basicConfig(
level=logging.INFO,
handlers=[handler]
)
setup_default_logging()
logger = logging.getLogger(__name__)
def get_base_dir():
# co-locate nanochat intermediates with other cached data in ~/.cache (by default)
if os.environ.get("NANOCHAT_BASE_DIR"):
nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR")
else:
home_dir = os.path.expanduser("~")
cache_dir = os.path.join(home_dir, ".cache")
nanochat_dir = os.path.join(cache_dir, "nanochat")
os.makedirs(nanochat_dir, exist_ok=True)
return nanochat_dir
def download_file_with_lock(url, filename, postprocess_fn=None):
"""
Downloads a file from a URL to a local path in the base directory.
Uses a lock file to prevent concurrent downloads among multiple ranks.
"""
base_dir = get_base_dir()
file_path = os.path.join(base_dir, filename)
lock_path = file_path + ".lock"
if os.path.exists(file_path):
return file_path
with FileLock(lock_path):
# Only a single rank can acquire this lock
# All other ranks block until it is released
# Recheck after acquiring lock
if os.path.exists(file_path):
return file_path
# Download the content as bytes
print(f"Downloading {url}...")
with urllib.request.urlopen(url) as response:
content = response.read() # bytes
# Write to local file
with open(file_path, 'wb') as f:
f.write(content)
print(f"Downloaded to {file_path}")
# Run the postprocess function if provided
if postprocess_fn is not None:
postprocess_fn(file_path)
return file_path
def print0(s="",**kwargs):
ddp_rank = int(os.environ.get('RANK', 0))
if ddp_rank == 0:
print(s, **kwargs)
def print_banner():
# Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/
banner = """
"""
print0(banner)
def is_ddp():
# TODO is there a proper way
return int(os.environ.get('RANK', -1)) != -1
def get_dist_info():
if is_ddp():
assert all(var in os.environ for var in ['RANK', 'LOCAL_RANK', 'WORLD_SIZE'])
ddp_rank = int(os.environ['RANK'])
ddp_local_rank = int(os.environ['LOCAL_RANK'])
ddp_world_size = int(os.environ['WORLD_SIZE'])
return True, ddp_rank, ddp_local_rank, ddp_world_size
else:
return False, 0, 0, 1
def autodetect_device_type():
# prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU
if torch.cuda.is_available():
device_type = "cuda"
elif torch.backends.mps.is_available():
device_type = "mps"
else:
device_type = "cpu"
print0(f"Autodetected device type: {device_type}")
return device_type
def compute_init(device_type="cuda"): # cuda|cpu|mps
"""Basic initialization that we keep doing over and over, so make common."""
assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm"
if device_type == "cuda":
assert torch.cuda.is_available(), "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'"
if device_type == "mps":
assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'"
# Reproducibility
# Note that we set the global seeds here, but most of the code uses explicit rng objects.
# The only place where global rng might be used is nn.Module initialization of the model weights.
torch.manual_seed(42)
if device_type == "cuda":
torch.cuda.manual_seed(42)
# skipping full reproducibility for now, possibly investigate slowdown later
# torch.use_deterministic_algorithms(True)
# Precision
if device_type == "cuda":
torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls
# Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
if ddp and device_type == "cuda":
device = torch.device("cuda", ddp_local_rank)
torch.cuda.set_device(device) # make "cuda" default to this device
dist.init_process_group(backend="nccl", device_id=device)
dist.barrier()
else:
device = torch.device(device_type) # mps|cpu
if ddp_rank == 0:
logger.info(f"Distributed world size: {ddp_world_size}")
return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device
def compute_cleanup():
"""Companion function to compute_init, to clean things up before script exit"""
if is_ddp():
dist.destroy_process_group()
class DummyWandb:
"""Useful if we wish to not use wandb but have all the same signatures"""
def __init__(self):
pass
def log(self, *args, **kwargs):
pass
def finish(self):
pass
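
A short sketch of the single-process setup path using the helpers above (no torchrun); the wandb stand-in and the logged values are illustrative.

from nanochat.common import (  # module path as used elsewhere in this commit
    autodetect_device_type, compute_init, compute_cleanup, print0, DummyWandb)

device_type = autodetect_device_type()                    # "cuda" | "mps" | "cpu"
ddp, rank, local_rank, world_size, device = compute_init(device_type)
wandb_run = DummyWandb()                                  # no-op stand-in for wandb
print0(f"running on {device}, world size {world_size}")   # prints on rank 0 only
wandb_run.log({"step": 0, "loss": 0.0})
compute_cleanup()                                         # destroys the process group if DDP was used
wandb_run.finish()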

nanochat-align/configurator.py (new file, 56 lines)
@@ -0,0 +1,56 @@
"""
Poor Man's Configurator. Probably a terrible idea. Example usage:
$ python train.py config/override_file.py --batch_size=32
this will first run config/override_file.py, then override batch_size to 32
The code in this file will be run as follows from e.g. train.py:
>>> exec(open('configurator.py').read())
So it's not a Python module, it's just shuttling this code away from train.py
The code in this script then overrides the globals()
I know people are not going to love this, I just really dislike configuration
complexity and having to prepend config. to every single variable. If someone
comes up with a better simple Python solution I am all ears.
"""
import os
import sys
from ast import literal_eval
def print0(s="",**kwargs):
ddp_rank = int(os.environ.get('RANK', 0))
if ddp_rank == 0:
print(s, **kwargs)
for arg in sys.argv[1:]:
if '=' not in arg:
# assume it's the name of a config file
assert not arg.startswith('--')
config_file = arg
print0(f"Overriding config with {config_file}:")
with open(config_file) as f:
print0(f.read())
exec(open(config_file).read())
else:
# assume it's a --key=value argument
assert arg.startswith('--')
key, val = arg.split('=')
key = key[2:]
if key in globals():
try:
# attempt to eval it (e.g. if bool, number, etc.)
attempt = literal_eval(val)
except (SyntaxError, ValueError):
# if that goes wrong, just use the string
attempt = val
# ensure the types match ok
if globals()[key] is not None:
attempt_type = type(attempt)
default_type = type(globals()[key])
assert attempt_type == default_type, f"Type mismatch: {attempt_type} != {default_type}"
# cross fingers
print0(f"Overriding: {key} = {attempt}")
globals()[key] = attempt
else:
raise ValueError(f"Unknown config key: {key}")
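
A sketch of how a training script is expected to consume this file, per the docstring above; the default values and the config/small.py path are illustrative.

# hypothetical train.py
batch_size = 16          # defaults live as plain module-level globals
learning_rate = 3e-4
run_name = "demo"

exec(open('configurator.py').read())  # may override the globals above from the CLI

print(f"batch_size={batch_size} learning_rate={learning_rate} run_name={run_name}")

# example invocations:
#   python train.py --batch_size=32                   # literal_eval turns "32" into int 32
#   python train.py config/small.py --run_name=test   # run a config file, then apply overrides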

nanochat-align/core_eval.py (new file, 262 lines)
@@ -0,0 +1,262 @@
"""
Functions for evaluating the CORE metric, as described in the DCLM paper.
https://arxiv.org/abs/2406.11794
TODOs:
- All tasks ~match except for squad: we get 31% while the reference is 37%. Figure out why.
"""
import random
from jinja2 import Template
import torch
import torch.distributed as dist
# -----------------------------------------------------------------------------
# Prompt rendering utilities
def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None):
"""Render complete prompts for a multiple choice question"""
template_str = """
{%- for example in fewshot_examples -%}
{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }}
{% endfor -%}
{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip()
template = Template(template_str)
fewshot_examples = fewshot_examples or []
context = {
'fewshot_examples': fewshot_examples,
'continuation_delimiter': continuation_delimiter,
'item': item
}
prompts = [template.render(choice=choice, **context) for choice in item['choices']]
return prompts
def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None):
"""Render complete prompts for a schema question"""
template_str = """
{%- for example in fewshot_examples -%}
{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }}
{% endfor -%}
{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip()
template = Template(template_str)
fewshot_examples = fewshot_examples or []
context = {
'fewshot_examples': fewshot_examples,
'continuation_delimiter': continuation_delimiter,
'item': item
}
prompts = [template.render(context=context_option, **context)
for context_option in item['context_options']]
return prompts
def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None):
"""
Render complete prompt for a language modeling task.
Notice that we manually trim the context in the template,
which in some datasets seems to have trailing whitespace (which we don't want).
"""
template_str = """
{%- for example in fewshot_examples -%}
{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }}
{% endfor -%}
{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip()
template = Template(template_str)
fewshot_examples = fewshot_examples or []
context = {
'fewshot_examples': fewshot_examples,
'continuation_delimiter': continuation_delimiter,
'item': item
}
# Return two prompts: without and with the continuation
prompt_without = template.render(include_continuation=False, **context)
prompt_with = template.render(include_continuation=True, **context)
# Due to the way the data seems to be stored, I think I need to strip in the case of LM here.
# Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next
# token in prompt_with), meaning we don't get a nice and clean prefix in the token space
# to detect the final continuation. Tokenizers...
prompt_without = prompt_without.strip()
return [prompt_without, prompt_with]
def find_common_length(token_sequences, direction='left'):
"""
Find the length of the common prefix or suffix across token sequences
- direction: 'left' for prefix, 'right' for suffix
"""
min_len = min(len(seq) for seq in token_sequences)
indices = {
'left': range(min_len),
'right': range(-1, -min_len-1, -1)
}[direction]
# Find the first position where the token sequences differ
for i, idx in enumerate(indices):
token = token_sequences[0][idx]
if not all(seq[idx] == token for seq in token_sequences):
return i
return min_len
def stack_sequences(tokens, pad_token_id):
"""Stack up a list of token sequences, pad to longest on the right"""
bsz, seq_len = len(tokens), max(len(x) for x in tokens)
input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long)
for i, x in enumerate(tokens):
input_ids[i, :len(x)] = torch.tensor(x, dtype=torch.long)
return input_ids
def batch_sequences_mc(tokenizer, prompts):
# In multiple choice, contexts are the same but the continuation is different (common prefix)
tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id())
# figure out the start and end of each continuation
answer_start_idx = find_common_length(tokens, direction='left')
start_indices = [answer_start_idx] * len(prompts)
end_indices = [len(x) for x in tokens]
return tokens, start_indices, end_indices
def batch_sequences_schema(tokenizer, prompts):
# In schema tasks, contexts vary but continuation is the same (common suffix)
tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id())
# figure out the start and end of each context
suffix_length = find_common_length(tokens, direction='right')
end_indices = [len(x) for x in tokens]
start_indices = [ei - suffix_length for ei in end_indices]
return tokens, start_indices, end_indices
def batch_sequences_lm(tokenizer, prompts):
# In LM tasks, we have two prompts: without and with continuation
tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id())
tokens_without, tokens_with = tokens
start_idx, end_idx = len(tokens_without), len(tokens_with)
assert start_idx < end_idx, "prompt without is supposed to be a prefix of prompt with"
assert tokens_without == tokens_with[:start_idx], "prompt without is supposed to be a prefix of prompt with"
# we only need the with continuation prompt in the LM task, i.e. batch size of 1
return [tokens_with], [start_idx], [end_idx]
@torch.no_grad()
def forward_model(model, input_ids):
"""
Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions.
The last column of losses is set to nan because we don't have autoregressive targets there.
"""
batch_size, seq_len = input_ids.size()
outputs = model(input_ids)
# Roll the tensor to the left by one position to get the (autoregressive) target ids
target_ids = torch.roll(input_ids, shifts=-1, dims=1)
# Calculate cross entropy at all positions
losses = torch.nn.functional.cross_entropy(
outputs.view(batch_size * seq_len, -1),
target_ids.view(batch_size * seq_len),
reduction='none'
).view(batch_size, seq_len)
# Set the last column to be nan because there is no autoregressive loss there
losses[:, -1] = float('nan')
# Get the argmax predictions at each position
predictions = outputs.argmax(dim=-1)
return losses, predictions
@torch.no_grad()
def evaluate_example(idx, model, tokenizer, data, device, task_meta):
"""Evaluate a single example, return True if correct, False otherwise"""
item = data[idx]
task_type = task_meta['task_type']
num_fewshot = task_meta['num_fewshot']
continuation_delimiter = task_meta['continuation_delimiter']
# Sample few-shot examples (excluding current item)
fewshot_examples = []
if num_fewshot > 0:
rng = random.Random(1234 + idx)
available_indices = [i for i in range(len(data)) if i != idx]
fewshot_indices = rng.sample(available_indices, num_fewshot)
fewshot_examples = [data[i] for i in fewshot_indices]
# Render prompts and batch sequences based on task type
if task_type == 'multiple_choice':
prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples)
tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts)
elif task_type == 'schema':
prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples)
tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts)
elif task_type == 'language_modeling':
prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples)
tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts)
else:
raise ValueError(f"Unsupported task type: {task_type}")
# Some models can't forward sequences beyond a certain length (e.g. GPT-2)
# In these cases, we have to truncate sequences to max length and adjust the indices
if hasattr(model, 'max_seq_len') and model.max_seq_len is not None:
max_tokens = model.max_seq_len
new_tokens, new_start_idxs, new_end_idxs = [], [], []
for t, s, e in zip(tokens, start_idxs, end_idxs):
if len(t) > max_tokens:
num_to_crop = len(t) - max_tokens
new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens
new_start_idxs.append(s - num_to_crop) # shift the indices down
new_end_idxs.append(e - num_to_crop)
assert s - num_to_crop >= 0, "this should never happen right?"
assert e - num_to_crop >= 0, "this should never happen right?"
else:
new_tokens.append(t) # keep unchanged
new_start_idxs.append(s)
new_end_idxs.append(e)
tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs
# Stack up all the sequences into a batch
pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok
input_ids = stack_sequences(tokens, pad_token_id)
input_ids = input_ids.to(device)
# Forward the model, get the autoregressive loss and argmax prediction at each token
losses, predictions = forward_model(model, input_ids)
# See if the losses/predictions come out correctly
if task_type == 'language_modeling':
# language modeling task is currently always batch size 1
si = start_idxs[0]
ei = end_idxs[0]
# predictions[i] predict input_ids[i+1] autoregressively
predicted_tokens = predictions[0, si-1:ei-1]
actual_tokens = input_ids[0, si:ei]
is_correct = torch.all(predicted_tokens == actual_tokens).item()
elif task_type in ['multiple_choice', 'schema']:
# For MC/schema: find the option with lowest average loss
mean_losses = [losses[i, si-1:ei-1].mean().item()
for i, (si, ei) in enumerate(zip(start_idxs, end_idxs))]
pred_idx = mean_losses.index(min(mean_losses))
is_correct = pred_idx == item['gold']
else:
raise ValueError(f"Unsupported task type: {task_type}")
return is_correct
def evaluate_task(model, tokenizer, data, device, task_meta):
"""
This function is responsible for evaluating one task across many examples.
It also handles dispatch to all processes if the script is run with torchrun.
"""
rank = dist.get_rank() if dist.is_initialized() else 0
world_size = dist.get_world_size() if dist.is_initialized() else 1
correct = torch.zeros(len(data), dtype=torch.float32, device=device)
# stride the examples to each rank
for idx in range(rank, len(data), world_size):
is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta)
correct[idx] = float(is_correct)
# sync results across all the processes if running distributed
if world_size > 1:
dist.barrier()
dist.all_reduce(correct, op=dist.ReduceOp.SUM)
# compute the mean
mean_correct = correct.mean().item()
return mean_correct
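
A toy illustration of the multiple-choice rendering above (not an item from the CORE datasets); it assumes this file can be imported as core_eval.

from core_eval import render_prompts_mc  # assumed import of the file above

item = {"query": "Q: What color is the sky?", "choices": ["blue", "green"], "gold": 0}
fewshot = [{"query": "Q: What is 2+2?", "choices": ["4", "5"], "gold": 0}]
prompts = render_prompts_mc(item, continuation_delimiter=" A: ", fewshot_examples=fewshot)
for p in prompts:
    print(repr(p))
# One prompt per choice, all sharing the same context; evaluate_example() then
# scores each continuation and picks the choice with the lowest mean loss.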

nanochat-align/dataloader.py (new file, 192 lines)
@@ -0,0 +1,192 @@
from collections import deque
import os
import numpy as np
import torch
import pyarrow.parquet as pq
from nanochat.common import get_dist_info
from nanochat.dataset import list_parquet_files, USE_OPENWEBTEXT
from nanochat.tokenizer import get_tokenizer
# Support for loading openwebtext from HuggingFace datasets
if USE_OPENWEBTEXT:
from datasets import load_dataset
def bin_data_loader_with_state(B, T, data_dir, split="train", device="cuda", resume_state_dict=None):
"""
Load data from .bin files (nanoMoE format) and yield training batches.
Matches nanoMoE's get_batch function exactly.
Args:
B: batch size
T: sequence length (block_size)
data_dir: directory containing train.bin and val.bin files
split: "train" or "val"
device: device to move tensors to
resume_state_dict: optional state dict for resuming training (not used in nanoMoE, kept for compatibility)
Yields:
inputs, targets, state_dict tuples
"""
assert split in ["train", "val"], "split must be 'train' or 'val'"
# Get binary file path
bin_path = os.path.join(data_dir, f'{split}.bin')
if not os.path.exists(bin_path):
raise FileNotFoundError(f"Data file not found: {bin_path}")
device_type = 'cuda' if device.type == 'cuda' else 'cpu'
while True: # infinite iteration
# We recreate np.memmap every batch to avoid a memory leak, as per
# https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
data = np.memmap(bin_path, dtype=np.uint16, mode='r')
# Sample random positions for this batch (matching nanoMoE exactly: len(data) - T)
ix = torch.randint(len(data) - T, (B,))
# Convert memmap slices directly to tensors (matching nanoMoE train.py exactly)
x = torch.stack([torch.from_numpy((data[i:i+T]).astype(np.int64)) for i in ix])
y = torch.stack([torch.from_numpy((data[i+1:i+1+T]).astype(np.int64)) for i in ix])
# Move to device with optional memory pinning (matching nanoMoE exactly)
if device_type == 'cuda':
# Try pinning memory, but fall back if it fails
try:
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
except RuntimeError:
# Fall back to regular transfer if pin_memory fails
x, y = x.to(device), y.to(device)
else:
x, y = x.to(device), y.to(device)
# Return state_dict for compatibility (nanoMoE doesn't use this, but we keep it)
state_dict = {"pos": 0} # Simple placeholder, not used in nanoMoE
yield x, y, state_dict
def bin_data_loader(*args, **kwargs):
"""Helper function that only emits inputs/targets without state_dict"""
for inputs, targets, state_dict in bin_data_loader_with_state(*args, **kwargs):
yield inputs, targets
def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
"""
Stream pretraining text from parquet files, tokenize, yield training batches.
This implementation became a bit more complex because we wish to support approximate resume training.
Instead of turning this into a Class, we opt to return the state_dict with every batch,
and then the caller can pass in a state_dict to resume training from a desired point.
Note that this resumption is atm only *approximate* for simplicity.
We won't repeat the same documents but we might skip a few.
The state_dict that is returned can be later passed into this function via `resume_state_dict` to approximately resume.
Perfect state resumption is possible but would be a lot more bloated, probably not worth it atm.
"""
assert split in ["train", "val"], "split must be 'train' or 'val'"
# infinite iterator over document batches (list of text strings)
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
def document_batches():
if USE_OPENWEBTEXT:
# Load openwebtext dataset from local cache
# Note: openwebtext only has 'train' split, so we use it for both train and val
cache_dir = os.environ.get("HF_DATASETS_CACHE", "/root/autodl-tmp/huggingface/datasets")
dataset = load_dataset("openwebtext", "plain_text", cache_dir=cache_dir, split="train")
full_dataset_size = len(dataset)
# For validation, use the last 1% of the dataset
if split == "val":
dataset_start = int(full_dataset_size * 0.99)
dataset_size = full_dataset_size - dataset_start
else:
dataset_start = 0
# For training, use 99% of the dataset
dataset_size = int(full_dataset_size * 0.99)
resume_idx = resume_state_dict.get("rg_idx", 0) if resume_state_dict is not None else 0
idx = resume_idx
while True: # iterate infinitely (multi-epoch)
while idx < dataset_size:
# Collect batch_size documents starting from current index with DDP stride
batch = []
batch_indices = []
for i in range(tokenizer_batch_size):
if idx >= dataset_size:
break
# DDP: each rank processes different documents
actual_idx = dataset_start + idx
if actual_idx < full_dataset_size and idx % ddp_world_size == ddp_rank:
text = dataset[actual_idx]["text"]
batch.append(text)
batch_indices.append(idx)
idx += 1
if batch:
yield batch, (0, idx) # Use idx as state
idx = 0 # Reset for next epoch
else:
# Original parquet file iteration
parquet_paths = list_parquet_files()
parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
pq_idx = resume_pq_idx # we kick off parquet files at the resume index (or by default just 0)
while True: # iterate infinitely (multi-epoch)
while pq_idx < len(parquet_paths): # iterate over all parquet files
filepath = parquet_paths[pq_idx]
pf = pq.ParquetFile(filepath)
# Start from resume point if resuming on same file, otherwise from DDP rank
# I know this state resumption is a little bit tricky and a little bit hacky... sigh.
if resume_rg_idx is not None:
base_idx = resume_rg_idx // ddp_world_size # in units of ddp_world_size
base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming
rg_idx = base_idx * ddp_world_size + ddp_rank
resume_rg_idx = None # set to None as we only want to do this a single time
else:
rg_idx = ddp_rank
while rg_idx < pf.num_row_groups:
rg = pf.read_row_group(rg_idx)
batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows
# the tokenizer encode might want to go in even smaller batches, e.g. 128 rows
for i in range(0, len(batch), tokenizer_batch_size):
yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx)
rg_idx += ddp_world_size # advance to the next row group (in DDP)
pq_idx += 1 # advance to the next parquet file
batches = document_batches()
# Now emit batches of tokens.
needed_tokens = B * T + 1 # +1 is because we also need the target at the last token
# get the tokenizer and the bos token
tokenizer = get_tokenizer()
bos_token = tokenizer.get_bos_token_id()
# scratch buffer holds the tokens for one iteration
token_buffer = deque() # we stream tokens on the right and pop from the left
while True:
# Accumulate enough tokens for one iteration before yielding.
while len(token_buffer) < needed_tokens:
doc_batch, (pq_idx, rg_idx) = next(batches)
token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads)
for tokens in token_lists:
token_buffer.extend(tokens)
# Move tokens from the deque into the scratch buffer
tokens = [token_buffer.popleft() for _ in range(needed_tokens)]
# CUDA supports memory pinning for asynchronous transfers between CPU and GPU
use_cuda_optimizations = device.type == "cuda"
scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda_optimizations) # in PyTorch, long=int64
# Create the inputs/targets as 1D tensors
inputs_cpu = scratch[:-1]
targets_cpu = scratch[1:]
# Reshape to 2D and move to GPU async
inputs = inputs_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
targets = targets_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
# For openwebtext, we use rg_idx as the state; for parquet files, we use both pq_idx and rg_idx
if USE_OPENWEBTEXT:
state_dict = {"pq_idx": 0, "rg_idx": rg_idx} # Use rg_idx to track position in dataset
else:
state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx} # we need this in case we wish to approximately resume training
yield inputs, targets, state_dict
def tokenizing_distributed_data_loader(*args, **kwargs):
# helper function that only emits the inputs/targets and not the state_dict
for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs):
yield inputs, targets
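
A self-contained sketch of the .bin loader path above, driven by a synthetic train.bin of random uint16 tokens; the directory, vocab size, and import path are assumptions.

import os
import numpy as np
import torch
from nanochat.dataloader import bin_data_loader  # assumed module path for the file above

data_dir = "/tmp/nanochat_bin_demo"  # hypothetical
os.makedirs(data_dir, exist_ok=True)
tokens = np.random.randint(0, 50257, size=4096, dtype=np.uint16)  # fake token stream
tokens.tofile(os.path.join(data_dir, "train.bin"))

loader = bin_data_loader(B=4, T=64, data_dir=data_dir, split="train", device=torch.device("cpu"))
x, y = next(loader)
print(x.shape, y.shape)                  # torch.Size([4, 64]) torch.Size([4, 64])
assert torch.equal(x[:, 1:], y[:, :-1])  # targets are the inputs shifted left by one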

nanochat-align/dataset.py (new file, 174 lines)
@@ -0,0 +1,174 @@
"""
The base/pretraining dataset is a set of parquet files.
This file contains utilities for:
- iterating over the parquet files and yielding documents from it
- download the files on demand if they are not on disk
For details of how the dataset was prepared, see `repackage_data_reference.py`.
"""
import os
import argparse
import time
import requests
import pyarrow.parquet as pq
from multiprocessing import Pool
from nanochat.common import get_base_dir
# Support for loading openwebtext from HuggingFace datasets
USE_OPENWEBTEXT = os.environ.get("USE_OPENWEBTEXT", "false").lower() == "true"
if USE_OPENWEBTEXT:
from datasets import load_dataset
# -----------------------------------------------------------------------------
# The specifics of the current pretraining dataset
# The URL on the internet where the data is hosted and downloaded from on demand
BASE_URL = "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main"
MAX_SHARD = 1822 # the last datashard is shard_01822.parquet
index_to_filename = lambda index: f"shard_{index:05d}.parquet" # format of the filenames
base_dir = get_base_dir()
DATA_DIR = os.path.join(base_dir, "base_data")
os.makedirs(DATA_DIR, exist_ok=True)
# -----------------------------------------------------------------------------
# These functions are useful utilities to other modules, can/should be imported
def list_parquet_files(data_dir=None):
""" Looks into a data dir and returns full paths to all parquet files. """
data_dir = DATA_DIR if data_dir is None else data_dir
parquet_files = sorted([
f for f in os.listdir(data_dir)
if f.endswith('.parquet') and not f.endswith('.tmp')
])
parquet_paths = [os.path.join(data_dir, f) for f in parquet_files]
return parquet_paths
def parquets_iter_batched(split, start=0, step=1):
"""
Iterate through the dataset, in batches of underlying row_groups for efficiency.
- split can be "train" or "val". the last parquet file will be val.
- start/step are useful for skipping rows in DDP. e.g. start=rank, step=world_size
"""
assert split in ["train", "val"], "split must be 'train' or 'val'"
# If using openwebtext, load from HuggingFace datasets
if USE_OPENWEBTEXT:
# Load openwebtext dataset from local cache
# Note: openwebtext only has 'train' split, so we use it for both train and val
cache_dir = os.environ.get("HF_DATASETS_CACHE", "/root/autodl-tmp/huggingface/datasets")
dataset = load_dataset("openwebtext", "plain_text", cache_dir=cache_dir, split="train")
# Batch size for efficiency (similar to row_group size)
batch_size = 1024
dataset_size = len(dataset)
# For validation, use the last 1% of the dataset
if split == "val":
val_start = int(dataset_size * 0.99)
dataset_size = dataset_size - val_start
dataset_offset = val_start
else:
dataset_offset = 0
# For training, use 99% of the dataset
dataset_size = int(dataset_size * 0.99)
# Iterate with start/step for DDP support
for i in range(start, dataset_size, step * batch_size):
batch_texts = []
for j in range(i, min(i + step * batch_size, dataset_size), step):
idx = dataset_offset + j
if idx < len(dataset):
text = dataset[idx]["text"]
batch_texts.append(text)
if batch_texts:
yield batch_texts
return
# Original parquet file iteration
parquet_paths = list_parquet_files()
parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
for filepath in parquet_paths:
pf = pq.ParquetFile(filepath)
for rg_idx in range(start, pf.num_row_groups, step):
rg = pf.read_row_group(rg_idx)
texts = rg.column('text').to_pylist()
yield texts
# -----------------------------------------------------------------------------
def download_single_file(index):
""" Downloads a single file index, with some backoff """
# Construct the local filepath for this file and skip if it already exists
filename = index_to_filename(index)
filepath = os.path.join(DATA_DIR, filename)
if os.path.exists(filepath):
print(f"Skipping {filepath} (already exists)")
return True
# Construct the remote URL for this file
url = f"{BASE_URL}/{filename}"
print(f"Downloading {filename}...")
# Download with retries
max_attempts = 5
for attempt in range(1, max_attempts + 1):
try:
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
# Write to temporary file first
temp_path = filepath + f".tmp"
with open(temp_path, 'wb') as f:
for chunk in response.iter_content(chunk_size=1024 * 1024): # 1MB chunks
if chunk:
f.write(chunk)
# Move temp file to final location
os.rename(temp_path, filepath)
print(f"Successfully downloaded {filename}")
return True
except (requests.RequestException, IOError) as e:
print(f"Attempt {attempt}/{max_attempts} failed for {filename}: {e}")
# Clean up any partial files
for path in [filepath + f".tmp", filepath]:
if os.path.exists(path):
try:
os.remove(path)
except:
pass
# Try a few times with exponential backoff: 2^attempt seconds
if attempt < max_attempts:
wait_time = 2 ** attempt
print(f"Waiting {wait_time} seconds before retry...")
time.sleep(wait_time)
else:
print(f"Failed to download {filename} after {max_attempts} attempts")
return False
return False
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Download FineWeb-Edu 100BT dataset shards")
parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of shards to download (default: -1), -1 = disable")
parser.add_argument("-w", "--num-workers", type=int, default=4, help="Number of parallel download workers (default: 4)")
args = parser.parse_args()
# If using openwebtext, skip download
if USE_OPENWEBTEXT:
print("Using openwebtext dataset from local cache. Skipping download.")
print(f"Set USE_OPENWEBTEXT=true to use openwebtext dataset")
exit(0)
num = MAX_SHARD + 1 if args.num_files == -1 else min(args.num_files, MAX_SHARD + 1)
ids_to_download = list(range(num))
print(f"Downloading {len(ids_to_download)} shards using {args.num_workers} workers...")
print(f"Target directory: {DATA_DIR}")
print()
with Pool(processes=args.num_workers) as pool:
results = pool.map(download_single_file, ids_to_download)
# Report results
successful = sum(1 for success in results if success)
print(f"Done! Downloaded: {successful}/{len(ids_to_download)} shards to {DATA_DIR}")

nanochat-align/engine.py (new file, 378 lines)
@@ -0,0 +1,378 @@
"""
Engine for efficient inference of our models.
Everything works around token sequences:
- The user can send token sequences to the engine
- The engine returns the next token
Notes:
- The engine knows nothing about tokenization, it's purely token id sequences.
The whole thing is made as efficient as possible.
"""
import torch
import torch.nn.functional as F
import signal
import warnings
from contextlib import contextmanager
from collections import deque
from nanochat.common import compute_init, autodetect_device_type
from nanochat.checkpoint_manager import load_model
from contextlib import nullcontext
# -----------------------------------------------------------------------------
# Calculator tool helpers
@contextmanager
def timeout(duration, formula):
def timeout_handler(signum, frame):
raise Exception(f"'{formula}': timed out after {duration} seconds")
signal.signal(signal.SIGALRM, timeout_handler)
signal.alarm(duration)
yield
signal.alarm(0)
def eval_with_timeout(formula, max_time=3):
try:
with timeout(max_time, formula):
with warnings.catch_warnings():
warnings.simplefilter("ignore", SyntaxWarning)
return eval(formula, {"__builtins__": {}}, {})
except Exception as e:
signal.alarm(0)
# print(f"Warning: Failed to eval {formula}, exception: {e}") # it's ok ignore wrong calculator usage
return None
def use_calculator(expr):
"""
Evaluate a Python expression safely.
Supports both math expressions and string operations like .count()
"""
# Remove commas from numbers
expr = expr.replace(",", "")
# Check if it's a pure math expression (old behavior)
if all([x in "0123456789*+-/.() " for x in expr]):
if "**" in expr: # disallow power operator
return None
return eval_with_timeout(expr)
# Check if it's a string operation we support
# Allow: strings (single/double quotes), .count(), letters, numbers, spaces, parens
allowed_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'\"()._ "
if not all([x in allowed_chars for x in expr]):
return None
# Disallow dangerous patterns
dangerous_patterns = ['__', 'import', 'exec', 'eval', 'compile', 'open', 'file',
'input', 'raw_input', 'globals', 'locals', 'vars', 'dir',
'getattr', 'setattr', 'delattr', 'hasattr']
expr_lower = expr.lower()
if any(pattern in expr_lower for pattern in dangerous_patterns):
return None
# Only allow .count() method for now (can expand later)
if '.count(' not in expr:
return None
# Evaluate with timeout
return eval_with_timeout(expr)
# -----------------------------------------------------------------------------
class KVCache:
"""
Works hand-in-hand with the GPT model to maintain the KV cache.
Note that the .pos advances automatically after the last layer of the Transformer inserts.
"""
def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layers):
# Each of K/V is of shape (B, H, T, D) and we have one per layer of the Transformer.
self.kv_shape = (num_layers, 2, batch_size, num_heads, seq_len, head_dim)
self.kv_cache = None
self.pos = 0 # current position in time in the cache
def reset(self):
self.pos = 0
def get_pos(self):
return self.pos
def prefill(self, other):
"""
Prefill given another KV cache. Optionally expand along batch dim.
This is used when we do batch 1 prefill and then want to generate
multiple samples in parallel from there.
"""
# 1) validate the shapes
assert self.kv_cache is None, "Cannot prefill a non-empty KV cache"
assert other.kv_cache is not None, "Cannot prefill with a None KV cache"
for ix, (dim1, dim2) in enumerate(zip(self.kv_shape, other.kv_shape)):
# ix 0: num_layers, 1: k/v, 2: batch_size, 3: num_heads, 4: seq_len, 5: head_dim
if ix in [0, 1, 3, 5]:
# num_layers, k/v, num_heads, head_dim must match
assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
elif ix == 2:
# batch_size can be expanded
assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"
elif ix == 4:
# seq_len: self must be longer than other
assert dim1 >= dim2, f"Seq len mismatch: {dim1} < {dim2}"
# 2) initialize the cache
dtype, device = other.kv_cache.dtype, other.kv_cache.device
self.kv_cache = torch.empty(self.kv_shape, dtype=dtype, device=device)
# 3) copy the data over
self.kv_cache[:, :, :, :, :other.pos, :] = other.kv_cache
# 4) update the pos
self.pos = other.pos
def insert_kv(self, layer_idx, k, v):
# Lazy initialize the cache here because we need to know the dtype/device
if self.kv_cache is None:
self.kv_cache = torch.empty(self.kv_shape, dtype=k.dtype, device=k.device)
# Insert new keys/values to the cache and return the full cache so far
B, H, T_add, D = k.size()
t0, t1 = self.pos, self.pos + T_add
# Dynamically grow the cache if needed
if t1 > self.kv_cache.size(4):
t_needed = t1 + 1024 # as much as we need plus buffer of 1024
t_needed = (t_needed + 1023) & ~1023 # then round up to the nearest multiple of 1024
additional_shape = list(self.kv_cache.shape)
additional_shape[4] = t_needed - self.kv_cache.size(4)
additional_cache = torch.empty(additional_shape, dtype=k.dtype, device=k.device)
self.kv_cache = torch.cat([self.kv_cache, additional_cache], dim=4).contiguous()
self.kv_shape = self.kv_cache.shape
# Insert k, v into the cache
self.kv_cache[layer_idx, 0, :, :, t0:t1] = k
self.kv_cache[layer_idx, 1, :, :, t0:t1] = v
# Return the full cached keys/values up to current position (as a view)
key_view = self.kv_cache[layer_idx, 0, :, :, :t1]
value_view = self.kv_cache[layer_idx, 1, :, :, :t1]
# Increment pos after the last layer of the Transformer processes
if layer_idx == self.kv_cache.size(0) - 1:
self.pos = t1
return key_view, value_view
# -----------------------------------------------------------------------------
@torch.inference_mode()
def sample_next_token(logits, rng, temperature=1.0, top_k=None):
"""Sample a single next token from given logits of shape (B, vocab_size). Returns (B, 1)."""
assert temperature >= 0.0, "temperature must be non-negative"
if temperature == 0.0:
return torch.argmax(logits, dim=-1, keepdim=True)
if top_k is not None:
k = min(top_k, logits.size(-1))
vals, idx = torch.topk(logits, k, dim=-1)
vals = vals / temperature
probs = F.softmax(vals, dim=-1)
choice = torch.multinomial(probs, num_samples=1, generator=rng)
return idx.gather(1, choice)
else:
logits = logits / temperature
probs = F.softmax(logits, dim=-1)
return torch.multinomial(probs, num_samples=1, generator=rng)
# -----------------------------------------------------------------------------
class RowState:
# Per-row state tracking during generation
def __init__(self, current_tokens=None):
self.current_tokens = current_tokens or [] # Current token sequence for this row
self.forced_tokens = deque() # Queue of tokens to force inject
self.in_python_block = False # Whether we are inside a python block
self.python_expr_tokens = [] # Tokens of the current python expression
self.completed = False # Whether this row has completed generation
class Engine:
def __init__(self, model, tokenizer):
self.model = model
self.tokenizer = tokenizer # needed for tool use
@torch.inference_mode()
def generate(self, tokens, num_samples=1, max_tokens=None, temperature=1.0, top_k=None, seed=42):
"""Same as generate, but does single prefill and then clones the KV cache."""
assert isinstance(tokens, list) and isinstance(tokens[0], int), "expecting list of ints"
device = self.model.get_device()
rng = torch.Generator(device=device)
rng.manual_seed(seed)
# Get the special tokens we need to coordinate the tool use state machine
get_special = lambda s: self.tokenizer.encode_special(s)
python_start = get_special("<|python_start|>")
python_end = get_special("<|python_end|>")
output_start = get_special("<|output_start|>")
output_end = get_special("<|output_end|>")
assistant_end = get_special("<|assistant_end|>") # if sampled, ends row
bos = self.tokenizer.get_bos_token_id() # if sampled, ends row
# 1) Run a batch 1 prefill of the prompt tokens
m = self.model.config
kv_model_kwargs = {"num_heads": m.n_kv_head, "head_dim": m.n_embd // m.n_head, "num_layers": m.n_layer}
kv_cache_prefill = KVCache(
batch_size=1,
seq_len=len(tokens),
**kv_model_kwargs,
)
ids = torch.tensor([tokens], dtype=torch.long, device=device)
logits = self.model.forward(ids, kv_cache=kv_cache_prefill)
logits = logits[:, -1, :]
next_ids = sample_next_token(logits, rng, temperature, top_k) # (B, 1)
sampled_tokens = next_ids[:, 0].tolist()
# 2) Replicate the KV cache for each sample/row
kv_length_hint = (len(tokens) + max_tokens) if max_tokens is not None else self.model.config.sequence_len
kv_cache_decode = KVCache(
batch_size=num_samples,
seq_len=kv_length_hint,
**kv_model_kwargs,
)
kv_cache_decode.prefill(kv_cache_prefill)
del kv_cache_prefill # no need to keep this memory around
# 3) Initialize states for each sample
row_states = [RowState(tokens.copy()) for _ in range(num_samples)]
# 4) Main generation loop
num_generated = 0
first_iteration = True
while True:
# Stop condition: we've reached max tokens
if max_tokens is not None and num_generated >= max_tokens:
break
# Stop condition: all rows are completed
if all(state.completed for state in row_states):
break
# Get sampled tokens - either from prefill or from forward pass
if first_iteration:
# Use the tokens we already sampled from prefill
sampled_tokens = [sampled_tokens[0]] * num_samples # Broadcast first token to all rows
# TODO: we should sample a token for each row instead of broadcasting
first_iteration = False
else:
# Forward the model and get the next token for each row
logits = self.model.forward(ids, kv_cache=kv_cache_decode) # (B, T, vocab_size)
logits = logits[:, -1, :] # (B, vocab_size) at last time step
next_ids = sample_next_token(logits, rng, temperature, top_k) # (B, 1)
sampled_tokens = next_ids[:, 0].tolist()
# Process each row: choose the next token, update state, optional tool use
token_column = [] # contains the next token id along each row
token_masks = [] # contains the mask (was it sampled (1) or forced (0)?) along each row
for i, state in enumerate(row_states):
# Select the next token in this row
is_forced = len(state.forced_tokens) > 0 # are there tokens waiting to be forced in deque?
token_masks.append(0 if is_forced else 1) # mask is 0 if forced, 1 if sampled
next_token = state.forced_tokens.popleft() if is_forced else sampled_tokens[i]
token_column.append(next_token)
# Update the state of this row to include the next token
state.current_tokens.append(next_token)
# On <|assistant_end|> or <|bos|>, mark the row as completed
if next_token == assistant_end or next_token == bos:
state.completed = True
# Handle tool logic
if next_token == python_start:
state.in_python_block = True
state.python_expr_tokens = []
elif next_token == python_end and state.in_python_block:
state.in_python_block = False
if state.python_expr_tokens:
expr = self.tokenizer.decode(state.python_expr_tokens)
result = use_calculator(expr)
if result is not None:
result_tokens = self.tokenizer.encode(str(result))
state.forced_tokens.append(output_start)
state.forced_tokens.extend(result_tokens)
state.forced_tokens.append(output_end)
state.python_expr_tokens = []
elif state.in_python_block:
state.python_expr_tokens.append(next_token)
# Yield the token column
yield token_column, token_masks
num_generated += 1
# Prepare ids for next iteration
ids = torch.tensor(token_column, dtype=torch.long, device=device).unsqueeze(1)
def generate_batch(self, tokens, num_samples=1, **kwargs):
"""
Non-streaming batch generation that just returns the final token sequences.
Returns a list of token sequences (list of lists of ints).
Terminal tokens (assistant_end, bos) are not included in the results.
"""
assistant_end = self.tokenizer.encode_special("<|assistant_end|>")
bos = self.tokenizer.get_bos_token_id()
results = [tokens.copy() for _ in range(num_samples)]
masks = [[0] * len(tokens) for _ in range(num_samples)]
completed = [False] * num_samples
for token_column, token_masks in self.generate(tokens, num_samples, **kwargs):
for i, (token, mask) in enumerate(zip(token_column, token_masks)):
if not completed[i]:
if token == assistant_end or token == bos:
completed[i] = True
else:
results[i].append(token)
masks[i].append(mask)
# Stop if all rows are completed
if all(completed):
break
return results, masks
if __name__ == "__main__":
"""
Quick inline test to make sure that the naive/slow model.generate function
is equivalent to the faster Engine.generate function here.
"""
import time
# init compute
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
device_type = autodetect_device_type()
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
# load the model and tokenizer
model, tokenizer, meta = load_model("base", device, phase="eval")
bos_token_id = tokenizer.get_bos_token_id()
# common hyperparameters
kwargs = dict(max_tokens=64, temperature=0.0)
# set the starting prompt
prompt_tokens = tokenizer.encode("The chemical formula of water is", prepend=bos_token_id)
# generate the reference sequence using the model.generate() function
generated_tokens = []
torch.cuda.synchronize()
t0 = time.time()
stream = model.generate(prompt_tokens, **kwargs)
with autocast_ctx:
for token in stream:
generated_tokens.append(token)
chunk = tokenizer.decode([token])
print(chunk, end="", flush=True)
print()
torch.cuda.synchronize()
t1 = time.time()
print(f"Reference time: {t1 - t0:.2f}s")
reference_ids = generated_tokens
# generate tokens with Engine
generated_tokens = []
engine = Engine(model, tokenizer)
stream = engine.generate(prompt_tokens, num_samples=1, **kwargs) # note: runs in fp32
torch.cuda.synchronize()
t0 = time.time()
with autocast_ctx:
for token_column, token_masks in stream:
token = token_column[0] # only print out the first row
generated_tokens.append(token)
chunk = tokenizer.decode([token])
print(chunk, end="", flush=True)
print()
torch.cuda.synchronize()
t1 = time.time()
print(f"Engine time: {t1 - t0:.2f}s")
# compare the two sequences
for i in range(len(reference_ids)):
if reference_ids[i] != generated_tokens[i]:
print(f"Mismatch at {i}: {reference_ids[i]} != {generated_tokens[i]}")
break
print(f"Match: {reference_ids == generated_tokens}")

nanochat-align/execution.py (new file, 349 lines)
@@ -0,0 +1,349 @@
"""
Sandboxed execution utilities for running Python code that comes out of an LLM.
Adapted from OpenAI HumanEval code:
https://github.com/openai/human-eval/blob/master/human_eval/execution.py
What is covered:
- Each execution runs in its own process (can be killed if it hangs or crashes)
- Execution is limited by a timeout to stop infinite loops
- Memory limits are enforced by default (256MB)
- stdout and stderr are captured and returned
- Code runs in a temporary directory that is deleted afterwards
- Dangerous functions are disabled (examples: os.system, os.kill, shutil.rmtree, subprocess.Popen)
What is not covered:
- Not a true security sandbox
- Network access is not blocked (e.g. sockets could be opened)
- Python's dynamic features (e.g. ctypes) could bypass restrictions
- No kernel-level isolation (no seccomp, no containers, no virtualization)
Overall this sandbox is good for evaluation of generated code and protects against
accidental destructive behavior, but it is not safe against malicious adversarial code.
"""
import contextlib
import faulthandler
import io
import multiprocessing
import os
import platform
import signal
import tempfile
from dataclasses import dataclass
from typing import Optional
# -----------------------------------------------------------------------------
@dataclass
class ExecutionResult:
"""Result of executing Python code in a sandbox."""
success: bool
stdout: str
stderr: str
error: Optional[str] = None
timeout: bool = False
memory_exceeded: bool = False
def __repr__(self):
parts = []
parts.append(f"ExecutionResult(success={self.success}")
if self.timeout:
parts.append(", timeout=True")
if self.memory_exceeded:
parts.append(", memory_exceeded=True")
if self.error:
parts.append(f", error={self.error!r}")
if self.stdout:
parts.append(f", stdout={self.stdout!r}")
if self.stderr:
parts.append(f", stderr={self.stderr!r}")
parts.append(")")
return "".join(parts)
@contextlib.contextmanager
def time_limit(seconds: float):
def signal_handler(signum, frame):
raise TimeoutException("Timed out!")
signal.setitimer(signal.ITIMER_REAL, seconds)
signal.signal(signal.SIGALRM, signal_handler)
try:
yield
finally:
signal.setitimer(signal.ITIMER_REAL, 0)
@contextlib.contextmanager
def capture_io():
"""Capture stdout and stderr, and disable stdin."""
stdout_capture = io.StringIO()
stderr_capture = io.StringIO()
stdin_block = WriteOnlyStringIO()
with contextlib.redirect_stdout(stdout_capture):
with contextlib.redirect_stderr(stderr_capture):
with redirect_stdin(stdin_block):
yield stdout_capture, stderr_capture
@contextlib.contextmanager
def create_tempdir():
with tempfile.TemporaryDirectory() as dirname:
with chdir(dirname):
yield dirname
class TimeoutException(Exception):
pass
class WriteOnlyStringIO(io.StringIO):
"""StringIO that throws an exception when it's read from"""
def read(self, *args, **kwargs):
raise IOError
def readline(self, *args, **kwargs):
raise IOError
def readlines(self, *args, **kwargs):
raise IOError
def readable(self, *args, **kwargs):
"""Returns True if the IO object can be read."""
return False
class redirect_stdin(contextlib._RedirectStream): # type: ignore
_stream = "stdin"
@contextlib.contextmanager
def chdir(root):
if root == ".":
yield
return
cwd = os.getcwd()
os.chdir(root)
try:
yield
finally:
os.chdir(cwd)
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
"""
This disables various destructive functions and prevents the generated code
from interfering with the test (e.g. fork bomb, killing other processes,
removing filesystem files, etc.)
WARNING
This function is NOT a security sandbox. Untrusted code, including model-
generated code, should not be blindly executed outside of one. See the
Codex paper for more information about OpenAI's code sandbox, and proceed
with caution.
"""
if maximum_memory_bytes is not None and platform.uname().system != "Darwin":
# These resource limit calls fail on macOS (Darwin), so they are skipped there;
# they are also skipped when no memory limit was requested (maximum_memory_bytes=None)
import resource
resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
faulthandler.disable()
import builtins
builtins.exit = None
builtins.quit = None
import os
os.environ["OMP_NUM_THREADS"] = "1"
os.kill = None
os.system = None
os.putenv = None
os.remove = None
os.removedirs = None
os.rmdir = None
os.fchdir = None
os.setuid = None
os.fork = None
os.forkpty = None
os.killpg = None
os.rename = None
os.renames = None
os.truncate = None
os.replace = None
os.unlink = None
os.fchmod = None
os.fchown = None
os.chmod = None
os.chown = None
os.chroot = None
os.fchdir = None
os.lchflags = None
os.lchmod = None
os.lchown = None
os.getcwd = None
os.chdir = None
import shutil
shutil.rmtree = None
shutil.move = None
shutil.chown = None
import subprocess
subprocess.Popen = None # type: ignore
__builtins__["help"] = None
import sys
sys.modules["ipdb"] = None
sys.modules["joblib"] = None
sys.modules["resource"] = None
sys.modules["psutil"] = None
sys.modules["tkinter"] = None
def _unsafe_execute(code: str, timeout: float, maximum_memory_bytes: Optional[int], result_dict):
"""Execute code in a subprocess with safety guards. Results are written to result_dict."""
with create_tempdir():
# These system calls are needed when cleaning up tempdir.
import os
import shutil
rmtree = shutil.rmtree
rmdir = os.rmdir
chdir = os.chdir
unlink = os.unlink
# Disable functionalities that can make destructive changes to the test.
reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
# Default to failure
result_dict.update({
"success": False,
"stdout": "",
"stderr": "",
"timeout": False,
"memory_exceeded": False,
"error": None,
})
try:
exec_globals = {}
with capture_io() as (stdout_capture, stderr_capture):
with time_limit(timeout):
# WARNING
# This program exists to execute untrusted model-generated code. Although
# it is highly unlikely that model-generated code will do something overtly
# malicious in response to this test suite, model-generated code may act
# destructively due to a lack of model capability or alignment.
# Users are strongly encouraged to sandbox this evaluation suite so that it
# does not perform destructive actions on their host or network. For more
# information on how OpenAI sandboxes its code, see the accompanying paper.
# In the original HumanEval release the exec call below ships commented out;
# it is enabled here, so read the disclaimer above and proceed at your own risk:
exec(code, exec_globals)
result_dict.update({
"success": True,
"stdout": stdout_capture.getvalue(),
"stderr": stderr_capture.getvalue(),
})
except TimeoutException:
result_dict.update({
"timeout": True,
"error": "Execution timed out",
})
except MemoryError as e:
result_dict.update({
"memory_exceeded": True,
"error": f"Memory limit exceeded: {e}",
})
except BaseException as e:
result_dict.update({
"error": f"{type(e).__name__}: {e}",
})
# Needed for cleaning up.
shutil.rmtree = rmtree
os.rmdir = rmdir
os.chdir = chdir
os.unlink = unlink
def execute_code(
code: str,
timeout: float = 5.0, # 5 seconds default
maximum_memory_bytes: Optional[int] = 256 * 1024 * 1024, # 256MB default
) -> ExecutionResult:
"""
Execute Python code in a sandboxed environment.
Args:
code: Python code to execute as a string
timeout: Maximum execution time in seconds (default: 5.0)
maximum_memory_bytes: Memory limit in bytes (default: 256MB, None to disable)
Returns:
ExecutionResult with success status, stdout/stderr, and error information
Example:
>>> result = execute_code("print('hello world')")
>>> result.success
True
>>> result.stdout
'hello world\\n'
"""
manager = multiprocessing.Manager()
result_dict = manager.dict()
p = multiprocessing.Process(
target=_unsafe_execute,
args=(code, timeout, maximum_memory_bytes, result_dict)
)
p.start()
p.join(timeout=timeout + 1)
if p.is_alive():
p.kill()
return ExecutionResult(
success=False,
stdout="",
stderr="",
error="Execution timed out (process killed)",
timeout=True,
memory_exceeded=False,
)
if not result_dict:
return ExecutionResult(
success=False,
stdout="",
stderr="",
error="Execution failed (no result returned)",
timeout=True,
memory_exceeded=False,
)
return ExecutionResult(
success=result_dict["success"],
stdout=result_dict["stdout"],
stderr=result_dict["stderr"],
error=result_dict["error"],
timeout=result_dict["timeout"],
memory_exceeded=result_dict["memory_exceeded"],
)
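# --- usage sketch (not part of the original module): a quick demonstration of how
# execute_code surfaces a successful run versus a timeout. Assumes a Unix-like host,
# since the time limit relies on SIGALRM.
if __name__ == "__main__":
    ok = execute_code("print(sum(range(10)))")
    print(ok.success, ok.stdout.strip())    # expected: True 45
    hung = execute_code("while True: pass", timeout=1.0)
    print(hung.success, hung.timeout)       # expected: False True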

731
nanochat-align/gpt.py Normal file
View File

@ -0,0 +1,731 @@
"""
GPT model (nanoGPT-style rewrite with Mixture-of-Experts additions)
Notable features:
- learned absolute positional embeddings (wpe) and standard causal self-attention
- LayerNorm with optional bias; no bias in linear layers by default
- GELU MLP in the dense blocks
- tied weights for token embedding and lm_head
- optional MoE blocks: (noisy) top-k router, expert capacity with token dropping,
  auxiliary load-balancing loss (Switch Transformer) and router z-loss (ST-MoE),
  placed every `stride` layers when n_exp > 1
"""
import math
import inspect
from functools import partial
from dataclasses import dataclass
import torch
import torch.nn as nn
import torch.nn.functional as F
# ========== MOE ADDITION START ==========
from nanochat.manager import MANAGER
from contextlib import nullcontext
# ========== MOE ADDITION END ==========
@dataclass
class GPTConfig:
sequence_len: int = 1024
block_size: int = 1024 # alias for sequence_len, used by CausalSelfAttention
vocab_size: int = 50304
n_layer: int = 6
n_head: int = 6 # number of query heads
n_kv_head: int = 6 # number of key/value heads (GQA); kept in the config but unused by CausalSelfAttention below
n_embd: int = 384
dropout: float = 0.0 # dropout rate
# ========== MOE ADDITION START ==========
# MoE-related configs (added for MoE support)
n_exp: int = 8 # if n_exp = 1 we just use regular MLP layers
top_k: int = 2 # number of active experts
use_aux_loss: bool = True # apply auxiliary loss (from Switch Transformer)
use_router_z_loss: bool = True # apply router z loss (from ST-MoE)
use_noisy_top_k: bool = False
aux_loss_weight: float = 0.01 # default from Switch Transformer
router_z_loss_weight: float = 0.001 # default from ST-MoE
train_capacity: float = 1.25 # default from ST-MoE
eval_capacity: float = 2.0
min_capacity: int = 4 # minimum batch size per expert
stride: int = 2 # one in every stride layers uses MoE
use_switch_tfm_init: bool = True # use weight init scheme from Switch Transformer
switch_tfm_init_scale: float = 1.0
router_use_full_prec: bool = True # use float32 in router
bias: bool = False # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
# ========== MOE ADDITION END ==========
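# Illustrative example (hypothetical values): GPTConfig(n_layer=6, n_exp=8, top_k=2, stride=2)
# routes each token to 2 of 8 experts and, per the construction in GPT.__init__ below,
# makes blocks 0, 2 and 4 MoE layers while blocks 1, 3 and 5 keep the dense MLP.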
class LayerNorm(nn.Module):
""" LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
def __init__(self, ndim, bias):
super().__init__()
self.weight = nn.Parameter(torch.ones(ndim))
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
def forward(self, input):
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
class CausalSelfAttention(nn.Module):
def __init__(self, config):
super().__init__()
assert config.n_embd % config.n_head == 0
# key, query, value projections for all heads, but in a batch
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
# output projection
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
# regularization
self.attn_dropout = nn.Dropout(config.dropout)
self.resid_dropout = nn.Dropout(config.dropout)
self.n_head = config.n_head
self.n_embd = config.n_embd
self.dropout = config.dropout
# flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
if not self.flash:
print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
# causal mask to ensure that attention is only applied to the left in the input sequence
self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
.view(1, 1, config.block_size, config.block_size))
def forward(self, x):
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
if self.flash:
# efficient attention using Flash Attention CUDA kernels
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
else:
# manual implementation of attention
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
att = F.softmax(att, dim=-1)
att = self.attn_dropout(att)
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
# output projection
y = self.resid_dropout(self.c_proj(y))
return y
class MLP(nn.Module):
def __init__(self, config):
super().__init__()
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
self.gelu = nn.GELU()
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
self.dropout = nn.Dropout(config.dropout)
def forward(self, x):
x = self.c_fc(x)
x = self.gelu(x)
x = self.c_proj(x)
x = self.dropout(x)
return x
class Router(nn.Module):
def __init__(self, config):
super().__init__()
# router settings
self.top_k = config.top_k
self.n_exp = config.n_exp
assert self.top_k >= 1 and self.top_k <= config.n_exp
self.use_noisy_top_k = config.use_noisy_top_k
self.train_capacity = config.train_capacity
self.eval_capacity = config.eval_capacity
self.min_capacity = config.min_capacity
self.router_use_full_prec = config.router_use_full_prec
# auxiliary / load balancing loss settings
self.use_aux_loss = config.use_aux_loss
self.use_router_z_loss = config.use_router_z_loss
# linear projection for (noisy) softmax gating
# no bias is used, see page 4 eq (4) in (https://arxiv.org/abs/1701.06538)
self.w_g = nn.Linear(config.n_embd, config.n_exp, bias=False)
self.w_noise = nn.Linear(config.n_embd, config.n_exp, bias=False) if self.use_noisy_top_k else None
def forward(self, x):
# optionally run the router in full precision to avoid instability during training
# see discussion on pg. 9 here: https://arxiv.org/abs/2101.03961
# setting enabled to False in autocast automatically puts everything in float32
device_type = 'cuda' if torch.cuda.is_available() else 'cpu' # for later use in torch.autocast
ctx = nullcontext() if not self.router_use_full_prec else torch.amp.autocast(device_type=device_type, enabled=False)
with ctx:
B, T, _ = x.size()
num_tokens = B * T
# eq (4) in (https://arxiv.org/abs/1701.06538)
logits = self.w_g(x) # [B, T, n_exp]
if self.use_noisy_top_k:
# optionally add noise into the router
noise = F.softplus(self.w_noise(x))
noise *= torch.randn_like(noise)
logits += noise
# router z loss, computed on logits (before softmax)
# this loss prevents router logits from becoming too large
if self.use_router_z_loss:
z_loss = self.compute_router_z_loss(logits)
MANAGER.add_router_z_loss(z_loss)
# find top k experts for each token
top_k_logits, top_k_indices = logits.topk(self.top_k, dim=-1) # [B, T, k]
# normalize expert probabilities
# Question: should we normalize over all experts or just top-k?
# we choose to normalize over top-k, other option is commented out below
# Shazeer et al (https://arxiv.org/abs/1701.06538) does only topk
# see page 4 eq (3)-(5), the code for this is commented out below
router_probs = torch.full_like(logits, float('-inf')) # [B, T, n_exp]
router_probs.scatter_(-1, top_k_indices, top_k_logits)
router_probs = F.softmax(router_probs, dim=-1)
# # normalize all router logits (not just top-k) via softmax
# router_probs = F.softmax(logits, dim=-1)
# compute auxiliary load balancing loss
# this loss encourages equal probability assigned to each expert
# and equal load balancing of tokens assigned to each expert
if self.use_aux_loss:
aux_loss = self.compute_aux_loss(router_probs, top_k_indices)
MANAGER.add_aux_loss(aux_loss)
# compute expert capacity
exp_capacity = self.get_capacity(num_tokens)
# make a multi-hot mask of chosen experts, size [B, T, n_exp]
# entries are 0 if expert not chosen and 1 if expert chosen
exp_mask = F.one_hot(top_k_indices, num_classes=self.n_exp) # [B, T, k, n_exp]
exp_mask = exp_mask.view(num_tokens, self.top_k, self.n_exp) # [B * T, k, n_exp]
exp_mask = exp_mask.permute(1, 0, 2) # [k, B * T, n_exp]
# compute cumulative sum of each token over experts, this stores
# the index of each token within the batch of each expert
# NOTE: cumsum should count all top-1 first, top-2 second, etc.
# so that we prioritize top experts when dropping tokens (this is
# done by putting k dimension first for the reshape operation)
exp_rank = exp_mask.reshape(self.top_k * num_tokens, self.n_exp) # [k * B * T, n_exp]
exp_rank = torch.cumsum(exp_rank, dim=0) - 1 # cumulative sum of expert selections [k * B * T, n_exp]
exp_rank = exp_rank.reshape(self.top_k, num_tokens, self.n_exp) # [k, B * T, n_exp]
# mask out (set to zero) entries that go beyond expert capacity
# compute amount of used capacity by taking a sum over mask
exp_mask *= torch.lt(exp_rank, exp_capacity) # [k, B * T, n_exp]
used_capacity = torch.sum(exp_mask, dim=(0, 1)) # [n_exp]
# mask rank to only include tokens that are selected
# perform a sum so each row only contains index of token
# for the expert that is selected in that row
# result is a matrix that contains the position of each token
# in the batch of its corresponding expert
exp_rank = torch.sum(exp_mask * exp_rank, dim=-1) # [k, B * T]
# mask probabilities to only include selected experts
router_probs = router_probs.view(num_tokens, self.n_exp)[None, :] # [1, B * T, n_exp]
exp_weights = exp_mask * router_probs # [k, B * T, n_exp]
# convert rank into one-hot vectors over the available capacity
# stores the position of each token within the capacity of the selected expert
exp_rank_sc = F.one_hot(exp_rank, num_classes=exp_capacity) # [k, B * T, exp_capacity]
# create a vector that stores, for each token, the weight of selected
# experts at token's position in the capacity of that expert
# size of tensor is [B * T, n_exp, exp_capacity]
cb_weight = torch.sum(exp_weights.unsqueeze(3) * exp_rank_sc.unsqueeze(2), dim=0)
sec_mask = cb_weight.bool() # binary mask of selected experts for each token
return used_capacity, cb_weight, sec_mask
def compute_aux_loss(self, expert_probs: torch.Tensor, indices: torch.Tensor):
"""
Computes Switch Transformer auxiliary loss (https://arxiv.org/abs/2101.03961)
See equations (4)-(6) on page 7
"""
# equation (5): compute ratio of tokens allocated to each expert
# total number of tokens is defined as total tokens in batch * k
# (k = 1) for the Switch Transformer
with torch.no_grad():
one_hot_indices = F.one_hot(indices, num_classes=self.n_exp) # [B, T, k, n_exp]
one_hot_indices = torch.sum(one_hot_indices.float(), dim=2) # [B, T, n_exp] (sum over k dimension)
tokens_per_expert = torch.mean(one_hot_indices.float(), dim=(0, 1))
# equation (6): compute ratio of router probability allocated to each expert
prob_per_expert = torch.mean(expert_probs.float(), dim=(0, 1))
# equation (4): take a scaled dot product between prob/token allocation vectors
# multiply the result by the number of experts
return self.n_exp * torch.sum(prob_per_expert * tokens_per_expert)
def compute_router_z_loss(self, logits: torch.Tensor):
"""
Computes ST-MoE router z loss (https://arxiv.org/abs/2202.08906)
See equation (5) on page 7
"""
# exponentiate logits, sum logits of each expert, take log, and square
# code below is the same as:
# > z_loss = torch.exp(logits)
# > z_loss = torch.sum(z_loss, dim=-1)
# > z_loss = torch.log(z_loss) ** 2.0
z_loss = torch.logsumexp(logits, dim=-1) ** 2.0 # [B, T]
# sum over all tokens and divide by total number of tokens
return torch.mean(z_loss)
def get_capacity(self, tokens_per_batch):
# expert capacity is given by (tokens_per_batch / num_experts) * capacity_factor
# see eq (3) in Switch Transformer (https://arxiv.org/abs/2101.03961)
capacity_factor = self.train_capacity if self.training else self.eval_capacity
capacity = math.floor(self.top_k * capacity_factor * tokens_per_batch / self.n_exp)
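# e.g. (illustrative numbers): top_k=2, capacity_factor=1.25, 4096 tokens, 8 experts
# -> capacity = floor(2 * 1.25 * 4096 / 8) = 1280 slots per expert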
capacity += capacity % 2 # make sure capacity is an even number
capacity = max(capacity, self.min_capacity) # use min capacity
assert capacity > 0
return int(capacity)
class MLPExperts(nn.Module):
"""
implementation of multiple MLP-based experts that can process input
in batch -- based upon ColossalAI OpenMoE but simple, has optional bias, and
uses a bmm instead of a loop over a mm for each expert to improve efficiency
link: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/moe/experts.py
"""
def __init__(self, config):
# TODO: add param init
super().__init__()
self.bias = config.bias
self.c_fc = nn.Parameter(torch.empty(config.n_exp, config.n_embd, 4 * config.n_embd))
self.c_proj = nn.Parameter(torch.empty(config.n_exp, 4 * config.n_embd, config.n_embd))
self.fc_bias = nn.Parameter(torch.empty(config.n_exp, 1, 4 * config.n_embd)) if self.bias else None
self.proj_bias = nn.Parameter(torch.empty(config.n_exp, 1, config.n_embd)) if self.bias else None
self.gelu = nn.GELU()
self.dropout = nn.Dropout(config.dropout)
def forward(self, x):
x = torch.bmm(x, self.c_fc)
if self.bias:
x += self.fc_bias
x = self.gelu(x)
x = torch.bmm(x, self.c_proj)
if self.bias:
x += self.proj_bias
x = self.dropout(x)
return x
class MOELayer(nn.Module):
def __init__(self, config):
super().__init__()
self.router = Router(config) # (noisy) top k router
self.experts = MLPExperts(config) # group of MLPs (experts)
def forward(self, x: torch.Tensor):
B, T, n_embd = x.size() # track original shape of input
num_tokens = (B * T)
# pass each token through the router
used_capacity, exp_weight, exp_mask = self.router(x)
# flatten out the input
x = x.view(num_tokens, n_embd)
# reshape tokens into batches for each expert
# [n_exp, exp_capacity, B * T] * [B * T, n_embd] -> [n_exp, exp_capacity, n_embd]
exp_batches = exp_mask.permute(1, 2, 0).type_as(x) @ x
# compute expert output
exp_out = self.experts(exp_batches) # [n_exp, exp_capacity, n_embd]
# aggregate expert outputs based on router weights
# eq (2) on page 4 of ST-MoE (https://arxiv.org/abs/2202.08906)
# similar equations are used for other MoE papers
exp_weight = exp_weight.view(num_tokens, -1) # [B * T, n_exp * exp_capacity]
exp_out = exp_out.view(-1, n_embd) # [n_exp * exp_capacity, n_embd]
output = exp_weight @ exp_out # [B * T, n_embd]
# resize output before return
return output.view(B, T, n_embd)
class Block(nn.Module):
def __init__(self, config, use_moe=False):
super().__init__()
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
self.attn = CausalSelfAttention(config)
self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
if use_moe:
self.mlp = MOELayer(config)
else:
self.mlp = MLP(config)
def forward(self, x):
x = x + self.attn(self.ln_1(x))
x = x + self.mlp(self.ln_2(x))
return x
class GPT(nn.Module):
def __init__(self, config):
super().__init__()
assert config.vocab_size is not None
assert config.block_size is not None
self.config = config
if config.n_exp == 1:
# create normal transformer blocks
blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
else:
# create transformer blocks, placing an MoE block every <stride> layers
blocks = []
for i in range(config.n_layer):
# TODO: how to implement this?
# should we change below to i + 1 ?
use_moe = (i % config.stride) == 0
blocks.append(Block(config, use_moe=use_moe))
blocks = nn.ModuleList(blocks)
self.transformer = nn.ModuleDict(dict(
wte = nn.Embedding(config.vocab_size, config.n_embd),
wpe = nn.Embedding(config.block_size, config.n_embd),
drop = nn.Dropout(config.dropout),
h = blocks,
ln_f = LayerNorm(config.n_embd, bias=config.bias),
))
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
# with weight tying when using torch.compile() some warnings get generated:
# "UserWarning: functional_call was passed multiple values for tied weights.
# This behavior is deprecated and will be an error in future versions"
# not 100% sure what this is, so far seems to be harmless. TODO investigate
self.transformer.wte.weight = self.lm_head.weight # weight tying
# init all weights
# optionally use switch transformer special init scheme for experts
# See pg. 10 here: https://arxiv.org/abs/2101.03961
self.apply(self._init_weights)
# apply special scaled init to the residual projections, per GPT-2 paper
for pn, p in self.named_parameters():
if pn.endswith('c_proj.weight') or pn.endswith('experts.c_proj'):
torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
# report number of parameters
print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
def get_num_params(self, non_embedding=True):
"""
Return the number of parameters in the model.
For non-embedding count (default), the position embeddings get subtracted.
The token embeddings would too, except due to the parameter sharing these
params are actually used as weights in the final layer, so we include them.
"""
n_params = sum(p.numel() for p in self.parameters())
if non_embedding:
n_params -= self.transformer.wpe.weight.numel()
return n_params
@torch.no_grad()
def _init_weights(self, module):
# optionally use switch transformer-style initialization
# see page 10 for switch init explanation: https://arxiv.org/abs/2101.03961
if isinstance(module, nn.Linear):
if self.config.use_switch_tfm_init:
scale = self.config.switch_tfm_init_scale
# linear layers have flipped dimensions in torch
# size of weights is [out_dim, in_dim]
w_fan_in = module.weight.shape[-1]
w_std = (scale / w_fan_in) ** 0.5
torch.nn.init.trunc_normal_(
module.weight,
mean=0.0,
std=w_std,
a=-2*w_std,
b=2*w_std,
)
else:
# perform standard (normal) initialization of weights
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
# always initialize bias to zero
if module.bias is not None:
torch.nn.init.zeros_(module.bias)
elif isinstance(module, MLPExperts):
# we have to init expert weights manually because
# nn.Parameter is not a type of module in torch
if self.config.use_switch_tfm_init:
scale = self.config.switch_tfm_init_scale
c_fc_fan_in = module.c_fc.shape[-2]
c_fc_std = (scale / c_fc_fan_in) ** 0.5
torch.nn.init.trunc_normal_(
module.c_fc,
mean=0.0,
std=c_fc_std,
a=-2*c_fc_std,
b=2*c_fc_std,
)
c_proj_fan_in = module.c_proj.shape[-2]
c_proj_std = (scale / c_proj_fan_in) ** 0.5
torch.nn.init.trunc_normal_(
module.c_proj,
mean=0.0,
std=c_proj_std,
a=-2*c_proj_std,
b=2*c_proj_std,
)
else:
# perform standard (normal) initialization of weights
torch.nn.init.normal_(module.c_fc, mean=0.0, std=0.02)
torch.nn.init.normal_(module.c_proj, mean=0.0, std=0.02)
# bias is always initialized to zero
if module.fc_bias is not None:
torch.nn.init.zeros_(module.fc_bias)
torch.nn.init.zeros_(module.proj_bias)
elif isinstance(module, nn.Embedding):
# just use standard initialization scheme for embedding always
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
# def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0):
# model_dim = self.config.n_embd
# ddp, rank, local_rank, world_size = get_dist_info()
# # Separate out all parameters into groups
# # ========== MOE MODIFICATION START ==========
# # Separate MoE expert parameters (3D) from regular matrix parameters (2D)
# # Muon optimizer only accepts 2D parameters, so we need to filter out 3D (MoE experts) and 1D (bias/norm) params
# matrix_params = []
# moe_params = [] # MoE expert parameters are 3D and need AdamW optimizer
# other_params = [] # 1D parameters (bias, norm weights) also go to AdamW
# for param in self.transformer.h.parameters():
# if param.ndim == 3: # MoE expert parameters: [n_exp, ...]
# moe_params.append(param)
# elif param.ndim == 2: # Regular 2D matrix parameters for Muon
# matrix_params.append(param)
# else: # 1D parameters (bias, norm weights) go to AdamW
# other_params.append(param)
# # ========== MOE MODIFICATION END ==========
# embedding_params = list(self.transformer.wte.parameters())
# lm_head_params = list(self.lm_head.parameters())
# # Create the AdamW optimizer for the embedding, lm_head, and MoE experts
# # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model)
# dmodel_lr_scale = (model_dim / 768) ** -0.5
# if rank == 0:
# print(f"Scaling the LR for the AdamW parameters ∝1/√({model_dim}/768) = {dmodel_lr_scale:.6f}")
# if moe_params:
# print(f"Found {len(moe_params)} MoE expert parameters (3D) to optimize with AdamW")
# adam_groups = [
# dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale),
# dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale),
# ]
# # ========== MOE MODIFICATION START ==========
# # Add MoE expert parameters to AdamW optimizer (use matrix_lr for consistency)
# if moe_params:
# adam_groups.append(dict(params=moe_params, lr=matrix_lr * dmodel_lr_scale))
# # ========== MOE MODIFICATION END ==========
# adamw_kwargs = dict(betas=(0.8, 0.95), eps=1e-10, weight_decay=weight_decay)
# AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True)
# adamw_optimizer = AdamWFactory(adam_groups, **adamw_kwargs)
# # Create the Muon optimizer for the 2D linear layers only
# muon_kwargs = dict(lr=matrix_lr, momentum=0.95)
# MuonFactory = DistMuon if ddp else Muon
# muon_optimizer = MuonFactory(matrix_params, **muon_kwargs)
# # Combine them the two optimizers into one list
# optimizers = [adamw_optimizer, muon_optimizer]
# for opt in optimizers:
# for group in opt.param_groups:
# group["initial_lr"] = group["lr"]
# return optimizers
def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
# TODO: add expert config
# start with all of the candidate parameters
param_dict = {pn: p for pn, p in self.named_parameters()}
# filter out those that do not require grad
param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
# create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
# i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
# add an extra check for "bias" string to account for bias terms in MoE layers
decay_params = [p for n, p in param_dict.items() if (p.dim() >= 2 and not n.endswith('bias'))]
nodecay_params = [p for n, p in param_dict.items() if (p.dim() < 2 or n.endswith('bias'))]
optim_groups = [
{'params': decay_params, 'weight_decay': weight_decay},
{'params': nodecay_params, 'weight_decay': 0.0}
]
num_decay_params = sum(p.numel() for p in decay_params)
num_nodecay_params = sum(p.numel() for p in nodecay_params)
print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
# Create AdamW optimizer and use the fused version if it is available
fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
use_fused = fused_available and device_type == 'cuda'
extra_args = dict(fused=True) if use_fused else dict()
optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
print(f"using fused AdamW: {use_fused}")
return optimizer
def forward(self, idx, targets=None):
device = idx.device
b, t = idx.size()
assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
# forward the GPT model itself
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
x = self.transformer.drop(tok_emb + pos_emb)
for block in self.transformer.h:
x = block(x)
x = self.transformer.ln_f(x)
if targets is not None:
# if we are given some desired targets also calculate the loss
logits = self.lm_head(x)
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
# add the auxiliary load balancing loss and router z loss to the main loss
if self.config.n_exp > 1 and self.config.use_aux_loss:
loss += self.config.aux_loss_weight * MANAGER.aggregate_aux_loss()
MANAGER.reset_aux_loss()
if self.config.n_exp > 1 and self.config.use_router_z_loss:
loss += self.config.router_z_loss_weight * MANAGER.aggregate_router_z_loss()
MANAGER.reset_router_z_loss()
else:
# inference-time mini-optimization: only forward the lm_head on the very last position
logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
loss = None
return logits, loss
def crop_block_size(self, block_size):
# model surgery to decrease the block size if necessary
# e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
# but want to use a smaller block size for some smaller, simpler model
assert block_size <= self.config.block_size
self.config.block_size = block_size
self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
for block in self.transformer.h:
if hasattr(block.attn, 'bias'):
block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
@classmethod
def from_pretrained(cls, model_type, override_args=None):
assert not 'moe' in model_type, "Pretrained checkpoints not available for MoE"
assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
override_args = override_args or {} # default to empty dict
# only dropout can be overridden see more notes below
assert all(k == 'dropout' for k in override_args)
from transformers import GPT2LMHeadModel
print("loading weights from pretrained gpt: %s" % model_type)
# n_layer, n_head and n_embd are determined from model_type
config_args = {
'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
}[model_type]
print("forcing vocab_size=50257, block_size=1024, bias=True")
config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
config_args['bias'] = True # always True for GPT model checkpoints
# we can override the dropout rate, if desired
if 'dropout' in override_args:
print(f"overriding dropout rate to {override_args['dropout']}")
config_args['dropout'] = override_args['dropout']
# create a from-scratch initialized minGPT model
config = GPTConfig(**config_args)
model = GPT(config)
sd = model.state_dict()
sd_keys = sd.keys()
sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
# init a huggingface/transformers model
model_hf = GPT2LMHeadModel.from_pretrained(model_type)
sd_hf = model_hf.state_dict()
# copy while ensuring all of the parameters are aligned and match in names and shapes
sd_keys_hf = sd_hf.keys()
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
# basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
# this means that we have to transpose these weights when we import them
assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
for k in sd_keys_hf:
if any(k.endswith(w) for w in transposed):
# special treatment for the Conv1D weights we need to transpose
assert sd_hf[k].shape[::-1] == sd[k].shape
with torch.no_grad():
sd[k].copy_(sd_hf[k].t())
else:
# vanilla copy over the other parameters
assert sd_hf[k].shape == sd[k].shape
with torch.no_grad():
sd[k].copy_(sd_hf[k])
return model
def estimate_mfu(self, fwdbwd_per_iter, dt):
""" estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
# first estimate the number of flops we do per iteration.
# see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
N = self.get_num_params()
cfg = self.config
L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
flops_per_token = 6*N + 12*L*H*Q*T
flops_per_fwdbwd = flops_per_token * T
flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
# express our flops throughput as ratio of A100 bfloat16 peak flops
flops_achieved = flops_per_iter * (1.0/dt) # per second
flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
mfu = flops_achieved / flops_promised
return mfu
@torch.no_grad()
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
"""
Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
the sequence max_new_tokens times, feeding the predictions back into the model each time.
Most likely you'll want to make sure to be in model.eval() mode of operation for this.
"""
for _ in range(max_new_tokens):
# if the sequence context is growing too long we must crop it at block_size
idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
# forward the model to get the logits for the index in the sequence
logits, _ = self(idx_cond)
# pluck the logits at the final step and scale by desired temperature
logits = logits[:, -1, :] / temperature
# optionally crop the logits to only the top k options
if top_k is not None:
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
logits[logits < v[:, [-1]]] = -float('Inf')
# apply softmax to convert logits to (normalized) probabilities
probs = F.softmax(logits, dim=-1)
# sample from the distribution
idx_next = torch.multinomial(probs, num_samples=1)
# append sampled index to the running sequence and continue
idx = torch.cat((idx, idx_next), dim=1)
return idx
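# --- smoke-test sketch (not part of the training pipeline; hypothetical tiny dimensions):
# build a small MoE GPT, run a forward/backward pass, and check that the auxiliary losses
# are folded into the scalar loss without error.
if __name__ == "__main__":
    cfg = GPTConfig(block_size=32, vocab_size=128, n_layer=2, n_head=2, n_embd=32,
                    n_exp=4, top_k=2, stride=2)
    model = GPT(cfg)
    idx = torch.randint(0, cfg.vocab_size, (2, 16))
    targets = torch.randint(0, cfg.vocab_size, (2, 16))
    logits, loss = model(idx, targets)  # loss includes aux + router z losses (MoE block at layer 0)
    loss.backward()
    print(f"logits shape: {tuple(logits.shape)}, loss: {loss.item():.3f}")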

8
nanochat-align/logo.svg Normal file
View File

@ -0,0 +1,8 @@
<svg width="400" height="400" xmlns="http://www.w3.org/2000/svg">
<defs>
<radialGradient id="g" cx="50%" cy="50%">
<stop offset="0%" style="stop-color:#667eea;stop-opacity:1"></stop>
<stop offset="100%" style="stop-color:#764ba2;stop-opacity:0.3"></stop>
</radialGradient>
</defs>
<path d="M315.9110991546882,231.0582854123025 L228.97777478867204,207.7645713530756 M284.8528137423857,284.8528137423857 L221.21320343559643,221.21320343559643 M231.0582854123025,315.9110991546882 L207.7645713530756,228.97777478867204 M168.94171458769753,315.9110991546882 L192.2354286469244,228.97777478867204 M115.14718625761431,284.8528137423857 L178.78679656440357,221.21320343559643 M84.08890084531181,231.05828541230252 L171.02222521132796,207.76457135307564 M84.0889008453118,168.9417145876975 L171.02222521132796,192.2354286469244 M115.14718625761425,115.14718625761435 L178.78679656440357,178.78679656440357 M168.94171458769753,84.0889008453118 L192.2354286469244,171.02222521132796 M231.05828541230244,84.08890084531178 L207.7645713530756,171.02222521132794 M284.8528137423857,115.14718625761428 L221.21320343559643,178.78679656440357 M315.91109915468815,168.94171458769742 L228.97777478867204,192.23542864692436 " stroke="url(#g)" stroke-width="6" stroke-linecap="round" fill="none"></path><path d="M200,-12 L212,0 L200,12 L188,0 Z" transform="translate(0,200)" fill="#000"></path></svg>


102
nanochat-align/loss_eval.py Normal file
View File

@ -0,0 +1,102 @@
"""
A number of functions that help with evaluating a base model.
"""
import math
import torch
import torch.distributed as dist
@torch.no_grad()
def evaluate_bpb(model, batches, steps, token_bytes=None):
"""
Instead of the naive 'mean loss', this function returns the bits per byte (bpb),
which is a tokenization vocab size-independent metric, meaning you are still comparing
apples:apples if you change the vocab size. The way this works is that instead of just
calculating the average loss as usual, you calculate the sum loss, and independently
also the sum bytes (of all the target tokens), and divide. This normalizes the loss by
the number of bytes that the target tokens represent.
The added complexity is so that:
1) All "normal" tokens are normalized by the length of the token in bytes
2) No special tokens (e.g. <|bos|>) are included in the metric - they are masked out.
3) No actively masked tokens (using ignore_index of e.g. -1) are included in the metric.
In addition to evaluate_loss, we need the token_bytes tensor:
It is a 1D tensor of shape (vocab_size,), indicating the number of bytes for
each token id, or 0 if the token is to not be counted (e.g. special tokens).
If token_bytes is None (e.g., when using .bin data with tiktoken), we fall back to
simple mean loss calculation (which is equivalent to bits per token, not bits per byte).
"""
# Get device from model
device = next(model.parameters()).device
# record the losses
total_nats = torch.tensor(0.0, dtype=torch.float32, device=device)
total_bytes = torch.tensor(0, dtype=torch.int64, device=device)
total_tokens = torch.tensor(0, dtype=torch.int64, device=device)
batch_iter = iter(batches)
for _ in range(steps):
x, y = next(batch_iter)
# Model returns (logits, loss) tuple
logits, loss = model(x, y)
# Calculate per-token loss from logits for bpb calculation
loss2d = torch.nn.functional.cross_entropy(
logits.view(-1, logits.size(-1)),
y.view(-1),
ignore_index=-1,
reduction='none'
).view(y.shape)
loss2d = loss2d.view(-1) # flatten
y = y.view(-1) # flatten
if token_bytes is None:
# Fallback: simple token counting (for .bin data with tiktoken)
# Count valid tokens (not ignored)
valid = y >= 0
total_nats += (loss2d * valid.float()).sum()
total_tokens += valid.sum()
else:
# Original bpb calculation with token_bytes
if (y.int() < 0).any(): # mps does not currently have kernel for < 0 for int64, only int32
# slightly more complex code path if some target tokens are ignore_index (e.g. -1)
# any target token < 0 is to be ignored: do NOT index token_bytes with negatives
valid = y >= 0
y_safe = torch.where(valid, y, torch.zeros_like(y))
# map valid targets to their byte length; ignored targets contribute 0 bytes
num_bytes2d = torch.where(
valid,
token_bytes[y_safe],
torch.zeros_like(y, dtype=token_bytes.dtype)
)
total_nats += (loss2d * (num_bytes2d > 0)).sum()
total_bytes += num_bytes2d.sum()
else:
# fast path: no ignored targets, safe to index directly
num_bytes2d = token_bytes[y]
total_nats += (loss2d * (num_bytes2d > 0)).sum()
total_bytes += num_bytes2d.sum()
# sum reduce across all ranks
world_size = dist.get_world_size() if dist.is_initialized() else 1
if world_size > 1:
dist.all_reduce(total_nats, op=dist.ReduceOp.SUM)
dist.all_reduce(total_bytes, op=dist.ReduceOp.SUM)
dist.all_reduce(total_tokens, op=dist.ReduceOp.SUM)
# move both to cpu, calculate bpb and return
total_nats = total_nats.item()
total_bytes = total_bytes.item()
total_tokens = total_tokens.item()
if token_bytes is None:
# Return mean loss (bits per token) when token_bytes not available
if total_tokens == 0:
return float('inf')
return total_nats / (math.log(2) * total_tokens)
else:
# Return bits per byte
if total_bytes == 0:
return float('inf')
bpb = total_nats / (math.log(2) * total_bytes)
return bpb
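# --- usage sketch (hypothetical toy shapes, not part of the evaluation pipeline): any module
# that maps (x, y) -> (logits, loss) works, and token_bytes assigns a byte length per token id
# (0 marks tokens to exclude, e.g. special tokens).
if __name__ == "__main__":
    torch.manual_seed(0)
    vocab_size, B, T = 16, 2, 8

    class DummyModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.emb = torch.nn.Embedding(vocab_size, vocab_size)
        def forward(self, x, y):
            logits = self.emb(x)  # (B, T, vocab_size)
            loss = torch.nn.functional.cross_entropy(
                logits.view(-1, vocab_size), y.view(-1), ignore_index=-1)
            return logits, loss

    batches = [(torch.randint(0, vocab_size, (B, T)), torch.randint(0, vocab_size, (B, T)))
               for _ in range(4)]
    token_bytes = torch.full((vocab_size,), 2, dtype=torch.long)  # pretend every token is 2 bytes...
    token_bytes[0] = 0                                            # ...except a "special" token 0
    print(f"bpb: {evaluate_bpb(DummyModel(), batches, steps=4, token_bytes=token_bytes):.3f}")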

30
nanochat-align/manager.py Normal file
View File

@ -0,0 +1,30 @@
class MOEManager:
"""
basic wrapper class for tracking, storing, and aggregating auxiliary
losses across multiple MoE layers in the model
"""
def __init__(self):
self.aux_loss = []
self.router_z_loss = []
def reset_aux_loss(self):
self.aux_loss = []
def reset_router_z_loss(self):
self.router_z_loss = []
def add_aux_loss(self, loss):
self.aux_loss.append(loss)
def add_router_z_loss(self, loss):
self.router_z_loss.append(loss)
def aggregate_aux_loss(self):
return sum(self.aux_loss)
def aggregate_router_z_loss(self):
return sum(self.router_z_loss)
MANAGER = MOEManager()
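# --- usage sketch (illustrative values): MoE layers push per-layer losses into the shared
# MANAGER during the forward pass, the training loss aggregates them, and the buffers are
# reset so that losses do not leak across iterations.
if __name__ == "__main__":
    MANAGER.add_aux_loss(0.1)
    MANAGER.add_aux_loss(0.2)
    MANAGER.add_router_z_loss(0.05)
    print(MANAGER.aggregate_aux_loss())       # ~0.3 (summed over MoE layers)
    print(MANAGER.aggregate_router_z_loss())  # 0.05
    MANAGER.reset_aux_loss()
    MANAGER.reset_router_z_loss()
    print(MANAGER.aggregate_aux_loss())       # 0 (empty list sums to 0)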

187
nanochat-align/muon.py Normal file
View File

@ -0,0 +1,187 @@
"""
Muon optimizer from Keller et al.
Also a lot of borrowing of ideas from modded-nanogpt.
"""
import torch
from torch import Tensor
import torch.distributed as dist
@torch.compile
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
"""
Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
zero even beyond the point where the iteration no longer converges all the way to one everywhere
on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
performance at all relative to UV^T, where USV^T = G is the SVD.
"""
assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
a, b, c = (3.4445, -4.7750, 2.0315)
X = G.bfloat16()
if G.size(-2) > G.size(-1):
X = X.mT
# Ensure spectral norm is at most 1
X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
# Perform the NS iterations
for _ in range(steps):
A = X @ X.mT
B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
X = a * X + B @ X
if G.size(-2) > G.size(-1):
X = X.mT
return X
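# --- sanity sketch (illustrative, not part of the optimizer, and not called anywhere): the
# iteration above should map a random matrix to an approximately orthogonal one, i.e. with
# singular values roughly in the [0.5, 1.5] band described in the docstring. Invoking it
# triggers torch.compile, so a working compile backend is assumed.
def _newtonschulz_sanity_check(rows: int = 64, cols: int = 128, steps: int = 5):
    G = torch.randn(rows, cols)
    X = zeropower_via_newtonschulz5(G, steps=steps)
    s = torch.linalg.svdvals(X.float())
    print(f"singular values of the orthogonalized update: min {s.min():.2f}, max {s.max():.2f}")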
class Muon(torch.optim.Optimizer):
"""
Muon - MomentUm Orthogonalized by Newton-schulz
https://kellerjordan.github.io/posts/muon/
Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
the advantage that it can be stably run in bfloat16 on the GPU.
Some warnings:
- This optimizer should not be used for the embedding layer, the final fully connected layer,
or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
- To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
Arguments:
lr: The learning rate used by the internal SGD.
momentum: The momentum used by the internal SGD.
nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
ns_steps: The number of Newton-Schulz iteration steps to use.
"""
def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
params: list[Tensor] = [*params]
param_groups = []
for size in {p.numel() for p in params}:
group = dict(params=[p for p in params if p.numel() == size])
param_groups.append(group)
super().__init__(param_groups, defaults)
@torch.no_grad()
def step(self):
for group in self.param_groups:
params: list[Tensor] = group["params"]
for p in params:
g = p.grad
assert g is not None
state = self.state[p]
if "momentum_buffer" not in state:
state["momentum_buffer"] = torch.zeros_like(g)
buf: Tensor = state["momentum_buffer"]
buf.lerp_(g, 1 - group["momentum"])
g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf
g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
p.add_(g, alpha=-group["lr"] * max(1, p.size(-2) / p.size(-1))**0.5)
class DistMuon(torch.optim.Optimizer):
"""
Muon: SGD-momentum + (optional) Nesterov, then orthogonalize the 2D update via Newton-Schulz,
finally apply aspect-ratio scaled step. Performs its own distributed synchronization:
- reduce_scatter(AVG) for gradient averaging
- all_gather to replicate updated weights
Notes:
* Designed for 2D parameters (e.g., linear/conv kernels reshaped to 2D). Do not use for 0D/1D
params like embeddings or scalars.
* Momentum buffers are maintained only on the 'owner' rank for each parameter (rank chosen
by block-cyclic assignment below). If you checkpoint optimizer state on a single rank,
consolidate states beforehand.
Args:
params: iterable of Tensors
lr: learning rate
momentum: momentum coefficient in [0,1)
nesterov: if True, Nesterov-style update (g <- lerp(g, buf, momentum)); else use buf
ns_steps: number of Newton-Schulz iterations for the orthogonalization
"""
def __init__(self, params, lr: float = 0.02, momentum: float = 0.95,
nesterov: bool = True, ns_steps: int = 5):
defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
params = list(params)
assert all(p.ndim == 2 for p in params), "Muon expects 2D parameters only"
rank = dist.get_rank()
# Group all parameters by their shape
shapes = sorted({p.shape for p in params}) # sort to ensure consistent / deterministic ordering
param_groups = []
for shape in shapes:
group_params = [p for p in params if p.shape == shape]
device, dtype = group_params[0].device, group_params[0].dtype
assert all(p.device == device for p in group_params)
assert all(p.dtype == dtype for p in group_params)
if rank == 0:
print(f"Muon: Grouping {len(group_params)} params of shape {shape}, device {device}, dtype {dtype}")
param_groups.append(dict(params=group_params, zero_buffer=torch.zeros_like(group_params[0])))
super().__init__(param_groups, defaults)
@torch.no_grad()
def step(self):
rank = dist.get_rank()
world_size = dist.get_world_size()
# Ensure all grads exist
assert all(p.grad is not None for group in self.param_groups for p in group["params"]), "All params must have grads"
# Kick off all the reduce scatter operations to average up the gradients across all ranks
all_reduce_futures = []
for group in self.param_groups:
params = group["params"]
zero_buffer = group["zero_buffer"]
# Go through params in groups of world_size.
for base_i in range(0, len(params), world_size):
# The compute owner of each param is rank i % world_size
owner_idx = base_i + rank
# each rank stacks up its chunk of world_size params into a list
rs_input = [p.grad for p in params[base_i:base_i + world_size]]
# pad rs_input with the zero buffer to complete the group
rs_input.extend([zero_buffer] * (world_size - len(rs_input)))
# the output buffer gets strided across the group based on the rank
rs_output = params[owner_idx].grad if owner_idx < len(params) else torch.empty_like(zero_buffer)
# reduce scatter the gradients within this group of world_size params
work = dist.reduce_scatter(rs_output, rs_input, op=dist.ReduceOp.AVG, async_op=True).get_future()
all_reduce_futures.append(work)
# Now each rank computes the update and gathers
future_idx = 0
all_gather_futures = []
for group in self.param_groups:
params = group["params"]
zero_buffer = group["zero_buffer"]
# Go through params in groups of world_size.
for base_i in range(0, len(params), world_size):
# The compute owner of each param is rank i % world_size
owner_idx = base_i + rank # calculate the index of the param that this rank owns
# Wait for the reduce scatter to complete
all_reduce_futures[future_idx].wait() # possibly later we could use wait_any polling instead
future_idx += 1
# Owner computes the Muon update, result is in its param
if owner_idx < len(params):
p = params[owner_idx]
g = p.grad # now averaged across ranks
state = self.state[p]
if "momentum_buffer" not in state:
state["momentum_buffer"] = torch.zeros_like(g)
buf: Tensor = state["momentum_buffer"]
buf.lerp_(g, 1.0 - group["momentum"])
g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf
g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
scale = (max(1.0, p.size(-2) / p.size(-1)) ** 0.5)
p.add_(g, alpha=-group["lr"] * scale)
# Replicate updated parameters to all ranks
ag_input = params[owner_idx] if owner_idx < len(params) else zero_buffer
ag_output = params[base_i:base_i + world_size]
ag_output.extend([torch.empty_like(zero_buffer) for _ in range(world_size - len(ag_output))]) # pad
work = dist.all_gather(ag_output, ag_input, async_op=True).get_future()
all_gather_futures.append(work)
# Wait for all work to finish
torch.futures.collect_all(all_gather_futures).wait()
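# --- small illustration (hypothetical sizes, no actual torch.distributed calls) of the
# block-cyclic ownership used above: parameters are walked in groups of world_size and,
# within each group, rank r owns the parameter at offset r.
if __name__ == "__main__":
    world_size, num_params = 4, 10
    for rank in range(world_size):
        owned = [base_i + rank for base_i in range(0, num_params, world_size)
                 if base_i + rank < num_params]
        print(f"rank {rank} owns params {owned}")
    # rank 0 owns [0, 4, 8]; rank 1 owns [1, 5, 9]; rank 2 owns [2, 6]; rank 3 owns [3, 7]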

408
nanochat-align/report.py Normal file
View File

@ -0,0 +1,408 @@
"""
Utilities for generating training report cards. More messy code than usual, will fix.
"""
import os
import re
import shutil
import subprocess
import socket
import datetime
import platform
import psutil
import torch
def run_command(cmd):
"""Run a shell command and return output, or None if it fails."""
try:
result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=5)
if result.returncode == 0:
return result.stdout.strip()
return None
except:
return None
def get_git_info():
"""Get current git commit, branch, and dirty status."""
info = {}
info['commit'] = run_command("git rev-parse --short HEAD") or "unknown"
info['branch'] = run_command("git rev-parse --abbrev-ref HEAD") or "unknown"
# Check if repo is dirty (has uncommitted changes)
status = run_command("git status --porcelain")
info['dirty'] = bool(status) if status is not None else False
# Get commit message
info['message'] = run_command("git log -1 --pretty=%B") or ""
info['message'] = info['message'].split('\n')[0][:80] # First line, truncated
return info
def get_gpu_info():
"""Get GPU information."""
if not torch.cuda.is_available():
return {"available": False}
num_devices = torch.cuda.device_count()
info = {
"available": True,
"count": num_devices,
"names": [],
"memory_gb": []
}
for i in range(num_devices):
props = torch.cuda.get_device_properties(i)
info["names"].append(props.name)
info["memory_gb"].append(props.total_memory / (1024**3))
# Get CUDA version
info["cuda_version"] = torch.version.cuda or "unknown"
return info
def get_system_info():
"""Get system information."""
info = {}
# Basic system info
info['hostname'] = socket.gethostname()
info['platform'] = platform.system()
info['python_version'] = platform.python_version()
info['torch_version'] = torch.__version__
# CPU and memory
info['cpu_count'] = psutil.cpu_count(logical=False)
info['cpu_count_logical'] = psutil.cpu_count(logical=True)
info['memory_gb'] = psutil.virtual_memory().total / (1024**3)
# User and environment
info['user'] = os.environ.get('USER', 'unknown')
info['nanochat_base_dir'] = os.environ.get('NANOCHAT_BASE_DIR', 'out')
info['working_dir'] = os.getcwd()
return info
def estimate_cost(gpu_info, runtime_hours=None):
"""Estimate training cost based on GPU type and runtime."""
# Rough pricing, from Lambda Cloud
default_rate = 2.0
gpu_hourly_rates = {
"H100": 3.00,
"A100": 1.79,
"V100": 0.55,
}
if not gpu_info.get("available"):
return None
# Try to identify GPU type from name
hourly_rate = None
gpu_name = gpu_info["names"][0] if gpu_info["names"] else "unknown"
for gpu_type, rate in gpu_hourly_rates.items():
if gpu_type in gpu_name:
hourly_rate = rate * gpu_info["count"]
break
if hourly_rate is None:
hourly_rate = default_rate * gpu_info["count"] # Default estimate
return {
"hourly_rate": hourly_rate,
"gpu_type": gpu_name,
"estimated_total": hourly_rate * runtime_hours if runtime_hours else None
}
def generate_header():
"""Generate the header for a training report."""
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
git_info = get_git_info()
gpu_info = get_gpu_info()
sys_info = get_system_info()
cost_info = estimate_cost(gpu_info)
header = f"""# nanochat training report
Generated: {timestamp}
## Environment
### Git Information
- Branch: {git_info['branch']}
- Commit: {git_info['commit']} {"(dirty)" if git_info['dirty'] else "(clean)"}
- Message: {git_info['message']}
### Hardware
- Platform: {sys_info['platform']}
- CPUs: {sys_info['cpu_count']} cores ({sys_info['cpu_count_logical']} logical)
- Memory: {sys_info['memory_gb']:.1f} GB
"""
if gpu_info.get("available"):
gpu_names = ", ".join(set(gpu_info["names"]))
total_vram = sum(gpu_info["memory_gb"])
header += f"""- GPUs: {gpu_info['count']}x {gpu_names}
- GPU Memory: {total_vram:.1f} GB total
- CUDA Version: {gpu_info['cuda_version']}
"""
else:
header += "- GPUs: None available\n"
if cost_info and cost_info["hourly_rate"] > 0:
header += f"""- Hourly Rate: ${cost_info['hourly_rate']:.2f}/hour\n"""
header += f"""
### Software
- Python: {sys_info['python_version']}
- PyTorch: {sys_info['torch_version']}
"""
# bloat metrics: package all of the source code and assess its weight
packaged = run_command('files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --cxml') or "" # guard against the command being unavailable
num_chars = len(packaged)
num_lines = len(packaged.split('\n'))
num_files = len([x for x in packaged.split('\n') if x.startswith('<source>')])
num_tokens = num_chars // 4 # assume approximately 4 chars per token
# count dependencies via uv.lock
uv_lock_lines = 0
if os.path.exists('uv.lock'):
with open('uv.lock', 'r', encoding='utf-8') as f:
uv_lock_lines = len(f.readlines())
header += f"""
### Bloat
- Characters: {num_chars:,}
- Lines: {num_lines:,}
- Files: {num_files:,}
- Tokens (approx): {num_tokens:,}
- Dependencies (uv.lock lines): {uv_lock_lines:,}
"""
return header
# -----------------------------------------------------------------------------
def slugify(text):
"""Slugify a text string."""
return text.lower().replace(" ", "-")
# the expected files and their order
EXPECTED_FILES = [
"tokenizer-training.md",
"tokenizer-evaluation.md",
"base-model-training.md",
"base-model-loss.md",
"base-model-evaluation.md",
"midtraining.md",
"chat-evaluation-mid.md",
"chat-sft.md",
"chat-evaluation-sft.md",
"chat-rl.md",
"chat-evaluation-rl.md",
]
# the metrics we're currently interested in
chat_metrics = ["ARC-Easy", "ARC-Challenge", "MMLU", "GSM8K", "HumanEval", "ChatCORE"]
def extract(section, keys):
"""simple def to extract a single key from a section"""
if not isinstance(keys, list):
keys = [keys] # convenience
out = {}
for line in section.split("\n"):
for key in keys:
if key in line:
out[key] = line.split(":")[1].strip()
return out
def extract_timestamp(content, prefix):
"""Extract timestamp from content with given prefix."""
for line in content.split('\n'):
if line.startswith(prefix):
time_str = line.split(":", 1)[1].strip()
try:
return datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
except ValueError:
pass
return None
class Report:
"""Maintains a bunch of logs, generates a final markdown report."""
def __init__(self, report_dir):
os.makedirs(report_dir, exist_ok=True)
self.report_dir = report_dir
def log(self, section, data):
"""Log a section of data to the report."""
slug = slugify(section)
file_name = f"{slug}.md"
file_path = os.path.join(self.report_dir, file_name)
with open(file_path, "w", encoding="utf-8") as f:
f.write(f"## {section}\n")
f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
for item in data:
if not item:
# skip falsy values like None or empty dict etc.
continue
if isinstance(item, str):
# directly write the string
f.write(item)
else:
# render a dict
for k, v in item.items():
if isinstance(v, float):
vstr = f"{v:.4f}"
elif isinstance(v, int) and v >= 10000:
vstr = f"{v:,.0f}"
else:
vstr = str(v)
f.write(f"- {k}: {vstr}\n")
f.write("\n")
return file_path
def generate(self):
"""Generate the final report."""
report_dir = self.report_dir
report_file = os.path.join(report_dir, "report.md")
print(f"Generating report to {report_file}")
final_metrics = {} # the most important final metrics we'll add as table at the end
start_time = None
end_time = None
with open(report_file, "w", encoding="utf-8") as out_file:
# write the header first
header_file = os.path.join(report_dir, "header.md")
if os.path.exists(header_file):
with open(header_file, "r", encoding="utf-8") as f:
header_content = f.read()
out_file.write(header_content)
start_time = extract_timestamp(header_content, "Run started:")
# capture bloat data for summary later (the stuff after Bloat header and until \n\n)
bloat_data = re.search(r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL)
bloat_data = bloat_data.group(1) if bloat_data else ""
else:
start_time = None # will cause us to not write the total wall clock time
bloat_data = "[bloat data missing]"
print(f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?")
# process all the individual sections
for file_name in EXPECTED_FILES:
section_file = os.path.join(report_dir, file_name)
if not os.path.exists(section_file):
print(f"Warning: {section_file} does not exist, skipping")
continue
with open(section_file, "r", encoding="utf-8") as in_file:
section = in_file.read()
# Extract timestamp from this section (the last section's timestamp will "stick" as end_time)
if "rl" not in file_name:
# Skip RL sections for end_time calculation because RL is experimental
end_time = extract_timestamp(section, "timestamp:")
# extract the most important metrics from the sections
if file_name == "base-model-evaluation.md":
final_metrics["base"] = extract(section, "CORE")
if file_name == "chat-evaluation-mid.md":
final_metrics["mid"] = extract(section, chat_metrics)
if file_name == "chat-evaluation-sft.md":
final_metrics["sft"] = extract(section, chat_metrics)
if file_name == "chat-evaluation-rl.md":
final_metrics["rl"] = extract(section, "GSM8K") # RL only evals GSM8K
# append this section of the report
out_file.write(section)
out_file.write("\n")
# add the final metrics table
out_file.write("## Summary\n\n")
# Copy over the bloat metrics from the header
out_file.write(bloat_data)
out_file.write("\n\n")
# Collect all unique metric names
all_metrics = set()
for stage_metrics in final_metrics.values():
all_metrics.update(stage_metrics.keys())
# Custom ordering: CORE first, ChatCORE last, rest in middle
all_metrics = sorted(all_metrics, key=lambda x: (x != "CORE", x == "ChatCORE", x))
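# e.g. {"MMLU", "CORE", "ChatCORE", "ARC-Easy"} sorts to
# ["CORE", "ARC-Easy", "MMLU", "ChatCORE"] under this key.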
# Fixed column widths
stages = ["base", "mid", "sft", "rl"]
metric_width = 15
value_width = 8
# Write table header
header = f"| {'Metric'.ljust(metric_width)} |"
for stage in stages:
header += f" {stage.upper().ljust(value_width)} |"
out_file.write(header + "\n")
# Write separator
separator = f"|{'-' * (metric_width + 2)}|"
for stage in stages:
separator += f"{'-' * (value_width + 2)}|"
out_file.write(separator + "\n")
# Write table rows
for metric in all_metrics:
row = f"| {metric.ljust(metric_width)} |"
for stage in stages:
value = final_metrics.get(stage, {}).get(metric, "-")
row += f" {str(value).ljust(value_width)} |"
out_file.write(row + "\n")
out_file.write("\n")
# Calculate and write total wall clock time
if start_time and end_time:
duration = end_time - start_time
total_seconds = int(duration.total_seconds())
hours = total_seconds // 3600
minutes = (total_seconds % 3600) // 60
out_file.write(f"Total wall clock time: {hours}h{minutes}m\n")
else:
out_file.write("Total wall clock time: unknown\n")
# also cp the report.md file to current directory
print(f"Copying report.md to current directory for convenience")
shutil.copy(report_file, "report.md")
return report_file
def reset(self):
"""Reset the report."""
# Remove section files
for file_name in EXPECTED_FILES:
file_path = os.path.join(self.report_dir, file_name)
if os.path.exists(file_path):
os.remove(file_path)
# Remove report.md if it exists
report_file = os.path.join(self.report_dir, "report.md")
if os.path.exists(report_file):
os.remove(report_file)
# Generate and write the header section with start timestamp
header_file = os.path.join(self.report_dir, "header.md")
header = generate_header()
start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
with open(header_file, "w", encoding="utf-8") as f:
f.write(header)
f.write(f"Run started: {start_time}\n\n---\n\n")
print(f"Reset report and wrote header to {header_file}")
# -----------------------------------------------------------------------------
# nanochat-specific convenience functions
class DummyReport:
def log(self, *args, **kwargs):
pass
def reset(self, *args, **kwargs):
pass
def get_report():
# just for convenience, only rank 0 logs to report
from nanochat.common import get_base_dir, get_dist_info
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
if ddp_rank == 0:
report_dir = os.path.join(get_base_dir(), "report")
return Report(report_dir)
else:
return DummyReport()
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(description="Generate or reset nanochat training reports.")
parser.add_argument("command", nargs="?", default="generate", choices=["generate", "reset"], help="Operation to perform (default: generate)")
args = parser.parse_args()
if args.command == "generate":
get_report().generate()
elif args.command == "reset":
get_report().reset()

472
nanochat-align/tokenizer.py Normal file
View File

@ -0,0 +1,472 @@
"""
BPE Tokenizer in the style of GPT-4.
Two implementations are available:
1) HuggingFace Tokenizer that can do both training and inference but is really confusing
2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
"""
import os
import copy
from functools import lru_cache
SPECIAL_TOKENS = [
# every document begins with the Beginning of Sequence (BOS) token that delimits documents
"<|bos|>",
# tokens below are only used during finetuning to render Conversations into token ids
"<|user_start|>", # user messages
"<|user_end|>",
"<|assistant_start|>", # assistant messages
"<|assistant_end|>",
"<|python_start|>", # assistant invokes python REPL tool
"<|python_end|>",
"<|output_start|>", # python REPL outputs back to assistant
"<|output_end|>",
]
# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
# I haven't validated that this is actually a good idea, TODO.
SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
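# A minimal illustration of the difference (assumes the third-party `regex` package,
# which supports \p{...} classes and possessive quantifiers):
#   import regex
#   regex.findall(SPLIT_PATTERN, "year 2025")   # -> ['year', ' ', '20', '25']
# whereas the GPT-4 pattern with \p{N}{1,3} would chunk the digits as '202', '5'.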
# -----------------------------------------------------------------------------
# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
from tokenizers import Tokenizer as HFTokenizer
from tokenizers import pre_tokenizers, decoders, Regex
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
class HuggingFaceTokenizer:
"""Light wrapper around HuggingFace Tokenizer for some utilities"""
def __init__(self, tokenizer):
self.tokenizer = tokenizer
@classmethod
def from_pretrained(cls, hf_path):
# init from a HuggingFace pretrained tokenizer (e.g. "gpt2")
tokenizer = HFTokenizer.from_pretrained(hf_path)
return cls(tokenizer)
@classmethod
def from_directory(cls, tokenizer_dir):
# init from a local directory on disk (e.g. "out/tokenizer")
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
tokenizer = HFTokenizer.from_file(tokenizer_path)
return cls(tokenizer)
@classmethod
def train_from_iterator(cls, text_iterator, vocab_size):
# train from an iterator of text
# Configure the HuggingFace Tokenizer
tokenizer = HFTokenizer(BPE(
byte_fallback=True, # needed!
unk_token=None,
fuse_unk=False,
))
# Normalizer: None
tokenizer.normalizer = None
# Pre-tokenizer: GPT-4 style
# the regex pattern used by GPT-4 to split text into groups before BPE
# NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
# very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
# (but I haven't validated this! TODO)
gpt4_split_regex = Regex(SPLIT_PATTERN) # huggingface demands that you wrap it in Regex!!
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
pre_tokenizers.Split(pattern=gpt4_split_regex, behavior="isolated", invert=False),
pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
])
# Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
tokenizer.decoder = decoders.ByteLevel()
# Post-processor: None
tokenizer.post_processor = None
# Trainer: BPE
trainer = BpeTrainer(
vocab_size=vocab_size,
show_progress=True,
min_frequency=0, # no minimum frequency
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
special_tokens=SPECIAL_TOKENS,
)
# Kick off the training
tokenizer.train_from_iterator(text_iterator, trainer)
return cls(tokenizer)
def get_vocab_size(self):
return self.tokenizer.get_vocab_size()
def get_special_tokens(self):
special_tokens_map = self.tokenizer.get_added_tokens_decoder()
special_tokens = [w.content for w in special_tokens_map.values()]
return special_tokens
def id_to_token(self, id):
return self.tokenizer.id_to_token(id)
def _encode_one(self, text, prepend=None, append=None):
# encode a single string
# prepend/append can be either the string form of a special token or a token id directly.
assert isinstance(text, str)
ids = []
if prepend is not None:
prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
ids.append(prepend_id)
ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids)
if append is not None:
append_id = append if isinstance(append, int) else self.encode_special(append)
ids.append(append_id)
return ids
def encode_special(self, text):
# encode a single special token via exact match
return self.tokenizer.token_to_id(text)
def get_bos_token_id(self):
bos = self.encode_special("<|bos|>")
return bos
def encode(self, text, *args, **kwargs):
if isinstance(text, str):
return self._encode_one(text, *args, **kwargs)
elif isinstance(text, list):
return [self._encode_one(t, *args, **kwargs) for t in text]
else:
raise ValueError(f"Invalid input type: {type(text)}")
def __call__(self, *args, **kwargs):
return self.encode(*args, **kwargs)
def decode(self, ids):
return self.tokenizer.decode(ids, skip_special_tokens=False)
def save(self, tokenizer_dir):
# save the tokenizer to disk
os.makedirs(tokenizer_dir, exist_ok=True)
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
self.tokenizer.save(tokenizer_path)
print(f"Saved tokenizer to {tokenizer_path}")
# -----------------------------------------------------------------------------
# Tokenizer based on rustbpe + tiktoken combo
import pickle
import rustbpe
import tiktoken
class RustBPETokenizer:
"""Light wrapper around tiktoken (for efficient inference) but train with rustbpe"""
def __init__(self, enc, bos_token):
self.enc = enc
self.bos_token_id = self.encode_special(bos_token)
@classmethod
def train_from_iterator(cls, text_iterator, vocab_size):
# 1) train using rustbpe
tokenizer = rustbpe.Tokenizer()
# the special tokens are appended below when constructing the tiktoken Encoding; we don't train them with BPE
vocab_size_no_special = vocab_size - len(SPECIAL_TOKENS)
assert vocab_size_no_special >= 256, f"vocab_size_no_special must be at least 256, got {vocab_size_no_special}"
tokenizer.train_from_iterator(text_iterator, vocab_size_no_special, pattern=SPLIT_PATTERN)
# 2) construct the associated tiktoken encoding for inference
pattern = tokenizer.get_pattern()
mergeable_ranks_list = tokenizer.get_mergeable_ranks()
mergeable_ranks = {bytes(k): v for k, v in mergeable_ranks_list}
tokens_offset = len(mergeable_ranks)
special_tokens = {name: tokens_offset + i for i, name in enumerate(SPECIAL_TOKENS)}
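# e.g. (hypothetical size) with vocab_size=65536 and the 9 SPECIAL_TOKENS above,
# rustbpe trains 65527 regular tokens occupying ids 0..65526, and the special
# tokens take the last ids, so "<|bos|>" ends up as id 65527.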
enc = tiktoken.Encoding(
name="rustbpe",
pat_str=pattern,
mergeable_ranks=mergeable_ranks, # dict[bytes, int] (token bytes -> merge priority rank)
special_tokens=special_tokens, # dict[str, int] (special token name -> token id)
)
return cls(enc, "<|bos|>")
@classmethod
def from_directory(cls, tokenizer_dir):
pickle_path = os.path.join(tokenizer_dir, "tokenizer.pkl")
with open(pickle_path, "rb") as f:
enc = pickle.load(f)
return cls(enc, "<|bos|>")
@classmethod
def from_pretrained(cls, tiktoken_name):
# https://github.com/openai/tiktoken/blob/eedc8563/tiktoken_ext/openai_public.py
enc = tiktoken.get_encoding(tiktoken_name)
# tiktoken calls the special document delimiter token "<|endoftext|>"
# yes this is confusing because this token is almost always PREPENDED to the beginning of the document
# it most often is used to signal the start of a new sequence to the LLM during inference etc.
# so in nanochat we always use "<|bos|>" short for "beginning of sequence", but historically it is often called "<|endoftext|>".
return cls(enc, "<|endoftext|>")
def get_vocab_size(self):
return self.enc.n_vocab
def get_special_tokens(self):
return self.enc.special_tokens_set
def id_to_token(self, id):
return self.enc.decode([id])
@lru_cache(maxsize=32)
def encode_special(self, text):
return self.enc.encode_single_token(text)
def get_bos_token_id(self):
return self.bos_token_id
def encode(self, text, prepend=None, append=None, num_threads=8):
# text can be either a string or a list of strings
if prepend is not None:
prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
if append is not None:
append_id = append if isinstance(append, int) else self.encode_special(append)
if isinstance(text, str):
ids = self.enc.encode_ordinary(text)
if prepend is not None:
ids.insert(0, prepend_id) # TODO: slightly inefficient here? :( hmm
if append is not None:
ids.append(append_id)
elif isinstance(text, list):
ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
if prepend is not None:
for ids_row in ids:
ids_row.insert(0, prepend_id) # TODO: same
if append is not None:
for ids_row in ids:
ids_row.append(append_id)
else:
raise ValueError(f"Invalid input type: {type(text)}")
return ids
def __call__(self, *args, **kwargs):
return self.encode(*args, **kwargs)
def decode(self, ids):
return self.enc.decode(ids)
def save(self, tokenizer_dir):
# save the encoding object to disk
os.makedirs(tokenizer_dir, exist_ok=True)
pickle_path = os.path.join(tokenizer_dir, "tokenizer.pkl")
with open(pickle_path, "wb") as f:
pickle.dump(self.enc, f)
print(f"Saved tokenizer encoding to {pickle_path}")
def render_conversation(self, conversation, max_tokens=2048):
"""
Tokenize a single Chat conversation (which we call a "doc" or "document" here).
Returns:
- ids: list[int] is a list of token ids of this rendered conversation
- mask: list[int] of same length, mask = 1 for tokens that the Assistant is expected to train on.
"""
# ids, masks that we will return and a helper function to help build them up.
ids, mask = [], []
def add_tokens(token_ids, mask_val):
if isinstance(token_ids, int):
token_ids = [token_ids]
ids.extend(token_ids)
mask.extend([mask_val] * len(token_ids))
# sometimes the first message is a system message...
# => just merge it with the second (user) message
if conversation["messages"][0]["role"] == "system":
# some conversation surgery is necessary here for now...
conversation = copy.deepcopy(conversation) # avoid mutating the original
messages = conversation["messages"]
assert messages[1]["role"] == "user", "System message must be followed by a user message"
messages[1]["content"] = messages[0]["content"] + "\n\n" + messages[1]["content"]
messages = messages[1:]
else:
messages = conversation["messages"]
assert len(messages) >= 1, f"Conversation must have at least 1 message: {messages}"
# fetch all the special tokens we need
bos = self.get_bos_token_id()
user_start, user_end = self.encode_special("<|user_start|>"), self.encode_special("<|user_end|>")
assistant_start, assistant_end = self.encode_special("<|assistant_start|>"), self.encode_special("<|assistant_end|>")
python_start, python_end = self.encode_special("<|python_start|>"), self.encode_special("<|python_end|>")
output_start, output_end = self.encode_special("<|output_start|>"), self.encode_special("<|output_end|>")
# now we can tokenize the conversation
add_tokens(bos, 0)
for i, message in enumerate(messages):
# some sanity checking here around assumptions, to prevent footguns
must_be_from = "user" if i % 2 == 0 else "assistant"
assert message["role"] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}"
# content can be either a simple string or a list of parts (e.g. containing tool calls)
content = message["content"]
if message["role"] == "user":
assert isinstance(content, str), "User messages are simply expected to be strings"
value_ids = self.encode(content)
add_tokens(user_start, 0)
add_tokens(value_ids, 0)
add_tokens(user_end, 0)
elif message["role"] == "assistant":
add_tokens(assistant_start, 0)
if isinstance(content, str):
# simple string => simply add the tokens
value_ids = self.encode(content)
add_tokens(value_ids, 1)
elif isinstance(content, list):
for part in content:
value_ids = self.encode(part["text"])
if part["type"] == "text":
# string part => simply add the tokens
add_tokens(value_ids, 1)
elif part["type"] == "python":
# python tool call => add the tokens inside <|python_start|> and <|python_end|>
add_tokens(python_start, 1)
add_tokens(value_ids, 1)
add_tokens(python_end, 1)
elif part["type"] == "python_output":
# python output => add the tokens inside <|output_start|> and <|output_end|>
# none of these tokens are supervised because the tokens come from Python at test time
add_tokens(output_start, 0)
add_tokens(value_ids, 0)
add_tokens(output_end, 0)
else:
raise ValueError(f"Unknown part type: {part['type']}")
else:
raise ValueError(f"Unknown content type: {type(content)}")
add_tokens(assistant_end, 1)
# truncate to max_tokens tokens MAX (helps prevent OOMs)
ids = ids[:max_tokens]
mask = mask[:max_tokens]
return ids, mask
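# Illustrative sketch of the masking behavior (token ids shown symbolically):
#   conversation = {"messages": [
#       {"role": "user", "content": "Hi"},
#       {"role": "assistant", "content": "Hello!"},
#   ]}
#   ids  ~ <|bos|> <|user_start|> "Hi" <|user_end|> <|assistant_start|> "Hello!" <|assistant_end|>
#   mask ~    0          0          0       0                0             1            1
# i.e. only the assistant's own tokens (and its end token) are supervised.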
def visualize_tokenization(self, ids, mask, with_token_id=False):
"""Small helper function useful in debugging: visualize the tokenization of render_conversation"""
RED = '\033[91m'
GREEN = '\033[92m'
RESET = '\033[0m'
GRAY = '\033[90m'
tokens = []
for i, (token_id, mask_val) in enumerate(zip(ids, mask)):
token_str = self.decode([token_id])
color = GREEN if mask_val == 1 else RED
tokens.append(f"{color}{token_str}{RESET}")
if with_token_id:
tokens.append(f"{GRAY}({token_id}){RESET}")
return '|'.join(tokens)
def render_for_completion(self, conversation):
"""
Used during Reinforcement Learning. In that setting, we want to
render the conversation priming the Assistant for a completion.
Unlike the Chat SFT case, we don't need to return the mask.
"""
# We have some surgery to do: we need to pop the last message (of the Assistant)
conversation = copy.deepcopy(conversation) # avoid mutating the original
messages = conversation["messages"]
assert messages[-1]["role"] == "assistant", "Last message must be from the Assistant"
messages.pop() # remove the last message (of the Assistant) inplace
# Now tokenize the conversation
ids, mask = self.render_conversation(conversation)
# Finally, to prime the Assistant for a completion, append the Assistant start token
assistant_start = self.encode_special("<|assistant_start|>")
ids.append(assistant_start)
return ids
# -----------------------------------------------------------------------------
# Tiktoken GPT-2 Tokenizer (for compatibility with nanoMoE)
class TiktokenGPT2Tokenizer:
"""Wrapper around tiktoken GPT-2 tokenizer for compatibility with nanoMoE"""
def __init__(self, enc=None):
import tiktoken
self.enc = enc if enc is not None else tiktoken.get_encoding("gpt2")
# GPT-2 uses 50256 as EOT token, which we can use as BOS
self.bos_token_id = self.enc.eot_token # 50256
@classmethod
def from_pretrained(cls, encoding_name="gpt2"):
import tiktoken
enc = tiktoken.get_encoding(encoding_name)
return cls(enc)
def get_vocab_size(self):
return self.enc.n_vocab
def get_special_tokens(self):
# GPT-2 doesn't have special tokens in the same way, but has EOT
return []
def id_to_token(self, id):
try:
return self.enc.decode([id])
except Exception:
return f"<unk_{id}>"
def encode_special(self, text):
# GPT-2 doesn't have special tokens, return EOT token as fallback
return self.enc.eot_token
def get_bos_token_id(self):
return self.bos_token_id
def encode(self, text, prepend=None, append=None, num_threads=1):
"""
Encode text using tiktoken GPT-2 tokenizer.
prepend/append can be token ids or None.
"""
if isinstance(text, str):
ids = self.enc.encode_ordinary(text) # encode_ordinary ignores special tokens
if prepend is not None:
prepend_id = prepend if isinstance(prepend, int) else self.get_bos_token_id()
ids = [prepend_id] + ids
if append is not None:
append_id = append if isinstance(append, int) else self.enc.eot_token
ids.append(append_id)
return ids
elif isinstance(text, list):
return [self.encode(t, prepend=prepend, append=append, num_threads=num_threads) for t in text]
else:
raise ValueError(f"Invalid input type: {type(text)}")
def __call__(self, *args, **kwargs):
return self.encode(*args, **kwargs)
def decode(self, ids):
if isinstance(ids, list) and len(ids) > 0 and isinstance(ids[0], list):
# List of lists
return [self.enc.decode(seq) for seq in ids]
else:
# Single sequence
return self.enc.decode(ids)
# -----------------------------------------------------------------------------
# nanochat-specific convenience functions
def get_tokenizer(use_tiktoken_gpt2=False):
"""
Get tokenizer. If use_tiktoken_gpt2=True, returns TiktokenGPT2Tokenizer for nanoMoE compatibility.
Otherwise returns the default nanochat tokenizer.
"""
if use_tiktoken_gpt2:
return TiktokenGPT2Tokenizer.from_pretrained("gpt2")
from nanochat.common import get_base_dir
base_dir = get_base_dir()
tokenizer_dir = os.path.join(base_dir, "tokenizer")
# return HuggingFaceTokenizer.from_directory(tokenizer_dir)
return RustBPETokenizer.from_directory(tokenizer_dir)
def get_token_bytes(device="cpu"):
import torch
from nanochat.common import get_base_dir
base_dir = get_base_dir()
tokenizer_dir = os.path.join(base_dir, "tokenizer")
token_bytes_path = os.path.join(tokenizer_dir, "token_bytes.pt")
assert os.path.exists(token_bytes_path), f"Token bytes not found at {token_bytes_path}? It gets written by tok_train.py"
with open(token_bytes_path, "rb") as f:
token_bytes = torch.load(f, map_location=device)
return token_bytes

562
nanochat-align/ui.html Normal file
View File

@ -0,0 +1,562 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
<title>NanoChat</title>
<link rel="icon" type="image/svg+xml" href="/logo.svg">
<style>
:root {
color-scheme: light;
}
* {
box-sizing: border-box;
}
body {
font-family: ui-sans-serif, -apple-system, system-ui, "Segoe UI", Helvetica, "Apple Color Emoji", Arial, sans-serif, "Segoe UI Emoji", "Segoe UI Symbol";
background-color: #ffffff;
color: #111827;
min-height: 100dvh;
margin: 0;
display: flex;
flex-direction: column;
}
.header {
background-color: #ffffff;
padding: 1.25rem 1.5rem;
}
.header-left {
display: flex;
align-items: center;
gap: 0.75rem;
}
.header-logo {
height: 32px;
width: auto;
}
.header h1 {
font-size: 1.25rem;
font-weight: 600;
margin: 0;
color: #111827;
}
.new-conversation-btn {
width: 32px;
height: 32px;
padding: 0;
border: 1px solid #e5e7eb;
border-radius: 0.5rem;
background-color: #ffffff;
color: #6b7280;
cursor: pointer;
display: flex;
align-items: center;
justify-content: center;
transition: all 0.2s ease;
}
.new-conversation-btn:hover {
background-color: #f3f4f6;
border-color: #d1d5db;
color: #374151;
}
.chat-container {
flex: 1;
overflow-y: auto;
background-color: #ffffff;
}
.chat-wrapper {
max-width: 48rem;
margin: 0 auto;
padding: 2rem 1.5rem 3rem;
display: flex;
flex-direction: column;
gap: 0.75rem;
}
.message {
display: flex;
justify-content: flex-start;
margin-bottom: 0.5rem;
color: #0d0d0d;
}
.message.assistant {
justify-content: flex-start;
}
.message.user {
justify-content: flex-end;
}
.message-content {
white-space: pre-wrap;
line-height: 1.6;
max-width: 100%;
}
.message.assistant .message-content {
background: transparent;
border: none;
cursor: pointer;
border-radius: 0.5rem;
padding: 0.5rem;
margin-left: -0.5rem;
transition: background-color 0.2s ease;
}
.message.assistant .message-content:hover {
background-color: #f9fafb;
}
.message.user .message-content {
background-color: #f3f4f6;
border-radius: 1.25rem;
padding: 0.8rem 1rem;
max-width: 65%;
cursor: pointer;
transition: background-color 0.2s ease;
}
.message.user .message-content:hover {
background-color: #e5e7eb;
}
.message.console .message-content {
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', 'Consolas', 'Courier New', monospace;
font-size: 0.875rem;
background-color: #fafafa;
padding: 0.75rem 1rem;
color: #374151;
max-width: 80%;
}
.input-container {
background-color: #ffffff;
padding: 1rem;
padding-bottom: calc(1rem + env(safe-area-inset-bottom))
}
.input-wrapper {
max-width: 48rem;
margin: 0 auto;
display: flex;
gap: 0.75rem;
align-items: flex-end;
}
.chat-input {
flex: 1;
padding: 0.8rem 1rem;
border: 1px solid #d1d5db;
border-radius: 0.75rem;
background-color: #ffffff;
color: #111827;
font-size: 1rem;
line-height: 1.5;
resize: none;
outline: none;
min-height: 54px;
max-height: 200px;
transition: border-color 0.2s ease, box-shadow 0.2s ease;
}
.chat-input::placeholder {
color: #9ca3af;
}
.chat-input:focus {
border-color: #2563eb;
box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
}
.send-button {
flex-shrink: 0;
padding: 0;
width: 54px;
height: 54px;
border: 1px solid #111827;
border-radius: 0.75rem;
background-color: #111827;
color: #ffffff;
display: flex;
align-items: center;
justify-content: center;
cursor: pointer;
transition: background-color 0.2s ease, border-color 0.2s ease, color 0.2s ease;
}
.send-button:hover:not(:disabled) {
background-color: #2563eb;
border-color: #2563eb;
}
.send-button:disabled {
cursor: not-allowed;
border-color: #d1d5db;
background-color: #e5e7eb;
color: #9ca3af;
}
.typing-indicator {
display: inline-block;
color: #6b7280;
letter-spacing: 0.15em;
}
.typing-indicator::after {
content: '···';
animation: typing 1.4s infinite;
}
@keyframes typing {
0%, 60%, 100% { opacity: 0.2; }
30% { opacity: 1; }
}
.error-message {
background-color: #fee2e2;
border: 1px solid #fecaca;
color: #b91c1c;
padding: 0.75rem 1rem;
border-radius: 0.75rem;
margin-top: 0.5rem;
}
</style>
</head>
<body>
<div class="header">
<div class="header-left">
<button class="new-conversation-btn" onclick="newConversation()" title="New Conversation (Ctrl+Shift+N)">
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M12 5v14"></path>
<path d="M5 12h14"></path>
</svg>
</button>
<h1>nanochat</h1>
</div>
</div>
<div class="chat-container" id="chatContainer">
<div class="chat-wrapper" id="chatWrapper">
<!-- Messages will be added here -->
</div>
</div>
<div class="input-container">
<div class="input-wrapper">
<textarea
id="chatInput"
class="chat-input"
placeholder="Ask anything"
rows="1"
onkeydown="handleKeyDown(event)"
></textarea>
<button id="sendButton" class="send-button" onclick="sendMessage()" disabled>
<svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
<path d="M22 2L11 13"></path>
<path d="M22 2l-7 20-4-9-9-4 20-7z"></path>
</svg>
</button>
</div>
</div>
<script>
const API_URL = '';
const chatContainer = document.getElementById('chatContainer');
const chatWrapper = document.getElementById('chatWrapper');
const chatInput = document.getElementById('chatInput');
const sendButton = document.getElementById('sendButton');
let messages = [];
let isGenerating = false;
let currentTemperature = 0.8;
let currentTopK = 50;
chatInput.addEventListener('input', function() {
this.style.height = 'auto';
this.style.height = Math.min(this.scrollHeight, 200) + 'px';
sendButton.disabled = !this.value.trim() || isGenerating;
});
function handleKeyDown(event) {
if (event.key === 'Enter' && !event.shiftKey) {
event.preventDefault();
sendMessage();
}
}
document.addEventListener('keydown', function(event) {
// Ctrl+Shift+N for new conversation
if (event.ctrlKey && event.shiftKey && event.key === 'N') {
event.preventDefault();
if (!isGenerating) {
newConversation();
}
}
});
function newConversation() {
messages = [];
chatWrapper.innerHTML = '';
chatInput.value = '';
chatInput.style.height = 'auto';
sendButton.disabled = false;
isGenerating = false;
chatInput.focus();
}
function addMessage(role, content, messageIndex = null) {
const messageDiv = document.createElement('div');
messageDiv.className = `message ${role}`;
const contentDiv = document.createElement('div');
contentDiv.className = 'message-content';
contentDiv.textContent = content;
// Add click handler for user messages to enable editing
if (role === 'user' && messageIndex !== null) {
contentDiv.setAttribute('data-message-index', messageIndex);
contentDiv.setAttribute('title', 'Click to edit and restart from here');
contentDiv.addEventListener('click', function() {
if (!isGenerating) {
editMessage(messageIndex);
}
});
}
// Add click handler for assistant messages to enable regeneration
if (role === 'assistant' && messageIndex !== null) {
contentDiv.setAttribute('data-message-index', messageIndex);
contentDiv.setAttribute('title', 'Click to regenerate this response');
contentDiv.addEventListener('click', function() {
if (!isGenerating) {
regenerateMessage(messageIndex);
}
});
}
messageDiv.appendChild(contentDiv);
chatWrapper.appendChild(messageDiv);
chatContainer.scrollTop = chatContainer.scrollHeight;
return contentDiv;
}
function editMessage(messageIndex) {
// Find the message in the messages array
if (messageIndex < 0 || messageIndex >= messages.length) return;
const messageToEdit = messages[messageIndex];
if (messageToEdit.role !== 'user') return;
// Copy message content to input
chatInput.value = messageToEdit.content;
chatInput.style.height = 'auto';
chatInput.style.height = Math.min(chatInput.scrollHeight, 200) + 'px';
// Remove this message and all subsequent messages from the array
messages = messages.slice(0, messageIndex);
// Remove message elements from DOM starting from messageIndex
const allMessages = chatWrapper.querySelectorAll('.message');
for (let i = messageIndex; i < allMessages.length; i++) {
allMessages[i].remove();
}
// Enable send button and focus input
sendButton.disabled = false;
chatInput.focus();
}
async function generateAssistantResponse() {
isGenerating = true;
sendButton.disabled = true;
const assistantContent = addMessage('assistant', '');
assistantContent.innerHTML = '<span class="typing-indicator"></span>';
try {
const response = await fetch(`${API_URL}/chat/completions`, {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify({
messages: messages,
temperature: currentTemperature,
top_k: currentTopK,
max_tokens: 512
}),
});
if (!response.ok) {
throw new Error(`HTTP error! status: ${response.status}`);
}
const reader = response.body.getReader();
const decoder = new TextDecoder();
let fullResponse = '';
assistantContent.textContent = '';
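// The server is assumed to stream SSE-style lines such as:
//   data: {"token": "Hello"}
// and each parsed "token" field is appended to the visible response below.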
while (true) {
const { done, value } = await reader.read();
if (done) break;
const chunk = decoder.decode(value);
const lines = chunk.split('\n');
for (const line of lines) {
if (line.startsWith('data: ')) {
try {
const data = JSON.parse(line.slice(6));
if (data.token) {
fullResponse += data.token;
assistantContent.textContent = fullResponse;
chatContainer.scrollTop = chatContainer.scrollHeight;
}
} catch (e) {
// ignore partial or malformed JSON lines in the stream
}
}
}
}
const assistantMessageIndex = messages.length;
messages.push({ role: 'assistant', content: fullResponse });
// Add click handler to regenerate this assistant message
assistantContent.setAttribute('data-message-index', assistantMessageIndex);
assistantContent.setAttribute('title', 'Click to regenerate this response');
assistantContent.addEventListener('click', function() {
if (!isGenerating) {
regenerateMessage(assistantMessageIndex);
}
});
} catch (error) {
console.error('Error:', error);
assistantContent.innerHTML = `<div class="error-message">Error: ${error.message}</div>`;
} finally {
isGenerating = false;
sendButton.disabled = !chatInput.value.trim();
}
}
async function regenerateMessage(messageIndex) {
// Find the message in the messages array
if (messageIndex < 0 || messageIndex >= messages.length) return;
const messageToRegenerate = messages[messageIndex];
if (messageToRegenerate.role !== 'assistant') return;
// Remove this message and all subsequent messages from the array
messages = messages.slice(0, messageIndex);
// Remove message elements from DOM starting from messageIndex
const allMessages = chatWrapper.querySelectorAll('.message');
for (let i = messageIndex; i < allMessages.length; i++) {
allMessages[i].remove();
}
// Regenerate the assistant response
await generateAssistantResponse();
}
function handleSlashCommand(command) {
const parts = command.trim().split(/\s+/);
const cmd = parts[0].toLowerCase();
const arg = parts[1];
if (cmd === '/temperature') {
if (arg === undefined) {
addMessage('console', `Current temperature: ${currentTemperature}`);
} else {
const temp = parseFloat(arg);
if (isNaN(temp) || temp < 0 || temp > 2) {
addMessage('console', 'Invalid temperature. Must be between 0.0 and 2.0');
} else {
currentTemperature = temp;
addMessage('console', `Temperature set to ${currentTemperature}`);
}
}
return true;
} else if (cmd === '/topk') {
if (arg === undefined) {
addMessage('console', `Current top-k: ${currentTopK}`);
} else {
const topk = parseInt(arg);
if (isNaN(topk) || topk < 1 || topk > 200) {
addMessage('console', 'Invalid top-k. Must be between 1 and 200');
} else {
currentTopK = topk;
addMessage('console', `Top-k set to ${currentTopK}`);
}
}
return true;
} else if (cmd === '/clear') {
newConversation();
return true;
} else if (cmd === '/help') {
addMessage('console',
'Available commands:\n' +
'/temperature - Show current temperature\n' +
'/temperature <value> - Set temperature (0.0-2.0)\n' +
'/topk - Show current top-k\n' +
'/topk <value> - Set top-k (1-200)\n' +
'/clear - Clear conversation\n' +
'/help - Show this help message'
);
return true;
}
return false;
}
async function sendMessage() {
const message = chatInput.value.trim();
if (!message || isGenerating) return;
// Handle slash commands
if (message.startsWith('/')) {
chatInput.value = '';
chatInput.style.height = 'auto';
handleSlashCommand(message);
return;
}
chatInput.value = '';
chatInput.style.height = 'auto';
const userMessageIndex = messages.length;
messages.push({ role: 'user', content: message });
addMessage('user', message, userMessageIndex);
await generateAssistantResponse();
}
sendButton.disabled = false;
// Autofocus the chat input on page load
chatInput.focus();
fetch(`${API_URL}/health`)
.then(response => response.json())
.then(data => {
console.log('Engine status:', data);
})
.catch(error => {
console.error('Engine not available:', error);
chatWrapper.innerHTML = '<div class="error-message">Engine not running. Please start engine.py first.</div>';
});
</script>
</body>
</html>

212
scripts-align/base_eval.py Normal file
View File

@ -0,0 +1,212 @@
"""
Evaluate the CORE metric for a given model.
Run on a single GPU:
python -m scripts.base_eval
Run with torchrun on e.g. 8 GPUs:
torchrun --nproc_per_node=8 -m scripts.base_eval
The script will print the CORE metric to the console.
"""
import os
import csv
import time
import json
import yaml
import shutil
import random
import zipfile
import tempfile
from contextlib import nullcontext
import torch
from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
from nanochat.tokenizer import HuggingFaceTokenizer
from nanochat.checkpoint_manager import load_model
from nanochat.core_eval import evaluate_task
# -----------------------------------------------------------------------------
# nanochat specific function dealing with I/O etc.
# ~162MB of data needed to evaluate the CORE metric
EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
def place_eval_bundle(file_path):
# here file_path is the path to the eval_bundle.zip file
# we need to unzip it and place it in the base directory
base_dir = get_base_dir()
eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
with tempfile.TemporaryDirectory() as tmpdir:
with zipfile.ZipFile(file_path, 'r') as zip_ref:
zip_ref.extractall(tmpdir)
extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")
shutil.move(extracted_bundle_dir, eval_bundle_dir)
print0(f"Placed eval_bundle directory at {eval_bundle_dir}")
def evaluate_model(model, tokenizer, device, max_per_task=-1):
"""
Evaluate a base model on the CORE benchmark.
- max_per_task: crop the data to this many examples per task for testing (-1 = disable)
"""
# Load config and task metadata
base_dir = get_base_dir()
eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
# Download the eval bundle to disk (and unzip if needed)
if not os.path.exists(eval_bundle_dir):
download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
config_path = os.path.join(eval_bundle_dir, "core.yaml")
data_base_path = os.path.join(eval_bundle_dir, "eval_data")
eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
with open(config_path, 'r', encoding='utf-8') as f:
config = yaml.safe_load(f)
tasks = config['icl_tasks']
# Load random baseline values from eval metadata
random_baselines = {}
with open(eval_meta_data, 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
task_name = row['Eval Task']
random_baseline = row['Random baseline']
random_baselines[task_name] = float(random_baseline)
# Evaluate each task
results = {}
centered_results = {}
for task in tasks:
start_time = time.time()
label = task['label']
task_meta = {
'task_type': task['icl_task_type'],
'dataset_uri': task['dataset_uri'],
'num_fewshot': task['num_fewshot'][0],
'continuation_delimiter': task.get('continuation_delimiter', ' ')
}
print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
# Load data for this task
data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
with open(data_path, 'r', encoding='utf-8') as f:
data = [json.loads(line.strip()) for line in f]
# shuffle the data because in many cases it appears ordered but we want
# the ability to only run a subset of the data for debugging purposes etc.
shuffle_rng = random.Random(1337)
shuffle_rng.shuffle(data)
if max_per_task > 0:
data = data[:max_per_task]
# run the evaluation for this task
accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
results[label] = accuracy
random_baseline = random_baselines[label]
centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
centered_results[label] = centered_result
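# Worked example (hypothetical numbers): with a 25% random baseline and a raw
# accuracy of 0.40, centered = (0.40 - 0.25) / (1 - 0.25) = 0.20, i.e. the model
# covers 20% of the gap between random chance and a perfect score.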
end_time = time.time()
print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")
core_metric = sum(centered_results.values()) / len(centered_results)
out = {
"results": results,
"centered_results": centered_results,
"core_metric": core_metric
}
return out
# -----------------------------------------------------------------------------
# HuggingFace loading utilities and light wrappers for a model
class ModelWrapper:
"""Lightweight wrapper for a HuggingFace model"""
def __init__(self, model, max_seq_len=None):
self.model = model
self.max_seq_len = max_seq_len
def __call__(self, input_ids):
outputs = self.model(input_ids)
logits = outputs.logits
return logits
def load_hf_model(hf_path: str, device):
print0(f"Loading model from: {hf_path}")
# Load the model
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained(hf_path)
model.to(device)
model.eval()
max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
model = ModelWrapper(model, max_seq_len=max_seq_len)
# Load the tokenizer
tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
return model, tokenizer
# -----------------------------------------------------------------------------
def main():
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
args = parser.parse_args()
# distributed / precision setup
device_type = autodetect_device_type()
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
# Load model and tokenizer from command line or from file system
if args.hf_path is not None:
# atm assume that if a path is given, it's a huggingface model path
hf_path = args.hf_path
print0(f"Loading huggingface model from: {hf_path}")
model, tokenizer = load_hf_model(hf_path, device)
model_name = hf_path # just for logging
model_slug = hf_path.replace("/", "-") # for the output csv file
else:
# load a local model from the file system
model, tokenizer, meta = load_model("base", device, phase="eval")
model_name = f"base_model (step {meta['step']})" # just for logging
model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
# Evaluate the model
with autocast_ctx:
out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
# Write out the results to a csv file
core_metric = None
centered_results = {}
if ddp_rank == 0:
base_dir = get_base_dir()
output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
results = out["results"]
centered_results = out["centered_results"]
core_metric = out["core_metric"]
with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
for label in results:
f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")
# Print the content of the csv file to console too
print0("="*80)
print0(f"Model: {model_name}")
print0("="*80)
with open(output_csv_path, 'r', encoding='utf-8') as f:
print0(f.read())
# Log to report
from nanochat.report import get_report
get_report().log(section="Base model evaluation", data=[
{
"Model": model_name,
"CORE metric": core_metric,
},
centered_results, # the full table
])
compute_cleanup()
if __name__ == "__main__":
main()

79
scripts-align/base_loss.py Normal file
View File

@ -0,0 +1,79 @@
"""
Loads a checkpoint, and:
- Evaluates the loss on a larger chunk of train/val splits
- Samples from the model
Example run as:
torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
"""
import os
from contextlib import nullcontext
import torch
from nanochat.checkpoint_manager import load_model
from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type
from nanochat.dataloader import tokenizing_distributed_data_loader
from nanochat.tokenizer import get_token_bytes
from nanochat.loss_eval import evaluate_bpb
from nanochat.engine import Engine
# Configuration
device_batch_size = 32
split_tokens = 20*524288 # number of tokens to evaluate per split
model_tag = None # optional model tag for the output directory name
model_step = None # optional model step for the output directory name
device_type = "" # cuda|cpu|mps (empty => autodetect)
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
# Load the base model and the tokenizer
device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=model_tag, step=model_step)
sequence_len = meta["model_config"]["sequence_len"] # could be arbitrary really
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
# Evaluate the loss on each split
tokens_per_step = device_batch_size * sequence_len * ddp_world_size
assert split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step"
steps = split_tokens // tokens_per_step
token_bytes = get_token_bytes(device=device)
bpb_results = {}
for split_name in ["train", "val"]:
loader = tokenizing_distributed_data_loader(device_batch_size, sequence_len, split_name, device=device)
with autocast_ctx:
bpb = evaluate_bpb(model, loader, steps, token_bytes)
print0(f"{split_name} bpb: {bpb:.4f}")
bpb_results[split_name] = bpb
# Master process also samples from the model
samples = []
if ddp_rank == 0:
prompts = [
"The capital of France is",
"The chemical symbol of gold is",
"If yesterday was Friday, then tomorrow will be",
"The opposite of hot is",
"The planets of the solar system are:",
"My favorite color is",
"If 5*x + 3 = 13, then x is",
]
engine = Engine(model, tokenizer)
for prompt in prompts:
tokens = tokenizer(prompt, prepend="<|bos|>")
with autocast_ctx:
sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
sample_str = tokenizer.decode(sample[0])
print0(sample_str)
samples.append(sample_str)
# Log to report
from nanochat.report import get_report
get_report().log(section="Base model loss", data=[
{
"train bpb": bpb_results["train"],
"val bpb": bpb_results["val"],
},
{f"sample {i}": sample for i, sample in enumerate(samples)},
])
# Cleanup
compute_cleanup()

549
scripts-align/base_train.py Normal file
View File

@ -0,0 +1,549 @@
"""
Train model. Run as:
python base_train.py
or distributed as:
torchrun --nproc_per_node=8 base_train.py
If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. Example:
python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 --eval_iters=10 --core_metric_every=-1 --total_batch_size=512 --num_iterations=20
"""
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import time
import math
import pickle
from contextlib import nullcontext
import numpy as np
import torch
import torch._dynamo
torch._dynamo.config.suppress_errors = True
import wandb
# Import from nanoMoE model (keeping train.py's original model)
import sys
sys.path.insert(0, '/root/nanoMoE')
from model import GPTConfig, GPT
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type
from nanochat.tokenizer import get_tokenizer
from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint
from nanochat.engine import Engine
from scripts.base_eval import evaluate_model
print_banner()
# -----------------------------------------------------------------------------
# User settings
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
# Runtime
device_type = "" # cuda|cpu|mps (empty => autodetect good device type default, in order: CUDA > MPS > CPU)
# Data loading
use_bin_data = True # if True, use .bin files (nanoMoE format) instead of parquet/text
data_dir = "" # directory containing train.bin and val.bin files (only used if use_bin_data=True)
# Model architecture
depth = 6 # the depth of the Transformer model to train (matches nanoMoE n_layer=6), rest of the kwargs are derived
max_seq_len = 1024 # max context length (matches nanoMoE block_size=1024)
dropout = 0.0 # for pretraining 0 is good, for finetuning try 0.1+ (matches nanoMoE)
bias = False # do we use bias inside LayerNorm and Linear layers? (matches nanoMoE)
# MoE settings (matching nanoMoE config/train_nano_moe.py)
n_exp = 8 # number of experts (matches train_nano_moe.py)
top_k = 2 # number of active experts (matches train_nano_moe.py)
use_aux_loss = True # apply auxiliary loss (from Switch Transformer) (matches train_nano_moe.py)
use_router_z_loss = True # apply router z loss (from ST-MoE) (matches train_nano_moe.py)
use_noisy_top_k = False # use noisy top-k routing (matches train_nano_moe.py)
aux_loss_weight = 0.01 # auxiliary loss weight (matches train_nano_moe.py)
router_z_loss_weight = 0.001 # router z loss weight (matches train_nano_moe.py)
train_capacity = 1.25 # training capacity factor (matches train_nano_moe.py)
eval_capacity = 2.0 # evaluation capacity factor (matches train_nano_moe.py)
min_capacity = 4 # minimum batch size per expert (default from ST-MoE)
stride = 2 # one in every stride layers uses MoE (matches train_nano_moe.py)
use_switch_tfm_init = True # use weight init scheme from Switch Transformer (matches train_nano_moe.py)
switch_tfm_init_scale = 1.0 # scale for switch transformer init (matches train_nano_moe.py)
router_use_full_prec = True # use float32 in router (matches train_nano_moe.py)
# Training horizon. Only one of these 3 will be used, in this order of precedence.
num_iterations = 50000 # explicit number of steps (matches nanoMoE max_iters=50000, makes total tokens ~25B)
target_flops = -1.0 # calculate num_iterations to reach target_flops. Useful for scaling laws experiments (-1 = disable)
target_param_data_ratio = -1 # calculate num_iterations to maintain fixed data:param ratio (Chinchilla=20) (-1 = disable)
# Optimization
device_batch_size = 12 # per-device batch size (matches nanoMoE batch_size=12)
total_batch_size = 491520 # total desired batch size in #tokens (matches nanoMoE: 12 * 1024 * 40 = 491,520 for 8 GPUs)
embedding_lr = 0.0006 # learning rate for the embedding parameters (Adam)
unembedding_lr = 0.0006 # learning rate for the unembedding parameters (Adam)
weight_decay = 0.1 # weight decay (matches nanoMoE weight_decay=1e-1)
matrix_lr = 0.0006 # learning rate for the matrix parameters (Muon)
learning_rate = 6e-4 # learning rate for AdamW optimizer (matches nanoMoE: 6e-4)
betas = (0.9, 0.95) # betas for AdamW optimizer (matches nanoMoE: beta1=0.9, beta2=0.95)
grad_clip = 1.0 # gradient clipping value (0.0 = disabled)
decay_lr = True # whether to decay the learning rate (matches train_nano_moe.py)
# Learning rate decay parameters (matching train.py and train_nano_moe.py)
warmup_iters = 2000 # how many steps to warm up for (matches train.py default)
lr_decay_iters = 50000 # learning rate decay iterations (matches train_nano_moe.py)
min_lr = 6e-5 # minimum learning rate (matches train.py default, which equals 6e-4 * 0.1)
final_lr_frac = 0.1 # final learning rate as fraction of initial learning rate (for compatibility)
resume_from_step = -1 # resume training from this step of the optimization (-1 = disable)
# Evaluation
eval_every = 500 # every how many steps to evaluate the model for val bpb (matches nanoMoE eval_interval=500)
eval_iters = 200 # number of iterations to evaluate val loss on (matches nanoMoE eval_iters=200)
log_interval = 10 # every how many steps to log training metrics (matches nanoMoE log_interval=10)
core_metric_every = -1 # every how many steps to evaluate the core metric (-1 = disable)
core_metric_max_per_task = -1 # examples per task in estimating the core metric
sample_every = 2000 # every how many steps to sample from the model
save_every = 1000 # every how many steps to save model checkpoints (-1 = disable, and save only at the end of the run)
# System
compile = True # use PyTorch 2.0 to compile the model to be faster (matches nanoMoE)
# Output
model_tag = "" # optionally override the model tag for the output checkpoint directory name
# now allow CLI to override the settings via the configurator lol
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
user_config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------
# Compute init
device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
# Set random seed (matching nanoMoE/train.py)
seed_offset = ddp_rank if ddp else 0 # each process gets a different seed in DDP mode
torch.manual_seed(1337 + seed_offset)
# Set tf32 precision (matching nanoMoE/train.py)
if device_type == 'cuda':
torch.backends.cuda.matmul.allow_tf32 = True # allow tf32 on matmul
torch.backends.cudnn.allow_tf32 = True # allow tf32 on cudnn
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", name=run, config=user_config)
# Tokenizer will be useful for evaluation, also we need the vocab size
# Use tiktoken GPT-2 tokenizer for compatibility with nanoMoE .bin data format
tokenizer = get_tokenizer(use_tiktoken_gpt2=True)
vocab_size = tokenizer.get_vocab_size()
print0(f"Vocab size: {vocab_size:,}")
print0("Using tiktoken GPT-2 tokenizer (nanoMoE compatible)")
# Model kwargs are derived from the desired depth of the model
# For nanoMoE, we use n_layer, n_head, n_embd directly
n_layer = 6
model_dim = 384 # matches train_nano_moe.py
num_heads = 6 # matches train_nano_moe.py
n_head = num_heads
n_embd = model_dim
num_kv_heads = num_heads
print0(f"num_layers: {n_layer}")
print0(f"model_dim: {model_dim}")
print0(f"num_heads: {num_heads}")
print0(f"num_kv_heads: {num_kv_heads}")
# Optimizer / data / training length related hyperparameters
# figure out the needed gradient accumulation to reach the desired total batch size
tokens_per_fwdbwd = device_batch_size * max_seq_len # tokens per iteration for a single rank
world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
assert total_batch_size % world_tokens_per_fwdbwd == 0
grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd
print0(f"Tokens / micro-batch / rank: {device_batch_size} x {max_seq_len} = {tokens_per_fwdbwd:,}")
print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
# -----------------------------------------------------------------------------
# Initialize the Model
# Create a new model with random weights (using nanoMoE GPT)
if not data_dir:
# Default to nanoMoE data directory structure
data_dir = "/root/nanoMoE/data/openwebtext"
# Attempt to derive vocab_size from the dataset
meta_path = os.path.join(data_dir, 'meta.pkl')
meta_vocab_size = None
if os.path.exists(meta_path):
with open(meta_path, 'rb') as f:
meta = pickle.load(f)
meta_vocab_size = meta['vocab_size']
print0(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
model_config_kwargs = dict(
n_layer=n_layer,
n_head=n_head,
n_embd=n_embd,
block_size=max_seq_len,
vocab_size=meta_vocab_size if meta_vocab_size is not None else 50304,
dropout=dropout,
bias=bias,
# MoE parameters (matching train_nano_moe.py)
n_exp=n_exp,
top_k=top_k,
use_aux_loss=use_aux_loss,
use_router_z_loss=use_router_z_loss,
use_noisy_top_k=use_noisy_top_k,
aux_loss_weight=aux_loss_weight,
router_z_loss_weight=router_z_loss_weight,
train_capacity=train_capacity,
eval_capacity=eval_capacity,
min_capacity=min_capacity,
stride=stride,
use_switch_tfm_init=use_switch_tfm_init,
switch_tfm_init_scale=switch_tfm_init_scale,
router_use_full_prec=router_use_full_prec,
)
gptconf = GPTConfig(**model_config_kwargs)
model = GPT(gptconf)
model.to(device)
# If we are resuming, overwrite the model parameters with those of the checkpoint
base_dir = get_base_dir()
output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d6
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
resuming = resume_from_step != -1
# if resuming:
# print0(f"Resuming optimization from step {resume_from_step}")
# model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, resume_from_step, device, load_optimizer=True, rank=ddp_rank)
# model.load_state_dict(model_data, strict=True, assign=True)
# del model_data # free up this memory after the copy
orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because input shapes may change)
# Calculate FLOPs per token manually (based on PaLM paper Appendix B) before compilation
nparams_embedding = orig_model.transformer.wte.weight.numel()
num_params = sum(p.numel() for p in orig_model.parameters())
l, h, q, t = model_config_kwargs['n_layer'], model_config_kwargs['n_head'], model_config_kwargs['n_embd'] // model_config_kwargs['n_head'], model_config_kwargs['block_size']
num_flops_per_token = 6 * (num_params - nparams_embedding) + 12 * l * h * q * t
print0(f"Number of parameters: {num_params:,}")
print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
# Initialize GradScaler (matching nanoMoE train.py - before optimizer)
# note: the GradScaler is only enabled when we actually train in float16; it is a no-op for bfloat16
dtype_actual = 'bfloat16' if device_type == 'cuda' and torch.cuda.is_bf16_supported() else 'float16'
scaler = torch.cuda.amp.GradScaler(enabled=(dtype_actual == 'float16'))
# Initialize the Optimizer (AdamW for all parameters) - BEFORE DDP wrapping (matching nanoMoE)
optimizer = model.configure_optimizers(weight_decay=weight_decay, learning_rate=learning_rate, betas=betas, device_type=device_type)
adamw_optimizer = optimizer
# Compile the model (matching nanoMoE)
if compile:
if master_process:
print0("compiling the model... (takes a ~minute)")
model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe
# Wrap model into DDP container (matching nanoMoE train.py)
from torch.nn.parallel import DistributedDataParallel as DDP
if ddp:
model = DDP(model, device_ids=[ddp_local_rank] if device_type == "cuda" else None)
# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order)
assert num_iterations > 0 or target_param_data_ratio > 0 or target_flops > 0
if num_iterations > 0:
print0(f"Using user-provided number of iterations: {num_iterations:,}")
elif target_flops > 0:
# calculate the number of iterations from the target flops
num_iterations = round(target_flops / (num_flops_per_token * total_batch_size))
print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}")
elif target_param_data_ratio > 0:
# calculate the number of iterations from the target param data ratio
target_tokens = target_param_data_ratio * num_params
num_iterations = target_tokens // total_batch_size
print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}")
else:
raise ValueError("No training horizon specified")
total_tokens = total_batch_size * num_iterations
print0(f"Total number of training tokens: {total_tokens:,}")
print0(f"Tokens : Params ratio: {total_batch_size * num_iterations / num_params:.2f}") # Chinchilla is ~20
print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")
# if resuming:
# for opt, dat in zip(optimizer, optimizer_data):
# if opt is not None and dat is not None:
# opt.load_state_dict(dat)
# del optimizer_data # free up the memory
# -----------------------------------------------------------------------------
# Data loading (nanoMoE style - simple get_batch function)
if not data_dir:
# Default to nanoMoE data directory structure
data_dir = "/root/nanoMoE/data/openwebtext"
print0(f"Using .bin data loader from: {data_dir}")
# poor man's data loader (matching nanoMoE/train.py)
def get_batch(split):
# We recreate np.memmap every batch to avoid a memory leak, as per
# https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
if split == 'train':
data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
else:
data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
ix = torch.randint(len(data) - max_seq_len, (device_batch_size,))
x = torch.stack([torch.from_numpy((data[i:i+max_seq_len]).astype(np.int64)) for i in ix])
y = torch.stack([torch.from_numpy((data[i+1:i+1+max_seq_len]).astype(np.int64)) for i in ix])
if device_type == 'cuda':
# pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
else:
x, y = x.to(device), y.to(device)
return x, y
# helps estimate an arbitrarily accurate loss over either split using many batches (matching nanoMoE/train.py)
@torch.no_grad()
def estimate_loss():
out = {}
model.eval()
for split in ['train', 'val']:
losses = torch.zeros(eval_iters)
for k in range(eval_iters):
X, Y = get_batch(split)
with autocast_ctx:
_, loss = model(X, Y)
losses[k] = loss.item()
out[split] = losses.mean()
model.train()
return out
# fetch the very first batch
x, y = get_batch('train')
# -----------------------------------------------------------------------------
# Set up hyperparameter schedulers
# Learning rate scheduler (cosine decay with warmup) - matching nanoMoE/train.py exactly
def get_lr(it):
# 1) linear warmup for warmup_iters steps
if it < warmup_iters:
return learning_rate * (it + 1) / (warmup_iters + 1)
# 2) if it > lr_decay_iters, return min learning rate
if it > lr_decay_iters:
return min_lr
# 3) in between, use cosine decay down to min learning rate
decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
assert 0 <= decay_ratio <= 1
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff ranges 0..1
return min_lr + coeff * (learning_rate - min_lr)
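# shape check (illustrative numbers, not the configured defaults): with warmup_iters=100, lr_decay_iters=1000,
# learning_rate=6e-4, min_lr=6e-5: step 50 -> ~3.0e-4 (warmup ramp),
# step 550 -> 6e-5 + 0.5*(6e-4 - 6e-5) = 3.3e-4 (cosine midpoint), step >= 1000 -> 6e-5.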
# -----------------------------------------------------------------------------
# Loop state (variables updated by the training loop)
if not resuming:
step = 0
min_val_bpb = float("inf")
smooth_train_loss = 0 # EMA of training loss
total_training_time = 0 # total wall-clock time of training
val_bpb = None # Will be set during evaluation
else:
step = meta_data["step"]
loop_state = meta_data["loop_state"]
min_val_bpb = loop_state["min_val_bpb"]
smooth_train_loss = loop_state["smooth_train_loss"]
total_training_time = loop_state["total_training_time"]
val_bpb = None # Will be set during evaluation
# -----------------------------------------------------------------------------
# Training loop
while True:
last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
flops_so_far = num_flops_per_token * total_batch_size * step
# determine and set the learning rate for this iteration (matching nanoMoE/train.py)
lr = get_lr(step) if decay_lr else learning_rate
for param_group in optimizer.param_groups:
param_group['lr'] = lr
# once in a while: evaluate the val loss (all ranks participate, matching nanoMoE/train.py)
if last_step or step % eval_every == 0:
losses = estimate_loss()
val_loss = losses['val'].item()
train_loss_eval = losses['train'].item()
print0(f"Step {step:05d} | Train loss: {train_loss_eval:.4f}, Val loss: {val_loss:.4f}")
if val_loss < min_val_bpb:
min_val_bpb = val_loss
wandb_run.log({
"step": step,
"total_training_flops": flops_so_far,
"total_training_time": total_training_time,
"val/loss": val_loss,
"train/loss_eval": train_loss_eval,
})
val_bpb = val_loss # for compatibility with existing code
# once in a while: estimate the CORE metric (all ranks participate)
# use the original uncompiled model because the inputs keep changing shape
results = {}
if core_metric_every > 0 and (last_step or (step > 0 and step % core_metric_every == 0)):
model.eval()
with autocast_ctx:
results = evaluate_model(orig_model, tokenizer, device, max_per_task=core_metric_max_per_task)
print0(f"Step {step:05d} | CORE metric: {results['core_metric']:.4f}")
wandb_run.log({
"step": step,
"total_training_flops": flops_so_far,
"core_metric": results["core_metric"],
"centered_results": results["centered_results"],
})
model.train()
# once in a while: sample from the model (only on master process)
# use the original uncompiled model because the inputs keep changing shape
if master_process and (last_step or (step > 0 and step % sample_every == 0)):
model.eval()
prompts = [
"The capital of France is",
"The chemical symbol of gold is",
"If yesterday was Friday, then tomorrow will be",
"The opposite of hot is",
"The planets of the solar system are:",
"My favorite color is",
"If 5*x + 3 = 13, then x is",
]
engine = Engine(orig_model, tokenizer) # use orig_model to avoid recompilation
for prompt in prompts:
tokens = tokenizer(prompt, prepend="<|bos|>")
with autocast_ctx:
sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
print0(tokenizer.decode(sample[0]))
model.train()
# save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step
if last_step or (step > 0 and step != resume_from_step and save_every > 0 and step % save_every == 0):
save_checkpoint(
checkpoint_dir,
step,
orig_model.state_dict(), # model parameters
optimizer.state_dict(), # optimizer states
{ # metadata saved as json
"step": step,
"model_config": model_config_kwargs,
"user_config": user_config, # inputs to the training script
"device_batch_size": device_batch_size,
"max_seq_len": max_seq_len,
"loop_state": { # all loop state (other than step) so that we can resume training
"min_val_bpb": min_val_bpb,
"smooth_train_loss": smooth_train_loss,
"total_training_time": total_training_time,
},
},
rank=ddp_rank,
)
# termination conditions (TODO: possibly also add loss explosions etc.)
if last_step:
break
# -------------------------------------------------------------------------
# forward backward update, with optional gradient accumulation to simulate larger batch size
# and using the GradScaler if data type is float16 (matching nanoMoE train.py exactly)
synchronize()
t0 = time.time()
for micro_step in range(grad_accum_steps):
if ddp:
# in DDP training we only need to sync gradients at the last micro step.
# the official way to do this is with model.no_sync() context manager, but
# I really dislike that this bloats the code and forces us to repeat code.
# Looking at the source of that context manager, it just toggles this variable:
model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1)
with autocast_ctx:
_, loss = model(x, y) # nanoMoE model returns (logits, loss)
loss = loss / grad_accum_steps # scale the loss to account for gradient accumulation
# immediately async prefetch next batch while model is doing the forward pass on the GPU
x, y = get_batch('train')
# backward pass, with gradient scaling if training in fp16
scaler.scale(loss).backward()
# clip the gradient
grad_clip_enabled = grad_clip > 0.0
grad_norm = None
if grad_clip_enabled:
scaler.unscale_(optimizer)
# clip_grad_norm_ returns the gradient norm before clipping
grad_norm_tensor = torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
grad_norm = grad_norm_tensor.item() # GPU tensor -> CPU float (note: cpu-gpu sync point)
# step the optimizer and scaler if training in fp16
scaler.step(optimizer)
scaler.update()
# flush the gradients as soon as we can, no need for this memory anymore
optimizer.zero_grad(set_to_none=True)
synchronize()
t1 = time.time()
dt = t1 - t0
train_loss = loss.detach() # for logging (after scaling)
# -------------------------------------------------------------------------
# logging (base_train.py style - keeping all the detailed logging)
ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging
# scale up to undo the division above, approximating the true total loss (exact would have been a sum)
lossf = loss.item() * grad_accum_steps
smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * lossf # EMA the training loss
debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
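# (this is the same bias correction Adam applies to its moment estimates: divide by 1 - beta^t)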
pct_done = 100 * step / num_iterations
tok_per_sec = int(total_batch_size / dt)
flops_per_sec = num_flops_per_token * total_batch_size / dt
promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
if step > 10:
total_training_time += dt # only count the time after the first 10 steps
print_grad_norm = f" grad norm: {grad_norm:.4f} |" if grad_clip_enabled and grad_norm is not None else ""
lr_str = f"lr: {lr:.2e} |" if decay_lr else ""
print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} |{print_grad_norm} {lr_str}dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m")
if step % 100 == 0:
log_data = {
"step": step,
"total_training_flops": flops_so_far,
"total_training_time": total_training_time,
"train/loss": debiased_smooth_loss,
"train/dt": dt,
"train/tok_per_sec": tok_per_sec,
"train/mfu": mfu,
}
if decay_lr:
log_data["lr"] = lr
if grad_clip_enabled:
log_data["train/grad_norm"] = grad_norm
wandb_run.log(log_data)
# state update
step += 1
# print a few more stats
print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
print0(f"Total training time: {total_training_time/60:.2f}m")
print0(f"Minimum validation bpb: {min_val_bpb:.4f}")
# Log to report
from nanochat.report import get_report
get_report().log(section="Base model training", data=[
user_config, # CLI args
{ # stats about the training setup
"Number of parameters": num_params,
"Number of FLOPs per token": f"{num_flops_per_token:e}",
"Calculated number of iterations": num_iterations,
"Number of training tokens": total_tokens,
"Tokens : Params ratio": total_batch_size * num_iterations / num_params,
"DDP world size": ddp_world_size,
"final_lr_frac": final_lr_frac,
},
{ # stats about training outcomes
"Minimum validation bpb": min_val_bpb,
"Final validation bpb": val_bpb,
"CORE metric estimate": results.get("core_metric", None),
"MFU %": f"{mfu:.2f}%",
"Total training flops": f"{flops_so_far:e}",
"Total training time": f"{total_training_time/60:.2f}m",
"Peak memory usage": f"{get_max_memory() / 1024 / 1024:.2f}MiB",
}
])
# cleanup
wandb_run.finish() # wandb run finish
compute_cleanup()

105
scripts-align/chat_cli.py Normal file
View File

@ -0,0 +1,105 @@
"""
New and upgraded chat mode because a lot of the code has changed since the last one.
Intended to be run on a single GPU only atm:
python -m scripts.chat_cli -i mid
"""
import argparse
import torch
from nanochat.common import compute_init, autodetect_device_type
from contextlib import nullcontext
from nanochat.engine import Engine
from nanochat.checkpoint_manager import load_model
parser = argparse.ArgumentParser(description='Chat with the model')
parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|mid|rl")
parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
parser.add_argument('-p', '--prompt', type=str, default='', help='Prompt the model, get a single response back')
parser.add_argument('-t', '--temperature', type=float, default=0.6, help='Temperature for generation')
parser.add_argument('-k', '--top-k', type=int, default=50, help='Top-k sampling parameter')
parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
args = parser.parse_args()
# Init the model and tokenizer
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)
# Special tokens for the chat state machine
bos = tokenizer.get_bos_token_id()
user_start, user_end = tokenizer.encode_special("<|user_start|>"), tokenizer.encode_special("<|user_end|>")
assistant_start, assistant_end = tokenizer.encode_special("<|assistant_start|>"), tokenizer.encode_special("<|assistant_end|>")
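# At the token level, a single rendered turn of the conversation built below looks like:
#   <|bos|> <|user_start|> ...user tokens... <|user_end|> <|assistant_start|> ...response tokens... <|assistant_end|>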
# Create Engine for efficient generation
engine = Engine(model, tokenizer)
print("\nNanoChat Interactive Mode")
print("-" * 50)
print("Type 'quit' or 'exit' to end the conversation")
print("Type 'clear' to start a new conversation")
print("-" * 50)
conversation_tokens = [bos]
while True:
if args.prompt:
# Get the prompt from the launch command
user_input = args.prompt
else:
# Get the prompt interactively from the console
try:
user_input = input("\nUser: ").strip()
except (EOFError, KeyboardInterrupt):
print("\nGoodbye!")
break
# Handle special commands
if user_input.lower() in ['quit', 'exit']:
print("Goodbye!")
break
if user_input.lower() == 'clear':
conversation_tokens = [bos]
print("Conversation cleared.")
continue
if not user_input:
continue
# Add User message to the conversation
conversation_tokens.append(user_start)
conversation_tokens.extend(tokenizer.encode(user_input))
conversation_tokens.append(user_end)
# Kick off the assistant
conversation_tokens.append(assistant_start)
generate_kwargs = {
"num_samples": 1,
"max_tokens": 256,
"temperature": args.temperature,
"top_k": args.top_k,
}
response_tokens = []
print("\nAssistant: ", end="", flush=True)
with autocast_ctx:
for token_column, token_masks in engine.generate(conversation_tokens, **generate_kwargs):
token = token_column[0] # pop the batch dimension (num_samples=1)
response_tokens.append(token)
token_text = tokenizer.decode([token])
print(token_text, end="", flush=True)
print()
# we have to ensure that the assistant end token is the last token
# so even if generation ends due to max tokens, we have to append it to the end
if response_tokens[-1] != assistant_end:
response_tokens.append(assistant_end)
conversation_tokens.extend(response_tokens)
# In the prompt mode, we only want a single response and exit
if args.prompt:
break

257
scripts-align/chat_eval.py Normal file
View File

@ -0,0 +1,257 @@
"""
Evaluate the Chat model.
All the generic code lives here, and all the evaluation-specific
code lives in nanochat directory and is imported from here.
Example runs:
python -m scripts.chat_eval -a ARC-Easy
torchrun --nproc_per_node=8 -m scripts.chat_eval -- -a ARC-Easy
"""
import argparse
from functools import partial
from contextlib import nullcontext
import torch
import torch.distributed as dist
from nanochat.common import compute_init, compute_cleanup, get_dist_info, print0, autodetect_device_type
from nanochat.checkpoint_manager import load_model
from nanochat.engine import Engine
from tasks.humaneval import HumanEval
from tasks.mmlu import MMLU
from tasks.arc import ARC
from tasks.gsm8k import GSM8K
from tasks.spellingbee import SpellingBee
# -----------------------------------------------------------------------------
# Generative evaluation loop (we go one problem at a time, sample, evaluate)
def run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=None):
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
device = model.get_device()
num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)
# Run the evaluation
num_passed, total = 0, 0
for i in range(ddp_rank, num_problems, ddp_world_size):
conversation = task_object[i]
# Tokenize the prompt
encoded_prompt = tokenizer.render_for_completion(conversation)
# Get the completions
results, _ = engine.generate_batch(
encoded_prompt,
num_samples=num_samples,
max_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
)
# Decode the completions as text
prefix_length = len(encoded_prompt)
completions = [tokenizer.decode(result_tokens[prefix_length:]) for result_tokens in results]
# Evaluate success criteria
outcomes = [task_object.evaluate(conversation, completion) for completion in completions]
passed = any(outcomes)
# Keep stats
total += 1
num_passed += int(passed)
# Logging (overwrite the same line in the console)
print(f"\r\033[KRank {ddp_rank} | {num_passed}/{total} ({100*num_passed/total:.2f}%)", end='', flush=True)
# Finish the in-place progress line with a newline before final summary
print()
# Aggregate results across all ranks
if ddp:
num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
total_tensor = torch.tensor([total], dtype=torch.long, device=device)
dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
num_passed = num_passed_tensor.item()
total = total_tensor.item()
print0("=" * 50)
print0(f"Final: {num_passed}/{total} ({100*num_passed/total:.2f}%)")
# Return the accuracy
return num_passed/total
# -----------------------------------------------------------------------------
# Categorical evaluation loop
# A lot easier because we don't have to sample. Therefore, we can process whole
# batches at a time and just check the logits for the correct answer choices.
def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems=None):
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
device = model.get_device()
bos = tokenizer.get_bos_token_id() # using BOS as the pad token is ok; these positions are ignored
# We'll process batches of independent problems at a time because there is no sampling needed
num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)
ceil_div = lambda x, y: -(-x // y)
num_batches = ceil_div(num_problems, batch_size)
# Run the evaluation
letter_to_id_cache = {} # many letters will repeat often, let's save the tokenizer some work
num_passed, total = 0, 0
for i in range(ddp_rank, num_batches, ddp_world_size):
i0, i1 = i * batch_size, min((i + 1) * batch_size, num_problems)
# Prepare the batch of problems. They might all be of different length, so we pad/collate them.
conversations = [task_object[ii] for ii in range(i0, i1)]
prompt_ids = [tokenizer.render_for_completion(conversation) for conversation in conversations] # TODO: remake the way this works
max_length = max(len(ids) for ids in prompt_ids)
answer_time_positions = [len(ids) - 1 for ids in prompt_ids] # where the last token is (and the predicted answer)
padded_prompt_ids = [ids + [bos] * (max_length - len(ids)) for ids in prompt_ids]
prompt_ids = torch.tensor(padded_prompt_ids, dtype=torch.long, device=device)
# Get the logits for the whole batch of conversations in parallel (efficiency win here)
with torch.no_grad():
logits = model(prompt_ids) # (B, T, V)
# Focus the logits on just the letters corresponding to the available answer choices
# Note that this helps the evaluation a lot because it specifically narrows the focus to only the available letters
# The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
# letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
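# e.g. for a 4-way multiple choice problem with letters ['A','B','C','D'] we compare only those
# four letter-token logits at the final prompt position and take the argmax as the prediction.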
for idx, conversation in enumerate(conversations):
# get the token ids of all the available letters of this problem
letters = conversation['letters']
letter_ids = []
for letter in letters:
if letter not in letter_to_id_cache:
encoded_letter = tokenizer.encode(letter)
assert len(encoded_letter) == 1, "Each letter must be a single token"
letter_to_id_cache[letter] = encoded_letter[0]
letter_ids.append(letter_to_id_cache[letter])
# focus logits just down to the answer position and the available letters of the answer
answer_pos = answer_time_positions[idx]
focus_logits = logits[idx, answer_pos, letter_ids]
# get the argmax letter (the predicted answer)
argmax_letter_id = focus_logits.argmax(dim=-1).item()
predicted_letter = letters[argmax_letter_id]
# evaluate the outcome
outcome = task_object.evaluate(conversation, predicted_letter)
num_passed += int(outcome)
total += 1
# Aggregate results across all ranks
if ddp:
num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
total_tensor = torch.tensor([total], dtype=torch.long, device=device)
dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
num_passed = num_passed_tensor.item()
total = total_tensor.item()
average = num_passed/total
print0(f"Final: {num_passed}/{total} ({100*average:.2f}%)")
return average
# -----------------------------------------------------------------------------
def run_chat_eval(task_name, model, tokenizer, engine,
batch_size=1, num_samples=1, max_new_tokens=512, temperature=0.0, top_k=50,
max_problems=None):
# Create the evaluation object
task_module = {
'HumanEval': HumanEval,
'MMLU': partial(MMLU, subset="all", split="test"),
'ARC-Easy': partial(ARC, subset="ARC-Easy", split="test"),
'ARC-Challenge': partial(ARC, subset="ARC-Challenge", split="test"),
'GSM8K': partial(GSM8K, subset="main", split="test"),
'SpellingBee': partial(SpellingBee, size=256, split="test"),
}[task_name]
task_object = task_module()
# Run the evaluation
if task_object.eval_type == 'generative':
acc = run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=max_problems)
elif task_object.eval_type == 'categorical':
acc = run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems=max_problems)
else:
raise ValueError(f"Unsupported task evaluation type: {task_object.eval_type}")
return acc
# -----------------------------------------------------------------------------
if __name__ == "__main__":
# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('-i', '--source', type=str, required=True, help="Source of the model: sft|mid|rl")
parser.add_argument('-a', '--task-name', type=str, default=None, help="Task name. Default = all tasks. Use | to split multiple tasks.")
parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
parser.add_argument('-t', '--temperature', type=float, default=0.0)
parser.add_argument('-m', '--max-new-tokens', type=int, default=512)
parser.add_argument('-n', '--num-samples', type=int, default=1)
parser.add_argument('-k', '--top-k', type=int, default=50)
parser.add_argument('-b', '--batch-size', type=int, default=8, help='Batch size for categorical evaluation')
parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
parser.add_argument('-x', '--max-problems', type=int, default=None, help='Max problems to evaluate')
parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
args = parser.parse_args()
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)
engine = Engine(model, tokenizer)
# Get the tasks to evaluate on
all_tasks = ['ARC-Easy', 'ARC-Challenge', 'MMLU', 'GSM8K', 'HumanEval', 'SpellingBee']
baseline_accuracies = {
'ARC-Easy': 0.25, # multiple choice 1 of 4 => 25%
'ARC-Challenge': 0.25, # multiple choice 1 of 4 => 25%
'MMLU': 0.25, # multiple choice 1 of 4 => 25%
'GSM8K': 0.0, # open-ended => 0%
'HumanEval': 0.0, # open-ended => 0%
'SpellingBee': 0.0, # open-ended => 0%
}
task_names = all_tasks if args.task_name is None else args.task_name.split('|')
# Run all the task evaluations sequentially
results = {}
for task_name in task_names:
with autocast_ctx:
acc = run_chat_eval(
task_name,
model, tokenizer, engine,
batch_size=args.batch_size,
num_samples=args.num_samples,
max_new_tokens=args.max_new_tokens,
temperature=args.temperature,
top_k=args.top_k,
max_problems=args.max_problems,
)
results[task_name] = acc
print0(f"{task_name} accuracy: {100 * acc:.2f}%")
# Log to report
from nanochat.report import get_report
all_tasks_were_evaluated = all(task_name in results for task_name in all_tasks)
# calculate the ChatCORE metric if we can (similar to CORE, it's the mean centered accuracy)
# this way, ChatCORE ranges from 0 (at random baseline) to 1 (peak performance)
chatcore_metric_dict = {}
if all_tasks_were_evaluated:
centered_mean = 0
for task_name, acc in results.items():
baseline_acc = baseline_accuracies.get(task_name, 0.0)
centered_acc = (acc - baseline_acc) / (1.0 - baseline_acc)
centered_mean += centered_acc
chatcore_metric = centered_mean / len(results)
chatcore_metric_dict = {"ChatCORE metric": chatcore_metric}
get_report().log(section="Chat evaluation " + args.source, data=[
vars(args), # CLI args
results,
chatcore_metric_dict,
])
compute_cleanup()

332
scripts-align/chat_rl.py Normal file
View File

@ -0,0 +1,332 @@
"""
Reinforcement learning on GSM8K via "GRPO".
I put GRPO in quotes because we actually end up with something a lot
simpler and more similar to just REINFORCE:
1) Delete trust region, so there is no KL regularization to a reference model
2) We are on policy, so there's no need for PPO ratio+clip.
3) We use GAPO style normalization that is token-level, not sequence-level.
4) Instead of z-score normalization (r - mu)/sigma, only use (r - mu) as the advantage.
1 GPU:
python -m scripts.chat_rl
8 GPUs:
torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=default
"""
import os
import itertools
import re
import wandb
import torch
import torch.distributed as dist
from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, DummyWandb
from nanochat.checkpoint_manager import save_checkpoint, load_model
from nanochat.engine import Engine
from tasks.gsm8k import GSM8K
# RL hyperparameters
run = "dummy" # wandb run name
source = "sft" # mid|sft
dtype = "bfloat16"
device_batch_size = 8 # no forward pass will go above this to not OOM
examples_per_step = 16 # in total and across all ranks (note: examples, not samples/completions!)
num_samples = 16 # number of samples per example (/question)
max_new_tokens = 256
temperature = 1.0
top_k = 50 # TODO: try None?
unembedding_lr = 0.004
embedding_lr = 0.2
matrix_lr = 0.02
weight_decay = 0.0
init_lr_frac = 0.05
num_epochs = 1 # how many epochs of gsm8k to train on
save_every = 60 # every how many steps to save the model
eval_every = 60 # every how many steps to evaluate the model for val pass@k
eval_examples = 400 # number of examples used for evaluating pass@k
# now allow CLI to override the settings via the configurator lol
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
user_config = {k: globals()[k] for k in config_keys} # will be useful for logging
# -----------------------------------------------------------------------------
# Init compute/precision
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
dtype = torch.float32 if dtype == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=dtype)
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-rl", name=run, config=user_config)
# Init model and tokenizer
model, tokenizer, meta = load_model(source, device, phase="eval")
engine = Engine(model, tokenizer) # for sampling rollouts
# -----------------------------------------------------------------------------
# Rollout / sampling generator loop that yields batches of examples for training
train_task = GSM8K(subset="main", split="train")
val_task = GSM8K(subset="main", split="test")
num_steps = (len(train_task) // examples_per_step) * num_epochs
print0(f"Calculated number of steps: {num_steps}")
@torch.no_grad()
def get_batch():
assistant_end = tokenizer.encode_special("<|assistant_end|>") # ok to use this token, it's only for padding and isn't used in the loss.
rank_indices = range(ddp_rank, len(train_task), ddp_world_size) # each rank is responsible for different examples in the training data
for example_idx in itertools.cycle(rank_indices):
# First get the full conversation of both user and assistant messages
conversation = train_task[example_idx]
# Tokenize the conversation, deleting the last Assistant message and priming the Assistant for a completion instead
# (i.e. keep the <|assistant_start|>, but delete everything after it)
tokens = tokenizer.render_for_completion(conversation)
prefix_length = len(tokens)
# Generate num_samples samples using batched generation, use loop to avoid OOMs
model.eval() # ensure the model is in eval mode
generated_token_sequences = []
masks = []
num_sampling_steps = num_samples // device_batch_size # go sequentially to prevent OOMs
for sampling_step in range(num_sampling_steps):
seed = hash((step, example_idx, sampling_step)) & 0x7FFFFFFF # positive half of int32
with autocast_ctx:
generated_token_sequences_batch, masks_batch = engine.generate_batch(
tokens,
num_samples=device_batch_size,
max_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
seed=seed, # must make sure to change the seed for each sampling step
)
generated_token_sequences.extend(generated_token_sequences_batch)
masks.extend(masks_batch)
# Calculate the rewards for each sample
rewards = []
for sample_tokens in generated_token_sequences:
# Get just the generated tokens (after the prompt)
generated_tokens = sample_tokens[prefix_length:]
# Decode the generated response
generated_text = tokenizer.decode(generated_tokens)
# Calculate the reward
reward = train_task.reward(conversation, generated_text)
rewards.append(reward)
# Pad the sequences so that their lengths (in time) match
max_length = max(len(seq) for seq in generated_token_sequences)
padded_generated_token_sequences = [seq + [assistant_end] * (max_length - len(seq)) for seq in generated_token_sequences]
padded_masks = [mask + [0] * (max_length - len(mask)) for mask in masks]
# Stack up the sequences and masks into PyTorch tensors
ids = torch.tensor(padded_generated_token_sequences, dtype=torch.long, device=device)
mask_ids = torch.tensor(padded_masks, dtype=torch.long, device=device)
# Generate autoregressive inputs and targets to the Transformer
inputs = ids[:, :-1]
targets = ids[:, 1:].clone() # clone to avoid in-place modification:
targets[mask_ids[:, 1:] == 0] = -1 # <-- inplace modification right here. -1 is the ignore index
# NOTE also that the Engine returns mask=0 for BOTH the prompt tokens AND the tool use tokens,
# so we will (correctly) end up not training on the prompt tokens or the forced tool-use tokens.
rewards = torch.tensor(rewards, dtype=torch.float, device=device)
# Calculate the advantages by simply subtracting the mean (instead of z-score (x-mu)/sigma)
mu = rewards.mean()
advantages = rewards - mu
# yield inputs/targets as (B, T) of ids and rewards as (B,) of floats
yield generated_token_sequences, inputs, targets, rewards, advantages
# -----------------------------------------------------------------------------
# Simple evaluation loop for GSM8K pass@k
def run_gsm8k_eval(task, tokenizer, engine,
max_examples=None,
num_samples=1,
max_completion_tokens=256,
temperature=0.0,
top_k=50
):
"""
Evaluates GSM8K task and returns a list of records of evaluation outcomes.
In a distributed setting, all ranks cooperate but this function will NOT
do the reduction across ranks. This is the responsibility of the caller.
Because the evaluation can take a while, this function will yield records one by one.
"""
max_examples = min(max_examples, len(task)) if max_examples is not None else len(task)
for idx in range(ddp_rank, max_examples, ddp_world_size):
conversation = task[idx]
tokens = tokenizer.render_for_completion(conversation)
prefix_length = len(tokens)
# Generate k samples using batched generation inside the Engine
assert num_samples <= device_batch_size # usually this is true. we can add a loop if not...
generated_token_sequences, masks = engine.generate_batch(
tokens,
num_samples=num_samples,
max_tokens=max_completion_tokens,
temperature=temperature,
top_k=top_k
)
# Check each sample for correctness
outcomes = []
for sample_tokens in generated_token_sequences:
generated_tokens = sample_tokens[prefix_length:]
generated_text = tokenizer.decode(generated_tokens)
is_correct = task.evaluate(conversation, generated_text)
outcomes.append({
"is_correct": is_correct
})
# A bit bloated because I wanted to do more complex logging at one point.
record = {
"idx": idx,
"outcomes": outcomes,
}
yield record
# -----------------------------------------------------------------------------
# Training loop
# Init the optimizer
optimizers = model.setup_optimizers(
unembedding_lr=unembedding_lr,
embedding_lr=embedding_lr,
matrix_lr=matrix_lr,
weight_decay=weight_decay,
)
# Set the initial learning rate as a fraction of the base learning rate
for opt in optimizers:
for group in opt.param_groups:
group["lr"] = group["lr"] * init_lr_frac
group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later
# Learning rate scheduler: simple rampdown to zero over num_steps
def get_lr_multiplier(it):
lrm = 1.0 - it / num_steps
return lrm
# Calculate the number of examples each rank handles to achieve the desired examples_per_step
print0(f"Total sequences per step: {examples_per_step * num_samples}") # total batch size in sequences/step
assert examples_per_step % ddp_world_size == 0, "Desired examples per step must be divisible by the number of ranks"
examples_per_rank = examples_per_step // ddp_world_size # per GPU
print0(f"Calculated examples per rank: {examples_per_rank}")
# Kick off the training loop
batch_iterator = get_batch()
for step in range(num_steps):
# Evaluate the model once in a while and log to wandb
if step % eval_every == 0:
model.eval()
passk = torch.zeros(device_batch_size, device=device) # pass@k for k=1..device_batch_size
with autocast_ctx:
records_iter = run_gsm8k_eval(val_task, tokenizer, engine, num_samples=device_batch_size, max_examples=eval_examples, temperature=1.0)
records = list(records_iter) # collect all records
for k in range(1, device_batch_size + 1):
passk[k - 1] = sum(any(o["is_correct"] for o in r["outcomes"][:k]) for r in records)
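# pass@k here = fraction of problems where at least one of the first k samples is correct,
# e.g. per-problem outcomes [False, True, False, ...] contribute 0 to pass@1 but 1 to pass@2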
num_records = torch.tensor(len(records), dtype=torch.long, device=device)
if ddp:
dist.all_reduce(num_records, op=dist.ReduceOp.SUM)
dist.all_reduce(passk, op=dist.ReduceOp.SUM)
passk = passk / num_records.item() # normalize by the total number of records
print_passk = [f"Pass@{k}: {passk[k - 1].item():.4f}" for k in range(1, device_batch_size + 1)]
print0(f"Step {step} | {', '.join(print_passk)}")
log_passk = {f"pass@{k}": passk[k - 1].item() for k in range(1, device_batch_size + 1)}
wandb_run.log({
"step": step,
**log_passk,
})
# Forward/Backward on rollouts over multiple examples in the dataset
rewards_list = []
sequence_lengths = []
for example_step in range(examples_per_rank):
# Get one batch corresponding to one example in the training dataset
sequences_all, inputs_all, targets_all, rewards_all, advantages_all = next(batch_iterator)
# Evaluate the loss and gradients
model.train() # ensure the model is in train mode
# We need one more loop because we can never exceed the device_batch_size
assert inputs_all.size(0) % device_batch_size == 0
num_passes = inputs_all.size(0) // device_batch_size
for pass_idx in range(num_passes):
# Pluck out the batch for this pass
b0, b1 = pass_idx * device_batch_size, (pass_idx + 1) * device_batch_size
inputs = inputs_all[b0:b1]
targets = targets_all[b0:b1]
rewards = rewards_all[b0:b1]
advantages = advantages_all[b0:b1]
# Calculate log probabilities. Note that the loss calculates NLL = -logp, so we negate
with autocast_ctx:
_, loss2d = model(inputs, targets, loss_reduction='none')
logp = -loss2d.view_as(inputs) # (B, T)
# Calculate the PG objective. Note that ignore_index=-1 ensures that invalid tokens have loss 0.
pg_obj = (logp * advantages.unsqueeze(-1)).sum()
# normalize by the number of valid tokens, number of passes, and examples_per_rank
num_valid = (targets >= 0).sum().clamp(min=1)
pg_obj = pg_obj / (num_valid * num_passes * examples_per_rank)
# Note, there is no need to add PPO ratio+clip because we are on policy
# Finally, formulate the loss that we want to minimize (instead of objective we wish to maximize)
loss = -pg_obj
loss.backward()
print0(f"Step {step}/{num_steps} | Example step {example_step} | Pass {pass_idx} | loss: {loss.item():.6f} | Average reward: {rewards.mean().item()}")
# For logging
rewards_list.append(rewards_all.mean().item())
sequence_lengths.extend(len(seq) for seq in sequences_all)
# A bunch of logging for how the rollouts went this step
mean_reward = sum(rewards_list) / len(rewards_list)
mean_sequence_length = sum(sequence_lengths) / len(sequence_lengths)
if ddp: # aggregate across ranks
mean_reward_tensor = torch.tensor(mean_reward, dtype=torch.float, device=device)
mean_sequence_length_tensor = torch.tensor(mean_sequence_length, dtype=torch.float, device=device)
dist.all_reduce(mean_reward_tensor, op=dist.ReduceOp.AVG)
dist.all_reduce(mean_sequence_length_tensor, op=dist.ReduceOp.AVG)
mean_reward = mean_reward_tensor.item()
mean_sequence_length = mean_sequence_length_tensor.item()
print0(f"Step {step}/{num_steps} | Average reward: {mean_reward} | Average sequence length: {mean_sequence_length:.2f}")
wandb_run.log({
"step": step,
"reward": mean_reward,
"sequence_length": mean_sequence_length,
})
# Update the model parameters
lrm = get_lr_multiplier(step)
for opt in optimizers: # first set the learning rate
for group in opt.param_groups:
group["lr"] = group["initial_lr"] * lrm
for opt in optimizers: # then step the optimizers
opt.step()
model.zero_grad(set_to_none=True)
wandb_run.log({
"step": step,
"lrm": lrm,
})
# Master process saves the model once in a while. Skip first step. Save last step.
if master_process and ((step > 0 and step % save_every == 0) or step == num_steps - 1):
base_dir = get_base_dir()
depth = model.config.n_layer
model_tag = f"d{depth}" # base the model tag on the depth of the base model
checkpoint_dir = os.path.join(base_dir, "chatrl_checkpoints", model_tag)
model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
save_checkpoint(
checkpoint_dir,
step,
model.state_dict(),
None, # note: we don't bother to save the optimizer state
{
"model_config": model_config_kwargs,
}
)
print(f"✅ Saved model checkpoint to {checkpoint_dir}")
# Log to report
from nanochat.report import get_report
get_report().log(section="Chat RL", data=[
user_config, # CLI args
])
wandb_run.finish() # wandb run finish
compute_cleanup()

285
scripts-align/chat_sft.py Normal file
View File

@ -0,0 +1,285 @@
"""
Finetune a base model to be a chat model.
Run on one GPU e.g. for debugging:
python -m scripts.chat_sft
Or torchrun for training:
torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft
"""
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import wandb
import torch
import torch.distributed as dist
from contextlib import nullcontext
from nanochat.common import compute_init, compute_cleanup, get_base_dir, print0, DummyWandb, autodetect_device_type
from nanochat.checkpoint_manager import load_model
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.engine import Engine
from scripts.chat_eval import run_chat_eval
from tasks.common import TaskMixture
from tasks.arc import ARC
from tasks.gsm8k import GSM8K
from tasks.smoltalk import SmolTalk
from tasks.customjson import CustomJSON
from tasks.spellingbee import SimpleSpelling, SpellingBee
# -----------------------------------------------------------------------------
# SFT Hyperparameters
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
# input model options
source = "mid" # base|mid , which checkpoint to load the model from (base model or midtrained model)
model_tag = None # model tag to load the model from (base model or midtrained model)
step = None # step to load the model from (base model or midtrained model)
# compute/precision
device_type = "" # cuda|cpu|mps (empty => autodetect)
dtype = "bfloat16"
device_batch_size = 4 # max to avoid OOM
# optimization
num_epochs = 1
num_iterations = -1 # override number of iterations (-1 = disable, use num_epochs to derive it)
target_examples_per_step = 32
unembedding_lr = 0.004
embedding_lr = 0.2
matrix_lr = 0.02
weight_decay = 0.0
init_lr_frac = 0.02
# evaluation and logging thereof
eval_every = 100
eval_steps = 100
eval_metrics_every = 200
eval_metrics_max_problems = 1024
# now allow CLI to override the settings via the configurator lol
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
user_config = {k: globals()[k] for k in config_keys} # possibly useful for logging
# -----------------------------------------------------------------------------
# Compute init
device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0
ptdtype = torch.float32 if dtype == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-sft", name=run, config=user_config, save_code=True)
# Load the model and tokenizer
model, tokenizer, meta = load_model(source, device, phase="train", model_tag=model_tag, step=step)
orig_model = model # original, uncompiled model
# model = torch.compile(model, dynamic=True) # doesn't work super well because of variable lengths of inputs
engine = Engine(model, tokenizer) # will be used for inline model evaluation only
# -----------------------------------------------------------------------------
# Task data mixture we'll train on
identity_conversations_filepath = os.path.join(get_base_dir(), "identity_conversations.jsonl")
train_ds = TaskMixture([
ARC(subset="ARC-Easy", split="train"), # 2.3K rows
ARC(subset="ARC-Challenge", split="train"), # 1.1K rows
GSM8K(subset="main", split="train"), # 8K rows
SmolTalk(split="train", stop=10_000), # 10K rows of smoltalk
CustomJSON(filepath=identity_conversations_filepath), # 1K rows of synthetic identity conversations
SimpleSpelling(size=300, split="train"), # 300 rows of Simple Spelling (e.g. spell the word 'apple')
SpellingBee(size=300, split="train"), # 300 rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?)
]) # 2.3K + 1.1K + 8K + 10K + 1K + 0.3K + 0.3K = 23K rows
val_ds = SmolTalk(split="test") # general conversations, 24K rows (though we don't actually use all of it)
# -----------------------------------------------------------------------------
# DataLoader
def sft_data_generator(dataset, batch_size):
pad_token_id = tokenizer.encode_special("<|assistant_end|>") # using <|assistant_end|> as the pad token is ok; these positions are masked out in the loss
# prepares a list of tokenized conversations into a batch and yields
def collate_and_yield(batch):
nrows = len(batch)
ncols = max(len(ids) for ids, mask in batch) - 1 # seq of n creates inputs/targets of n-1
inputs = torch.full((nrows, ncols), pad_token_id, dtype=torch.long)
targets = torch.full((nrows, ncols), -1, dtype=torch.long) # -1 is ignore index
for i, (ids, mask) in enumerate(batch):
n = len(ids)
ids_tensor = torch.tensor(ids, dtype=torch.long)
inputs[i, :n-1] = ids_tensor[:-1]
# recall -1 is the ignore index, so mask out targets where mask is 0
row_targets = ids_tensor[1:]
# mask[1:] omits the mask for the BOS token, which is never a target atm so it's ok
mask_tensor = torch.tensor(mask[1:], dtype=torch.long)
row_targets[mask_tensor == 0] = -1 # mask out targets where mask is 0
targets[i, :n-1] = row_targets
inputs = inputs.to(device) # move to device
targets = targets.to(device)
return inputs, targets
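# worked example (hypothetical ids/mask): ids=[bos,u1,u2,a1,a2], mask=[0,0,0,1,1]
# => inputs=[bos,u1,u2,a1], targets=[-1,-1,a1,a2], so only the assistant tokens contribute to the loss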
# iterates over the dataset in epochs, tokenizes
batch = []
while True:
for i in range(ddp_rank, len(dataset), ddp_world_size):
doc = dataset[i]
ids, mask = tokenizer.render_conversation(doc)
batch.append((ids, mask))
if len(batch) == batch_size:
yield collate_and_yield(batch)
batch = []
examples_per_step = device_batch_size * ddp_world_size
print0(f"Target examples per step: {target_examples_per_step}")
print0(f"Device batch size: {device_batch_size}")
print0(f"Examples per step is device_batch_size * ddp_world_size: {examples_per_step}")
assert target_examples_per_step % examples_per_step == 0, "Target examples per step must be divisible by examples per step"
grad_accum_steps = target_examples_per_step // examples_per_step
print0(f"=> Setting grad accum steps: {grad_accum_steps}")
if num_iterations == -1:
# derive num_iterations from num_epochs and the size of the dataset
assert num_epochs > 0, "num_epochs must be positive if num_iterations is -1"
num_iterations = (len(train_ds) // target_examples_per_step) * num_epochs
train_loader = sft_data_generator(train_ds, batch_size=device_batch_size)
build_val_loader = lambda: sft_data_generator(val_ds, batch_size=device_batch_size)
# -----------------------------------------------------------------------------
# Initialize the Optimizer
optimizers = model.setup_optimizers(
unembedding_lr=unembedding_lr,
embedding_lr=embedding_lr,
matrix_lr=matrix_lr,
weight_decay=weight_decay,
)
# Set the initial learning rate as a fraction of the base learning rate
for opt in optimizers:
for group in opt.param_groups:
group["lr"] = group["lr"] * init_lr_frac
group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later
# -----------------------------------------------------------------------------
# Training loop
# Learning rate scheduler
def get_lr_multiplier(it):
lrm = 1.0 - it / num_iterations
return lrm
# Go!
step = 0
train_iter = iter(train_loader)
for step in range(num_iterations):
last_step = step == num_iterations - 1
# evaluate the validation loss
if last_step or step % eval_every == 0:
model.eval()
val_iter = iter(build_val_loader())
losses = []
for _ in range(eval_steps):
val_inputs, val_targets = next(val_iter)
with torch.no_grad(), autocast_ctx:
loss = model(val_inputs, val_targets)
losses.append(loss)
val_loss = torch.stack(losses).mean() # average over eval_steps
if ddp:
dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) # average over ranks
val_loss = val_loss.item()
print0(f"Step {step:05d} | Validation loss: {val_loss:.6f}")
wandb_run.log({
"step": step,
"val_loss": val_loss,
})
model.train()
# evaluate accuracy of the multiple choice tasks (which are quick to run)
if last_step or (step > 0 and step % eval_metrics_every == 0):
model.eval()
metrics = {}
with torch.no_grad(), autocast_ctx:
# note that because these run under no_grad, we can usually afford at least ~2X the batch size
metrics["mmlu_acc"] = run_chat_eval("MMLU", model, tokenizer, engine, batch_size=device_batch_size*2, max_problems=eval_metrics_max_problems)
metrics["arc_easy_acc"] = run_chat_eval("ARC-Easy", model, tokenizer, engine, batch_size=device_batch_size*2, max_problems=eval_metrics_max_problems)
metrics_str = ', '.join(f'{k}: {v:.6f}' for k, v in metrics.items())
print0(f"Step {step:05d} | {metrics_str}")
wandb_run.log({
"step": step,
**metrics,
})
model.train()
if last_step:
break
# evaluate the gradient
num_tokens = torch.tensor(0, device=device) # the number of "active" tokens of supervision seen
for micro_step in range(grad_accum_steps):
train_inputs, train_targets = next(train_iter)
with autocast_ctx:
loss = model(train_inputs, train_targets)
train_loss = loss.detach() # for logging
loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
loss.backward() # accumulate the gradient
num_tokens += (train_targets >= 0).sum()
if ddp:
dist.all_reduce(num_tokens, op=dist.ReduceOp.SUM) # sum over ranks
# learning rate scheduler
lrm = get_lr_multiplier(step)
for opt in optimizers:
for group in opt.param_groups:
group["lr"] = group["initial_lr"] * lrm
# step the optimizers
for opt in optimizers:
opt.step()
model.zero_grad(set_to_none=True)
# logging
train_loss_item = train_loss.item()
num_tokens_item = num_tokens.item()
print0(f"Step {step:05d}/{num_iterations:05d} | Training loss: {train_loss_item:.6f}| lrm: {lrm:.6f}| num_tokens: {num_tokens_item:,}")
wandb_run.log({
"step": step,
"lrm": lrm,
"train_loss": train_loss_item,
"num_tokens": num_tokens_item,
})
step += 1
# Save the model at the end of the run
if master_process:
base_dir = get_base_dir()
depth = model.config.n_layer
model_tag = f"d{depth}" # base the model tag on the depth of the base model
checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", model_tag)
model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
save_checkpoint(
checkpoint_dir,
step,
model.state_dict(),
None, # note: we don't bother to save the optimizer state
{
"step": step,
"val_loss": val_loss,
**metrics,
"model_config": model_config_kwargs,
}
)
print(f"✅ Saved model checkpoint to {checkpoint_dir}")
# Log to report
from nanochat.report import get_report
get_report().log(section="Chat SFT", data=[
user_config, # CLI args
{
"Training rows": len(train_ds),
"Number of iterations": num_iterations,
"Training loss": train_loss_item,
"Validation loss": val_loss,
},
])
# Cleanup
wandb_run.finish()
compute_cleanup()

415
scripts-align/chat_web.py Normal file
View File

@ -0,0 +1,415 @@
#!/usr/bin/env python3
"""
Unified web chat server - serves both UI and API from a single FastAPI instance.
Uses data parallelism to distribute requests across multiple GPUs. Each GPU loads
a full copy of the model, and incoming requests are distributed to available workers.
Launch examples:
- single available GPU (default)
python -m scripts.chat_web
- 4 GPUs
python -m scripts.chat_web --num-gpus 4
To chat, open the URL printed in the console. (If on cloud box, make sure to use public IP)
Endpoints:
GET / - Chat UI
POST /chat/completions - Chat API (streaming only)
GET /health - Health check with worker pool status
GET /stats - Worker pool statistics and GPU utilization
Abuse Prevention:
- Maximum 500 messages per request
- Maximum 8000 characters per message
- Maximum 32000 characters total conversation length
- Temperature clamped to 0.0-2.0
- Top-k clamped to 1-200
- Max tokens clamped to 1-4096
"""
import argparse
import json
import os
import torch
import asyncio
import logging
import random
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
from pydantic import BaseModel
from typing import List, Optional, AsyncGenerator
from dataclasses import dataclass
from contextlib import nullcontext
from nanochat.common import compute_init, autodetect_device_type
from nanochat.checkpoint_manager import load_model
from nanochat.engine import Engine
# Abuse prevention limits
MAX_MESSAGES_PER_REQUEST = 500
MAX_MESSAGE_LENGTH = 8000
MAX_TOTAL_CONVERSATION_LENGTH = 32000
MIN_TEMPERATURE = 0.0
MAX_TEMPERATURE = 2.0
MIN_TOP_K = 1
MAX_TOP_K = 200
MIN_MAX_TOKENS = 1
MAX_MAX_TOKENS = 4096
parser = argparse.ArgumentParser(description='NanoChat Web Server')
parser.add_argument('-n', '--num-gpus', type=int, default=1, help='Number of GPUs to use (default: 1)')
parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|mid|rl")
parser.add_argument('-t', '--temperature', type=float, default=0.8, help='Default temperature for generation')
parser.add_argument('-k', '--top-k', type=int, default=50, help='Default top-k sampling parameter')
parser.add_argument('-m', '--max-tokens', type=int, default=512, help='Default max tokens for generation')
parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
parser.add_argument('-p', '--port', type=int, default=8000, help='Port to run the server on')
parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to bind the server to')
args = parser.parse_args()
# Configure logging for conversation traffic
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(message)s',
datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
@dataclass
class Worker:
"""A worker with a model loaded on a specific GPU."""
gpu_id: int
device: torch.device
engine: Engine
tokenizer: object
autocast_ctx: torch.amp.autocast
class WorkerPool:
"""Pool of workers, each with a model replica on a different GPU."""
def __init__(self, num_gpus: Optional[int] = None):
if num_gpus is None:
if device_type == "cuda":
num_gpus = torch.cuda.device_count()
else:
num_gpus = 1 # e.g. cpu|mps
self.num_gpus = num_gpus
self.workers: List[Worker] = []
self.available_workers: asyncio.Queue = asyncio.Queue()
async def initialize(self, source: str, model_tag: Optional[str] = None, step: Optional[int] = None):
"""Load model on each GPU."""
print(f"Initializing worker pool with {self.num_gpus} GPUs...")
if self.num_gpus > 1:
assert device_type == "cuda", "Only CUDA supports multiple workers/GPUs. cpu|mps does not."
for gpu_id in range(self.num_gpus):
if device_type == "cuda":
device = torch.device(f"cuda:{gpu_id}")
print(f"Loading model on GPU {gpu_id}...")
else:
device = torch.device(device_type) # e.g. cpu|mps
print(f"Loading model on {device_type}...")
model, tokenizer, _ = load_model(source, device, phase="eval", model_tag=model_tag, step=step)
engine = Engine(model, tokenizer)
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
worker = Worker(
gpu_id=gpu_id,
device=device,
engine=engine,
tokenizer=tokenizer,
autocast_ctx=autocast_ctx
)
self.workers.append(worker)
await self.available_workers.put(worker)
print(f"All {self.num_gpus} workers initialized!")
async def acquire_worker(self) -> Worker:
"""Get an available worker from the pool."""
return await self.available_workers.get()
async def release_worker(self, worker: Worker):
"""Return a worker to the pool."""
await self.available_workers.put(worker)
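# Intended usage (the /chat/completions handler below follows this shape): acquire a worker,
# generate with it, and always put it back so other queued requests can proceed, e.g.
#
#   worker = await pool.acquire_worker()
#   try:
#       ...  # run worker.engine.generate(...) and stream the tokens
#   finally:
#       await pool.release_worker(worker)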
class ChatMessage(BaseModel):
role: str
content: str
class ChatRequest(BaseModel):
messages: List[ChatMessage]
temperature: Optional[float] = None
max_tokens: Optional[int] = None
top_k: Optional[int] = None
def validate_chat_request(request: ChatRequest):
"""Validate chat request to prevent abuse."""
# Check number of messages
if len(request.messages) == 0:
raise HTTPException(status_code=400, detail="At least one message is required")
if len(request.messages) > MAX_MESSAGES_PER_REQUEST:
raise HTTPException(
status_code=400,
detail=f"Too many messages. Maximum {MAX_MESSAGES_PER_REQUEST} messages allowed per request"
)
# Check individual message lengths and total conversation length
total_length = 0
for i, message in enumerate(request.messages):
if not message.content:
raise HTTPException(status_code=400, detail=f"Message {i} has empty content")
msg_length = len(message.content)
if msg_length > MAX_MESSAGE_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Message {i} is too long. Maximum {MAX_MESSAGE_LENGTH} characters allowed per message"
)
total_length += msg_length
if total_length > MAX_TOTAL_CONVERSATION_LENGTH:
raise HTTPException(
status_code=400,
detail=f"Total conversation is too long. Maximum {MAX_TOTAL_CONVERSATION_LENGTH} characters allowed"
)
# Validate role values
for i, message in enumerate(request.messages):
if message.role not in ["user", "assistant"]:
raise HTTPException(
status_code=400,
detail=f"Message {i} has invalid role. Must be 'user', 'assistant', or 'system'"
)
# Validate temperature
if request.temperature is not None:
if not (MIN_TEMPERATURE <= request.temperature <= MAX_TEMPERATURE):
raise HTTPException(
status_code=400,
detail=f"Temperature must be between {MIN_TEMPERATURE} and {MAX_TEMPERATURE}"
)
# Validate top_k
if request.top_k is not None:
if not (MIN_TOP_K <= request.top_k <= MAX_TOP_K):
raise HTTPException(
status_code=400,
detail=f"top_k must be between {MIN_TOP_K} and {MAX_TOP_K}"
)
# Validate max_tokens
if request.max_tokens is not None:
if not (MIN_MAX_TOKENS <= request.max_tokens <= MAX_MAX_TOKENS):
raise HTTPException(
status_code=400,
detail=f"max_tokens must be between {MIN_MAX_TOKENS} and {MAX_MAX_TOKENS}"
)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Load models on all GPUs on startup."""
print("Loading nanochat models across GPUs...")
app.state.worker_pool = WorkerPool(num_gpus=args.num_gpus)
await app.state.worker_pool.initialize(args.source, model_tag=args.model_tag, step=args.step)
print(f"Server ready at http://localhost:{args.port}")
yield
app = FastAPI(lifespan=lifespan)
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
@app.get("/")
async def root():
"""Serve the chat UI."""
ui_html_path = os.path.join("nanochat", "ui.html")
with open(ui_html_path, "r", encoding="utf-8") as f:
html_content = f.read()
# Replace the API_URL to use the same origin
html_content = html_content.replace(
"const API_URL = `http://${window.location.hostname}:8000`;",
"const API_URL = '';"
)
return HTMLResponse(content=html_content)
@app.get("/logo.svg")
async def logo():
"""Serve the NanoChat logo for favicon and header."""
logo_path = os.path.join("nanochat", "logo.svg")
return FileResponse(logo_path, media_type="image/svg+xml")
async def generate_stream(
worker: Worker,
tokens,
temperature=None,
max_new_tokens=None,
top_k=None
) -> AsyncGenerator[str, None]:
"""Generate assistant response with streaming."""
temperature = temperature if temperature is not None else args.temperature
max_new_tokens = max_new_tokens if max_new_tokens is not None else args.max_tokens
top_k = top_k if top_k is not None else args.top_k
assistant_end = worker.tokenizer.encode_special("<|assistant_end|>")
bos = worker.tokenizer.get_bos_token_id()
# Accumulate tokens to properly handle multi-byte UTF-8 characters (like emojis)
accumulated_tokens = []
# Track the last complete UTF-8 string (without replacement characters)
last_clean_text = ""
with worker.autocast_ctx:
for token_column, token_masks in worker.engine.generate(
tokens,
num_samples=1,
max_tokens=max_new_tokens,
temperature=temperature,
top_k=top_k,
seed=random.randint(0, 2**31 - 1)
):
token = token_column[0]
# Stopping criteria
if token == assistant_end or token == bos:
break
# Append the token to sequence
accumulated_tokens.append(token)
# Decode all accumulated tokens to get proper UTF-8 handling
# Note that decode is a quite efficient operation, basically table lookup and string concat
current_text = worker.tokenizer.decode(accumulated_tokens)
# Only emit text if it doesn't end with a replacement character
# This ensures we don't emit incomplete UTF-8 sequences
if not current_text.endswith('\ufffd'):
# Extract only the new text since last clean decode
new_text = current_text[len(last_clean_text):]
if new_text: # Only yield if there's new content
yield f"data: {json.dumps({'token': new_text, 'gpu': worker.gpu_id}, ensure_ascii=False)}\n\n"
last_clean_text = current_text
yield f"data: {json.dumps({'done': True})}\n\n"
@app.post("/chat/completions")
async def chat_completions(request: ChatRequest):
"""Chat completion endpoint (streaming only) - uses worker pool for multi-GPU."""
# Basic validation to prevent abuse
validate_chat_request(request)
# Log incoming conversation to console
logger.info("="*20)
for i, message in enumerate(request.messages):
logger.info(f"[{message.role.upper()}]: {message.content}")
logger.info("-"*20)
# Acquire a worker from the pool (will wait if all are busy)
worker_pool = app.state.worker_pool
worker = await worker_pool.acquire_worker()
try:
# Build conversation tokens
bos = worker.tokenizer.get_bos_token_id()
user_start = worker.tokenizer.encode_special("<|user_start|>")
user_end = worker.tokenizer.encode_special("<|user_end|>")
assistant_start = worker.tokenizer.encode_special("<|assistant_start|>")
assistant_end = worker.tokenizer.encode_special("<|assistant_end|>")
conversation_tokens = [bos]
for message in request.messages:
if message.role == "user":
conversation_tokens.append(user_start)
conversation_tokens.extend(worker.tokenizer.encode(message.content))
conversation_tokens.append(user_end)
elif message.role == "assistant":
conversation_tokens.append(assistant_start)
conversation_tokens.extend(worker.tokenizer.encode(message.content))
conversation_tokens.append(assistant_end)
conversation_tokens.append(assistant_start)
# Streaming response with worker release after completion
response_tokens = []
async def stream_and_release():
try:
async for chunk in generate_stream(
worker,
conversation_tokens,
temperature=request.temperature,
max_new_tokens=request.max_tokens,
top_k=request.top_k
):
# Accumulate response for logging
chunk_data = json.loads(chunk.replace("data: ", "").strip())
if "token" in chunk_data:
response_tokens.append(chunk_data["token"])
yield chunk
finally:
# Log the assistant response to console
full_response = "".join(response_tokens)
logger.info(f"[ASSISTANT] (GPU {worker.gpu_id}): {full_response}")
logger.info("="*20)
# Release worker back to pool after streaming is done
await worker_pool.release_worker(worker)
return StreamingResponse(
stream_and_release(),
media_type="text/event-stream"
)
except Exception as e:
# Make sure to release worker even on error
await worker_pool.release_worker(worker)
raise e
@app.get("/health")
async def health():
"""Health check endpoint."""
worker_pool = getattr(app.state, 'worker_pool', None)
return {
"status": "ok",
"ready": worker_pool is not None and len(worker_pool.workers) > 0,
"num_gpus": worker_pool.num_gpus if worker_pool else 0,
"available_workers": worker_pool.available_workers.qsize() if worker_pool else 0
}
@app.get("/stats")
async def stats():
"""Get worker pool statistics."""
worker_pool = app.state.worker_pool
return {
"total_workers": len(worker_pool.workers),
"available_workers": worker_pool.available_workers.qsize(),
"busy_workers": len(worker_pool.workers) - worker_pool.available_workers.qsize(),
"workers": [
{
"gpu_id": w.gpu_id,
"device": str(w.device)
} for w in worker_pool.workers
]
}
if __name__ == "__main__":
import uvicorn
print(f"Starting NanoChat Web Server")
print(f"Temperature: {args.temperature}, Top-k: {args.top_k}, Max tokens: {args.max_tokens}")
uvicorn.run(app, host=args.host, port=args.port)

311
scripts-align/mid_train.py Normal file

@ -0,0 +1,311 @@
"""
Midtrain the model. Same as pretraining but simpler.
Run as:
python -m scripts.mid_train
Or with torchrun for distributed training:
torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=16
"""
from collections import deque
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import time
import wandb
import torch
from contextlib import nullcontext
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type
from nanochat.tokenizer import get_token_bytes
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.loss_eval import evaluate_bpb
from nanochat.checkpoint_manager import load_model
import torch.distributed as dist
from tasks.common import TaskMixture
from tasks.gsm8k import GSM8K
from tasks.mmlu import MMLU
from tasks.smoltalk import SmolTalk
from tasks.customjson import CustomJSON
from tasks.spellingbee import SimpleSpelling, SpellingBee
# -----------------------------------------------------------------------------
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
device_type = "" # cuda|cpu|mps (empty => autodetect)
model_tag = None # model tag to load the model from (base model or midtrained model)
step = None # step to load the model from (base model or midtrained model)
dtype = "bfloat16"
num_iterations = -1 # explicit number of steps of the optimization (-1 = disable)
max_seq_len = 2048
device_batch_size = 32
unembedding_lr = 0.004
embedding_lr = 0.2
matrix_lr = 0.02
init_lr_frac = 1.0 # initial learning rate is this fraction of the base learning rate
weight_decay = 0.0
eval_every = 150 # -1 = disable
eval_tokens = 20*524288
total_batch_size = 524288
dry_run = 0 # dry_run=1 is for experiments: we will log to wandb but we won't write checkpoints or report
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
user_config = {k: globals()[k] for k in config_keys} # possibly useful for logging
# -----------------------------------------------------------------------------
# Compute init
device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-mid", name=run, config=user_config)
# Load the model and tokenizer
model, tokenizer, meta = load_model("base", device, phase="train", model_tag=model_tag, step=step)
pretrain_batch_size = meta.get("device_batch_size", None)
if pretrain_batch_size is not None and device_batch_size > pretrain_batch_size:
print0(f"FOOTGUN WARNING: base model training used device_batch_size {pretrain_batch_size}, did you pass in a good --device_batch_size to this script?")
orig_model = model
model = torch.compile(model, dynamic=False)
depth = model.config.n_layer
num_flops_per_token = model.estimate_flops()
tokens_per_fwdbwd = device_batch_size * max_seq_len # tokens per iteration for a single rank
world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
assert total_batch_size % world_tokens_per_fwdbwd == 0
grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd
print0(f"Tokens / micro-batch / rank: {device_batch_size} x {max_seq_len} = {tokens_per_fwdbwd:,}")
print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
token_bytes = get_token_bytes(device=device)
# Initialize the Optimizer (Muon for Linear layers, AdamW for embedding and lm_head)
optimizers = model.setup_optimizers(unembedding_lr=unembedding_lr, embedding_lr=embedding_lr, matrix_lr=matrix_lr, weight_decay=weight_decay)
adamw_optimizer, muon_optimizer = optimizers
# Override the initial learning rate as a fraction of the base learning rate
for opt in optimizers:
for group in opt.param_groups:
group["lr"] = group["lr"] * init_lr_frac
group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later
# Midtraining data mixture and DataLoader
base_dir = get_base_dir()
identity_conversations_filepath = os.path.join(base_dir, "identity_conversations.jsonl")
train_dataset = TaskMixture([
SmolTalk(split="train"), # 460K rows of general conversations
MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple choice problems drawn from ARC, MC_TEST, OBQA, RACE
GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use
CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations
CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these
SimpleSpelling(size=200000, split="train"), # 200K rows of Simple Spelling (e.g. spell the word 'apple')
SpellingBee(size=80000, split="train"), # 80K rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?)
]) # total: 460K + 100K + 8K + 2K + 200K + 80K = 850K rows
val_dataset = TaskMixture([
SmolTalk(split="test"), # 24K rows in test set
MMLU(subset="all", split="test", stop=5200), # 14K rows in test set, use only 5.2K to match the train ratios
GSM8K(subset="main", split="test", stop=420), # 1.32K rows in test set, use only 420 to match the train ratios
]) # total used after the stops above: 24K + 5.2K + 0.42K ≈ 29.6K rows
# DataLoader is defined here, it emits inputs, targets : 2D tensors of shape (device_batch_size, max_seq_len)
# A big problem is that we don't know the final num_iterations in advance. So we create
# these two global variables and update them from within the data generator.
last_step = False # we will toggle this to True when we reach the end of the dataset
approx_progress = 0.0 # will go from 0 to 1 over the course of the epoch
def mid_data_generator(split):
global last_step, approx_progress
assert split in {"train", "val"}, "split must be 'train' or 'val'"
dataset = train_dataset if split == "train" else val_dataset
dataset_size = len(dataset)
assert dataset_size > 0
needed_tokens = device_batch_size * max_seq_len + 1 # to form one training batch of inputs,targets
token_buffer = deque()
# CUDA supports memory pinning for faster transfers between CPU and GPU:
scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=(device_type == "cuda"))
cursor = ddp_rank # increments by ddp_world_size each time, so each rank processes unique documents
it = 0 # iteration counter
while True:
# Accumulate enough tokens for one iteration before yielding
while len(token_buffer) < needed_tokens:
conversation = dataset[cursor]
ids, _ = tokenizer.render_conversation(conversation)
token_buffer.extend(ids)
cursor += ddp_world_size
if cursor >= dataset_size:
cursor -= dataset_size # wrap around for another epoch
if split == "train":
last_step = True # toggle last_step to True, which will terminate the training loop
# Stopping condition to respect num_iterations, if given
it += 1
if num_iterations > 0 and it >= num_iterations:
last_step = True # toggle last_step to True, which will terminate the training loop
# Build up inputs/targets and yield
for i in range(needed_tokens):
scratch[i] = token_buffer.popleft()
inputs_cpu = scratch[:-1].to(dtype=torch.int32)
targets_cpu = scratch[1:]
inputs = inputs_cpu.view(device_batch_size, max_seq_len).to(device=device, dtype=torch.int32, non_blocking=True)
targets = targets_cpu.view(device_batch_size, max_seq_len).to(device=device, dtype=torch.int64, non_blocking=True)
if split == "train":
if num_iterations > 0:
approx_progress = it / num_iterations # calculate progress from the max number of iterations
else:
approx_progress = cursor / dataset_size # approximate progress as a fraction of the dataset
yield inputs, targets
train_loader = mid_data_generator("train")
build_val_loader = lambda: mid_data_generator("val")
progress = 0 # will go from 0 to 1 over the course of the epoch
# Learning rate scheduler
def get_lr_multiplier(progress):
# first 80% of training: no decay, then linearly ramp down to 0.
return 1 if progress < 0.8 else 1 - (progress - 0.8) / 0.2
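# For example: at 50% progress the multiplier is 1.0, at 90% it is 1 - (0.9 - 0.8) / 0.2 = 0.5,
# and it reaches 0.0 exactly at the end of the epoch.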
# Momentum scheduler for Muon optimizer
def get_muon_momentum(it):
frac = min(it / 300, 1)
momentum = (1 - frac) * 0.85 + frac * 0.95
return momentum
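# For example: at it = 150, frac = 0.5 and momentum = 0.5 * 0.85 + 0.5 * 0.95 = 0.90.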
# -----------------------------------------------------------------------------
# Training loop
x, y = next(train_loader) # prefetch the very first batch of data
min_val_bpb = float("inf")
smooth_train_loss = 0 # EMA of training loss
ema_beta = 0.9 # EMA decay factor
total_training_time = 0 # total wall-clock time of training
step = 0
while True:
flops_so_far = num_flops_per_token * total_batch_size * step
# Synchronize last_step across all ranks to avoid hangs in the distributed setting
if ddp:
last_step_tensor = torch.tensor(last_step, dtype=torch.int32, device=device)
dist.all_reduce(last_step_tensor, op=dist.ReduceOp.MAX)
last_step = bool(last_step_tensor.item())
# once in a while: evaluate the val bpb (all ranks participate)
if eval_every > 0 and (last_step or step % eval_every == 0):
model.eval()
val_loader = build_val_loader()
eval_steps = eval_tokens // (device_batch_size * max_seq_len * ddp_world_size)
with autocast_ctx:
val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes)
print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}")
if val_bpb < min_val_bpb:
min_val_bpb = val_bpb
wandb_run.log({
"step": step,
"total_training_flops": flops_so_far,
"total_training_time": total_training_time,
"val/bpb": val_bpb,
})
model.train()
# save checkpoint at the end of the run (only on master process)
if master_process and last_step and not dry_run:
output_dirname = f"d{depth}" # e.g. d12
checkpoint_dir = os.path.join(base_dir, "mid_checkpoints", output_dirname)
save_checkpoint(
checkpoint_dir,
step,
orig_model.state_dict(),
[opt.state_dict() for opt in optimizers], # TODO: make sure saving across ranks is done correctly
{
"step": step,
"val_bpb": val_bpb, # loss at last step
"model_config": {
"sequence_len": max_seq_len,
"vocab_size": tokenizer.get_vocab_size(),
"n_layer": depth,
"n_head": model.config.n_head,
"n_kv_head": model.config.n_kv_head,
"n_embd": model.config.n_embd,
},
"user_config": user_config, # inputs to the training script
}
)
if last_step:
break
# -------------------------------------------------------------------------
# single training step
# evaluate the gradient
synchronize()
t0 = time.time()
for micro_step in range(grad_accum_steps):
with autocast_ctx:
loss = model(x, y)
train_loss = loss.detach() # for logging
loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
loss.backward()
x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
progress = max(progress, approx_progress) # only increase progress monotonically
# step the optimizers
lrm = get_lr_multiplier(progress)
for opt in optimizers:
for group in opt.param_groups:
group["lr"] = group["initial_lr"] * lrm
muon_momentum = get_muon_momentum(step)
for group in muon_optimizer.param_groups:
group["momentum"] = muon_momentum
for opt in optimizers:
opt.step()
model.zero_grad(set_to_none=True)
synchronize()
t1 = time.time()
dt = t1 - t0
# -------------------------------------------------------------------------
# State
step += 1
# logging
smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
pct_done = 100 * progress
tok_per_sec = int(total_batch_size / dt)
flops_per_sec = num_flops_per_token * total_batch_size / dt
promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
if step > 10:
total_training_time += dt # only count the time after the first 10 steps
print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m")
if step % 10 == 0:
wandb_run.log({
"step": step,
"total_training_flops": flops_so_far,
"total_training_time": total_training_time,
"train/loss": debiased_smooth_loss,
"train/lrm": lrm,
"train/dt": dt,
"train/tok_per_sec": tok_per_sec,
"train/mfu": mfu,
})
# print a few more stats
print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
print0(f"Total training time: {total_training_time/60:.2f}m")
print0(f"Minimum validation bpb: {min_val_bpb:.4f}")
# Log to report
if not dry_run:
from nanochat.report import get_report
get_report().log(section="Midtraining", data=[
user_config, # CLI args
{ # stats about the training setup
"Number of iterations": step,
"DDP world size": ddp_world_size,
},
{ # stats about training outcomes
"Minimum validation bpb": min_val_bpb,
}
])
# cleanup
wandb_run.finish() # wandb run finish
compute_cleanup()

265
scripts-align/tok_eval.py Normal file

@ -0,0 +1,265 @@
"""
Evaluate compression ratio of the tokenizer.
"""
from nanochat.tokenizer import get_tokenizer, RustBPETokenizer
from nanochat.dataset import parquets_iter_batched
# Random text I got from a random website this morning
news_text = r"""
(Washington, D.C., July 9, 2025) - Yesterday, Mexico's National Service of Agro-Alimentary Health, Safety, and Quality (SENASICA) reported a new case of New World Screwworm (NWS) in Ixhuatlan de Madero, Veracruz in Mexico, which is approximately 160 miles northward of the current sterile fly dispersal grid, on the eastern side of the country and 370 miles south of the U.S./Mexico border. This new northward detection comes approximately two months after northern detections were reported in Oaxaca and Veracruz, less than 700 miles away from the U.S. border, which triggered the closure of our ports to Mexican cattle, bison, and horses on May 11, 2025.
While USDA announced a risk-based phased port re-opening strategy for cattle, bison, and equine from Mexico beginning as early as July 7, 2025, this newly reported NWS case raises significant concern about the previously reported information shared by Mexican officials and severely compromises the outlined port reopening schedule of five ports from July 7-September 15. Therefore, in order to protect American livestock and our nation's food supply, Secretary Rollins has ordered the closure of livestock trade through southern ports of entry effective immediately.
"The United States has promised to be vigilant and after detecting this new NWS case, we are pausing the planned port reopenings to further quarantine and target this deadly pest in Mexico. We must see additional progress combatting NWS in Veracruz and other nearby Mexican states in order to reopen livestock ports along the Southern border," said U.S. Secretary of Agriculture Brooke L. Rollins. "Thanks to the aggressive monitoring by USDA staff in the U.S. and in Mexico, we have been able to take quick and decisive action to respond to the spread of this deadly pest."
""".strip()
# Random Korean text (to test non-English compression)
korean_text = r"""
정직한 사실 위에, 공정한 시선을 더하다
Herald Korea Times
헤럴드코리아타임즈는 정치, 경제, 사회, 문화 한국 사회 전반의 주요 이슈를 심도 있게 다루는 종합 온라인 신문사입니다.
우리는 단순히 뉴스를 전달하는 것이 아니라, 사실(Fact) 기반한 양측의 시각을 균형 있게 조명하며, 독자 여러분이 스스로 판단할 있는 정보의 균형 제공합니다.
한국 언론의 오랜 문제로 지적되어 정치적 편향, 이념적 왜곡에서 벗어나
오직 정직함과 공정함을 원칙으로 삼는 언론을 지향합니다.
어느 한쪽의 주장만을 확대하거나 감추지 않고,
**모든 쟁점에 대해 무엇이 쟁점인지, 누가 무엇을 주장하는지, 사실은 무엇인지** 명확히 전달하는 집중합니다.
""".strip()
# Random piece of code
code_text = r"""
class BasicTokenizer(Tokenizer):
def __init__(self):
super().__init__()
def train(self, text, vocab_size, verbose=False):
assert vocab_size >= 256
num_merges = vocab_size - 256
# input text preprocessing
text_bytes = text.encode("utf-8") # raw bytes
ids = list(text_bytes) # list of integers in range 0..255
# iteratively merge the most common pairs to create new tokens
merges = {} # (int, int) -> int
vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
for i in range(num_merges):
# count up the number of times every consecutive pair appears
stats = get_stats(ids)
# find the pair with the highest count
pair = max(stats, key=stats.get)
# mint a new token: assign it the next available id
idx = 256 + i
# replace all occurrences of pair in ids with idx
ids = merge(ids, pair, idx)
# save the merge
merges[pair] = idx
vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
# prints
if verbose:
print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
""".strip()
math_text = r"""
\documentclass[12pt]{article}
\usepackage{amsmath,amsthm,amssymb}
\usepackage[margin=1in]{geometry}
\newtheorem{theorem}{Theorem}
\newtheorem*{remark}{Remark}
\begin{document}
\begin{center}
{\Large A Cute Identity: The Sum of Cubes is a Square}
\end{center}
\begin{theorem}
For every integer $n \ge 1$,
\[
\sum_{k=1}^{n} k^{3} \;=\; \left(\frac{n(n+1)}{2}\right)^{2}.
\]
\end{theorem}
\begin{proof}[Proof 1 (Induction)]
Let $S(n) = \sum_{k=1}^{n} k^3$. For $n=1$, $S(1)=1=(1\cdot 2/2)^2$, so the base case holds.
Assume $S(n)=\big(\tfrac{n(n+1)}{2}\big)^2$ for some $n\ge 1$.
Then
\[
S(n+1)
= S(n) + (n+1)^3
= \left(\frac{n(n+1)}{2}\right)^2 + (n+1)^3.
\]
Factor out $(n+1)^2$:
\[
S(n+1)
= (n+1)^2\left( \frac{n^2}{4} + (n+1) \right)
= (n+1)^2\left( \frac{n^2 + 4n + 4}{4} \right)
= (n+1)^2\left( \frac{(n+2)^2}{4} \right).
\]
Thus
\[
S(n+1)=\left(\frac{(n+1)(n+2)}{2}\right)^2,
\]
which matches the claimed formula with $n$ replaced by $n+1$. By induction, the identity holds for all $n\ge 1$.
\end{proof}
\begin{proof}[Proof 2 (Algebraic telescoping)]
Recall the binomial identity
\[
(k+1)^4 - k^4 = 4k^3 + 6k^2 + 4k + 1.
\]
Summing both sides from $k=0$ to $n$ telescopes:
\[
(n+1)^4 - 0^4
= \sum_{k=0}^{n}\big(4k^3 + 6k^2 + 4k + 1\big)
= 4\sum_{k=1}^{n}k^3 + 6\sum_{k=1}^{n}k^2 + 4\sum_{k=1}^{n}k + (n+1).
\]
Using the standard sums
\[
\sum_{k=1}^{n}k = \frac{n(n+1)}{2}
\quad\text{and}\quad
\sum_{k=1}^{n}k^2 = \frac{n(n+1)(2n+1)}{6},
\]
solve for $\sum_{k=1}^{n}k^3$ to get
\[
\sum_{k=1}^{n}k^3 = \left(\frac{n(n+1)}{2}\right)^2.
\]
\end{proof}
\begin{remark}
Geometrically, the identity says: ``adding up $1^3,2^3,\dots,n^3$ builds a perfect square, namely the square of the $n$th triangular number.'' This is why one sometimes calls it the \emph{sum-of-cubes is a square} phenomenon.
\end{remark}
\end{document}
""".strip()
science_text = r"""
Photosynthesis is a photochemical energy transduction process in which light-harvesting pigment-protein complexes within the thylakoid membranes of oxygenic phototrophs absorb photons and initiate charge separation at the reaction center, driving the linear electron transport chain from water to NADP via photosystem II, the cytochrome b₆f complex, and photosystem I, concomitantly generating a trans-thylakoid proton motive force utilized by chloroplastic ATP synthase. The light-dependent reactions produce ATP and NADPH, which fuel the Calvin-Benson-Bassham cycle in the stroma, wherein ribulose-1,5-bisphosphate is carboxylated by ribulose-1,5-bisphosphate carboxylase/oxygenase (RuBisCO) to form 3-phosphoglycerate, subsequently reduced and regenerated through a series of enzymatic steps, enabling net assimilation of CO₂ into triose phosphates and ultimately carbohydrates. This process is tightly regulated by photoprotective mechanisms, redox feedback, and metabolite flux, representing a central biochemical pathway coupling solar energy capture to the biosphere's primary productivity.
""".strip()
# The tokenizer was trained on data from earlier shards, so it has seen this data
train_docs = next(parquets_iter_batched(split="train"))
train_text = "\n".join(train_docs)
val_docs = next(parquets_iter_batched(split="val"))
val_text = "\n".join(val_docs)
all_text = [
("news", news_text),
("korean", korean_text),
("code", code_text),
("math", math_text),
("science", science_text),
("fwe-train", train_text),
]
if val_text:
all_text.append(("fwe-val", val_text))
# Try out current default compared to GPT-2 and GPT-4 tokenizers
tokenizer_results = {}
vocab_sizes = {}
for tokenizer_name in ["gpt2", "gpt4", "ours"]:
if tokenizer_name == "gpt2":
tokenizer = RustBPETokenizer.from_pretrained("gpt2") # gpt-2 base model tokenizer
elif tokenizer_name == "gpt4":
tokenizer = RustBPETokenizer.from_pretrained("cl100k_base") # gpt-4 base model tokenizer
else:
tokenizer = get_tokenizer()
vocab_sizes[tokenizer_name] = tokenizer.get_vocab_size()
tokenizer_results[tokenizer_name] = {}
for name, text in all_text:
encoded = tokenizer.encode(text)
decoded = tokenizer.decode(encoded)
assert decoded == text
encoded_bytes = text.encode('utf-8')
ratio = len(encoded_bytes) / len(encoded)
tokenizer_results[tokenizer_name][name] = {
'bytes': len(encoded_bytes),
'tokens': len(encoded),
'ratio': ratio
}
# ANSI color codes
GREEN = '\033[92m'
RED = '\033[91m'
RESET = '\033[0m'
# Print vocab sizes
print(f"\nVocab sizes:")
print(f"GPT-2: {vocab_sizes['gpt2']}")
print(f"GPT-4: {vocab_sizes['gpt4']}")
print(f"Ours: {vocab_sizes['ours']}")
def print_comparison(baseline_name, baseline_results, ours_results, all_text):
"""Print comparison table between baseline tokenizer and ours."""
print(f"\nComparison with {baseline_name}:")
print("=" * 95)
print(f"{'Text Type':<10} {'Bytes':<8} {baseline_name:<15} {'Ours':<15} {'Relative':<12} {'Better':<10}")
print(f"{'':10} {'':8} {'Tokens':<7} {'Ratio':<7} {'Tokens':<7} {'Ratio':<7} {'Diff %':<12}")
print("-" * 95)
for name, text in all_text:
baseline_data = baseline_results[name]
ours_data = ours_results[name]
# Calculate relative difference (positive means ours is better, negative means worse)
# Using tokens: fewer tokens is better, so we calculate (baseline_tokens - ours_tokens) / baseline_tokens
relative_diff = ((baseline_data['tokens'] - ours_data['tokens']) / baseline_data['tokens']) * 100
# Determine which has better compression (higher ratio = better)
if baseline_data['ratio'] > ours_data['ratio']:
baseline_color, ours_color = GREEN, RED
better = baseline_name
diff_color = RED
elif ours_data['ratio'] > baseline_data['ratio']:
baseline_color, ours_color = RED, GREEN
better = "Ours"
diff_color = GREEN
else:
baseline_color, ours_color = "", ""
better = "Tie"
diff_color = ""
print(f"{name:<10} {baseline_data['bytes']:<8} "
f"{baseline_color}{baseline_data['tokens']:<7}{RESET} "
f"{baseline_color}{baseline_data['ratio']:<7.2f}{RESET} "
f"{ours_color}{ours_data['tokens']:<7}{RESET} "
f"{ours_color}{ours_data['ratio']:<7.2f}{RESET} "
f"{diff_color}{relative_diff:+7.1f}%{RESET} "
f"{better:<10}")
# Print comparisons
print_comparison("GPT-2", tokenizer_results['gpt2'], tokenizer_results['ours'], all_text)
print_comparison("GPT-4", tokenizer_results['gpt4'], tokenizer_results['ours'], all_text)
# Log to report
from nanochat.report import get_report
lines = []
for baseline_name in ["GPT-2", "GPT-4"]:
baseline_key = baseline_name.lower().replace('-', '')
baseline_results = tokenizer_results[baseline_key]
ours_results = tokenizer_results['ours']
lines.append(f"### Comparison with {baseline_name}")
lines.append("")
lines.append("| Text Type | Bytes | " + baseline_name + " Tokens | " + baseline_name + " Ratio | Ours Tokens | Ours Ratio | Relative Diff % |")
lines.append("|-----------|-------|--------------|--------------|-------------|------------|-----------------|")
for name, text in all_text:
baseline_data = baseline_results[name]
ours_data = ours_results[name]
relative_diff = ((baseline_data['tokens'] - ours_data['tokens']) / baseline_data['tokens']) * 100
lines.append(f"| {name} | {baseline_data['bytes']} | {baseline_data['tokens']} | {baseline_data['ratio']:.2f} | {ours_data['tokens']} | {ours_data['ratio']:.2f} | {relative_diff:+.1f}% |")
lines.append("")
report_markdown = "\n".join(lines)
get_report().log(section="Tokenizer evaluation", data=[
report_markdown,
])

106
scripts-align/tok_train.py Normal file

@ -0,0 +1,106 @@
"""
Train a tokenizer using the HuggingFace Tokenizers library.
In the style of GPT-4 tokenizer.
"""
import os
import time
import argparse
import torch
from nanochat.tokenizer import RustBPETokenizer
from nanochat.common import get_base_dir
from nanochat.dataset import parquets_iter_batched
# -----------------------------------------------------------------------------
# Parse command line arguments
parser = argparse.ArgumentParser(description='Train a BPE tokenizer')
parser.add_argument('--max_chars', type=int, default=10_000_000_000, help='Maximum characters to train on (default: 10B)')
parser.add_argument('--doc_cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)')
parser.add_argument('--vocab_size', type=int, default=65536, help='Vocabulary size (default: 65536 = 2^16)')
args = parser.parse_args()
print(f"max_chars: {args.max_chars:,}")
print(f"doc_cap: {args.doc_cap:,}")
print(f"vocab_size: {args.vocab_size:,}")
# -----------------------------------------------------------------------------
# Text iterator
def text_iterator():
"""
1) Flatten the batches into a single iterator
2) Crop every document to args.doc_cap characters
3) Break when we've seen args.max_chars characters
"""
nchars = 0
for batch in parquets_iter_batched(split="train"):
for doc in batch:
doc_text = doc
if len(doc_text) > args.doc_cap:
doc_text = doc_text[:args.doc_cap]
nchars += len(doc_text)
yield doc_text
if nchars > args.max_chars:
return
text_iter = text_iterator()
# -----------------------------------------------------------------------------
# Train the tokenizer
t0 = time.time()
tokenizer = RustBPETokenizer.train_from_iterator(text_iter, args.vocab_size)
t1 = time.time()
train_time = t1 - t0
print(f"Training time: {train_time:.2f}s")
# -----------------------------------------------------------------------------
# Save the tokenizer to disk
base_dir = get_base_dir()
tokenizer_dir = os.path.join(base_dir, "tokenizer")
tokenizer.save(tokenizer_dir)
# -----------------------------------------------------------------------------
# Quick inline sanity check
test_text = """Hello world! This is a test.
Numbers: 123, 4567, 89
Contractions: I'm, you're, it's
Special chars: @#$%^&*()
Unicode: 你好世界 🌍"""
encoded = tokenizer.encode(test_text)
decoded = tokenizer.decode(encoded)
assert decoded == test_text
# -----------------------------------------------------------------------------
# One more thing: we wish to cache a mapping from token id to number of bytes of that token
# for efficient evaluation of bits per byte. Unlike the typical mean loss, this
# allows us to report a loss that is invariant to the vocab size of the tokenizer.
# The bits per byte on the validation set is then one of the primary metrics we care about.
vocab_size = tokenizer.get_vocab_size()
special_set = set(tokenizer.get_special_tokens())
token_strings = [tokenizer.decode([token_id]) for token_id in range(vocab_size)]
token_bytes = []
for token_id in range(vocab_size):
token_str = token_strings[token_id] # the Python string representation of this token
if token_str in special_set:
token_bytes.append(0) # special characters are not counted
else:
id_bytes = len(token_str.encode("utf-8")) # number of bytes that make up this token
token_bytes.append(id_bytes)
token_bytes = torch.tensor(token_bytes, dtype=torch.int32, device='cpu')
token_bytes_path = os.path.join(tokenizer_dir, "token_bytes.pt")
with open(token_bytes_path, "wb") as f:
torch.save(token_bytes, f)
print(f"Saved token_bytes to {token_bytes_path}")
# Log to report
from nanochat.report import get_report
token_bytes_nonzero = (token_bytes[token_bytes > 0]).to(dtype=torch.float32)
get_report().log(section="Tokenizer training", data=[
vars(args), # argparse command line arguments
{"train_time": train_time},
{"num_special_tokens": len(special_set)},
{
"token_bytes_min": int(token_bytes_nonzero.min().item()),
"token_bytes_max": int(token_bytes_nonzero.max().item()),
"token_bytes_mean": token_bytes_nonzero.mean().item(),
"token_bytes_std": token_bytes_nonzero.std().item(),
}
])