fix merge conflict manually, cursor fails

Kian Kyars 2025-11-26 19:00:38 -07:00
commit 24c04f0ca7
24 changed files with 305 additions and 197 deletions

View File

@@ -113,7 +113,7 @@ files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --
 This includes all py, rs, html, toml, sh files, excludes the `rustbpe/target` folder, and chooses the cxml output format. Everything is written to the `packaged.txt` file, which atm measures ~330KB (i.e. well below ~100K tokens for a state of the art LLM), and ~8K lines of code in 45 files.
-Alternatively, I recommend using [DeepWiki](https://deepwiki.com/) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off.
+Alternatively, I recommend using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off.
 ## Tests
@@ -184,6 +184,7 @@ python -m pytest tests/test_rustbpe.py -v -s
 │ ├── smoltalk.py # Conglomerate dataset of SmolTalk from HF
 │ └── spellingbee.py # Task teaching model to spell/count letters
 ├── tests
+│ └── test_engine.py
 │ └── test_rustbpe.py
 └── uv.lock
 ```
@@ -201,6 +202,7 @@ Current LLM policy: disclosure. When submitting a PR, please declare any parts t
 - Thank you to [HuggingFace](https://huggingface.co/) for fineweb and smoltalk.
 - Thank you [Lambda](https://lambda.ai/service/gpu-cloud) for the compute used in developing this project.
 - Thank you to chief LLM whisperer 🧙‍♂️ Alec Radford for advice/guidance.
+- Thank you to the repo czar Sofie [@svlandeg](https://github.com/svlandeg) for help with managing issues, pull requests and discussions of nanochat.
 ## Cite

View File

@@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 from nanochat.common import get_base_dir
-api_key = open("openroutertoken.txt").read().strip()
+api_key = open("openroutertoken.txt", "r", encoding="utf-8").read().strip()
 url = "https://openrouter.ai/api/v1/chat/completions"
 headers = {
@@ -45,7 +45,7 @@ headers = {
 "Content-Type": "application/json"
 }
-readme = open("README.md").read().strip()
+readme = open("README.md", "r", encoding="utf-8").read().strip()
 prompt = r"""
 I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want:

Binary file not shown (image diff; size 19 KiB before, 1.3 KiB after).

View File

@@ -22,13 +22,6 @@ fi
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-unzip -q eval_bundle.zip
-rm eval_bundle.zip
-mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 # wipe the report
 python -m nanochat.report reset

View File

@@ -20,37 +20,36 @@ def log0(message):
 if int(os.environ.get('RANK', 0)) == 0:
 logger.info(message)
-def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data):
+def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
-assert int(os.environ.get('RANK', 0)) == 0 # prevent footguns for now
+if rank == 0:
 os.makedirs(checkpoint_dir, exist_ok=True)
-# Save the model state (parameters)
+# Save the model state parameters
 model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
 torch.save(model_data, model_path)
-log0(f"Saved model file to: {model_path}")
+logger.info(f"Saved model parameters to: {model_path}")
-# Save the optimizer state (useful for SFT or any other fine-tuning)
+# Save the metadata dict as json
+meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
+with open(meta_path, "w", encoding="utf-8") as f:
+json.dump(meta_data, f, indent=2)
+logger.info(f"Saved metadata to: {meta_path}")
+# Note that optimizer state is sharded across ranks, so each rank must save its own.
 if optimizer_data is not None:
-optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}.pt")
+optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
 torch.save(optimizer_data, optimizer_path)
-log0(f"Saved optimizer file to: {optimizer_path}")
+logger.info(f"Saved optimizer state to: {optimizer_path}")
-# Save the metadata dict as json
-meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-with open(meta_path, "w") as f:
-json.dump(meta_data, f, indent=2)
-log0(f"Saved metadata file to: {meta_path}")
-def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False):
+def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, rank=0):
 # Load the model state
 model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
 model_data = torch.load(model_path, map_location=device)
 # Load the optimizer state if requested
 optimizer_data = None
 if load_optimizer:
-optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}.pt")
+optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
 optimizer_data = torch.load(optimizer_path, map_location=device)
 # Load the metadata
 meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-with open(meta_path, "r") as f:
+with open(meta_path, "r", encoding="utf-8") as f:
 meta_data = json.load(f)
 return model_data, optimizer_data, meta_data
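The new `rank` argument shards the optimizer state into per-rank files (`optim_{step:06d}_rank{rank}.pt`) while the model weights and json metadata remain a rank-0-only write. A hypothetical, single-process sketch of that layout using a toy model (placeholder directory and model; the real call site is in scripts/base_train.py further below):

```python
import os
import torch
from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint  # signatures per this diff

# Toy model/optimizer stand in for the real GPT and its optimizers.
rank = int(os.environ.get("RANK", 0))
model = torch.nn.Linear(4, 4)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)

# Every rank writes its own optimizer shard; only rank 0 writes model/meta files.
save_checkpoint("checkpoints/toy", 10, model.state_dict(), [opt.state_dict()], {"step": 10}, rank=rank)

# Loading mirrors the naming: each rank asks for its own optimizer shard.
model_data, optim_data, meta = load_checkpoint("checkpoints/toy", 10, device="cpu", load_optimizer=True, rank=rank)
```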
@@ -65,8 +64,14 @@ def build_model(checkpoint_dir, step, device, phase):
 """
 assert phase in ["train", "eval"], f"Invalid phase: {phase}"
 model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False)
+if device.type in {"cpu", "mps"}:
+# Convert bfloat16 tensors to float for CPU inference
+model_data = {
+k: v.float() if v.dtype == torch.bfloat16 else v
+for k, v in model_data.items()
+}
 # Hack: fix torch compile issue, which prepends all keys with _orig_mod.
-model_data = {k.lstrip("_orig_mod."): v for k, v in model_data.items()}
+model_data = {k.removeprefix("_orig_mod."): v for k, v in model_data.items()}
 model_config_kwargs = meta_data["model_config"]
 log0(f"Building model with config: {model_config_kwargs}")
 model_config = GPTConfig(**model_config_kwargs)

View File

@@ -5,10 +5,10 @@ Common utilities for nanochat.
 import os
 import re
 import logging
-import fcntl
 import urllib.request
 import torch
 import torch.distributed as dist
+from filelock import FileLock
 class ColoredFormatter(logging.Formatter):
 """Custom formatter that adds colors to log messages."""
@@ -58,7 +58,7 @@ def get_base_dir():
 os.makedirs(nanochat_dir, exist_ok=True)
 return nanochat_dir
-def download_file_with_lock(url, filename):
+def download_file_with_lock(url, filename, postprocess_fn=None):
 """
 Downloads a file from a URL to a local path in the base directory.
 Uses a lock file to prevent concurrent downloads among multiple ranks.
@@ -70,29 +70,27 @@ def download_file_with_lock(url, filename):
 if os.path.exists(file_path):
 return file_path
-with open(lock_path, 'w') as lock_file:
+with FileLock(lock_path):
 # Only a single rank can acquire this lock
 # All other ranks block until it is released
-fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
-# Recheck after acquiring lock
 if os.path.exists(file_path):
 return file_path
+# Download the content as bytes
 print(f"Downloading {url}...")
 with urllib.request.urlopen(url) as response:
-content = response.read().decode('utf-8')
+content = response.read() # bytes
-with open(file_path, 'w') as f:
+# Write to local file
+with open(file_path, 'wb') as f:
 f.write(content)
 print(f"Downloaded to {file_path}")
-# Clean up the lock file after the lock is released
+# Run the postprocess function if provided
-try:
+if postprocess_fn is not None:
-os.remove(lock_path)
+postprocess_fn(file_path)
-except OSError:
-pass # Ignore if already removed by another process
 return file_path
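This hunk swaps the hand-rolled `fcntl` lock for the `filelock` package and adds an optional `postprocess_fn` hook that runs once, still under the lock, after the download. A minimal standalone sketch of the same download-once pattern (function name, URL and paths are placeholders, not repo values):

```python
import os
import urllib.request
from filelock import FileLock  # the dependency this hunk introduces

def fetch_once(url: str, file_path: str, postprocess_fn=None) -> str:
    """Many processes may call this; only the first to take the lock downloads."""
    if os.path.exists(file_path):
        return file_path
    with FileLock(file_path + ".lock"):
        if os.path.exists(file_path):  # another process may have finished while we waited
            return file_path
        with urllib.request.urlopen(url) as response:
            content = response.read()  # bytes, written verbatim
        with open(file_path, "wb") as f:
            f.write(content)
        if postprocess_fn is not None:  # e.g. unzip a bundle next to the download
            postprocess_fn(file_path)
        return file_path
```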
@@ -104,15 +102,15 @@ def print0(s="",**kwargs):
 def print_banner():
 # Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/
 banner = """
 """
 print0(banner)
 def is_ddp():
@@ -150,6 +148,8 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
 assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'"
 # Reproducibility
+# Note that we set the global seeds here, but most of the code uses explicit rng objects.
+# The only place where global rng might be used is nn.Module initialization of the model weights.
 torch.manual_seed(42)
 if device_type == "cuda":
 torch.cuda.manual_seed(42)
@@ -164,7 +164,7 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps
 ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
 if ddp and device_type == "cuda":
 device = torch.device("cuda", ddp_local_rank)
 torch.cuda.set_device(device) # make "cuda" default to this device
 dist.init_process_group(backend="nccl", device_id=device)
 dist.barrier()
 else:

View File

@ -1,49 +1,87 @@
@@ -1,49 +1,87 @@
 from collections import deque
 import torch
+import pyarrow.parquet as pq
 from nanochat.common import get_dist_info
-from nanochat.dataset import parquets_iter_batched
+from nanochat.dataset import list_parquet_files
 from nanochat.tokenizer import get_tokenizer
-def tokenizing_distributed_data_loader(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda"):
+def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
-"""Stream pretraining text from parquet files, tokenize, yield training batches."""
+"""
+Stream pretraining text from parquet files, tokenize, yield training batches.
+This implementation became a bit more complex because we wish to support approximate resume training.
+Instead of turning this into a Class, we opt to return the state_dict with every batch,
+and then the caller can pass in a state_dict to resume training from a desired point.
+Note that this resumption is atm only *approximate* for simplicity.
+We won't repeat the same documents but we might skip a few.
+The state_dict that is returned can be later passed into this function via `resume_state_dict` to approximately resume.
+Perfect state resumption is possible but would be a lot more bloated, probably not worth it atm.
+"""
 assert split in ["train", "val"], "split must be 'train' or 'val'"
+# infinite iterator over document batches (list of text strings)
 ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
+def document_batches():
+parquet_paths = list_parquet_files()
+parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
+resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
+resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
+pq_idx = resume_pq_idx # we kick off parquet files at the resume index (or by default just 0)
+while True: # iterate infinitely (multi-epoch)
+while pq_idx < len(parquet_paths): # iterate over all parquet files
+filepath = parquet_paths[pq_idx]
+pf = pq.ParquetFile(filepath)
+# Start from resume point if resuming on same file, otherwise from DDP rank
+# I know this state resumption is a little bit tricky and a little bit hacky... sigh.
+if resume_rg_idx is not None:
+base_idx = resume_rg_idx // ddp_world_size # in units of ddp_world_size
+base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming
+rg_idx = base_idx * ddp_world_size + ddp_rank
+resume_rg_idx = None # set to None as we only want to do this a single time
+else:
+rg_idx = ddp_rank
+while rg_idx < pf.num_row_groups:
+rg = pf.read_row_group(rg_idx)
+batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows
+# the tokenizer encode might want to go in even smaller batches, e.g. 128 rows
+for i in range(0, len(batch), tokenizer_batch_size):
+yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx)
+rg_idx += ddp_world_size # advance to the next row group (in DDP)
+pq_idx += 1 # advance to the next parquet file
+batches = document_batches()
+# Now emit batches of tokens.
 needed_tokens = B * T + 1 # +1 is because we also need the target at the last token
 # get the tokenizer and the bos token
 tokenizer = get_tokenizer()
 bos_token = tokenizer.get_bos_token_id()
 # scratch buffer holds the tokens for one iteration
 token_buffer = deque() # we stream tokens on the right and pop from the left
-# infinite iterator over document batches
-def document_batches():
-while True:
-# batch will iterate in group size of the parquet files, usually e.g. 1024 rows
-for batch in parquets_iter_batched(split=split, start=ddp_rank, step=ddp_world_size):
-# for the tokenizer we might want to go in usually smaller batches, e.g. 128 rows
-for i in range(0, len(batch), tokenizer_batch_size):
-yield batch[i:i+tokenizer_batch_size]
-batches = document_batches()
-batch_index = 0
 while True:
 # Accumulate enough tokens for one iteration before yielding.
 while len(token_buffer) < needed_tokens:
-doc_batch = next(batches)
+doc_batch, (pq_idx, rg_idx) = next(batches)
 token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads)
 for tokens in token_lists:
 token_buffer.extend(tokens)
-batch_index += 1
 # Move tokens from the deque into the scratch buffer
 tokens = [token_buffer.popleft() for _ in range(needed_tokens)]
-# CUDA supports memory pinning for faster transfers between CPU and GPU:
+# CUDA supports memory pinning for asynchronous transfers between CPU and GPU
-scratch = torch.tensor(tokens, dtype=torch.int64, pin_memory=(device == "cuda"))
+use_cuda_optimizations = device == "cuda"
+scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda_optimizations) # in PyTorch, long=int64
 # Create the inputs/targets as 1D tensors
-inputs_cpu = scratch[:-1].to(dtype=torch.int32)
+inputs_cpu = scratch[:-1]
 targets_cpu = scratch[1:]
 # Reshape to 2D and move to GPU async
-inputs = inputs_cpu.view(B, T).to(device=device, dtype=torch.int32, non_blocking=True)
+inputs = inputs_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
-targets = targets_cpu.view(B, T).to(device=device, dtype=torch.int64, non_blocking=True)
+targets = targets_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
+state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx} # we need this in case we wish to approximately resume training
-yield inputs, targets
+yield inputs, targets, state_dict
+def tokenizing_distributed_data_loader(*args, **kwargs):
+# helper function that only emits the inputs/targets and not the state_dict
+for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs):
+yield inputs, targets
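The loader now reports its position as a small `{"pq_idx", "rg_idx"}` dict with every batch, which is what makes the approximate resume described in the docstring possible. A usage sketch of that contract (assumes the tokenizer and pretraining parquet shards are already on disk; the B/T/device values are arbitrary, not repo defaults):

```python
from nanochat.dataloader import tokenizing_distributed_data_loader_with_state

loader = tokenizing_distributed_data_loader_with_state(B=4, T=256, split="train", device="cpu")
x, y, state = next(loader)   # state == {"pq_idx": ..., "rg_idx": ...}

# Persist `state` alongside the checkpoint (base_train.py stores it in the meta_data json),
# then in a later run pass it back to continue approximately where this run stopped:
resumed_loader = tokenizing_distributed_data_loader_with_state(
    B=4, T=256, split="train", device="cpu", resume_state_dict=state)
x2, y2, state2 = next(resumed_loader)
```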

View File

@@ -17,8 +17,9 @@ import signal
 import warnings
 from contextlib import contextmanager
 from collections import deque
-from nanochat.common import compute_init
+from nanochat.common import compute_init, autodetect_device_type
 from nanochat.checkpoint_manager import load_model
+from contextlib import nullcontext
 # -----------------------------------------------------------------------------
 # Calculator tool helpers
@@ -37,7 +38,7 @@ def eval_with_timeout(formula, max_time=3):
 with timeout(max_time, formula):
 with warnings.catch_warnings():
 warnings.simplefilter("ignore", SyntaxWarning)
-return eval(formula)
+return eval(formula, {"__builtins__": {}}, {})
 except Exception as e:
 signal.alarm(0)
 # print(f"Warning: Failed to eval {formula}, exception: {e}") # it's ok ignore wrong calculator usage
@@ -107,9 +108,10 @@ class KVCache:
 assert self.kv_cache is None, "Cannot prefill a non-empty KV cache"
 assert other.kv_cache is not None, "Cannot prefill with a None KV cache"
 for ix, (dim1, dim2) in enumerate(zip(self.kv_shape, other.kv_shape)):
+# ix 0: num_layers, 1: k/v, 2: batch_size, 3: num_heads, 4: seq_len, 5: head_dim
 if ix in [0, 1, 3, 5]:
-# num_layers, batch_size, num_heads, head_dim must match
+# num_layers, k/v, num_heads, head_dim must match
-assert dim1 == dim2, f"Batch dim mismatch: {dim1} != {dim2}"
+assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
 elif ix == 2:
 # batch_size can be expanded
 assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"
@@ -327,6 +329,9 @@ if __name__ == "__main__":
 import time
 # init compute
 ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
+device_type = autodetect_device_type()
+autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
 # load the model and tokenizer
 model, tokenizer, meta = load_model("base", device, phase="eval")
 bos_token_id = tokenizer.get_bos_token_id()
@@ -339,10 +344,11 @@
 torch.cuda.synchronize()
 t0 = time.time()
 stream = model.generate(prompt_tokens, **kwargs)
-for token in stream:
-generated_tokens.append(token)
-chunk = tokenizer.decode([token])
-print(chunk, end="", flush=True)
+with autocast_ctx:
+for token in stream:
+generated_tokens.append(token)
+chunk = tokenizer.decode([token])
+print(chunk, end="", flush=True)
 print()
 torch.cuda.synchronize()
 t1 = time.time()
@@ -354,11 +360,12 @@
 stream = engine.generate(prompt_tokens, num_samples=1, **kwargs) # note: runs in fp32
 torch.cuda.synchronize()
 t0 = time.time()
-for token_column, token_masks in stream:
-token = token_column[0] # only print out the first row
-generated_tokens.append(token)
-chunk = tokenizer.decode([token])
-print(chunk, end="", flush=True)
+with autocast_ctx:
+for token_column, token_masks in stream:
+token = token_column[0] # only print out the first row
+generated_tokens.append(token)
+chunk = tokenizer.decode([token])
+print(chunk, end="", flush=True)
 print()
 torch.cuda.synchronize()
 t1 = time.time()

View File

@@ -8,7 +8,7 @@ Notable features:
 - norm after token embedding
 - no learnable params in rmsnorm
 - no bias in linear layers
-- Multi-Query Attention (MQA) support for more efficient inference
+- Group-Query Attention (GQA) support for more efficient inference
 """
 import math
@@ -29,7 +29,7 @@ class GPTConfig:
 vocab_size: int = 50304
 n_layer: int = 12
 n_head: int = 6 # number of query heads
-n_kv_head: int = 6 # number of key/value heads (MQA)
+n_kv_head: int = 6 # number of key/value heads (GQA)
 n_embd: int = 768
@@ -244,7 +244,7 @@ class GPT(nn.Module):
 def forward(self, idx, targets=None, kv_cache=None, loss_reduction='mean'):
 B, T = idx.size()
-# Grab the rotary embeddings for the current sequence length (they are of shape (1, seq_len, 1, head_dim))
+# Grab the rotary embeddings for the current sequence length (they are of shape (1, seq_len, 1, head_dim/2))
 assert T <= self.cos.size(1), f"Sequence length grew beyond the rotary embeddings cache: {T} > {self.cos.size(1)}"
 assert idx.device == self.cos.device, f"Rotary embeddings and idx are on different devices: {idx.device} != {self.cos.device}"
 assert self.cos.dtype == torch.bfloat16, "Rotary embeddings must be in bfloat16"
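With `n_kv_head` allowed to differ from `n_head`, each key/value head is shared by a group of query heads. A minimal sketch of the shape bookkeeping that implies (toy sizes and plain SDPA; this is not the repo's attention module):

```python
import torch
import torch.nn.functional as F

B, T, n_head, n_kv_head, head_dim = 2, 16, 6, 2, 64
q = torch.randn(B, n_head, T, head_dim)
k = torch.randn(B, n_kv_head, T, head_dim)
v = torch.randn(B, n_kv_head, T, head_dim)

group = n_head // n_kv_head            # 3 query heads share each key/value head
k = k.repeat_interleave(group, dim=1)  # (B, n_head, T, head_dim)
v = v.repeat_interleave(group, dim=1)
y = F.scaled_dot_product_attention(q, k, v, is_causal=True)
print(y.shape)                         # torch.Size([2, 6, 16, 64])
```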

View File

@@ -9,9 +9,9 @@ import torch.distributed as dist
 def evaluate_bpb(model, batches, steps, token_bytes):
 """
 Instead of the naive 'mean loss', this function returns the bits per byte (bpb),
-which is a tokenization vocab size-indepedent metric, meaning you are still comparing
+which is a tokenization vocab size-independent metric, meaning you are still comparing
 apples:apples if you change the vocab size. The way this works is that instead of just
-calculating the average loss as usual, you calculate the sum loss, and indepependently
+calculating the average loss as usual, you calculate the sum loss, and independently
 also the sum bytes (of all the target tokens), and divide. This normalizes the loss by
 the number of bytes that the target tokens represent.
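For reference, the normalization this docstring describes boils down to converting the summed cross-entropy from nats to bits and dividing by the summed UTF-8 byte count of the target tokens. A tiny sketch with made-up numbers (not the function body itself):

```python
import math

def bits_per_byte(sum_loss_nats: float, sum_target_bytes: int) -> float:
    # nats -> bits via log(2), then normalize by how many bytes the targets represent
    return sum_loss_nats / (math.log(2) * sum_target_bytes)

# e.g. 1000 tokens at a mean loss of 3.0 nats, averaging 4.8 bytes per token:
print(bits_per_byte(sum_loss_nats=3.0 * 1000, sum_target_bytes=int(4.8 * 1000)))  # ~0.90 bpb
```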

View File

@@ -170,7 +170,7 @@ Generated: {timestamp}
 # count dependencies via uv.lock
 uv_lock_lines = 0
 if os.path.exists('uv.lock'):
-with open('uv.lock', 'r') as f:
+with open('uv.lock', 'r', encoding='utf-8') as f:
 uv_lock_lines = len(f.readlines())
 header += f"""
@@ -242,7 +242,7 @@ class Report:
 file_name = f"{slug}.md"
 file_path = os.path.join(self.report_dir, file_name)
 mode = "a" if append else "w"
-with open(file_path, mode) as f:
+with open(file_path, mode, encoding="utf-8") as f:
 if append and os.path.exists(file_path) and os.path.getsize(file_path) > 0:
 f.write("\n")
 f.write(f"## {section}\n")
@@ -275,11 +275,11 @@ class Report:
 final_metrics = {} # the most important final metrics we'll add as table at the end
 start_time = None
 end_time = None
-with open(report_file, "w") as out_file:
+with open(report_file, "w", encoding="utf-8") as out_file:
 # write the header first
 header_file = os.path.join(report_dir, "header.md")
 if os.path.exists(header_file):
-with open(header_file, "r") as f:
+with open(header_file, "r", encoding="utf-8") as f:
 header_content = f.read()
 out_file.write(header_content)
 start_time = extract_timestamp(header_content, "Run started:")
@@ -296,7 +296,7 @@ class Report:
 if not os.path.exists(section_file):
 print(f"Warning: {section_file} does not exist, skipping")
 continue
-with open(section_file, "r") as in_file:
+with open(section_file, "r", encoding="utf-8") as in_file:
 section = in_file.read()
 # Extract timestamp from this section (the last section's timestamp will "stick" as end_time)
 if "rl" not in file_name:
@@ -376,7 +376,7 @@ class Report:
 header_file = os.path.join(self.report_dir, "header.md")
 header = generate_header()
 start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-with open(header_file, "w") as f:
+with open(header_file, "w", encoding="utf-8") as f:
 f.write(header)
 f.write(f"Run started: {start_time}\n\n---\n\n")
 print(f"Reset report and wrote header to {header_file}")

View File

@@ -8,7 +8,6 @@ dependencies = [
 "datasets>=4.0.0",
 "fastapi>=0.117.1",
 "files-to-prompt>=0.6",
-"numpy==1.26.4",
 "psutil>=7.1.0",
 "regex>=2025.9.1",
 "setuptools>=80.9.0",

View File

@@ -19,13 +19,6 @@ python -m nanochat.report reset
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-unzip -q eval_bundle.zip
-rm eval_bundle.zip
-mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 # train tokenizer on ~4B characters and kick off download of the rest for pretraining
@@ -77,18 +70,22 @@ python -m scripts.tok_eval
 # which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd
 # start to overfit hard.
 # 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script.
-torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=32 --device_batch_size=8
-torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
-torchrun --standalone --nproc_per_node=8 -m scripts.base_eval
+# Number of processes/GPUs to use
+NPROC_PER_NODE=8
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
 # midtrain
 # NOTE: ensure that we use the same device_batch_size here as the base training script.
-torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid
 # sft
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
 # generate final report
 python -m nanochat.report generate

View File

@@ -1,49 +1,76 @@
 """
-Evlauate the CORE metric for a given model.
+Evaluate the CORE metric for a given model.
 Run on a single GPU:
-python base_eval.py
+python -m scripts.base_eval
 Run with torchrun on e.g. 8 GPUs:
-torchrun --nproc_per_node=8 base_eval.py
+torchrun --nproc_per_node=8 -m scripts.base_eval
 The script will print the CORE metric to the console.
 """
 import os
-import sys
+import csv
 import time
 import json
-import random
 import yaml
+import shutil
+import random
+import zipfile
+import tempfile
 from contextlib import nullcontext
-import pandas as pd
 import torch
-from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
+from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
 from nanochat.tokenizer import HuggingFaceTokenizer
 from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
 # -----------------------------------------------------------------------------
-# nanoChat specific function dealing with I/O etc.
+# nanochat specific function dealing with I/O etc.
+# ~162MB of data needed to evaluate the CORE metric
+EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
+def place_eval_bundle(file_path):
+# here file_path is the path to the eval_bundle.zip file
+# we need to unzip it and place it in the base directory
+base_dir = get_base_dir()
+eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+with tempfile.TemporaryDirectory() as tmpdir:
+with zipfile.ZipFile(file_path, 'r') as zip_ref:
+zip_ref.extractall(tmpdir)
+extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")
+shutil.move(extracted_bundle_dir, eval_bundle_dir)
+print0(f"Placed eval_bundle directory at {eval_bundle_dir}")
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
 """
 Evaluate a base model on the CORE benchmark.
 - max_per_task: crop the data to this many examples per task for testing (-1 = disable)
-TODO: clean up this function, delete the need for all the files, for pandas dependency, etc.
 """
 # Load config and task metadata
 base_dir = get_base_dir()
 eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+# Download the eval bundle to disk (and unzip if needed)
+if not os.path.exists(eval_bundle_dir):
+download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
 config_path = os.path.join(eval_bundle_dir, "core.yaml")
 data_base_path = os.path.join(eval_bundle_dir, "eval_data")
 eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
-with open(config_path, 'r') as f:
+with open(config_path, 'r', encoding='utf-8') as f:
 config = yaml.safe_load(f)
 tasks = config['icl_tasks']
-eval_metadata = pd.read_csv(eval_meta_data)
+# Load random baseline values from eval metadata
+random_baselines = {}
+with open(eval_meta_data, 'r', encoding='utf-8') as f:
+reader = csv.DictReader(f)
+for row in reader:
+task_name = row['Eval Task']
+random_baseline = row['Random baseline']
+random_baselines[task_name] = float(random_baseline)
 # Evaluate each task
 results = {}
@@ -61,7 +88,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
 # Load data for this task
 data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
-with open(data_path, 'r') as f:
+with open(data_path, 'r', encoding='utf-8') as f:
 data = [json.loads(line.strip()) for line in f]
 # shuffle the data because in many cases it appears ordered but we want
@@ -75,8 +102,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
 accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
 results[label] = accuracy
-row = eval_metadata[eval_metadata["Eval Task"] == label]
-random_baseline = row["Random baseline"].values[0]
+random_baseline = random_baselines[label]
 centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
 centered_results[label] = centered_result
 end_time = time.time()
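The centering above rescales raw accuracy so that the task's random baseline (stored in percent in eval_meta_data.csv) maps to 0 and perfect accuracy maps to 1. A worked example with assumed numbers:

```python
def center(accuracy: float, random_baseline_pct: float) -> float:
    return (accuracy - 0.01 * random_baseline_pct) / (1.0 - 0.01 * random_baseline_pct)

print(center(0.50, 25.0))  # 4-way multiple choice at 50% accuracy -> 0.333...
print(center(0.25, 25.0))  # exactly at chance -> 0.0
```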
@@ -158,7 +184,7 @@ def main():
 results = out["results"]
 centered_results = out["centered_results"]
 core_metric = out["core_metric"]
-with open(output_csv_path, 'w') as f:
+with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
 f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
 for label in results:
 f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
@@ -167,7 +193,7 @@ def main():
 print0("="*80)
 print0(f"Model: {model_name}")
 print0("="*80)
-with open(output_csv_path, 'r') as f:
+with open(output_csv_path, 'r', encoding='utf-8') as f:
 print0(f.read())
 # Log to report

View File

@@ -20,10 +20,10 @@ import wandb
 import torch
 from nanochat.gpt import GPT, GPTConfig
-from nanochat.dataloader import tokenizing_distributed_data_loader
+from nanochat.dataloader import tokenizing_distributed_data_loader, tokenizing_distributed_data_loader_with_state
 from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type
 from nanochat.tokenizer import get_tokenizer, get_token_bytes
-from nanochat.checkpoint_manager import save_checkpoint
+from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint
 from nanochat.loss_eval import evaluate_bpb
 from nanochat.engine import Engine
 from scripts.base_eval import evaluate_model
@@ -52,12 +52,14 @@ grad_clip = 1.0 # gradient clipping value (0.0 = disabled)
 warmup_ratio = 0.0 # ratio of iterations for LR warmup
 warmdown_ratio = 0.2 # ratio of iterations for LR warmdown
 final_lr_frac = 0.0 # final LR is this fraction of the initial LR
+resume_from_step = -1 # resume training from this step of the optimization (-1 = disable)
 # Evaluation
 eval_every = 250 # every how many steps to evaluate the model for val bpb
 eval_tokens = 20*524288 # number of tokens to evaluate val loss on
 core_metric_every = 2000 # every how many steps to evaluate the core metric (-1 = disable)
 core_metric_max_per_task = 500 # examples per task in estimating the core metric
 sample_every = 2000 # every how many steps to sample from the model
+save_every = -1 # every how many steps to save model checkpoints (-1 = disable, and save only at the end of the run)
 # Output
 model_tag = "" # optionally override the model tag for the output checkpoint directory name
 # now allow CLI to override the settings via the configurator lol
@@ -103,16 +105,31 @@ grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd
 print0(f"Tokens / micro-batch / rank: {device_batch_size} x {max_seq_len} = {tokens_per_fwdbwd:,}")
 print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
 print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
 # -----------------------------------------------------------------------------
 # Initialize the Model
+# Create a new model with random weights
 model_config_kwargs = dict(sequence_len=max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim)
 with torch.device("meta"):
 model_config = GPTConfig(**model_config_kwargs)
 model = GPT(model_config)
 model.to_empty(device=device)
 model.init_weights()
-orig_model = model # original, uncompiled model, for saving raw model state_dict
-model = torch.compile(model, dynamic=False) # TODO: dynamic True/False think through
+# If we are resuming, overwrite the model parameters with those of the checkpoint
+base_dir = get_base_dir()
+output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12
+checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
+resuming = resume_from_step != -1
+if resuming:
+print0(f"Resuming optimization from step {resume_from_step}")
+model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, resume_from_step, device, load_optimizer=True, rank=ddp_rank)
+model.load_state_dict(model_data, strict=True, assign=True)
+del model_data # free up this memory after the copy
+orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape)
+model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe
 num_params = sum(p.numel() for p in model.parameters())
 print0(f"Number of parameters: {num_params:,}")
 num_flops_per_token = model.estimate_flops()
@ -143,12 +160,18 @@ print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")
optimizers = model.setup_optimizers(unembedding_lr=unembedding_lr, embedding_lr=embedding_lr, matrix_lr=matrix_lr, weight_decay=weight_decay) optimizers = model.setup_optimizers(unembedding_lr=unembedding_lr, embedding_lr=embedding_lr, matrix_lr=matrix_lr, weight_decay=weight_decay)
adamw_optimizer, muon_optimizer = optimizers adamw_optimizer, muon_optimizer = optimizers
if resuming:
for opt, dat in zip(optimizers, optimizer_data):
opt.load_state_dict(dat)
del optimizer_data # free up the memory
# -----------------------------------------------------------------------------
# Initialize the DataLoaders for train/val # Initialize the DataLoaders for train/val
base_dir = get_base_dir()
tokens_dir = os.path.join(base_dir, "tokenized_data") tokens_dir = os.path.join(base_dir, "tokenized_data")
train_loader = tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="train", device=device) dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"]
train_loader = tokenizing_distributed_data_loader_with_state(device_batch_size, max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="val", device=device) build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="val", device=device)
x, y = next(train_loader) # kick off load of the very first batch of data x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data
# ----------------------------------------------------------------------------- # -----------------------------------------------------------------------------
# Set up hyperparameter schedulers # Set up hyperparameter schedulers
@@ -171,15 +194,25 @@ def get_muon_momentum(it):
 momentum = (1 - frac) * 0.85 + frac * 0.95
 return momentum
+# -----------------------------------------------------------------------------
+# Loop state (variables updated by the training loop)
+if not resuming:
+step = 0
+min_val_bpb = float("inf")
+smooth_train_loss = 0 # EMA of training loss
+total_training_time = 0 # total wall-clock time of training
+else:
+step = meta_data["step"]
+loop_state = meta_data["loop_state"]
+min_val_bpb = loop_state["min_val_bpb"]
+smooth_train_loss = loop_state["smooth_train_loss"]
+total_training_time = loop_state["total_training_time"]
 # -----------------------------------------------------------------------------
 # Training loop
-min_val_bpb = float("inf")
-smooth_train_loss = 0 # EMA of training loss
-ema_beta = 0.9 # EMA decay factor
-total_training_time = 0 # total wall-clock time of training
-# note that we run +1 steps only so that we can eval and save at the end
-for step in range(num_iterations + 1):
-last_step = step == num_iterations
+while True:
+last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
 flops_so_far = num_flops_per_token * total_batch_size * step
 # once in a while: evaluate the val bpb (all ranks participate)
@@ -237,25 +270,31 @@
 print0(tokenizer.decode(sample[0]))
 model.train()
-# save checkpoint at the end of the run (only on master process)
-if master_process and last_step:
-output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12
-checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
+# save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step
+if last_step or (step > 0 and step != resume_from_step and save_every > 0 and step % save_every == 0):
 save_checkpoint(
 checkpoint_dir,
 step,
-orig_model.state_dict(),
-[opt.state_dict() for opt in optimizers], # TODO: make sure saving across ranks is done correctly
-{
+orig_model.state_dict(), # model parameters
+[opt.state_dict() for opt in optimizers], # optimizer states
+{ # metadata saved as json
 "step": step,
 "val_bpb": val_bpb, # loss at last step
 "model_config": model_config_kwargs,
 "user_config": user_config, # inputs to the training script
 "device_batch_size": device_batch_size,
 "max_seq_len": max_seq_len,
-}
+"dataloader_state_dict": dataloader_state_dict,
+"loop_state": { # all loop state (other than step) so that we can resume training
+"min_val_bpb": min_val_bpb,
+"smooth_train_loss": smooth_train_loss,
+"total_training_time": total_training_time,
+},
+},
+rank=ddp_rank,
 )
+# termination conditions (TODO: possibly also add loss explosions etc.)
 if last_step:
 break
@@ -270,10 +309,12 @@
 train_loss = loss.detach() # for logging
 loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
 loss.backward()
-x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
+x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
-# gradient clipping (TODO possibly experiment with)
-if grad_clip > 0.0:
-torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip)
+# gradient clipping
+grad_clip_enabled = grad_clip > 0.0
+if grad_clip_enabled:
+grad_norm_tensor = torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip)
+grad_norm = grad_norm_tensor.item() # GPU tensor -> CPU float (note: cpu-gpu sync point)
 # step the optimizers
 lrm = get_lr_multiplier(step)
 for opt in optimizers:
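`clip_grad_norm_` returns the total gradient norm computed before clipping, which is what the new `grad_norm` logging surfaces; the `.item()` call is the CPU-GPU sync point the comment warns about. A tiny standalone illustration of that behavior:

```python
import torch

w = torch.nn.Parameter(torch.randn(10))
(100 * w.pow(2).sum()).backward()                      # manufacture a large gradient
norm = torch.nn.utils.clip_grad_norm_(w, max_norm=1.0)  # returns the pre-clip total norm
print(f"pre-clip norm: {norm.item():.1f}, post-clip norm: {w.grad.norm().item():.3f}")
```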
@@ -291,18 +332,20 @@
 # -------------------------------------------------------------------------
 # logging
+ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging
 smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
 debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
 pct_done = 100 * step / num_iterations
-tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+tok_per_sec = int(total_batch_size / dt)
 flops_per_sec = num_flops_per_token * total_batch_size / dt
 promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
 mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
 if step > 10:
 total_training_time += dt # only count the time after the first 10 steps
-print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m")
+print_grad_norm = f" grad norm: {grad_norm:.4f} |" if grad_clip_enabled else ""
+print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} |{print_grad_norm} lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m")
 if step % 100 == 0:
-wandb_run.log({
+log_data = {
 "step": step,
 "total_training_flops": flops_so_far,
 "total_training_time": total_training_time,
@@ -311,7 +354,13 @@
 "train/dt": dt,
 "train/tok_per_sec": tok_per_sec,
 "train/mfu": mfu,
-})
+}
+if grad_clip_enabled:
+log_data["train/grad_norm"] = grad_norm
+wandb_run.log(log_data)
+# state update
+step += 1
 # print a few more stats
 print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")

View File

@@ -1,6 +1,6 @@
 """
 Evaluate the Chat model.
-All the generic code lives here, and all the evlauation-specific
+All the generic code lives here, and all the evaluation-specific
 code lives in nanochat directory and is imported from here.
 Example runs:

View File

@@ -192,7 +192,7 @@ for step in range(num_iterations):
 })
 model.train()
-# evlauate accuracy of the multiple choice tasks (which are quick to run)
+# evaluate accuracy of the multiple choice tasks (which are quick to run)
 if last_step or (step > 0 and step % eval_metrics_every == 0):
 model.eval()
 metrics = {}

View File

@@ -243,7 +243,7 @@ app.add_middleware(
 async def root():
 """Serve the chat UI."""
 ui_html_path = os.path.join("nanochat", "ui.html")
-with open(ui_html_path, "r") as f:
+with open(ui_html_path, "r", encoding="utf-8") as f:
 html_content = f.read()
 # Replace the API_URL to use the same origin
 html_content = html_content.replace(

View File

@@ -268,7 +268,7 @@ while True:
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * progress
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
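The debiasing line above is the usual zero-initialization correction for an exponential moving average (the same trick Adam uses for its moment estimates), and the tok/sec fix switches the numerator to the total tokens per optimizer step. A tiny standalone illustration of the EMA bookkeeping, with made-up loss values:

# Standalone illustration of the smoothed/debiased loss reporting.
ema_beta = 0.9
losses = [4.0, 3.5, 3.2, 3.0]   # hypothetical per-step training losses
smooth = 0.0
for step, loss in enumerate(losses):
    smooth = ema_beta * smooth + (1 - ema_beta) * loss
    debiased = smooth / (1 - ema_beta ** (step + 1))  # undo the bias from smooth=0 at init
    print(f"step {step}: raw {loss:.2f} | ema {smooth:.3f} | debiased {debiased:.3f}")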

View File

@@ -73,15 +73,6 @@ python -m scripts.tok_eval
 # -----------------------------------------------------------------------------
 # Base model (pretraining)
-# Download the eval_bundle from s3 to evaluate CORE metric during training (~162MB)
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 # The d20 model is 561M parameters.
 # Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens.
 # Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars.
@@ -91,12 +82,15 @@ fi
 echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID
+# Number of processes/GPUs to use
+NPROC_PER_NODE=8
 # pretrain the d20 model
-torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=20 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --run=$WANDB_RUN
 # evaluate the model on a larger chunk of train/val data and draw some samples
-torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss
 # evaluate the model on CORE tasks
-torchrun --standalone --nproc_per_node=8 -m scripts.base_eval
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
 # -----------------------------------------------------------------------------
 # Midtraining (teach the model conversation special tokens, tool use, multiple choice)
@@ -106,15 +100,15 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_eval
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 # run midtraining and eval the model
-torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid
 # -----------------------------------------------------------------------------
 # Supervised Finetuning (domain adaptation to each sequence all by itself per row)
 # train sft and re-eval right away (should see a small bump)
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
 # chat with the model over CLI! Leave out the -p to chat interactively
 # python -m scripts.chat_cli -p "Why is the sky blue?"
@@ -127,9 +121,9 @@ torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
 # (optional)
 # run reinforcement learning
-# torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN
+# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_rl -- --run=$WANDB_RUN
 # eval the RL model only on GSM8K
-# torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K
+# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i rl -a GSM8K
 # -----------------------------------------------------------------------------
 # Generate the full report by putting together all the sections
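The Chinchilla sizing quoted in the comments of this script is easy to sanity-check; the 4.8 chars/token figure is the repo's own assumption, the rest is arithmetic:

# Quick check of the sizing arithmetic quoted in speedrun.sh.
n_params = 561e6                    # d20 model size
tokens_needed = 20 * n_params       # Chinchilla-optimal: ~20 tokens per parameter
chars_needed = 4.8 * tokens_needed  # assuming ~4.8 characters per token
print(f"{tokens_needed / 1e9:.1f}B tokens, ~{chars_needed / 1e9:.0f}B chars")  # 11.2B tokens, ~54B chars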

View File

@@ -32,7 +32,7 @@ class CustomJSON(Task):
             print("-" * 80)
         else:
-            with open(filepath, 'r') as f:
+            with open(filepath, 'r', encoding='utf-8') as f:
                 for line in f:
                     line = line.strip()
                     if not line: # skip empty lines

View File

@@ -119,7 +119,7 @@ class SpellingBee(Task):
         self.split = split
         filename = WORD_LIST_URL.split("/")[-1]
         word_list_path = download_file_with_lock(WORD_LIST_URL, filename)
-        with open(word_list_path) as f:
+        with open(word_list_path, 'r', encoding='utf-8') as f:
             words = [line.strip() for line in f]
         self.words = words
@@ -238,7 +238,7 @@ class SimpleSpelling(Task):
         self.split = split
         filename = WORD_LIST_URL.split("/")[-1]
         word_list_path = download_file_with_lock(WORD_LIST_URL, filename)
-        with open(word_list_path) as f:
+        with open(word_list_path, 'r', encoding='utf-8') as f:
             words = [line.strip() for line in f]
         rng = random.Random(42)
         rng.shuffle(words) # use a different word order than the SpellingBee task

View File

@@ -455,13 +455,13 @@ def enwik8_path():
 @pytest.fixture(scope="module")
 def enwik8_small(enwik8_path):
     """Fixture providing 100KB of enwik8 for quick tests."""
-    with open(enwik8_path, "r") as f:
+    with open(enwik8_path, "r", encoding="utf-8") as f:
         return f.read(100_000)

 @pytest.fixture(scope="module")
 def enwik8_large(enwik8_path):
     """Fixture providing 10MB of enwik8 for performance tests."""
-    with open(enwik8_path, "r") as f:
+    with open(enwik8_path, "r", encoding="utf-8") as f:
         return f.read(10**7)

 def time_function(func, *args, **kwargs):

18
uv.lock
View File

@@ -311,7 +311,7 @@ name = "exceptiongroup"
 version = "1.3.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "typing-extensions", marker = "python_full_version < '3.12' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
+    { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
 ]
 sdist = { url = "https://files.pythonhosted.org/packages/0b/9f/a65090624ecf468cdca03533906e7c69ed7588582240cfe7cc9e770b50eb/exceptiongroup-1.3.0.tar.gz", hash = "sha256:b241f5885f560bc56a59ee63ca4c6a8bfa46ae4ad651af316d4e81817bb9fd88", size = 29749, upload-time = "2025-05-10T17:42:51.123Z" }
 wheels = [
@@ -777,7 +777,6 @@ dependencies = [
     { name = "datasets" },
     { name = "fastapi" },
     { name = "files-to-prompt" },
-    { name = "numpy" },
     { name = "psutil" },
     { name = "regex" },
     { name = "setuptools" },
@@ -811,7 +810,6 @@ requires-dist = [
     { name = "datasets", specifier = ">=4.0.0" },
     { name = "fastapi", specifier = ">=0.117.1" },
     { name = "files-to-prompt", specifier = ">=0.6" },
-    { name = "numpy", specifier = "==1.26.4" },
     { name = "psutil", specifier = ">=7.1.0" },
     { name = "regex", specifier = ">=2025.9.1" },
     { name = "setuptools", specifier = ">=80.9.0" },
@@ -951,7 +949,7 @@ name = "nvidia-cudnn-cu12"
 version = "9.10.2.21"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "extra == 'extra-8-nanochat-gpu'" },
+    { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" },
@@ -964,7 +962,7 @@ name = "nvidia-cufft-cu12"
 version = "11.3.3.83"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "extra == 'extra-8-nanochat-gpu'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" },
@@ -996,9 +994,9 @@ name = "nvidia-cusolver-cu12"
 version = "11.7.3.90"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-cublas-cu12", marker = "extra == 'extra-8-nanochat-gpu'" },
-    { name = "nvidia-cusparse-cu12", marker = "extra == 'extra-8-nanochat-gpu'" },
-    { name = "nvidia-nvjitlink-cu12", marker = "extra == 'extra-8-nanochat-gpu'" },
+    { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
+    { name = "nvidia-cusparse-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" },
@@ -1011,7 +1009,7 @@ name = "nvidia-cusparse-cu12"
 version = "12.5.8.93"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "nvidia-nvjitlink-cu12", marker = "extra == 'extra-8-nanochat-gpu'" },
+    { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" },
@@ -1955,7 +1953,7 @@ name = "triton"
 version = "3.4.0"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
-    { name = "setuptools", marker = "extra == 'extra-8-nanochat-gpu'" },
+    { name = "setuptools", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
 ]
 wheels = [
     { url = "https://files.pythonhosted.org/packages/62/ee/0ee5f64a87eeda19bbad9bc54ae5ca5b98186ed00055281fd40fb4beb10e/triton-3.4.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:7ff2785de9bc02f500e085420273bb5cc9c9bb767584a4aa28d6e360cec70128", size = 155430069, upload-time = "2025-07-30T19:58:21.715Z" },