From ed519b0f24ea1620a81653012103122a2e367d83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ph=C3=BAc=20H=2E=20L=C3=AA=20Kh=E1=BA=AFc?=
Date: Fri, 17 Oct 2025 17:21:25 +0700
Subject: [PATCH 1/7] Update engine.py with correct error message on assert

---
 nanochat/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/engine.py b/nanochat/engine.py
index de1253a..eb3fcac 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -83,7 +83,7 @@ class KVCache:
         for ix, (dim1, dim2) in enumerate(zip(self.kv_shape, other.kv_shape)):
             if ix in [0, 1, 3, 5]:
                 # num_layers, batch_size, num_heads, head_dim must match
-                assert dim1 == dim2, f"Batch dim mismatch: {dim1} != {dim2}"
+                assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
             elif ix == 2:
                 # batch_size can be expanded
                 assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"
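The assert that patch 1 fixes sits in the KVCache shape-compatibility check, where only some dimensions must match exactly. A minimal sketch of that rule, assuming the (num_layers, k/v, batch_size, num_heads, seq_len, head_dim) cache layout implied by the indices above:

```python
# Minimal sketch of the dim-compatibility rule above; not the actual KVCache class.
# Assumed layout: (num_layers, k/v, batch_size, num_heads, seq_len, head_dim).
def check_compatible(kv_shape, other_shape):
    for ix, (dim1, dim2) in enumerate(zip(kv_shape, other_shape)):
        if ix in (0, 1, 3, 5):
            # these dims must match exactly
            assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
        elif ix == 2:
            # the batch dim may broadcast from 1; dim 4 (seq_len) is free to grow
            assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"

check_compatible((12, 2, 8, 6, 128, 64), (12, 2, 1, 6, 256, 64))  # ok: batch broadcasts, seq grows
```

Before the fix, a mismatch at any of indices 0, 1, 3 or 5 was misreported as a "Batch dim mismatch"; the new message names the offending index instead.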
From fca2b8cd07a0929fb5a0368522f11a061d03e52e Mon Sep 17 00:00:00 2001
From: Marius Wachtler
Date: Fri, 24 Oct 2025 14:29:35 -0500
Subject: [PATCH 2/7] harden eval: prevent the calc tool from accessing
 globals and locals

By passing empty globals() and locals() to eval() we can prevent simple
malicious cases where the user gets the model to output a payload ending in
``` or "a".count("a")```, e.g. ```signal.raise_signal(9) or "a".count("a")```,
which would kill the process. Or one could maybe get it to output secrets, etc.

I think to make it 100% secure one would need to parse the AST and only
execute safe nodes, but this should make it much more robust.
---
 nanochat/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/engine.py b/nanochat/engine.py
index fee06a1..77530c5 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -37,7 +37,7 @@ def eval_with_timeout(formula, max_time=3):
         with timeout(max_time, formula):
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", SyntaxWarning)
-                return eval(formula)
+                return eval(formula, {"__builtins__": {}}, {})
     except Exception as e:
         signal.alarm(0)
         # print(f"Warning: Failed to eval {formula}, exception: {e}") # it's ok ignore wrong calculator usage
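A standalone sketch (not the engine code) of what the stripped-down eval environment does and does not block:

```python
# eval with builtins stripped, mirroring the patched eval_with_timeout call
safe_env = {"__builtins__": {}}

print(eval("2 + 2 * 10", safe_env, {}))      # 22 -- plain calculator arithmetic still works
print(eval('"a".count("a")', safe_env, {}))  # 1  -- method calls on literals still work

try:
    eval('__import__("signal").raise_signal(9)', safe_env, {})
except NameError as e:
    print(e)  # name '__import__' is not defined -- builtins and imported modules are unreachable
```

As the commit message concedes, this is not a watertight sandbox (attribute chains off literals can still reach object internals), but it cuts off the obvious routes through builtins and the calling module's globals.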
From a9de4b103858223646e0e8ba29ed32b8516aad8f Mon Sep 17 00:00:00 2001
From: water-vapor
Date: Sun, 26 Oct 2025 01:43:49 -0500
Subject: [PATCH 3/7] Fix tok/sec metrics for base_train and mid_train when
 gradient accumulation is not 1

---
 scripts/base_train.py | 2 +-
 scripts/mid_train.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index 3725805..47ecba4 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -294,7 +294,7 @@ for step in range(num_iterations + 1):
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * step / num_iterations
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
diff --git a/scripts/mid_train.py b/scripts/mid_train.py
index eedb262..6c2b82f 100644
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -268,7 +268,7 @@ while True:
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * progress
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
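The bug being fixed: `dt` times a full optimizer step, which spans every gradient-accumulation micro-step, while `world_tokens_per_fwdbwd` only counts the tokens of a single forward/backward pass. A back-of-the-envelope check with made-up numbers (the variable relationships are assumed from how they are used in the hunks above, not taken verbatim from the scripts):

```python
# Illustrative numbers only; relationships assumed from the diff context above.
device_batch_size = 32    # sequences per GPU per micro-step
sequence_len = 2048       # tokens per sequence
ddp_world_size = 8        # number of GPUs
grad_accum_steps = 4      # micro-steps per optimizer step

world_tokens_per_fwdbwd = device_batch_size * sequence_len * ddp_world_size  # 524288
total_batch_size = world_tokens_per_fwdbwd * grad_accum_steps                # 2097152

dt = 2.0  # seconds per optimizer step, covering all 4 micro-steps
print(int(world_tokens_per_fwdbwd / dt))  # 262144  -- old metric, 4x too low
print(int(total_batch_size / dt))         # 1048576 -- fixed metric
```

When `grad_accum_steps` is 1 the two expressions agree, which is why the bug only showed up with gradient accumulation enabled; note the FLOPs and MFU lines were already using `total_batch_size` and were unaffected.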
From 5e0987a431553a84ba82d835d1da5daccd70d095 Mon Sep 17 00:00:00 2001
From: Ajeesh Sunil <98960341+Aj-esh@users.noreply.github.com>
Date: Tue, 28 Oct 2025 20:05:38 +0000
Subject: [PATCH 4/7] numpy isn't acting as a dependency for nanochat, so
 isn't it better to remove numpy from the dependencies list

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index da674f4..3d03c4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,6 @@ dependencies = [
     "datasets>=4.0.0",
     "fastapi>=0.117.1",
     "files-to-prompt>=0.6",
-    "numpy==1.26.4",
     "psutil>=7.1.0",
     "regex>=2025.9.1",
     "setuptools>=80.9.0",

From f15732524a1cbe782c4546ef9db458cd88d7df1e Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 1 Nov 2025 14:13:29 +0000
Subject: [PATCH 5/7] make deepwiki link better

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f13dba0..18ea5ce 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,7 @@ files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --
 
 This includes all py, rs, html, toml, sh files, excludes the `rustbpe/target` folder, and chooses the cxml output format. Everything is written to the `packaged.txt` file, which atm measures ~330KB (i.e. well below ~100K tokens for a state of the art LLM), and ~8K lines of code in 45 files.
 
-Alternatively, I recommend using [DeepWiki](https://deepwiki.com/) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off.
+Alternatively, I recommend using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off.
 
 ## Tests

From 7d2c4a3d957bd9cdc1e4e54b1ab8a947ffc74edb Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 1 Nov 2025 15:28:30 +0000
Subject: [PATCH 6/7] delete pandas dep in base_eval use csv instead

---
 scripts/base_eval.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 8efde4f..c488c8a 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -1,5 +1,5 @@
 """
-Evlauate the CORE metric for a given model.
+Evaluate the CORE metric for a given model.
 
 Run on a single GPU:
 python base_eval.py
@@ -10,14 +10,13 @@ torchrun --nproc_per_node=8 base_eval.py
 The script will print the CORE metric to the console.
 """
 import os
-import sys
+import csv
 import time
 import json
 import random
 import yaml
 from contextlib import nullcontext
 
-import pandas as pd
 import torch
 
 from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
@@ -26,13 +25,12 @@ from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
 
 # -----------------------------------------------------------------------------
-# nanoChat specific function dealing with I/O etc.
+# nanochat specific function dealing with I/O etc.
 
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
     """
     Evaluate a base model on the CORE benchmark.
     - max_per_task: crop the data to this many examples per task for testing (-1 = disable)
-    - TODO: clean up this function, delete the need for all the files, for pandas dependency, etc.
     """
     # Load config and task metadata
     base_dir = get_base_dir()
@@ -43,7 +41,15 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     with open(config_path, 'r') as f:
         config = yaml.safe_load(f)
     tasks = config['icl_tasks']
-    eval_metadata = pd.read_csv(eval_meta_data)
+
+    # Load random baseline values from eval metadata
+    random_baselines = {}
+    with open(eval_meta_data, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            task_name = row['Eval Task']
+            random_baseline = row['Random baseline']
+            random_baselines[task_name] = float(random_baseline)
 
     # Evaluate each task
     results = {}
@@ -75,8 +81,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
         accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
         results[label] = accuracy
-        row = eval_metadata[eval_metadata["Eval Task"] == label]
-        random_baseline = row["Random baseline"].values[0]
+        random_baseline = random_baselines[label]
         centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
         centered_results[label] = centered_result
         end_time = time.time()
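The context lines of the last hunk show how the loaded baselines are used: raw accuracy is centered so that the task's random baseline maps to 0 and perfect accuracy maps to 1. A worked example (the 55% accuracy and 25% baseline are made-up illustrative values; the `Random baseline` CSV column is stored in percent, hence the 0.01 factors):

```python
accuracy = 0.55          # raw accuracy on some task (illustrative)
random_baseline = 25.0   # 'Random baseline' column, in percent (illustrative)
centered = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
print(centered)  # 0.4 -- chance-level accuracy maps to 0.0, perfect accuracy to 1.0
```

The dict built with `csv.DictReader` provides the same task-to-baseline lookup that the pandas row selection used to do, without the dependency.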
From cf587acb1a51003463c7eda250e95842802b80fd Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 1 Nov 2025 16:04:38 +0000
Subject: [PATCH 7/7] move eval bundle download to be lazy and inside the
 python code so that we can substantially simplify the run bash scripts

---
 dev/runcpu.sh        |  7 -------
 nanochat/common.py   | 14 ++++++++++----
 run1000.sh           |  7 -------
 scripts/base_eval.py | 29 +++++++++++++++++++++++++----
 speedrun.sh          |  9 ---------
 5 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/dev/runcpu.sh b/dev/runcpu.sh
index 469e51d..ffacefa 100755
--- a/dev/runcpu.sh
+++ b/dev/runcpu.sh
@@ -22,13 +22,6 @@ fi
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 
 # wipe the report
 python -m nanochat.report reset
diff --git a/nanochat/common.py b/nanochat/common.py
index a5a6d2e..8272378 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -58,7 +58,7 @@ def get_base_dir():
     os.makedirs(nanochat_dir, exist_ok=True)
     return nanochat_dir
 
-def download_file_with_lock(url, filename):
+def download_file_with_lock(url, filename, postprocess_fn=None):
     """
     Downloads a file from a URL to a local path in the base directory.
     Uses a lock file to prevent concurrent downloads among multiple ranks.
@@ -76,18 +76,24 @@
         # All other ranks block until it is released
         fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
 
+        # Recheck after acquiring lock (another process may have downloaded it)
         if os.path.exists(file_path):
             return file_path
 
+        # Download the content as bytes
         print(f"Downloading {url}...")
         with urllib.request.urlopen(url) as response:
-            content = response.read().decode('utf-8')
+            content = response.read() # bytes
 
-        with open(file_path, 'w') as f:
+        # Write to local file
+        with open(file_path, 'wb') as f:
             f.write(content)
 
-        print(f"Downloaded to {file_path}")
+        # Run the postprocess function if provided
+        if postprocess_fn is not None:
+            postprocess_fn(file_path)
 
+        # Clean up the lock file after the lock is released
     try:
         os.remove(lock_path)
diff --git a/run1000.sh b/run1000.sh
index 6f454e0..e0bc4c4 100644
--- a/run1000.sh
+++ b/run1000.sh
@@ -19,13 +19,6 @@ python -m nanochat.report reset
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 
 # train tokenizer on ~4B characters and kick off download of the rest for pretraining
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index c488c8a..21f7bac 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -2,10 +2,10 @@
 Evaluate the CORE metric for a given model.
 
 Run on a single GPU:
-python base_eval.py
+python -m scripts.base_eval
 
 Run with torchrun on e.g. 8 GPUs:
-torchrun --nproc_per_node=8 base_eval.py
+torchrun --nproc_per_node=8 -m scripts.base_eval
 
 The script will print the CORE metric to the console.
 """
@@ -13,13 +13,16 @@ import os
 import csv
 import time
 import json
-import random
 import yaml
+import shutil
+import random
+import zipfile
+import tempfile
 from contextlib import nullcontext
 
 import torch
 
-from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
+from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
 from nanochat.tokenizer import HuggingFaceTokenizer
 from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
@@ -27,6 +30,21 @@ from nanochat.core_eval import evaluate_task
 
 # -----------------------------------------------------------------------------
 # nanochat specific function dealing with I/O etc.
 
+# ~162MB of data needed to evaluate the CORE metric
+EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
+
+def place_eval_bundle(file_path):
+    # here file_path is the path to the eval_bundle.zip file
+    # we need to unzip it and place it in the base directory
+    base_dir = get_base_dir()
+    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(tmpdir)
+        extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")
+        shutil.move(extracted_bundle_dir, eval_bundle_dir)
+    print0(f"Placed eval_bundle directory at {eval_bundle_dir}")
+
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
     """
     Evaluate a base model on the CORE benchmark.
@@ -35,6 +53,9 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     # Load config and task metadata
     base_dir = get_base_dir()
     eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+    # Download the eval bundle to disk (and unzip if needed)
+    if not os.path.exists(eval_bundle_dir):
+        download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
     config_path = os.path.join(eval_bundle_dir, "core.yaml")
     data_base_path = os.path.join(eval_bundle_dir, "eval_data")
     eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
diff --git a/speedrun.sh b/speedrun.sh
index 35dd39e..32c8870 100644
--- a/speedrun.sh
+++ b/speedrun.sh
@@ -73,15 +73,6 @@ python -m scripts.tok_eval
 # -----------------------------------------------------------------------------
 # Base model (pretraining)
 
-# Download the eval_bundle from s3 to evaluate CORE metric during training (~162MB)
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
-
 # The d20 model is 561M parameters.
 # Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens.
 # Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars.
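The common.py hunks above only show fragments of `download_file_with_lock`, so here is a simplified sketch of the whole pattern it implements: a double-checked download under an `fcntl` lock, with an optional postprocess hook. The function name and structure are illustrative, not the exact nanochat code:

```python
import os
import fcntl
import urllib.request

def fetch_once(url, file_path, postprocess_fn=None):
    """Download url to file_path exactly once, even with many ranks racing."""
    if os.path.exists(file_path):
        return file_path  # fast path, no lock needed
    lock_path = file_path + ".lock"
    with open(lock_path, "w") as lock_file:
        # The first rank to arrive holds the lock; the others block here
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
        # Recheck after acquiring the lock: a peer may have finished meanwhile
        if os.path.exists(file_path):
            return file_path
        with urllib.request.urlopen(url) as response:
            content = response.read()  # bytes, so zip archives survive intact
        with open(file_path, "wb") as f:
            f.write(content)
        if postprocess_fn is not None:
            postprocess_fn(file_path)  # e.g. unzip and move, like place_eval_bundle
    return file_path
```

Switching the read from `.decode('utf-8')` to raw bytes is what lets the same helper fetch the zip bundle, and the `postprocess_fn` hook is how `scripts/base_eval.py` plugs in `place_eval_bundle`, replacing the curl/unzip blocks deleted from the three shell scripts.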