From ed519b0f24ea1620a81653012103122a2e367d83 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ph=C3=BAc=20H=2E=20L=C3=AA=20Kh=E1=BA=AFc?=
Date: Fri, 17 Oct 2025 17:21:25 +0700
Subject: [PATCH 1/7] Update engine.py with correct error message on assert

---
 nanochat/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/engine.py b/nanochat/engine.py
index de1253a..eb3fcac 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -83,7 +83,7 @@ class KVCache:
         for ix, (dim1, dim2) in enumerate(zip(self.kv_shape, other.kv_shape)):
             if ix in [0, 1, 3, 5]:
                 # num_layers, batch_size, num_heads, head_dim must match
-                assert dim1 == dim2, f"Batch dim mismatch: {dim1} != {dim2}"
+                assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
             elif ix == 2:
                 # batch_size can be expanded
                 assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"
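The assert that patch 1 fixes sits in the KVCache shape-compatibility check, where only some dimensions must match exactly. A minimal sketch of that rule, assuming the (num_layers, k/v, batch_size, num_heads, seq_len, head_dim) cache layout implied by the indices above:

```python
# Minimal sketch of the dim-compatibility rule above; not the actual KVCache class.
# Assumed layout: (num_layers, k/v, batch_size, num_heads, seq_len, head_dim).
def check_compatible(kv_shape, other_shape):
    for ix, (dim1, dim2) in enumerate(zip(kv_shape, other_shape)):
        if ix in (0, 1, 3, 5):
            # these dims must match exactly
            assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
        elif ix == 2:
            # the batch dim may broadcast from 1; dim 4 (seq_len) is free to grow
            assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"

check_compatible((12, 2, 8, 6, 128, 64), (12, 2, 1, 6, 256, 64))  # ok: batch broadcasts, seq grows
```

Before the fix, a mismatch at any of indices 0, 1, 3 or 5 was misreported as a "Batch dim mismatch"; the new message names the offending index instead.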
From fca2b8cd07a0929fb5a0368522f11a061d03e52e Mon Sep 17 00:00:00 2001
From: Marius Wachtler
Date: Fri, 24 Oct 2025 14:29:35 -0500
Subject: [PATCH 2/7] harden eval: prevent the calc tool from accessing
 globals and locals

By passing empty globals() and locals() to eval() we can prevent simple
malicious cases where the user gets the model to output a payload ending in
``` or "a".count("a")```, e.g. ```signal.raise_signal(9) or "a".count("a")```,
which would kill the process. Or one could maybe get it to output secrets, etc.

I think to make it 100% secure one would need to parse the AST and only
execute safe nodes, but this should make it much more robust.
---
 nanochat/engine.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/nanochat/engine.py b/nanochat/engine.py
index fee06a1..77530c5 100644
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@@ -37,7 +37,7 @@ def eval_with_timeout(formula, max_time=3):
         with timeout(max_time, formula):
             with warnings.catch_warnings():
                 warnings.simplefilter("ignore", SyntaxWarning)
-                return eval(formula)
+                return eval(formula, {"__builtins__": {}}, {})
     except Exception as e:
         signal.alarm(0)
         # print(f"Warning: Failed to eval {formula}, exception: {e}") # it's ok ignore wrong calculator usage
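A standalone sketch (not the engine code) of what the stripped-down eval environment does and does not block:

```python
# eval with builtins stripped, mirroring the patched eval_with_timeout call
safe_env = {"__builtins__": {}}

print(eval("2 + 2 * 10", safe_env, {}))      # 22 -- plain calculator arithmetic still works
print(eval('"a".count("a")', safe_env, {}))  # 1  -- method calls on literals still work

try:
    eval('__import__("signal").raise_signal(9)', safe_env, {})
except NameError as e:
    print(e)  # name '__import__' is not defined -- builtins and imported modules are unreachable
```

As the commit message concedes, this is not a watertight sandbox (attribute chains off literals can still reach object internals), but it cuts off the obvious routes through builtins and the calling module's globals.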
From a9de4b103858223646e0e8ba29ed32b8516aad8f Mon Sep 17 00:00:00 2001
From: water-vapor
Date: Sun, 26 Oct 2025 01:43:49 -0500
Subject: [PATCH 3/7] Fix tok/sec metrics for base_train and mid_train when
 gradient accumulation is not 1

---
 scripts/base_train.py | 2 +-
 scripts/mid_train.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index 3725805..47ecba4 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -294,7 +294,7 @@ for step in range(num_iterations + 1):
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * step / num_iterations
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
diff --git a/scripts/mid_train.py b/scripts/mid_train.py
index eedb262..6c2b82f 100644
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -268,7 +268,7 @@ while True:
     smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
     debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
     pct_done = 100 * progress
-    tok_per_sec = int(world_tokens_per_fwdbwd / dt)
+    tok_per_sec = int(total_batch_size / dt)
     flops_per_sec = num_flops_per_token * total_batch_size / dt
     promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
     mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
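The bug being fixed: `dt` times a full optimizer step, which spans every gradient-accumulation micro-step, while `world_tokens_per_fwdbwd` only counts the tokens of a single forward/backward pass. A back-of-the-envelope check with made-up numbers (the variable relationships are assumed from how they are used in the hunks above, not taken verbatim from the scripts):

```python
# Illustrative numbers only; relationships assumed from the diff context above.
device_batch_size = 32    # sequences per GPU per micro-step
sequence_len = 2048       # tokens per sequence
ddp_world_size = 8        # number of GPUs
grad_accum_steps = 4      # micro-steps per optimizer step

world_tokens_per_fwdbwd = device_batch_size * sequence_len * ddp_world_size  # 524288
total_batch_size = world_tokens_per_fwdbwd * grad_accum_steps                # 2097152

dt = 2.0  # seconds per optimizer step, covering all 4 micro-steps
print(int(world_tokens_per_fwdbwd / dt))  # 262144  -- old metric, 4x too low
print(int(total_batch_size / dt))         # 1048576 -- fixed metric
```

When `grad_accum_steps` is 1 the two expressions agree, which is why the bug only showed up with gradient accumulation enabled; note the FLOPs and MFU lines were already using `total_batch_size` and were unaffected.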
From 5e0987a431553a84ba82d835d1da5daccd70d095 Mon Sep 17 00:00:00 2001
From: Ajeesh Sunil <98960341+Aj-esh@users.noreply.github.com>
Date: Tue, 28 Oct 2025 20:05:38 +0000
Subject: [PATCH 4/7] numpy isn't acting as a dependency for nanochat, so
 isn't it better to remove numpy from the dependencies list

---
 pyproject.toml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index da674f4..3d03c4b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,6 @@ dependencies = [
     "datasets>=4.0.0",
     "fastapi>=0.117.1",
     "files-to-prompt>=0.6",
-    "numpy==1.26.4",
     "psutil>=7.1.0",
     "regex>=2025.9.1",
     "setuptools>=80.9.0",

From f15732524a1cbe782c4546ef9db458cd88d7df1e Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 1 Nov 2025 14:13:29 +0000
Subject: [PATCH 5/7] make deepwiki link better

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f13dba0..18ea5ce 100644
--- a/README.md
+++ b/README.md
@@ -113,7 +113,7 @@ files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --
 
 This includes all py, rs, html, toml, sh files, excludes the `rustbpe/target` folder, and chooses the cxml output format. Everything is written to the `packaged.txt` file, which atm measures ~330KB (i.e. well below ~100K tokens for a state of the art LLM), and ~8K lines of code in 45 files.
 
-Alternatively, I recommend using [DeepWiki](https://deepwiki.com/) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off.
+Alternatively, I recommend using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off.
 
 ## Tests

From 7d2c4a3d957bd9cdc1e4e54b1ab8a947ffc74edb Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 1 Nov 2025 15:28:30 +0000
Subject: [PATCH 6/7] delete pandas dep in base_eval use csv instead

---
 scripts/base_eval.py | 21 +++++++++++++--------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 8efde4f..c488c8a 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -1,5 +1,5 @@
 """
-Evlauate the CORE metric for a given model.
+Evaluate the CORE metric for a given model.
 
 Run on a single GPU:
 python base_eval.py
@@ -10,14 +10,13 @@ torchrun --nproc_per_node=8 base_eval.py
 The script will print the CORE metric to the console.
 """
 import os
-import sys
+import csv
 import time
 import json
 import random
 import yaml
 from contextlib import nullcontext
 
-import pandas as pd
 import torch
 
 from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
@@ -26,13 +25,12 @@ from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
 
 # -----------------------------------------------------------------------------
-# nanoChat specific function dealing with I/O etc.
+# nanochat specific function dealing with I/O etc.
 
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
     """
     Evaluate a base model on the CORE benchmark.
     - max_per_task: crop the data to this many examples per task for testing (-1 = disable)
-    - TODO: clean up this function, delete the need for all the files, for pandas dependency, etc.
     """
     # Load config and task metadata
     base_dir = get_base_dir()
@@ -43,7 +41,15 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     with open(config_path, 'r') as f:
         config = yaml.safe_load(f)
     tasks = config['icl_tasks']
-    eval_metadata = pd.read_csv(eval_meta_data)
+
+    # Load random baseline values from eval metadata
+    random_baselines = {}
+    with open(eval_meta_data, 'r', encoding='utf-8') as f:
+        reader = csv.DictReader(f)
+        for row in reader:
+            task_name = row['Eval Task']
+            random_baseline = row['Random baseline']
+            random_baselines[task_name] = float(random_baseline)
 
     # Evaluate each task
     results = {}
@@ -75,8 +81,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
         accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
         results[label] = accuracy
-        row = eval_metadata[eval_metadata["Eval Task"] == label]
-        random_baseline = row["Random baseline"].values[0]
+        random_baseline = random_baselines[label]
         centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
         centered_results[label] = centered_result
         end_time = time.time()
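The context lines of the last hunk show how the loaded baselines are used: raw accuracy is centered so that the task's random baseline maps to 0 and perfect accuracy maps to 1. A worked example (the 55% accuracy and 25% baseline are made-up illustrative values; the `Random baseline` CSV column is stored in percent, hence the 0.01 factors):

```python
accuracy = 0.55          # raw accuracy on some task (illustrative)
random_baseline = 25.0   # 'Random baseline' column, in percent (illustrative)
centered = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
print(centered)  # 0.4 -- chance-level accuracy maps to 0.0, perfect accuracy to 1.0
```

The dict built with `csv.DictReader` provides the same task-to-baseline lookup that the pandas row selection used to do, without the dependency.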
From cf587acb1a51003463c7eda250e95842802b80fd Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Sat, 1 Nov 2025 16:04:38 +0000
Subject: [PATCH 7/7] move eval bundle download to be lazy and inside the
 python code so that we can substantially simplify the run bash scripts

---
 dev/runcpu.sh        |  7 -------
 nanochat/common.py   | 14 ++++++++++----
 run1000.sh           |  7 -------
 scripts/base_eval.py | 29 +++++++++++++++++++++++++----
 speedrun.sh          |  9 ---------
 5 files changed, 35 insertions(+), 31 deletions(-)

diff --git a/dev/runcpu.sh b/dev/runcpu.sh
index 469e51d..ffacefa 100755
--- a/dev/runcpu.sh
+++ b/dev/runcpu.sh
@@ -22,13 +22,6 @@ fi
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 
 # wipe the report
 python -m nanochat.report reset
diff --git a/nanochat/common.py b/nanochat/common.py
index a5a6d2e..8272378 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -58,7 +58,7 @@ def get_base_dir():
     os.makedirs(nanochat_dir, exist_ok=True)
     return nanochat_dir
 
-def download_file_with_lock(url, filename):
+def download_file_with_lock(url, filename, postprocess_fn=None):
     """
     Downloads a file from a URL to a local path in the base directory.
     Uses a lock file to prevent concurrent downloads among multiple ranks.
@@ -76,18 +76,24 @@
         # All other ranks block until it is released
         fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
 
+        # Recheck after acquiring lock (another process may have downloaded it)
         if os.path.exists(file_path):
             return file_path
 
+        # Download the content as bytes
         print(f"Downloading {url}...")
         with urllib.request.urlopen(url) as response:
-            content = response.read().decode('utf-8')
+            content = response.read() # bytes
 
-        with open(file_path, 'w') as f:
+        # Write to local file
+        with open(file_path, 'wb') as f:
             f.write(content)
 
-        print(f"Downloaded to {file_path}")
+        # Run the postprocess function if provided
+        if postprocess_fn is not None:
+            postprocess_fn(file_path)
 
+        # Clean up the lock file after the lock is released
     try:
         os.remove(lock_path)
diff --git a/run1000.sh b/run1000.sh
index 6f454e0..e0bc4c4 100644
--- a/run1000.sh
+++ b/run1000.sh
@@ -19,13 +19,6 @@ python -m nanochat.report reset
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 
 # train tokenizer on ~4B characters and kick off download of the rest for pretraining
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index c488c8a..21f7bac 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -2,10 +2,10 @@
 Evaluate the CORE metric for a given model.
 
 Run on a single GPU:
-python base_eval.py
+python -m scripts.base_eval
 
 Run with torchrun on e.g. 8 GPUs:
-torchrun --nproc_per_node=8 base_eval.py
+torchrun --nproc_per_node=8 -m scripts.base_eval
 
 The script will print the CORE metric to the console.
 """
@@ -13,13 +13,16 @@ import os
 import csv
 import time
 import json
-import random
 import yaml
+import shutil
+import random
+import zipfile
+import tempfile
 from contextlib import nullcontext
 
 import torch
 
-from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
+from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
 from nanochat.tokenizer import HuggingFaceTokenizer
 from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
@@ -27,6 +30,21 @@ from nanochat.core_eval import evaluate_task
 
 # -----------------------------------------------------------------------------
 # nanochat specific function dealing with I/O etc.
 
+# ~162MB of data needed to evaluate the CORE metric
+EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
+
+def place_eval_bundle(file_path):
+    # here file_path is the path to the eval_bundle.zip file
+    # we need to unzip it and place it in the base directory
+    base_dir = get_base_dir()
+    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(tmpdir)
+        extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")
+        shutil.move(extracted_bundle_dir, eval_bundle_dir)
+    print0(f"Placed eval_bundle directory at {eval_bundle_dir}")
+
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
     """
     Evaluate a base model on the CORE benchmark.
@@ -35,6 +53,9 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     # Load config and task metadata
     base_dir = get_base_dir()
     eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+    # Download the eval bundle to disk (and unzip if needed)
+    if not os.path.exists(eval_bundle_dir):
+        download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
     config_path = os.path.join(eval_bundle_dir, "core.yaml")
     data_base_path = os.path.join(eval_bundle_dir, "eval_data")
     eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
diff --git a/speedrun.sh b/speedrun.sh
index 35dd39e..32c8870 100644
--- a/speedrun.sh
+++ b/speedrun.sh
@@ -73,15 +73,6 @@ python -m scripts.tok_eval
 # -----------------------------------------------------------------------------
 # Base model (pretraining)
 
-# Download the eval_bundle from s3 to evaluate CORE metric during training (~162MB)
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
-
 # The d20 model is 561M parameters.
 # Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens.
 # Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars.
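The common.py hunks above only show fragments of `download_file_with_lock`, so here is a simplified sketch of the whole pattern it implements: a double-checked download under an `fcntl` lock, with an optional postprocess hook. The function name and structure are illustrative, not the exact nanochat code:

```python
import os
import fcntl
import urllib.request

def fetch_once(url, file_path, postprocess_fn=None):
    """Download url to file_path exactly once, even with many ranks racing."""
    if os.path.exists(file_path):
        return file_path  # fast path, no lock needed
    lock_path = file_path + ".lock"
    with open(lock_path, "w") as lock_file:
        # The first rank to arrive holds the lock; the others block here
        fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
        # Recheck after acquiring the lock: a peer may have finished meanwhile
        if os.path.exists(file_path):
            return file_path
        with urllib.request.urlopen(url) as response:
            content = response.read()  # bytes, so zip archives survive intact
        with open(file_path, "wb") as f:
            f.write(content)
        if postprocess_fn is not None:
            postprocess_fn(file_path)  # e.g. unzip and move, like place_eval_bundle
    return file_path
```

Switching the read from `.decode('utf-8')` to raw bytes is what lets the same helper fetch the zip bundle, and the `postprocess_fn` hook is how `scripts/base_eval.py` plugs in `place_eval_bundle`, replacing the curl/unzip blocks deleted from the three shell scripts.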