move eval bundle download to be lazy and inside the python code so that we can substantially simplify the run bash scripts

Andrej Karpathy 2025-11-01 16:04:38 +00:00
parent 7d2c4a3d95
commit cf587acb1a
5 changed files with 35 additions and 31 deletions
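In outline, the per-script curl/unzip block goes away and the eval code fetches the bundle itself on first use. A condensed sketch of the new flow (simplified from the diffs below; the real postprocess step is place_eval_bundle in scripts/base_eval.py, which extracts into a temporary directory before moving the folder into place):

import os, zipfile
from nanochat.common import get_base_dir, download_file_with_lock

EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"

def unpack_bundle(zip_path):
    # simplified stand-in for place_eval_bundle: the zip contains a top-level
    # eval_bundle/ folder, so extracting into the base dir creates it there
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(get_base_dir())

eval_bundle_dir = os.path.join(get_base_dir(), "eval_bundle")
if not os.path.exists(eval_bundle_dir):
    # the first caller downloads under a file lock; other ranks block on the
    # lock, then find the file and bundle already in place
    download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=unpack_bundle)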

View File

@@ -22,13 +22,6 @@ fi
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 # wipe the report
 python -m nanochat.report reset

View File

@@ -58,7 +58,7 @@ def get_base_dir():
     os.makedirs(nanochat_dir, exist_ok=True)
     return nanochat_dir
 
-def download_file_with_lock(url, filename):
+def download_file_with_lock(url, filename, postprocess_fn=None):
     """
     Downloads a file from a URL to a local path in the base directory.
     Uses a lock file to prevent concurrent downloads among multiple ranks.
@@ -76,18 +76,24 @@ def download_file_with_lock(url, filename):
         # All other ranks block until it is released
         fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX)
+        # Recheck after acquiring lock (another process may have downloaded it)
         if os.path.exists(file_path):
             return file_path
+        # Download the content as bytes
         print(f"Downloading {url}...")
         with urllib.request.urlopen(url) as response:
-            content = response.read().decode('utf-8')
-        with open(file_path, 'w') as f:
+            content = response.read() # bytes
+        # Write to local file
+        with open(file_path, 'wb') as f:
             f.write(content)
         print(f"Downloaded to {file_path}")
+        # Run the postprocess function if provided
+        if postprocess_fn is not None:
+            postprocess_fn(file_path)
     # Clean up the lock file after the lock is released
     try:
         os.remove(lock_path)
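Two details of the generalized helper matter here: the payload now stays raw bytes end to end (response.read() written with 'wb', where the old version decoded to utf-8 and wrote text), so binary assets like zip archives survive intact; and the optional postprocess_fn runs while the flock is still held, so the unzip is finished before any waiting rank is released and goes looking for the bundle.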

View File

@@ -19,13 +19,6 @@ python -m nanochat.report reset
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
 source "$HOME/.cargo/env"
 uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 # train tokenizer on ~4B characters and kick off download of the rest for pretraining

View File

@@ -2,10 +2,10 @@
 Evaluate the CORE metric for a given model.
 
 Run on a single GPU:
-python base_eval.py
+python -m scripts.base_eval
 
 Run with torchrun on e.g. 8 GPUs:
-torchrun --nproc_per_node=8 base_eval.py
+torchrun --nproc_per_node=8 -m scripts.base_eval
 
 The script will print the CORE metric to the console.
 """
@@ -13,13 +13,16 @@ import os
 import csv
 import time
 import json
-import random
 import yaml
+import shutil
+import random
+import zipfile
+import tempfile
 from contextlib import nullcontext
 
 import torch
 
-from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type
+from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
 from nanochat.tokenizer import HuggingFaceTokenizer
 from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
@@ -27,6 +30,21 @@ from nanochat.core_eval import evaluate_task
 # -----------------------------------------------------------------------------
 # nanochat specific function dealing with I/O etc.
 
+# ~162MB of data needed to evaluate the CORE metric
+EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
+
+def place_eval_bundle(file_path):
+    # here file_path is the path to the eval_bundle.zip file
+    # we need to unzip it and place it in the base directory
+    base_dir = get_base_dir()
+    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+    with tempfile.TemporaryDirectory() as tmpdir:
+        with zipfile.ZipFile(file_path, 'r') as zip_ref:
+            zip_ref.extractall(tmpdir)
+        extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")
+        shutil.move(extracted_bundle_dir, eval_bundle_dir)
+    print0(f"Placed eval_bundle directory at {eval_bundle_dir}")
+
 def evaluate_model(model, tokenizer, device, max_per_task=-1):
     """
     Evaluate a base model on the CORE benchmark.
@@ -35,6 +53,9 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     # Load config and task metadata
     base_dir = get_base_dir()
     eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
+    # Download the eval bundle to disk (and unzip if needed)
+    if not os.path.exists(eval_bundle_dir):
+        download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
     config_path = os.path.join(eval_bundle_dir, "core.yaml")
     data_base_path = os.path.join(eval_bundle_dir, "eval_data")
     eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
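With the download folded into evaluate_model, no setup step is needed before running the eval: a first run per the updated docstring (python -m scripts.base_eval, or the torchrun variant) fetches and unpacks eval_bundle, and later runs skip straight to reading core.yaml. Extracting into a TemporaryDirectory and then moving the eval_bundle folder into place also keeps a half-finished unzip out of the final location, so the cheap os.path.exists check stays trustworthy.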

View File

@@ -73,15 +73,6 @@ python -m scripts.tok_eval
 # -----------------------------------------------------------------------------
 # Base model (pretraining)
 
-# Download the eval_bundle from s3 to evaluate CORE metric during training (~162MB)
-EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
-if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
-    curl -L -o eval_bundle.zip $EVAL_BUNDLE_URL
-    unzip -q eval_bundle.zip
-    rm eval_bundle.zip
-    mv eval_bundle $NANOCHAT_BASE_DIR
-fi
-
 # The d20 model is 561M parameters.
 # Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens.
 # Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars.
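The token budget in those comments is easy to verify; a quick arithmetic check (plain Python, just mirroring the numbers above):

params = 561e6              # d20 model parameter count
tokens = 20 * params        # Chinchilla-optimal data budget: ~1.12e10, i.e. ~11.2B tokens
chars = tokens * 4.8        # at ~4.8 chars/token: ~5.39e10, i.e. ~54B characters
print(f"{tokens/1e9:.1f}B tokens, {chars/1e9:.1f}B chars")  # -> 11.2B tokens, 53.9B chars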