Merge the two scripts base_loss and base_eval into a single file (it's nicer this way), and unify the HuggingFace code that was duplicated across both

2026-07-06 21:09:15 +00:00 · 2026-02-01 02:36:43 +00:00 · 2026-02-01 02:36:43 +00:00 · 0307997f9b
commit 0307997f9b
parent 1ddaad1c1c
5 changed files with 219 additions and 261 deletions
--- a/README.md
+++ b/README.md
@ -142,8 +142,7 @@ I've published a number of guides that might contain helpful information:
 │   ├── scaling_laws.sh             # Scaling laws experiments
 │   └── speedrun.sh                 # Train the ~$100 nanochat d20
 ├── scripts
-│   ├── base_eval.py                # Base model: calculate CORE score
-│   ├── base_loss.py                # Base model: calculate bits per byte, sample
+│   ├── base_eval.py                # Base model: CORE score, bits per byte, samples
 │   ├── base_train.py               # Base model: train
 │   ├── chat_cli.py                 # Chat model: talk to over CLI
 │   ├── chat_eval.py                # Chat model: eval tasks
--- a/runs/runcpu.sh
+++ b/runs/runcpu.sh
@ -42,8 +42,7 @@ python -m scripts.base_train \
    --sample-every=100 \
    --num-iterations=5000 \
    --run=$WANDB_RUN
-python -m scripts.base_loss --device-batch-size=1 --split-tokens=16384
-python -m scripts.base_eval --max-per-task=16
+python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16

 # SFT (~10 minutes on my MacBook Pro M3 Max)
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
--- a/runs/speedrun.sh
+++ b/runs/speedrun.sh
@ -74,9 +74,7 @@ NPROC_PER_NODE=8

 # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12)
 torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN
-# evaluate the model on a larger chunk of train/val data and draw some samples
-torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss
-# evaluate the model on CORE tasks
+# evaluate the model: CORE metric, BPB on train/val, and draw samples
 torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval

 # -----------------------------------------------------------------------------
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@ -1,13 +1,23 @@
 """
-Evaluate the CORE metric for a given model.
+Unified evaluation script for base models.

-Run on a single GPU:
-python -m scripts.base_eval
+Supports three evaluation modes (comma-separated):
+  --eval core    : CORE metric (accuracy on ICL tasks)
+  --eval bpb     : Bits per byte on train/val splits
+  --eval sample  : Generate samples from the model

-Run with torchrun on e.g. 8 GPUs:
-torchrun --nproc_per_node=8 -m scripts.base_eval
+Default is all three: --eval core,bpb,sample

-The script will print the CORE metric to the console.
+Examples:
+
+    # Evaluate a HuggingFace model (e.g. GPT-2 124M) using 8 GPUs
+    torchrun --nproc_per_node=8 -m scripts.base_eval --hf-path openai-community/gpt2
+
+    # Evaluate a nanochat model (e.g. d24) using 8 GPUs
+    torchrun --nproc_per_node=8 -m scripts.base_eval --model-tag d24 --device-batch-size=16
+
+    # Quick/approximate evaluation using a single GPU
+    python -m scripts.base_eval --model-tag d24 --device-batch-size=16 --max-per-task=100 --split-tokens=524288
 """
 import os
 import csv
@ -18,24 +28,74 @@ import shutil
 import random
 import zipfile
 import tempfile
+import argparse
 from contextlib import nullcontext

 import torch

 from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
-from nanochat.tokenizer import HuggingFaceTokenizer
+from nanochat.tokenizer import HuggingFaceTokenizer, get_token_bytes
 from nanochat.checkpoint_manager import load_model
 from nanochat.core_eval import evaluate_task
+from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit
+from nanochat.loss_eval import evaluate_bpb
+from nanochat.engine import Engine

 # -----------------------------------------------------------------------------
-# nanochat specific function dealing with I/O etc.
+# HuggingFace loading utilities
+
+class ModelWrapper:
+    """Lightweight wrapper to give HuggingFace models a nanochat-compatible interface."""
+    def __init__(self, model, max_seq_len=None):
+        self.model = model
+        self.max_seq_len = max_seq_len
+
+    def __call__(self, input_ids, targets=None, loss_reduction='mean'):
+        logits = self.model(input_ids).logits
+        if targets is None:
+            return logits
+        loss = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.size(-1)),
+            targets.view(-1),
+            ignore_index=-1,
+            reduction=loss_reduction
+        )
+        return loss
+
+    def get_device(self):
+        return next(self.model.parameters()).device
+
+
+def load_hf_model(hf_path: str, device):
+    """Load a HuggingFace model and tokenizer."""
+    print0(f"Loading HuggingFace model from: {hf_path}")
+    from transformers import AutoModelForCausalLM
+    model = AutoModelForCausalLM.from_pretrained(hf_path)
+    model.to(device)
+    model.eval()
+    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
+    model = ModelWrapper(model, max_seq_len=max_seq_len)
+    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
+    return model, tokenizer
+
+
+def get_hf_token_bytes(tokenizer, device="cpu"):
+    """Compute token_bytes tensor for a HuggingFace tokenizer."""
+    vocab_size = tokenizer.tokenizer.get_vocab_size()
+    token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device)
+    for token_id in range(vocab_size):
+        token_str = tokenizer.tokenizer.decode([token_id])
+        token_bytes[token_id] = len(token_str.encode('utf-8'))
+    return token_bytes
+
+# -----------------------------------------------------------------------------
+# CORE evaluation

-# ~162MB of data needed to evaluate the CORE metric
 EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"

+
 def place_eval_bundle(file_path):
-    # here file_path is the path to the eval_bundle.zip file
-    # we need to unzip it and place it in the base directory
+    """Unzip eval_bundle.zip and place it in the base directory."""
    base_dir = get_base_dir()
    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
    with tempfile.TemporaryDirectory() as tmpdir:
@ -45,25 +105,27 @@ def place_eval_bundle(file_path):
        shutil.move(extracted_bundle_dir, eval_bundle_dir)
    print0(f"Placed eval_bundle directory at {eval_bundle_dir}")

-def evaluate_model(model, tokenizer, device, max_per_task=-1):
+
+def evaluate_core(model, tokenizer, device, max_per_task=-1):
    """
    Evaluate a base model on the CORE benchmark.
-    - max_per_task: crop the data to this many examples per task for testing (-1 = disable)
+    Returns dict with results, centered_results, and core_metric. If max_per_task > 0, each task is cropped to that many examples.
    """
-    # Load config and task metadata
    base_dir = get_base_dir()
    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
-    # Download the eval bundle to disk (and unzip if needed)
+    # Download the eval bundle if needed
    if not os.path.exists(eval_bundle_dir):
        download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
+
    config_path = os.path.join(eval_bundle_dir, "core.yaml")
    data_base_path = os.path.join(eval_bundle_dir, "eval_data")
    eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
+
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    tasks = config['icl_tasks']

-    # Load random baseline values from eval metadata
+    # Load random baseline values
    random_baselines = {}
    with open(eval_meta_data, 'r', encoding='utf-8') as f:
        reader = csv.DictReader(f)
@ -86,27 +148,23 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
        }
        print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')

-        # Load data for this task
        data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
        with open(data_path, 'r', encoding='utf-8') as f:
            data = [json.loads(line.strip()) for line in f]

-        # shuffle the data because in many cases it appears ordered but we want
-        # the ability to only run a subset of the data for debugging purposes etc.
+        # Shuffle for consistent subsampling when using max_per_task
        shuffle_rng = random.Random(1337)
        shuffle_rng.shuffle(data)
        if max_per_task > 0:
            data = data[:max_per_task]

-        # run the evaluation for this task
        accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
-
        results[label] = accuracy
        random_baseline = random_baselines[label]
        centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
        centered_results[label] = centered_result
-        end_time = time.time()
-        print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")
+        elapsed = time.time() - start_time
+        print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {elapsed:.2f}s")

    core_metric = sum(centered_results.values()) / len(centered_results)
    out = {
@ -117,98 +175,157 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
    return out

 # -----------------------------------------------------------------------------
-# HuggingFace loading utilities and light wrappers for a model
+# Main

-class ModelWrapper:
-    """Lightweight wrapper for a HuggingFace model"""
-    def __init__(self, model, max_seq_len=None):
-        self.model = model
-        self.max_seq_len = max_seq_len
-
-    def __call__(self, input_ids):
-        outputs = self.model(input_ids)
-        logits = outputs.logits
-        return logits
-
-def load_hf_model(hf_path: str, device):
-    print0(f"Loading model from: {hf_path}")
-    # Load the model
-    from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(hf_path)
-    model.to(device)
-    model.eval()
-    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
-    model = ModelWrapper(model, max_seq_len=max_seq_len)
-    # Load the tokenizer
-    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
-    return model, tokenizer
-
-# -----------------------------------------------------------------------------
 def main():
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
-    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
-    parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name')
-    parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name')
+    parser = argparse.ArgumentParser(description="Base model evaluation")
+    parser.add_argument('--eval', type=str, default='core,bpb,sample', help='Comma-separated evaluations to run: core,bpb,sample (default: all)')
+    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. openai-community/gpt2)')
+    parser.add_argument('--model-tag', type=str, default=None, help='nanochat model tag to identify the checkpoint directory')
+    parser.add_argument('--step', type=int, default=None, help='Model step to load (default = last)')
+    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per CORE task (-1 = all)')
+    parser.add_argument('--device-batch-size', type=int, default=32, help='Per-device batch size for BPB evaluation')
+    parser.add_argument('--split-tokens', type=int, default=40*524288, help='Number of tokens to evaluate per split for BPB')
+    parser.add_argument('--device-type', type=str, default='', help='cuda|cpu|mps (empty = autodetect)')
    args = parser.parse_args()

-    # distributed / precision setup
-    device_type = autodetect_device_type()
+    # Parse evaluation modes
+    eval_modes = set(mode.strip() for mode in args.eval.split(','))
+    valid_modes = {'core', 'bpb', 'sample'}
+    invalid = eval_modes - valid_modes
+    if invalid:
+        parser.error(f"Invalid eval modes: {invalid}. Valid: {valid_modes}")
+
+    # Distributed / precision setup
+    device_type = autodetect_device_type() if args.device_type == '' else args.device_type
    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()

-    # Load model and tokenizer from command line or from file system
-    if args.hf_path is not None:
-        # atm assume that if a path is given, it's a huggingface model path
-        hf_path = args.hf_path
-        print0(f"Loading huggingface model from: {hf_path}")
-        model, tokenizer = load_hf_model(hf_path, device)
-        model_name = hf_path # just for logging
-        model_slug = hf_path.replace("/", "-") # for the output csv file
+    # Load model and tokenizer
+    is_hf_model = args.hf_path is not None
+    if is_hf_model:
+        model, tokenizer = load_hf_model(args.hf_path, device)
+        sequence_len = model.max_seq_len or 1024
+        token_bytes = get_hf_token_bytes(tokenizer, device=device)
+        model_name = args.hf_path
+        model_slug = args.hf_path.replace("/", "-")
    else:
-        # load a local model from the file system
        model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step)
-        model_name = f"base_model (step {meta['step']})" # just for logging
-        model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
+        sequence_len = meta["model_config"]["sequence_len"]
+        token_bytes = get_token_bytes(device=device)
+        model_name = f"base_model (step {meta['step']})"
+        model_slug = f"base_model_{meta['step']:06d}"

-    # Evaluate the model
-    with autocast_ctx:
-        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
+    print0(f"Evaluating model: {model_name}")
+    print0(f"Eval modes: {', '.join(sorted(eval_modes))}")

-    # Write out the results to a csv file
-    core_metric = None
-    centered_results = {}
-    if ddp_rank == 0:
-        base_dir = get_base_dir()
-        output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
-        os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
-        results = out["results"]
-        centered_results = out["centered_results"]
-        core_metric = out["core_metric"]
-        with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
-            f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
-            for label in results:
-                f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
-            f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")
-        # Print the content of the csv file to console too
+    # Results to log
+    core_results = None
+    bpb_results = {}
+    samples = []
+    unconditioned_samples = []
+
+    # --- CORE evaluation ---
+    if 'core' in eval_modes:
+        print0("\n" + "="*80)
+        print0("CORE Evaluation")
        print0("="*80)
-        print0(f"Model: {model_name}")
-        print0("="*80)
-        with open(output_csv_path, 'r', encoding='utf-8') as f:
-            print0(f.read())
+        with autocast_ctx:
+            core_results = evaluate_core(model, tokenizer, device, max_per_task=args.max_per_task)

-    # Log to report
+        # Write CSV output
+        if ddp_rank == 0:
+            base_dir = get_base_dir()
+            output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
+            os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
+            with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
+                f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
+                for label in core_results["results"]:
+                    acc = core_results["results"][label]
+                    centered = core_results["centered_results"][label]
+                    f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n")
+                f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n")
+            print0(f"\nResults written to: {output_csv_path}")
+            print0(f"CORE metric: {core_results['core_metric']:.4f}")
+
+    # --- BPB evaluation ---
+    if 'bpb' in eval_modes:
+        print0("\n" + "="*80)
+        print0("BPB Evaluation")
+        print0("="*80)
+        tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size
+        if args.split_tokens % tokens_per_step != 0:
+            # Round down to a multiple of tokens_per_step (becomes 0 steps if split_tokens < tokens_per_step)
+            args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step
+            print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})")
+        steps = args.split_tokens // tokens_per_step
+
+        for split_name in ["train", "val"]:
+            loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
+            with autocast_ctx:
+                bpb = evaluate_bpb(model, loader, steps, token_bytes)
+            bpb_results[split_name] = bpb
+            print0(f"{split_name} bpb: {bpb:.6f}")
+
+    # --- Sampling ---
+    if 'sample' in eval_modes and not is_hf_model:
+        print0("\n" + "="*80)
+        print0("Model Samples")
+        print0("="*80)
+        if ddp_rank == 0:
+            prompts = [
+                "The capital of France is",
+                "The chemical symbol of gold is",
+                "If yesterday was Friday, then tomorrow will be",
+                "The opposite of hot is",
+                "The planets of the solar system are:",
+                "My favorite color is",
+                "If 5*x + 3 = 13, then x is",
+            ]
+            engine = Engine(model, tokenizer)
+            print0("\nConditioned samples:")
+            for prompt in prompts:
+                tokens = tokenizer(prompt, prepend="<|bos|>")
+                with autocast_ctx:
+                    sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
+                sample_str = tokenizer.decode(sample[0])
+                print0("-" * 80)
+                print0(sample_str)
+                samples.append(sample_str)
+
+            print0("\nUnconditioned samples:")
+            tokens = tokenizer("", prepend="<|bos|>")
+            with autocast_ctx:
+                uncond, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0)
+            for sample in uncond:
+                sample_str = tokenizer.decode(sample)
+                print0("-" * 80)
+                print0(sample_str)
+                unconditioned_samples.append(sample_str)
+    elif 'sample' in eval_modes and is_hf_model:
+        print0("\nSkipping sampling for HuggingFace models (not supported)")
+
+    # --- Log to report ---
    from nanochat.report import get_report
-    get_report().log(section="Base model evaluation", data=[
-        {
-            "Model": model_name,
-            "CORE metric": core_metric,
-        },
-        centered_results, # the full table
-    ])
+    report_data = [{"model": model_name}]
+
+    if core_results:
+        report_data[0]["CORE metric"] = core_results["core_metric"]
+        report_data.append(core_results["centered_results"])
+
+    if bpb_results:
+        report_data[0]["train bpb"] = bpb_results.get("train")
+        report_data[0]["val bpb"] = bpb_results.get("val")
+
+    if samples:
+        report_data.append({f"sample {i}": s for i, s in enumerate(samples)})
+    if unconditioned_samples:
+        report_data.append({f"unconditioned {i}": s for i, s in enumerate(unconditioned_samples)})
+
+    get_report().log(section="Base model evaluation", data=report_data)

    compute_cleanup()

+
 if __name__ == "__main__":
    main()
--- a/scripts/base_loss.py
+++ b/scripts/base_loss.py
@ -1,155 +0,0 @@
-"""
-Loads a checkpoint, and:
- Evaluates the loss on a larger chunk of train/val splits
- Samples from the model
-
-Example run as:
-torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
-
-To evaluate a HuggingFace model:
-python -m scripts.base_loss --hf-path openai-community/gpt2
-"""
-import argparse
-from contextlib import nullcontext
-import torch
-from nanochat.checkpoint_manager import load_model
-from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type
-from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit
-from nanochat.tokenizer import get_token_bytes, HuggingFaceTokenizer
-from nanochat.loss_eval import evaluate_bpb
-from nanochat.engine import Engine
-
-# -----------------------------------------------------------------------------
-# HuggingFace loading utilities, making the APIs match up to those of nanochat
-
-class ModelWrapper:
-    """Lightweight wrapper for a HuggingFace model"""
-    def __init__(self, model, max_seq_len=None):
-        self.model = model
-        self.max_seq_len = max_seq_len
-
-    def __call__(self, input_ids, targets=None, loss_reduction='mean'):
-        logits = self.model(input_ids).logits
-        if targets is None:
-            return logits
-        else:
-            loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction)
-            return loss
-
-    def get_device(self):
-        return next(self.model.parameters()).device
-
-def load_hf_model(hf_path: str, device):
-    print0(f"Loading model from: {hf_path}")
-    from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(hf_path)
-    model.to(device)
-    model.eval()
-    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
-    model = ModelWrapper(model, max_seq_len=max_seq_len)
-    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
-    return model, tokenizer
-
-def get_hf_token_bytes(tokenizer, device="cpu"):
-    """Compute token_bytes tensor for a HuggingFace tokenizer."""
-    vocab_size = tokenizer.tokenizer.get_vocab_size()
-    token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device)
-    for token_id in range(vocab_size):
-        token_str = tokenizer.tokenizer.decode([token_id])
-        token_bytes[token_id] = len(token_str.encode('utf-8')) # Count UTF-8 bytes
-    return token_bytes
-
-# CLI arguments
-parser = argparse.ArgumentParser(description="Evaluate loss on train/val splits and sample from model")
-parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
-parser.add_argument("--split-tokens", type=int, default=40*524288, help="number of tokens to evaluate per split")
-parser.add_argument("--model-tag", type=str, default=None, help="model tag for checkpoint directory")
-parser.add_argument("--model-step", type=int, default=None, help="model step to load")
-parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)")
-parser.add_argument("--hf-path", type=str, default=None, help="HuggingFace model path (e.g. openai-community/gpt2)")
-args = parser.parse_args()
-
-# Load the base model and the tokenizer
-device_type = autodetect_device_type() if args.device_type == "" else args.device_type
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-print0(f"Device: {device} | DDP rank: {ddp_rank} | DDP local rank: {ddp_local_rank} | DDP world size: {ddp_world_size}")
-
-if args.hf_path is not None:
-    # Load HuggingFace model
-    model, tokenizer = load_hf_model(args.hf_path, device)
-    sequence_len = model.max_seq_len if model.max_seq_len else 1024
-    token_bytes = get_hf_token_bytes(tokenizer, device=device)
-    model_name = args.hf_path
-else:
-    # Load local nanochat model
-    model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step)
-    sequence_len = meta["model_config"]["sequence_len"]
-    token_bytes = get_token_bytes(device=device)
-    model_name = f"base_model (step {meta['step']})"
-
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
-
-print0(f"Evaluating model: {model_name}")
-
-# Evaluate the loss on each split
-tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size
-assert args.split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step"
-steps = args.split_tokens // tokens_per_step
-bpb_results = {}
-for split_name in ["train", "val"]:
-    loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device)
-    with autocast_ctx:
-        bpb = evaluate_bpb(model, loader, steps, token_bytes)
-    print0(f"{split_name} bpb: {bpb:.4f}")
-    bpb_results[split_name] = bpb
-    print0(f"Model: {model_name}, {split_name} bpb: {bpb:.6f}")
-
-# Master process also samples from the model for some basic knowledge-eliciting prompts (only for nanochat models)
-samples = []
-if ddp_rank == 0 and args.hf_path is None:
-    prompts = [
-        "The capital of France is",
-        "The chemical symbol of gold is",
-        "If yesterday was Friday, then tomorrow will be",
-        "The opposite of hot is",
-        "The planets of the solar system are:",
-        "My favorite color is",
-        "If 5*x + 3 = 13, then x is",
-    ]
-    engine = Engine(model, tokenizer)
-    for prompt in prompts:
-        tokens = tokenizer(prompt, prepend="<|bos|>")
-        with autocast_ctx:
-            sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
-        sample_str = tokenizer.decode(sample[0])
-        print0("-" * 80)
-        print0(sample_str)
-        samples.append(sample_str)
-
-# Draw some unconditioned samples from the model (only for nanochat models)
-unconditioned_samples = []
-if ddp_rank == 0 and args.hf_path is None:
-    engine = Engine(model, tokenizer)
-    tokens = tokenizer("", prepend="<|bos|>")
-    with autocast_ctx:
-        samples, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0)
-    for sample in samples:
-        sample_str = tokenizer.decode(sample)
-        print0("-" * 80)
-        print0(sample_str)
-        unconditioned_samples.append(sample_str)
-
-# Log to report
-from nanochat.report import get_report
-get_report().log(section="Base model loss", data=[
-    {
-        "model": model_name,
-        "train bpb": bpb_results["train"],
-        "val bpb": bpb_results["val"],
-    },
-    {f"sample {i}": sample for i, sample in enumerate(samples)},
-    {f"unconditioned sample {i}": sample for i, sample in enumerate(unconditioned_samples)},
-])
-
-# Cleanup
-compute_cleanup()