align the upstream design

2026-04-14 12:58:34 +00:00 · 2026-01-06 05:50:48 +00:00 · 2026-01-06 05:50:48 +00:00 · 952ea5137a
commit 952ea5137a
parent 74d94c923f
24 changed files with 1001 additions and 4304 deletions
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@ -8,10 +8,10 @@ import json
 import logging
 import torch

-from nanochat_moe.common import get_base_dir
-from nanochat_moe.gpt import GPT, GPTConfig
-from nanochat_moe.tokenizer import get_tokenizer
-from nanochat_moe.common import setup_default_logging
+from nanochat.common import get_base_dir
+from nanochat.gpt import GPT, GPTConfig
+from nanochat.tokenizer import get_tokenizer
+from nanochat.common import setup_default_logging

 # Set up logging
 setup_default_logging()
@ -79,7 +79,7 @@ def build_model(checkpoint_dir, step, device, phase):
        model = GPT(model_config)
    # Load the model state
    model.to_empty(device=device)
-    model.init_weights() # note: this is dumb, but we need to init the rotary embeddings. TODO: fix model re-init
+    # Weights are already initialized in GPT.__init__ via self.apply(self._init_weights)
    model.load_state_dict(model_data, strict=True, assign=True)
    # Put the model in the right training phase / mode
    if phase == "eval":
--- a/nanochat/common.py
+++ b/nanochat/common.py
@ -58,7 +58,7 @@ def get_base_dir():
    os.makedirs(nanochat_dir, exist_ok=True)
    return nanochat_dir

-def download_file_with_lock(url, filename, postprocess_fn=None, timeout=300):
+def download_file_with_lock(url, filename, postprocess_fn=None):
    """
    Downloads a file from a URL to a local path in the base directory.
    Uses a lock file to prevent concurrent downloads among multiple ranks.
@ -70,19 +70,7 @@ def download_file_with_lock(url, filename, postprocess_fn=None, timeout=300):
    if os.path.exists(file_path):
        return file_path

-    # Check if lock file exists and is stale (older than timeout seconds)
-    if os.path.exists(lock_path):
-        import time
-        lock_age = time.time() - os.path.getmtime(lock_path)
-        if lock_age > timeout:
-            logger.warning(f"Lock file {lock_path} is {lock_age:.0f}s old, removing stale lock")
-            try:
-                os.remove(lock_path)
-            except:
-                pass
-
-    # FileLock timeout is for acquiring lock, not for download
-    with FileLock(lock_path, timeout=60):
+    with FileLock(lock_path):
        # Only a single rank can acquire this lock
        # All other ranks block until it is released

@ -90,21 +78,10 @@ def download_file_with_lock(url, filename, postprocess_fn=None, timeout=300):
        if os.path.exists(file_path):
            return file_path

-        # Download the content as bytes with timeout
+        # Download the content as bytes
        print(f"Downloading {url}...")
-        import socket
-        socket.setdefaulttimeout(timeout)
-        try:
-            with urllib.request.urlopen(url, timeout=timeout) as response:
-                content = response.read() # bytes
-        except (urllib.error.URLError, socket.timeout) as e:
-            logger.error(f"Failed to download {url}: {e}")
-            # Clean up lock file on failure
-            try:
-                os.remove(lock_path)
-            except:
-                pass
-            raise
+        with urllib.request.urlopen(url) as response:
+            content = response.read() # bytes

        # Write to local file
        with open(file_path, 'wb') as f:
--- a/nanochat/dataloader.py
+++ b/nanochat/dataloader.py
@ -1,12 +1,74 @@
 from collections import deque
+import os
+import numpy as np

 import torch
 import pyarrow.parquet as pq
-from tqdm import tqdm

-from nanochat_moe.common import get_dist_info, print0
-from nanochat_moe.dataset import list_parquet_files
-from nanochat_moe.tokenizer import get_tokenizer
+from nanochat.common import get_dist_info
+from nanochat.dataset import list_parquet_files, USE_OPENWEBTEXT
+from nanochat.tokenizer import get_tokenizer
+
+# Support for loading openwebtext from local parquet files
+if USE_OPENWEBTEXT:
+    pass  # No special imports needed, we use pyarrow.parquet directly
+
+def bin_data_loader_with_state(B, T, data_dir, split="train", device="cuda", resume_state_dict=None):
+    """
+    Load data from .bin files (nanoMoE format) and yield training batches.
+    Matches nanoMoE's get_batch function exactly.
+    
+    Args:
+        B: batch size
+        T: sequence length (block_size)
+        data_dir: directory containing train.bin and val.bin files
+        split: "train" or "val"
+        device: device to move tensors to
+        resume_state_dict: optional state dict for resuming training (not used in nanoMoE, kept for compatibility)
+    
+    Yields:
+        inputs, targets, state_dict tuples
+    """
+    assert split in ["train", "val"], "split must be 'train' or 'val'"
+    
+    # Get binary file path
+    bin_path = os.path.join(data_dir, f'{split}.bin')
+    if not os.path.exists(bin_path):
+        raise FileNotFoundError(f"Data file not found: {bin_path}")
+    
+    device_type = 'cuda' if device.type == 'cuda' else 'cpu'
+    
+    while True:  # infinite iteration
+        # We recreate np.memmap every batch to avoid a memory leak, as per
+        # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
+        data = np.memmap(bin_path, dtype=np.uint16, mode='r')
+        
+        # Sample random positions for this batch (matching nanoMoE exactly: len(data) - T)
+        ix = torch.randint(len(data) - T, (B,))
+        
+        # Convert memmap slices directly to tensors (matching nanoMoE train.py exactly)
+        x = torch.stack([torch.from_numpy((data[i:i+T]).astype(np.int64)) for i in ix])
+        y = torch.stack([torch.from_numpy((data[i+1:i+1+T]).astype(np.int64)) for i in ix])
+        
+        # Move to device with optional memory pinning (matching nanoMoE exactly)
+        if device_type == 'cuda':
+            # Try pinning memory, but fall back if it fails
+            try:
+                x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
+            except RuntimeError:
+                # Fall back to regular transfer if pin_memory fails
+                x, y = x.to(device), y.to(device)
+        else:
+            x, y = x.to(device), y.to(device)
+        
+        # Return state_dict for compatibility (nanoMoE doesn't use this, but we keep it)
+        state_dict = {"pos": 0}  # Simple placeholder, not used in nanoMoE
+        yield x, y, state_dict
+
+def bin_data_loader(*args, **kwargs):
+    """Helper function that only emits inputs/targets without state_dict"""
+    for inputs, targets, state_dict in bin_data_loader_with_state(*args, **kwargs):
+        yield inputs, targets

 def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
    """
@ -25,68 +87,119 @@ def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads

    # infinite iterator over document batches (list of text strings)
    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
-    show_progress = ddp_rank == 0  # only show progress on rank 0
-    
-    print0(f"[DataLoader] Initializing dataloader for split={split}, rank={ddp_rank}/{ddp_world_size}")
-    
    def document_batches():
-        from nanochat_moe.dataset import DATA_DIR
-        print0(f"[DataLoader] Listing parquet files from: {DATA_DIR}")
-        parquet_paths = list_parquet_files()
-        parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
-        print0(f"[DataLoader] Found {len(parquet_paths)} parquet files for {split} split")
-        if len(parquet_paths) == 0:
-            print0(f"[DataLoader] WARNING: No parquet files found! Check if data directory exists and contains .parquet files.")
-        
-        resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
-        resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
-        pq_idx = resume_pq_idx # we kick off parquet files at the resume index (or by default just 0)
-        pbar = None
-        epoch = 0
-        while True: # iterate infinitely (multi-epoch)
-            if show_progress and pbar is None:
-                # Use position=0 and leave=True to ensure progress bar displays correctly
-                pbar = tqdm(total=len(parquet_paths), desc=f"Tokenizing {split} data (epoch {epoch})", unit="file", leave=True, position=0, file=None)
-            while pq_idx < len(parquet_paths): # iterate over all parquet files
-                filepath = parquet_paths[pq_idx]
+        if USE_OPENWEBTEXT:
+            # Load openwebtext dataset directly from local parquet files (no download, no API calls)
+            # Note: openwebtext only has 'train' split, so we use it for both train and val
+            # Try multiple possible paths
+            parquet_dir = None
+            for possible_dir in [
+                "/thullms/dpq23/.cache/huggingface/datasets/openwebtext/plain_text",
+                "/thullms/public/openwebtext_new/openwebtext/plain_text"
+            ]:
+                if os.path.exists(possible_dir) and os.path.exists(os.path.join(possible_dir, "train-00000-of-00080.parquet")):
+                    parquet_dir = possible_dir
+                    break
+            
+            if parquet_dir is None:
+                raise RuntimeError("Could not find openwebtext parquet files in expected locations")
+            
+            # Load all parquet files
+            parquet_files = sorted([f for f in os.listdir(parquet_dir) if f.endswith('.parquet')])
+            parquet_paths = [os.path.join(parquet_dir, f) for f in parquet_files]
+            
+            # Calculate total rows to determine train/val split
+            total_rows = 0
+            for filepath in parquet_paths:
                pf = pq.ParquetFile(filepath)
-                if show_progress:
-                    pbar.set_postfix({"file": f"{pq_idx+1}/{len(parquet_paths)}"})
-                # Start from resume point if resuming on same file, otherwise from DDP rank
-                # I know this state resumption is a little bit tricky and a little bit hacky... sigh.
-                if resume_rg_idx is not None:
-                    base_idx = resume_rg_idx // ddp_world_size # in units of ddp_world_size
-                    base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming
-                    rg_idx = base_idx * ddp_world_size + ddp_rank
-                    resume_rg_idx = None # set to None as we only want to do this a single time
-                else:
-                    rg_idx = ddp_rank
-                while rg_idx < pf.num_row_groups:
-                    rg = pf.read_row_group(rg_idx)
-                    batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows
-                    # the tokenizer encode might want to go in even smaller batches, e.g. 128 rows
-                    for i in range(0, len(batch), tokenizer_batch_size):
-                        yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx)
-                    rg_idx += ddp_world_size # advance to the next row group (in DDP)
-                if show_progress:
-                    pbar.update(1)
-                pq_idx += 1 # advance to the next parquet file
-            # Finished one epoch, reset for next epoch
-            if show_progress:
-                pbar.close()
-                pbar = None
-            epoch += 1
-            pq_idx = 0  # reset to start of files for next epoch
+                total_rows += pf.metadata.num_rows
+            
+            # For validation, keep every file from the one containing the 99% row mark onward (whole files, so this can cover more than 1% of rows)
+            if split == "val":
+                val_start_row = int(total_rows * 0.99)
+                # Find which file contains the validation start
+                current_row = 0
+                val_file_start_idx = 0
+                for i, filepath in enumerate(parquet_paths):
+                    pf = pq.ParquetFile(filepath)
+                    if current_row + pf.metadata.num_rows > val_start_row:
+                        val_file_start_idx = i
+                        break
+                    current_row += pf.metadata.num_rows
+                parquet_paths = parquet_paths[val_file_start_idx:]
+            else:
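+                # For training, keep whole files up to and including the one containing the 99% row mark (that file is also the first validation file unless the mark falls exactly on a file boundary)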
+                train_end_row = int(total_rows * 0.99)
+                current_row = 0
+                train_file_end_idx = len(parquet_paths)
+                for i, filepath in enumerate(parquet_paths):
+                    pf = pq.ParquetFile(filepath)
+                    if current_row + pf.metadata.num_rows >= train_end_row:
+                        train_file_end_idx = i + 1
+                        break
+                    current_row += pf.metadata.num_rows
+                parquet_paths = parquet_paths[:train_file_end_idx]
+            
+            # Now iterate through parquet files similar to original code
+            resume_pq_idx = resume_state_dict.get("pq_idx", 0) if resume_state_dict is not None else 0
+            resume_rg_idx = resume_state_dict.get("rg_idx", 0) if resume_state_dict is not None else None
+            pq_idx = resume_pq_idx
+            while True: # iterate infinitely (multi-epoch)
+                while pq_idx < len(parquet_paths): # iterate over all parquet files
+                    filepath = parquet_paths[pq_idx]
+                    pf = pq.ParquetFile(filepath)
+                    # Start from resume point if resuming on same file, otherwise from DDP rank
+                    if resume_rg_idx is not None:
+                        base_idx = resume_rg_idx // ddp_world_size
+                        base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming
+                        rg_idx = base_idx * ddp_world_size + ddp_rank
+                        resume_rg_idx = None # set to None as we only want to do this a single time
+                    else:
+                        rg_idx = ddp_rank
+                    while rg_idx < pf.num_row_groups:
+                        rg = pf.read_row_group(rg_idx)
+                        batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows
+                        # the tokenizer encode might want to go in even smaller batches, e.g. 128 rows
+                        for i in range(0, len(batch), tokenizer_batch_size):
+                            yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx)
+                        rg_idx += ddp_world_size # advance to the next row group (in DDP)
+                    pq_idx += 1 # advance to the next parquet file
+                pq_idx = 0  # Reset for next epoch
+        else:
+            # Original parquet file iteration
+            parquet_paths = list_parquet_files()
+            parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
+            resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
+            resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
+            pq_idx = resume_pq_idx # we kick off parquet files at the resume index (or by default just 0)
+            while True: # iterate infinitely (multi-epoch)
+                while pq_idx < len(parquet_paths): # iterate over all parquet files
+                    filepath = parquet_paths[pq_idx]
+                    pf = pq.ParquetFile(filepath)
+                    # Start from resume point if resuming on same file, otherwise from DDP rank
+                    # I know this state resumption is a little bit tricky and a little bit hacky... sigh.
+                    if resume_rg_idx is not None:
+                        base_idx = resume_rg_idx // ddp_world_size # in units of ddp_world_size
+                        base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming
+                        rg_idx = base_idx * ddp_world_size + ddp_rank
+                        resume_rg_idx = None # set to None as we only want to do this a single time
+                    else:
+                        rg_idx = ddp_rank
+                    while rg_idx < pf.num_row_groups:
+                        rg = pf.read_row_group(rg_idx)
+                        batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows
+                        # the tokenizer encode might want to go in even smaller batches, e.g. 128 rows
+                        for i in range(0, len(batch), tokenizer_batch_size):
+                            yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx)
+                        rg_idx += ddp_world_size # advance to the next row group (in DDP)
+                    pq_idx += 1 # advance to the next parquet file
    batches = document_batches()

    # Now emit batches of tokens.
    needed_tokens = B * T + 1 # +1 is because we also need the target at the last token
    # get the tokenizer and the bos token
-    print0(f"[DataLoader] Loading tokenizer...")
    tokenizer = get_tokenizer()
-    print0(f"[DataLoader] Tokenizer loaded, vocab_size={tokenizer.get_vocab_size()}")
    bos_token = tokenizer.get_bos_token_id()
-    print0(f"[DataLoader] Starting to yield batches (needed_tokens={needed_tokens})...")
    # scratch buffer holds the tokens for one iteration
    token_buffer = deque() # we stream tokens on the right and pop from the left
    while True:
@ -99,7 +212,7 @@ def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads
        # Move tokens from the deque into the scratch buffer
        tokens = [token_buffer.popleft() for _ in range(needed_tokens)]
        # CUDA supports memory pinning for asynchronous transfers between CPU and GPU
-        use_cuda_optimizations = device == "cuda"
+        use_cuda_optimizations = device.type == "cuda"
        scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda_optimizations) # in PyTorch, long=int64
        # Create the inputs/targets as 1D tensors
        inputs_cpu = scratch[:-1]
@ -107,7 +220,11 @@ def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads
        # Reshape to 2D and move to GPU async
        inputs = inputs_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
        targets = targets_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
-        state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx} # we need this in case we wish to approximately resume training
+        # For openwebtext, only rg_idx is recorded and pq_idx is pinned to 0, so a resume restarts at the first parquet file; otherwise both pq_idx and rg_idx are recorded
+        if USE_OPENWEBTEXT:
+            state_dict = {"pq_idx": 0, "rg_idx": rg_idx}  # Use rg_idx to track position in dataset
+        else:
+            state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx} # we need this in case we wish to approximately resume training
        yield inputs, targets, state_dict

 def tokenizing_distributed_data_loader(*args, **kwargs):
--- a/nanochat/dataset.py
+++ b/nanochat/dataset.py
@ -14,7 +14,10 @@ import requests
 import pyarrow.parquet as pq
 from multiprocessing import Pool

-from nanochat_moe.common import get_base_dir
+from nanochat.common import get_base_dir
+
+# Support for loading openwebtext from local parquet files
+USE_OPENWEBTEXT = True

 # -----------------------------------------------------------------------------
 # The specifics of the current pretraining dataset
@ -23,12 +26,8 @@ from nanochat_moe.common import get_base_dir
 BASE_URL = "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main"
 MAX_SHARD = 1822 # the last datashard is shard_01822.parquet
 index_to_filename = lambda index: f"shard_{index:05d}.parquet" # format of the filenames
-# Support custom data directory via NANOCHAT_DATA_DIR environment variable
-if os.environ.get("NANOCHAT_DATA_DIR"):
-    DATA_DIR = os.environ.get("NANOCHAT_DATA_DIR")
-else:
-    base_dir = get_base_dir()
-    DATA_DIR = os.path.join(base_dir, "base_data")
+base_dir = get_base_dir()
+DATA_DIR = os.path.join(base_dir, "base_data")
 os.makedirs(DATA_DIR, exist_ok=True)

 # -----------------------------------------------------------------------------
@ -51,6 +50,70 @@ def parquets_iter_batched(split, start=0, step=1):
    - start/step are useful for skipping rows in DDP. e.g. start=rank, step=world_size
    """
    assert split in ["train", "val"], "split must be 'train' or 'val'"
+    
+    # If using openwebtext, load from local parquet files directly (no download, no API calls)
+    if USE_OPENWEBTEXT:
+        # Load openwebtext dataset directly from local parquet files (hardcoded path)
+        # Note: openwebtext only has 'train' split, so we use it for both train and val
+        # Try multiple possible paths
+        parquet_dir = None
+        for possible_dir in [
+            "/thullms/dpq23/.cache/huggingface/datasets/openwebtext/plain_text",
+            "/thullms/public/openwebtext_new/openwebtext/plain_text"
+        ]:
+            if os.path.exists(possible_dir) and os.path.exists(os.path.join(possible_dir, "train-00000-of-00080.parquet")):
+                parquet_dir = possible_dir
+                break
+        
+        if parquet_dir is None:
+            raise RuntimeError("Could not find openwebtext parquet files in expected locations")
+        
+        # Load all parquet files directly
+        parquet_files = sorted([f for f in os.listdir(parquet_dir) if f.endswith('.parquet')])
+        parquet_paths = [os.path.join(parquet_dir, f) for f in parquet_files]
+        
+        # Calculate total rows to determine train/val split
+        total_rows = 0
+        for filepath in parquet_paths:
+            pf = pq.ParquetFile(filepath)
+            total_rows += pf.metadata.num_rows
+        
+        # For validation, keep every file from the one containing the 99% row mark onward (whole files, so this can cover more than 1% of rows)
+        if split == "val":
+            val_start_row = int(total_rows * 0.99)
+            # Find which file contains the validation start
+            current_row = 0
+            val_file_start_idx = 0
+            for i, filepath in enumerate(parquet_paths):
+                pf = pq.ParquetFile(filepath)
+                if current_row + pf.metadata.num_rows > val_start_row:
+                    val_file_start_idx = i
+                    break
+                current_row += pf.metadata.num_rows
+            parquet_paths = parquet_paths[val_file_start_idx:]
+        else:
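+            # For training, keep whole files up to and including the one containing the 99% row mark (that file is also the first validation file unless the mark falls exactly on a file boundary)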
+            train_end_row = int(total_rows * 0.99)
+            current_row = 0
+            train_file_end_idx = len(parquet_paths)
+            for i, filepath in enumerate(parquet_paths):
+                pf = pq.ParquetFile(filepath)
+                if current_row + pf.metadata.num_rows >= train_end_row:
+                    train_file_end_idx = i + 1
+                    break
+                current_row += pf.metadata.num_rows
+            parquet_paths = parquet_paths[:train_file_end_idx]
+        
+        # Now iterate through parquet files similar to original code
+        for filepath in parquet_paths:
+            pf = pq.ParquetFile(filepath)
+            for rg_idx in range(start, pf.num_row_groups, step):
+                rg = pf.read_row_group(rg_idx)
+                texts = rg.column('text').to_pylist()
+                yield texts
+        return
+    
+    # Original parquet file iteration
    parquet_paths = list_parquet_files()
    parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
    for filepath in parquet_paths:
@ -113,12 +176,62 @@ def download_single_file(index):
    return False


+def download_openwebtext():
+    """Check if openwebtext dataset exists locally (no download, no API calls)."""
+    if not USE_OPENWEBTEXT:
+        print("USE_OPENWEBTEXT is False. Skipping openwebtext check.")
+        return False
+    
+    print("Checking for openwebtext dataset in local directories...")
+    
+    # Try multiple possible paths
+    parquet_dir = None
+    for possible_dir in [
+        "/thullms/dpq23/.cache/huggingface/datasets/openwebtext/plain_text",
+        "/thullms/public/openwebtext_new/openwebtext/plain_text"
+    ]:
+        if os.path.exists(possible_dir) and os.path.exists(os.path.join(possible_dir, "train-00000-of-00080.parquet")):
+            parquet_dir = possible_dir
+            break
+    
+    if parquet_dir is None:
+        print("ERROR: Could not find openwebtext parquet files in expected locations:")
+        print("  - /thullms/dpq23/.cache/huggingface/datasets/openwebtext/plain_text")
+        print("  - /thullms/public/openwebtext_new/openwebtext/plain_text")
+        return False
+    
+    # Count total rows from parquet files
+    parquet_files = sorted([f for f in os.listdir(parquet_dir) if f.endswith('.parquet')])
+    total_rows = 0
+    for filename in parquet_files:
+        filepath = os.path.join(parquet_dir, filename)
+        pf = pq.ParquetFile(filepath)
+        total_rows += pf.metadata.num_rows
+    
+    print(f"Found openwebtext dataset at: {parquet_dir}")
+    print(f"Total parquet files: {len(parquet_files)}")
+    print(f"Total rows: {total_rows:,}")
+    print("OpenWebText dataset is ready to use (loaded directly from local parquet files).")
+    return True
+
+
 if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download FineWeb-Edu 100BT dataset shards")
    parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of shards to download (default: -1), -1 = disable")
    parser.add_argument("-w", "--num-workers", type=int, default=4, help="Number of parallel download workers (default: 4)")
    args = parser.parse_args()

+    # If using openwebtext, only verify that the local parquet files exist (nothing is downloaded), then exit
+    if USE_OPENWEBTEXT:
+        print("Using openwebtext dataset from HuggingFace datasets.")
+        success = download_openwebtext()
+        if success:
+            print("Done! OpenWebText dataset is ready to use.")
+        else:
+            print("Failed to download openwebtext dataset.")
+            exit(1)
+        exit(0)
+
    num = MAX_SHARD + 1 if args.num_files == -1 else min(args.num_files, MAX_SHARD + 1)
    ids_to_download = list(range(num))
    print(f"Downloading {len(ids_to_download)} shards using {args.num_workers} workers...")
--- a/nanochat/engine.py
+++ b/nanochat/engine.py
@ -17,8 +17,8 @@ import signal
 import warnings
 from contextlib import contextmanager
 from collections import deque
-from nanochat_moe.common import compute_init, autodetect_device_type
-from nanochat_moe.checkpoint_manager import load_model
+from nanochat.common import compute_init, autodetect_device_type
+from nanochat.checkpoint_manager import load_model
 from contextlib import nullcontext 

 # -----------------------------------------------------------------------------
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
--- a/nanochat/loss_eval.py
+++ b/nanochat/loss_eval.py
@ -6,7 +6,7 @@ import torch
 import torch.distributed as dist

@torch.no_grad()
-def evaluate_bpb(model, batches, steps, token_bytes):
+def evaluate_bpb(model, batches, steps, token_bytes=None):
    """
    Instead of the naive 'mean loss', this function returns the bits per byte (bpb),
    which is a tokenization vocab size-independent metric, meaning you are still comparing
@ -23,43 +23,80 @@ def evaluate_bpb(model, batches, steps, token_bytes):
    In addition to evaluate_loss, we need the token_bytes tensor:
    It is a 1D tensor of shape (vocab_size,), indicating the number of bytes for
    each token id, or 0 if the token is to not be counted (e.g. special tokens).
+    
+    If token_bytes is None (e.g., when using .bin data with tiktoken), we fall back to
+    simple mean loss calculation (which is equivalent to bits per token, not bits per byte).
    """
+    # Get device from model
+    device = next(model.parameters()).device
+    
    # record the losses
-    total_nats = torch.tensor(0.0, dtype=torch.float32, device=model.get_device())
-    total_bytes = torch.tensor(0, dtype=torch.int64, device=model.get_device())
+    total_nats = torch.tensor(0.0, dtype=torch.float32, device=device)
+    total_bytes = torch.tensor(0, dtype=torch.int64, device=device)
+    total_tokens = torch.tensor(0, dtype=torch.int64, device=device)
    batch_iter = iter(batches)
+    
    for _ in range(steps):
        x, y = next(batch_iter)
-        loss2d = model(x, y, loss_reduction='none') # (B, T)
+        # Model returns (logits, loss) tuple
+        logits, loss = model(x, y)
+        # Calculate per-token loss from logits for bpb calculation
+        loss2d = torch.nn.functional.cross_entropy(
+            logits.view(-1, logits.size(-1)), 
+            y.view(-1), 
+            ignore_index=-1, 
+            reduction='none'
+        ).view(y.shape)
        loss2d = loss2d.view(-1) # flatten
        y = y.view(-1) # flatten
-        if (y.int() < 0).any(): # mps does not currently have kernel for < 0 for int64, only int32
-            # slightly more complex code path if some target tokens are ignore_index (e.g. -1)
-            # any target token < 0 is to be ignored: do NOT index token_bytes with negatives
+        
+        if token_bytes is None:
+            # Fallback: simple token counting (for .bin data with tiktoken)
+            # Count valid tokens (not ignored)
            valid = y >= 0
-            y_safe = torch.where(valid, y, torch.zeros_like(y))
-            # map valid targets to their byte length; ignored targets contribute 0 bytes
-            num_bytes2d = torch.where(
-                valid,
-                token_bytes[y_safe],
-                torch.zeros_like(y, dtype=token_bytes.dtype)
-            )
-            total_nats += (loss2d * (num_bytes2d > 0)).sum()
-            total_bytes += num_bytes2d.sum()
+            total_nats += (loss2d * valid.float()).sum()
+            total_tokens += valid.sum()
        else:
-            # fast path: no ignored targets, safe to index directly
-            num_bytes2d = token_bytes[y]
-            total_nats += (loss2d * (num_bytes2d > 0)).sum()
-            total_bytes += num_bytes2d.sum()
+            # Original bpb calculation with token_bytes
+            if (y.int() < 0).any(): # mps does not currently have kernel for < 0 for int64, only int32
+                # slightly more complex code path if some target tokens are ignore_index (e.g. -1)
+                # any target token < 0 is to be ignored: do NOT index token_bytes with negatives
+                valid = y >= 0
+                y_safe = torch.where(valid, y, torch.zeros_like(y))
+                # map valid targets to their byte length; ignored targets contribute 0 bytes
+                num_bytes2d = torch.where(
+                    valid,
+                    token_bytes[y_safe],
+                    torch.zeros_like(y, dtype=token_bytes.dtype)
+                )
+                total_nats += (loss2d * (num_bytes2d > 0)).sum()
+                total_bytes += num_bytes2d.sum()
+            else:
+                # fast path: no ignored targets, safe to index directly
+                num_bytes2d = token_bytes[y]
+                total_nats += (loss2d * (num_bytes2d > 0)).sum()
+                total_bytes += num_bytes2d.sum()
+    
    # sum reduce across all ranks
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    if world_size > 1:
        dist.all_reduce(total_nats, op=dist.ReduceOp.SUM)
        dist.all_reduce(total_bytes, op=dist.ReduceOp.SUM)
+        dist.all_reduce(total_tokens, op=dist.ReduceOp.SUM)
+    
    # move both to cpu, calculate bpb and return
    total_nats = total_nats.item()
    total_bytes = total_bytes.item()
-    if total_bytes == 0:
-        return float('inf')
-    bpb = total_nats / (math.log(2) * total_bytes)
-    return bpb
+    total_tokens = total_tokens.item()
+    
+    if token_bytes is None:
+        # Return mean loss (bits per token) when token_bytes not available
+        if total_tokens == 0:
+            return float('inf')
+        return total_nats / (math.log(2) * total_tokens)
+    else:
+        # Return bits per byte
+        if total_bytes == 0:
+            return float('inf')
+        bpb = total_nats / (math.log(2) * total_bytes)
+        return bpb
--- a/nanochat/manager.py
+++ b/nanochat/manager.py
@ -1,11 +1,6 @@
-"""
-MoE Manager for tracking auxiliary losses across multiple MoE layers
-"""
-import torch
-
 class MOEManager:
    """
-    Basic wrapper class for tracking, storing, and aggregating auxiliary
+    basic wrapper class for tracking, storing, and aggregating auxiliary
    losses across multiple MoE layers in the model
    """

@ -26,10 +21,10 @@ class MOEManager:
        self.router_z_loss.append(loss)
    
    def aggregate_aux_loss(self):
-        return sum(self.aux_loss) if self.aux_loss else torch.tensor(0.0)
-    
+        return sum(self.aux_loss)
+
    def aggregate_router_z_loss(self):
-        return sum(self.router_z_loss) if self.router_z_loss else torch.tensor(0.0)
+        return sum(self.router_z_loss)

 MANAGER = MOEManager()

--- a/nanochat/report.py
+++ b/nanochat/report.py
@ -389,7 +389,7 @@ class DummyReport:

 def get_report():
    # just for convenience, only rank 0 logs to report
-    from nanochat_moe.common import get_base_dir, get_dist_info
+    from nanochat.common import get_base_dir, get_dist_info
    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
    if ddp_rank == 0:
        report_dir = os.path.join(get_base_dir(), "report")
--- a/nanochat/standard.py
+++ b/nanochat/standard.py
@ -1,661 +0,0 @@
-"""
-Full definition of a GPT Language Model, all of it in this single file.
-References:
-1) the official GPT-2 TensorFlow implementation released by OpenAI:
-https://github.com/openai/gpt-2/blob/master/src/model.py
-2) huggingface/transformers PyTorch implementation:
-https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
-"""
-
-import math
-import inspect
-from dataclasses import dataclass
-from contextlib import nullcontext
-
-import torch
-import torch.nn as nn
-from torch.nn import functional as F
-
-from manager import MANAGER
-
-class LayerNorm(nn.Module):
-    """ LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
-
-    def __init__(self, ndim, bias):
-        super().__init__()
-        self.weight = nn.Parameter(torch.ones(ndim))
-        self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
-
-    def forward(self, input):
-        return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
-
-class CausalSelfAttention(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        assert config.n_embd % config.n_head == 0
-        # key, query, value projections for all heads, but in a batch
-        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
-        # output projection
-        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
-        # regularization
-        self.attn_dropout = nn.Dropout(config.dropout)
-        self.resid_dropout = nn.Dropout(config.dropout)
-        self.n_head = config.n_head
-        self.n_embd = config.n_embd
-        self.dropout = config.dropout
-        # flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
-        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
-        if not self.flash:
-            print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
-            # causal mask to ensure that attention is only applied to the left in the input sequence
-            self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
-                                        .view(1, 1, config.block_size, config.block_size))
-
-    def forward(self, x):
-        B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
-
-        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
-        q, k, v  = self.c_attn(x).split(self.n_embd, dim=2)
-        k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-        v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
-
-        # causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
-        if self.flash:
-            # efficient attention using Flash Attention CUDA kernels
-            y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
-        else:
-            # manual implementation of attention
-            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
-            att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
-            att = F.softmax(att, dim=-1)
-            att = self.attn_dropout(att)
-            y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
-        y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
-
-        # output projection
-        y = self.resid_dropout(self.c_proj(y))
-        return y
-
-class Router(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-
-        # router settings
-        self.top_k = config.top_k
-        self.n_exp = config.n_exp
-        assert self.top_k >= 1 and self.top_k <= config.n_exp
-        self.use_noisy_top_k = config.use_noisy_top_k
-        self.train_capacity = config.train_capacity
-        self.eval_capacity = config.eval_capacity
-        self.min_capacity = config.min_capacity
-        self.router_use_full_prec = config.router_use_full_prec
-
-        # auxiliary / load balancing loss settings
-        self.use_aux_loss = config.use_aux_loss
-        self.use_router_z_loss = config.use_router_z_loss
-
-        # linear projection for (noisy) softmax gating
-        # no bias is used, see page 4 eq (4) in (https://arxiv.org/abs/1701.06538)
-        self.w_g = nn.Linear(config.n_embd, config.n_exp, bias=False)
-        self.w_noise = nn.Linear(config.n_embd, config.n_exp, bias=False) if self.use_noisy_top_k else None
-    
-    def forward(self, x):
-        # optionally run the router in full precision to avoid instability during training
-        # see discussion on pg. 9 here: https://arxiv.org/abs/2101.03961
-        # setting enabled to False in autocast automatically puts everything in float32
-        device_type = 'cuda' if torch.cuda.is_available() else 'cpu' # for later use in torch.autocast
-        ctx = nullcontext() if not self.router_use_full_prec else torch.amp.autocast(device_type=device_type, enabled=False)
-
-        with ctx:
-            B, T, _ = x.size()
-            num_tokens = B * T
-
-            # eq (4) in (https://arxiv.org/abs/1701.06538)
-            logits = self.w_g(x)  # [B, T, n_exp]
-            if self.use_noisy_top_k:
-                # optionally add noise into the router
-                noise = F.softplus(self.w_noise(x))
-                noise *= torch.randn_like(noise)
-                logits += noise
-
-            # router z loss, computed on logits (before softmax)
-            # this loss prevents router logits from becoming too large
-            if self.use_router_z_loss:
-                z_loss = self.compute_router_z_loss(logits)
-                MANAGER.add_router_z_loss(z_loss)
-
-            # find top k experts for each token
-            top_k_logits, top_k_indices = logits.topk(self.top_k, dim=-1) # [B, T, k]
-
-            # normalize expert probabilities
-            # Question: should we normalize over all experts or just top-k?
-            # we choose to normalize over top-k, other option is commented out below
-
-            # Shazeer et al (https://arxiv.org/abs/1701.06538) does only topk
-            # see page 4 eq (3)-(5), the code for this is commented out below
-            router_probs = torch.full_like(logits, float('-inf'))  # [B, T, n_exp]
-            router_probs.scatter_(-1, top_k_indices, top_k_logits)
-            router_probs = F.softmax(router_probs, dim=-1)
-
-            # # normalize all router logits (not just top-k) via softmax      
-            # router_probs = F.softmax(logits, dim=-1)
-
-            # compute auxiliary load balancing loss
-            # this loss encourages equal probability assigned to each expert
-            # and equal load balancing of tokens assigned to each expert
-            if self.use_aux_loss:
-                aux_loss = self.compute_aux_loss(router_probs, top_k_indices)
-                MANAGER.add_aux_loss(aux_loss)
-
-            # compute expert capacity
-            exp_capacity = self.get_capacity(num_tokens)
-
-            # make a multi-hot mask of chosen experts, size [B, T, n_exp]
-            # entries are 0 if expert not chosen and 1 if expert chosen
-            exp_mask = F.one_hot(top_k_indices, num_classes=self.n_exp)  # [B, T, k, n_exp]
-            exp_mask = exp_mask.view(num_tokens, self.top_k, self.n_exp)  # [B * T, k, n_exp]
-            exp_mask = exp_mask.permute(1, 0, 2) # [k, B * T, n_exp]
-
-            # compute cumulative sum of each token over experts, this stores
-            # the index of each token within the batch of each expert
-            # NOTE: cumsum should count all top-1 first, top-2 second, etc.
-            # so that we prioritize top experts when dropping tokens (this is
-            # done by putting k dimension first for the reshape operation)
-            exp_rank = exp_mask.reshape(self.top_k * num_tokens, self.n_exp)  # [k * B * T, n_exp]
-            exp_rank = torch.cumsum(exp_rank, dim=0) - 1  # cumulative sum of expert selections [k * B * T, n_exp]
-            exp_rank = exp_rank.reshape(self.top_k, num_tokens, self.n_exp)  # [k, B * T, n_exp]
-
-            # mask out (set to zero) entries that go beyond expert capacity
-            # compute amount of used capacity by taking a sum over mask
-            exp_mask *= torch.lt(exp_rank, exp_capacity) # [k, B * T, n_exp]
-            used_capacity = torch.sum(exp_mask, dim=(0, 1)) # [n_exp]
-
-            # mask rank to only include tokens that are selected
-            # perform a sum so each row only contains index of token
-            # for the expert that is selected in that row
-            # result is a matrix that contains the position of each token
-            # in the batch of its corresponding expert
-            exp_rank = torch.sum(exp_mask * exp_rank, dim=-1)  # [k, B * T]
-
-            # mask probabilities to only include selected experts
-            router_probs = router_probs.view(num_tokens, self.n_exp)[None, :] # [1, B * T, n_exp]
-            exp_weights = exp_mask * router_probs # [k, B * T, n_exp]
-
-            # convert rank into one-hot vectors over the available capacity
-            # stores the position of each token within the capacity of the selected expert
-            exp_rank_sc = F.one_hot(exp_rank, num_classes=exp_capacity) # [k, B * T, exp_capacity]
-
-            # create a vector that stores, for each token, the weight of selected
-            # experts at token's position in the capacity of that expert
-            # size of tensor is [B * T, n_exp, exp_capacity]
-            cb_weight = torch.sum(exp_weights.unsqueeze(3) * exp_rank_sc.unsqueeze(2), dim=0)
-            sec_mask = cb_weight.bool() # binary mask of selected experts for each token
-            return used_capacity, cb_weight, sec_mask
-    
-    def compute_aux_loss(self, expert_probs: torch.Tensor, indices: torch.Tensor):
-        """
-        Computes Switch Transformer auxiliary loss (https://arxiv.org/abs/2101.03961)
-        See equations (4)-(6) on page 7
-        """
-
-        # equation (5): compute ratio of tokens allocated to each expert
-        # total number of tokens is defined as total tokens in batch * k
-        # (k = 1) for the Switch Transformer
-        with torch.no_grad():
-            one_hot_indices = F.one_hot(indices, num_classes=self.n_exp)  # [B, T, k, n_exp]
-            one_hot_indices = torch.sum(one_hot_indices.float(), dim=2)  # [B, T, n_exp] (sum over k dimension)
-            tokens_per_expert = torch.mean(one_hot_indices.float(), dim=(0, 1))
-
-        # equation (6): compute ratio of router probability allocated to each expert
-        prob_per_expert = torch.mean(expert_probs.float(), dim=(0, 1))
-
-        # equation (4): take a scaled dot product between prob/token allocation vectors
-        # multiply the result by the number of experts
-        return self.n_exp * torch.sum(prob_per_expert * tokens_per_expert)
-    
-    def compute_router_z_loss(self, logits: torch.Tensor):
-        """
-        Computes ST-MoE router z loss (https://arxiv.org/abs/2202.08906)
-        See equation (5) on page 7
-        """
-    
-        # exponentiate logits, sum logits of each expert, take log, and square
-        # code below is the same as:
-        # > z_loss = torch.exp(logits)
-        # > z_loss = torch.sum(z_loss, dim=-1)
-        # > z_loss = torch.log(z_loss) ** 2.0
-        z_loss = torch.logsumexp(logits, dim=-1) ** 2.0  # [B, T, n_exp]
-
-        # sum over all tokens and divide by total number of tokens
-        return torch.mean(z_loss)
-
-    def get_capacity(self, tokens_per_batch):
-        # expert capacity is given by (tokens_per_batch / num_experts) * capacity_factor
-        # see eq (3) in Switch Transformer (https://arxiv.org/abs/2101.03961)
-        capacity_factor = self.train_capacity if self.training else self.eval_capacity
-        capacity = math.floor(self.top_k * capacity_factor * tokens_per_batch / self.n_exp)
-        capacity += capacity % 2 # make sure capacity is an even number
-        capacity = max(capacity, self.min_capacity) # use min capacity
-        assert capacity > 0
-        return int(capacity)
-
-class MLP(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.c_fc    = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
-        self.gelu    = nn.GELU()
-        self.c_proj  = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
-        self.dropout = nn.Dropout(config.dropout)
-
-    def forward(self, x):
-        x = self.c_fc(x)
-        x = self.gelu(x)
-        x = self.c_proj(x)
-        x = self.dropout(x)
-        return x
-
-class MLPExperts(nn.Module):
-    """
-    implementation of multiple MLP-based experts that can process input
-    in batch -- based upon ColossalAI OpenMoE but simple, has optional bias, and
-    uses a bmm instead of a loop over a mm for each expert to improve efficiency
-    link: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/moe/experts.py
-    """
-    def __init__(self, config):
-        # TODO: add param init
-        super().__init__()
-        self.bias = config.bias
-
-        self.c_fc = nn.Parameter(torch.empty(config.n_exp, config.n_embd, 4 * config.n_embd))
-        self.c_proj = nn.Parameter(torch.empty(config.n_exp, 4 * config.n_embd, config.n_embd))
-        self.fc_bias = nn.Parameter(torch.empty(config.n_exp, 1, 4 * config.n_embd)) if self.bias else None
-        self.proj_bias = nn.Parameter(torch.empty(config.n_exp, 1, config.n_embd)) if self.bias else None
-        self.gelu = nn.GELU()
-        self.dropout = nn.Dropout(config.dropout)
-    
-
-    def forward(self, x):
-        x = torch.bmm(x, self.c_fc)
-        if self.bias:
-            x += self.fc_bias
-        x = self.gelu(x)
-        x = torch.bmm(x, self.c_proj)
-        if self.bias:
-            x += self.proj_bias
-        x = self.dropout(x)
-        return x
-
-class MOELayer(nn.Module):
-    def __init__(self, config):
-        super().__init__()
-        self.router = Router(config) # (noisy) top k router
-        self.experts = MLPExperts(config) # group of MLPs (experts)
-
-    def forward(self, x: torch.Tensor):
-        B, T, n_embd = x.size() # track original shape of input
-        num_tokens = (B * T)
-
-        # pass each token through the router
-        used_capacity, exp_weight, exp_mask = self.router(x)
-
-        # flatten out the input
-        x = x.view(num_tokens, n_embd)
-
-        # reshape tokens into batches for each expert
-        # [n_exp, exp_capacity, B * T] * [B * T, n_embd] -> [n_exp, exp_capacity, n_embd]
-        exp_batches = exp_mask.permute(1, 2, 0).type_as(x) @ x
-
-        # compute expert output
-        exp_out = self.experts(exp_batches) # [n_exp, exp_capacity, n_embd]
-
-        # aggregate expert outputs based on router weights
-        # eq (2) on page 4 of ST-MoE (https://arxiv.org/abs/2202.08906)
-        # similar equations are used for other MoE papers
-        exp_weight = exp_weight.view(num_tokens, -1) # [B * T, n_exp * exp_capacity]
-        exp_out = exp_out.view(-1, n_embd) # [n_exp * exp_capacity, n_embd] 
-        output = exp_weight @ exp_out # [B * T, n_embd]
-        
-        # resize output before return
-        return output.view(B, T, n_embd)
-
-class Block(nn.Module):
-
-    def __init__(self, config, use_moe=False):
-        super().__init__()
-        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
-        self.attn = CausalSelfAttention(config)
-        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
-        if use_moe:
-            self.mlp = MOELayer(config)
-        else:
-            self.mlp = MLP(config)
-
-    def forward(self, x):
-        x = x + self.attn(self.ln_1(x))
-        x = x + self.mlp(self.ln_2(x))
-        return x
-
-@dataclass
-class GPTConfig:
-    block_size: int = 1024
-    vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
-    n_layer: int = 12
-    n_head: int = 12
-    n_embd: int = 768
-    dropout: float = 0.0
-    bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
-
-    # MoE-related configs 
-    n_exp: int = 1 # if n_exp = 1 we just use regular MLP layers
-    top_k: int = 2
-    use_aux_loss: bool = False # apply auxiliary loss (from Switch Transformer) in router
-    use_router_z_loss: bool = False # apply router z loss (from ST-MoE)
-    use_noisy_top_k: bool = False
-    aux_loss_weight: float = 0.01 # default setting from Switch Transformer (see top of page 8)
-    router_z_loss_weight: float = 0.001 # default setting from ST-MoE (see page 8 eq. 6)
-    train_capacity: float = 1.25  # default setting from ST-MoE (see top of page 6)
-    eval_capacity: float = 2.0
-    min_capacity: int = 4  # minimum batch size to send to any single expert
-    stride: int = 2 # one in every stride layers are converted to an MoE
-    use_switch_tfm_init: bool = False  # use weight init scheme from Switch Transformer
-    switch_tfm_init_scale: float = 1.0
-    router_use_full_prec: bool = False  # use float32 precision in the router
-
-
-class GPT(nn.Module):
-
-    def __init__(self, config):
-        super().__init__()
-        assert config.vocab_size is not None
-        assert config.block_size is not None
-        self.config = config
-
-        if config.n_exp == 1:
-            # create normal transformer blocks
-            blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
-        else:
-            # create transformer blocks, placing an MoE block every <stride> layers
-            blocks = []
-            for i in range(config.n_layer):
-                # TODO: how to implement this?
-                # should we change below to i + 1 ?
-                use_moe = (i % config.stride) == 0
-                blocks.append(Block(config, use_moe=use_moe))
-            blocks = nn.ModuleList(blocks)
-
-        self.transformer = nn.ModuleDict(dict(
-            wte = nn.Embedding(config.vocab_size, config.n_embd),
-            wpe = nn.Embedding(config.block_size, config.n_embd),
-            drop = nn.Dropout(config.dropout),
-            h = blocks,
-            ln_f = LayerNorm(config.n_embd, bias=config.bias),
-        ))
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
-        # with weight tying when using torch.compile() some warnings get generated:
-        # "UserWarning: functional_call was passed multiple values for tied weights.
-        # This behavior is deprecated and will be an error in future versions"
-        # not 100% sure what this is, so far seems to be harmless. TODO investigate
-        self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
-
-        # init all weights
-        # optionall use switch transformer special init scheme for experts
-        # See pg. 10 here: https://arxiv.org/abs/2101.03961
-        self.apply(self._init_weights)
-        # apply special scaled init to the residual projections, per GPT-2 paper
-        for pn, p in self.named_parameters():
-            if pn.endswith('c_proj.weight') or pn.endswith('experts.c_proj'):
-                torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
-
-        # report number of parameters
-        print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
-
-    def get_num_params(self, non_embedding=True):
-        """
-        Return the number of parameters in the model.
-        For non-embedding count (default), the position embeddings get subtracted.
-        The token embeddings would too, except due to the parameter sharing these
-        params are actually used as weights in the final layer, so we include them.
-        """
-        n_params = sum(p.numel() for p in self.parameters())
-        if non_embedding:
-            n_params -= self.transformer.wpe.weight.numel()
-        return n_params
-
-    @torch.no_grad()
-    def _init_weights(self, module):
-        # optionally use switch transformer-style initialization
-        # see page 10 for switch init explanation: https://arxiv.org/abs/2101.03961
-        if isinstance(module, nn.Linear):
-            if self.config.use_switch_tfm_init:
-                scale = self.config.switch_tfm_init_scale
-
-                # linear layers have flipped dimensions in torch
-                # size of weights is [out_dim, in_dim] 
-                w_fan_in = module.weight.shape[-1]
-                w_std = (scale / w_fan_in) ** 0.5
-                torch.nn.init.trunc_normal_(
-                    module.weight,
-                    mean=0.0,
-                    std=w_std,
-                    a=-2*w_std,
-                    b=2*w_std,
-                )
-            else:
-                # perform standard (normal) initialization of weights
-                torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-            # always initialize bias to zero
-            if module.bias is not None:
-                torch.nn.init.zeros_(module.bias)
-        elif isinstance(module, MLPExperts):
-            # we have to init expert weights manually because
-            # nn.Parameter is not a type of module in torch
-            if self.config.use_switch_tfm_init:
-                scale = self.config.switch_tfm_init_scale
-
-                c_fc_fan_in = module.c_fc.shape[-2]
-                c_fc_std = (scale / c_fc_fan_in) ** 0.5
-                torch.nn.init.trunc_normal_(
-                    module.c_fc,
-                    mean=0.0,
-                    std=c_fc_std,
-                    a=-2*c_fc_std,
-                    b=2*c_fc_std,
-                )
-
-                c_proj_fan_in = module.c_proj.shape[-2]
-                c_proj_std = (scale / c_proj_fan_in) ** 0.5
-                torch.nn.init.trunc_normal_(
-                    module.c_proj,
-                    mean=0.0,
-                    std=c_proj_std,
-                    a=-2*c_proj_std,
-                    b=2*c_proj_std,
-                )
-            else:
-                # perform standard (normal) initialization of weights
-                torch.nn.init.normal_(module.c_fc, mean=0.0, std=0.02)
-                torch.nn.init.normal_(module.c_proj, mean=0.0, std=0.02)
-
-            # bias is always initialized to zero
-            if module.fc_bias is not None:
-                torch.nn.init.zeros_(module.fc_bias)
-                torch.nn.init.zeros_(module.proj_bias)
-        elif isinstance(module, nn.Embedding):
-            # just use standard initialization scheme for embedding always
-            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
-
-    def forward(self, idx, targets=None):
-        device = idx.device
-        b, t = idx.size()
-        assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
-        pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
-
-        # forward the GPT model itself
-        tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
-        pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
-        x = self.transformer.drop(tok_emb + pos_emb)
-        for block in self.transformer.h:
-            x = block(x)
-        x = self.transformer.ln_f(x)
-
-        if targets is not None:
-            # if we are given some desired targets also calculate the loss
-            logits = self.lm_head(x)
-            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
-
-            # add the auxiliary load balancing loss and router z loss to the main loss
-            if self.config.n_exp > 1 and self.config.use_aux_loss:
-                loss += self.config.aux_loss_weight * MANAGER.aggregate_aux_loss()
-                MANAGER.reset_aux_loss()
-            if self.config.n_exp > 1 and self.config.use_router_z_loss:
-                loss += self.config.router_z_loss_weight * MANAGER.aggregate_router_z_loss()
-                MANAGER.reset_router_z_loss()
-        else:
-            # inference-time mini-optimization: only forward the lm_head on the very last position
-            logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
-            loss = None
-
-        return logits, loss
-
-    def crop_block_size(self, block_size):
-        # model surgery to decrease the block size if necessary
-        # e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
-        # but want to use a smaller block size for some smaller, simpler model
-        assert block_size <= self.config.block_size
-        self.config.block_size = block_size
-        self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
-        for block in self.transformer.h:
-            if hasattr(block.attn, 'bias'):
-                block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
-
-    @classmethod
-    def from_pretrained(cls, model_type, override_args=None):
-        assert not 'moe' in model_type, "Pretrained checkpoints not available for MoE"
-        assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
-        override_args = override_args or {} # default to empty dict
-        # only dropout can be overridden see more notes below
-        assert all(k == 'dropout' for k in override_args)
-        from transformers import GPT2LMHeadModel
-        print("loading weights from pretrained gpt: %s" % model_type)
-
-        # n_layer, n_head and n_embd are determined from model_type
-        config_args = {
-            'gpt2':         dict(n_layer=12, n_head=12, n_embd=768),  # 124M params
-            'gpt2-medium':  dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
-            'gpt2-large':   dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
-            'gpt2-xl':      dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
-        }[model_type]
-        print("forcing vocab_size=50257, block_size=1024, bias=True")
-        config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
-        config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
-        config_args['bias'] = True # always True for GPT model checkpoints
-        # we can override the dropout rate, if desired
-        if 'dropout' in override_args:
-            print(f"overriding dropout rate to {override_args['dropout']}")
-            config_args['dropout'] = override_args['dropout']
-        # create a from-scratch initialized minGPT model
-        config = GPTConfig(**config_args)
-        model = GPT(config)
-        sd = model.state_dict()
-        sd_keys = sd.keys()
-        sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
-
-        # init a huggingface/transformers model
-        model_hf = GPT2LMHeadModel.from_pretrained(model_type)
-        sd_hf = model_hf.state_dict()
-
-        # copy while ensuring all of the parameters are aligned and match in names and shapes
-        sd_keys_hf = sd_hf.keys()
-        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
-        sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
-        transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
-        # basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
-        # this means that we have to transpose these weights when we import them
-        assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
-        for k in sd_keys_hf:
-            if any(k.endswith(w) for w in transposed):
-                # special treatment for the Conv1D weights we need to transpose
-                assert sd_hf[k].shape[::-1] == sd[k].shape
-                with torch.no_grad():
-                    sd[k].copy_(sd_hf[k].t())
-            else:
-                # vanilla copy over the other parameters
-                assert sd_hf[k].shape == sd[k].shape
-                with torch.no_grad():
-                    sd[k].copy_(sd_hf[k])
-
-        return model
-
-    def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
-        # TODO: add expert config
-        # start with all of the candidate parameters
-        param_dict = {pn: p for pn, p in self.named_parameters()}
-        # filter out those that do not require grad
-        param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
-        # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
-        # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
-        # add an extra check for "bias" string to account for bias terms in MoE layers
-        decay_params = [p for n, p in param_dict.items() if (p.dim() >= 2 and not n.endswith('bias'))]
-        nodecay_params = [p for n, p in param_dict.items() if (p.dim() < 2 or n.endswith('bias'))]
-        optim_groups = [
-            {'params': decay_params, 'weight_decay': weight_decay},
-            {'params': nodecay_params, 'weight_decay': 0.0}
-        ]
-        num_decay_params = sum(p.numel() for p in decay_params)
-        num_nodecay_params = sum(p.numel() for p in nodecay_params)
-        print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
-        print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
-        # Create AdamW optimizer and use the fused version if it is available
-        fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
-        use_fused = fused_available and device_type == 'cuda'
-        extra_args = dict(fused=True) if use_fused else dict()
-        optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
-        print(f"using fused AdamW: {use_fused}")
-
-        return optimizer
-
-    def estimate_mfu(self, fwdbwd_per_iter, dt):
-        """ estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
-        # first estimate the number of flops we do per iteration.
-        # see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
-        N = self.get_num_params()
-        cfg = self.config
-        L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
-        flops_per_token = 6*N + 12*L*H*Q*T
-        flops_per_fwdbwd = flops_per_token * T
-        flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
-        # express our flops throughput as ratio of A100 bfloat16 peak flops
-        flops_achieved = flops_per_iter * (1.0/dt) # per second
-        flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
-        mfu = flops_achieved / flops_promised
-        return mfu
-
-    @torch.no_grad()
-    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
-        """
-        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
-        the sequence max_new_tokens times, feeding the predictions back into the model each time.
-        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
-        """
-        for _ in range(max_new_tokens):
-            # if the sequence context is growing too long we must crop it at block_size
-            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
-            # forward the model to get the logits for the index in the sequence
-            logits, _ = self(idx_cond)
-            # pluck the logits at the final step and scale by desired temperature
-            logits = logits[:, -1, :] / temperature
-            # optionally crop the logits to only the top k options
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = -float('Inf')
-            # apply softmax to convert logits to (normalized) probabilities
-            probs = F.softmax(logits, dim=-1)
-            # sample from the distribution
-            idx_next = torch.multinomial(probs, num_samples=1)
-            # append sampled index to the running sequence and continue
-            idx = torch.cat((idx, idx_next), dim=1)
-
-        return idx   
--- a/nanochat/tokenizer.py
+++ b/nanochat/tokenizer.py
@ -376,11 +376,85 @@ class RustBPETokenizer:
        ids.append(assistant_start)
        return ids

+# -----------------------------------------------------------------------------
+# Tiktoken GPT-2 Tokenizer (for compatibility with nanoMoE)
+class TiktokenGPT2Tokenizer:
+    """
+    Wrapper around a tiktoken encoding (GPT-2 by default) for compatibility with nanoMoE.
+
+    Exposes the tokenizer methods defined below (encode/decode, vocab size, BOS id, ...)
+    on top of a tiktoken encoding object. Only plain-text encoding is supported:
+    text is always encoded with encode_ordinary, so special-token strings inside
+    the text are treated as ordinary text.
+    """
+
+    def __init__(self, enc=None):
+        """
+        enc: an already constructed encoding object, or None to load tiktoken's "gpt2".
+        The object must provide eot_token, n_vocab, encode_ordinary() and decode().
+        """
+        if enc is None:
+            # Import lazily: a caller that injects its own encoder does not need tiktoken here.
+            import tiktoken
+            enc = tiktoken.get_encoding("gpt2")
+        self.enc = enc
+        # GPT-2 uses 50256 as EOT token, which we can use as BOS
+        self.bos_token_id = self.enc.eot_token  # 50256
+
+    @classmethod
+    def from_pretrained(cls, encoding_name="gpt2"):
+        """Alternate constructor: load the tiktoken encoding called encoding_name and wrap it."""
+        import tiktoken
+        enc = tiktoken.get_encoding(encoding_name)
+        return cls(enc)
+
+    def get_vocab_size(self):
+        """Return the vocabulary size reported by the wrapped encoding (enc.n_vocab)."""
+        return self.enc.n_vocab
+
+    def get_special_tokens(self):
+        """Return the list of special tokens; always empty for this wrapper."""
+        # GPT-2 doesn't have special tokens in the same way, but has EOT
+        return []
+
+    def id_to_token(self, id):
+        """
+        Decode a single token id to its string form.
+        Returns the placeholder "<unk_{id}>" when the encoding cannot decode the id.
+        """
+        # note: the parameter name `id` shadows the builtin, kept as-is for keyword callers
+        try:
+            return self.enc.decode([id])
+        except (KeyboardInterrupt, SystemExit):
+            # never swallow interpreter-level signals (the previous bare except did)
+            raise
+        except BaseException:
+            # NOTE(review): deliberately wider than Exception. Some tiktoken releases appear
+            # to surface an unknown id as a Rust panic exception outside the Exception
+            # hierarchy - confirm against the pinned tiktoken version before narrowing.
+            return f"<unk_{id}>"
+
+    def encode_special(self, text):
+        """Return the EOT token id for any requested special token (text is ignored)."""
+        # GPT-2 doesn't have special tokens, return EOT token as fallback
+        return self.enc.eot_token
+
+    def get_bos_token_id(self):
+        """Return the id used as beginning-of-sequence (the EOT id, see __init__)."""
+        return self.bos_token_id
+
+    def encode(self, text, prepend=None, append=None, num_threads=1):
+        """
+        Encode text using the wrapped tiktoken encoding.
+
+        text: a str (returns a list of ids) or a list of such inputs (returns a list of results).
+        prepend/append: None to add nothing, an int token id to add verbatim; any other
+            non-None value (e.g. a special-token string) falls back to the BOS id for
+            prepend and to the EOT id for append.
+        num_threads: accepted for interface compatibility; encoding here is sequential.
+        Raises ValueError if text is neither a str nor a list.
+        """
+        if isinstance(text, str):
+            ids = self.enc.encode_ordinary(text)  # encode_ordinary ignores special tokens
+            if prepend is not None:
+                prepend_id = prepend if isinstance(prepend, int) else self.get_bos_token_id()
+                ids = [prepend_id] + ids
+            if append is not None:
+                append_id = append if isinstance(append, int) else self.enc.eot_token
+                ids.append(append_id)
+            return ids
+        elif isinstance(text, list):
+            # encode each element independently, applying the same prepend/append to every one
+            return [self.encode(t, prepend=prepend, append=append, num_threads=num_threads) for t in text]
+        else:
+            raise ValueError(f"Invalid input type: {type(text)}")
+
+    def __call__(self, *args, **kwargs):
+        """Calling the tokenizer is shorthand for encode()."""
+        return self.encode(*args, **kwargs)
+
+    def decode(self, ids):
+        """
+        Decode token ids back to text.
+        A non-empty list whose first element is a list is treated as a batch and
+        returns a list of strings; anything else is decoded as a single sequence.
+        """
+        if isinstance(ids, list) and len(ids) > 0 and isinstance(ids[0], list):
+            # List of lists
+            return [self.enc.decode(seq) for seq in ids]
+        else:
+            # Single sequence
+            return self.enc.decode(ids)
+
 # -----------------------------------------------------------------------------
 # nanochat-specific convenience functions

-def get_tokenizer():
-    from nanochat_moe.common import get_base_dir
+def get_tokenizer(use_tiktoken_gpt2=False):
+    """
+    Get tokenizer. If use_tiktoken_gpt2=True, returns TiktokenGPT2Tokenizer for nanoMoE compatibility.
+    Otherwise returns the default nanochat tokenizer.
+    """
+    if use_tiktoken_gpt2:
+        return TiktokenGPT2Tokenizer.from_pretrained("gpt2")
+    
+    from nanochat.common import get_base_dir
    base_dir = get_base_dir()
    tokenizer_dir = os.path.join(base_dir, "tokenizer")
    # return HuggingFaceTokenizer.from_directory(tokenizer_dir)
@ -388,7 +462,7 @@ def get_tokenizer():

 def get_token_bytes(device="cpu"):
    import torch
-    from nanochat_moe.common import get_base_dir
+    from nanochat.common import get_base_dir
    base_dir = get_base_dir()
    tokenizer_dir = os.path.join(base_dir, "tokenizer")
    token_bytes_path = os.path.join(tokenizer_dir, "token_bytes.pt")
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@ -27,13 +27,14 @@ import wandb

 # Import from nanoMoE model (keeping train.py's original model)
 import sys
-sys.path.insert(0, '/root/nanoMoE')
-from model import GPTConfig, GPT
+from nanochat.gpt import GPTConfig, GPT

 from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type
-from nanochat.tokenizer import get_tokenizer
+from nanochat.tokenizer import get_tokenizer, get_token_bytes
 from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint
 from nanochat.engine import Engine
+from nanochat.dataloader import tokenizing_distributed_data_loader_with_state, tokenizing_distributed_data_loader
+from nanochat.loss_eval import evaluate_bpb
 from scripts.base_eval import evaluate_model

 print_banner()
@ -43,9 +44,6 @@ print_banner()
 run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
 # Runtime
 device_type = "" # cuda|cpu|mps (empty => autodetect good device type default, in order: CUDA > MPS > CPU)
-# Data loading
-use_bin_data = True # if True, use .bin files (nanoMoE format) instead of parquet/text
-data_dir = "" # directory containing train.bin and val.bin files (only used if use_bin_data=True)
 # Model architecture
 depth = 6 # the depth of the Transformer model to train (matches nanoMoE n_layer=6), rest of the kwargs are derived
 max_seq_len = 1024 # max context length (matches nanoMoE block_size=1024)
@ -89,12 +87,12 @@ final_lr_frac = 0.1 # final learning rate as fraction of initial learning rate (

 resume_from_step = -1 # resume training from this step of the optimization (-1 = disable)
 # Evaluation
-eval_every = 500 # every how many steps to evaluate the model for val bpb (matches nanoMoE eval_interval=500)
+eval_every = 500000000 # every how many steps to evaluate the model for val bpb (NOTE: no longer nanoMoE's eval_interval=500; the `step % eval_every == 0` check now only passes on multiples of 500M, in practice just step 0 - presumably a deliberate way to switch periodic eval off, confirm)
 eval_iters = 200 # number of iterations to evaluate val loss on (matches nanoMoE eval_iters=200)
 log_interval = 10 # every how many steps to log training metrics (matches nanoMoE log_interval=10)
 core_metric_every = -1 # every how many steps to evaluate the core metric (-1 = disable)
 core_metric_max_per_task = -1 # examples per task in estimating the core metric
-sample_every = 2000 # every how many steps to sample from the model
+sample_every = 200000000 # every how many steps to sample from the model
 save_every = 1000 # every how many steps to save model checkpoints (-1 = disable, and save only at the end of the run)
 # System
 compile = True # use PyTorch 2.0 to compile the model to be faster (matches nanoMoE)
@ -130,11 +128,10 @@ use_dummy_wandb = run == "dummy" or not master_process
 wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", name=run, config=user_config)

 # Tokenizer will be useful for evaluation, also we need the vocab size
-# Use tiktoken GPT-2 tokenizer for compatibility with nanoMoE .bin data format
-tokenizer = get_tokenizer(use_tiktoken_gpt2=True)
+tokenizer = get_tokenizer()
+token_bytes = get_token_bytes(device=device)
 vocab_size = tokenizer.get_vocab_size()
 print0(f"Vocab size: {vocab_size:,}")
-print0("Using tiktoken GPT-2 tokenizer (nanoMoE compatible)")

 # Model kwargs are derived from the desired depth of the model
 # For nanoMoE, we use n_layer, n_head, n_embd directly
@ -162,26 +159,17 @@ print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {
 # -----------------------------------------------------------------------------
 # Initialize the Model

-# Create a new model with random weights (using nanoMoE GPT)
-if not data_dir:
-    # Default to nanoMoE data directory structure
-    data_dir = "/root/nanoMoE/data/openwebtext"
-
-# Attempt to derive vocab_size from the dataset
-meta_path = os.path.join(data_dir, 'meta.pkl')
-meta_vocab_size = None
-if os.path.exists(meta_path):
-    with open(meta_path, 'rb') as f:
-        meta = pickle.load(f)
-    meta_vocab_size = meta['vocab_size']
-    print0(f"found vocab_size = {meta_vocab_size} (inside {meta_path})")
+# Get base directory for data and checkpoints
+base_dir = get_base_dir()

+# Use vocab_size from tokenizer (already obtained above)
+# This ensures the model vocab_size matches the tokenizer vocab_size
 model_config_kwargs = dict(
    n_layer=n_layer, 
    n_head=n_head, 
    n_embd=n_embd,
    block_size=max_seq_len, 
-    vocab_size=meta_vocab_size if meta_vocab_size is not None else 50304, 
+    vocab_size=vocab_size,  # Use vocab_size from tokenizer, not hardcoded 
    dropout=dropout, 
    bias=bias,
    # MoE parameters (matching train_nano_moe.py)
@ -206,10 +194,9 @@ model = GPT(gptconf)
 model.to(device)

 # If we are resuming, overwrite the model parameters with those of the checkpoint
-base_dir = get_base_dir()
 output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d6
 checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
-resuming = resume_from_step != -1
+resuming = False
 # if resuming:
 #     print0(f"Resuming optimization from step {resume_from_step}")
 #     model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, resume_from_step, device, load_optimizer=True, rank=ddp_rank)
@ -272,48 +259,11 @@ print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")
 #     del optimizer_data # free up the memory

 # -----------------------------------------------------------------------------
-# Data loading (nanoMoE style - simple get_batch function)
-if not data_dir:
-    # Default to nanoMoE data directory structure
-    data_dir = "/root/nanoMoE/data/openwebtext"
-print0(f"Using .bin data loader from: {data_dir}")
-
-# poor man's data loader (matching nanoMoE/train.py)
-def get_batch(split):
-    # We recreate np.memmap every batch to avoid a memory leak, as per
-    # https://stackoverflow.com/questions/45132940/numpy-memmap-memory-usage-want-to-iterate-once/61472122#61472122
-    if split == 'train':
-        data = np.memmap(os.path.join(data_dir, 'train.bin'), dtype=np.uint16, mode='r')
-    else:
-        data = np.memmap(os.path.join(data_dir, 'val.bin'), dtype=np.uint16, mode='r')
-    ix = torch.randint(len(data) - max_seq_len, (device_batch_size,))
-    x = torch.stack([torch.from_numpy((data[i:i+max_seq_len]).astype(np.int64)) for i in ix])
-    y = torch.stack([torch.from_numpy((data[i+1:i+1+max_seq_len]).astype(np.int64)) for i in ix])
-    if device_type == 'cuda':
-        # pin arrays x,y, which allows us to move them to GPU asynchronously (non_blocking=True)
-        x, y = x.pin_memory().to(device, non_blocking=True), y.pin_memory().to(device, non_blocking=True)
-    else:
-        x, y = x.to(device), y.to(device)
-    return x, y
-
-# helps estimate an arbitrarily accurate loss over either split using many batches (matching nanoMoE/train.py)
-@torch.no_grad()
-def estimate_loss():
-    out = {}
-    model.eval()
-    for split in ['train', 'val']:
-        losses = torch.zeros(eval_iters)
-        for k in range(eval_iters):
-            X, Y = get_batch(split)
-            with autocast_ctx:
-                _, loss = model(X, Y)
-            losses[k] = loss.item()
-        out[split] = losses.mean()
-    model.train()
-    return out
-
-# fetch the very first batch
-x, y = get_batch('train')
+# Initialize the DataLoaders for train/val (like nanochat-run)
+dataloader_resume_state_dict = None if not resuming else meta_data.get("dataloader_state_dict")
+train_loader = tokenizing_distributed_data_loader_with_state(device_batch_size, max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
+build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="val", device=device)
+x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data

 # -----------------------------------------------------------------------------
 # Set up hyperparameter schedulers
@ -361,22 +311,23 @@ while True:
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

-    # once in a while: evaluate the val loss (all ranks participate, matching nanoMoE/train.py)
-    if last_step or step % eval_every == 0:
-        losses = estimate_loss()
-        val_loss = losses['val'].item()
-        train_loss_eval = losses['train'].item()
-        print0(f"Step {step:05d} | Train loss: {train_loss_eval:.4f}, Val loss: {val_loss:.4f}")
-        if val_loss < min_val_bpb:
-            min_val_bpb = val_loss
+    # once in a while: evaluate the val bpb (all ranks participate)
+    if step % eval_every == 0:
+        model.eval()
+        val_loader = build_val_loader()
+        eval_steps = eval_iters  # use eval_iters as number of evaluation steps
+        with autocast_ctx:
+            val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes)
+        print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}")
+        if val_bpb < min_val_bpb:
+            min_val_bpb = val_bpb
        wandb_run.log({
            "step": step,
            "total_training_flops": flops_so_far,
            "total_training_time": total_training_time,
-            "val/loss": val_loss,
-            "train/loss_eval": train_loss_eval,
+            "val/bpb": val_bpb,
        })
-        val_bpb = val_loss  # for compatibility with existing code
+        model.train()

    # once in a while: estimate the CORE metric (all ranks participate)
    # use the original uncompiled model because the inputs keep changing shape
@ -433,6 +384,7 @@ while True:
                    "smooth_train_loss": smooth_train_loss,
                    "total_training_time": total_training_time,
                },
+                "dataloader_state_dict": dataloader_state_dict, # for resuming data loading
            },
            rank=ddp_rank,
        )
@ -457,7 +409,7 @@ while True:
            _, loss = model(x, y)  # nanoMoE model returns (logits, loss)
            loss = loss / grad_accum_steps # scale the loss to account for gradient accumulation
        # immediately async prefetch next batch while model is doing the forward pass on the GPU
-        x, y = get_batch('train')
+        x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
        # backward pass, with gradient scaling if training in fp16
        scaler.scale(loss).backward()
    # clip the gradient
--- a/scripts_moe/base_eval.py
+++ b/scripts_moe/base_eval.py
@ -1,212 +0,0 @@
-"""
-Evaluate the CORE metric for a given model.
-
-Run on a single GPU:
-python -m scripts.base_eval
-
-Run with torchrun on e.g. 8 GPUs:
-torchrun --nproc_per_node=8 -m scripts.base_eval
-
-The script will print the CORE metric to the console.
-"""
-import os
-import csv
-import time
-import json
-import yaml
-import shutil
-import random
-import zipfile
-import tempfile
-from contextlib import nullcontext
-
-import torch
-
-from nanochat_moe.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock
-from nanochat_moe.tokenizer import HuggingFaceTokenizer
-from nanochat_moe.checkpoint_manager import load_model
-from nanochat_moe.core_eval import evaluate_task
-
-# -----------------------------------------------------------------------------
-# nanochat specific function dealing with I/O etc.
-
-# ~162MB of data needed to evaluate the CORE metric
-EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
-
-def place_eval_bundle(file_path):
-    # here file_path is the path to the eval_bundle.zip file
-    # we need to unzip it and place it in the base directory
-    base_dir = get_base_dir()
-    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
-    with tempfile.TemporaryDirectory() as tmpdir:
-        with zipfile.ZipFile(file_path, 'r') as zip_ref:
-            zip_ref.extractall(tmpdir)
-        extracted_bundle_dir = os.path.join(tmpdir, "eval_bundle")
-        shutil.move(extracted_bundle_dir, eval_bundle_dir)
-    print0(f"Placed eval_bundle directory at {eval_bundle_dir}")
-
-def evaluate_model(model, tokenizer, device, max_per_task=-1):
-    """
-    Evaluate a base model on the CORE benchmark.
-    - max_per_task: crop the data to this many examples per task for testing (-1 = disable)
-    """
-    # Load config and task metadata
-    base_dir = get_base_dir()
-    eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
-    # Download the eval bundle to disk (and unzip if needed)
-    if not os.path.exists(eval_bundle_dir):
-        download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
-    config_path = os.path.join(eval_bundle_dir, "core.yaml")
-    data_base_path = os.path.join(eval_bundle_dir, "eval_data")
-    eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
-    with open(config_path, 'r', encoding='utf-8') as f:
-        config = yaml.safe_load(f)
-    tasks = config['icl_tasks']
-
-    # Load random baseline values from eval metadata
-    random_baselines = {}
-    with open(eval_meta_data, 'r', encoding='utf-8') as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            task_name = row['Eval Task']
-            random_baseline = row['Random baseline']
-            random_baselines[task_name] = float(random_baseline)
-
-    # Evaluate each task
-    results = {}
-    centered_results = {}
-    for task in tasks:
-        start_time = time.time()
-        label = task['label']
-        task_meta = {
-            'task_type': task['icl_task_type'],
-            'dataset_uri': task['dataset_uri'],
-            'num_fewshot': task['num_fewshot'][0],
-            'continuation_delimiter': task.get('continuation_delimiter', ' ')
-        }
-        print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... ", end='')
-
-        # Load data for this task
-        data_path = os.path.join(data_base_path, task_meta['dataset_uri'])
-        with open(data_path, 'r', encoding='utf-8') as f:
-            data = [json.loads(line.strip()) for line in f]
-
-        # shuffle the data because in many cases it appears ordered but we want
-        # the ability to only run a subset of the data for debugging purposes etc.
-        shuffle_rng = random.Random(1337)
-        shuffle_rng.shuffle(data)
-        if max_per_task > 0:
-            data = data[:max_per_task]
-
-        # run the evaluation for this task
-        accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
-
-        results[label] = accuracy
-        random_baseline = random_baselines[label]
-        centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline)
-        centered_results[label] = centered_result
-        end_time = time.time()
-        print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s")
-
-    core_metric = sum(centered_results.values()) / len(centered_results)
-    out = {
-        "results": results,
-        "centered_results": centered_results,
-        "core_metric": core_metric
-    }
-    return out
-
-# -----------------------------------------------------------------------------
-# HuggingFace loading utilities and light wrappers for a model
-
-class ModelWrapper:
-    """Lightweight wrapper for a HuggingFace model"""
-    def __init__(self, model, max_seq_len=None):
-        self.model = model
-        self.max_seq_len = max_seq_len
-
-    def __call__(self, input_ids):
-        outputs = self.model(input_ids)
-        logits = outputs.logits
-        return logits
-
-def load_hf_model(hf_path: str, device):
-    print0(f"Loading model from: {hf_path}")
-    # Load the model
-    from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(hf_path)
-    model.to(device)
-    model.eval()
-    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
-    model = ModelWrapper(model, max_seq_len=max_seq_len)
-    # Load the tokenizer
-    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
-    return model, tokenizer
-
-# -----------------------------------------------------------------------------
-def main():
-    import argparse
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
-    parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
-    args = parser.parse_args()
-
-    # distributed / precision setup
-    device_type = autodetect_device_type()
-    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
-
-    # Load model and tokenizer from command line or from file system
-    if args.hf_path is not None:
-        # atm assume that if a path is given, it's a huggingface model path
-        hf_path = args.hf_path
-        print0(f"Loading huggingface model from: {hf_path}")
-        model, tokenizer = load_hf_model(hf_path, device)
-        model_name = hf_path # just for logging
-        model_slug = hf_path.replace("/", "-") # for the output csv file
-    else:
-        # load a local model from the file system
-        model, tokenizer, meta = load_model("base", device, phase="eval")
-        model_name = f"base_model (step {meta['step']})" # just for logging
-        model_slug = f"base_model_{meta['step']:06d}" # for the output csv file
-
-    # Evaluate the model
-    with autocast_ctx:
-        out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task)
-
-    # Write out the results to a csv file
-    core_metric = None
-    centered_results = {}
-    if ddp_rank == 0:
-        base_dir = get_base_dir()
-        output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv")
-        os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
-        results = out["results"]
-        centered_results = out["centered_results"]
-        core_metric = out["core_metric"]
-        with open(output_csv_path, 'w', encoding='utf-8', newline='') as f:
-            f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n")
-            for label in results:
-                f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n")
-            f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n")
-        # Print the content of the csv file to console too
-        print0("="*80)
-        print0(f"Model: {model_name}")
-        print0("="*80)
-        with open(output_csv_path, 'r', encoding='utf-8') as f:
-            print0(f.read())
-
-    # Log to report
-    from nanochat_moe.report import get_report
-    get_report().log(section="Base model evaluation", data=[
-        {
-            "Model": model_name,
-            "CORE metric": core_metric,
-        },
-        centered_results, # the full table
-    ])
-
-    compute_cleanup()
-
-if __name__ == "__main__":
-    main()
--- a/scripts_moe/base_loss.py
+++ b/scripts_moe/base_loss.py
@ -1,79 +0,0 @@
-"""
-Loads a checkpoint, and:
- Evaluates the loss on a larger chunk of train/val splits
- Samples from the model
-
-Example run as:
-torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
-"""
-import os
-from contextlib import nullcontext
-import torch
-from nanochat_moe.checkpoint_manager import load_model
-from nanochat_moe.common import compute_init, print0, compute_cleanup, autodetect_device_type
-from nanochat_moe.dataloader import tokenizing_distributed_data_loader
-from nanochat_moe.tokenizer import get_token_bytes
-from nanochat_moe.loss_eval import evaluate_bpb
-from nanochat_moe.engine import Engine
-
-# Configuration
-device_batch_size = 32
-split_tokens = 20*524288  # number of tokens to evaluate per split
-model_tag = None # optional model tag for the output directory name
-model_step = None # optional model step for the output directory name
-device_type = "" # cuda|cpu|mps (empty => autodetect)
-exec(open(os.path.join('nanochat_moe', 'configurator.py')).read()) # overrides from command line or config file
-
-# Load the base model and the tokenizer
-device_type = autodetect_device_type() if device_type == "" else device_type
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=model_tag, step=model_step)
-sequence_len = meta["model_config"]["sequence_len"] # could be arbitrary really
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
-
-# Evaluate the loss on each split
-tokens_per_step = device_batch_size * sequence_len * ddp_world_size
-assert split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step"
-steps = split_tokens // tokens_per_step
-token_bytes = get_token_bytes(device=device)
-bpb_results = {}
-for split_name in ["train", "val"]:
-    loader = tokenizing_distributed_data_loader(device_batch_size, sequence_len, split_name, device=device)
-    with autocast_ctx:
-        bpb = evaluate_bpb(model, loader, steps, token_bytes)
-    print0(f"{split_name} bpb: {bpb:.4f}")
-    bpb_results[split_name] = bpb
-
-# Master process also samples from the model
-samples = []
-if ddp_rank == 0:
-    prompts = [
-        "The capital of France is",
-        "The chemical symbol of gold is",
-        "If yesterday was Friday, then tomorrow will be",
-        "The opposite of hot is",
-        "The planets of the solar system are:",
-        "My favorite color is",
-        "If 5*x + 3 = 13, then x is",
-    ]
-    engine = Engine(model, tokenizer)
-    for prompt in prompts:
-        tokens = tokenizer(prompt, prepend="<|bos|>")
-        with autocast_ctx:
-            sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
-        sample_str = tokenizer.decode(sample[0])
-        print0(sample_str)
-        samples.append(sample_str)
-
-# Log to report
-from nanochat_moe.report import get_report
-get_report().log(section="Base model loss", data=[
-    {
-        "train bpb": bpb_results["train"],
-        "val bpb": bpb_results["val"],
-    },
-    {f"sample {i}": sample for i, sample in enumerate(samples)},
-])
-
-# Cleanup
-compute_cleanup()
--- a/scripts_moe/base_train.py
+++ b/scripts_moe/base_train.py
@ -1,413 +0,0 @@
-"""
-Train model. Run as:
-
-python base_train.py
-
-or distributed as:
-
-torchrun --nproc_per_node=8 base_train.py
-
-If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. Example:
-python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 --eval_tokens=512 --core_metric_every=-1 --total_batch_size=512 --num_iterations=20
-"""
-
-import os
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-import time
-from contextlib import nullcontext
-
-import wandb
-import torch
-
-from nanochat_moe.gpt import GPT, GPTConfig
-from nanochat_moe.dataloader import tokenizing_distributed_data_loader, tokenizing_distributed_data_loader_with_state
-from nanochat_moe.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type
-from nanochat_moe.tokenizer import get_tokenizer, get_token_bytes
-from nanochat_moe.checkpoint_manager import save_checkpoint, load_checkpoint
-from nanochat_moe.loss_eval import evaluate_bpb
-from nanochat_moe.engine import Engine
-from scripts_moe.base_eval import evaluate_model
-print_banner()
-
-# -----------------------------------------------------------------------------
-# User settings
-run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
-# Runtime
-device_type = "" # cuda|cpu|mps (empty => autodetect good device type default, in order: CUDA > MPS > CPU)
-# Model architecture
-depth = 6 # the depth of the Transformer model to train (matches nanoMoE n_layer=6), rest of the kwargs are derived
-max_seq_len = 1024 # max context length (matches nanoMoE block_size=1024)
-# Training horizon. Only one of these 3 will be used, in this order of precedence.
-num_iterations = 50000 # explicit number of steps (matches nanoMoE max_iters=50000, makes total tokens ~25B)
-target_flops = -1.0 # calculate num_iterations to reach target_flops. Useful for scaling laws experiments (-1 = disable)
-target_param_data_ratio = -1 # calculate num_iterations to maintain fixed data:param ratio (Chinchilla=20) (-1 = disable)
-# Optimization
-device_batch_size = 12 # per-device batch size (matches nanoMoE batch_size=12)
-total_batch_size = 491520 # total desired batch size in #tokens (matches nanoMoE: 12 * 1024 * 40 = 491,520 for 8 GPUs)
-embedding_lr = 0.2 # learning rate for the embedding parameters (Adam)
-unembedding_lr = 0.004 # learning rate for the unembedding parameters (Adam)
-weight_decay = 0.1 # weight decay (matches nanoMoE weight_decay=1e-1)
-matrix_lr = 0.02 # learning rate for the matrix parameters (Muon)
-grad_clip = 1.0 # gradient clipping value (0.0 = disabled)
-warmup_ratio = 0.0 # ratio of iterations for LR warmup
-warmdown_ratio = 0.2 # ratio of iterations for LR warmdown
-final_lr_frac = 0.0 # final LR is this fraction of the initial LR
-resume_from_step = -1 # resume training from this step of the optimization (-1 = disable)
-# Evaluation
-eval_every = 500 # every how many steps to evaluate the model for val bpb (matches nanoMoE eval_interval=500)
-eval_tokens = 200 * 1024 # number of tokens to evaluate val loss on (matches nanoMoE eval_iters=200, approximate)
-core_metric_every = 2000 # every how many steps to evaluate the core metric (-1 = disable)
-core_metric_max_per_task = 500 # examples per task in estimating the core metric
-sample_every = 2000 # every how many steps to sample from the model
-save_every = 1000 # every how many steps to save model checkpoints (-1 = disable, and save only at the end of the run)
-# Output
-model_tag = "" # optionally override the model tag for the output checkpoint directory name
-# now allow CLI to override the settings via the configurator lol
-
-
-
-
-config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
-exec(open(os.path.join('nanochat_moe', 'configurator.py')).read()) # overrides from command line or config file
-user_config = {k: globals()[k] for k in config_keys} # will be useful for logging
-# -----------------------------------------------------------------------------
-
-# Compute init
-device_type = autodetect_device_type() if device_type == "" else device_type
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
-synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
-get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
-
-# wandb logging init
-use_dummy_wandb = run == "dummy" or not master_process
-wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", name=run, config=user_config)
-
-# Tokenizer will be useful for evaluation, also we need the vocab size
-tokenizer = get_tokenizer()
-token_bytes = get_token_bytes(device=device)
-vocab_size = tokenizer.get_vocab_size()
-print0(f"Vocab size: {vocab_size:,}")
-
-# Model kwargs are derived from the desired depth of the model
-num_layers = 6
-model_dim = 384
-num_heads = 6
-num_kv_heads = 6
-print0(f"num_layers: {num_layers}")
-print0(f"model_dim: {model_dim}")
-print0(f"num_heads: {num_heads}")
-print0(f"num_kv_heads: {num_kv_heads}")
-
-# Optimizer / data / training length related hyperparameters
-# figure out the needed gradient accumulation to reach the desired total batch size
-tokens_per_fwdbwd = device_batch_size * max_seq_len # tokens per iteration for a single rank
-world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
-assert total_batch_size % world_tokens_per_fwdbwd == 0
-grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd
-print0(f"Tokens / micro-batch / rank: {device_batch_size} x {max_seq_len} = {tokens_per_fwdbwd:,}")
-print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
-print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
-
-# -----------------------------------------------------------------------------
-# Initialize the Model
-
-# Create a new model with random weights
-model_config_kwargs = dict(sequence_len=max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim)
-with torch.device("meta"):
-    model_config = GPTConfig(**model_config_kwargs)
-    model = GPT(model_config)
-model.to_empty(device=device)
-model.init_weights()
-
-# If we are resuming, overwrite the model parameters with those of the checkpoint
-base_dir = get_base_dir()
-output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12
-checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
-resuming = resume_from_step != -1
-# Auto-detect last checkpoint if resume_from_step is -1 but checkpoints exist
-if not resuming and os.path.exists(checkpoint_dir):
-    from nanochat_moe.checkpoint_manager import find_last_step
-    try:
-        last_checkpoint_step = find_last_step(checkpoint_dir)
-        print0(f"Found existing checkpoint at step {last_checkpoint_step}, auto-resuming...")
-        resume_from_step = last_checkpoint_step
-        resuming = True
-    except FileNotFoundError:
-        # No checkpoints found, start fresh
-        pass
-if resuming:
-    print0(f"Resuming optimization from step {resume_from_step}")
-    model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, resume_from_step, device, load_optimizer=True, rank=ddp_rank)
-    model.load_state_dict(model_data, strict=True, assign=True)
-    del model_data # free up this memory after the copy
-
-orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape)
-model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe
-num_params = sum(p.numel() for p in model.parameters())
-print0(f"Number of parameters: {num_params:,}")
-num_flops_per_token = model.estimate_flops()
-print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
-
-# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order)
-assert num_iterations > 0 or target_param_data_ratio > 0 or target_flops > 0
-if num_iterations > 0:
-    print0(f"Using user-provided number of iterations: {num_iterations:,}")
-elif target_flops > 0:
-    # calculate the number of iterations from the target flops
-    num_iterations = round(target_flops / (num_flops_per_token * total_batch_size))
-    print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}")
-elif target_param_data_ratio > 0:
-    # calculate the number of iterations from the target param data ratio
-    target_tokens = target_param_data_ratio * num_params
-    num_iterations = target_tokens // total_batch_size
-    print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}")
-else:
-    raise ValueError("No training horizon specified")
-total_tokens = total_batch_size * num_iterations
-print0(f"Total number of training tokens: {total_tokens:,}")
-print0(f"Tokens : Params ratio: {total_batch_size * num_iterations / num_params:.2f}") # Chinchilla is ~20
-print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")
-
-# -----------------------------------------------------------------------------
-# Initialize the Optimizer (Muon for Linear layers, AdamW for embedding and lm_head)
-optimizers = model.setup_optimizers(unembedding_lr=unembedding_lr, embedding_lr=embedding_lr, matrix_lr=matrix_lr, weight_decay=weight_decay)
-adamw_optimizer, muon_optimizer = optimizers
-
-if resuming:
-    for opt, dat in zip(optimizers, optimizer_data):
-        opt.load_state_dict(dat)
-    del optimizer_data # free up the memory
-
-# -----------------------------------------------------------------------------
-# Initialize the DataLoaders for train/val
-tokens_dir = os.path.join(base_dir, "tokenized_data")
-dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"]
-train_loader = tokenizing_distributed_data_loader_with_state(device_batch_size, max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
-build_val_loader = lambda: tokenizing_distributed_data_loader(device_batch_size, max_seq_len, split="val", device=device)
-x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data
-
-# -----------------------------------------------------------------------------
-# Set up hyperparameter schedulers
-
-# Learning rate scheduler
-def get_lr_multiplier(it):
-    warmup_iters = round(warmup_ratio * num_iterations)
-    warmdown_iters = round(warmdown_ratio * num_iterations)
-    if it < warmup_iters:
-        return (it + 1) / warmup_iters
-    elif it <= num_iterations - warmdown_iters:
-        return 1.0
-    else:
-        progress = (num_iterations - it) / warmdown_iters
-        return progress * 1.0 + (1 - progress) * final_lr_frac
-
-# Momentum scheduler for Muon optimizer
-def get_muon_momentum(it):
-    frac = min(it / 300, 1)
-    momentum = (1 - frac) * 0.85 + frac * 0.95
-    return momentum
-
-# -----------------------------------------------------------------------------
-# Loop state (variables updated by the training loop)
-
-if not resuming:
-    step = 0
-    min_val_bpb = float("inf")
-    smooth_train_loss = 0 # EMA of training loss
-    total_training_time = 0 # total wall-clock time of training
-else:
-    step = meta_data["step"]
-    loop_state = meta_data["loop_state"]
-    min_val_bpb = loop_state["min_val_bpb"]
-    smooth_train_loss = loop_state["smooth_train_loss"]
-    total_training_time = loop_state["total_training_time"]
-
-# -----------------------------------------------------------------------------
-# Training loop
-while True:
-    last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
-    flops_so_far = num_flops_per_token * total_batch_size * step
-
-    # once in a while: evaluate the val bpb (all ranks participate)
-    if last_step or step % eval_every == 0:
-        model.eval()
-        val_loader = build_val_loader()
-        eval_steps = eval_tokens // (device_batch_size * max_seq_len * ddp_world_size)
-        with autocast_ctx:
-            val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes)
-        print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}")
-        if val_bpb < min_val_bpb:
-            min_val_bpb = val_bpb
-        wandb_run.log({
-            "step": step,
-            "total_training_flops": flops_so_far,
-            "total_training_time": total_training_time,
-            "val/bpb": val_bpb,
-        })
-        model.train()
-
-    # once in a while: estimate the CORE metric (all ranks participate)
-    # use the original uncompiled model because the inputs keep changing shape
-    results = {}
-    if core_metric_every > 0 and (last_step or (step > 0 and step % core_metric_every == 0)):
-        model.eval()
-        with autocast_ctx:
-            results = evaluate_model(orig_model, tokenizer, device, max_per_task=core_metric_max_per_task)
-        print0(f"Step {step:05d} | CORE metric: {results['core_metric']:.4f}")
-        wandb_run.log({
-            "step": step,
-            "total_training_flops": flops_so_far,
-            "core_metric": results["core_metric"],
-            "centered_results": results["centered_results"],
-        })
-        model.train()
-
-    # once in a while: sample from the model (only on master process)
-    # use the original uncompiled model because the inputs keep changing shape
-    if master_process and (last_step or (step > 0 and step % sample_every == 0)):
-        model.eval()
-        prompts = [
-            "The capital of France is",
-            "The chemical symbol of gold is",
-            "If yesterday was Friday, then tomorrow will be",
-            "The opposite of hot is",
-            "The planets of the solar system are:",
-            "My favorite color is",
-            "If 5*x + 3 = 13, then x is",
-        ]
-        engine = Engine(orig_model, tokenizer) # use orig_model to avoid recompilation
-        for prompt in prompts:
-            tokens = tokenizer(prompt, prepend="<|bos|>")
-            with autocast_ctx:
-                sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
-            print0(tokenizer.decode(sample[0]))
-        model.train()
-
-    # save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step
-    if last_step or (step > 0 and step != resume_from_step and save_every > 0 and step % save_every == 0):
-        save_checkpoint(
-            checkpoint_dir,
-            step,
-            orig_model.state_dict(), # model parameters
-            [opt.state_dict() for opt in optimizers], # optimizer states
-            { # metadata saved as json
-                "step": step,
-                "val_bpb": val_bpb, # loss at last step
-                "model_config": model_config_kwargs,
-                "user_config": user_config, # inputs to the training script
-                "device_batch_size": device_batch_size,
-                "max_seq_len": max_seq_len,
-                "dataloader_state_dict": dataloader_state_dict,
-                "loop_state": { # all loop state (other than step) so that we can resume training
-                    "min_val_bpb": min_val_bpb,
-                    "smooth_train_loss": smooth_train_loss,
-                    "total_training_time": total_training_time,
-                },
-            },
-            rank=ddp_rank,
-        )
-
-    # termination conditions (TODO: possibly also add loss explosions etc.)
-    if last_step:
-        break
-
-    # -------------------------------------------------------------------------
-    # single training step
-    # evaluate the gradient
-    synchronize()
-    t0 = time.time()
-    for micro_step in range(grad_accum_steps):
-        with autocast_ctx:
-            loss = model(x, y)
-        train_loss = loss.detach() # for logging
-        loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
-        loss.backward()
-        x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
-    # gradient clipping
-    grad_clip_enabled = grad_clip > 0.0
-    if grad_clip_enabled:
-        grad_norm_tensor = torch.nn.utils.clip_grad_norm_(orig_model.parameters(), grad_clip)
-        grad_norm = grad_norm_tensor.item() # GPU tensor -> CPU float (note: cpu-gpu sync point)
-    # step the optimizers
-    lrm = get_lr_multiplier(step)
-    for opt in optimizers:
-        for group in opt.param_groups:
-            group["lr"] = group["initial_lr"] * lrm
-    muon_momentum = get_muon_momentum(step)
-    for group in muon_optimizer.param_groups:
-        group["momentum"] = muon_momentum
-    for opt in optimizers:
-        opt.step()
-    model.zero_grad(set_to_none=True)
-    synchronize()
-    t1 = time.time()
-    dt = t1 - t0
-    # -------------------------------------------------------------------------
-
-    # logging
-    ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging
-    smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
-    debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
-    pct_done = 100 * step / num_iterations
-    tok_per_sec = int(total_batch_size / dt)
-    flops_per_sec = num_flops_per_token * total_batch_size / dt
-    promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
-    mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
-    if step > 10:
-        total_training_time += dt # only count the time after the first 10 steps
-    print_grad_norm = f" grad norm: {grad_norm:.4f} |" if grad_clip_enabled else ""
-    print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} |{print_grad_norm} lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m")
-    if step % 100 == 0:
-        log_data = {
-            "step": step,
-            "total_training_flops": flops_so_far,
-            "total_training_time": total_training_time,
-            "train/loss": debiased_smooth_loss,
-            "train/lrm": lrm,
-            "train/dt": dt,
-            "train/tok_per_sec": tok_per_sec,
-            "train/mfu": mfu,
-        }
-        if grad_clip_enabled:
-            log_data["train/grad_norm"] = grad_norm
-        wandb_run.log(log_data)
-
-    # state update
-    step += 1
-
-# print a few more stats
-print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
-print0(f"Total training time: {total_training_time/60:.2f}m")
-print0(f"Minimum validation bpb: {min_val_bpb:.4f}")
-
-# Log to report
-from nanochat_moe.report import get_report
-get_report().log(section="Base model training", data=[
-    user_config, # CLI args
-    { # stats about the training setup
-        "Number of parameters": num_params,
-        "Number of FLOPs per token": f"{num_flops_per_token:e}",
-        "Calculated number of iterations": num_iterations,
-        "Number of training tokens": total_tokens,
-        "Tokens : Params ratio": total_batch_size * num_iterations / num_params,
-        "DDP world size": ddp_world_size,
-        "warmup_ratio": warmup_ratio,
-        "warmdown_ratio": warmdown_ratio,
-        "final_lr_frac": final_lr_frac,
-    },
-    { # stats about training outcomes
-        "Minimum validation bpb": min_val_bpb,
-        "Final validation bpb": val_bpb,
-        "CORE metric estimate": results.get("core_metric", None),
-        "MFU %": f"{mfu:.2f}%",
-        "Total training flops": f"{flops_so_far:e}",
-        "Total training time": f"{total_training_time/60:.2f}m",
-        "Peak memory usage": f"{get_max_memory() / 1024 / 1024:.2f}MiB",
-    }
-])
-
-# cleanup
-wandb_run.finish() # wandb run finish
-compute_cleanup()
--- a/scripts_moe/chat_cli.py
+++ b/scripts_moe/chat_cli.py
@ -1,105 +0,0 @@
-"""
-New and upgraded chat mode because a lot of the code has changed since the last one.
-
-Intended to be run single GPU only atm:
-python -m scripts.chat_cli -i mid
-"""
-import argparse
-import torch
-from nanochat.common import compute_init, autodetect_device_type
-from contextlib import nullcontext
-from nanochat.engine import Engine
-from nanochat.checkpoint_manager import load_model
-
-parser = argparse.ArgumentParser(description='Chat with the model')
-parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|mid|rl")
-parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
-parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
-parser.add_argument('-p', '--prompt', type=str, default='', help='Prompt the model, get a single response back')
-parser.add_argument('-t', '--temperature', type=float, default=0.6, help='Temperature for generation')
-parser.add_argument('-k', '--top-k', type=int, default=50, help='Top-k sampling parameter')
-parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
-parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
-args = parser.parse_args()
-
-# Init the model and tokenizer
-
-device_type = autodetect_device_type() if args.device_type == "" else args.device_type
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
-model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)
-
-# Special tokens for the chat state machine
-bos = tokenizer.get_bos_token_id()
-user_start, user_end = tokenizer.encode_special("<|user_start|>"), tokenizer.encode_special("<|user_end|>")
-assistant_start, assistant_end = tokenizer.encode_special("<|assistant_start|>"), tokenizer.encode_special("<|assistant_end|>")
-
-# Create Engine for efficient generation
-engine = Engine(model, tokenizer)
-
-print("\nNanoChat Interactive Mode")
-print("-" * 50)
-print("Type 'quit' or 'exit' to end the conversation")
-print("Type 'clear' to start a new conversation")
-print("-" * 50)
-
-conversation_tokens = [bos]
-
-while True:
-
-    if args.prompt:
-        # Get the prompt from the launch command
-        user_input = args.prompt
-    else:
-        # Get the prompt interactively from the console
-        try:
-            user_input = input("\nUser: ").strip()
-        except (EOFError, KeyboardInterrupt):
-            print("\nGoodbye!")
-            break
-
-    # Handle special commands
-    if user_input.lower() in ['quit', 'exit']:
-        print("Goodbye!")
-        break
-
-    if user_input.lower() == 'clear':
-        conversation_tokens = [bos]
-        print("Conversation cleared.")
-        continue
-
-    if not user_input:
-        continue
-
-    # Add User message to the conversation
-    conversation_tokens.append(user_start)
-    conversation_tokens.extend(tokenizer.encode(user_input))
-    conversation_tokens.append(user_end)
-
-    # Kick off the assistant
-    conversation_tokens.append(assistant_start)
-    generate_kwargs = {
-        "num_samples": 1,
-        "max_tokens": 256,
-        "temperature": args.temperature,
-        "top_k": args.top_k,
-    }
-    response_tokens = []
-    print("\nAssistant: ", end="", flush=True)
-    with autocast_ctx:
-        for token_column, token_masks in engine.generate(conversation_tokens, **generate_kwargs):
-            token = token_column[0] # pop the batch dimension (num_samples=1)
-            response_tokens.append(token)
-            token_text = tokenizer.decode([token])
-            print(token_text, end="", flush=True)
-    print()
-    # we have to ensure that the assistant end token is the last token
-    # so even if generation ends due to max tokens, we have to append it to the end
-    if response_tokens[-1] != assistant_end:
-        response_tokens.append(assistant_end)
-    conversation_tokens.extend(response_tokens)
-
-    # In the prompt mode, we only want a single response and exit
-    if args.prompt:
-        break
--- a/scripts_moe/chat_eval.py
+++ b/scripts_moe/chat_eval.py
@ -1,257 +0,0 @@
-"""
-Evaluate the Chat model.
-All the generic code lives here, and all the evaluation-specific
-code lives in nanochat directory and is imported from here.
-
-Example runs:
-python -m scripts.chat_eval -a ARC-Easy
-torchrun --nproc_per_node=8 -m scripts.chat_eval -- -a ARC-Easy
-"""
-
-import argparse
-from functools import partial
-from contextlib import nullcontext
-
-import torch
-import torch.distributed as dist
-
-from nanochat.common import compute_init, compute_cleanup, get_dist_info, print0, autodetect_device_type
-from nanochat.checkpoint_manager import load_model
-from nanochat.engine import Engine
-
-from tasks.humaneval import HumanEval
-from tasks.mmlu import MMLU
-from tasks.arc import ARC
-from tasks.gsm8k import GSM8K
-from tasks.spellingbee import SpellingBee
-
-# -----------------------------------------------------------------------------
-# Generative evaluation loop (we go one problem at a time, sample, evaluate)
-
-def run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=None):
-
-    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
-    device = model.get_device()
-
-    num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)
-
-    # Run the evaluation
-    num_passed, total = 0, 0
-    for i in range(ddp_rank, num_problems, ddp_world_size):
-        conversation = task_object[i]
-
-        # Tokenize the prompt
-        encoded_prompt = tokenizer.render_for_completion(conversation)
-        # Get the completions
-        results, _ = engine.generate_batch(
-            encoded_prompt,
-            num_samples=num_samples,
-            max_tokens=max_new_tokens,
-            temperature=temperature,
-            top_k=top_k,
-        )
-        # Decode the completions as text
-        prefix_length = len(encoded_prompt)
-        completions = [tokenizer.decode(result_tokens[prefix_length:]) for result_tokens in results]
-        # Evaluate success criteria
-        outcomes = [task_object.evaluate(conversation, completion) for completion in completions]
-        passed = any(outcomes)
-
-        # Keep stats
-        total += 1
-        num_passed += int(passed)
-
-        # Logging (overwrite the same line in the console)
-        print(f"\r\033[KRank {ddp_rank} | {num_passed}/{total} ({100*num_passed/total:.2f}%)", end='', flush=True)
-
-    # Finish the in-place progress line with a newline before final summary
-    print()
-
-    # Aggregate results across all ranks
-    if ddp:
-        num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
-        total_tensor = torch.tensor([total], dtype=torch.long, device=device)
-        dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
-        dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
-        num_passed = num_passed_tensor.item()
-        total = total_tensor.item()
-
-    print0("=" * 50)
-    print0(f"Final: {num_passed}/{total} ({100*num_passed/total:.2f}%)")
-
-    # Return the accuracy
-    return num_passed/total
-
-# -----------------------------------------------------------------------------
-# Categorical evaluation loop
-# A lot easier because we don't have to sample. Therefore, we can actually go
-# batches at a time and just check the logits for correct answer choices.
-
-def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems=None):
-
-    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
-    device = model.get_device()
-    bos = tokenizer.get_bos_token_id() # use BOS as pad token is ok, these positions are ignored
-
-    # We'll process batches of independent problems at a time because there is no sampling needed
-    num_problems = len(task_object) if max_problems is None else min(len(task_object), max_problems)
-    ceil_div = lambda x, y: -(-x // y)
-    num_batches = ceil_div(num_problems, batch_size)
-
-    # Run the evaluation
-    letter_to_id_cache = {} # many letters will repeat often, let's save the tokenizer some work
-    num_passed, total = 0, 0
-    for i in range(ddp_rank, num_batches, ddp_world_size):
-        i0, i1 = i * batch_size, min((i + 1) * batch_size, num_problems)
-
-        # Prepare the batch of problems. They might all be of different length, so we pad/collate them.
-        conversations = [task_object[ii] for ii in range(i0, i1)]
-        prompt_ids = [tokenizer.render_for_completion(conversation) for conversation in conversations] # TODO: remake the way this works
-        max_length = max(len(ids) for ids in prompt_ids)
-        answer_time_positions = [len(ids) - 1 for ids in prompt_ids] # where the last token is (and the predicted answer)
-        padded_prompt_ids = [ids + [bos] * (max_length - len(ids)) for ids in prompt_ids]
-        prompt_ids = torch.tensor(padded_prompt_ids, dtype=torch.long, device=device)
-
-        # Get the logits for the whole batch of conversations in parallel (efficiency win here)
-        with torch.no_grad():
-            logits = model(prompt_ids) # (B, T, V)
-
-        # Focus on the available answer on just the letters corresponding to choices
-        # Note that this helps the evaluation a lot because it specifically narrows the focus to only the available letters
-        # The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
-        # letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
-        for idx, conversation in enumerate(conversations):
-            # get the token ids of all the available letters of this problem
-            letters = conversation['letters']
-            letter_ids = []
-            for letter in letters:
-                if not letter in letter_to_id_cache:
-                    encoded_letter = tokenizer.encode(letter)
-                    assert len(encoded_letter) == 1, "Each letter must be a single token"
-                    letter_to_id_cache[letter] = encoded_letter[0]
-                letter_ids.append(letter_to_id_cache[letter])
-            # focus logits just down to the answer position and the available letters of the answer
-            answer_pos = answer_time_positions[idx]
-            focus_logits = logits[idx, answer_pos, letter_ids]
-            # get the argmax letter (the predicted answer)
-            argmax_letter_id = focus_logits.argmax(dim=-1).item()
-            predicted_letter = letters[argmax_letter_id]
-            # evaluate the outcome
-            outcome = task_object.evaluate(conversation, predicted_letter)
-            num_passed += int(outcome)
-            total += 1
-
-    # Aggregate results across all ranks
-    if ddp:
-        num_passed_tensor = torch.tensor([num_passed], dtype=torch.long, device=device)
-        total_tensor = torch.tensor([total], dtype=torch.long, device=device)
-        dist.all_reduce(num_passed_tensor, op=dist.ReduceOp.SUM)
-        dist.all_reduce(total_tensor, op=dist.ReduceOp.SUM)
-        num_passed = num_passed_tensor.item()
-        total = total_tensor.item()
-
-    average = num_passed/total
-    print0(f"Final: {num_passed}/{total} ({100*average:.2f}%)")
-    return average
-
-# -----------------------------------------------------------------------------
-
-def run_chat_eval(task_name, model, tokenizer, engine,
-                   batch_size=1, num_samples=1, max_new_tokens=512, temperature=0.0, top_k=50,
-                   max_problems=None):
-    # Create the evaluation object
-    task_module = {
-        'HumanEval': HumanEval,
-        'MMLU': partial(MMLU, subset="all", split="test"),
-        'ARC-Easy': partial(ARC, subset="ARC-Easy", split="test"),
-        'ARC-Challenge': partial(ARC, subset="ARC-Challenge", split="test"),
-        'GSM8K': partial(GSM8K, subset="main", split="test"),
-        'SpellingBee': partial(SpellingBee, size=256, split="test"),
-    }[task_name]
-    task_object = task_module()
-    # Run the evaluation
-    if task_object.eval_type == 'generative':
-        acc = run_generative_eval(task_object, tokenizer, model, engine, num_samples, max_new_tokens, temperature, top_k, max_problems=max_problems)
-    elif task_object.eval_type == 'categorical':
-        acc = run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems=max_problems)
-    else:
-        raise ValueError(f"Unsupported task evaluation type: {task_object.eval_type}")
-    return acc
-
-# -----------------------------------------------------------------------------
-if __name__ == "__main__":
-
-    # Parse command-line arguments
-    parser = argparse.ArgumentParser()
-    parser.add_argument('-i', '--source', type=str, required=True, help="Source of the model: sft|mid|rl")
-    parser.add_argument('-a', '--task-name', type=str, default=None, help="Task name. Default = all tasks. Use | to split multiple tasks.")
-    parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
-    parser.add_argument('-t', '--temperature', type=float, default=0.0)
-    parser.add_argument('-m', '--max-new-tokens', type=int, default=512)
-    parser.add_argument('-n', '--num-samples', type=int, default=1)
-    parser.add_argument('-k', '--top-k', type=int, default=50)
-    parser.add_argument('-b', '--batch-size', type=int, default=8, help='Batch size for categorical evaluation')
-    parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
-    parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
-    parser.add_argument('-x', '--max-problems', type=int, default=None, help='Max problems to evaluate')
-    parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
-    args = parser.parse_args()
-
-    device_type = autodetect_device_type() if args.device_type == "" else args.device_type
-    ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-    ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
-
-    model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.step)
-    engine = Engine(model, tokenizer)
-
-    # Get the tasks to evaluate on
-    all_tasks = ['ARC-Easy', 'ARC-Challenge', 'MMLU', 'GSM8K', 'HumanEval', 'SpellingBee']
-    baseline_accuracies = {
-        'ARC-Easy': 0.25, # multiple choice 1 of 4 => 25%
-        'ARC-Challenge': 0.25, # multiple choice 1 of 4 => 25%
-        'MMLU': 0.25, # multiple choice 1 of 4 => 25%
-        'GSM8K': 0.0, # open-ended => 0%
-        'HumanEval': 0.0, # open-ended => 0%
-        'SpellingBee': 0.0, # open-ended => 0%
-    }
-    task_names = all_tasks if args.task_name is None else args.task_name.split('|')
-
-    # Run all the task evaluations sequentially
-    results = {}
-    for task_name in task_names:
-        with autocast_ctx:
-            acc = run_chat_eval(
-                task_name,
-                model, tokenizer, engine,
-                batch_size=args.batch_size,
-                num_samples=args.num_samples,
-                max_new_tokens=args.max_new_tokens,
-                temperature=args.temperature,
-                top_k=args.top_k,
-                max_problems=args.max_problems,
-            )
-            results[task_name] = acc
-            print0(f"{task_name} accuracy: {100 * acc:.2f}%")
-
-    # Log to report
-    from nanochat.report import get_report
-    all_tasks_were_evaluated = all(task_name in results for task_name in all_tasks)
-    # calculate the ChatCORE metric if we can (similar to CORE, it's the mean centered accuracy)
-    # this way, ChatCORE ranges from 0 (at random baseline) to 1 (peak performance)
-    chatcore_metric_dict = {}
-    if all_tasks_were_evaluated:
-        centered_mean = 0
-        for task_name, acc in results.items():
-            baseline_acc = baseline_accuracies.get(task_name, 0.0)
-            centered_acc = (acc - baseline_acc) / (1.0 - baseline_acc)
-            centered_mean += centered_acc
-        chatcore_metric = centered_mean / len(results)
-        chatcore_metric_dict = {"ChatCORE metric": chatcore_metric}
-    get_report().log(section="Chat evaluation " + args.source, data=[
-        vars(args), # CLI args
-        results,
-        chatcore_metric_dict,
-    ])
-
-    compute_cleanup()
--- a/scripts_moe/chat_rl.py
+++ b/scripts_moe/chat_rl.py
@ -1,333 +0,0 @@
-"""
-Reinforcement learning on GSM8K via "GRPO".
-
-I put GRPO in quotes because we actually end up with something a lot
-simpler and more similar to just REINFORCE:
-
-1) Delete trust region, so there is no KL regularization to a reference model
-2) We are on policy, so there's no need for PPO ratio+clip.
-3) We use DAPO style normalization that is token-level, not sequence-level.
-4) Instead of z-score normalization (r - mu)/sigma, only use (r - mu) as the advantage.
-
-1 GPU:
-python -m scripts.chat_rl
-
-8 GPUs:
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=default
-"""
-
-import os
-import itertools
-import re
-import wandb
-import torch
-import torch.distributed as dist
-
-from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, DummyWandb
-from nanochat.checkpoint_manager import save_checkpoint, load_model
-from nanochat.engine import Engine
-from tasks.gsm8k import GSM8K
-
-# RL hyperparameters
-run = "dummy" # wandb run name
-source = "sft" # mid|sft|rl
-model_tag = None # model tag to load the model from (if source="rl")
-step = None # step to load the model from (if source="rl")
-dtype = "bfloat16"
-device_batch_size = 8 # no forward pass will go above this to not OOM
-examples_per_step = 16 # in total and across all ranks (note: examples, not samples/completions!)
-num_samples = 16 # number of samples per example (/question)
-max_new_tokens = 256
-temperature = 1.0
-top_k = 50 # TODO: try None?
-unembedding_lr = 0.004
-embedding_lr = 0.2
-matrix_lr = 0.02
-weight_decay = 0.0
-init_lr_frac = 0.05
-num_epochs = 1 # how many epochs of gsm8k to train on
-save_every = 60 # every how many steps to save the model
-eval_every = 60 # every how many steps to evaluate the model for val pass@k
-eval_examples = 400 # number of examples used for evaluating pass@k
-# now allow CLI to override the settings via the configurator lol
-config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
-exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
-user_config = {k: globals()[k] for k in config_keys} # will be useful for logging
-# -----------------------------------------------------------------------------
-
-# Init compute/precision
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
-master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
-dtype = torch.float32 if dtype == 'float32' else torch.bfloat16
-autocast_ctx = torch.amp.autocast(device_type="cuda", dtype=dtype)
-
-# wandb logging init
-use_dummy_wandb = run == "dummy" or not master_process
-wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-rl", name=run, config=user_config)
-
-# Init model and tokenizer
-model, tokenizer, meta = load_model(source, device, phase="eval", model_tag=model_tag, step=step)
-engine = Engine(model, tokenizer) # for sampling rollouts
-
-# -----------------------------------------------------------------------------
-# Rollout / sampling generator loop that yields batches of examples for training
-
-train_task = GSM8K(subset="main", split="train")
-val_task = GSM8K(subset="main", split="test")
-num_steps = (len(train_task) // examples_per_step) * num_epochs
-print0(f"Calculated number of steps: {num_steps}")
-
-@torch.no_grad()
-def get_batch():
-    assistant_end = tokenizer.encode_special("<|assistant_end|>") # ok to use this token, it's only for padding and isn't used in the loss.
-    rank_indices = range(ddp_rank, len(train_task), ddp_world_size) # each rank is responsible for different examples in the training data
-    for example_idx in itertools.cycle(rank_indices):
-
-        # First get the full conversation of both user and assistant messages
-        conversation = train_task[example_idx]
-
-        # Tokenize the conversation, deleting the last Assistant message and priming the Assistant for a completion instead
-        # (i.e. keep the <|assistant_start|>, but delete everything after it)
-        tokens = tokenizer.render_for_completion(conversation)
-        prefix_length = len(tokens)
-
-        # Generate num_samples samples using batched generation, use loop to avoid OOMs
-        model.eval() # ensure the model is in eval mode
-        generated_token_sequences = []
-        masks = []
-        num_sampling_steps = num_samples // device_batch_size # go sequentially to prevent OOMs
-        for sampling_step in range(num_sampling_steps):
-            seed = hash((step, example_idx, sampling_step)) & 0x7FFFFFFF # positive half of int32
-            with autocast_ctx:
-                generated_token_sequences_batch, masks_batch = engine.generate_batch(
-                    tokens,
-                    num_samples=device_batch_size,
-                    max_tokens=max_new_tokens,
-                    temperature=temperature,
-                    top_k=top_k,
-                    seed=seed, # must make sure to change the seed for each sampling step
-                )
-            generated_token_sequences.extend(generated_token_sequences_batch)
-            masks.extend(masks_batch)
-
-        # Calculate the rewards for each sample
-        rewards = []
-        for sample_tokens in generated_token_sequences:
-            # Get just the generated tokens (after the prompt)
-            generated_tokens = sample_tokens[prefix_length:]
-            # Decode the generated response
-            generated_text = tokenizer.decode(generated_tokens)
-            # Calculate the reward
-            reward = train_task.reward(conversation, generated_text)
-            rewards.append(reward)
-
-        # Pad the sequences so that their lengths (in time) match
-        max_length = max(len(seq) for seq in generated_token_sequences)
-        padded_generated_token_sequences = [seq + [assistant_end] * (max_length - len(seq)) for seq in generated_token_sequences]
-        padded_masks = [mask + [0] * (max_length - len(mask)) for mask in masks]
-        # Stack up the sequences and masks into PyTorch tensors
-        ids = torch.tensor(padded_generated_token_sequences, dtype=torch.long, device=device)
-        mask_ids = torch.tensor(padded_masks, dtype=torch.long, device=device)
-        # Generate autoregressive inputs and targets to the Transformer
-        inputs = ids[:, :-1]
-        targets = ids[:, 1:].clone() # clone to avoid in-place modification:
-        targets[mask_ids[:, 1:] == 0] = -1 # <-- inplace modification right here. -1 is the ignore index
-        # NOTE also that the Engine returns mask=0 for BOTH the prompt tokens AND the tool use tokens.
-        # So we will (correctly) end up not training on the prompt tokens, or the tool use forced tokens.
-        rewards = torch.tensor(rewards, dtype=torch.float, device=device)
-        # Calculate the advantages by simply subtracting the mean (instead of z-score (x-mu)/sigma)
-        mu = rewards.mean()
-        advantages = rewards - mu
-        # yield inputs/targets as (B, T) of ids and rewards as (B,) of floats
-        yield generated_token_sequences, inputs, targets, rewards, advantages
-
-# -----------------------------------------------------------------------------
-# Simple evaluation loop for GSM8K pass@k
-def run_gsm8k_eval(task, tokenizer, engine,
-    max_examples=None,
-    num_samples=1,
-    max_completion_tokens=256,
-    temperature=0.0,
-    top_k=50
-):
-    """
-    Evaluates GSM8K task and returns a list of records of evaluation outcomes.
-    In a distributed setting, all ranks cooperate but this function will NOT
-    do the reduction across ranks. This is the responsibility of the caller.
-    Because the evaluation can take a while, this function will yield records one by one.
-    """
-    max_examples = min(max_examples, len(task)) if max_examples is not None else len(task)
-    for idx in range(ddp_rank, max_examples, ddp_world_size):
-        conversation = task[idx]
-        tokens = tokenizer.render_for_completion(conversation)
-        prefix_length = len(tokens)
-        # Generate k samples using batched generation inside the Engine
-        assert num_samples <= device_batch_size # usually this is true. we can add a loop if not...
-        generated_token_sequences, masks = engine.generate_batch(
-            tokens,
-            num_samples=num_samples,
-            max_tokens=max_completion_tokens,
-            temperature=temperature,
-            top_k=top_k
-        )
-        # Check each sample for correctness
-        outcomes = []
-        for sample_tokens in generated_token_sequences:
-            generated_tokens = sample_tokens[prefix_length:]
-            generated_text = tokenizer.decode(generated_tokens)
-            is_correct = task.evaluate(conversation, generated_text)
-            outcomes.append({
-                "is_correct": is_correct
-            })
-        # A bit bloated because I wanted to do more complex logging at one point.
-        record = {
-            "idx": idx,
-            "outcomes": outcomes,
-        }
-        yield record
-
-# -----------------------------------------------------------------------------
-# Training loop
-
-# Init the optimizer
-optimizers = model.setup_optimizers(
-    unembedding_lr=unembedding_lr,
-    embedding_lr=embedding_lr,
-    matrix_lr=matrix_lr,
-    weight_decay=weight_decay,
-)
-
-# Set the initial learning rate as a fraction of the base learning rate
-for opt in optimizers:
-    for group in opt.param_groups:
-        group["lr"] = group["lr"] * init_lr_frac
-        group["initial_lr"] = group["lr"] # save the initial learning rate so we can decay easily later
-
-# Learning rate scheduler: simple rampdown to zero over num_steps
-def get_lr_multiplier(it):
-    lrm = 1.0 - it / num_steps
-    return lrm
-
-# Calculate the number of examples each rank handles to achieve the desired examples_per_step
-print0(f"Total sequences per step: {examples_per_step * num_samples}") # total batch size in sequences/step
-assert examples_per_step % ddp_world_size == 0, "Desired examples per step must be divisible by the number of ranks"
-examples_per_rank = examples_per_step // ddp_world_size # per GPU
-print0(f"Calculated examples per rank: {examples_per_rank}")
-
-# Kick off the training loop
-batch_iterator = get_batch()
-for step in range(num_steps):
-
-    # Evaluate the model once in a while and log to wandb
-    if step % eval_every == 0:
-        model.eval()
-        passk = torch.zeros(device_batch_size, device=device) # pass@k for k=1..device_batch_size
-        with autocast_ctx:
-            records_iter = run_gsm8k_eval(val_task, tokenizer, engine, num_samples=device_batch_size, max_examples=eval_examples, temperature=1.0)
-            records = list(records_iter) # collect all records
-        for k in range(1, device_batch_size + 1):
-            passk[k - 1] = sum(any(o["is_correct"] for o in r["outcomes"][:k]) for r in records)
-        num_records = torch.tensor(len(records), dtype=torch.long, device=device)
-        if ddp:
-            dist.all_reduce(num_records, op=dist.ReduceOp.SUM)
-            dist.all_reduce(passk, op=dist.ReduceOp.SUM)
-        passk = passk / num_records.item() # normalize by the total number of records
-        print_passk = [f"Pass@{k}: {passk[k - 1].item():.4f}" for k in range(1, device_batch_size + 1)]
-        print0(f"Step {step} | {', '.join(print_passk)}")
-        log_passk = {f"pass@{k}": passk[k - 1].item() for k in range(1, device_batch_size + 1)}
-        wandb_run.log({
-            "step": step,
-            **log_passk,
-        })
-
-    # Forward/Backward on rollouts over multiple examples in the dataset
-    rewards_list = []
-    sequence_lengths = []
-    for example_step in range(examples_per_rank):
-        # Get one batch corresponding to one example in the training dataset
-        sequences_all, inputs_all, targets_all, rewards_all, advantages_all = next(batch_iterator)
-        # Evaluate the loss and gradients
-        model.train() # ensure the model is in train mode
-        # We need one more loop because we can never exceed the device_batch_size
-        assert inputs_all.size(0) % device_batch_size == 0
-        num_passes = inputs_all.size(0) // device_batch_size
-        for pass_idx in range(num_passes):
-            # Pluck out the batch for this pass
-            b0, b1 = pass_idx * device_batch_size, (pass_idx + 1) * device_batch_size
-            inputs = inputs_all[b0:b1]
-            targets = targets_all[b0:b1]
-            rewards = rewards_all[b0:b1]
-            advantages = advantages_all[b0:b1]
-            # Calculate log probabilities. Note that the loss calculates NLL = -logp, so we negate
-            with autocast_ctx:
-                logp = -model(inputs, targets, loss_reduction='none').view_as(inputs) # (B, T)
-            # Calculate the PG objective. Note that ignore_index=-1 ensures that invalid tokens have loss 0.
-            pg_obj = (logp * advantages.unsqueeze(-1)).sum()
-            # normalize by the number of valid tokens, number of passes, and examples_per_rank
-            num_valid = (targets >= 0).sum().clamp(min=1)
-            pg_obj = pg_obj / (num_valid * num_passes * examples_per_rank)
-            # Note, there is no need to add PPO ratio+clip because we are on policy
-            # Finally, formulate the loss that we want to minimize (instead of objective we wish to maximize)
-            loss = -pg_obj
-            loss.backward()
-            print0(f"Step {step}/{num_steps} | Example step {example_step} | Pass {pass_idx} | loss: {loss.item():.6f} | Average reward: {rewards.mean().item()}")
-        # For logging
-        rewards_list.append(rewards_all.mean().item())
-        sequence_lengths.extend(len(seq) for seq in sequences_all)
-
-    # A bunch of logging for how the rollouts went this step
-    mean_reward = sum(rewards_list) / len(rewards_list)
-    mean_sequence_length = sum(sequence_lengths) / len(sequence_lengths)
-    if ddp: # aggregate across ranks
-        mean_reward_tensor = torch.tensor(mean_reward, dtype=torch.float, device=device)
-        mean_sequence_length_tensor = torch.tensor(mean_sequence_length, dtype=torch.float, device=device)
-        dist.all_reduce(mean_reward_tensor, op=dist.ReduceOp.AVG)
-        dist.all_reduce(mean_sequence_length_tensor, op=dist.ReduceOp.AVG)
-        mean_reward = mean_reward_tensor.item()
-        mean_sequence_length = mean_sequence_length_tensor.item()
-    print0(f"Step {step}/{num_steps} | Average reward: {mean_reward} | Average sequence length: {mean_sequence_length:.2f}")
-    wandb_run.log({
-        "step": step,
-        "reward": mean_reward,
-        "sequence_length": mean_sequence_length,
-    })
-
-    # Update the model parameters
-    lrm = get_lr_multiplier(step)
-    for opt in optimizers: # first set the learning rate
-        for group in opt.param_groups:
-            group["lr"] = group["initial_lr"] * lrm
-    for opt in optimizers: # then step the optimizers
-        opt.step()
-    model.zero_grad(set_to_none=True)
-    wandb_run.log({
-        "step": step,
-        "lrm": lrm,
-    })
-
-    # Master process saves the model once in a while. Skip first step. Save last step.
-    if master_process and ((step > 0 and step % save_every == 0) or step == num_steps - 1):
-        base_dir = get_base_dir()
-        depth = model.config.n_layer
-        model_tag = f"d{depth}" # base the model tag on the depth of the base model
-        checkpoint_dir = os.path.join(base_dir, "chatrl_checkpoints", model_tag)
-        model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
-        save_checkpoint(
-            checkpoint_dir,
-            step,
-            model.state_dict(),
-            None, # note: we don't bother to save the optimizer state
-            {
-                "model_config": model_config_kwargs,
-            }
-        )
-        print(f"✅ Saved model checkpoint to {checkpoint_dir}")
-
-# Log to report
-from nanochat.report import get_report
-get_report().log(section="Chat RL", data=[
-    user_config, # CLI args
-])
-
-wandb_run.finish() # wandb run finish
-compute_cleanup()
--- a/scripts_moe/chat_sft.py
+++ b/scripts_moe/chat_sft.py
@ -1,285 +0,0 @@
-"""
-Finetune a base model to be a chat model.
-Run on one GPU e.g. for debugging:
-
-python -m scripts.chat_sft
-
-Or torchrun for training:
-
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft
-"""
-
-import os
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-
-import wandb
-import torch
-import torch.distributed as dist
-from contextlib import nullcontext
-
-from nanochat.common import compute_init, compute_cleanup, get_base_dir, print0, DummyWandb, autodetect_device_type
-from nanochat.checkpoint_manager import load_model
-from nanochat.checkpoint_manager import save_checkpoint
-from nanochat.engine import Engine
-from scripts.chat_eval import run_chat_eval
-
-from tasks.common import TaskMixture
-from tasks.arc import ARC
-from tasks.gsm8k import GSM8K
-from tasks.smoltalk import SmolTalk
-from tasks.customjson import CustomJSON
-from tasks.spellingbee import SimpleSpelling, SpellingBee
-
-# -----------------------------------------------------------------------------
-# SFT Hyperparameters
-run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
-# input model options
-source = "mid" # base|mid , which checkpoint to load the model from (base model or midtrained model)
-model_tag = None # model tag to load the model from (base model or midtrained model)
-step = None # step to load the model from (base model or midtrained model)
-# compute/precision
-device_type = "" # cuda|cpu|mps (empty => autodetect)
-dtype = "bfloat16"
-device_batch_size = 4 # max to avoid OOM
-# optimization
-num_epochs = 1
-num_iterations = -1 # override number of iterations (-1 = disable, use num_epochs to derive it)
-target_examples_per_step = 32
-unembedding_lr = 0.004
-embedding_lr = 0.2
-matrix_lr = 0.02
-weight_decay = 0.0
-init_lr_frac = 0.02
-# evaluation and logging thereof
-eval_every = 100
-eval_steps = 100
-eval_metrics_every = 200
-eval_metrics_max_problems = 1024
-# now allow CLI to override the settings via the configurator lol
-config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
-exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
-user_config = {k: globals()[k] for k in config_keys} # possibly useful for logging
-# -----------------------------------------------------------------------------
-
-# Compute init
-device_type = autodetect_device_type() if device_type == "" else device_type
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-master_process = ddp_rank == 0
-ptdtype = torch.float32 if dtype == 'float32' else torch.bfloat16
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
-
-# wandb logging init
-use_dummy_wandb = run == "dummy" or not master_process
-wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-sft", name=run, config=user_config, save_code=True)
-
-# Load the model and tokenizer
-model, tokenizer, meta = load_model(source, device, phase="train", model_tag=model_tag, step=step)
-orig_model = model # original, uncompiled model
-# model = torch.compile(model, dynamic=True) # doesn't work super well because of variable lengths of inputs
-engine = Engine(model, tokenizer) # will be used for inline model evaluation only
-
-# -----------------------------------------------------------------------------
-# Task data mixture we'll train on
-identity_conversations_filepath = os.path.join(get_base_dir(), "identity_conversations.jsonl")
-train_ds = TaskMixture([
-    ARC(subset="ARC-Easy", split="train"), # 2.3K rows
-    ARC(subset="ARC-Challenge", split="train"), # 1.1K rows
-    GSM8K(subset="main", split="train"), # 8K rows
-    SmolTalk(split="train", stop=10_000), # 10K rows of smoltalk
-    CustomJSON(filepath=identity_conversations_filepath), # 1K rows of synthetic identity conversations
-    SimpleSpelling(size=300, split="train"), # 300 rows of Simple Spelling (e.g. spell the word 'apple')
-    SpellingBee(size=300, split="train"), # 300 rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?)
-]) # 2.3K + 1.1K + 8K + 10K + 1K + 0.3K + 0.3K = 23K rows
-val_ds = SmolTalk(split="test") # general conversations, 24K rows (though we don't actually use all of it)
-
-# -----------------------------------------------------------------------------
-# DataLoader
-
-def sft_data_generator(dataset, batch_size):
-    pad_token_id = tokenizer.encode_special("<|assistant_end|>") # use <|assistant_end|> as the pad token is ok, these positions are masked in the loss
-    # prepares a list of tokenized conversations into a batch and yields
-    def collate_and_yield(batch):
-        nrows = len(batch)
-        ncols = max(len(ids) for ids, mask in batch) - 1 # seq of n creates inputs/targets of n-1
-        inputs = torch.full((nrows, ncols), pad_token_id, dtype=torch.long)
-        targets = torch.full((nrows, ncols), -1, dtype=torch.long) # -1 is ignore index
-        for i, (ids, mask) in enumerate(batch):
-            n = len(ids)
-            ids_tensor = torch.tensor(ids, dtype=torch.long)
-            inputs[i, :n-1] = ids_tensor[:-1]
-            # recall -1 is the ignore index, so mask out targets where mask is 0
-            row_targets = ids_tensor[1:]
-            # mask[1:] omits the mask for the BOS token, which is never a target atm so it's ok
-            mask_tensor = torch.tensor(mask[1:], dtype=torch.long)
-            row_targets[mask_tensor == 0] = -1 # mask out targets where mask is 0
-            targets[i, :n-1] = row_targets
-        inputs = inputs.to(device) # move to device
-        targets = targets.to(device)
-        return inputs, targets
-    # iterates over the dataset in epochs, tokenizes
-    batch = []
-    while True:
-        for i in range(ddp_rank, len(dataset), ddp_world_size):
-            doc = dataset[i]
-            ids, mask = tokenizer.render_conversation(doc)
-            batch.append((ids, mask))
-            if len(batch) == batch_size:
-                yield collate_and_yield(batch)
-                batch = []
-
-examples_per_step = device_batch_size * ddp_world_size
-print0(f"Target examples per step: {target_examples_per_step}")
-print0(f"Device batch size: {device_batch_size}")
-print0(f"Examples per step is device_batch_size * ddp_world_size: {examples_per_step}")
-assert target_examples_per_step % examples_per_step == 0, "Target examples per step must be divisible by examples per step"
-grad_accum_steps = target_examples_per_step // examples_per_step
-print0(f"=> Setting grad accum steps: {grad_accum_steps}")
-
-if num_iterations == -1:
-    # derive num_iterations from num_epochs and the size of the dataset
-    assert num_epochs > 0, "num_epochs must be positive if num_iterations is -1"
-    num_iterations = (len(train_ds) // target_examples_per_step) * num_epochs
-train_loader = sft_data_generator(train_ds, batch_size=device_batch_size)
-build_val_loader = lambda: sft_data_generator(val_ds, batch_size=device_batch_size)
-
-# -----------------------------------------------------------------------------
-# Initialize the Optimizer
-
-optimizers = model.setup_optimizers(
-    unembedding_lr=unembedding_lr,
-    embedding_lr=embedding_lr,
-    matrix_lr=matrix_lr,
-    weight_decay=weight_decay,
-)
-# Set the initial learning rate as a fraction of the base learning rate
-for opt in optimizers:
-    for group in opt.param_groups:
-        group["lr"] = group["lr"] * init_lr_frac
-        group["initial_lr"] = group["lr"] # save the initial learning rate so we can decay easily later
-
-# -----------------------------------------------------------------------------
-# Training loop
-
-# Learning rate scheduler
-def get_lr_multiplier(it):
-    lrm = 1.0 - it / num_iterations
-    return lrm
-
-# Go!
-step = 0
-train_iter = iter(train_loader)
-for step in range(num_iterations):
-    last_step = step == num_iterations - 1
-
-    # evaluate the validation loss
-    if last_step or step % eval_every == 0:
-        model.eval()
-        val_iter = iter(build_val_loader())
-        losses = []
-        for _ in range(eval_steps):
-            val_inputs, val_targets = next(val_iter)
-            with torch.no_grad(), autocast_ctx:
-                loss = model(val_inputs, val_targets)
-            losses.append(loss)
-        val_loss = torch.stack(losses).mean() # average over eval_steps
-        if ddp:
-            dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) # average over ranks
-        val_loss = val_loss.item()
-        print0(f"Step {step:05d} | Validation loss: {val_loss:.6f}")
-        wandb_run.log({
-            "step": step,
-            "val_loss": val_loss,
-        })
-        model.train()
-
-    # evaluate accuracy of the multiple choice tasks (which are quick to run)
-    if last_step or (step > 0 and step % eval_metrics_every == 0):
-        model.eval()
-        metrics = {}
-        with torch.no_grad(), autocast_ctx:
-            # note that because these are inside no_grad, we can usually afford to at least ~2X the batch size
-            metrics["mmlu_acc"] = run_chat_eval("MMLU", model, tokenizer, engine, batch_size=device_batch_size*2, max_problems=eval_metrics_max_problems)
-            metrics["arc_easy_acc"] = run_chat_eval("ARC-Easy", model, tokenizer, engine, batch_size=device_batch_size*2, max_problems=eval_metrics_max_problems)
-        metrics_str = ', '.join(f'{k}: {v:.6f}' for k, v in metrics.items())
-        print0(f"Step {step:05d} | {metrics_str}")
-        wandb_run.log({
-            "step": step,
-            **metrics,
-        })
-        model.train()
-
-    if last_step:
-        break
-
-    # evaluate the gradient
-    num_tokens = torch.tensor(0, device=device) # the number of "active" tokens of supervision seen
-    for micro_step in range(grad_accum_steps):
-        train_inputs, train_targets = next(train_iter)
-        with autocast_ctx:
-            loss = model(train_inputs, train_targets)
-        train_loss = loss.detach() # for logging
-        loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
-        loss.backward() # accumulate the gradient
-        num_tokens += (train_targets >= 0).sum()
-    if ddp:
-        dist.all_reduce(num_tokens, op=dist.ReduceOp.SUM) # sum over ranks
-
-    # learning rate scheduler
-    lrm = get_lr_multiplier(step)
-    for opt in optimizers:
-        for group in opt.param_groups:
-            group["lr"] = group["initial_lr"] * lrm
-
-    # step the optimizers
-    for opt in optimizers:
-        opt.step()
-    model.zero_grad(set_to_none=True)
-
-    # logging
-    train_loss_item = train_loss.item()
-    num_tokens_item = num_tokens.item()
-    print0(f"Step {step:05d}/{num_iterations:05d} | Training loss: {train_loss_item:.6f}| lrm: {lrm:.6f}| num_tokens: {num_tokens_item:,}")
-    wandb_run.log({
-        "step": step,
-        "lrm": lrm,
-        "train_loss": train_loss_item,
-        "num_tokens": num_tokens_item,
-    })
-    step += 1
-
-# Save the model at the end of the run
-if master_process:
-    base_dir = get_base_dir()
-    depth = model.config.n_layer
-    model_tag = f"d{depth}" # base the model tag on the depth of the base model
-    checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", model_tag)
-    model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer
-    save_checkpoint(
-        checkpoint_dir,
-        step,
-        model.state_dict(),
-        None, # note: we don't bother to save the optimizer state
-        {
-            "step": step,
-            "val_loss": val_loss,
-            **metrics,
-            "model_config": model_config_kwargs,
-        }
-    )
-    print(f"✅ Saved model checkpoint to {checkpoint_dir}")
-
-# Log to report
-from nanochat.report import get_report
-get_report().log(section="Chat SFT", data=[
-    user_config, # CLI args
-    {
-        "Training rows": len(train_ds),
-        "Number of iterations": num_iterations,
-        "Training loss": train_loss_item,
-        "Validation loss": val_loss,
-    },
-])
-
-# Cleanup
-wandb_run.finish()
-compute_cleanup()
--- a/scripts_moe/chat_web.py
+++ b/scripts_moe/chat_web.py
@ -1,415 +0,0 @@
-#!/usr/bin/env python3
-"""
-Unified web chat server - serves both UI and API from a single FastAPI instance.
-
-Uses data parallelism to distribute requests across multiple GPUs. Each GPU loads
-a full copy of the model, and incoming requests are distributed to available workers.
-
-Launch examples:
-
- single available GPU (default)
-python -m scripts.chat_web
-
- 4 GPUs
-python -m scripts.chat_web --num-gpus 4
-
-To chat, open the URL printed in the console. (If on cloud box, make sure to use public IP)
-
-Endpoints:
-  GET  /           - Chat UI
-  POST /chat/completions - Chat API (streaming only)
-  GET  /health     - Health check with worker pool status
-  GET  /stats      - Worker pool statistics and GPU utilization
-
-Abuse Prevention:
-  - Maximum 500 messages per request
-  - Maximum 8000 characters per message
-  - Maximum 32000 characters total conversation length
-  - Temperature clamped to 0.0-2.0
-  - Top-k clamped to 1-200
-  - Max tokens clamped to 1-4096
-"""
-
-import argparse
-import json
-import os
-import torch
-import asyncio
-import logging
-import random
-from contextlib import asynccontextmanager
-from fastapi import FastAPI, HTTPException
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import StreamingResponse, HTMLResponse, FileResponse
-from pydantic import BaseModel
-from typing import List, Optional, AsyncGenerator
-from dataclasses import dataclass
-from contextlib import nullcontext
-from nanochat.common import compute_init, autodetect_device_type
-from nanochat.checkpoint_manager import load_model
-from nanochat.engine import Engine
-
-# Abuse prevention limits
-MAX_MESSAGES_PER_REQUEST = 500
-MAX_MESSAGE_LENGTH = 8000
-MAX_TOTAL_CONVERSATION_LENGTH = 32000
-MIN_TEMPERATURE = 0.0
-MAX_TEMPERATURE = 2.0
-MIN_TOP_K = 1
-MAX_TOP_K = 200
-MIN_MAX_TOKENS = 1
-MAX_MAX_TOKENS = 4096
-
-parser = argparse.ArgumentParser(description='NanoChat Web Server')
-parser.add_argument('-n', '--num-gpus', type=int, default=1, help='Number of GPUs to use (default: 1)')
-parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|mid|rl")
-parser.add_argument('-t', '--temperature', type=float, default=0.8, help='Default temperature for generation')
-parser.add_argument('-k', '--top-k', type=int, default=50, help='Default top-k sampling parameter')
-parser.add_argument('-m', '--max-tokens', type=int, default=512, help='Default max tokens for generation')
-parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load')
-parser.add_argument('-s', '--step', type=int, default=None, help='Step to load')
-parser.add_argument('-p', '--port', type=int, default=8000, help='Port to run the server on')
-parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16'])
-parser.add_argument('--device-type', type=str, default='', choices=['cuda', 'cpu', 'mps'], help='Device type for evaluation: cuda|cpu|mps. empty => autodetect')
-parser.add_argument('--host', type=str, default='0.0.0.0', help='Host to bind the server to')
-args = parser.parse_args()
-
-# Configure logging for conversation traffic
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(message)s',
-    datefmt='%Y-%m-%d %H:%M:%S'
-)
-logger = logging.getLogger(__name__)
-
-device_type = autodetect_device_type() if args.device_type == "" else args.device_type
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16
-
-@dataclass
-class Worker:
-    """A worker with a model loaded on a specific GPU."""
-    gpu_id: int
-    device: torch.device
-    engine: Engine
-    tokenizer: object
-    autocast_ctx: torch.amp.autocast
-
-class WorkerPool:
-    """Pool of workers, each with a model replica on a different GPU."""
-
-    def __init__(self, num_gpus: Optional[int] = None):
-        if num_gpus is None:
-            if device_type == "cuda":
-                num_gpus = torch.cuda.device_count()
-            else:
-                num_gpus = 1 # e.g. cpu|mps
-        self.num_gpus = num_gpus
-        self.workers: List[Worker] = []
-        self.available_workers: asyncio.Queue = asyncio.Queue()
-
-    async def initialize(self, source: str, model_tag: Optional[str] = None, step: Optional[int] = None):
-        """Load model on each GPU."""
-        print(f"Initializing worker pool with {self.num_gpus} GPUs...")
-        if self.num_gpus > 1:
-            assert device_type == "cuda", "Only CUDA supports multiple workers/GPUs. cpu|mps does not."
-
-        for gpu_id in range(self.num_gpus):
-
-            if device_type == "cuda":
-                device = torch.device(f"cuda:{gpu_id}")
-                print(f"Loading model on GPU {gpu_id}...")
-            else:
-                device = torch.device(device_type) # e.g. cpu|mps
-                print(f"Loading model on {device_type}...")
-
-            model, tokenizer, _ = load_model(source, device, phase="eval", model_tag=model_tag, step=step)
-            engine = Engine(model, tokenizer)
-            autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
-
-            worker = Worker(
-                gpu_id=gpu_id,
-                device=device,
-                engine=engine,
-                tokenizer=tokenizer,
-                autocast_ctx=autocast_ctx
-            )
-            self.workers.append(worker)
-            await self.available_workers.put(worker)
-
-        print(f"All {self.num_gpus} workers initialized!")
-
-    async def acquire_worker(self) -> Worker:
-        """Get an available worker from the pool."""
-        return await self.available_workers.get()
-
-    async def release_worker(self, worker: Worker):
-        """Return a worker to the pool."""
-        await self.available_workers.put(worker)
-
-class ChatMessage(BaseModel):
-    role: str
-    content: str
-
-class ChatRequest(BaseModel):
-    messages: List[ChatMessage]
-    temperature: Optional[float] = None
-    max_tokens: Optional[int] = None
-    top_k: Optional[int] = None
-
-def validate_chat_request(request: ChatRequest):
-    """Validate chat request to prevent abuse."""
-    # Check number of messages
-    if len(request.messages) == 0:
-        raise HTTPException(status_code=400, detail="At least one message is required")
-    if len(request.messages) > MAX_MESSAGES_PER_REQUEST:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Too many messages. Maximum {MAX_MESSAGES_PER_REQUEST} messages allowed per request"
-        )
-
-    # Check individual message lengths and total conversation length
-    total_length = 0
-    for i, message in enumerate(request.messages):
-        if not message.content:
-            raise HTTPException(status_code=400, detail=f"Message {i} has empty content")
-
-        msg_length = len(message.content)
-        if msg_length > MAX_MESSAGE_LENGTH:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Message {i} is too long. Maximum {MAX_MESSAGE_LENGTH} characters allowed per message"
-            )
-        total_length += msg_length
-
-    if total_length > MAX_TOTAL_CONVERSATION_LENGTH:
-        raise HTTPException(
-            status_code=400,
-            detail=f"Total conversation is too long. Maximum {MAX_TOTAL_CONVERSATION_LENGTH} characters allowed"
-        )
-
-    # Validate role values
-    for i, message in enumerate(request.messages):
-        if message.role not in ["user", "assistant"]:
-            raise HTTPException(
-                status_code=400,
-                detail=f"Message {i} has invalid role. Must be 'user', 'assistant', or 'system'"
-            )
-
-    # Validate temperature
-    if request.temperature is not None:
-        if not (MIN_TEMPERATURE <= request.temperature <= MAX_TEMPERATURE):
-            raise HTTPException(
-                status_code=400,
-                detail=f"Temperature must be between {MIN_TEMPERATURE} and {MAX_TEMPERATURE}"
-            )
-
-    # Validate top_k
-    if request.top_k is not None:
-        if not (MIN_TOP_K <= request.top_k <= MAX_TOP_K):
-            raise HTTPException(
-                status_code=400,
-                detail=f"top_k must be between {MIN_TOP_K} and {MAX_TOP_K}"
-            )
-
-    # Validate max_tokens
-    if request.max_tokens is not None:
-        if not (MIN_MAX_TOKENS <= request.max_tokens <= MAX_MAX_TOKENS):
-            raise HTTPException(
-                status_code=400,
-                detail=f"max_tokens must be between {MIN_MAX_TOKENS} and {MAX_MAX_TOKENS}"
-            )
-
-@asynccontextmanager
-async def lifespan(app: FastAPI):
-    """Load models on all GPUs on startup."""
-    print("Loading nanochat models across GPUs...")
-    app.state.worker_pool = WorkerPool(num_gpus=args.num_gpus)
-    await app.state.worker_pool.initialize(args.source, model_tag=args.model_tag, step=args.step)
-    print(f"Server ready at http://localhost:{args.port}")
-    yield
-
-app = FastAPI(lifespan=lifespan)
-
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=["*"],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-
-@app.get("/")
-async def root():
-    """Serve the chat UI."""
-    ui_html_path = os.path.join("nanochat", "ui.html")
-    with open(ui_html_path, "r", encoding="utf-8") as f:
-        html_content = f.read()
-    # Replace the API_URL to use the same origin
-    html_content = html_content.replace(
-        "const API_URL = `http://${window.location.hostname}:8000`;",
-        "const API_URL = '';"
-    )
-    return HTMLResponse(content=html_content)
-
-
-@app.get("/logo.svg")
-async def logo():
-    """Serve the NanoChat logo for favicon and header."""
-    logo_path = os.path.join("nanochat", "logo.svg")
-    return FileResponse(logo_path, media_type="image/svg+xml")
-
-async def generate_stream(
-    worker: Worker,
-    tokens,
-    temperature=None,
-    max_new_tokens=None,
-    top_k=None
-) -> AsyncGenerator[str, None]:
-    """Generate assistant response with streaming."""
-    temperature = temperature if temperature is not None else args.temperature
-    max_new_tokens = max_new_tokens if max_new_tokens is not None else args.max_tokens
-    top_k = top_k if top_k is not None else args.top_k
-
-    assistant_end = worker.tokenizer.encode_special("<|assistant_end|>")
-    bos = worker.tokenizer.get_bos_token_id()
-
-    # Accumulate tokens to properly handle multi-byte UTF-8 characters (like emojis)
-    accumulated_tokens = []
-    # Track the last complete UTF-8 string (without replacement characters)
-    last_clean_text = ""
-
-    with worker.autocast_ctx:
-        for token_column, token_masks in worker.engine.generate(
-            tokens,
-            num_samples=1,
-            max_tokens=max_new_tokens,
-            temperature=temperature,
-            top_k=top_k,
-            seed=random.randint(0, 2**31 - 1)
-        ):
-            token = token_column[0]
-
-            # Stopping criteria
-            if token == assistant_end or token == bos:
-                break
-
-            # Append the token to sequence
-            accumulated_tokens.append(token)
-            # Decode all accumulated tokens to get proper UTF-8 handling
-            # Note that decode is a quite efficient operation, basically table lookup and string concat
-            current_text = worker.tokenizer.decode(accumulated_tokens)
-            # Only emit text if it doesn't end with a replacement character
-            # This ensures we don't emit incomplete UTF-8 sequences
-            if not current_text.endswith('<EFBFBD>'):
-                # Extract only the new text since last clean decode
-                new_text = current_text[len(last_clean_text):]
-                if new_text:  # Only yield if there's new content
-                    yield f"data: {json.dumps({'token': new_text, 'gpu': worker.gpu_id}, ensure_ascii=False)}\n\n"
-                    last_clean_text = current_text
-
-    yield f"data: {json.dumps({'done': True})}\n\n"
-
-@app.post("/chat/completions")
-async def chat_completions(request: ChatRequest):
-    """Chat completion endpoint (streaming only) - uses worker pool for multi-GPU."""
-
-    # Basic validation to prevent abuse
-    validate_chat_request(request)
-
-    # Log incoming conversation to console
-    logger.info("="*20)
-    for i, message in enumerate(request.messages):
-        logger.info(f"[{message.role.upper()}]: {message.content}")
-    logger.info("-"*20)
-
-    # Acquire a worker from the pool (will wait if all are busy)
-    worker_pool = app.state.worker_pool
-    worker = await worker_pool.acquire_worker()
-
-    try:
-        # Build conversation tokens
-        bos = worker.tokenizer.get_bos_token_id()
-        user_start = worker.tokenizer.encode_special("<|user_start|>")
-        user_end = worker.tokenizer.encode_special("<|user_end|>")
-        assistant_start = worker.tokenizer.encode_special("<|assistant_start|>")
-        assistant_end = worker.tokenizer.encode_special("<|assistant_end|>")
-
-        conversation_tokens = [bos]
-        for message in request.messages:
-            if message.role == "user":
-                conversation_tokens.append(user_start)
-                conversation_tokens.extend(worker.tokenizer.encode(message.content))
-                conversation_tokens.append(user_end)
-            elif message.role == "assistant":
-                conversation_tokens.append(assistant_start)
-                conversation_tokens.extend(worker.tokenizer.encode(message.content))
-                conversation_tokens.append(assistant_end)
-
-        conversation_tokens.append(assistant_start)
-
-        # Streaming response with worker release after completion
-        response_tokens = []
-        async def stream_and_release():
-            try:
-                async for chunk in generate_stream(
-                    worker,
-                    conversation_tokens,
-                    temperature=request.temperature,
-                    max_new_tokens=request.max_tokens,
-                    top_k=request.top_k
-                ):
-                    # Accumulate response for logging
-                    chunk_data = json.loads(chunk.replace("data: ", "").strip())
-                    if "token" in chunk_data:
-                        response_tokens.append(chunk_data["token"])
-                    yield chunk
-            finally:
-                # Log the assistant response to console
-                full_response = "".join(response_tokens)
-                logger.info(f"[ASSISTANT] (GPU {worker.gpu_id}): {full_response}")
-                logger.info("="*20)
-                # Release worker back to pool after streaming is done
-                await worker_pool.release_worker(worker)
-
-        return StreamingResponse(
-            stream_and_release(),
-            media_type="text/event-stream"
-        )
-    except Exception as e:
-        # Make sure to release worker even on error
-        await worker_pool.release_worker(worker)
-        raise e
-
-@app.get("/health")
-async def health():
-    """Health check endpoint."""
-    worker_pool = getattr(app.state, 'worker_pool', None)
-    return {
-        "status": "ok",
-        "ready": worker_pool is not None and len(worker_pool.workers) > 0,
-        "num_gpus": worker_pool.num_gpus if worker_pool else 0,
-        "available_workers": worker_pool.available_workers.qsize() if worker_pool else 0
-    }
-
-@app.get("/stats")
-async def stats():
-    """Get worker pool statistics."""
-    worker_pool = app.state.worker_pool
-    return {
-        "total_workers": len(worker_pool.workers),
-        "available_workers": worker_pool.available_workers.qsize(),
-        "busy_workers": len(worker_pool.workers) - worker_pool.available_workers.qsize(),
-        "workers": [
-            {
-                "gpu_id": w.gpu_id,
-                "device": str(w.device)
-            } for w in worker_pool.workers
-        ]
-    }
-
-if __name__ == "__main__":
-    import uvicorn
-    print(f"Starting NanoChat Web Server")
-    print(f"Temperature: {args.temperature}, Top-k: {args.top_k}, Max tokens: {args.max_tokens}")
-    uvicorn.run(app, host=args.host, port=args.port)
--- a/scripts_moe/mid_train.py
+++ b/scripts_moe/mid_train.py
@ -1,311 +0,0 @@
-"""
-Midtrain the model. Same as pretraining but simpler.
-Run as:
-
-python -m scripts.mid_train
-
-Or torchrun for training:
-
-torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=16
-"""
-
-from collections import deque
-import os
-os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
-import time
-import wandb
-import torch
-from contextlib import nullcontext
-from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type
-from nanochat.tokenizer import get_token_bytes
-from nanochat.checkpoint_manager import save_checkpoint
-from nanochat.loss_eval import evaluate_bpb
-from nanochat.checkpoint_manager import load_model
-import torch.distributed as dist
-
-from tasks.common import TaskMixture
-from tasks.gsm8k import GSM8K
-from tasks.mmlu import MMLU
-from tasks.smoltalk import SmolTalk
-from tasks.customjson import CustomJSON
-from tasks.spellingbee import SimpleSpelling, SpellingBee
-
-# -----------------------------------------------------------------------------
-run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
-device_type = "" # cuda|cpu|mps (empty => autodetect)
-model_tag = None # model tag to load the model from (base model or midtrained model)
-step = None # step to load the model from (base model or midtrained model)
-dtype = "bfloat16"
-num_iterations = -1 # explicit number of steps of the optimization (-1 = disable)
-max_seq_len = 2048
-device_batch_size = 32
-unembedding_lr = 0.004
-embedding_lr = 0.2
-matrix_lr = 0.02
-init_lr_frac = 1.0 # initial learning rate is this fraction of the base learning rate
-weight_decay = 0.0
-eval_every = 150 # -1 = disable
-eval_tokens = 20*524288
-total_batch_size = 524288
-dry_run = 0 # dry_run=1 is for experiments: we will log to wandb but we won't write checkpoints or report
-config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
-exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
-user_config = {k: globals()[k] for k in config_keys} # possibly useful for logging
-# -----------------------------------------------------------------------------
-
-# Compute init
-device_type = autodetect_device_type() if device_type == "" else device_type
-ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
-master_process = ddp_rank == 0
-autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
-synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
-get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
-
-# wandb logging init
-use_dummy_wandb = run == "dummy" or not master_process
-wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-mid", name=run, config=user_config)
-
-# Load the model and tokenizer
-model, tokenizer, meta = load_model("base", device, phase="train", model_tag=model_tag, step=step)
-pretrain_batch_size = meta.get("device_batch_size", None)
-if pretrain_batch_size is not None and device_batch_size > pretrain_batch_size:
-    print0(f"FOOTGUN WARNING: base model training used device_batch_size {pretrain_batch_size}, did you pass in a good --device_batch_size to this script?")
-orig_model = model
-model = torch.compile(model, dynamic=False)
-depth = model.config.n_layer
-num_flops_per_token = model.estimate_flops()
-tokens_per_fwdbwd = device_batch_size * max_seq_len # tokens per iteration for a single rank
-world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
-assert total_batch_size % world_tokens_per_fwdbwd == 0
-grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd
-print0(f"Tokens / micro-batch / rank: {device_batch_size} x {max_seq_len} = {tokens_per_fwdbwd:,}")
-print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
-print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
-token_bytes = get_token_bytes(device=device)
-
-# Initialize the Optimizer (Muon for Linear layers, AdamW for embedding and lm_head)
-optimizers = model.setup_optimizers(unembedding_lr=unembedding_lr, embedding_lr=embedding_lr, matrix_lr=matrix_lr, weight_decay=weight_decay)
-adamw_optimizer, muon_optimizer = optimizers
-# Override the initial learning rate as a fraction of the base learning rate
-for opt in optimizers:
-    for group in opt.param_groups:
-        group["lr"] = group["lr"] * init_lr_frac
-        group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later
-
-# Midtraining data mixture and DataLoader
-base_dir = get_base_dir()
-identity_conversations_filepath = os.path.join(base_dir, "identity_conversations.jsonl")
-train_dataset = TaskMixture([
-    SmolTalk(split="train"), # 460K rows of general conversations
-    MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple choice problems drawn from ARC, MC_TEST, OBQA, RACE
-    GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use
-    CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations
-    CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these
-    SimpleSpelling(size=200000, split="train"), # 200K rows of Simple Spelling (e.g. spell the word 'apple')
-    SpellingBee(size=80000, split="train"), # 80K rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?)
-]) # total: 460K + 100K + 8K + 200K + 80K = 848K rows
-val_dataset = TaskMixture([
-    SmolTalk(split="test"), # 24K rows in test set
-    MMLU(subset="all", split="test", stop=5200), # 14K rows in test set, use only 5.2K to match the train ratios
-    GSM8K(subset="main", split="test", stop=420), # 1.32K rows in test set, use only 420 to match the train ratios
-]) # total: 24K + 14K + 1.32K ~= 39K rows
-# DataLoader is defined here, it emits inputs, targets : 2D tensors of shape (device_batch_size, max_seq_len)
-# A big problem is that we don't know the final num_iterations in advance. So we create
-# these two global variables and update them from within the data generator.
-last_step = False # we will toggle this to True when we reach the end of the dataset
-approx_progress = 0.0 # will go from 0 to 1 over the course of the epoch
-def mid_data_generator(split):
-    global last_step, approx_progress
-    assert split in {"train", "val"}, "split must be 'train' or 'val'"
-    dataset = train_dataset if split == "train" else val_dataset
-    dataset_size = len(dataset)
-    assert dataset_size > 0
-    needed_tokens = device_batch_size * max_seq_len + 1 # to form one training batch of inputs,targets
-    token_buffer = deque()
-    # CUDA supports memory pinning for faster transfers between CPU and GPU:
-    scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=(device_type == "cuda"))
-    cursor = ddp_rank # increments by ddp_world_size each time, so each rank processes unique documents
-    it = 0 # iteration counter
-    while True:
-        # Accumulate enough tokens for one iteration before yielding
-        while len(token_buffer) < needed_tokens:
-            conversation = dataset[cursor]
-            ids, _ = tokenizer.render_conversation(conversation)
-            token_buffer.extend(ids)
-            cursor += ddp_world_size
-            if cursor >= dataset_size:
-                cursor -= dataset_size # wrap around for another epoch
-                if split == "train":
-                    last_step = True # toggle last_step to True, which will terminate the training loop
-        # Stopping condition to respect num_iterations, if given
-        it += 1
-        if num_iterations > 0 and it >= num_iterations:
-            last_step = True # toggle last_step to True, which will terminate the training loop
-        # Build up inputs/targets and yield
-        for i in range(needed_tokens):
-            scratch[i] = token_buffer.popleft()
-        inputs_cpu = scratch[:-1].to(dtype=torch.int32)
-        targets_cpu = scratch[1:]
-        inputs = inputs_cpu.view(device_batch_size, max_seq_len).to(device=device, dtype=torch.int32, non_blocking=True)
-        targets = targets_cpu.view(device_batch_size, max_seq_len).to(device=device, dtype=torch.int64, non_blocking=True)
-        if split == "train":
-            if num_iterations > 0:
-                approx_progress = it / num_iterations # calculate progress from the max number of iterations
-            else:
-                approx_progress = cursor / dataset_size # approximate progress as a fraction of the dataset
-        yield inputs, targets
-
-train_loader = mid_data_generator("train")
-build_val_loader = lambda: mid_data_generator("val")
-progress = 0 # will go from 0 to 1 over the course of the epoch
-
-# Learning rate scheduler
-def get_lr_multiplier(progress):
-    # first 80% of training: no decay, then linearly ramp down to 0.
-    return 1 if progress < 0.8 else 1 - (progress - 0.8) / 0.2
-
-# Momentum scheduler for Muon optimizer
-def get_muon_momentum(it):
-    frac = min(it / 300, 1)
-    momentum = (1 - frac) * 0.85 + frac * 0.95
-    return momentum
-
-# -----------------------------------------------------------------------------
-# Training loop
-x, y = next(train_loader) # prefetch the very first batch of data
-min_val_bpb = float("inf")
-smooth_train_loss = 0 # EMA of training loss
-ema_beta = 0.9 # EMA decay factor
-total_training_time = 0 # total wall-clock time of training
-step = 0
-while True:
-    flops_so_far = num_flops_per_token * total_batch_size * step
-
-    # Synchronize last_step across all ranks to avoid hangs in the distributed setting
-    if ddp:
-        last_step_tensor = torch.tensor(last_step, dtype=torch.int32, device=device)
-        dist.all_reduce(last_step_tensor, op=dist.ReduceOp.MAX)
-        last_step = bool(last_step_tensor.item())
-
-    # once in a while: evaluate the val bpb (all ranks participate)
-    if eval_every > 0 and (last_step or step % eval_every == 0):
-        model.eval()
-        val_loader = build_val_loader()
-        eval_steps = eval_tokens // (device_batch_size * max_seq_len * ddp_world_size)
-        with autocast_ctx:
-            val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes)
-        print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}")
-        if val_bpb < min_val_bpb:
-            min_val_bpb = val_bpb
-        wandb_run.log({
-            "step": step,
-            "total_training_flops": flops_so_far,
-            "total_training_time": total_training_time,
-            "val/bpb": val_bpb,
-        })
-        model.train()
-
-    # save checkpoint at the end of the run (only on master process)
-    if master_process and last_step and not dry_run:
-        output_dirname = f"d{depth}" # e.g. d12
-        checkpoint_dir = os.path.join(base_dir, "mid_checkpoints", output_dirname)
-        save_checkpoint(
-            checkpoint_dir,
-            step,
-            orig_model.state_dict(),
-            [opt.state_dict() for opt in optimizers], # TODO: make sure saving across ranks is done correctly
-            {
-                "step": step,
-                "val_bpb": val_bpb, # loss at last step
-                "model_config": {
-                    "sequence_len": max_seq_len,
-                    "vocab_size": tokenizer.get_vocab_size(),
-                    "n_layer": depth,
-                    "n_head": model.config.n_head,
-                    "n_kv_head": model.config.n_kv_head,
-                    "n_embd": model.config.n_embd,
-                },
-                "user_config": user_config, # inputs to the training script
-            }
-        )
-
-    if last_step:
-        break
-
-    # -------------------------------------------------------------------------
-    # single training step
-    # evaluate the gradient
-    synchronize()
-    t0 = time.time()
-    for micro_step in range(grad_accum_steps):
-        with autocast_ctx:
-            loss = model(x, y)
-        train_loss = loss.detach() # for logging
-        loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
-        loss.backward()
-        x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
-        progress = max(progress, approx_progress) # only increase progress monotonically
-    # step the optimizers
-    lrm = get_lr_multiplier(progress)
-    for opt in optimizers:
-        for group in opt.param_groups:
-            group["lr"] = group["initial_lr"] * lrm
-    muon_momentum = get_muon_momentum(step)
-    for group in muon_optimizer.param_groups:
-        group["momentum"] = muon_momentum
-    for opt in optimizers:
-        opt.step()
-    model.zero_grad(set_to_none=True)
-    synchronize()
-    t1 = time.time()
-    dt = t1 - t0
-    # -------------------------------------------------------------------------
-
-    # State
-    step += 1
-
-    # logging
-    smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss
-    debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
-    pct_done = 100 * progress
-    tok_per_sec = int(total_batch_size / dt)
-    flops_per_sec = num_flops_per_token * total_batch_size / dt
-    promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
-    mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
-    if step > 10:
-        total_training_time += dt # only count the time after the first 10 steps
-    print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m")
-    if step % 10 == 0:
-        wandb_run.log({
-            "step": step,
-            "total_training_flops": flops_so_far,
-            "total_training_time": total_training_time,
-            "train/loss": debiased_smooth_loss,
-            "train/lrm": lrm,
-            "train/dt": dt,
-            "train/tok_per_sec": tok_per_sec,
-            "train/mfu": mfu,
-        })
-
-# print a few more stats
-print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
-print0(f"Total training time: {total_training_time/60:.2f}m")
-print0(f"Minimum validation bpb: {min_val_bpb:.4f}")
-
-# Log to report
-if not dry_run:
-    from nanochat.report import get_report
-    get_report().log(section="Midtraining", data=[
-        user_config, # CLI args
-        { # stats about the training setup
-            "Number of iterations": step,
-            "DDP world size": ddp_world_size,
-        },
-        { # stats about training outcomes
-            "Minimum validation bpb": min_val_bpb,
-        }
-    ])
-
-# cleanup
-wandb_run.finish() # wandb run finish
-compute_cleanup()
--- a/scripts_moe/quick_infer.py
+++ b/scripts_moe/quick_infer.py
@ -1,235 +0,0 @@
-"""
-Quick local inference for nanochat-MoE checkpoints (pretrain-style, plain text).
-
-Example:
-  uv run python scripts_moe/quick_infer.py --model-tag d20 --prompt "what's 1+1 equal to?"
-  uv run python scripts_moe/quick_infer.py --hf-path hf-export/moe_gpt2 --prompt "what's 1+1 equal to?"
-"""
-
-import argparse
-import json
-import os
-import sys
-
-import torch
-
-REPO_ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
-if REPO_ROOT not in sys.path:
-    sys.path.insert(0, REPO_ROOT)
-
-import nanochat_moe.manager as _moe_manager
-sys.modules.setdefault("manager", _moe_manager)
-
-from nanochat_moe.checkpoint_manager import find_last_step, find_largest_model
-from nanochat_moe.common import get_base_dir
-from nanochat_moe.standard import GPT, GPTConfig
-from nanochat_moe.tokenizer import RustBPETokenizer, get_tokenizer
-
-
-def pad_vocab_weights(state_dict, target_vocab):
-    for key in ("transformer.wte.weight", "lm_head.weight"):
-        if key not in state_dict:
-            continue
-        tensor = state_dict[key]
-        if tensor.size(0) >= target_vocab:
-            continue
-        pad_rows = target_vocab - tensor.size(0)
-        pad = tensor.new_zeros((pad_rows, tensor.size(1)))
-        state_dict[key] = torch.cat([tensor, pad], dim=0)
-
-
-def load_tokenizer(mode):
-    if mode == "gpt2":
-        return RustBPETokenizer.from_pretrained("gpt2")
-    if mode == "cache":
-        return get_tokenizer()
-    raise ValueError(f"Unknown tokenizer mode: {mode}")
-
-
-def load_hf_model(hf_path, device):
-    from transformers import AutoModelForCausalLM, AutoTokenizer
-
-    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
-    tokenizer = AutoTokenizer.from_pretrained(hf_path, trust_remote_code=True)
-    model.to(device)
-    model.eval()
-    vocab_limit = None
-    if tokenizer.vocab_size < model.config.vocab_size:
-        vocab_limit = tokenizer.vocab_size
-        print(
-            f"Warning: tokenizer vocab {tokenizer.vocab_size} < model vocab {model.config.vocab_size}; "
-            "clamping logits to tokenizer vocab."
-        )
-    return model, tokenizer, vocab_limit
-
-
-def load_standard_model(source, device, model_tag, step, tokenizer):
-    base_dir = get_base_dir()
-    checkpoint_root = os.path.join(base_dir, f"{source}_checkpoints")
-    if model_tag is None:
-        model_tag = find_largest_model(checkpoint_root)
-    checkpoint_dir = os.path.join(checkpoint_root, model_tag)
-    if step is None:
-        step = find_last_step(checkpoint_dir)
-
-    model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
-    meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    model_data = torch.load(model_path, map_location=device)
-    if device.type in {"cpu", "mps"}:
-        model_data = {
-            k: (v.float() if isinstance(v, torch.Tensor) and v.dtype == torch.bfloat16 else v)
-            for k, v in model_data.items()
-        }
-    model_data = {k.removeprefix("_orig_mod."): v for k, v in model_data.items()}
-    with open(meta_path, "r", encoding="utf-8") as f:
-        meta = json.load(f)
-
-    cfg_kwargs = meta["model_config"]
-    tok_vocab = tokenizer.get_vocab_size()
-    cfg_vocab = cfg_kwargs.get("vocab_size")
-    vocab_limit = None
-    if tok_vocab > cfg_vocab:
-        print(
-            f"Warning: tokenizer vocab {tok_vocab} > model vocab {cfg_vocab}; "
-            "padding embeddings to match."
-        )
-        cfg_kwargs = dict(cfg_kwargs)
-        cfg_kwargs["vocab_size"] = tok_vocab
-        pad_vocab_weights(model_data, tok_vocab)
-    elif tok_vocab < cfg_vocab:
-        vocab_limit = tok_vocab
-        print(
-            f"Warning: tokenizer vocab {tok_vocab} < model vocab {cfg_vocab}; "
-            "clamping logits to tokenizer vocab."
-        )
-
-    model = GPT(GPTConfig(**cfg_kwargs))
-    model.load_state_dict(model_data, strict=True)
-    model.to(device)
-    model.eval()
-    return model, tokenizer, meta, model_tag, step, vocab_limit
-
-
-def build_prompt_tokens(tokenizer, question: str):
-    prompt = f"Question: {question}\nAnswer:"
-    bos = tokenizer.get_bos_token_id()
-    return tokenizer.encode(prompt, prepend=bos), prompt
-
-
-def build_prompt_tokens_hf(tokenizer, question: str):
-    prompt = f"Question: {question}\nAnswer:"
-    ids = tokenizer.encode(prompt, add_special_tokens=False)
-    bos_id = getattr(tokenizer, "bos_token_id", None)
-    if bos_id is not None:
-        ids = [bos_id] + ids
-    return ids, prompt
-
-
-def sample_next_token(logits, temperature, top_k):
-    if top_k is not None:
-        v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-        logits = torch.where(
-            logits < v[:, [-1]],
-            torch.full_like(logits, -float("inf")),
-            logits,
-        )
-    if temperature <= 0:
-        return torch.argmax(logits, dim=-1, keepdim=True)
-    logits = logits / temperature
-    probs = torch.softmax(logits, dim=-1)
-    return torch.multinomial(probs, num_samples=1)
-
-
-@torch.inference_mode()
-def generate_tokens(model, input_ids, max_new_tokens, temperature, top_k, vocab_limit):
-    for _ in range(max_new_tokens):
-        idx_cond = (
-            input_ids
-            if input_ids.size(1) <= model.config.block_size
-            else input_ids[:, -model.config.block_size :]
-        )
-        logits, _ = model(idx_cond)
-        logits = logits[:, -1, :]
-        if vocab_limit is not None and vocab_limit < logits.size(-1):
-            logits = logits[:, :vocab_limit]
-        next_token = sample_next_token(logits, temperature, top_k)
-        input_ids = torch.cat((input_ids, next_token), dim=1)
-    return input_ids
-
-
-@torch.inference_mode()
-def generate_tokens_hf(model, input_ids, max_new_tokens, temperature, top_k, vocab_limit):
-    for _ in range(max_new_tokens):
-        logits = model(input_ids).logits
-        logits = logits[:, -1, :]
-        if vocab_limit is not None and vocab_limit < logits.size(-1):
-            logits = logits[:, :vocab_limit]
-        next_token = sample_next_token(logits, temperature, top_k)
-        input_ids = torch.cat((input_ids, next_token), dim=1)
-    return input_ids
-
-
-@torch.inference_mode()
-def main():
-    parser = argparse.ArgumentParser(description="Run a quick nanochat-MoE inference")
-    parser.add_argument("--source", type=str, default="base", choices=["base", "mid", "sft", "rl"])
-    parser.add_argument("--model-tag", type=str, default="d20")
-    parser.add_argument("--step", type=int, default=None)
-    parser.add_argument("--hf-path", type=str, default=None)
-    parser.add_argument("--prompt", type=str, default="what's 1+1 equal to?")
-    parser.add_argument("--max-tokens", type=int, default=64)
-    parser.add_argument("--temperature", type=float, default=0.0)
-    parser.add_argument("--top-k", type=int, default=None)
-    parser.add_argument("--tokenizer", type=str, default="gpt2", choices=["gpt2", "cache"])
-    args = parser.parse_args()
-
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-    if args.hf_path:
-        model, tokenizer, vocab_limit = load_hf_model(args.hf_path, device)
-        prompt_tokens, prompt_text = build_prompt_tokens_hf(tokenizer, args.prompt)
-        input_ids = torch.tensor([prompt_tokens], dtype=torch.long, device=device)
-        output_ids = generate_tokens_hf(
-            model,
-            input_ids,
-            max_new_tokens=args.max_tokens,
-            temperature=args.temperature,
-            top_k=args.top_k,
-            vocab_limit=vocab_limit,
-        )
-        generated_tokens = output_ids[0].tolist()[len(prompt_tokens) :]
-        output_text = tokenizer.decode(generated_tokens)
-        print(f"Loaded HF model: {args.hf_path}")
-    else:
-        tokenizer = load_tokenizer(args.tokenizer)
-        model, tokenizer, _, resolved_tag, resolved_step, vocab_limit = load_standard_model(
-            args.source, device, args.model_tag, args.step, tokenizer
-        )
-        prompt_tokens, prompt_text = build_prompt_tokens(tokenizer, args.prompt)
-        if len(prompt_tokens) > model.config.block_size:
-            print(
-                f"Warning: prompt length {len(prompt_tokens)} exceeds block_size "
-                f"{model.config.block_size}; context will be truncated."
-            )
-
-        input_ids = torch.tensor([prompt_tokens], dtype=torch.long, device=device)
-        output_ids = generate_tokens(
-            model,
-            input_ids,
-            max_new_tokens=args.max_tokens,
-            temperature=args.temperature,
-            top_k=args.top_k,
-            vocab_limit=vocab_limit,
-        )
-        generated_tokens = output_ids[0].tolist()[len(prompt_tokens) :]
-        output_text = tokenizer.decode(generated_tokens)
-        print(f"Loaded: {args.source}/{resolved_tag} step {resolved_step}")
-    # print("Prompt:", prompt_text)
-    # print("Output:", output_text)
-    print("===============Prompt===============")
-    print(prompt_text)
-    print("===============Output===============")
-    print(output_text)
-
-
-if __name__ == "__main__":
-    main()
--- a/scripts_moe/tok_eval.py
+++ b/scripts_moe/tok_eval.py
@ -1,265 +0,0 @@
-"""
-Evaluate compression ratio of the tokenizer.
-"""
-
-from nanochat.tokenizer import get_tokenizer, RustBPETokenizer
-from nanochat.dataset import parquets_iter_batched
-
-# Random text I got from a random website this morning
-news_text = r"""
-(Washington, D.C., July 9, 2025)- Yesterday, Mexico’s National Service of Agro-Alimentary Health, Safety, and Quality (SENASICA) reported a new case of New World Screwworm (NWS) in Ixhuatlan de Madero, Veracruz in Mexico, which is approximately 160 miles northward of the current sterile fly dispersal grid, on the eastern side of the country and 370 miles south of the U.S./Mexico border. This new northward detection comes approximately two months after northern detections were reported in Oaxaca and Veracruz, less than 700 miles away from the U.S. border, which triggered the closure of our ports to Mexican cattle, bison, and horses on May 11, 2025.
-
-While USDA announced a risk-based phased port re-opening strategy for cattle, bison, and equine from Mexico beginning as early as July 7, 2025, this newly reported NWS case raises significant concern about the previously reported information shared by Mexican officials and severely compromises the outlined port reopening schedule of five ports from July 7-September 15. Therefore, in order to protect American livestock and our nation’s food supply, Secretary Rollins has ordered the closure of livestock trade through southern ports of entry effective immediately.
-
-“The United States has promised to be vigilant — and after detecting this new NWS case, we are pausing the planned port reopening’s to further quarantine and target this deadly pest in Mexico. We must see additional progress combatting NWS in Veracruz and other nearby Mexican states in order to reopen livestock ports along the Southern border,” said U.S. Secretary of Agriculture Brooke L. Rollins. “Thanks to the aggressive monitoring by USDA staff in the U.S. and in Mexico, we have been able to take quick and decisive action to respond to the spread of this deadly pest.”
-""".strip()
-
-# Random Korean text (to test non-English compression)
-korean_text = r"""
-정직한 사실 위에, 공정한 시선을 더하다
-Herald Korea Times
-
-헤럴드코리아타임즈는 정치, 경제, 사회, 문화 등 한국 사회 전반의 주요 이슈를 심도 있게 다루는 종합 온라인 신문사입니다.
-
-우리는 단순히 뉴스를 전달하는 것이 아니라, 사실(Fact)에 기반한 양측의 시각을 균형 있게 조명하며, 독자 여러분이 스스로 판단할 수 있는 ‘정보의 균형’을 제공합니다.
-
-한국 언론의 오랜 문제로 지적되어 온 정치적 편향, 이념적 왜곡에서 벗어나
-오직 정직함과 공정함을 원칙으로 삼는 언론을 지향합니다.
-어느 한쪽의 주장만을 확대하거나 감추지 않고,
-**모든 쟁점에 대해 ‘무엇이 쟁점인지’, ‘누가 무엇을 주장하는지’, ‘사실은 무엇인지’**를 명확히 전달하는 데 집중합니다.
-""".strip()
-
-# Random piece of code
-code_text = r"""
-class BasicTokenizer(Tokenizer):
-
-    def __init__(self):
-        super().__init__()
-
-    def train(self, text, vocab_size, verbose=False):
-        assert vocab_size >= 256
-        num_merges = vocab_size - 256
-
-        # input text preprocessing
-        text_bytes = text.encode("utf-8") # raw bytes
-        ids = list(text_bytes) # list of integers in range 0..255
-
-        # iteratively merge the most common pairs to create new tokens
-        merges = {} # (int, int) -> int
-        vocab = {idx: bytes([idx]) for idx in range(256)} # int -> bytes
-        for i in range(num_merges):
-            # count up the number of times every consecutive pair appears
-            stats = get_stats(ids)
-            # find the pair with the highest count
-            pair = max(stats, key=stats.get)
-            # mint a new token: assign it the next available id
-            idx = 256 + i
-            # replace all occurrences of pair in ids with idx
-            ids = merge(ids, pair, idx)
-            # save the merge
-            merges[pair] = idx
-            vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
-            # prints
-            if verbose:
-                print(f"merge {i+1}/{num_merges}: {pair} -> {idx} ({vocab[idx]}) had {stats[pair]} occurrences")
-""".strip()
-
-math_text = r"""
-\documentclass[12pt]{article}
-\usepackage{amsmath,amsthm,amssymb}
-\usepackage[margin=1in]{geometry}
-
-\newtheorem{theorem}{Theorem}
-\newtheorem*{remark}{Remark}
-
-\begin{document}
-
-\begin{center}
-{\Large A Cute Identity: The Sum of Cubes is a Square}
-\end{center}
-
-\begin{theorem}
-For every integer $n \ge 1$,
-\[
-\sum_{k=1}^{n} k^{3} \;=\; \left(\frac{n(n+1)}{2}\right)^{2}.
-\]
-\end{theorem}
-
-\begin{proof}[Proof 1 (Induction)]
-Let $S(n) = \sum_{k=1}^{n} k^3$. For $n=1$, $S(1)=1=(1\cdot 2/2)^2$, so the base case holds.
-
-Assume $S(n)=\big(\tfrac{n(n+1)}{2}\big)^2$ for some $n\ge 1$.
-Then
-\[
-S(n+1)
-= S(n) + (n+1)^3
-= \left(\frac{n(n+1)}{2}\right)^2 + (n+1)^3.
-\]
-Factor out $(n+1)^2$:
-\[
-S(n+1)
-= (n+1)^2\left( \frac{n^2}{4} + (n+1) \right)
-= (n+1)^2\left( \frac{n^2 + 4n + 4}{4} \right)
-= (n+1)^2\left( \frac{(n+2)^2}{4} \right).
-\]
-Thus
-\[
-S(n+1)=\left(\frac{(n+1)(n+2)}{2}\right)^2,
-\]
-which matches the claimed formula with $n$ replaced by $n+1$. By induction, the identity holds for all $n\ge 1$.
-\end{proof}
-
-\begin{proof}[Proof 2 (Algebraic telescoping)]
-Recall the binomial identity
-\[
-(k+1)^4 - k^4 = 4k^3 + 6k^2 + 4k + 1.
-\]
-Summing both sides from $k=0$ to $n$ telescopes:
-\[
-(n+1)^4 - 0^4
-= \sum_{k=0}^{n}\big(4k^3 + 6k^2 + 4k + 1\big)
-= 4\sum_{k=1}^{n}k^3 + 6\sum_{k=1}^{n}k^2 + 4\sum_{k=1}^{n}k + (n+1).
-\]
-Using the standard sums
-\[
-\sum_{k=1}^{n}k = \frac{n(n+1)}{2}
-\quad\text{and}\quad
-\sum_{k=1}^{n}k^2 = \frac{n(n+1)(2n+1)}{6},
-\]
-solve for $\sum_{k=1}^{n}k^3$ to get
-\[
-\sum_{k=1}^{n}k^3 = \left(\frac{n(n+1)}{2}\right)^2.
-\]
-\end{proof}
-
-\begin{remark}
-Geometrically, the identity says: ``adding up $1^3,2^3,\dots,n^3$ builds a perfect square’’—namely the square of the $n$th triangular number. This is why one sometimes calls it the \emph{sum-of-cubes is a square} phenomenon.
-\end{remark}
-
-\end{document}
-""".strip()
-
-science_text = r"""
-Photosynthesis is a photochemical energy transduction process in which light-harvesting pigment–protein complexes within the thylakoid membranes of oxygenic phototrophs absorb photons and initiate charge separation at the reaction center, driving the linear electron transport chain from water to NADP⁺ via photosystem II, the cytochrome b₆f complex, and photosystem I, concomitantly generating a trans-thylakoid proton motive force utilized by chloroplastic ATP synthase. The light-dependent reactions produce ATP and NADPH, which fuel the Calvin–Benson–Bassham cycle in the stroma, wherein ribulose-1,5-bisphosphate is carboxylated by ribulose-1,5-bisphosphate carboxylase/oxygenase (RuBisCO) to form 3-phosphoglycerate, subsequently reduced and regenerated through a series of enzymatic steps, enabling net assimilation of CO₂ into triose phosphates and ultimately carbohydrates. This process is tightly regulated by photoprotective mechanisms, redox feedback, and metabolite flux, representing a central biochemical pathway coupling solar energy capture to the biosphere’s primary productivity.
-""".strip()
-
-# The tokenizer was trained on data from earlier shards, so it has seen this data
-train_docs = next(parquets_iter_batched(split="train"))
-train_text = "\n".join(train_docs)
-val_docs = next(parquets_iter_batched(split="val"))
-val_text = "\n".join(val_docs)
-
-all_text = [
-    ("news", news_text),
-    ("korean", korean_text),
-    ("code", code_text),
-    ("math", math_text),
-    ("science", science_text),
-    ("fwe-train", train_text),
-]
-if val_text:
-    all_text.append(("fwe-val", val_text))
-
-# Try out current default compared to GPT-2 and GPT-4 tokenizers
-tokenizer_results = {}
-vocab_sizes = {}
-
-for tokenizer_name in ["gpt2", "gpt4", "ours"]:
-
-    if tokenizer_name == "gpt2":
-        tokenizer = RustBPETokenizer.from_pretrained("gpt2") # gpt-2 base model tokenizer
-    elif tokenizer_name == "gpt4":
-        tokenizer = RustBPETokenizer.from_pretrained("cl100k_base") # gpt-4 base model tokenizer
-    else:
-        tokenizer = get_tokenizer()
-
-    vocab_sizes[tokenizer_name] = tokenizer.get_vocab_size()
-    tokenizer_results[tokenizer_name] = {}
-
-    for name, text in all_text:
-        encoded = tokenizer.encode(text)
-        decoded = tokenizer.decode(encoded)
-        assert decoded == text
-
-        encoded_bytes = text.encode('utf-8')
-        ratio = len(encoded_bytes) / len(encoded)
-        tokenizer_results[tokenizer_name][name] = {
-            'bytes': len(encoded_bytes),
-            'tokens': len(encoded),
-            'ratio': ratio
-        }
-
-# ANSI color codes
-GREEN = '\033[92m'
-RED = '\033[91m'
-RESET = '\033[0m'
-
-# Print vocab sizes
-print(f"\nVocab sizes:")
-print(f"GPT-2: {vocab_sizes['gpt2']}")
-print(f"GPT-4: {vocab_sizes['gpt4']}")
-print(f"Ours: {vocab_sizes['ours']}")
-
-def print_comparison(baseline_name, baseline_results, ours_results, all_text):
-    """Print comparison table between baseline tokenizer and ours."""
-    print(f"\nComparison with {baseline_name}:")
-    print("=" * 95)
-    print(f"{'Text Type':<10} {'Bytes':<8} {baseline_name:<15} {'Ours':<15} {'Relative':<12} {'Better':<10}")
-    print(f"{'':10} {'':8} {'Tokens':<7} {'Ratio':<7} {'Tokens':<7} {'Ratio':<7} {'Diff %':<12}")
-    print("-" * 95)
-
-    for name, text in all_text:
-        baseline_data = baseline_results[name]
-        ours_data = ours_results[name]
-
-        # Calculate relative difference (positive means ours is better, negative means worse)
-        # Using tokens: fewer tokens is better, so we calculate (baseline_tokens - ours_tokens) / baseline_tokens
-        relative_diff = ((baseline_data['tokens'] - ours_data['tokens']) / baseline_data['tokens']) * 100
-
-        # Determine which has better compression (higher ratio = better)
-        if baseline_data['ratio'] > ours_data['ratio']:
-            baseline_color, ours_color = GREEN, RED
-            better = baseline_name
-            diff_color = RED
-        elif ours_data['ratio'] > baseline_data['ratio']:
-            baseline_color, ours_color = RED, GREEN
-            better = "Ours"
-            diff_color = GREEN
-        else:
-            baseline_color, ours_color = "", ""
-            better = "Tie"
-            diff_color = ""
-
-        print(f"{name:<10} {baseline_data['bytes']:<8} "
-              f"{baseline_color}{baseline_data['tokens']:<7}{RESET} "
-              f"{baseline_color}{baseline_data['ratio']:<7.2f}{RESET} "
-              f"{ours_color}{ours_data['tokens']:<7}{RESET} "
-              f"{ours_color}{ours_data['ratio']:<7.2f}{RESET} "
-              f"{diff_color}{relative_diff:+7.1f}%{RESET}     "
-              f"{better:<10}")
-
-# Print comparisons
-print_comparison("GPT-2", tokenizer_results['gpt2'], tokenizer_results['ours'], all_text)
-print_comparison("GPT-4", tokenizer_results['gpt4'], tokenizer_results['ours'], all_text)
-
-# Log to report
-from nanochat.report import get_report
-lines = []
-for baseline_name in ["GPT-2", "GPT-4"]:
-    baseline_key = baseline_name.lower().replace('-', '')
-    baseline_results = tokenizer_results[baseline_key]
-    ours_results = tokenizer_results['ours']
-    lines.append(f"### Comparison with {baseline_name}")
-    lines.append("")
-    lines.append("| Text Type | Bytes | " + baseline_name + " Tokens | " + baseline_name + " Ratio | Ours Tokens | Ours Ratio | Relative Diff % |")
-    lines.append("|-----------|-------|--------------|--------------|-------------|------------|-----------------|")
-    for name, text in all_text:
-        baseline_data = baseline_results[name]
-        ours_data = ours_results[name]
-        relative_diff = ((baseline_data['tokens'] - ours_data['tokens']) / baseline_data['tokens']) * 100
-        lines.append(f"| {name} | {baseline_data['bytes']} | {baseline_data['tokens']} | {baseline_data['ratio']:.2f} | {ours_data['tokens']} | {ours_data['ratio']:.2f} | {relative_diff:+.1f}% |")
-    lines.append("")
-report_markdown = "\n".join(lines)
-get_report().log(section="Tokenizer evaluation", data=[
-    report_markdown,
-])
--- a/scripts_moe/tok_train.py
+++ b/scripts_moe/tok_train.py
@ -1,106 +0,0 @@
-"""
-Train a tokenizer using the HuggingFace Tokenizers library.
-In the style of GPT-4 tokenizer.
-"""
-import os
-import time
-import argparse
-import torch
-from nanochat.tokenizer import RustBPETokenizer
-from nanochat.common import get_base_dir
-from nanochat.dataset import parquets_iter_batched
-
-# -----------------------------------------------------------------------------
-# Parse command line arguments
-
-parser = argparse.ArgumentParser(description='Train a BPE tokenizer')
-parser.add_argument('--max_chars', type=int, default=10_000_000_000, help='Maximum characters to train on (default: 10B)')
-parser.add_argument('--doc_cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)')
-parser.add_argument('--vocab_size', type=int, default=65536, help='Vocabulary size (default: 65536 = 2^16)')
-args = parser.parse_args()
-print(f"max_chars: {args.max_chars:,}")
-print(f"doc_cap: {args.doc_cap:,}")
-print(f"vocab_size: {args.vocab_size:,}")
-
-# -----------------------------------------------------------------------------
-# Text iterator
-
-def text_iterator():
-    """
-    1) Flatten the batches into a single iterator
-    2) Crop every document to args.doc_cap characters
-    3) Break when we've seen args.max_chars characters
-    """
-    nchars = 0
-    for batch in parquets_iter_batched(split="train"):
-        for doc in batch:
-            doc_text = doc
-            if len(doc_text) > args.doc_cap:
-                doc_text = doc_text[:args.doc_cap]
-            nchars += len(doc_text)
-            yield doc_text
-            if nchars > args.max_chars:
-                return
-text_iter = text_iterator()
-
-# -----------------------------------------------------------------------------
-# Train the tokenizer
-t0 = time.time()
-tokenizer = RustBPETokenizer.train_from_iterator(text_iter, args.vocab_size)
-t1 = time.time()
-train_time = t1 - t0
-print(f"Training time: {train_time:.2f}s")
-
-# -----------------------------------------------------------------------------
-# Save the tokenizer to disk
-base_dir = get_base_dir()
-tokenizer_dir = os.path.join(base_dir, "tokenizer")
-tokenizer.save(tokenizer_dir)
-
-# -----------------------------------------------------------------------------
-# Quick inline sanity check
-test_text = """Hello world! This is a test.
-Numbers: 123, 4567, 89
-Contractions: I'm, you're, it's
-Special chars: @#$%^&*()
-Unicode: 你好世界 🌍"""
-encoded = tokenizer.encode(test_text)
-decoded = tokenizer.decode(encoded)
-assert decoded == test_text
-
-# -----------------------------------------------------------------------------
-# One more thing: we wish to cache a mapping from token id to number of bytes of that token
-# for efficient evaluation of bits per byte. Unlike the typical mean loss, this
-# allows us to report a loss that is invariant to the vocab size of the tokenizer.
-# The bits per byte on the validation set is then one of the primary metrics we care about.
-vocab_size = tokenizer.get_vocab_size()
-special_set = set(tokenizer.get_special_tokens())
-token_strings = [tokenizer.decode([token_id]) for token_id in range(vocab_size)]
-token_bytes = []
-for token_id in range(vocab_size):
-    token_str = token_strings[token_id] # the Python string representation of this token
-    if token_str in special_set:
-        token_bytes.append(0) # special characters are not counted
-    else:
-        id_bytes = len(token_str.encode("utf-8")) # number of bytes that make up this token
-        token_bytes.append(id_bytes)
-token_bytes = torch.tensor(token_bytes, dtype=torch.int32, device='cpu')
-token_bytes_path = os.path.join(tokenizer_dir, "token_bytes.pt")
-with open(token_bytes_path, "wb") as f:
-    torch.save(token_bytes, f)
-print(f"Saved token_bytes to {token_bytes_path}")
-
-# Log to report
-from nanochat.report import get_report
-token_bytes_nonzero = (token_bytes[token_bytes > 0]).to(dtype=torch.float32)
-get_report().log(section="Tokenizer training", data=[
-    vars(args), # argparse command line arguments
-    {"train_time": train_time},
-    {"num_special_tokens": len(special_set)},
-    {
-        "token_bytes_min": int(token_bytes_nonzero.min().item()),
-        "token_bytes_max": int(token_bytes_nonzero.max().item()),
-        "token_bytes_mean": token_bytes_nonzero.mean().item(),
-        "token_bytes_std": token_bytes_nonzero.std().item(),
-    }
-])