Mirror of https://github.com/karpathy/nanochat.git

Update some basic things.
This commit is contained in:
parent b12aa2699d
commit ef830fd7f9
0  nanochat_moe/__init__.py  Normal file
76  nanochat_moe/adamw.py  Normal file
@@ -0,0 +1,76 @@
"""
Borrowed from modded-nanogpt. By Keller, @vagrawal, et al.
Not a general optimizer! But works for our specific use.
"""
import torch
import torch.distributed as dist
from torch import Tensor


class DistAdamW(torch.optim.Optimizer):
    """
    Distributed AdamW optimizer.
    In the style of ZeRO-2, i.e. sharded optimizer states and gradient reduction
    """
    def __init__(self, param_groups, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        super().__init__(param_groups, defaults)

    @torch.compile
    @torch.no_grad()
    def step(self):
        rank = dist.get_rank()
        world_size = dist.get_world_size()
        reduce_scatter_futures: list[torch.Future] = []
        all_reduce_futures: list[torch.Future] = []
        grad_slices = []
        for group in self.param_groups:
            params: list[Tensor] = group["params"]
            for base_i in range(len(params)):
                grad = params[base_i].grad
                rank_size = grad.shape[0] // world_size
                grad_slice = torch.empty_like(grad[:rank_size])
                reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future())
                grad_slices.append(grad_slice)

        idx = 0
        for group in self.param_groups:
            beta1, beta2 = group['betas']
            eps = group['eps']
            wd = group['weight_decay']
            params = group['params']
            for base in range(len(params)):
                reduce_scatter_futures[idx].wait()
                p = params[base]
                rank_size = p.shape[0] // world_size
                p_slice = p[rank * rank_size:(rank + 1) * rank_size]
                lr = group['lr'] * getattr(p, "lr_mul", 1.0)
                state = self.state[p]
                g_slice = grad_slices[idx]
                # State init
                if not state:
                    state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device)
                    state['exp_avg'] = torch.zeros_like(p_slice)
                    state['exp_avg_sq'] = torch.zeros_like(p_slice)
                exp_avg = state['exp_avg']
                exp_avg_sq = state['exp_avg_sq']
                state['step'] += 1
                t = state['step']
                # weight decay
                if wd != 0:
                    eff_weight_decay = lr * wd * getattr(p, "wd_mul", 1.0)
                    p_slice.mul_(1 - eff_weight_decay)
                # update running averages
                exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2)
                # bias corrections
                bias1 = 1 - beta1 ** t
                bias2 = 1 - beta2 ** t
                # compute step
                denom = exp_avg_sq.sqrt().add_(eps)
                step_size = lr * (torch.sqrt(bias2) / bias1)
                update = exp_avg.div(denom).mul_(step_size)
                p_slice.add_(other=update, alpha=-1.0)
                idx += 1
                all_reduce_futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future())
        torch.futures.collect_all(all_reduce_futures).wait()
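A minimal usage sketch for DistAdamW (illustration only, not part of the diff): it assumes torchrun has set up a NCCL process group and that every parameter's leading dimension is divisible by the world size, since both the gradient reduce-scatter and the shard arithmetic above rely on that.

    # hypothetical illustration, not part of this commit
    import os
    import torch
    import torch.distributed as dist
    from nanochat_moe.adamw import DistAdamW

    dist.init_process_group(backend="nccl")
    device = torch.device("cuda", int(os.environ["LOCAL_RANK"]))
    world_size = dist.get_world_size()

    # leading dim must be divisible by world_size for the sharding to line up
    param = torch.nn.Parameter(torch.randn(1024 * world_size, 64, device=device))
    param.lr_mul = 1.0   # optional per-parameter multipliers, read via getattr() in step()
    param.wd_mul = 1.0

    opt = DistAdamW([{"params": [param]}], lr=1e-3, weight_decay=0.01)
    param.grad = torch.randn_like(param)   # normally produced by loss.backward()
    opt.step()                             # reduce-scatter grads, update local shard, all-gather params
    opt.zero_grad(set_to_none=True)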
151  nanochat_moe/checkpoint_manager.py  Normal file
@@ -0,0 +1,151 @@
"""
Utilities for saving and loading model/optim/state checkpoints.
"""
import os
import re
import glob
import json
import logging
import torch

from nanochat_moe.common import get_base_dir
from nanochat_moe.gpt import GPT, GPTConfig
from nanochat_moe.tokenizer import get_tokenizer
from nanochat_moe.common import setup_default_logging

# Set up logging
setup_default_logging()
logger = logging.getLogger(__name__)

def log0(message):
    if int(os.environ.get('RANK', 0)) == 0:
        logger.info(message)

def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
    if rank == 0:
        os.makedirs(checkpoint_dir, exist_ok=True)
        # Save the model state parameters
        model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
        torch.save(model_data, model_path)
        logger.info(f"Saved model parameters to: {model_path}")
        # Save the metadata dict as json
        meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(meta_data, f, indent=2)
        logger.info(f"Saved metadata to: {meta_path}")
    # Note that optimizer state is sharded across ranks, so each rank must save its own.
    if optimizer_data is not None:
        optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
        torch.save(optimizer_data, optimizer_path)
        logger.info(f"Saved optimizer state to: {optimizer_path}")

def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False, rank=0):
    # Load the model state
    model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
    model_data = torch.load(model_path, map_location=device)
    # Load the optimizer state if requested
    optimizer_data = None
    if load_optimizer:
        optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
        optimizer_data = torch.load(optimizer_path, map_location=device)
    # Load the metadata
    meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
    with open(meta_path, "r", encoding="utf-8") as f:
        meta_data = json.load(f)
    return model_data, optimizer_data, meta_data


def build_model(checkpoint_dir, step, device, phase):
    """
    A bunch of repetitive code to build a model from a given checkpoint.
    Returns:
    - base model - uncompiled, not wrapped in DDP
    - tokenizer
    - meta data saved during base model training
    """
    assert phase in ["train", "eval"], f"Invalid phase: {phase}"
    model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False)
    if device.type in {"cpu", "mps"}:
        # Convert bfloat16 tensors to float for CPU inference
        model_data = {
            k: v.float() if v.dtype == torch.bfloat16 else v
            for k, v in model_data.items()
        }
    # Hack: fix torch compile issue, which prepends all keys with _orig_mod.
    model_data = {k.removeprefix("_orig_mod."): v for k, v in model_data.items()}
    model_config_kwargs = meta_data["model_config"]
    log0(f"Building model with config: {model_config_kwargs}")
    model_config = GPTConfig(**model_config_kwargs)
    with torch.device("meta"):
        model = GPT(model_config)
    # Load the model state
    model.to_empty(device=device)
    model.init_weights() # note: this is dumb, but we need to init the rotary embeddings. TODO: fix model re-init
    model.load_state_dict(model_data, strict=True, assign=True)
    # Put the model in the right training phase / mode
    if phase == "eval":
        model.eval()
    else:
        model.train()
    # Load the Tokenizer
    tokenizer = get_tokenizer()
    # Sanity check: compatibility between model and tokenizer
    assert tokenizer.get_vocab_size() == model_config_kwargs["vocab_size"]
    return model, tokenizer, meta_data


def find_largest_model(checkpoint_dir):
    # attempt to guess the model tag: take the biggest model available
    model_tags = [f for f in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, f))]
    if not model_tags:
        raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
    # 1) normally all model tags are of the form d<number>, try that first:
    candidates = []
    for model_tag in model_tags:
        match = re.match(r"d(\d+)", model_tag)
        if match:
            model_depth = int(match.group(1))
            candidates.append((model_depth, model_tag))
    if candidates:
        candidates.sort(key=lambda x: x[0], reverse=True)
        return candidates[0][1]
    # 2) if that failed, take the most recently updated model:
    model_tags.sort(key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True)
    return model_tags[0]


def find_last_step(checkpoint_dir):
    # Look into checkpoint_dir and find model_<step>.pt with the highest step
    checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "model_*.pt"))
    if not checkpoint_files:
        raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
    last_step = int(max(os.path.basename(f).split("_")[-1].split(".")[0] for f in checkpoint_files))
    return last_step

# -----------------------------------------------------------------------------
# convenience functions that take into account nanochat's directory structure

def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=None):
    if model_tag is None:
        # guess the model tag by defaulting to the largest model
        model_tag = find_largest_model(checkpoints_dir)
        log0(f"No model tag provided, guessing model tag: {model_tag}")
    checkpoint_dir = os.path.join(checkpoints_dir, model_tag)
    if step is None:
        # guess the step by defaulting to the last step
        step = find_last_step(checkpoint_dir)
    assert step is not None, f"No checkpoints found in {checkpoint_dir}"
    # build the model
    log0(f"Loading model from {checkpoint_dir} with step {step}")
    model, tokenizer, meta_data = build_model(checkpoint_dir, step, device, phase)
    return model, tokenizer, meta_data

def load_model(source, *args, **kwargs):
    model_dir = {
        "base": "base_checkpoints",
        "mid": "mid_checkpoints",
        "sft": "chatsft_checkpoints",
        "rl": "chatrl_checkpoints",
    }[source]
    base_dir = get_base_dir()
    checkpoints_dir = os.path.join(base_dir, model_dir)
    return load_model_from_dir(checkpoints_dir, *args, **kwargs)
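A sketch of the intended round trip (illustrative only; `model`, `optimizer`, `model_config_kwargs`, `rank` and the "d12" model tag are assumed to come from a training run, and `meta_data["model_config"]` must hold valid GPTConfig kwargs):

    # hypothetical illustration, not part of this commit
    import os
    import torch
    from nanochat_moe.checkpoint_manager import save_checkpoint, load_model

    step = 1000
    checkpoint_dir = os.path.expanduser("~/.cache/nanochat/base_checkpoints/d12")
    model_data = model.state_dict()            # from the training loop
    optimizer_data = optimizer.state_dict()    # sharded: one file per rank
    meta_data = {"model_config": model_config_kwargs, "step": step}
    save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=rank)

    # later, e.g. for evaluation; model_tag and step are guessed if omitted
    device = torch.device("cuda")
    model, tokenizer, meta = load_model("base", device, phase="eval")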
213  nanochat_moe/common.py  Normal file
@@ -0,0 +1,213 @@
"""
Common utilities for nanochat.
"""

import os
import re
import logging
import urllib.request
import urllib.error
import torch
import torch.distributed as dist
from filelock import FileLock

class ColoredFormatter(logging.Formatter):
    """Custom formatter that adds colors to log messages."""
    # ANSI color codes
    COLORS = {
        'DEBUG': '\033[36m',    # Cyan
        'INFO': '\033[32m',     # Green
        'WARNING': '\033[33m',  # Yellow
        'ERROR': '\033[31m',    # Red
        'CRITICAL': '\033[35m', # Magenta
    }
    RESET = '\033[0m'
    BOLD = '\033[1m'
    def format(self, record):
        # Add color to the level name
        levelname = record.levelname
        if levelname in self.COLORS:
            record.levelname = f"{self.COLORS[levelname]}{self.BOLD}{levelname}{self.RESET}"
        # Format the message
        message = super().format(record)
        # Add color to specific parts of the message
        if levelname == 'INFO':
            # Highlight numbers and percentages
            message = re.sub(r'(\d+\.?\d*\s*(?:GB|MB|%|docs))', rf'{self.BOLD}\1{self.RESET}', message)
            message = re.sub(r'(Shard \d+)', rf'{self.COLORS["INFO"]}{self.BOLD}\1{self.RESET}', message)
        return message

def setup_default_logging():
    handler = logging.StreamHandler()
    handler.setFormatter(ColoredFormatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    logging.basicConfig(
        level=logging.INFO,
        handlers=[handler]
    )

setup_default_logging()
logger = logging.getLogger(__name__)

def get_base_dir():
    # co-locate nanochat intermediates with other cached data in ~/.cache (by default)
    if os.environ.get("NANOCHAT_BASE_DIR"):
        nanochat_dir = os.environ.get("NANOCHAT_BASE_DIR")
    else:
        home_dir = os.path.expanduser("~")
        cache_dir = os.path.join(home_dir, ".cache")
        nanochat_dir = os.path.join(cache_dir, "nanochat")
    os.makedirs(nanochat_dir, exist_ok=True)
    return nanochat_dir

def download_file_with_lock(url, filename, postprocess_fn=None, timeout=300):
    """
    Downloads a file from a URL to a local path in the base directory.
    Uses a lock file to prevent concurrent downloads among multiple ranks.
    """
    base_dir = get_base_dir()
    file_path = os.path.join(base_dir, filename)
    lock_path = file_path + ".lock"

    if os.path.exists(file_path):
        return file_path

    # Check if lock file exists and is stale (older than timeout seconds)
    if os.path.exists(lock_path):
        import time
        lock_age = time.time() - os.path.getmtime(lock_path)
        if lock_age > timeout:
            logger.warning(f"Lock file {lock_path} is {lock_age:.0f}s old, removing stale lock")
            try:
                os.remove(lock_path)
            except:
                pass

    # FileLock timeout is for acquiring lock, not for download
    with FileLock(lock_path, timeout=60):
        # Only a single rank can acquire this lock
        # All other ranks block until it is released

        # Recheck after acquiring lock
        if os.path.exists(file_path):
            return file_path

        # Download the content as bytes with timeout
        print(f"Downloading {url}...")
        import socket
        socket.setdefaulttimeout(timeout)
        try:
            with urllib.request.urlopen(url, timeout=timeout) as response:
                content = response.read() # bytes
        except (urllib.error.URLError, socket.timeout) as e:
            logger.error(f"Failed to download {url}: {e}")
            # Clean up lock file on failure
            try:
                os.remove(lock_path)
            except:
                pass
            raise

        # Write to local file
        with open(file_path, 'wb') as f:
            f.write(content)
        print(f"Downloaded to {file_path}")

        # Run the postprocess function if provided
        if postprocess_fn is not None:
            postprocess_fn(file_path)

        return file_path

def print0(s="", **kwargs):
    ddp_rank = int(os.environ.get('RANK', 0))
    if ddp_rank == 0:
        print(s, **kwargs)

def print_banner():
    # Cool DOS Rebel font ASCII banner made with https://manytools.org/hacker-tools/ascii-banner/
    banner = """
█████ █████
░░███ ░░███
████████ ██████ ████████ ██████ ██████ ░███████ ██████ ███████
░░███░░███ ░░░░░███ ░░███░░███ ███░░███ ███░░███ ░███░░███ ░░░░░███░░░███░
░███ ░███ ███████ ░███ ░███ ░███ ░███░███ ░░░ ░███ ░███ ███████ ░███
░███ ░███ ███░░███ ░███ ░███ ░███ ░███░███ ███ ░███ ░███ ███░░███ ░███ ███
████ █████░░████████ ████ █████░░██████ ░░██████ ████ █████░░███████ ░░█████
░░░░ ░░░░░ ░░░░░░░░ ░░░░ ░░░░░ ░░░░░░ ░░░░░░ ░░░░ ░░░░░ ░░░░░░░░ ░░░░░
"""
    print0(banner)

def is_ddp():
    # TODO is there a proper way
    return int(os.environ.get('RANK', -1)) != -1

def get_dist_info():
    if is_ddp():
        assert all(var in os.environ for var in ['RANK', 'LOCAL_RANK', 'WORLD_SIZE'])
        ddp_rank = int(os.environ['RANK'])
        ddp_local_rank = int(os.environ['LOCAL_RANK'])
        ddp_world_size = int(os.environ['WORLD_SIZE'])
        return True, ddp_rank, ddp_local_rank, ddp_world_size
    else:
        return False, 0, 0, 1

def autodetect_device_type():
    # prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU
    if torch.cuda.is_available():
        device_type = "cuda"
    elif torch.backends.mps.is_available():
        device_type = "mps"
    else:
        device_type = "cpu"
    print0(f"Autodetected device type: {device_type}")
    return device_type

def compute_init(device_type="cuda"): # cuda|cpu|mps
    """Basic initialization that we keep doing over and over, so make common."""

    assert device_type in ["cuda", "mps", "cpu"], "Invalid device type atm"
    if device_type == "cuda":
        assert torch.cuda.is_available(), "Your PyTorch installation is not configured for CUDA but device_type is 'cuda'"
    if device_type == "mps":
        assert torch.backends.mps.is_available(), "Your PyTorch installation is not configured for MPS but device_type is 'mps'"

    # Reproducibility
    # Note that we set the global seeds here, but most of the code uses explicit rng objects.
    # The only place where global rng might be used is nn.Module initialization of the model weights.
    torch.manual_seed(42)
    if device_type == "cuda":
        torch.cuda.manual_seed(42)
    # skipping full reproducibility for now, possibly investigate slowdown later
    # torch.use_deterministic_algorithms(True)

    # Precision
    if device_type == "cuda":
        torch.set_float32_matmul_precision("high") # uses tf32 instead of fp32 for matmuls

    # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA
    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
    if ddp and device_type == "cuda":
        device = torch.device("cuda", ddp_local_rank)
        torch.cuda.set_device(device) # make "cuda" default to this device
        dist.init_process_group(backend="nccl", device_id=device)
        dist.barrier()
    else:
        device = torch.device(device_type) # mps|cpu

    if ddp_rank == 0:
        logger.info(f"Distributed world size: {ddp_world_size}")

    return ddp, ddp_rank, ddp_local_rank, ddp_world_size, device

def compute_cleanup():
    """Companion function to compute_init, to clean things up before script exit"""
    if is_ddp():
        dist.destroy_process_group()

class DummyWandb:
    """Useful if we wish to not use wandb but have all the same signatures"""
    def __init__(self):
        pass
    def log(self, *args, **kwargs):
        pass
    def finish(self):
        pass
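The typical call pattern for these helpers, as a sketch (assumes the script is launched either directly or with torchrun; everything else is illustrative):

    # hypothetical illustration, not part of this commit
    from nanochat_moe.common import (
        autodetect_device_type, compute_init, compute_cleanup, print0, DummyWandb,
    )

    device_type = autodetect_device_type()                    # cuda | mps | cpu
    ddp, rank, local_rank, world_size, device = compute_init(device_type)
    wandb_run = DummyWandb()                                  # or a real wandb run on rank 0
    print0(f"running on {device} with world size {world_size}")
    try:
        ...  # training / evaluation work goes here
        wandb_run.log({"loss": 0.0})
    finally:
        wandb_run.finish()
        compute_cleanup()                                     # destroys the process group if DDP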
56  nanochat_moe/configurator.py  Normal file
@@ -0,0 +1,56 @@
"""
Poor Man's Configurator. Probably a terrible idea. Example usage:
$ python train.py config/override_file.py --batch_size=32
this will first run config/override_file.py, then override batch_size to 32

The code in this file will be run as follows from e.g. train.py:
>>> exec(open('configurator.py').read())

So it's not a Python module, it's just shuttling this code away from train.py
The code in this script then overrides the globals()

I know people are not going to love this, I just really dislike configuration
complexity and having to prepend config. to every single variable. If someone
comes up with a better simple Python solution I am all ears.
"""

import os
import sys
from ast import literal_eval

def print0(s="", **kwargs):
    ddp_rank = int(os.environ.get('RANK', 0))
    if ddp_rank == 0:
        print(s, **kwargs)

for arg in sys.argv[1:]:
    if '=' not in arg:
        # assume it's the name of a config file
        assert not arg.startswith('--')
        config_file = arg
        print0(f"Overriding config with {config_file}:")
        with open(config_file) as f:
            print0(f.read())
        exec(open(config_file).read())
    else:
        # assume it's a --key=value argument
        assert arg.startswith('--')
        key, val = arg.split('=')
        key = key[2:]
        if key in globals():
            try:
                # attempt to eval it (e.g. if it's a bool, a number, etc.)
                attempt = literal_eval(val)
            except (SyntaxError, ValueError):
                # if that goes wrong, just use the string
                attempt = val
            # ensure the types match ok
            if globals()[key] is not None:
                attempt_type = type(attempt)
                default_type = type(globals()[key])
                assert attempt_type == default_type, f"Type mismatch: {attempt_type} != {default_type}"
            # cross fingers
            print0(f"Overriding: {key} = {attempt}")
            globals()[key] = attempt
        else:
            raise ValueError(f"Unknown config key: {key}")
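How a training script is expected to consume this file, sketched from the docstring above (the variable names and the config file path are placeholders):

    # hypothetical illustration, not part of this commit
    # train.py
    batch_size = 32          # defaults are plain module-level globals
    learning_rate = 3e-4
    run_name = "demo"
    exec(open('nanochat_moe/configurator.py').read())  # overrides globals from the CLI
    print(batch_size, learning_rate, run_name)

    # $ python train.py config/small.py --batch_size=8 --run_name=test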
262  nanochat_moe/core_eval.py  Normal file
@@ -0,0 +1,262 @@
"""
Functions for evaluating the CORE metric, as described in the DCLM paper.
https://arxiv.org/abs/2406.11794

TODOs:
- All tasks ~match except for squad. We get 31%, the reference is 37%. Figure out why.
"""
import random

from jinja2 import Template
import torch
import torch.distributed as dist

# -----------------------------------------------------------------------------
# Prompt rendering utilities

def render_prompts_mc(item, continuation_delimiter, fewshot_examples=None):
    """Render complete prompts for a multiple choice question"""
    template_str = """
{%- for example in fewshot_examples -%}
{{ example.query }}{{ continuation_delimiter }}{{ example.choices[example.gold] }}

{% endfor -%}
{{ item.query }}{{ continuation_delimiter }}{{ choice }}""".strip()
    template = Template(template_str)
    fewshot_examples = fewshot_examples or []
    context = {
        'fewshot_examples': fewshot_examples,
        'continuation_delimiter': continuation_delimiter,
        'item': item
    }
    prompts = [template.render(choice=choice, **context) for choice in item['choices']]
    return prompts


def render_prompts_schema(item, continuation_delimiter, fewshot_examples=None):
    """Render complete prompts for a schema question"""
    template_str = """
{%- for example in fewshot_examples -%}
{{ example.context_options[example.gold] }}{{ continuation_delimiter }}{{ example.continuation }}

{% endfor -%}
{{ context }}{{ continuation_delimiter }}{{ item.continuation }}""".strip()
    template = Template(template_str)
    fewshot_examples = fewshot_examples or []
    context = {
        'fewshot_examples': fewshot_examples,
        'continuation_delimiter': continuation_delimiter,
        'item': item
    }
    prompts = [template.render(context=context_option, **context)
               for context_option in item['context_options']]
    return prompts


def render_prompts_lm(item, continuation_delimiter, fewshot_examples=None):
    """
    Render complete prompt for a language modeling task.
    Notice that we manually trim the context in the template,
    which in some datasets seems to have trailing whitespace (which we don't want).
    """
    template_str = """
{%- for example in fewshot_examples -%}
{{ example.context | trim }}{{ continuation_delimiter }}{{ example.continuation }}

{% endfor -%}
{{ item.context | trim }}{{ continuation_delimiter }}{% if include_continuation %}{{ item.continuation }}{% endif %}""".strip()
    template = Template(template_str)
    fewshot_examples = fewshot_examples or []
    context = {
        'fewshot_examples': fewshot_examples,
        'continuation_delimiter': continuation_delimiter,
        'item': item
    }
    # Return two prompts: without and with the continuation
    prompt_without = template.render(include_continuation=False, **context)
    prompt_with = template.render(include_continuation=True, **context)
    # Due to the way the data seems to be stored, I think I need to strip in the case of LM here.
    # Otherwise we may get trailing whitespaces in prompt_without (which get absorbed into the next
    # token in prompt_with), meaning we don't get a nice and clean prefix in the token space
    # to detect the final continuation. Tokenizers...
    prompt_without = prompt_without.strip()
    return [prompt_without, prompt_with]


def find_common_length(token_sequences, direction='left'):
    """
    Find the length of the common prefix or suffix across token sequences
    - direction: 'left' for prefix, 'right' for suffix
    """
    min_len = min(len(seq) for seq in token_sequences)
    indices = {
        'left': range(min_len),
        'right': range(-1, -min_len-1, -1)
    }[direction]
    # Find the first position where the token sequences differ
    for i, idx in enumerate(indices):
        token = token_sequences[0][idx]
        if not all(seq[idx] == token for seq in token_sequences):
            return i
    return min_len


def stack_sequences(tokens, pad_token_id):
    """Stack up a list of token sequences, pad to longest on the right"""
    bsz, seq_len = len(tokens), max(len(x) for x in tokens)
    input_ids = torch.full((bsz, seq_len), pad_token_id, dtype=torch.long)
    for i, x in enumerate(tokens):
        input_ids[i, :len(x)] = torch.tensor(x, dtype=torch.long)
    return input_ids


def batch_sequences_mc(tokenizer, prompts):
    # In multiple choice, contexts are the same but the continuation is different (common prefix)
    tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id())
    # figure out the start and end of each continuation
    answer_start_idx = find_common_length(tokens, direction='left')
    start_indices = [answer_start_idx] * len(prompts)
    end_indices = [len(x) for x in tokens]
    return tokens, start_indices, end_indices


def batch_sequences_schema(tokenizer, prompts):
    # In schema tasks, contexts vary but continuation is the same (common suffix)
    tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id())
    # figure out the start and end of each context
    suffix_length = find_common_length(tokens, direction='right')
    end_indices = [len(x) for x in tokens]
    start_indices = [ei - suffix_length for ei in end_indices]
    return tokens, start_indices, end_indices


def batch_sequences_lm(tokenizer, prompts):
    # In LM tasks, we have two prompts: without and with continuation
    tokens = tokenizer(prompts, prepend=tokenizer.get_bos_token_id())
    tokens_without, tokens_with = tokens
    start_idx, end_idx = len(tokens_without), len(tokens_with)
    assert start_idx < end_idx, "prompt without is supposed to be a prefix of prompt with"
    assert tokens_without == tokens_with[:start_idx], "prompt without is supposed to be a prefix of prompt with"
    # we only need the with continuation prompt in the LM task, i.e. batch size of 1
    return [tokens_with], [start_idx], [end_idx]


@torch.no_grad()
def forward_model(model, input_ids):
    """
    Take BxT tensor of token ids, return BxT tensor of losses and argmax predictions.
    The last column of losses is set to nan because we don't have autoregressive targets there.
    """
    batch_size, seq_len = input_ids.size()
    outputs = model(input_ids)
    # Roll the tensor to the left by one position to get the (autoregressive) target ids
    target_ids = torch.roll(input_ids, shifts=-1, dims=1)
    # Calculate cross entropy at all positions
    losses = torch.nn.functional.cross_entropy(
        outputs.view(batch_size * seq_len, -1),
        target_ids.view(batch_size * seq_len),
        reduction='none'
    ).view(batch_size, seq_len)
    # Set the last column to be nan because there is no autoregressive loss there
    losses[:, -1] = float('nan')
    # Get the argmax predictions at each position
    predictions = outputs.argmax(dim=-1)
    return losses, predictions


@torch.no_grad()
def evaluate_example(idx, model, tokenizer, data, device, task_meta):
    """Evaluate a single example, return True if correct, False otherwise"""
    item = data[idx]
    task_type = task_meta['task_type']
    num_fewshot = task_meta['num_fewshot']
    continuation_delimiter = task_meta['continuation_delimiter']

    # Sample few-shot examples (excluding current item)
    fewshot_examples = []
    if num_fewshot > 0:
        rng = random.Random(1234 + idx)
        available_indices = [i for i in range(len(data)) if i != idx]
        fewshot_indices = rng.sample(available_indices, num_fewshot)
        fewshot_examples = [data[i] for i in fewshot_indices]

    # Render prompts and batch sequences based on task type
    if task_type == 'multiple_choice':
        prompts = render_prompts_mc(item, continuation_delimiter, fewshot_examples)
        tokens, start_idxs, end_idxs = batch_sequences_mc(tokenizer, prompts)
    elif task_type == 'schema':
        prompts = render_prompts_schema(item, continuation_delimiter, fewshot_examples)
        tokens, start_idxs, end_idxs = batch_sequences_schema(tokenizer, prompts)
    elif task_type == 'language_modeling':
        prompts = render_prompts_lm(item, continuation_delimiter, fewshot_examples)
        tokens, start_idxs, end_idxs = batch_sequences_lm(tokenizer, prompts)
    else:
        raise ValueError(f"Unsupported task type: {task_type}")

    # Some models can't forward sequences beyond a certain length (e.g. GPT-2)
    # In these cases, we have to truncate sequences to max length and adjust the indices
    if hasattr(model, 'max_seq_len') and model.max_seq_len is not None:
        max_tokens = model.max_seq_len
        new_tokens, new_start_idxs, new_end_idxs = [], [], []
        for t, s, e in zip(tokens, start_idxs, end_idxs):
            if len(t) > max_tokens:
                num_to_crop = len(t) - max_tokens
                new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens
                new_start_idxs.append(s - num_to_crop) # shift the indices down
                new_end_idxs.append(e - num_to_crop)
                assert s - num_to_crop >= 0, "this should never happen right?"
                assert e - num_to_crop >= 0, "this should never happen right?"
            else:
                new_tokens.append(t) # keep unchanged
                new_start_idxs.append(s)
                new_end_idxs.append(e)
        tokens, start_idxs, end_idxs = new_tokens, new_start_idxs, new_end_idxs

    # Stack up all the sequences into a batch
    pad_token_id = tokenizer.get_bos_token_id() # use BOS as pad token is ok
    input_ids = stack_sequences(tokens, pad_token_id)
    input_ids = input_ids.to(device)

    # Forward the model, get the autoregressive loss and argmax prediction at each token
    losses, predictions = forward_model(model, input_ids)

    # See if the losses/predictions come out correctly
    if task_type == 'language_modeling':
        # language modeling task is currently always batch size 1
        si = start_idxs[0]
        ei = end_idxs[0]
        # predictions[i] predict input_ids[i+1] autoregressively
        predicted_tokens = predictions[0, si-1:ei-1]
        actual_tokens = input_ids[0, si:ei]
        is_correct = torch.all(predicted_tokens == actual_tokens).item()
    elif task_type in ['multiple_choice', 'schema']:
        # For MC/schema: find the option with lowest average loss
        mean_losses = [losses[i, si-1:ei-1].mean().item()
                       for i, (si, ei) in enumerate(zip(start_idxs, end_idxs))]
        pred_idx = mean_losses.index(min(mean_losses))
        is_correct = pred_idx == item['gold']
    else:
        raise ValueError(f"Unsupported task type: {task_type}")

    return is_correct


def evaluate_task(model, tokenizer, data, device, task_meta):
    """
    This function is responsible for evaluating one task across many examples.
    It also handles dispatch to all processes if the script is run with torchrun.
    """
    rank = dist.get_rank() if dist.is_initialized() else 0
    world_size = dist.get_world_size() if dist.is_initialized() else 1
    correct = torch.zeros(len(data), dtype=torch.float32, device=device)
    # stride the examples to each rank
    for idx in range(rank, len(data), world_size):
        is_correct = evaluate_example(idx, model, tokenizer, data, device, task_meta)
        correct[idx] = float(is_correct)
    # sync results across all the processes if running distributed
    if world_size > 1:
        dist.barrier()
        dist.all_reduce(correct, op=dist.ReduceOp.SUM)
    # compute the mean
    mean_correct = correct.mean().item()
    return mean_correct
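A minimal sketch of the expected data layout and entry point. The field names are inferred from the Jinja templates above (query/choices/gold for multiple choice); the actual CORE task files may carry more fields, and model/tokenizer/device are assumed to come from checkpoint_manager.load_model and compute_init.

    # hypothetical illustration, not part of this commit
    data = [
        {"query": "The capital of France is", "choices": [" Paris", " Rome"], "gold": 0},
        {"query": "2 + 2 equals", "choices": [" 3", " 4"], "gold": 1},
    ]
    task_meta = {"task_type": "multiple_choice", "num_fewshot": 1, "continuation_delimiter": " "}
    accuracy = evaluate_task(model, tokenizer, data, device, task_meta)
    print(f"accuracy: {accuracy:.3f}")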
116  nanochat_moe/dataloader.py  Normal file
@@ -0,0 +1,116 @@
from collections import deque

import torch
import pyarrow.parquet as pq
from tqdm import tqdm

from nanochat_moe.common import get_dist_info, print0
from nanochat_moe.dataset import list_parquet_files
from nanochat_moe.tokenizer import get_tokenizer

def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
    """
    Stream pretraining text from parquet files, tokenize, yield training batches.

    This implementation became a bit more complex because we wish to support approximate resume training.
    Instead of turning this into a Class, we opt to return the state_dict with every batch,
    and then the caller can pass in a state_dict to resume training from a desired point.
    Note that this resumption is atm only *approximate* for simplicity.
    We won't repeat the same documents but we might skip a few.
    The state_dict that is returned can be later passed into this function via `resume_state_dict` to approximately resume.

    Perfect state resumption is possible but would be a lot more bloated, probably not worth it atm.
    """
    assert split in ["train", "val"], "split must be 'train' or 'val'"

    # infinite iterator over document batches (list of text strings)
    ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
    show_progress = ddp_rank == 0 # only show progress on rank 0

    print0(f"[DataLoader] Initializing dataloader for split={split}, rank={ddp_rank}/{ddp_world_size}")

    def document_batches():
        from nanochat_moe.dataset import DATA_DIR
        print0(f"[DataLoader] Listing parquet files from: {DATA_DIR}")
        parquet_paths = list_parquet_files()
        parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
        print0(f"[DataLoader] Found {len(parquet_paths)} parquet files for {split} split")
        if len(parquet_paths) == 0:
            print0(f"[DataLoader] WARNING: No parquet files found! Check if data directory exists and contains .parquet files.")

        resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
        resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
        pq_idx = resume_pq_idx # we kick off parquet files at the resume index (or by default just 0)
        pbar = None
        epoch = 0
        while True: # iterate infinitely (multi-epoch)
            if show_progress and pbar is None:
                # Use position=0 and leave=True to ensure progress bar displays correctly
                pbar = tqdm(total=len(parquet_paths), desc=f"Tokenizing {split} data (epoch {epoch})", unit="file", leave=True, position=0, file=None)
            while pq_idx < len(parquet_paths): # iterate over all parquet files
                filepath = parquet_paths[pq_idx]
                pf = pq.ParquetFile(filepath)
                if show_progress:
                    pbar.set_postfix({"file": f"{pq_idx+1}/{len(parquet_paths)}"})
                # Start from resume point if resuming on same file, otherwise from DDP rank
                # I know this state resumption is a little bit tricky and a little bit hacky... sigh.
                if resume_rg_idx is not None:
                    base_idx = resume_rg_idx // ddp_world_size # in units of ddp_world_size
                    base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming
                    rg_idx = base_idx * ddp_world_size + ddp_rank
                    resume_rg_idx = None # set to None as we only want to do this a single time
                else:
                    rg_idx = ddp_rank
                while rg_idx < pf.num_row_groups:
                    rg = pf.read_row_group(rg_idx)
                    batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows
                    # the tokenizer encode might want to go in even smaller batches, e.g. 128 rows
                    for i in range(0, len(batch), tokenizer_batch_size):
                        yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx)
                    rg_idx += ddp_world_size # advance to the next row group (in DDP)
                if show_progress:
                    pbar.update(1)
                pq_idx += 1 # advance to the next parquet file
            # Finished one epoch, reset for next epoch
            if show_progress:
                pbar.close()
                pbar = None
            epoch += 1
            pq_idx = 0 # reset to start of files for next epoch
    batches = document_batches()

    # Now emit batches of tokens.
    needed_tokens = B * T + 1 # +1 is because we also need the target at the last token
    # get the tokenizer and the bos token
    print0(f"[DataLoader] Loading tokenizer...")
    tokenizer = get_tokenizer()
    print0(f"[DataLoader] Tokenizer loaded, vocab_size={tokenizer.get_vocab_size()}")
    bos_token = tokenizer.get_bos_token_id()
    print0(f"[DataLoader] Starting to yield batches (needed_tokens={needed_tokens})...")
    # scratch buffer holds the tokens for one iteration
    token_buffer = deque() # we stream tokens on the right and pop from the left
    while True:
        # Accumulate enough tokens for one iteration before yielding.
        while len(token_buffer) < needed_tokens:
            doc_batch, (pq_idx, rg_idx) = next(batches)
            token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads)
            for tokens in token_lists:
                token_buffer.extend(tokens)
        # Move tokens from the deque into the scratch buffer
        tokens = [token_buffer.popleft() for _ in range(needed_tokens)]
        # CUDA supports memory pinning for asynchronous transfers between CPU and GPU
        use_cuda_optimizations = device == "cuda"
        scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda_optimizations) # in PyTorch, long=int64
        # Create the inputs/targets as 1D tensors
        inputs_cpu = scratch[:-1]
        targets_cpu = scratch[1:]
        # Reshape to 2D and move to GPU async
        inputs = inputs_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
        targets = targets_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations)
        state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx} # we need this in case we wish to approximately resume training
        yield inputs, targets, state_dict

def tokenizing_distributed_data_loader(*args, **kwargs):
    # helper function that only emits the inputs/targets and not the state_dict
    for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs):
        yield inputs, targets
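Consuming the loader and approximately resuming, sketched (assumes the parquet shards are already on disk and a tokenizer has been trained; B/T values are placeholders):

    # hypothetical illustration, not part of this commit
    loader = tokenizing_distributed_data_loader_with_state(B=8, T=1024, split="train", device="cuda")
    for step in range(100):
        inputs, targets, loader_state = next(loader)   # inputs/targets are (B, T) tensors on the device
    # ... save loader_state alongside the checkpoint, then later:
    loader = tokenizing_distributed_data_loader_with_state(
        B=8, T=1024, split="train", device="cuda", resume_state_dict=loader_state)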
132  nanochat_moe/dataset.py  Normal file
@@ -0,0 +1,132 @@
"""
The base/pretraining dataset is a set of parquet files.
This file contains utilities for:
- iterating over the parquet files and yielding documents from them
- downloading the files on demand if they are not on disk

For details of how the dataset was prepared, see `repackage_data_reference.py`.
"""

import os
import argparse
import time
import requests
import pyarrow.parquet as pq
from multiprocessing import Pool

from nanochat_moe.common import get_base_dir

# -----------------------------------------------------------------------------
# The specifics of the current pretraining dataset

# The URL on the internet where the data is hosted and downloaded from on demand
BASE_URL = "https://huggingface.co/datasets/karpathy/fineweb-edu-100b-shuffle/resolve/main"
MAX_SHARD = 1822 # the last datashard is shard_01822.parquet
index_to_filename = lambda index: f"shard_{index:05d}.parquet" # format of the filenames
# Support custom data directory via NANOCHAT_DATA_DIR environment variable
if os.environ.get("NANOCHAT_DATA_DIR"):
    DATA_DIR = os.environ.get("NANOCHAT_DATA_DIR")
else:
    base_dir = get_base_dir()
    DATA_DIR = os.path.join(base_dir, "base_data")
os.makedirs(DATA_DIR, exist_ok=True)

# -----------------------------------------------------------------------------
# These functions are useful utilities to other modules, can/should be imported

def list_parquet_files(data_dir=None):
    """ Looks into a data dir and returns full paths to all parquet files. """
    data_dir = DATA_DIR if data_dir is None else data_dir
    parquet_files = sorted([
        f for f in os.listdir(data_dir)
        if f.endswith('.parquet') and not f.endswith('.tmp')
    ])
    parquet_paths = [os.path.join(data_dir, f) for f in parquet_files]
    return parquet_paths

def parquets_iter_batched(split, start=0, step=1):
    """
    Iterate through the dataset, in batches of underlying row_groups for efficiency.
    - split can be "train" or "val". the last parquet file will be val.
    - start/step are useful for skipping rows in DDP. e.g. start=rank, step=world_size
    """
    assert split in ["train", "val"], "split must be 'train' or 'val'"
    parquet_paths = list_parquet_files()
    parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
    for filepath in parquet_paths:
        pf = pq.ParquetFile(filepath)
        for rg_idx in range(start, pf.num_row_groups, step):
            rg = pf.read_row_group(rg_idx)
            texts = rg.column('text').to_pylist()
            yield texts

# -----------------------------------------------------------------------------
def download_single_file(index):
    """ Downloads a single file index, with some backoff """

    # Construct the local filepath for this file and skip if it already exists
    filename = index_to_filename(index)
    filepath = os.path.join(DATA_DIR, filename)
    if os.path.exists(filepath):
        print(f"Skipping {filepath} (already exists)")
        return True

    # Construct the remote URL for this file
    url = f"{BASE_URL}/{filename}"
    print(f"Downloading {filename}...")

    # Download with retries
    max_attempts = 5
    for attempt in range(1, max_attempts + 1):
        try:
            response = requests.get(url, stream=True, timeout=30)
            response.raise_for_status()
            # Write to temporary file first
            temp_path = filepath + ".tmp"
            with open(temp_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024): # 1MB chunks
                    if chunk:
                        f.write(chunk)
            # Move temp file to final location
            os.rename(temp_path, filepath)
            print(f"Successfully downloaded {filename}")
            return True

        except (requests.RequestException, IOError) as e:
            print(f"Attempt {attempt}/{max_attempts} failed for {filename}: {e}")
            # Clean up any partial files
            for path in [filepath + ".tmp", filepath]:
                if os.path.exists(path):
                    try:
                        os.remove(path)
                    except:
                        pass
            # Try a few times with exponential backoff: 2^attempt seconds
            if attempt < max_attempts:
                wait_time = 2 ** attempt
                print(f"Waiting {wait_time} seconds before retry...")
                time.sleep(wait_time)
            else:
                print(f"Failed to download {filename} after {max_attempts} attempts")
                return False

    return False


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download FineWeb-Edu 100BT dataset shards")
    parser.add_argument("-n", "--num-files", type=int, default=-1, help="Number of shards to download (default: -1 = all shards)")
    parser.add_argument("-w", "--num-workers", type=int, default=4, help="Number of parallel download workers (default: 4)")
    args = parser.parse_args()

    num = MAX_SHARD + 1 if args.num_files == -1 else min(args.num_files, MAX_SHARD + 1)
    ids_to_download = list(range(num))
    print(f"Downloading {len(ids_to_download)} shards using {args.num_workers} workers...")
    print(f"Target directory: {DATA_DIR}")
    print()
    with Pool(processes=args.num_workers) as pool:
        results = pool.map(download_single_file, ids_to_download)

    # Report results
    successful = sum(1 for success in results if success)
    print(f"Done! Downloaded: {successful}/{len(ids_to_download)} shards to {DATA_DIR}")
378
nanochat_moe/engine.py
Normal file
378
nanochat_moe/engine.py
Normal file
|
|
@ -0,0 +1,378 @@
|
|||
"""
|
||||
Engine for efficient inference of our models.
|
||||
|
||||
Everything works around token sequences:
|
||||
- The user can send token sequences to the engine
|
||||
- The engine returns the next token
|
||||
|
||||
Notes:
|
||||
- The engine knows nothing about tokenization, it's purely token id sequences.
|
||||
|
||||
The whole thing is made as efficient as possible.
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
import signal
|
||||
import warnings
|
||||
from contextlib import contextmanager
|
||||
from collections import deque
|
||||
from nanochat_moe.common import compute_init, autodetect_device_type
|
||||
from nanochat_moe.checkpoint_manager import load_model
|
||||
from contextlib import nullcontext
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Calculator tool helpers
|
||||
@contextmanager
|
||||
def timeout(duration, formula):
|
||||
def timeout_handler(signum, frame):
|
||||
raise Exception(f"'{formula}': timed out after {duration} seconds")
|
||||
|
||||
signal.signal(signal.SIGALRM, timeout_handler)
|
||||
signal.alarm(duration)
|
||||
yield
|
||||
signal.alarm(0)
|
||||
|
||||
def eval_with_timeout(formula, max_time=3):
|
||||
try:
|
||||
with timeout(max_time, formula):
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", SyntaxWarning)
|
||||
return eval(formula, {"__builtins__": {}}, {})
|
||||
except Exception as e:
|
||||
signal.alarm(0)
|
||||
# print(f"Warning: Failed to eval {formula}, exception: {e}") # it's ok ignore wrong calculator usage
|
||||
return None
|
||||
|
||||
def use_calculator(expr):
|
||||
"""
|
||||
Evaluate a Python expression safely.
|
||||
Supports both math expressions and string operations like .count()
|
||||
"""
|
||||
# Remove commas from numbers
|
||||
expr = expr.replace(",", "")
|
||||
|
||||
# Check if it's a pure math expression (old behavior)
|
||||
if all([x in "0123456789*+-/.() " for x in expr]):
|
||||
if "**" in expr: # disallow power operator
|
||||
return None
|
||||
return eval_with_timeout(expr)
|
||||
|
||||
# Check if it's a string operation we support
|
||||
# Allow: strings (single/double quotes), .count(), letters, numbers, spaces, parens
|
||||
allowed_chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789'\"()._ "
|
||||
if not all([x in allowed_chars for x in expr]):
|
||||
return None
|
||||
|
||||
# Disallow dangerous patterns
|
||||
dangerous_patterns = ['__', 'import', 'exec', 'eval', 'compile', 'open', 'file',
|
||||
'input', 'raw_input', 'globals', 'locals', 'vars', 'dir',
|
||||
'getattr', 'setattr', 'delattr', 'hasattr']
|
||||
expr_lower = expr.lower()
|
||||
if any(pattern in expr_lower for pattern in dangerous_patterns):
|
||||
return None
|
||||
|
||||
# Only allow .count() method for now (can expand later)
|
||||
if '.count(' not in expr:
|
||||
return None
|
||||
|
||||
# Evaluate with timeout
|
||||
return eval_with_timeout(expr)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
class KVCache:
|
||||
"""
|
||||
Works hand-in-hand with the GPT model to maintain the KV cache.
|
||||
Note that the .pos advances automatically after the last layer of the Transformer inserts.
|
||||
"""
|
||||
|
||||
def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layers):
|
||||
# Each of K/V is of shape (B, H, T, D) and we have one per layer of the Transformer.
|
||||
self.kv_shape = (num_layers, 2, batch_size, num_heads, seq_len, head_dim)
|
||||
self.kv_cache = None
|
||||
self.pos = 0 # current position in time in the cache
|
||||
|
||||
def reset(self):
|
||||
self.pos = 0
|
||||
|
||||
def get_pos(self):
|
||||
return self.pos
|
||||
|
||||
def prefill(self, other):
|
||||
"""
|
||||
Prefill given another KV cache. Optionally expand along batch dim.
|
||||
This is used when we do batch 1 prefill and then want to generate
|
||||
multiple samples in parallel from there.
|
||||
"""
|
||||
# 1) validate the shapes
|
||||
assert self.kv_cache is None, "Cannot prefill a non-empty KV cache"
|
||||
assert other.kv_cache is not None, "Cannot prefill with a None KV cache"
|
||||
for ix, (dim1, dim2) in enumerate(zip(self.kv_shape, other.kv_shape)):
|
||||
# ix 0: num_layers, 1: k/v, 2: batch_size, 3: num_heads, 4: seq_len, 5: head_dim
|
||||
if ix in [0, 1, 3, 5]:
|
||||
# num_layers, k/v, num_heads, head_dim must match
|
||||
assert dim1 == dim2, f"Dim {ix} mismatch: {dim1} != {dim2}"
|
||||
elif ix == 2:
|
||||
# batch_size can be expanded
|
||||
assert dim1 == dim2 or dim2 == 1, f"Batch dim mismatch: {dim1} != {dim2}"
|
||||
elif ix == 4:
|
||||
# seq_len: self must be longer than other
|
||||
assert dim1 >= dim2, f"Seq len mismatch: {dim1} < {dim2}"
|
||||
# 2) initialize the cache
|
||||
dtype, device = other.kv_cache.dtype, other.kv_cache.device
|
||||
self.kv_cache = torch.empty(self.kv_shape, dtype=dtype, device=device)
|
||||
# 3) copy the data over
|
||||
self.kv_cache[:, :, :, :, :other.pos, :] = other.kv_cache
|
||||
# 4) update the pos
|
||||
self.pos = other.pos
|
||||
|
||||
def insert_kv(self, layer_idx, k, v):
|
||||
# Lazy initialize the cache here because we need to know the dtype/device
|
||||
if self.kv_cache is None:
|
||||
self.kv_cache = torch.empty(self.kv_shape, dtype=k.dtype, device=k.device)
|
||||
# Insert new keys/values to the cache and return the full cache so far
|
||||
B, H, T_add, D = k.size()
|
||||
t0, t1 = self.pos, self.pos + T_add
|
||||
# Dynamically grow the cache if needed
|
||||
if t1 > self.kv_cache.size(4):
|
||||
t_needed = t1 + 1024 # as much as we need plus buffer of 1024
|
||||
t_needed = (t_needed + 1023) & ~1023 # then round up to the nearest multiple of 1024
|
||||
additional_shape = list(self.kv_cache.shape)
|
||||
additional_shape[4] = t_needed - self.kv_cache.size(4)
|
||||
additional_cache = torch.empty(additional_shape, dtype=k.dtype, device=k.device)
|
||||
self.kv_cache = torch.cat([self.kv_cache, additional_cache], dim=4).contiguous()
|
||||
self.kv_shape = self.kv_cache.shape
|
||||
# Insert k, v into the cache
|
||||
self.kv_cache[layer_idx, 0, :, :, t0:t1] = k
|
||||
self.kv_cache[layer_idx, 1, :, :, t0:t1] = v
|
||||
# Return the full cached keys/values up to current position (as a view)
|
||||
key_view = self.kv_cache[layer_idx, 0, :, :, :t1]
|
||||
value_view = self.kv_cache[layer_idx, 1, :, :, :t1]
|
||||
# Increment pos after the last layer of the Transformer processes
|
||||
if layer_idx == self.kv_cache.size(0) - 1:
|
||||
self.pos = t1
|
||||
return key_view, value_view
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
@torch.inference_mode()
|
||||
def sample_next_token(logits, rng, temperature=1.0, top_k=None):
|
||||
"""Sample a single next token from given logits of shape (B, vocab_size). Returns (B, 1)."""
|
||||
assert temperature >= 0.0, "temperature must be non-negative"
|
||||
if temperature == 0.0:
|
||||
return torch.argmax(logits, dim=-1, keepdim=True)
|
||||
if top_k is not None:
|
||||
k = min(top_k, logits.size(-1))
|
||||
vals, idx = torch.topk(logits, k, dim=-1)
|
||||
vals = vals / temperature
|
||||
probs = F.softmax(vals, dim=-1)
|
||||
choice = torch.multinomial(probs, num_samples=1, generator=rng)
|
||||
return idx.gather(1, choice)
|
||||
else:
|
||||
logits = logits / temperature
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
return torch.multinomial(probs, num_samples=1, generator=rng)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
class RowState:
|
||||
# Per-row state tracking during generation
|
||||
def __init__(self, current_tokens=None):
|
||||
self.current_tokens = current_tokens or [] # Current token sequence for this row
|
||||
self.forced_tokens = deque() # Queue of tokens to force inject
|
||||
self.in_python_block = False # Whether we are inside a python block
|
||||
self.python_expr_tokens = [] # Tokens of the current python expression
|
||||
self.completed = False # Whether this row has completed generation
|
||||
|
||||
class Engine:
|
||||
|
||||
def __init__(self, model, tokenizer):
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer # needed for tool use
|
||||
|
||||
@torch.inference_mode()
|
||||
def generate(self, tokens, num_samples=1, max_tokens=None, temperature=1.0, top_k=None, seed=42):
|
||||
"""Same as generate, but does single prefill and then clones the KV cache."""
|
||||
assert isinstance(tokens, list) and isinstance(tokens[0], int), "expecting list of ints"
|
||||
device = self.model.get_device()
|
||||
rng = torch.Generator(device=device)
|
||||
rng.manual_seed(seed)
|
||||
|
||||
# Get the special tokens we need to coordinate the tool use state machine
|
||||
get_special = lambda s: self.tokenizer.encode_special(s)
|
||||
python_start = get_special("<|python_start|>")
|
||||
python_end = get_special("<|python_end|>")
|
||||
output_start = get_special("<|output_start|>")
|
||||
output_end = get_special("<|output_end|>")
|
||||
assistant_end = get_special("<|assistant_end|>") # if sampled, ends row
|
||||
bos = self.tokenizer.get_bos_token_id() # if sampled, ends row
|
||||
|
||||
# 1) Run a batch 1 prefill of the prompt tokens
|
||||
m = self.model.config
|
||||
kv_model_kwargs = {"num_heads": m.n_kv_head, "head_dim": m.n_embd // m.n_head, "num_layers": m.n_layer}
|
||||
kv_cache_prefill = KVCache(
|
||||
batch_size=1,
|
||||
seq_len=len(tokens),
|
||||
**kv_model_kwargs,
|
||||
)
|
||||
ids = torch.tensor([tokens], dtype=torch.long, device=device)
|
||||
logits = self.model.forward(ids, kv_cache=kv_cache_prefill)
|
||||
logits = logits[:, -1, :]
|
||||
next_ids = sample_next_token(logits, rng, temperature, top_k) # (B, 1)
|
||||
sampled_tokens = next_ids[:, 0].tolist()
|
||||
|
||||
# 2) Replicate the KV cache for each sample/row
|
||||
kv_length_hint = (len(tokens) + max_tokens) if max_tokens is not None else self.model.config.sequence_len
|
||||
kv_cache_decode = KVCache(
|
||||
batch_size=num_samples,
|
||||
seq_len=kv_length_hint,
|
||||
**kv_model_kwargs,
|
||||
)
|
||||
kv_cache_decode.prefill(kv_cache_prefill)
|
||||
del kv_cache_prefill # no need to keep this memory around
|
||||
|
||||
# 3) Initialize states for each sample
|
||||
row_states = [RowState(tokens.copy()) for _ in range(num_samples)]
|
||||
|
||||
# 4) Main generation loop
|
||||
num_generated = 0
|
||||
first_iteration = True
|
||||
while True:
|
||||
# Stop condition: we've reached max tokens
|
||||
if max_tokens is not None and num_generated >= max_tokens:
|
||||
break
|
||||
# Stop condition: all rows are completed
|
||||
if all(state.completed for state in row_states):
|
||||
break
|
||||
|
||||
# Get sampled tokens - either from prefill or from forward pass
|
||||
if first_iteration:
|
||||
# Use the tokens we already sampled from prefill
|
||||
sampled_tokens = [sampled_tokens[0]] * num_samples # Broadcast first token to all rows
|
||||
# TODO: we should sample a token for each row instead of broadcasting
|
||||
first_iteration = False
|
||||
else:
|
||||
# Forward the model and get the next token for each row
|
||||
logits = self.model.forward(ids, kv_cache=kv_cache_decode) # (B, T, vocab_size)
|
||||
logits = logits[:, -1, :] # (B, vocab_size) at last time step
|
||||
next_ids = sample_next_token(logits, rng, temperature, top_k) # (B, 1)
|
||||
sampled_tokens = next_ids[:, 0].tolist()
|
||||
|
||||
# Process each row: choose the next token, update state, optional tool use
|
||||
token_column = [] # contains the next token id along each row
|
||||
token_masks = [] # contains the mask (was it sampled (1) or forced (0)?) along each row
|
||||
for i, state in enumerate(row_states):
|
||||
# Select the next token in this row
|
||||
is_forced = len(state.forced_tokens) > 0 # are there tokens waiting to be forced in deque?
|
||||
token_masks.append(0 if is_forced else 1) # mask is 0 if forced, 1 if sampled
|
||||
next_token = state.forced_tokens.popleft() if is_forced else sampled_tokens[i]
|
||||
token_column.append(next_token)
|
||||
# Update the state of this row to include the next token
|
||||
state.current_tokens.append(next_token)
|
||||
# On <|assistant_end|> or <|bos|>, mark the row as completed
|
||||
if next_token == assistant_end or next_token == bos:
|
||||
state.completed = True
|
||||
# Handle tool logic
|
||||
if next_token == python_start:
|
||||
state.in_python_block = True
|
||||
state.python_expr_tokens = []
|
||||
elif next_token == python_end and state.in_python_block:
|
||||
state.in_python_block = False
|
||||
if state.python_expr_tokens:
|
||||
expr = self.tokenizer.decode(state.python_expr_tokens)
|
||||
result = use_calculator(expr)
|
||||
if result is not None:
|
||||
result_tokens = self.tokenizer.encode(str(result))
|
||||
state.forced_tokens.append(output_start)
|
||||
state.forced_tokens.extend(result_tokens)
|
||||
state.forced_tokens.append(output_end)
|
||||
state.python_expr_tokens = []
|
||||
elif state.in_python_block:
|
||||
state.python_expr_tokens.append(next_token)
|
||||
|
||||
# Yield the token column
|
||||
yield token_column, token_masks
|
||||
num_generated += 1
|
||||
# Prepare ids for next iteration
|
||||
ids = torch.tensor(token_column, dtype=torch.long, device=device).unsqueeze(1)
|
||||
|
||||
def generate_batch(self, tokens, num_samples=1, **kwargs):
|
||||
"""
|
||||
Non-streaming batch generation that just returns the final token sequences.
|
||||
Returns a list of token sequences (list of lists of ints).
|
||||
Terminal tokens (assistant_end, bos) are not included in the results.
|
||||
"""
|
||||
assistant_end = self.tokenizer.encode_special("<|assistant_end|>")
|
||||
bos = self.tokenizer.get_bos_token_id()
|
||||
results = [tokens.copy() for _ in range(num_samples)]
|
||||
masks = [[0] * len(tokens) for _ in range(num_samples)]
|
||||
completed = [False] * num_samples
|
||||
for token_column, token_masks in self.generate(tokens, num_samples, **kwargs):
|
||||
for i, (token, mask) in enumerate(zip(token_column, token_masks)):
|
||||
if not completed[i]:
|
||||
if token == assistant_end or token == bos:
|
||||
completed[i] = True
|
||||
else:
|
||||
results[i].append(token)
|
||||
masks[i].append(mask)
|
||||
# Stop if all rows are completed
|
||||
if all(completed):
|
||||
break
|
||||
return results, masks
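# A minimal usage sketch (illustrative, mirroring the __main__ test below): sample two
# completions for one prompt with the non-streaming API.
#
#   engine = Engine(model, tokenizer)
#   prompt = tokenizer.encode("The chemical formula of water is", prepend=tokenizer.get_bos_token_id())
#   results, masks = engine.generate_batch(prompt, num_samples=2, max_tokens=32, temperature=0.8)
#   completions = [tokenizer.decode(seq) for seq in results]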
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
"""
|
||||
Quick inline test to make sure that the naive/slow model.generate function
|
||||
is equivalent to the faster Engine.generate function here.
|
||||
"""
|
||||
import time
|
||||
# init compute
|
||||
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
|
||||
device_type = autodetect_device_type()
|
||||
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
|
||||
|
||||
# load the model and tokenizer
|
||||
model, tokenizer, meta = load_model("base", device, phase="eval")
|
||||
bos_token_id = tokenizer.get_bos_token_id()
|
||||
# common hyperparameters
|
||||
kwargs = dict(max_tokens=64, temperature=0.0)
|
||||
# set the starting prompt
|
||||
prompt_tokens = tokenizer.encode("The chemical formula of water is", prepend=bos_token_id)
|
||||
# generate the reference sequence using the model.generate() function
|
||||
generated_tokens = []
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.time()
|
||||
stream = model.generate(prompt_tokens, **kwargs)
|
||||
with autocast_ctx:
|
||||
for token in stream:
|
||||
generated_tokens.append(token)
|
||||
chunk = tokenizer.decode([token])
|
||||
print(chunk, end="", flush=True)
|
||||
print()
|
||||
torch.cuda.synchronize()
|
||||
t1 = time.time()
|
||||
print(f"Reference time: {t1 - t0:.2f}s")
|
||||
reference_ids = generated_tokens
|
||||
# generate tokens with Engine
|
||||
generated_tokens = []
|
||||
engine = Engine(model, tokenizer)
|
||||
stream = engine.generate(prompt_tokens, num_samples=1, **kwargs) # note: compute happens lazily under autocast_ctx below
|
||||
torch.cuda.synchronize()
|
||||
t0 = time.time()
|
||||
with autocast_ctx:
|
||||
for token_column, token_masks in stream:
|
||||
token = token_column[0] # only print out the first row
|
||||
generated_tokens.append(token)
|
||||
chunk = tokenizer.decode([token])
|
||||
print(chunk, end="", flush=True)
|
||||
print()
|
||||
torch.cuda.synchronize()
|
||||
t1 = time.time()
|
||||
print(f"Engine time: {t1 - t0:.2f}s")
|
||||
# compare the two sequences
|
||||
for i in range(len(reference_ids)):
|
||||
if reference_ids[i] != generated_tokens[i]:
|
||||
print(f"Mismatch at {i}: {reference_ids[i]} != {generated_tokens[i]}")
|
||||
break
|
||||
print(f"Match: {reference_ids == generated_tokens}")
|
||||
349
nanochat_moe/execution.py
Normal file
@ -0,0 +1,349 @@
"""
|
||||
Sandboxed execution utilities for running Python code that comes out of an LLM.
|
||||
Adapted from OpenAI HumanEval code:
|
||||
https://github.com/openai/human-eval/blob/master/human_eval/execution.py
|
||||
|
||||
What is covered:
|
||||
- Each execution runs in its own process (can be killed if it hangs or crashes)
|
||||
- Execution is limited by a timeout to stop infinite loops
|
||||
- Memory limits are enforced by default (256MB)
|
||||
- stdout and stderr are captured and returned
|
||||
- Code runs in a temporary directory that is deleted afterwards
|
||||
- Dangerous functions are disabled (examples: os.system, os.kill, shutil.rmtree, subprocess.Popen)
|
||||
|
||||
What is not covered:
|
||||
- Not a true security sandbox
|
||||
- Network access is not blocked (e.g. sockets could be opened)
|
||||
- Python's dynamic features (e.g. ctypes) could bypass restrictions
|
||||
- No kernel-level isolation (no seccomp, no containers, no virtualization)
|
||||
|
||||
Overall this sandbox is good for evaluation of generated code and protects against
|
||||
accidental destructive behavior, but it is not safe against malicious adversarial code.
|
||||
"""
|
||||
|
||||
import contextlib
|
||||
import faulthandler
|
||||
import io
|
||||
import multiprocessing
|
||||
import os
|
||||
import platform
|
||||
import signal
|
||||
import tempfile
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
@dataclass
|
||||
class ExecutionResult:
|
||||
"""Result of executing Python code in a sandbox."""
|
||||
success: bool
|
||||
stdout: str
|
||||
stderr: str
|
||||
error: Optional[str] = None
|
||||
timeout: bool = False
|
||||
memory_exceeded: bool = False
|
||||
|
||||
def __repr__(self):
|
||||
parts = []
|
||||
parts.append(f"ExecutionResult(success={self.success}")
|
||||
if self.timeout:
|
||||
parts.append(", timeout=True")
|
||||
if self.memory_exceeded:
|
||||
parts.append(", memory_exceeded=True")
|
||||
if self.error:
|
||||
parts.append(f", error={self.error!r}")
|
||||
if self.stdout:
|
||||
parts.append(f", stdout={self.stdout!r}")
|
||||
if self.stderr:
|
||||
parts.append(f", stderr={self.stderr!r}")
|
||||
parts.append(")")
|
||||
return "".join(parts)
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def time_limit(seconds: float):
|
||||
def signal_handler(signum, frame):
|
||||
raise TimeoutException("Timed out!")
|
||||
|
||||
signal.setitimer(signal.ITIMER_REAL, seconds)
|
||||
signal.signal(signal.SIGALRM, signal_handler)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
signal.setitimer(signal.ITIMER_REAL, 0)
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def capture_io():
|
||||
"""Capture stdout and stderr, and disable stdin."""
|
||||
stdout_capture = io.StringIO()
|
||||
stderr_capture = io.StringIO()
|
||||
stdin_block = WriteOnlyStringIO()
|
||||
with contextlib.redirect_stdout(stdout_capture):
|
||||
with contextlib.redirect_stderr(stderr_capture):
|
||||
with redirect_stdin(stdin_block):
|
||||
yield stdout_capture, stderr_capture
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def create_tempdir():
|
||||
with tempfile.TemporaryDirectory() as dirname:
|
||||
with chdir(dirname):
|
||||
yield dirname
|
||||
|
||||
|
||||
class TimeoutException(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class WriteOnlyStringIO(io.StringIO):
|
||||
"""StringIO that throws an exception when it's read from"""
|
||||
|
||||
def read(self, *args, **kwargs):
|
||||
raise IOError
|
||||
|
||||
def readline(self, *args, **kwargs):
|
||||
raise IOError
|
||||
|
||||
def readlines(self, *args, **kwargs):
|
||||
raise IOError
|
||||
|
||||
def readable(self, *args, **kwargs):
|
||||
"""Returns True if the IO object can be read."""
|
||||
return False
|
||||
|
||||
|
||||
class redirect_stdin(contextlib._RedirectStream): # type: ignore
|
||||
_stream = "stdin"
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def chdir(root):
|
||||
if root == ".":
|
||||
yield
|
||||
return
|
||||
cwd = os.getcwd()
|
||||
os.chdir(root)
|
||||
try:
|
||||
yield
|
||||
finally:
|
||||
os.chdir(cwd)
|
||||
|
||||
|
||||
def reliability_guard(maximum_memory_bytes: Optional[int] = None):
|
||||
"""
|
||||
This disables various destructive functions and prevents the generated code
|
||||
from interfering with the test (e.g. fork bomb, killing other processes,
|
||||
removing filesystem files, etc.)
|
||||
|
||||
WARNING
|
||||
This function is NOT a security sandbox. Untrusted code, including model-
|
||||
generated code, should not be blindly executed outside of one. See the
|
||||
Codex paper for more information about OpenAI's code sandbox, and proceed
|
||||
with caution.
|
||||
"""
|
||||
|
||||
if maximum_memory_bytes is not None and platform.uname().system != "Darwin":
|
||||
# These resource limit calls fail on macOS (Darwin), so we skip them there
|
||||
import resource
|
||||
resource.setrlimit(resource.RLIMIT_AS, (maximum_memory_bytes, maximum_memory_bytes))
|
||||
resource.setrlimit(resource.RLIMIT_DATA, (maximum_memory_bytes, maximum_memory_bytes))
|
||||
resource.setrlimit(resource.RLIMIT_STACK, (maximum_memory_bytes, maximum_memory_bytes))
|
||||
|
||||
faulthandler.disable()
|
||||
|
||||
import builtins
|
||||
|
||||
builtins.exit = None
|
||||
builtins.quit = None
|
||||
|
||||
import os
|
||||
|
||||
os.environ["OMP_NUM_THREADS"] = "1"
|
||||
|
||||
os.kill = None
|
||||
os.system = None
|
||||
os.putenv = None
|
||||
os.remove = None
|
||||
os.removedirs = None
|
||||
os.rmdir = None
|
||||
os.fchdir = None
|
||||
os.setuid = None
|
||||
os.fork = None
|
||||
os.forkpty = None
|
||||
os.killpg = None
|
||||
os.rename = None
|
||||
os.renames = None
|
||||
os.truncate = None
|
||||
os.replace = None
|
||||
os.unlink = None
|
||||
os.fchmod = None
|
||||
os.fchown = None
|
||||
os.chmod = None
|
||||
os.chown = None
|
||||
os.chroot = None
|
||||
os.fchdir = None
|
||||
os.lchflags = None
|
||||
os.lchmod = None
|
||||
os.lchown = None
|
||||
os.getcwd = None
|
||||
os.chdir = None
|
||||
|
||||
import shutil
|
||||
|
||||
shutil.rmtree = None
|
||||
shutil.move = None
|
||||
shutil.chown = None
|
||||
|
||||
import subprocess
|
||||
|
||||
subprocess.Popen = None # type: ignore
|
||||
|
||||
__builtins__["help"] = None
|
||||
|
||||
import sys
|
||||
|
||||
sys.modules["ipdb"] = None
|
||||
sys.modules["joblib"] = None
|
||||
sys.modules["resource"] = None
|
||||
sys.modules["psutil"] = None
|
||||
sys.modules["tkinter"] = None
|
||||
|
||||
|
||||
def _unsafe_execute(code: str, timeout: float, maximum_memory_bytes: Optional[int], result_dict):
|
||||
"""Execute code in a subprocess with safety guards. Results are written to result_dict."""
|
||||
with create_tempdir():
|
||||
|
||||
# These system calls are needed when cleaning up tempdir.
|
||||
import os
|
||||
import shutil
|
||||
|
||||
rmtree = shutil.rmtree
|
||||
rmdir = os.rmdir
|
||||
chdir = os.chdir
|
||||
unlink = os.unlink
|
||||
|
||||
# Disable functionalities that can make destructive changes to the test.
|
||||
reliability_guard(maximum_memory_bytes=maximum_memory_bytes)
|
||||
|
||||
# Default to failure
|
||||
result_dict.update({
|
||||
"success": False,
|
||||
"stdout": "",
|
||||
"stderr": "",
|
||||
"timeout": False,
|
||||
"memory_exceeded": False,
|
||||
"error": None,
|
||||
})
|
||||
|
||||
try:
|
||||
exec_globals = {}
|
||||
with capture_io() as (stdout_capture, stderr_capture):
|
||||
with time_limit(timeout):
|
||||
# WARNING
|
||||
# This program exists to execute untrusted model-generated code. Although
|
||||
# it is highly unlikely that model-generated code will do something overtly
|
||||
# malicious in response to this test suite, model-generated code may act
|
||||
# destructively due to a lack of model capability or alignment.
|
||||
# Users are strongly encouraged to sandbox this evaluation suite so that it
|
||||
# does not perform destructive actions on their host or network. For more
|
||||
# information on how OpenAI sandboxes its code, see the accompanying paper.
|
||||
# Once you have read this disclaimer and taken appropriate precautions,
|
||||
# note that the exec() call below is left enabled in this project; proceed at your own risk:
|
||||
exec(code, exec_globals)
|
||||
|
||||
result_dict.update({
|
||||
"success": True,
|
||||
"stdout": stdout_capture.getvalue(),
|
||||
"stderr": stderr_capture.getvalue(),
|
||||
})
|
||||
|
||||
except TimeoutException:
|
||||
result_dict.update({
|
||||
"timeout": True,
|
||||
"error": "Execution timed out",
|
||||
})
|
||||
|
||||
except MemoryError as e:
|
||||
result_dict.update({
|
||||
"memory_exceeded": True,
|
||||
"error": f"Memory limit exceeded: {e}",
|
||||
})
|
||||
|
||||
except BaseException as e:
|
||||
result_dict.update({
|
||||
"error": f"{type(e).__name__}: {e}",
|
||||
})
|
||||
|
||||
# Needed for cleaning up.
|
||||
shutil.rmtree = rmtree
|
||||
os.rmdir = rmdir
|
||||
os.chdir = chdir
|
||||
os.unlink = unlink
|
||||
|
||||
|
||||
def execute_code(
|
||||
code: str,
|
||||
timeout: float = 5.0, # 5 seconds default
|
||||
maximum_memory_bytes: Optional[int] = 256 * 1024 * 1024, # 256MB default
|
||||
) -> ExecutionResult:
|
||||
"""
|
||||
Execute Python code in a sandboxed environment.
|
||||
|
||||
Args:
|
||||
code: Python code to execute as a string
|
||||
timeout: Maximum execution time in seconds (default: 5.0)
|
||||
maximum_memory_bytes: Memory limit in bytes (default: 256MB, None to disable)
|
||||
|
||||
Returns:
|
||||
ExecutionResult with success status, stdout/stderr, and error information
|
||||
|
||||
Example:
|
||||
>>> result = execute_code("print('hello world')")
|
||||
>>> result.success
|
||||
True
|
||||
>>> result.stdout
|
||||
'hello world\\n'
|
||||
"""
|
||||
|
||||
manager = multiprocessing.Manager()
|
||||
result_dict = manager.dict()
|
||||
|
||||
p = multiprocessing.Process(
|
||||
target=_unsafe_execute,
|
||||
args=(code, timeout, maximum_memory_bytes, result_dict)
|
||||
)
|
||||
p.start()
|
||||
p.join(timeout=timeout + 1)
|
||||
|
||||
if p.is_alive():
|
||||
p.kill()
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
stdout="",
|
||||
stderr="",
|
||||
error="Execution timed out (process killed)",
|
||||
timeout=True,
|
||||
memory_exceeded=False,
|
||||
)
|
||||
|
||||
if not result_dict:
|
||||
return ExecutionResult(
|
||||
success=False,
|
||||
stdout="",
|
||||
stderr="",
|
||||
error="Execution failed (no result returned)",
|
||||
timeout=True,
|
||||
memory_exceeded=False,
|
||||
)
|
||||
|
||||
return ExecutionResult(
|
||||
success=result_dict["success"],
|
||||
stdout=result_dict["stdout"],
|
||||
stderr=result_dict["stderr"],
|
||||
error=result_dict["error"],
|
||||
timeout=result_dict["timeout"],
|
||||
memory_exceeded=result_dict["memory_exceeded"],
|
||||
)
|
||||
|
||||
622
nanochat_moe/gpt.py
Normal file
@ -0,0 +1,622 @@
"""
|
||||
GPT model (rewrite, a lot simpler)
|
||||
Notable features:
|
||||
- rotary embeddings (and no positional embeddings)
|
||||
- QK norm
|
||||
- untied weights for token embedding and lm_head
|
||||
- relu^2 activation in MLP
|
||||
- norm after token embedding
|
||||
- no learnable params in rmsnorm
|
||||
- no bias in linear layers
|
||||
- Group-Query Attention (GQA) support for more efficient inference
|
||||
"""
|
||||
|
||||
import math
|
||||
from functools import partial
|
||||
from dataclasses import dataclass
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from nanochat.common import get_dist_info, print0
|
||||
from nanochat_moe.muon import Muon, DistMuon
|
||||
from nanochat_moe.adamw import DistAdamW
|
||||
# ========== MOE ADDITION START ==========
|
||||
from nanochat_moe.manager import MANAGER
|
||||
from contextlib import nullcontext
|
||||
# ========== MOE ADDITION END ==========
|
||||
|
||||
@dataclass
|
||||
class GPTConfig:
|
||||
sequence_len: int = 1024
|
||||
vocab_size: int = 50304
|
||||
n_layer: int = 6
|
||||
n_head: int = 6 # number of query heads
|
||||
n_kv_head: int = 6 # number of key/value heads (GQA)
|
||||
n_embd: int = 384
|
||||
|
||||
# ========== MOE ADDITION START ==========
|
||||
# MoE-related configs (added for MoE support)
|
||||
n_exp: int = 8 # if n_exp = 1 we just use regular MLP layers
|
||||
top_k: int = 2 # number of active experts
|
||||
use_aux_loss: bool = True # apply auxiliary loss (from Switch Transformer)
|
||||
use_router_z_loss: bool = True # apply router z loss (from ST-MoE)
|
||||
use_noisy_top_k: bool = False
|
||||
aux_loss_weight: float = 0.01 # default from Switch Transformer
|
||||
router_z_loss_weight: float = 0.001 # default from ST-MoE
|
||||
train_capacity: float = 1.25 # default from ST-MoE
|
||||
eval_capacity: float = 2.0
|
||||
min_capacity: int = 4 # minimum batch size per expert
|
||||
stride: int = 2 # one in every stride layers uses MoE
|
||||
use_switch_tfm_init: bool = True # use weight init scheme from Switch Transformer
|
||||
switch_tfm_init_scale: float = 1.0
|
||||
router_use_full_prec: bool = True # use float32 in router
|
||||
# ========== MOE ADDITION END ==========
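# Illustration with the defaults above: since n_exp=8 > 1 and stride=2, GPT.__init__ below
# gives every other block an MOELayer, i.e. layer_idx 0, 2, 4 of the 6 layers are MoE and
# layer_idx 1, 3, 5 keep the dense MLP.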
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def norm(x):
|
||||
# Purely functional rmsnorm with no learnable params
|
||||
return F.rms_norm(x, (x.size(-1),))
|
||||
|
||||
class LayerNorm(nn.Module):
|
||||
""" LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
|
||||
|
||||
def __init__(self, ndim, bias):
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(torch.ones(ndim))
|
||||
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
|
||||
|
||||
def forward(self, input):
|
||||
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
|
||||
|
||||
|
||||
|
||||
def apply_rotary_emb(x, cos, sin):
|
||||
assert x.ndim == 4 # multihead attention
|
||||
d = x.shape[3] // 2
|
||||
x1, x2 = x[..., :d], x[..., d:] # split up the last dim into two halves
|
||||
y1 = x1 * cos + x2 * sin # rotate pairs of dims
|
||||
y2 = x1 * (-sin) + x2 * cos
|
||||
out = torch.cat([y1, y2], 3) # re-assemble
|
||||
out = out.to(x.dtype) # ensure input/output dtypes match
|
||||
return out
|
||||
|
||||
class CausalSelfAttention(nn.Module):
|
||||
def __init__(self, config, layer_idx):
|
||||
super().__init__()
|
||||
self.layer_idx = layer_idx
|
||||
self.n_head = config.n_head
|
||||
self.n_kv_head = config.n_kv_head
|
||||
self.n_embd = config.n_embd
|
||||
self.head_dim = self.n_embd // self.n_head
|
||||
assert self.n_embd % self.n_head == 0
|
||||
assert self.n_kv_head <= self.n_head and self.n_head % self.n_kv_head == 0
|
||||
self.c_q = nn.Linear(self.n_embd, self.n_head * self.head_dim, bias=False)
|
||||
self.c_k = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
|
||||
self.c_v = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False)
|
||||
self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False)
|
||||
|
||||
def forward(self, x, cos_sin, kv_cache):
|
||||
B, T, C = x.size()
|
||||
|
||||
# Project the input to get queries, keys, and values
|
||||
q = self.c_q(x).view(B, T, self.n_head, self.head_dim)
|
||||
k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim)
|
||||
v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim)
|
||||
|
||||
# Apply Rotary Embeddings to queries and keys to get relative positional encoding
|
||||
cos, sin = cos_sin
|
||||
q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin) # QK rotary embedding
|
||||
q, k = norm(q), norm(k) # QK norm
|
||||
q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) # make head be batch dim, i.e. (B, T, H, D) -> (B, H, T, D)
|
||||
|
||||
# Apply KV cache: insert current k,v into cache, get the full view so far
|
||||
if kv_cache is not None:
|
||||
k, v = kv_cache.insert_kv(self.layer_idx, k, v)
|
||||
Tq = q.size(2) # number of queries in this forward pass
|
||||
Tk = k.size(2) # number of keys/values in total (in the cache + current forward pass)
|
||||
|
||||
# Attention: queries attend to keys/values autoregressively. A few cases to handle:
|
||||
enable_gqa = self.n_head != self.n_kv_head # Group Query Attention (GQA): duplicate key/value heads to match query heads if desired
|
||||
if kv_cache is None or Tq == Tk:
|
||||
# During training (no KV cache), attend as usual with causal attention
|
||||
# And even if there is KV cache, we can still use this simple version when Tq == Tk
|
||||
y = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=enable_gqa)
|
||||
elif Tq == 1:
|
||||
# During inference but with a single query in this forward pass:
|
||||
# The query has to attend to all the keys/values in the cache
|
||||
y = F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=enable_gqa)
|
||||
else:
|
||||
# During inference AND we have a chunk of queries in this forward pass:
|
||||
# First, each query attends to all the cached keys/values (i.e. full prefix)
|
||||
attn_mask = torch.zeros((Tq, Tk), dtype=torch.bool, device=q.device) # True = keep, False = mask
|
||||
prefix_len = Tk - Tq
|
||||
if prefix_len > 0: # can't be negative but could be zero
|
||||
attn_mask[:, :prefix_len] = True
|
||||
# Then, causal attention within this chunk
|
||||
attn_mask[:, prefix_len:] = torch.tril(torch.ones((Tq, Tq), dtype=torch.bool, device=q.device))
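# Worked example: with Tq=2 new queries after a prefix of 3 cached tokens (Tk=5), the mask
# rows are [1,1,1,1,0] and [1,1,1,1,1]: both queries see the whole prefix, and causality
# only applies within the new chunk.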
|
||||
y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, enable_gqa=enable_gqa)
|
||||
|
||||
# Re-assemble the heads side by side and project back to residual stream
|
||||
y = y.transpose(1, 2).contiguous().view(B, T, -1)
|
||||
y = self.c_proj(y)
|
||||
return y
|
||||
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=False)
|
||||
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=False)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.c_fc(x)
|
||||
x = F.relu(x).square()
|
||||
x = self.c_proj(x)
|
||||
return x
|
||||
|
||||
# ========== MOE ADDITION START ==========
|
||||
# All MoE-related classes below are NEW additions
|
||||
class Router(nn.Module):
|
||||
"""Router for MoE layer - selects top-k experts for each token"""
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.top_k = config.top_k
|
||||
self.n_exp = config.n_exp
|
||||
assert self.top_k >= 1 and self.top_k <= config.n_exp
|
||||
self.use_noisy_top_k = config.use_noisy_top_k
|
||||
self.train_capacity = config.train_capacity
|
||||
self.eval_capacity = config.eval_capacity
|
||||
self.min_capacity = config.min_capacity
|
||||
self.router_use_full_prec = config.router_use_full_prec
|
||||
self.use_aux_loss = config.use_aux_loss
|
||||
self.use_router_z_loss = config.use_router_z_loss
|
||||
|
||||
# linear projection for (noisy) softmax gating
|
||||
self.w_g = nn.Linear(config.n_embd, config.n_exp, bias=False)
|
||||
self.w_noise = nn.Linear(config.n_embd, config.n_exp, bias=False) if self.use_noisy_top_k else None
|
||||
|
||||
def forward(self, x):
|
||||
# Save input dtype to ensure output matches
|
||||
input_dtype = x.dtype
|
||||
device_type = 'cuda' if torch.cuda.is_available() else 'cpu'
|
||||
|
||||
if self.router_use_full_prec:
|
||||
# When using full precision, convert input to float32 explicitly
|
||||
# This ensures compatibility with compiled models
|
||||
x_router = x.to(dtype=torch.float32)
|
||||
ctx = torch.amp.autocast(device_type=device_type, enabled=False)
|
||||
else:
|
||||
x_router = x
|
||||
ctx = nullcontext()
|
||||
|
||||
with ctx:
|
||||
B, T, _ = x_router.size()
|
||||
num_tokens = B * T
|
||||
|
||||
# router logits
|
||||
logits = self.w_g(x_router) # [B, T, n_exp]
|
||||
if self.use_noisy_top_k:
|
||||
noise = F.softplus(self.w_noise(x_router))
|
||||
noise *= torch.randn_like(noise)
|
||||
logits += noise
|
||||
|
||||
# router z loss
|
||||
if self.use_router_z_loss:
|
||||
z_loss = torch.logsumexp(logits, dim=-1) ** 2.0
|
||||
MANAGER.add_router_z_loss(torch.mean(z_loss))
|
||||
|
||||
# find top k experts
|
||||
top_k_logits, top_k_indices = logits.topk(self.top_k, dim=-1) # [B, T, k]
|
||||
|
||||
# normalize expert probabilities (only over top-k)
|
||||
router_probs = torch.full_like(logits, float('-inf'))
|
||||
router_probs.scatter_(-1, top_k_indices, top_k_logits)
|
||||
router_probs = F.softmax(router_probs, dim=-1)
|
||||
|
||||
# auxiliary load balancing loss
|
||||
if self.use_aux_loss:
|
||||
with torch.no_grad():
|
||||
one_hot_indices = F.one_hot(top_k_indices, num_classes=self.n_exp) # [B, T, k, n_exp]
|
||||
one_hot_indices = torch.sum(one_hot_indices.float(), dim=2) # [B, T, n_exp]
|
||||
tokens_per_expert = torch.mean(one_hot_indices.float(), dim=(0, 1))
|
||||
prob_per_expert = torch.mean(router_probs.float(), dim=(0, 1))
|
||||
aux_loss = self.n_exp * torch.sum(prob_per_expert * tokens_per_expert)
|
||||
MANAGER.add_aux_loss(aux_loss)
|
||||
|
||||
# compute expert capacity
|
||||
exp_capacity = self.get_capacity(num_tokens)
|
||||
|
||||
# make multi-hot mask of chosen experts
|
||||
exp_mask = F.one_hot(top_k_indices, num_classes=self.n_exp) # [B, T, k, n_exp]
|
||||
exp_mask = exp_mask.view(num_tokens, self.top_k, self.n_exp)
|
||||
exp_mask = exp_mask.permute(1, 0, 2) # [k, B * T, n_exp]
|
||||
|
||||
# compute cumulative sum for expert ranking
|
||||
exp_rank = exp_mask.reshape(self.top_k * num_tokens, self.n_exp)
|
||||
exp_rank = torch.cumsum(exp_rank, dim=0) - 1
|
||||
exp_rank = exp_rank.reshape(self.top_k, num_tokens, self.n_exp)
|
||||
|
||||
# mask out entries beyond expert capacity
|
||||
exp_mask *= torch.lt(exp_rank, exp_capacity)
|
||||
used_capacity = torch.sum(exp_mask, dim=(0, 1))
|
||||
|
||||
# mask rank to only include selected tokens
|
||||
exp_rank = torch.sum(exp_mask * exp_rank, dim=-1) # [k, B * T]
|
||||
|
||||
# mask probabilities to only include selected experts
|
||||
router_probs = router_probs.view(num_tokens, self.n_exp)[None, :] # [1, B * T, n_exp]
|
||||
exp_weights = exp_mask * router_probs # [k, B * T, n_exp]
|
||||
|
||||
# convert rank to one-hot vectors
|
||||
exp_rank_sc = F.one_hot(exp_rank, num_classes=exp_capacity) # [k, B * T, exp_capacity]
|
||||
|
||||
# create weight matrix [B * T, n_exp, exp_capacity]
|
||||
cb_weight = torch.sum(exp_weights.unsqueeze(3) * exp_rank_sc.unsqueeze(2), dim=0)
|
||||
sec_mask = cb_weight.bool()
|
||||
|
||||
# Note: cb_weight and sec_mask remain in their computed dtype
|
||||
# They will be converted to match input dtype in MOELayer using type_as()
|
||||
return used_capacity, cb_weight, sec_mask
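# Shapes of the returned tensors (per the construction above): used_capacity is [n_exp]
# (how many capacity slots each expert actually received), cb_weight is
# [B*T, n_exp, exp_capacity] holding the router probability for each (token, expert, slot),
# and sec_mask is its boolean support, used to dispatch tokens in MOELayer.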
|
||||
|
||||
def get_capacity(self, tokens_per_batch):
|
||||
capacity_factor = self.train_capacity if self.training else self.eval_capacity
|
||||
capacity = math.floor(self.top_k * capacity_factor * tokens_per_batch / self.n_exp)
|
||||
capacity += capacity % 2 # make even
|
||||
capacity = max(capacity, self.min_capacity)
|
||||
assert capacity > 0
|
||||
return int(capacity)
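# Worked example (illustrative batch size): with the defaults top_k=2, train_capacity=1.25,
# n_exp=8 and B*T = 8192 tokens, capacity = floor(2 * 1.25 * 8192 / 8) = 2560, which is
# already even and above min_capacity, so each expert gets at most 2560 token slots.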
|
||||
|
||||
|
||||
class MLPExperts(nn.Module):
|
||||
"""Multiple MLP experts - adapted for nanochat (relu^2, no bias)"""
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
# no bias, using relu^2 activation like nanochat
|
||||
self.c_fc = nn.Parameter(torch.empty(config.n_exp, config.n_embd, 4 * config.n_embd))
|
||||
self.c_proj = nn.Parameter(torch.empty(config.n_exp, 4 * config.n_embd, config.n_embd))
|
||||
|
||||
def forward(self, x):
|
||||
# x: [n_exp, exp_capacity, n_embd]
|
||||
x = torch.bmm(x, self.c_fc) # [n_exp, exp_capacity, 4*n_embd]
|
||||
x = F.relu(x).square() # relu^2 activation (nanochat style)
|
||||
x = torch.bmm(x, self.c_proj) # [n_exp, exp_capacity, n_embd]
|
||||
return x
|
||||
|
||||
|
||||
class MOELayer(nn.Module):
|
||||
"""MoE layer combining Router and MLPExperts"""
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.router = Router(config)
|
||||
self.experts = MLPExperts(config)
|
||||
|
||||
def forward(self, x):
|
||||
B, T, n_embd = x.size()
|
||||
num_tokens = B * T
|
||||
|
||||
# route tokens to experts
|
||||
used_capacity, exp_weight, exp_mask = self.router(x)
|
||||
|
||||
# flatten input
|
||||
x = x.view(num_tokens, n_embd)
|
||||
|
||||
# reshape tokens into batches for each expert
|
||||
# [n_exp, exp_capacity, B * T] * [B * T, n_embd] -> [n_exp, exp_capacity, n_embd]
|
||||
# Convert exp_mask to match x dtype (matches nanoMoE approach)
|
||||
exp_batches = exp_mask.permute(1, 2, 0).type_as(x) @ x
|
||||
|
||||
# compute expert output
|
||||
exp_out = self.experts(exp_batches) # [n_exp, exp_capacity, n_embd]
|
||||
|
||||
# aggregate expert outputs based on router weights
|
||||
exp_weight = exp_weight.view(num_tokens, -1) # [B * T, n_exp * exp_capacity]
|
||||
exp_out = exp_out.view(-1, n_embd) # [n_exp * exp_capacity, n_embd]
|
||||
# Ensure exp_weight matches exp_out dtype (matches nanoMoE approach)
|
||||
output = exp_weight.type_as(exp_out) @ exp_out # [B * T, n_embd]
|
||||
|
||||
return output.view(B, T, n_embd)
|
||||
# ========== MOE ADDITION END ==========
|
||||
|
||||
|
||||
class Block(nn.Module):
|
||||
def __init__(self, config, layer_idx, use_moe=False): # ========== MODIFIED: added use_moe parameter
|
||||
super().__init__()
|
||||
self.attn = CausalSelfAttention(config, layer_idx)
|
||||
# ========== MOE ADDITION START ==========
|
||||
if use_moe:
|
||||
self.mlp = MOELayer(config) # Use MoE layer instead of regular MLP
|
||||
else:
|
||||
self.mlp = MLP(config)
|
||||
self.use_moe = use_moe # Track if this block uses MoE
|
||||
# ========== MOE ADDITION END ==========
|
||||
|
||||
def forward(self, x, cos_sin, kv_cache):
|
||||
x = x + self.attn(norm(x), cos_sin, kv_cache)
|
||||
x = x + self.mlp(norm(x))
|
||||
return x
|
||||
|
||||
|
||||
class GPT(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
# ========== MOE MODIFICATION START ==========
|
||||
# Create blocks, using MoE every stride layers if n_exp > 1
|
||||
blocks = []
|
||||
for layer_idx in range(config.n_layer):
|
||||
use_moe = (config.n_exp > 1) and (layer_idx % config.stride == 0)
|
||||
blocks.append(Block(config, layer_idx, use_moe=use_moe))
|
||||
# ========== MOE MODIFICATION END ==========
|
||||
|
||||
self.transformer = nn.ModuleDict({
|
||||
"wte": nn.Embedding(config.vocab_size, config.n_embd),
|
||||
"h": nn.ModuleList(blocks), # ========== MODIFIED: use blocks list instead of list comprehension
|
||||
})
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
# To support meta device initialization, we create the rotary embeddings here, but they are placeholders that init_weights() recomputes on the real device.
|
||||
# As for rotary_seq_len, these rotary embeddings are pretty small/cheap in memory,
|
||||
# so let's just over-compute them, but assert fail if we ever reach that amount.
|
||||
# In the future we can dynamically grow the cache, for now it's fine.
|
||||
self.rotary_seq_len = config.sequence_len * 10 # 10X over-compute should be enough, TODO make nicer?
|
||||
head_dim = config.n_embd // config.n_head
|
||||
cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
|
||||
self.register_buffer("cos", cos, persistent=False) # persistent=False means it's not saved to the checkpoint
|
||||
self.register_buffer("sin", sin, persistent=False)
|
||||
|
||||
def init_weights(self):
|
||||
self.apply(self._init_weights)
|
||||
# zero out classifier weights
|
||||
torch.nn.init.zeros_(self.lm_head.weight)
|
||||
# zero out c_proj weights in all blocks
|
||||
for block in self.transformer.h:
|
||||
# ========== MOE MODIFICATION START ==========
|
||||
if block.use_moe:
|
||||
# For MoE layers, zero out expert c_proj weights
|
||||
torch.nn.init.zeros_(block.mlp.experts.c_proj)
|
||||
else:
|
||||
torch.nn.init.zeros_(block.mlp.c_proj.weight)
|
||||
# ========== MOE MODIFICATION END ==========
|
||||
torch.nn.init.zeros_(block.attn.c_proj.weight)
|
||||
# init the rotary embeddings
|
||||
head_dim = self.config.n_embd // self.config.n_head
|
||||
cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim)
|
||||
self.cos, self.sin = cos, sin
|
||||
# Cast the embeddings from fp32 to bf16: optim can tolerate it and it saves memory: both in the model and the activations
|
||||
if self.transformer.wte.weight.device.type == "cuda":
|
||||
self.transformer.wte.to(dtype=torch.bfloat16)
|
||||
|
||||
def _init_weights(self, module):
|
||||
# ========== SWITCH TRANSFORMER INIT START ==========
|
||||
# Optionally use Switch Transformer-style initialization
|
||||
# See page 10 for switch init explanation: https://arxiv.org/abs/2101.03961
|
||||
use_switch_init = self.config.use_switch_tfm_init
|
||||
switch_scale = self.config.switch_tfm_init_scale
|
||||
# ========== SWITCH TRANSFORMER INIT END ==========
|
||||
|
||||
if isinstance(module, nn.Linear):
|
||||
if use_switch_init:
|
||||
# Switch Transformer initialization: truncated normal with scaled std
|
||||
# linear layers have flipped dimensions in torch: [out_dim, in_dim]
|
||||
fan_in = module.weight.shape[-1]
|
||||
w_std = (switch_scale / fan_in) ** 0.5
|
||||
torch.nn.init.trunc_normal_(
|
||||
module.weight,
|
||||
mean=0.0,
|
||||
std=w_std,
|
||||
a=-2*w_std,
|
||||
b=2*w_std,
|
||||
)
|
||||
else:
|
||||
# Standard nanochat initialization: https://arxiv.org/pdf/2310.17813
|
||||
fan_out = module.weight.size(0)
|
||||
fan_in = module.weight.size(1)
|
||||
std = 1.0 / math.sqrt(fan_in) * min(1.0, math.sqrt(fan_out / fan_in))
|
||||
torch.nn.init.normal_(module.weight, mean=0.0, std=std)
|
||||
# Always initialize bias to zero
|
||||
if module.bias is not None:
|
||||
torch.nn.init.zeros_(module.bias)
|
||||
elif isinstance(module, nn.Embedding):
|
||||
# Always use standard initialization for embeddings
|
||||
torch.nn.init.normal_(module.weight, mean=0.0, std=1.0)
|
||||
# ========== MOE ADDITION START ==========
|
||||
elif isinstance(module, MLPExperts):
|
||||
if use_switch_init:
|
||||
# Switch Transformer initialization for experts
|
||||
# c_fc: [n_exp, n_embd, 4*n_embd]
|
||||
c_fc_fan_in = module.c_fc.shape[-2]
|
||||
c_fc_std = (switch_scale / c_fc_fan_in) ** 0.5
|
||||
torch.nn.init.trunc_normal_(
|
||||
module.c_fc,
|
||||
mean=0.0,
|
||||
std=c_fc_std,
|
||||
a=-2*c_fc_std,
|
||||
b=2*c_fc_std,
|
||||
)
|
||||
# c_proj: [n_exp, 4*n_embd, n_embd]
|
||||
c_proj_fan_in = module.c_proj.shape[-2]
|
||||
c_proj_std = (switch_scale / c_proj_fan_in) ** 0.5
|
||||
torch.nn.init.trunc_normal_(
|
||||
module.c_proj,
|
||||
mean=0.0,
|
||||
std=c_proj_std,
|
||||
a=-2*c_proj_std,
|
||||
b=2*c_proj_std,
|
||||
)
|
||||
else:
|
||||
# Standard nanochat initialization for experts
|
||||
for i in range(module.c_fc.size(0)):
|
||||
# c_fc: [n_exp, n_embd, 4*n_embd]
|
||||
fan_in = module.c_fc.size(1)
|
||||
fan_out = module.c_fc.size(2)
|
||||
std = 1.0 / math.sqrt(fan_in) * min(1.0, math.sqrt(fan_out / fan_in))
|
||||
torch.nn.init.normal_(module.c_fc[i], mean=0.0, std=std)
|
||||
|
||||
# c_proj: [n_exp, 4*n_embd, n_embd]
|
||||
fan_in = module.c_proj.size(1)
|
||||
fan_out = module.c_proj.size(2)
|
||||
std = 1.0 / math.sqrt(fan_in) * min(1.0, math.sqrt(fan_out / fan_in))
|
||||
torch.nn.init.normal_(module.c_proj[i], mean=0.0, std=std)
|
||||
|
||||
# TODO: bump the rotary base theta, e.g. 100K is more common in recent models
|
||||
def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None):
|
||||
# autodetect the device from model embeddings
|
||||
if device is None:
|
||||
device = self.transformer.wte.weight.device
|
||||
# stride the channels
|
||||
channel_range = torch.arange(0, head_dim, 2, dtype=torch.float32, device=device)
|
||||
inv_freq = 1.0 / (base ** (channel_range / head_dim))
|
||||
# stride the time steps
|
||||
t = torch.arange(seq_len, dtype=torch.float32, device=device)
|
||||
# calculate the rotation frequencies at each (time, channel) pair
|
||||
freqs = torch.outer(t, inv_freq)
|
||||
cos, sin = freqs.cos(), freqs.sin()
|
||||
cos, sin = cos.bfloat16(), sin.bfloat16() # keep them in bfloat16
|
||||
cos, sin = cos[None, :, None, :], sin[None, :, None, :] # add batch and head dims for later broadcasting
|
||||
return cos, sin
|
||||
|
||||
def get_device(self):
|
||||
return self.transformer.wte.weight.device
|
||||
|
||||
def estimate_flops(self):
|
||||
""" Return the estimated FLOPs per token for the model. Ref: https://arxiv.org/abs/2204.02311 """
|
||||
nparams = sum(p.numel() for p in self.parameters())
|
||||
nparams_embedding = self.transformer.wte.weight.numel()
|
||||
l, h, q, t = self.config.n_layer, self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len
|
||||
num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t
|
||||
return num_flops_per_token
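# Worked example with the default config (n_layer=6, n_head=6, n_embd=384, sequence_len=1024):
# the attention term is 12 * 6 * 6 * 64 * 1024 = 28,311,552 FLOPs/token (head_dim = 384/6 = 64);
# the 6 * (nparams - nparams_embedding) term depends on the vocab size, and note it counts all
# expert parameters as if they were dense, so it over-estimates active FLOPs for MoE layers.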
|
||||
|
||||
def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0):
|
||||
model_dim = self.config.n_embd
|
||||
ddp, rank, local_rank, world_size = get_dist_info()
|
||||
# Separate out all parameters into groups
|
||||
# ========== MOE MODIFICATION START ==========
|
||||
# Separate MoE expert parameters (3D) from regular matrix parameters (2D)
|
||||
# Muon optimizer only accepts 2D parameters, so we need to filter out 3D (MoE experts) and 1D (bias/norm) params
|
||||
matrix_params = []
|
||||
moe_params = [] # MoE expert parameters are 3D and need AdamW optimizer
|
||||
other_params = [] # 1D parameters (bias, norm weights) also go to AdamW
|
||||
for param in self.transformer.h.parameters():
|
||||
if param.ndim == 3: # MoE expert parameters: [n_exp, ...]
|
||||
moe_params.append(param)
|
||||
elif param.ndim == 2: # Regular 2D matrix parameters for Muon
|
||||
matrix_params.append(param)
|
||||
else: # 1D parameters (bias, norm weights) go to AdamW
|
||||
other_params.append(param)
|
||||
# ========== MOE MODIFICATION END ==========
|
||||
embedding_params = list(self.transformer.wte.parameters())
|
||||
lm_head_params = list(self.lm_head.parameters())
|
||||
# Create the AdamW optimizer for the embedding, lm_head, and MoE experts
|
||||
# Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model)
|
||||
dmodel_lr_scale = (model_dim / 768) ** -0.5
|
||||
if rank == 0:
|
||||
print(f"Scaling the LR for the AdamW parameters ∝1/√({model_dim}/768) = {dmodel_lr_scale:.6f}")
|
||||
if moe_params:
|
||||
print(f"Found {len(moe_params)} MoE expert parameters (3D) to optimize with AdamW")
|
||||
adam_groups = [
|
||||
dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale),
|
||||
dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale),
|
||||
]
|
||||
# ========== MOE MODIFICATION START ==========
|
||||
# Add MoE expert parameters to AdamW optimizer (use matrix_lr for consistency)
|
||||
if moe_params:
|
||||
adam_groups.append(dict(params=moe_params, lr=matrix_lr * dmodel_lr_scale))
|
||||
# ========== MOE MODIFICATION END ==========
|
||||
adamw_kwargs = dict(betas=(0.8, 0.95), eps=1e-10, weight_decay=weight_decay)
|
||||
AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True)
|
||||
adamw_optimizer = AdamWFactory(adam_groups, **adamw_kwargs)
|
||||
# Create the Muon optimizer for the 2D linear layers only
|
||||
muon_kwargs = dict(lr=matrix_lr, momentum=0.95)
|
||||
MuonFactory = DistMuon if ddp else Muon
|
||||
muon_optimizer = MuonFactory(matrix_params, **muon_kwargs)
|
||||
# Combine the two optimizers into one list
|
||||
optimizers = [adamw_optimizer, muon_optimizer]
|
||||
for opt in optimizers:
|
||||
for group in opt.param_groups:
|
||||
group["initial_lr"] = group["lr"]
|
||||
return optimizers
|
||||
|
||||
def forward(self, idx, targets=None, kv_cache=None, loss_reduction='mean'):
|
||||
B, T = idx.size()
|
||||
|
||||
# Grab the rotary embeddings for the current sequence length (they are of shape (1, seq_len, 1, head_dim/2))
|
||||
assert T <= self.cos.size(1), f"Sequence length grew beyond the rotary embeddings cache: {T} > {self.cos.size(1)}"
|
||||
assert idx.device == self.cos.device, f"Rotary embeddings and idx are on different devices: {idx.device} != {self.cos.device}"
|
||||
assert self.cos.dtype == torch.bfloat16, "Rotary embeddings must be in bfloat16"
|
||||
# if kv cache exists, we need to offset the rotary embeddings to the current position in the cache
|
||||
T0 = 0 if kv_cache is None else kv_cache.get_pos()
|
||||
cos_sin = self.cos[:, T0:T0+T], self.sin[:, T0:T0+T] # truncate cache to current sequence length
|
||||
|
||||
# Forward the trunk of the Transformer
|
||||
x = self.transformer.wte(idx)
|
||||
x = norm(x)
|
||||
for block in self.transformer.h:
|
||||
x = block(x, cos_sin, kv_cache)
|
||||
x = norm(x)
|
||||
|
||||
# Forward the lm_head (compute logits)
|
||||
softcap = 15
|
||||
if targets is not None:
|
||||
# training mode: compute and return the loss
|
||||
# TODO: experiment with Liger Kernels / chunked cross-entropy etc.
|
||||
logits = self.lm_head(x)
|
||||
logits = softcap * torch.tanh(logits / softcap) # logits softcap
|
||||
logits = logits.float() # use tf32/fp32 for logits
|
||||
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction)
|
||||
|
||||
# ========== MOE ADDITION START ==========
|
||||
# Add MoE auxiliary losses if enabled
|
||||
if self.config.n_exp > 1:
|
||||
if self.config.use_aux_loss:
|
||||
aux_loss = MANAGER.aggregate_aux_loss()
|
||||
if isinstance(aux_loss, torch.Tensor) and aux_loss.numel() > 0:
|
||||
loss = loss + self.config.aux_loss_weight * aux_loss
|
||||
MANAGER.reset_aux_loss()
|
||||
if self.config.use_router_z_loss:
|
||||
router_z_loss = MANAGER.aggregate_router_z_loss()
|
||||
if isinstance(router_z_loss, torch.Tensor) and router_z_loss.numel() > 0:
|
||||
loss = loss + self.config.router_z_loss_weight * router_z_loss
|
||||
MANAGER.reset_router_z_loss()
|
||||
# ========== MOE ADDITION END ==========
|
||||
|
||||
return loss
|
||||
else:
|
||||
# inference mode: compute and return the logits
|
||||
logits = self.lm_head(x)
|
||||
logits = softcap * torch.tanh(logits / softcap) # logits softcap
|
||||
return logits
|
||||
|
||||
@torch.inference_mode()
|
||||
def generate(self, tokens, max_tokens, temperature=1.0, top_k=None, seed=42):
|
||||
"""
|
||||
Naive autoregressive streaming inference.
|
||||
To make it super simple, let's assume:
|
||||
- batch size is 1
|
||||
- ids and the yielded tokens are simple Python lists and ints
|
||||
"""
|
||||
assert isinstance(tokens, list)
|
||||
device = self.get_device()
|
||||
rng = None
|
||||
if temperature > 0:
|
||||
rng = torch.Generator(device=device)
|
||||
rng.manual_seed(seed)
|
||||
ids = torch.tensor([tokens], dtype=torch.long, device=device) # add batch dim
|
||||
for _ in range(max_tokens):
|
||||
logits = self.forward(ids) # (B, T, vocab_size)
|
||||
logits = logits[:, -1, :] # (B, vocab_size)
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
||||
logits[logits < v[:, [-1]]] = -float('Inf')
|
||||
if temperature > 0:
|
||||
logits = logits / temperature
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
next_ids = torch.multinomial(probs, num_samples=1, generator=rng)
|
||||
else:
|
||||
next_ids = torch.argmax(logits, dim=-1, keepdim=True)
|
||||
ids = torch.cat((ids, next_ids), dim=1)
|
||||
token = next_ids.item()
|
||||
yield token
|
||||
8
nanochat_moe/logo.svg
Normal file
@ -0,0 +1,8 @@
<svg width="400" height="400" xmlns="http://www.w3.org/2000/svg">
|
||||
<defs>
|
||||
<radialGradient id="g" cx="50%" cy="50%">
|
||||
<stop offset="0%" style="stop-color:#667eea;stop-opacity:1"></stop>
|
||||
<stop offset="100%" style="stop-color:#764ba2;stop-opacity:0.3"></stop>
|
||||
</radialGradient>
|
||||
</defs>
|
||||
<path d="M315.9110991546882,231.0582854123025 L228.97777478867204,207.7645713530756 M284.8528137423857,284.8528137423857 L221.21320343559643,221.21320343559643 M231.0582854123025,315.9110991546882 L207.7645713530756,228.97777478867204 M168.94171458769753,315.9110991546882 L192.2354286469244,228.97777478867204 M115.14718625761431,284.8528137423857 L178.78679656440357,221.21320343559643 M84.08890084531181,231.05828541230252 L171.02222521132796,207.76457135307564 M84.0889008453118,168.9417145876975 L171.02222521132796,192.2354286469244 M115.14718625761425,115.14718625761435 L178.78679656440357,178.78679656440357 M168.94171458769753,84.0889008453118 L192.2354286469244,171.02222521132796 M231.05828541230244,84.08890084531178 L207.7645713530756,171.02222521132794 M284.8528137423857,115.14718625761428 L221.21320343559643,178.78679656440357 M315.91109915468815,168.94171458769742 L228.97777478867204,192.23542864692436 " stroke="url(#g)" stroke-width="6" stroke-linecap="round" fill="none"></path><path d="M200,-12 L212,0 L200,12 L188,0 Z" transform="translate(0,200)" fill="#000"></path></svg>
|
||||
65
nanochat_moe/loss_eval.py
Normal file
@ -0,0 +1,65 @@
"""
|
||||
A number of functions that help with evaluating a base model.
|
||||
"""
|
||||
import math
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
|
||||
@torch.no_grad()
|
||||
def evaluate_bpb(model, batches, steps, token_bytes):
|
||||
"""
|
||||
Instead of the naive 'mean loss', this function returns the bits per byte (bpb),
|
||||
which is a tokenization vocab size-independent metric, meaning you are still comparing
|
||||
apples:apples if you change the vocab size. The way this works is that instead of just
|
||||
calculating the average loss as usual, you calculate the sum loss, and independently
|
||||
also the sum bytes (of all the target tokens), and divide. This normalizes the loss by
|
||||
the number of bytes that the target tokens represent.
|
||||
|
||||
The added complexity is so that:
|
||||
1) All "normal" tokens are normalized by the length of the token in bytes
|
||||
2) No special tokens (e.g. <|bos|>) are included in the metric - they are masked out.
|
||||
3) No actively masked tokens (using ignore_index of e.g. -1) are included in the metric.
|
||||
|
||||
In addition to evaluate_loss, we need the token_bytes tensor:
|
||||
It is a 1D tensor of shape (vocab_size,), indicating the number of bytes for
|
||||
each token id, or 0 if the token is to not be counted (e.g. special tokens).
|
||||
"""
|
||||
# record the losses
|
||||
total_nats = torch.tensor(0.0, dtype=torch.float32, device=model.get_device())
|
||||
total_bytes = torch.tensor(0, dtype=torch.int64, device=model.get_device())
|
||||
batch_iter = iter(batches)
|
||||
for _ in range(steps):
|
||||
x, y = next(batch_iter)
|
||||
loss2d = model(x, y, loss_reduction='none') # (B, T)
|
||||
loss2d = loss2d.view(-1) # flatten
|
||||
y = y.view(-1) # flatten
|
||||
if (y.int() < 0).any(): # mps does not currently have kernel for < 0 for int64, only int32
|
||||
# slightly more complex code path if some target tokens are ignore_index (e.g. -1)
|
||||
# any target token < 0 is to be ignored: do NOT index token_bytes with negatives
|
||||
valid = y >= 0
|
||||
y_safe = torch.where(valid, y, torch.zeros_like(y))
|
||||
# map valid targets to their byte length; ignored targets contribute 0 bytes
|
||||
num_bytes2d = torch.where(
|
||||
valid,
|
||||
token_bytes[y_safe],
|
||||
torch.zeros_like(y, dtype=token_bytes.dtype)
|
||||
)
|
||||
total_nats += (loss2d * (num_bytes2d > 0)).sum()
|
||||
total_bytes += num_bytes2d.sum()
|
||||
else:
|
||||
# fast path: no ignored targets, safe to index directly
|
||||
num_bytes2d = token_bytes[y]
|
||||
total_nats += (loss2d * (num_bytes2d > 0)).sum()
|
||||
total_bytes += num_bytes2d.sum()
|
||||
# sum reduce across all ranks
|
||||
world_size = dist.get_world_size() if dist.is_initialized() else 1
|
||||
if world_size > 1:
|
||||
dist.all_reduce(total_nats, op=dist.ReduceOp.SUM)
|
||||
dist.all_reduce(total_bytes, op=dist.ReduceOp.SUM)
|
||||
# move both to cpu, calculate bpb and return
|
||||
total_nats = total_nats.item()
|
||||
total_bytes = total_bytes.item()
|
||||
if total_bytes == 0:
|
||||
return float('inf')
|
||||
bpb = total_nats / (math.log(2) * total_bytes)
|
||||
return bpb
|
||||
35
nanochat_moe/manager.py
Normal file
@ -0,0 +1,35 @@
"""
|
||||
MoE Manager for tracking auxiliary losses across multiple MoE layers
|
||||
"""
|
||||
import torch
|
||||
|
||||
class MOEManager:
|
||||
"""
|
||||
Basic wrapper class for tracking, storing, and aggregating auxiliary
|
||||
losses across multiple MoE layers in the model
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.aux_loss = []
|
||||
self.router_z_loss = []
|
||||
|
||||
def reset_aux_loss(self):
|
||||
self.aux_loss = []
|
||||
|
||||
def reset_router_z_loss(self):
|
||||
self.router_z_loss = []
|
||||
|
||||
def add_aux_loss(self, loss):
|
||||
self.aux_loss.append(loss)
|
||||
|
||||
def add_router_z_loss(self, loss):
|
||||
self.router_z_loss.append(loss)
|
||||
|
||||
def aggregate_aux_loss(self):
|
||||
return sum(self.aux_loss) if self.aux_loss else torch.tensor(0.0)
|
||||
|
||||
def aggregate_router_z_loss(self):
|
||||
return sum(self.router_z_loss) if self.router_z_loss else torch.tensor(0.0)
|
||||
|
||||
MANAGER = MOEManager()
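# Usage sketch (this mirrors how gpt.py uses the singleton): each Router.forward() calls
# MANAGER.add_aux_loss(...) / MANAGER.add_router_z_loss(...); GPT.forward() then adds the
# weighted aggregate_*() results to the training loss and calls the matching reset_*()
# methods so the lists don't leak losses across training steps.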
|
||||
|
||||
187
nanochat_moe/muon.py
Normal file
@ -0,0 +1,187 @@
"""
|
||||
Muon optimizer from Keller et al.
|
||||
Also a lot of borrowing of ideas from modded-nanogpt.
|
||||
"""
|
||||
import torch
|
||||
from torch import Tensor
|
||||
import torch.distributed as dist
|
||||
|
||||
@torch.compile
|
||||
def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor:
|
||||
"""
|
||||
Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
|
||||
quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
|
||||
of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
|
||||
zero even beyond the point where the iteration no longer converges all the way to one everywhere
|
||||
on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
|
||||
where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
|
||||
performance at all relative to UV^T, where USV^T = G is the SVD.
|
||||
"""
|
||||
assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng
|
||||
a, b, c = (3.4445, -4.7750, 2.0315)
|
||||
X = G.bfloat16()
|
||||
if G.size(-2) > G.size(-1):
|
||||
X = X.mT
|
||||
|
||||
# Ensure spectral norm is at most 1
|
||||
X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7)
|
||||
# Perform the NS iterations
|
||||
for _ in range(steps):
|
||||
A = X @ X.mT
|
||||
B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng
|
||||
X = a * X + B @ X
|
||||
|
||||
if G.size(-2) > G.size(-1):
|
||||
X = X.mT
|
||||
return X
|
||||
|
||||
class Muon(torch.optim.Optimizer):
|
||||
"""
|
||||
Muon - MomentUm Orthogonalized by Newton-schulz
|
||||
|
||||
https://kellerjordan.github.io/posts/muon/
|
||||
|
||||
Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
|
||||
processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
|
||||
matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
|
||||
the advantage that it can be stably run in bfloat16 on the GPU.
|
||||
|
||||
Some warnings:
|
||||
- This optimizer should not be used for the embedding layer, the final fully connected layer,
|
||||
or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
|
||||
- To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
|
||||
|
||||
Arguments:
|
||||
lr: The learning rate used by the internal SGD.
|
||||
momentum: The momentum used by the internal SGD.
|
||||
nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended)
|
||||
ns_steps: The number of Newton-Schulz iteration steps to use.
|
||||
"""
|
||||
def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5):
|
||||
defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
|
||||
params: list[Tensor] = [*params]
|
||||
param_groups = []
|
||||
for size in {p.numel() for p in params}:
|
||||
group = dict(params=[p for p in params if p.numel() == size])
|
||||
param_groups.append(group)
|
||||
super().__init__(param_groups, defaults)
|
||||
|
||||
@torch.no_grad()
|
||||
def step(self):
|
||||
for group in self.param_groups:
|
||||
params: list[Tensor] = group["params"]
|
||||
for p in params:
|
||||
g = p.grad
|
||||
assert g is not None
|
||||
state = self.state[p]
|
||||
if "momentum_buffer" not in state:
|
||||
state["momentum_buffer"] = torch.zeros_like(g)
|
||||
buf: Tensor = state["momentum_buffer"]
|
||||
buf.lerp_(g, 1 - group["momentum"])
|
||||
g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf
|
||||
g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
|
||||
p.add_(g, alpha=-group["lr"] * max(1, p.size(-2) / p.size(-1))**0.5)
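# Usage sketch (matching GPT.setup_optimizers in gpt.py): the single-process Muon is built
# over the 2D transformer matrices only, e.g.
#
#   muon = Muon(matrix_params, lr=0.02, momentum=0.95)
#
# while embeddings, lm_head and the 3D MoE expert tensors are handled by AdamW instead.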
|
||||
|
||||
|
||||
class DistMuon(torch.optim.Optimizer):
|
||||
"""
|
||||
Muon: SGD-momentum + (optional) Nesterov, then orthogonalize the 2D update via Newton–Schulz,
|
||||
finally apply aspect-ratio scaled step. Performs its own distributed synchronization:
|
||||
- reduce_scatter(AVG) for gradient averaging
|
||||
- all_gather to replicate updated weights
|
||||
|
||||
Notes:
|
||||
* Designed for 2D parameters (e.g., linear/conv kernels reshaped to 2D). Do not use for 0D/1D
|
||||
params like embeddings or scalars.
|
||||
* Momentum buffers are maintained only on the 'owner' rank for each parameter (rank chosen
|
||||
by block-cyclic assignment below). If you checkpoint optimizer state on a single rank,
|
||||
consolidate states beforehand.
|
||||
|
||||
Args:
|
||||
params: iterable of Tensors
|
||||
lr: learning rate
|
||||
momentum: momentum coefficient in [0,1)
|
||||
nesterov: if True, Nesterov-style update (g <- lerp(g, buf, momentum)); else use buf
|
||||
ns_steps: number of Newton–Schulz iterations for the orthogonalization
|
||||
"""
|
||||
def __init__(self, params, lr: float = 0.02, momentum: float = 0.95,
|
||||
nesterov: bool = True, ns_steps: int = 5):
|
||||
defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps)
|
||||
params = list(params)
|
||||
assert all(p.ndim == 2 for p in params), "Muon expects 2D parameters only"
|
||||
rank = dist.get_rank()
|
||||
# Group all parameters by their shape
|
||||
shapes = sorted({p.shape for p in params}) # sort to ensure consistent / deterministic ordering
|
||||
param_groups = []
|
||||
for shape in shapes:
|
||||
group_params = [p for p in params if p.shape == shape]
|
||||
device, dtype = group_params[0].device, group_params[0].dtype
|
||||
assert all(p.device == device for p in group_params)
|
||||
assert all(p.dtype == dtype for p in group_params)
|
||||
if rank == 0:
|
||||
print(f"Muon: Grouping {len(group_params)} params of shape {shape}, device {device}, dtype {dtype}")
|
||||
param_groups.append(dict(params=group_params, zero_buffer=torch.zeros_like(group_params[0])))
|
||||
super().__init__(param_groups, defaults)
|
||||
|
||||
@torch.no_grad()
|
||||
def step(self):
|
||||
rank = dist.get_rank()
|
||||
world_size = dist.get_world_size()
|
||||
|
||||
# Ensure all grads exist
|
||||
assert all(p.grad is not None for group in self.param_groups for p in group["params"]), "All params must have grads"
|
||||
|
||||
# Kick off all the reduce scatter operations to average up the gradients across all ranks
|
||||
all_reduce_futures = []
|
||||
for group in self.param_groups:
|
||||
params = group["params"]
|
||||
zero_buffer = group["zero_buffer"]
|
||||
# Go through params in groups of world_size.
|
||||
for base_i in range(0, len(params), world_size):
|
||||
# The compute owner of each param is rank i % world_size
|
||||
owner_idx = base_i + rank
|
||||
# each rank stacks up its chunk of world_size params into a list
|
||||
rs_input = [p.grad for p in params[base_i:base_i + world_size]]
|
||||
# pad rs_input with the zero buffer to complete the group
|
||||
rs_input.extend([zero_buffer] * (world_size - len(rs_input)))
|
||||
# the output buffer gets strided across the group based on the rank
|
||||
rs_output = params[owner_idx].grad if owner_idx < len(params) else torch.empty_like(zero_buffer)
|
||||
# reduce scatter the gradients within this group of world_size params
|
||||
work = dist.reduce_scatter(rs_output, rs_input, op=dist.ReduceOp.AVG, async_op=True).get_future()
|
||||
all_reduce_futures.append(work)
|
||||
|
||||
# Now each rank computes the update and gathers
|
||||
future_idx = 0
|
||||
all_gather_futures = []
|
||||
for group in self.param_groups:
|
||||
params = group["params"]
|
||||
zero_buffer = group["zero_buffer"]
|
||||
# Go through params in groups of world_size.
|
||||
for base_i in range(0, len(params), world_size):
|
||||
# The compute owner of each param is rank i % world_size
|
||||
owner_idx = base_i + rank # calculate the index of the param that this rank owns
|
||||
# Wait for the reduce scatter to complete
|
||||
all_reduce_futures[future_idx].wait() # possibly later we could use wait_any polling instead
|
||||
future_idx += 1
|
||||
# Owner computes the Muon update, result is in its param
|
||||
if owner_idx < len(params):
|
||||
p = params[owner_idx]
|
||||
g = p.grad # now averaged across ranks
|
||||
state = self.state[p]
|
||||
if "momentum_buffer" not in state:
|
||||
state["momentum_buffer"] = torch.zeros_like(g)
|
||||
buf: Tensor = state["momentum_buffer"]
|
||||
buf.lerp_(g, 1.0 - group["momentum"])
|
||||
g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf
|
||||
g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"])
|
||||
scale = (max(1.0, p.size(-2) / p.size(-1)) ** 0.5)
|
||||
p.add_(g, alpha=-group["lr"] * scale)
|
||||
# Replicate updated parameters to all ranks
|
||||
ag_input = params[owner_idx] if owner_idx < len(params) else zero_buffer
|
||||
ag_output = params[base_i:base_i + world_size]
|
||||
ag_output.extend([torch.empty_like(zero_buffer) for _ in range(world_size - len(ag_output))]) # pad
|
||||
work = dist.all_gather(ag_output, ag_input, async_op=True).get_future()
|
||||
all_gather_futures.append(work)
|
||||
|
||||
# Wait for all work to finish
|
||||
torch.futures.collect_all(all_gather_futures).wait()
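
# ---------------------------------------------------------------------------
# Sketch (for illustration only) of the block-cyclic ownership used above:
# parameters are walked in groups of world_size, and within each group the
# parameter at offset `rank` is "owned" by that rank. The owner receives the
# averaged gradient via reduce_scatter, computes the Muon update, and then
# all_gather replicates the updated weight to every rank.
#
#   # with world_size = 4 and 6 params p0..p5:
#   #   group 0: p0 -> rank 0, p1 -> rank 1, p2 -> rank 2, p3 -> rank 3
#   #   group 1: p4 -> rank 0, p5 -> rank 1, ranks 2/3 pad with zero_buffer
#   for base_i in range(0, len(params), world_size):
#       owner_idx = base_i + rank   # index of the param this rank updates
# ---------------------------------------------------------------------------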
|
||||
408
nanochat_moe/report.py
Normal file
@ -0,0 +1,408 @@
"""
|
||||
Utilities for generating training report cards. More messy code than usual, will fix.
|
||||
"""
|
||||
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import socket
|
||||
import datetime
|
||||
import platform
|
||||
import psutil
|
||||
import torch
|
||||
|
||||
def run_command(cmd):
    """Run a shell command and return its output, or None if it fails."""
    try:
        result = subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=5)
        if result.returncode == 0:
            return result.stdout.strip()
        return None
    except Exception:
        return None
|
||||
|
||||
def get_git_info():
|
||||
"""Get current git commit, branch, and dirty status."""
|
||||
info = {}
|
||||
info['commit'] = run_command("git rev-parse --short HEAD") or "unknown"
|
||||
info['branch'] = run_command("git rev-parse --abbrev-ref HEAD") or "unknown"
|
||||
|
||||
# Check if repo is dirty (has uncommitted changes)
|
||||
status = run_command("git status --porcelain")
|
||||
info['dirty'] = bool(status) if status is not None else False
|
||||
|
||||
# Get commit message
|
||||
info['message'] = run_command("git log -1 --pretty=%B") or ""
|
||||
info['message'] = info['message'].split('\n')[0][:80] # First line, truncated
|
||||
|
||||
return info
|
||||
|
||||
def get_gpu_info():
|
||||
"""Get GPU information."""
|
||||
if not torch.cuda.is_available():
|
||||
return {"available": False}
|
||||
|
||||
num_devices = torch.cuda.device_count()
|
||||
info = {
|
||||
"available": True,
|
||||
"count": num_devices,
|
||||
"names": [],
|
||||
"memory_gb": []
|
||||
}
|
||||
|
||||
for i in range(num_devices):
|
||||
props = torch.cuda.get_device_properties(i)
|
||||
info["names"].append(props.name)
|
||||
info["memory_gb"].append(props.total_memory / (1024**3))
|
||||
|
||||
# Get CUDA version
|
||||
info["cuda_version"] = torch.version.cuda or "unknown"
|
||||
|
||||
return info
|
||||
|
||||
def get_system_info():
|
||||
"""Get system information."""
|
||||
info = {}
|
||||
|
||||
# Basic system info
|
||||
info['hostname'] = socket.gethostname()
|
||||
info['platform'] = platform.system()
|
||||
info['python_version'] = platform.python_version()
|
||||
info['torch_version'] = torch.__version__
|
||||
|
||||
# CPU and memory
|
||||
info['cpu_count'] = psutil.cpu_count(logical=False)
|
||||
info['cpu_count_logical'] = psutil.cpu_count(logical=True)
|
||||
info['memory_gb'] = psutil.virtual_memory().total / (1024**3)
|
||||
|
||||
# User and environment
|
||||
info['user'] = os.environ.get('USER', 'unknown')
|
||||
info['nanochat_base_dir'] = os.environ.get('NANOCHAT_BASE_DIR', 'out')
|
||||
info['working_dir'] = os.getcwd()
|
||||
|
||||
return info
|
||||
|
||||
def estimate_cost(gpu_info, runtime_hours=None):
|
||||
"""Estimate training cost based on GPU type and runtime."""
|
||||
|
||||
# Rough pricing, from Lambda Cloud
|
||||
default_rate = 2.0
|
||||
gpu_hourly_rates = {
|
||||
"H100": 3.00,
|
||||
"A100": 1.79,
|
||||
"V100": 0.55,
|
||||
}
|
||||
|
||||
if not gpu_info.get("available"):
|
||||
return None
|
||||
|
||||
# Try to identify GPU type from name
|
||||
hourly_rate = None
|
||||
gpu_name = gpu_info["names"][0] if gpu_info["names"] else "unknown"
|
||||
for gpu_type, rate in gpu_hourly_rates.items():
|
||||
if gpu_type in gpu_name:
|
||||
hourly_rate = rate * gpu_info["count"]
|
||||
break
|
||||
|
||||
if hourly_rate is None:
|
||||
hourly_rate = default_rate * gpu_info["count"] # Default estimate
|
||||
|
||||
return {
|
||||
"hourly_rate": hourly_rate,
|
||||
"gpu_type": gpu_name,
|
||||
"estimated_total": hourly_rate * runtime_hours if runtime_hours else None
|
||||
}
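
# Example (hypothetical numbers, for illustration only): with 8xH100 at the
# $3.00/GPU-hour rate above, estimate_cost reports hourly_rate = 8 * 3.00 = $24.00,
# and for a 4-hour run estimated_total = 24.00 * 4 = $96.00. Unknown GPU types
# fall back to the $2.00/GPU-hour default.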
|
||||
|
||||
def generate_header():
|
||||
"""Generate the header for a training report."""
|
||||
timestamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
|
||||
git_info = get_git_info()
|
||||
gpu_info = get_gpu_info()
|
||||
sys_info = get_system_info()
|
||||
cost_info = estimate_cost(gpu_info)
|
||||
|
||||
header = f"""# nanochat training report
|
||||
|
||||
Generated: {timestamp}
|
||||
|
||||
## Environment
|
||||
|
||||
### Git Information
|
||||
- Branch: {git_info['branch']}
|
||||
- Commit: {git_info['commit']} {"(dirty)" if git_info['dirty'] else "(clean)"}
|
||||
- Message: {git_info['message']}
|
||||
|
||||
### Hardware
|
||||
- Platform: {sys_info['platform']}
|
||||
- CPUs: {sys_info['cpu_count']} cores ({sys_info['cpu_count_logical']} logical)
|
||||
- Memory: {sys_info['memory_gb']:.1f} GB
|
||||
"""
|
||||
|
||||
if gpu_info.get("available"):
|
||||
gpu_names = ", ".join(set(gpu_info["names"]))
|
||||
total_vram = sum(gpu_info["memory_gb"])
|
||||
header += f"""- GPUs: {gpu_info['count']}x {gpu_names}
|
||||
- GPU Memory: {total_vram:.1f} GB total
|
||||
- CUDA Version: {gpu_info['cuda_version']}
|
||||
"""
|
||||
else:
|
||||
header += "- GPUs: None available\n"
|
||||
|
||||
if cost_info and cost_info["hourly_rate"] > 0:
|
||||
header += f"""- Hourly Rate: ${cost_info['hourly_rate']:.2f}/hour\n"""
|
||||
|
||||
header += f"""
|
||||
### Software
|
||||
- Python: {sys_info['python_version']}
|
||||
- PyTorch: {sys_info['torch_version']}
|
||||
|
||||
"""
|
||||
|
||||
    # bloat metrics: package all of the source code and assess its weight
    packaged = run_command('files-to-prompt . -e py -e md -e rs -e html -e toml -e sh --ignore "*target*" --cxml')
    packaged = packaged or ""  # run_command returns None if the tool is unavailable
    num_chars = len(packaged)
    num_lines = len(packaged.split('\n'))
    num_files = len([x for x in packaged.split('\n') if x.startswith('<source>')])
    num_tokens = num_chars // 4  # assume approximately 4 chars per token
|
||||
|
||||
# count dependencies via uv.lock
|
||||
uv_lock_lines = 0
|
||||
if os.path.exists('uv.lock'):
|
||||
with open('uv.lock', 'r', encoding='utf-8') as f:
|
||||
uv_lock_lines = len(f.readlines())
|
||||
|
||||
header += f"""
|
||||
### Bloat
|
||||
- Characters: {num_chars:,}
|
||||
- Lines: {num_lines:,}
|
||||
- Files: {num_files:,}
|
||||
- Tokens (approx): {num_tokens:,}
|
||||
- Dependencies (uv.lock lines): {uv_lock_lines:,}
|
||||
|
||||
"""
|
||||
return header
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
def slugify(text):
|
||||
"""Slugify a text string."""
|
||||
return text.lower().replace(" ", "-")
|
||||
|
||||
# the expected files and their order
|
||||
EXPECTED_FILES = [
|
||||
"tokenizer-training.md",
|
||||
"tokenizer-evaluation.md",
|
||||
"base-model-training.md",
|
||||
"base-model-loss.md",
|
||||
"base-model-evaluation.md",
|
||||
"midtraining.md",
|
||||
"chat-evaluation-mid.md",
|
||||
"chat-sft.md",
|
||||
"chat-evaluation-sft.md",
|
||||
"chat-rl.md",
|
||||
"chat-evaluation-rl.md",
|
||||
]
|
||||
# the metrics we're currently interested in
|
||||
chat_metrics = ["ARC-Easy", "ARC-Challenge", "MMLU", "GSM8K", "HumanEval", "ChatCORE"]
|
||||
|
||||
def extract(section, keys):
|
||||
"""simple def to extract a single key from a section"""
|
||||
if not isinstance(keys, list):
|
||||
keys = [keys] # convenience
|
||||
out = {}
|
||||
for line in section.split("\n"):
|
||||
for key in keys:
|
||||
if key in line:
|
||||
                out[key] = line.split(":", 1)[1].strip()
|
||||
return out
|
||||
|
||||
def extract_timestamp(content, prefix):
|
||||
"""Extract timestamp from content with given prefix."""
|
||||
for line in content.split('\n'):
|
||||
if line.startswith(prefix):
|
||||
time_str = line.split(":", 1)[1].strip()
|
||||
            try:
                return datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S")
            except ValueError:
                pass
|
||||
return None
|
||||
|
||||
class Report:
|
||||
"""Maintains a bunch of logs, generates a final markdown report."""
|
||||
|
||||
def __init__(self, report_dir):
|
||||
os.makedirs(report_dir, exist_ok=True)
|
||||
self.report_dir = report_dir
|
||||
|
||||
def log(self, section, data):
|
||||
"""Log a section of data to the report."""
|
||||
slug = slugify(section)
|
||||
file_name = f"{slug}.md"
|
||||
file_path = os.path.join(self.report_dir, file_name)
|
||||
with open(file_path, "w", encoding="utf-8") as f:
|
||||
f.write(f"## {section}\n")
|
||||
f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
|
||||
for item in data:
|
||||
if not item:
|
||||
# skip falsy values like None or empty dict etc.
|
||||
continue
|
||||
if isinstance(item, str):
|
||||
# directly write the string
|
||||
f.write(item)
|
||||
else:
|
||||
# render a dict
|
||||
for k, v in item.items():
|
||||
if isinstance(v, float):
|
||||
vstr = f"{v:.4f}"
|
||||
elif isinstance(v, int) and v >= 10000:
|
||||
vstr = f"{v:,.0f}"
|
||||
else:
|
||||
vstr = str(v)
|
||||
f.write(f"- {k}: {vstr}\n")
|
||||
f.write("\n")
|
||||
return file_path
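
    # Example usage (illustrative; the section name and metrics are made up):
    #
    #   report = Report("out/report")
    #   report.log(section="Base model training", data=[
    #       {"step": 21400, "train_loss": 2.31},
    #       "Free-form notes on the run go here as a raw string.\n",
    #   ])
    #
    # which writes out/report/base-model-training.md with a "## Base model training"
    # header, a timestamp line, the dict rendered as "- key: value" bullets, and the
    # raw string appended verbatim.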
|
||||
|
||||
def generate(self):
|
||||
"""Generate the final report."""
|
||||
report_dir = self.report_dir
|
||||
report_file = os.path.join(report_dir, "report.md")
|
||||
print(f"Generating report to {report_file}")
|
||||
final_metrics = {} # the most important final metrics we'll add as table at the end
|
||||
start_time = None
|
||||
end_time = None
|
||||
with open(report_file, "w", encoding="utf-8") as out_file:
|
||||
# write the header first
|
||||
header_file = os.path.join(report_dir, "header.md")
|
||||
if os.path.exists(header_file):
|
||||
with open(header_file, "r", encoding="utf-8") as f:
|
||||
header_content = f.read()
|
||||
out_file.write(header_content)
|
||||
start_time = extract_timestamp(header_content, "Run started:")
|
||||
# capture bloat data for summary later (the stuff after Bloat header and until \n\n)
|
||||
bloat_data = re.search(r"### Bloat\n(.*?)\n\n", header_content, re.DOTALL)
|
||||
bloat_data = bloat_data.group(1) if bloat_data else ""
|
||||
else:
|
||||
start_time = None # will cause us to not write the total wall clock time
|
||||
bloat_data = "[bloat data missing]"
|
||||
print(f"Warning: {header_file} does not exist. Did you forget to run `nanochat reset`?")
|
||||
# process all the individual sections
|
||||
for file_name in EXPECTED_FILES:
|
||||
section_file = os.path.join(report_dir, file_name)
|
||||
if not os.path.exists(section_file):
|
||||
print(f"Warning: {section_file} does not exist, skipping")
|
||||
continue
|
||||
with open(section_file, "r", encoding="utf-8") as in_file:
|
||||
section = in_file.read()
|
||||
# Extract timestamp from this section (the last section's timestamp will "stick" as end_time)
|
||||
if "rl" not in file_name:
|
||||
# Skip RL sections for end_time calculation because RL is experimental
|
||||
end_time = extract_timestamp(section, "timestamp:")
|
||||
# extract the most important metrics from the sections
|
||||
if file_name == "base-model-evaluation.md":
|
||||
final_metrics["base"] = extract(section, "CORE")
|
||||
if file_name == "chat-evaluation-mid.md":
|
||||
final_metrics["mid"] = extract(section, chat_metrics)
|
||||
if file_name == "chat-evaluation-sft.md":
|
||||
final_metrics["sft"] = extract(section, chat_metrics)
|
||||
if file_name == "chat-evaluation-rl.md":
|
||||
final_metrics["rl"] = extract(section, "GSM8K") # RL only evals GSM8K
|
||||
# append this section of the report
|
||||
out_file.write(section)
|
||||
out_file.write("\n")
|
||||
# add the final metrics table
|
||||
out_file.write("## Summary\n\n")
|
||||
# Copy over the bloat metrics from the header
|
||||
out_file.write(bloat_data)
|
||||
out_file.write("\n\n")
|
||||
# Collect all unique metric names
|
||||
all_metrics = set()
|
||||
for stage_metrics in final_metrics.values():
|
||||
all_metrics.update(stage_metrics.keys())
|
||||
# Custom ordering: CORE first, ChatCORE last, rest in middle
|
||||
all_metrics = sorted(all_metrics, key=lambda x: (x != "CORE", x == "ChatCORE", x))
|
||||
# Fixed column widths
|
||||
stages = ["base", "mid", "sft", "rl"]
|
||||
metric_width = 15
|
||||
value_width = 8
|
||||
# Write table header
|
||||
header = f"| {'Metric'.ljust(metric_width)} |"
|
||||
for stage in stages:
|
||||
header += f" {stage.upper().ljust(value_width)} |"
|
||||
out_file.write(header + "\n")
|
||||
# Write separator
|
||||
separator = f"|{'-' * (metric_width + 2)}|"
|
||||
for stage in stages:
|
||||
separator += f"{'-' * (value_width + 2)}|"
|
||||
out_file.write(separator + "\n")
|
||||
# Write table rows
|
||||
for metric in all_metrics:
|
||||
row = f"| {metric.ljust(metric_width)} |"
|
||||
for stage in stages:
|
||||
value = final_metrics.get(stage, {}).get(metric, "-")
|
||||
row += f" {str(value).ljust(value_width)} |"
|
||||
out_file.write(row + "\n")
|
||||
out_file.write("\n")
|
||||
# Calculate and write total wall clock time
|
||||
if start_time and end_time:
|
||||
duration = end_time - start_time
|
||||
total_seconds = int(duration.total_seconds())
|
||||
hours = total_seconds // 3600
|
||||
minutes = (total_seconds % 3600) // 60
|
||||
out_file.write(f"Total wall clock time: {hours}h{minutes}m\n")
|
||||
else:
|
||||
out_file.write("Total wall clock time: unknown\n")
|
||||
# also cp the report.md file to current directory
|
||||
print(f"Copying report.md to current directory for convenience")
|
||||
shutil.copy(report_file, "report.md")
|
||||
return report_file
|
||||
|
||||
def reset(self):
|
||||
"""Reset the report."""
|
||||
# Remove section files
|
||||
for file_name in EXPECTED_FILES:
|
||||
file_path = os.path.join(self.report_dir, file_name)
|
||||
if os.path.exists(file_path):
|
||||
os.remove(file_path)
|
||||
# Remove report.md if it exists
|
||||
report_file = os.path.join(self.report_dir, "report.md")
|
||||
if os.path.exists(report_file):
|
||||
os.remove(report_file)
|
||||
# Generate and write the header section with start timestamp
|
||||
header_file = os.path.join(self.report_dir, "header.md")
|
||||
header = generate_header()
|
||||
start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||
with open(header_file, "w", encoding="utf-8") as f:
|
||||
f.write(header)
|
||||
f.write(f"Run started: {start_time}\n\n---\n\n")
|
||||
print(f"Reset report and wrote header to {header_file}")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# nanochat-specific convenience functions
|
||||
|
||||
class DummyReport:
|
||||
def log(self, *args, **kwargs):
|
||||
pass
|
||||
def reset(self, *args, **kwargs):
|
||||
pass
|
||||
|
||||
def get_report():
|
||||
# just for convenience, only rank 0 logs to report
|
||||
from nanochat_moe.common import get_base_dir, get_dist_info
|
||||
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
|
||||
if ddp_rank == 0:
|
||||
report_dir = os.path.join(get_base_dir(), "report")
|
||||
return Report(report_dir)
|
||||
else:
|
||||
return DummyReport()
|
||||
|
||||
if __name__ == "__main__":
|
||||
import argparse
|
||||
parser = argparse.ArgumentParser(description="Generate or reset nanochat training reports.")
|
||||
parser.add_argument("command", nargs="?", default="generate", choices=["generate", "reset"], help="Operation to perform (default: generate)")
|
||||
args = parser.parse_args()
|
||||
if args.command == "generate":
|
||||
get_report().generate()
|
||||
elif args.command == "reset":
|
||||
get_report().reset()
|
||||
661
nanochat_moe/standard.py
Normal file
@ -0,0 +1,661 @@
"""
|
||||
Full definition of a GPT Language Model, all of it in this single file.
|
||||
References:
|
||||
1) the official GPT-2 TensorFlow implementation released by OpenAI:
|
||||
https://github.com/openai/gpt-2/blob/master/src/model.py
|
||||
2) huggingface/transformers PyTorch implementation:
|
||||
https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt2/modeling_gpt2.py
|
||||
"""
|
||||
|
||||
import math
|
||||
import inspect
|
||||
from dataclasses import dataclass
|
||||
from contextlib import nullcontext
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import functional as F
|
||||
|
||||
from manager import MANAGER
|
||||
|
||||
class LayerNorm(nn.Module):
|
||||
""" LayerNorm but with an optional bias. PyTorch doesn't support simply bias=False """
|
||||
|
||||
def __init__(self, ndim, bias):
|
||||
super().__init__()
|
||||
self.weight = nn.Parameter(torch.ones(ndim))
|
||||
self.bias = nn.Parameter(torch.zeros(ndim)) if bias else None
|
||||
|
||||
def forward(self, input):
|
||||
return F.layer_norm(input, self.weight.shape, self.weight, self.bias, 1e-5)
|
||||
|
||||
class CausalSelfAttention(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
assert config.n_embd % config.n_head == 0
|
||||
# key, query, value projections for all heads, but in a batch
|
||||
self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
|
||||
# output projection
|
||||
self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
|
||||
# regularization
|
||||
self.attn_dropout = nn.Dropout(config.dropout)
|
||||
self.resid_dropout = nn.Dropout(config.dropout)
|
||||
self.n_head = config.n_head
|
||||
self.n_embd = config.n_embd
|
||||
self.dropout = config.dropout
|
||||
# flash attention make GPU go brrrrr but support is only in PyTorch >= 2.0
|
||||
self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
|
||||
if not self.flash:
|
||||
print("WARNING: using slow attention. Flash Attention requires PyTorch >= 2.0")
|
||||
# causal mask to ensure that attention is only applied to the left in the input sequence
|
||||
self.register_buffer("bias", torch.tril(torch.ones(config.block_size, config.block_size))
|
||||
.view(1, 1, config.block_size, config.block_size))
|
||||
|
||||
def forward(self, x):
|
||||
B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd)
|
||||
|
||||
# calculate query, key, values for all heads in batch and move head forward to be the batch dim
|
||||
q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
|
||||
k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2) # (B, nh, T, hs)
|
||||
|
||||
# causal self-attention; Self-attend: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
|
||||
if self.flash:
|
||||
# efficient attention using Flash Attention CUDA kernels
|
||||
y = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=self.dropout if self.training else 0, is_causal=True)
|
||||
else:
|
||||
# manual implementation of attention
|
||||
att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
|
||||
att = att.masked_fill(self.bias[:,:,:T,:T] == 0, float('-inf'))
|
||||
att = F.softmax(att, dim=-1)
|
||||
att = self.attn_dropout(att)
|
||||
y = att @ v # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
|
||||
y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side
|
||||
|
||||
# output projection
|
||||
y = self.resid_dropout(self.c_proj(y))
|
||||
return y
|
||||
|
||||
class Router(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
|
||||
# router settings
|
||||
self.top_k = config.top_k
|
||||
self.n_exp = config.n_exp
|
||||
        assert 1 <= self.top_k <= config.n_exp
|
||||
self.use_noisy_top_k = config.use_noisy_top_k
|
||||
self.train_capacity = config.train_capacity
|
||||
self.eval_capacity = config.eval_capacity
|
||||
self.min_capacity = config.min_capacity
|
||||
self.router_use_full_prec = config.router_use_full_prec
|
||||
|
||||
# auxiliary / load balancing loss settings
|
||||
self.use_aux_loss = config.use_aux_loss
|
||||
self.use_router_z_loss = config.use_router_z_loss
|
||||
|
||||
# linear projection for (noisy) softmax gating
|
||||
# no bias is used, see page 4 eq (4) in (https://arxiv.org/abs/1701.06538)
|
||||
self.w_g = nn.Linear(config.n_embd, config.n_exp, bias=False)
|
||||
self.w_noise = nn.Linear(config.n_embd, config.n_exp, bias=False) if self.use_noisy_top_k else None
|
||||
|
||||
def forward(self, x):
|
||||
# optionally run the router in full precision to avoid instability during training
|
||||
# see discussion on pg. 9 here: https://arxiv.org/abs/2101.03961
|
||||
# setting enabled to False in autocast automatically puts everything in float32
|
||||
device_type = 'cuda' if torch.cuda.is_available() else 'cpu' # for later use in torch.autocast
|
||||
ctx = nullcontext() if not self.router_use_full_prec else torch.amp.autocast(device_type=device_type, enabled=False)
|
||||
|
||||
with ctx:
|
||||
B, T, _ = x.size()
|
||||
num_tokens = B * T
|
||||
|
||||
# eq (4) in (https://arxiv.org/abs/1701.06538)
|
||||
logits = self.w_g(x) # [B, T, n_exp]
|
||||
if self.use_noisy_top_k:
|
||||
# optionally add noise into the router
|
||||
noise = F.softplus(self.w_noise(x))
|
||||
noise *= torch.randn_like(noise)
|
||||
logits += noise
|
||||
|
||||
# router z loss, computed on logits (before softmax)
|
||||
# this loss prevents router logits from becoming too large
|
||||
if self.use_router_z_loss:
|
||||
z_loss = self.compute_router_z_loss(logits)
|
||||
MANAGER.add_router_z_loss(z_loss)
|
||||
|
||||
# find top k experts for each token
|
||||
top_k_logits, top_k_indices = logits.topk(self.top_k, dim=-1) # [B, T, k]
|
||||
|
||||
# normalize expert probabilities
|
||||
# Question: should we normalize over all experts or just top-k?
|
||||
# we choose to normalize over top-k, other option is commented out below
|
||||
|
||||
# Shazeer et al (https://arxiv.org/abs/1701.06538) does only topk
|
||||
# see page 4 eq (3)-(5), the code for this is commented out below
|
||||
router_probs = torch.full_like(logits, float('-inf')) # [B, T, n_exp]
|
||||
router_probs.scatter_(-1, top_k_indices, top_k_logits)
|
||||
router_probs = F.softmax(router_probs, dim=-1)
|
||||
|
||||
# # normalize all router logits (not just top-k) via softmax
|
||||
# router_probs = F.softmax(logits, dim=-1)
|
||||
|
||||
# compute auxiliary load balancing loss
|
||||
# this loss encourages equal probability assigned to each expert
|
||||
# and equal load balancing of tokens assigned to each expert
|
||||
if self.use_aux_loss:
|
||||
aux_loss = self.compute_aux_loss(router_probs, top_k_indices)
|
||||
MANAGER.add_aux_loss(aux_loss)
|
||||
|
||||
# compute expert capacity
|
||||
exp_capacity = self.get_capacity(num_tokens)
|
||||
|
||||
# make a multi-hot mask of chosen experts, size [B, T, n_exp]
|
||||
# entries are 0 if expert not chosen and 1 if expert chosen
|
||||
exp_mask = F.one_hot(top_k_indices, num_classes=self.n_exp) # [B, T, k, n_exp]
|
||||
exp_mask = exp_mask.view(num_tokens, self.top_k, self.n_exp) # [B * T, k, n_exp]
|
||||
exp_mask = exp_mask.permute(1, 0, 2) # [k, B * T, n_exp]
|
||||
|
||||
# compute cumulative sum of each token over experts, this stores
|
||||
# the index of each token within the batch of each expert
|
||||
# NOTE: cumsum should count all top-1 first, top-2 second, etc.
|
||||
# so that we prioritize top experts when dropping tokens (this is
|
||||
# done by putting k dimension first for the reshape operation)
|
||||
exp_rank = exp_mask.reshape(self.top_k * num_tokens, self.n_exp) # [k * B * T, n_exp]
|
||||
exp_rank = torch.cumsum(exp_rank, dim=0) - 1 # cumulative sum of expert selections [k * B * T, n_exp]
|
||||
exp_rank = exp_rank.reshape(self.top_k, num_tokens, self.n_exp) # [k, B * T, n_exp]
|
||||
|
||||
# mask out (set to zero) entries that go beyond expert capacity
|
||||
# compute amount of used capacity by taking a sum over mask
|
||||
exp_mask *= torch.lt(exp_rank, exp_capacity) # [k, B * T, n_exp]
|
||||
used_capacity = torch.sum(exp_mask, dim=(0, 1)) # [n_exp]
|
||||
|
||||
# mask rank to only include tokens that are selected
|
||||
# perform a sum so each row only contains index of token
|
||||
# for the expert that is selected in that row
|
||||
# result is a matrix that contains the position of each token
|
||||
# in the batch of its corresponding expert
|
||||
exp_rank = torch.sum(exp_mask * exp_rank, dim=-1) # [k, B * T]
|
||||
|
||||
# mask probabilities to only include selected experts
|
||||
router_probs = router_probs.view(num_tokens, self.n_exp)[None, :] # [1, B * T, n_exp]
|
||||
exp_weights = exp_mask * router_probs # [k, B * T, n_exp]
|
||||
|
||||
# convert rank into one-hot vectors over the available capacity
|
||||
# stores the position of each token within the capacity of the selected expert
|
||||
exp_rank_sc = F.one_hot(exp_rank, num_classes=exp_capacity) # [k, B * T, exp_capacity]
|
||||
|
||||
# create a vector that stores, for each token, the weight of selected
|
||||
# experts at token's position in the capacity of that expert
|
||||
# size of tensor is [B * T, n_exp, exp_capacity]
|
||||
cb_weight = torch.sum(exp_weights.unsqueeze(3) * exp_rank_sc.unsqueeze(2), dim=0)
|
||||
sec_mask = cb_weight.bool() # binary mask of selected experts for each token
|
||||
return used_capacity, cb_weight, sec_mask
|
||||
|
||||
def compute_aux_loss(self, expert_probs: torch.Tensor, indices: torch.Tensor):
|
||||
"""
|
||||
Computes Switch Transformer auxiliary loss (https://arxiv.org/abs/2101.03961)
|
||||
See equations (4)-(6) on page 7
|
||||
"""
|
||||
|
||||
# equation (5): compute ratio of tokens allocated to each expert
|
||||
# total number of tokens is defined as total tokens in batch * k
|
||||
# (k = 1) for the Switch Transformer
|
||||
with torch.no_grad():
|
||||
one_hot_indices = F.one_hot(indices, num_classes=self.n_exp) # [B, T, k, n_exp]
|
||||
one_hot_indices = torch.sum(one_hot_indices.float(), dim=2) # [B, T, n_exp] (sum over k dimension)
|
||||
tokens_per_expert = torch.mean(one_hot_indices.float(), dim=(0, 1))
|
||||
|
||||
# equation (6): compute ratio of router probability allocated to each expert
|
||||
prob_per_expert = torch.mean(expert_probs.float(), dim=(0, 1))
|
||||
|
||||
# equation (4): take a scaled dot product between prob/token allocation vectors
|
||||
# multiply the result by the number of experts
|
||||
return self.n_exp * torch.sum(prob_per_expert * tokens_per_expert)
|
||||
|
||||
def compute_router_z_loss(self, logits: torch.Tensor):
|
||||
"""
|
||||
Computes ST-MoE router z loss (https://arxiv.org/abs/2202.08906)
|
||||
See equation (5) on page 7
|
||||
"""
|
||||
|
||||
# exponentiate logits, sum logits of each expert, take log, and square
|
||||
# code below is the same as:
|
||||
# > z_loss = torch.exp(logits)
|
||||
# > z_loss = torch.sum(z_loss, dim=-1)
|
||||
# > z_loss = torch.log(z_loss) ** 2.0
|
||||
        z_loss = torch.logsumexp(logits, dim=-1) ** 2.0  # [B, T]
|
||||
|
||||
# sum over all tokens and divide by total number of tokens
|
||||
return torch.mean(z_loss)
|
||||
|
||||
def get_capacity(self, tokens_per_batch):
|
||||
# expert capacity is given by (tokens_per_batch / num_experts) * capacity_factor
|
||||
# see eq (3) in Switch Transformer (https://arxiv.org/abs/2101.03961)
|
||||
capacity_factor = self.train_capacity if self.training else self.eval_capacity
|
||||
capacity = math.floor(self.top_k * capacity_factor * tokens_per_batch / self.n_exp)
|
||||
capacity += capacity % 2 # make sure capacity is an even number
|
||||
capacity = max(capacity, self.min_capacity) # use min capacity
|
||||
assert capacity > 0
|
||||
return int(capacity)
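
    # Worked example (illustrative numbers): with top_k=2, n_exp=8,
    # train_capacity=1.25 and a batch of B*T=8192 tokens,
    #   capacity = floor(2 * 1.25 * 8192 / 8) = 2560
    # which is already even and above min_capacity, so during training each
    # expert processes at most 2560 tokens per forward pass; tokens routed
    # beyond that are dropped by the capacity mask.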
|
||||
|
||||
class MLP(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.c_fc = nn.Linear(config.n_embd, 4 * config.n_embd, bias=config.bias)
|
||||
self.gelu = nn.GELU()
|
||||
self.c_proj = nn.Linear(4 * config.n_embd, config.n_embd, bias=config.bias)
|
||||
self.dropout = nn.Dropout(config.dropout)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.c_fc(x)
|
||||
x = self.gelu(x)
|
||||
x = self.c_proj(x)
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
class MLPExperts(nn.Module):
|
||||
"""
|
||||
implementation of multiple MLP-based experts that can process input
|
||||
in batch -- based upon ColossalAI OpenMoE but simple, has optional bias, and
|
||||
uses a bmm instead of a loop over a mm for each expert to improve efficiency
|
||||
link: https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/moe/experts.py
|
||||
"""
|
||||
def __init__(self, config):
|
||||
# TODO: add param init
|
||||
super().__init__()
|
||||
self.bias = config.bias
|
||||
|
||||
self.c_fc = nn.Parameter(torch.empty(config.n_exp, config.n_embd, 4 * config.n_embd))
|
||||
self.c_proj = nn.Parameter(torch.empty(config.n_exp, 4 * config.n_embd, config.n_embd))
|
||||
self.fc_bias = nn.Parameter(torch.empty(config.n_exp, 1, 4 * config.n_embd)) if self.bias else None
|
||||
self.proj_bias = nn.Parameter(torch.empty(config.n_exp, 1, config.n_embd)) if self.bias else None
|
||||
self.gelu = nn.GELU()
|
||||
self.dropout = nn.Dropout(config.dropout)
|
||||
|
||||
|
||||
def forward(self, x):
|
||||
x = torch.bmm(x, self.c_fc)
|
||||
if self.bias:
|
||||
x += self.fc_bias
|
||||
x = self.gelu(x)
|
||||
x = torch.bmm(x, self.c_proj)
|
||||
if self.bias:
|
||||
x += self.proj_bias
|
||||
x = self.dropout(x)
|
||||
return x
|
||||
|
||||
class MOELayer(nn.Module):
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
self.router = Router(config) # (noisy) top k router
|
||||
self.experts = MLPExperts(config) # group of MLPs (experts)
|
||||
|
||||
def forward(self, x: torch.Tensor):
|
||||
B, T, n_embd = x.size() # track original shape of input
|
||||
num_tokens = (B * T)
|
||||
|
||||
# pass each token through the router
|
||||
used_capacity, exp_weight, exp_mask = self.router(x)
|
||||
|
||||
# flatten out the input
|
||||
x = x.view(num_tokens, n_embd)
|
||||
|
||||
# reshape tokens into batches for each expert
|
||||
# [n_exp, exp_capacity, B * T] * [B * T, n_embd] -> [n_exp, exp_capacity, n_embd]
|
||||
exp_batches = exp_mask.permute(1, 2, 0).type_as(x) @ x
|
||||
|
||||
# compute expert output
|
||||
exp_out = self.experts(exp_batches) # [n_exp, exp_capacity, n_embd]
|
||||
|
||||
# aggregate expert outputs based on router weights
|
||||
# eq (2) on page 4 of ST-MoE (https://arxiv.org/abs/2202.08906)
|
||||
# similar equations are used for other MoE papers
|
||||
exp_weight = exp_weight.view(num_tokens, -1) # [B * T, n_exp * exp_capacity]
|
||||
exp_out = exp_out.view(-1, n_embd) # [n_exp * exp_capacity, n_embd]
|
||||
output = exp_weight @ exp_out # [B * T, n_embd]
|
||||
|
||||
# resize output before return
|
||||
return output.view(B, T, n_embd)
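
    # Shape walkthrough (illustrative sketch): with B=2, T=4, n_embd=8, n_exp=4
    # and exp_capacity=4, the router returns exp_weight/exp_mask of shape
    # [B*T, n_exp, exp_capacity] = [8, 4, 4]. The first matmul scatters tokens
    # into per-expert batches of shape [4, 4, 8], the experts map them back to
    # [4, 4, 8], and the final exp_weight @ exp_out recombines expert outputs
    # per token into [8, 8], which is reshaped to [2, 4, 8].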
|
||||
|
||||
class Block(nn.Module):
|
||||
|
||||
def __init__(self, config, use_moe=False):
|
||||
super().__init__()
|
||||
self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
|
||||
self.attn = CausalSelfAttention(config)
|
||||
self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
|
||||
if use_moe:
|
||||
self.mlp = MOELayer(config)
|
||||
else:
|
||||
self.mlp = MLP(config)
|
||||
|
||||
def forward(self, x):
|
||||
x = x + self.attn(self.ln_1(x))
|
||||
x = x + self.mlp(self.ln_2(x))
|
||||
return x
|
||||
|
||||
@dataclass
|
||||
class GPTConfig:
|
||||
block_size: int = 1024
|
||||
vocab_size: int = 50304 # GPT-2 vocab_size of 50257, padded up to nearest multiple of 64 for efficiency
|
||||
n_layer: int = 12
|
||||
n_head: int = 12
|
||||
n_embd: int = 768
|
||||
dropout: float = 0.0
|
||||
bias: bool = True # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster
|
||||
|
||||
# MoE-related configs
|
||||
n_exp: int = 1 # if n_exp = 1 we just use regular MLP layers
|
||||
top_k: int = 2
|
||||
use_aux_loss: bool = False # apply auxiliary loss (from Switch Transformer) in router
|
||||
use_router_z_loss: bool = False # apply router z loss (from ST-MoE)
|
||||
use_noisy_top_k: bool = False
|
||||
aux_loss_weight: float = 0.01 # default setting from Switch Transformer (see top of page 8)
|
||||
router_z_loss_weight: float = 0.001 # default setting from ST-MoE (see page 8 eq. 6)
|
||||
train_capacity: float = 1.25 # default setting from ST-MoE (see top of page 6)
|
||||
eval_capacity: float = 2.0
|
||||
min_capacity: int = 4 # minimum batch size to send to any single expert
|
||||
stride: int = 2 # one in every stride layers are converted to an MoE
|
||||
use_switch_tfm_init: bool = False # use weight init scheme from Switch Transformer
|
||||
switch_tfm_init_scale: float = 1.0
|
||||
router_use_full_prec: bool = False # use float32 precision in the router
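
# Example (illustrative hyperparameters, not a recommended setting): a small MoE
# variant of the config above, with 8 experts, top-2 routing, an MoE layer every
# other block, and both auxiliary losses enabled.
#
#   config = GPTConfig(
#       n_layer=12, n_head=12, n_embd=768,
#       n_exp=8, top_k=2, stride=2,
#       use_aux_loss=True, use_router_z_loss=True,
#       use_noisy_top_k=True, router_use_full_prec=True,
#   )
#   model = GPT(config)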
|
||||
|
||||
|
||||
class GPT(nn.Module):
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__()
|
||||
assert config.vocab_size is not None
|
||||
assert config.block_size is not None
|
||||
self.config = config
|
||||
|
||||
if config.n_exp == 1:
|
||||
# create normal transformer blocks
|
||||
blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
|
||||
else:
|
||||
# create transformer blocks, placing an MoE block every <stride> layers
|
||||
blocks = []
|
||||
for i in range(config.n_layer):
|
||||
# TODO: how to implement this?
|
||||
# should we change below to i + 1 ?
|
||||
use_moe = (i % config.stride) == 0
|
||||
blocks.append(Block(config, use_moe=use_moe))
|
||||
blocks = nn.ModuleList(blocks)
|
||||
|
||||
self.transformer = nn.ModuleDict(dict(
|
||||
wte = nn.Embedding(config.vocab_size, config.n_embd),
|
||||
wpe = nn.Embedding(config.block_size, config.n_embd),
|
||||
drop = nn.Dropout(config.dropout),
|
||||
h = blocks,
|
||||
ln_f = LayerNorm(config.n_embd, bias=config.bias),
|
||||
))
|
||||
self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
|
||||
# with weight tying when using torch.compile() some warnings get generated:
|
||||
# "UserWarning: functional_call was passed multiple values for tied weights.
|
||||
# This behavior is deprecated and will be an error in future versions"
|
||||
# not 100% sure what this is, so far seems to be harmless. TODO investigate
|
||||
self.transformer.wte.weight = self.lm_head.weight # https://paperswithcode.com/method/weight-tying
|
||||
|
||||
# init all weights
|
||||
        # optionally use the Switch Transformer special init scheme for experts
|
||||
# See pg. 10 here: https://arxiv.org/abs/2101.03961
|
||||
self.apply(self._init_weights)
|
||||
# apply special scaled init to the residual projections, per GPT-2 paper
|
||||
for pn, p in self.named_parameters():
|
||||
if pn.endswith('c_proj.weight') or pn.endswith('experts.c_proj'):
|
||||
torch.nn.init.normal_(p, mean=0.0, std=0.02/math.sqrt(2 * config.n_layer))
|
||||
|
||||
# report number of parameters
|
||||
print("number of parameters: %.2fM" % (self.get_num_params()/1e6,))
|
||||
|
||||
def get_num_params(self, non_embedding=True):
|
||||
"""
|
||||
Return the number of parameters in the model.
|
||||
For non-embedding count (default), the position embeddings get subtracted.
|
||||
The token embeddings would too, except due to the parameter sharing these
|
||||
params are actually used as weights in the final layer, so we include them.
|
||||
"""
|
||||
n_params = sum(p.numel() for p in self.parameters())
|
||||
if non_embedding:
|
||||
n_params -= self.transformer.wpe.weight.numel()
|
||||
return n_params
|
||||
|
||||
@torch.no_grad()
|
||||
def _init_weights(self, module):
|
||||
# optionally use switch transformer-style initialization
|
||||
# see page 10 for switch init explanation: https://arxiv.org/abs/2101.03961
|
||||
if isinstance(module, nn.Linear):
|
||||
if self.config.use_switch_tfm_init:
|
||||
scale = self.config.switch_tfm_init_scale
|
||||
|
||||
# linear layers have flipped dimensions in torch
|
||||
# size of weights is [out_dim, in_dim]
|
||||
w_fan_in = module.weight.shape[-1]
|
||||
w_std = (scale / w_fan_in) ** 0.5
|
||||
torch.nn.init.trunc_normal_(
|
||||
module.weight,
|
||||
mean=0.0,
|
||||
std=w_std,
|
||||
a=-2*w_std,
|
||||
b=2*w_std,
|
||||
)
|
||||
else:
|
||||
# perform standard (normal) initialization of weights
|
||||
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
||||
|
||||
# always initialize bias to zero
|
||||
if module.bias is not None:
|
||||
torch.nn.init.zeros_(module.bias)
|
||||
elif isinstance(module, MLPExperts):
|
||||
# we have to init expert weights manually because
|
||||
# nn.Parameter is not a type of module in torch
|
||||
if self.config.use_switch_tfm_init:
|
||||
scale = self.config.switch_tfm_init_scale
|
||||
|
||||
c_fc_fan_in = module.c_fc.shape[-2]
|
||||
c_fc_std = (scale / c_fc_fan_in) ** 0.5
|
||||
torch.nn.init.trunc_normal_(
|
||||
module.c_fc,
|
||||
mean=0.0,
|
||||
std=c_fc_std,
|
||||
a=-2*c_fc_std,
|
||||
b=2*c_fc_std,
|
||||
)
|
||||
|
||||
c_proj_fan_in = module.c_proj.shape[-2]
|
||||
c_proj_std = (scale / c_proj_fan_in) ** 0.5
|
||||
torch.nn.init.trunc_normal_(
|
||||
module.c_proj,
|
||||
mean=0.0,
|
||||
std=c_proj_std,
|
||||
a=-2*c_proj_std,
|
||||
b=2*c_proj_std,
|
||||
)
|
||||
else:
|
||||
# perform standard (normal) initialization of weights
|
||||
torch.nn.init.normal_(module.c_fc, mean=0.0, std=0.02)
|
||||
torch.nn.init.normal_(module.c_proj, mean=0.0, std=0.02)
|
||||
|
||||
# bias is always initialized to zero
|
||||
if module.fc_bias is not None:
|
||||
torch.nn.init.zeros_(module.fc_bias)
|
||||
torch.nn.init.zeros_(module.proj_bias)
|
||||
elif isinstance(module, nn.Embedding):
|
||||
# just use standard initialization scheme for embedding always
|
||||
torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
|
||||
|
||||
def forward(self, idx, targets=None):
|
||||
device = idx.device
|
||||
b, t = idx.size()
|
||||
assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}"
|
||||
pos = torch.arange(0, t, dtype=torch.long, device=device) # shape (t)
|
||||
|
||||
# forward the GPT model itself
|
||||
tok_emb = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd)
|
||||
pos_emb = self.transformer.wpe(pos) # position embeddings of shape (t, n_embd)
|
||||
x = self.transformer.drop(tok_emb + pos_emb)
|
||||
for block in self.transformer.h:
|
||||
x = block(x)
|
||||
x = self.transformer.ln_f(x)
|
||||
|
||||
if targets is not None:
|
||||
# if we are given some desired targets also calculate the loss
|
||||
logits = self.lm_head(x)
|
||||
loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1)
|
||||
|
||||
# add the auxiliary load balancing loss and router z loss to the main loss
|
||||
if self.config.n_exp > 1 and self.config.use_aux_loss:
|
||||
loss += self.config.aux_loss_weight * MANAGER.aggregate_aux_loss()
|
||||
MANAGER.reset_aux_loss()
|
||||
if self.config.n_exp > 1 and self.config.use_router_z_loss:
|
||||
loss += self.config.router_z_loss_weight * MANAGER.aggregate_router_z_loss()
|
||||
MANAGER.reset_router_z_loss()
|
||||
else:
|
||||
# inference-time mini-optimization: only forward the lm_head on the very last position
|
||||
logits = self.lm_head(x[:, [-1], :]) # note: using list [-1] to preserve the time dim
|
||||
loss = None
|
||||
|
||||
return logits, loss
|
||||
|
||||
def crop_block_size(self, block_size):
|
||||
# model surgery to decrease the block size if necessary
|
||||
# e.g. we may load the GPT2 pretrained model checkpoint (block size 1024)
|
||||
# but want to use a smaller block size for some smaller, simpler model
|
||||
assert block_size <= self.config.block_size
|
||||
self.config.block_size = block_size
|
||||
self.transformer.wpe.weight = nn.Parameter(self.transformer.wpe.weight[:block_size])
|
||||
for block in self.transformer.h:
|
||||
if hasattr(block.attn, 'bias'):
|
||||
block.attn.bias = block.attn.bias[:,:,:block_size,:block_size]
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, model_type, override_args=None):
|
||||
        assert 'moe' not in model_type, "Pretrained checkpoints not available for MoE"
|
||||
assert model_type in {'gpt2', 'gpt2-medium', 'gpt2-large', 'gpt2-xl'}
|
||||
override_args = override_args or {} # default to empty dict
|
||||
# only dropout can be overridden see more notes below
|
||||
assert all(k == 'dropout' for k in override_args)
|
||||
from transformers import GPT2LMHeadModel
|
||||
print("loading weights from pretrained gpt: %s" % model_type)
|
||||
|
||||
# n_layer, n_head and n_embd are determined from model_type
|
||||
config_args = {
|
||||
'gpt2': dict(n_layer=12, n_head=12, n_embd=768), # 124M params
|
||||
'gpt2-medium': dict(n_layer=24, n_head=16, n_embd=1024), # 350M params
|
||||
'gpt2-large': dict(n_layer=36, n_head=20, n_embd=1280), # 774M params
|
||||
'gpt2-xl': dict(n_layer=48, n_head=25, n_embd=1600), # 1558M params
|
||||
}[model_type]
|
||||
print("forcing vocab_size=50257, block_size=1024, bias=True")
|
||||
config_args['vocab_size'] = 50257 # always 50257 for GPT model checkpoints
|
||||
config_args['block_size'] = 1024 # always 1024 for GPT model checkpoints
|
||||
config_args['bias'] = True # always True for GPT model checkpoints
|
||||
# we can override the dropout rate, if desired
|
||||
if 'dropout' in override_args:
|
||||
print(f"overriding dropout rate to {override_args['dropout']}")
|
||||
config_args['dropout'] = override_args['dropout']
|
||||
# create a from-scratch initialized minGPT model
|
||||
config = GPTConfig(**config_args)
|
||||
model = GPT(config)
|
||||
sd = model.state_dict()
|
||||
sd_keys = sd.keys()
|
||||
sd_keys = [k for k in sd_keys if not k.endswith('.attn.bias')] # discard this mask / buffer, not a param
|
||||
|
||||
# init a huggingface/transformers model
|
||||
model_hf = GPT2LMHeadModel.from_pretrained(model_type)
|
||||
sd_hf = model_hf.state_dict()
|
||||
|
||||
# copy while ensuring all of the parameters are aligned and match in names and shapes
|
||||
sd_keys_hf = sd_hf.keys()
|
||||
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.masked_bias')] # ignore these, just a buffer
|
||||
sd_keys_hf = [k for k in sd_keys_hf if not k.endswith('.attn.bias')] # same, just the mask (buffer)
|
||||
transposed = ['attn.c_attn.weight', 'attn.c_proj.weight', 'mlp.c_fc.weight', 'mlp.c_proj.weight']
|
||||
# basically the openai checkpoints use a "Conv1D" module, but we only want to use a vanilla Linear
|
||||
# this means that we have to transpose these weights when we import them
|
||||
assert len(sd_keys_hf) == len(sd_keys), f"mismatched keys: {len(sd_keys_hf)} != {len(sd_keys)}"
|
||||
for k in sd_keys_hf:
|
||||
if any(k.endswith(w) for w in transposed):
|
||||
# special treatment for the Conv1D weights we need to transpose
|
||||
assert sd_hf[k].shape[::-1] == sd[k].shape
|
||||
with torch.no_grad():
|
||||
sd[k].copy_(sd_hf[k].t())
|
||||
else:
|
||||
# vanilla copy over the other parameters
|
||||
assert sd_hf[k].shape == sd[k].shape
|
||||
with torch.no_grad():
|
||||
sd[k].copy_(sd_hf[k])
|
||||
|
||||
return model
|
||||
|
||||
def configure_optimizers(self, weight_decay, learning_rate, betas, device_type):
|
||||
# TODO: add expert config
|
||||
# start with all of the candidate parameters
|
||||
param_dict = {pn: p for pn, p in self.named_parameters()}
|
||||
# filter out those that do not require grad
|
||||
param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad}
|
||||
# create optim groups. Any parameters that is 2D will be weight decayed, otherwise no.
|
||||
# i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
|
||||
# add an extra check for "bias" string to account for bias terms in MoE layers
|
||||
decay_params = [p for n, p in param_dict.items() if (p.dim() >= 2 and not n.endswith('bias'))]
|
||||
nodecay_params = [p for n, p in param_dict.items() if (p.dim() < 2 or n.endswith('bias'))]
|
||||
optim_groups = [
|
||||
{'params': decay_params, 'weight_decay': weight_decay},
|
||||
{'params': nodecay_params, 'weight_decay': 0.0}
|
||||
]
|
||||
num_decay_params = sum(p.numel() for p in decay_params)
|
||||
num_nodecay_params = sum(p.numel() for p in nodecay_params)
|
||||
print(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters")
|
||||
print(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters")
|
||||
# Create AdamW optimizer and use the fused version if it is available
|
||||
fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters
|
||||
use_fused = fused_available and device_type == 'cuda'
|
||||
extra_args = dict(fused=True) if use_fused else dict()
|
||||
optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, **extra_args)
|
||||
print(f"using fused AdamW: {use_fused}")
|
||||
|
||||
return optimizer
|
||||
|
||||
def estimate_mfu(self, fwdbwd_per_iter, dt):
|
||||
""" estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS """
|
||||
# first estimate the number of flops we do per iteration.
|
||||
# see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311
|
||||
N = self.get_num_params()
|
||||
cfg = self.config
|
||||
L, H, Q, T = cfg.n_layer, cfg.n_head, cfg.n_embd//cfg.n_head, cfg.block_size
|
||||
flops_per_token = 6*N + 12*L*H*Q*T
|
||||
flops_per_fwdbwd = flops_per_token * T
|
||||
flops_per_iter = flops_per_fwdbwd * fwdbwd_per_iter
|
||||
# express our flops throughput as ratio of A100 bfloat16 peak flops
|
||||
flops_achieved = flops_per_iter * (1.0/dt) # per second
|
||||
flops_promised = 312e12 # A100 GPU bfloat16 peak flops is 312 TFLOPS
|
||||
mfu = flops_achieved / flops_promised
|
||||
return mfu
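
    # Worked example (illustrative): for the default 124M-parameter config
    # (N ~ 124e6, L=12, H=12, Q=64, T=1024), flops_per_token = 6N + 12*L*H*Q*T
    # is roughly 744e6 + 113e6 = 857e6, so flops_per_fwdbwd is about 878e9.
    # With fwdbwd_per_iter=8 and dt=0.5s, flops_achieved is about 14.0e12 FLOP/s
    # and mfu is about 14.0e12 / 312e12, i.e. roughly 4.5%.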
|
||||
|
||||
@torch.no_grad()
|
||||
def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
|
||||
"""
|
||||
Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
|
||||
the sequence max_new_tokens times, feeding the predictions back into the model each time.
|
||||
Most likely you'll want to make sure to be in model.eval() mode of operation for this.
|
||||
"""
|
||||
for _ in range(max_new_tokens):
|
||||
# if the sequence context is growing too long we must crop it at block_size
|
||||
idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
|
||||
# forward the model to get the logits for the index in the sequence
|
||||
logits, _ = self(idx_cond)
|
||||
# pluck the logits at the final step and scale by desired temperature
|
||||
logits = logits[:, -1, :] / temperature
|
||||
# optionally crop the logits to only the top k options
|
||||
if top_k is not None:
|
||||
v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
|
||||
logits[logits < v[:, [-1]]] = -float('Inf')
|
||||
# apply softmax to convert logits to (normalized) probabilities
|
||||
probs = F.softmax(logits, dim=-1)
|
||||
# sample from the distribution
|
||||
idx_next = torch.multinomial(probs, num_samples=1)
|
||||
# append sampled index to the running sequence and continue
|
||||
idx = torch.cat((idx, idx_next), dim=1)
|
||||
|
||||
return idx
|
||||
398
nanochat_moe/tokenizer.py
Normal file
@ -0,0 +1,398 @@
"""
|
||||
BPE Tokenizer in the style of GPT-4.
|
||||
|
||||
Two implementations are available:
|
||||
1) HuggingFace Tokenizer that can do both training and inference but is really confusing
|
||||
2) Our own RustBPE Tokenizer for training and tiktoken for efficient inference
|
||||
"""
|
||||
|
||||
import os
|
||||
import copy
|
||||
from functools import lru_cache
|
||||
|
||||
SPECIAL_TOKENS = [
|
||||
# every document begins with the Beginning of Sequence (BOS) token that delimits documents
|
||||
"<|bos|>",
|
||||
# tokens below are only used during finetuning to render Conversations into token ids
|
||||
"<|user_start|>", # user messages
|
||||
"<|user_end|>",
|
||||
"<|assistant_start|>", # assistant messages
|
||||
"<|assistant_end|>",
|
||||
"<|python_start|>", # assistant invokes python REPL tool
|
||||
"<|python_end|>",
|
||||
"<|output_start|>", # python REPL outputs back to assistant
|
||||
"<|output_end|>",
|
||||
]
|
||||
|
||||
# NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3}
|
||||
# I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes.
|
||||
# I haven't validated that this is actually a good idea, TODO.
|
||||
SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+"""
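
# To see what the split pattern does, the pieces can be inspected with the
# third-party `regex` package (illustrative snippet; output shown is roughly
# what to expect and may differ slightly):
#
#   import regex
#   regex.findall(SPLIT_PATTERN, "Hello world, it's 2024!")
#   # -> roughly ['Hello', ' world', ',', ' it', "'s", ' ', '20', '24', '!']
#
# note how the year splits into two-digit chunks because of \p{N}{1,2}.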
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Generic GPT-4-style tokenizer based on HuggingFace Tokenizer
|
||||
from tokenizers import Tokenizer as HFTokenizer
|
||||
from tokenizers import pre_tokenizers, decoders, Regex
|
||||
from tokenizers.models import BPE
|
||||
from tokenizers.trainers import BpeTrainer
|
||||
|
||||
class HuggingFaceTokenizer:
|
||||
"""Light wrapper around HuggingFace Tokenizer for some utilities"""
|
||||
|
||||
def __init__(self, tokenizer):
|
||||
self.tokenizer = tokenizer
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, hf_path):
|
||||
# init from a HuggingFace pretrained tokenizer (e.g. "gpt2")
|
||||
tokenizer = HFTokenizer.from_pretrained(hf_path)
|
||||
return cls(tokenizer)
|
||||
|
||||
@classmethod
|
||||
def from_directory(cls, tokenizer_dir):
|
||||
# init from a local directory on disk (e.g. "out/tokenizer")
|
||||
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
|
||||
tokenizer = HFTokenizer.from_file(tokenizer_path)
|
||||
return cls(tokenizer)
|
||||
|
||||
@classmethod
|
||||
def train_from_iterator(cls, text_iterator, vocab_size):
|
||||
# train from an iterator of text
|
||||
# Configure the HuggingFace Tokenizer
|
||||
tokenizer = HFTokenizer(BPE(
|
||||
byte_fallback=True, # needed!
|
||||
unk_token=None,
|
||||
fuse_unk=False,
|
||||
))
|
||||
# Normalizer: None
|
||||
tokenizer.normalizer = None
|
||||
# Pre-tokenizer: GPT-4 style
|
||||
# the regex pattern used by GPT-4 to split text into groups before BPE
|
||||
# NOTE: The pattern was changed from \p{N}{1,3} to \p{N}{1,2} because I suspect it is harmful to
|
||||
# very small models and smaller vocab sizes, because it is a little bit wasteful in the token space.
|
||||
# (but I haven't validated this! TODO)
|
||||
gpt4_split_regex = Regex(SPLIT_PATTERN) # huggingface demands that you wrap it in Regex!!
|
||||
tokenizer.pre_tokenizer = pre_tokenizers.Sequence([
|
||||
pre_tokenizers.Split(pattern=gpt4_split_regex, behavior="isolated", invert=False),
|
||||
pre_tokenizers.ByteLevel(add_prefix_space=False, use_regex=False)
|
||||
])
|
||||
# Decoder: ByteLevel (it pairs together with the ByteLevel pre-tokenizer)
|
||||
tokenizer.decoder = decoders.ByteLevel()
|
||||
# Post-processor: None
|
||||
tokenizer.post_processor = None
|
||||
# Trainer: BPE
|
||||
trainer = BpeTrainer(
|
||||
vocab_size=vocab_size,
|
||||
show_progress=True,
|
||||
min_frequency=0, # no minimum frequency
|
||||
initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
|
||||
special_tokens=SPECIAL_TOKENS,
|
||||
)
|
||||
# Kick off the training
|
||||
tokenizer.train_from_iterator(text_iterator, trainer)
|
||||
return cls(tokenizer)
|
||||
|
||||
def get_vocab_size(self):
|
||||
return self.tokenizer.get_vocab_size()
|
||||
|
||||
def get_special_tokens(self):
|
||||
special_tokens_map = self.tokenizer.get_added_tokens_decoder()
|
||||
special_tokens = [w.content for w in special_tokens_map.values()]
|
||||
return special_tokens
|
||||
|
||||
def id_to_token(self, id):
|
||||
return self.tokenizer.id_to_token(id)
|
||||
|
||||
def _encode_one(self, text, prepend=None, append=None):
|
||||
# encode a single string
|
||||
        # prepend/append can each be either a special token string or a token id directly.
|
||||
assert isinstance(text, str)
|
||||
ids = []
|
||||
if prepend is not None:
|
||||
prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
|
||||
ids.append(prepend_id)
|
||||
ids.extend(self.tokenizer.encode(text, add_special_tokens=False).ids)
|
||||
if append is not None:
|
||||
append_id = append if isinstance(append, int) else self.encode_special(append)
|
||||
ids.append(append_id)
|
||||
return ids
|
||||
|
||||
def encode_special(self, text):
|
||||
# encode a single special token via exact match
|
||||
return self.tokenizer.token_to_id(text)
|
||||
|
||||
def get_bos_token_id(self):
|
||||
bos = self.encode_special("<|bos|>")
|
||||
return bos
|
||||
|
||||
def encode(self, text, *args, **kwargs):
|
||||
if isinstance(text, str):
|
||||
return self._encode_one(text, *args, **kwargs)
|
||||
elif isinstance(text, list):
|
||||
return [self._encode_one(t, *args, **kwargs) for t in text]
|
||||
else:
|
||||
raise ValueError(f"Invalid input type: {type(text)}")
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.encode(*args, **kwargs)
|
||||
|
||||
def decode(self, ids):
|
||||
return self.tokenizer.decode(ids, skip_special_tokens=False)
|
||||
|
||||
def save(self, tokenizer_dir):
|
||||
# save the tokenizer to disk
|
||||
os.makedirs(tokenizer_dir, exist_ok=True)
|
||||
tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
|
||||
self.tokenizer.save(tokenizer_path)
|
||||
print(f"Saved tokenizer to {tokenizer_path}")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Tokenizer based on rustbpe + tiktoken combo
|
||||
import pickle
|
||||
import rustbpe
|
||||
import tiktoken
|
||||
|
||||
class RustBPETokenizer:
|
||||
"""Light wrapper around tiktoken (for efficient inference) but train with rustbpe"""
|
||||
|
||||
def __init__(self, enc, bos_token):
|
||||
self.enc = enc
|
||||
self.bos_token_id = self.encode_special(bos_token)
|
||||
|
||||
@classmethod
|
||||
def train_from_iterator(cls, text_iterator, vocab_size):
|
||||
# 1) train using rustbpe
|
||||
tokenizer = rustbpe.Tokenizer()
|
||||
# the special tokens are inserted later in __init__, we don't train them here
|
||||
vocab_size_no_special = vocab_size - len(SPECIAL_TOKENS)
|
||||
assert vocab_size_no_special >= 256, f"vocab_size_no_special must be at least 256, got {vocab_size_no_special}"
|
||||
tokenizer.train_from_iterator(text_iterator, vocab_size_no_special, pattern=SPLIT_PATTERN)
|
||||
# 2) construct the associated tiktoken encoding for inference
|
||||
pattern = tokenizer.get_pattern()
|
||||
mergeable_ranks_list = tokenizer.get_mergeable_ranks()
|
||||
mergeable_ranks = {bytes(k): v for k, v in mergeable_ranks_list}
|
||||
tokens_offset = len(mergeable_ranks)
|
||||
special_tokens = {name: tokens_offset + i for i, name in enumerate(SPECIAL_TOKENS)}
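# In other words: the ordinary BPE tokens learned by rustbpe occupy ids [0, tokens_offset),
# and the SPECIAL_TOKENS are appended at the top of the vocabulary, occupying ids
# [tokens_offset, vocab_size).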
|
||||
enc = tiktoken.Encoding(
|
||||
name="rustbpe",
|
||||
pat_str=pattern,
|
||||
mergeable_ranks=mergeable_ranks, # dict[bytes, int] (token bytes -> merge priority rank)
|
||||
special_tokens=special_tokens, # dict[str, int] (special token name -> token id)
|
||||
)
|
||||
return cls(enc, "<|bos|>")
|
||||
|
||||
@classmethod
|
||||
def from_directory(cls, tokenizer_dir):
|
||||
pickle_path = os.path.join(tokenizer_dir, "tokenizer.pkl")
|
||||
with open(pickle_path, "rb") as f:
|
||||
enc = pickle.load(f)
|
||||
return cls(enc, "<|bos|>")
|
||||
|
||||
@classmethod
|
||||
def from_pretrained(cls, tiktoken_name):
|
||||
# https://github.com/openai/tiktoken/blob/eedc8563/tiktoken_ext/openai_public.py
|
||||
enc = tiktoken.get_encoding(tiktoken_name)
|
||||
# tiktoken calls the special document delimiter token "<|endoftext|>"
|
||||
# yes this is confusing because this token is almost always PREPENDED to the beginning of the document
|
||||
# it is most often used to signal the start of a new sequence to the LLM during inference etc.
|
||||
# so in nanochat we always use "<|bos|>", short for "beginning of sequence", but historically it is often called "<|endoftext|>".
|
||||
return cls(enc, "<|endoftext|>")
|
||||
|
||||
def get_vocab_size(self):
|
||||
return self.enc.n_vocab
|
||||
|
||||
def get_special_tokens(self):
|
||||
return self.enc.special_tokens_set
|
||||
|
||||
def id_to_token(self, id):
|
||||
return self.enc.decode([id])
|
||||
|
||||
@lru_cache(maxsize=32)
|
||||
def encode_special(self, text):
|
||||
return self.enc.encode_single_token(text)
|
||||
|
||||
def get_bos_token_id(self):
|
||||
return self.bos_token_id
|
||||
|
||||
def encode(self, text, prepend=None, append=None, num_threads=8):
|
||||
# text can be either a string or a list of strings
|
||||
|
||||
if prepend is not None:
|
||||
prepend_id = prepend if isinstance(prepend, int) else self.encode_special(prepend)
|
||||
if append is not None:
|
||||
append_id = append if isinstance(append, int) else self.encode_special(append)
|
||||
|
||||
if isinstance(text, str):
|
||||
ids = self.enc.encode_ordinary(text)
|
||||
if prepend is not None:
|
||||
ids.insert(0, prepend_id) # TODO: slightly inefficient here? :( hmm
|
||||
if append is not None:
|
||||
ids.append(append_id)
|
||||
elif isinstance(text, list):
|
||||
ids = self.enc.encode_ordinary_batch(text, num_threads=num_threads)
|
||||
if prepend is not None:
|
||||
for ids_row in ids:
|
||||
ids_row.insert(0, prepend_id) # TODO: same
|
||||
if append is not None:
|
||||
for ids_row in ids:
|
||||
ids_row.append(append_id)
|
||||
else:
|
||||
raise ValueError(f"Invalid input type: {type(text)}")
|
||||
|
||||
return ids
|
||||
|
||||
def __call__(self, *args, **kwargs):
|
||||
return self.encode(*args, **kwargs)
|
||||
|
||||
def decode(self, ids):
|
||||
return self.enc.decode(ids)
|
||||
|
||||
def save(self, tokenizer_dir):
|
||||
# save the encoding object to disk
|
||||
os.makedirs(tokenizer_dir, exist_ok=True)
|
||||
pickle_path = os.path.join(tokenizer_dir, "tokenizer.pkl")
|
||||
with open(pickle_path, "wb") as f:
|
||||
pickle.dump(self.enc, f)
|
||||
print(f"Saved tokenizer encoding to {pickle_path}")
|
||||
|
||||
def render_conversation(self, conversation, max_tokens=2048):
|
||||
"""
|
||||
Tokenize a single Chat conversation (which we call a "doc" or "document" here).
|
||||
Returns:
|
||||
- ids: list[int] is a list of token ids of this rendered conversation
|
||||
- mask: list[int] of the same length, with mask = 1 for the tokens the model is trained on (the Assistant's tokens).
|
||||
"""
|
||||
# ids, masks that we will return and a helper function to help build them up.
|
||||
ids, mask = [], []
|
||||
def add_tokens(token_ids, mask_val):
|
||||
if isinstance(token_ids, int):
|
||||
token_ids = [token_ids]
|
||||
ids.extend(token_ids)
|
||||
mask.extend([mask_val] * len(token_ids))
|
||||
|
||||
# sometimes the first message is a system message...
|
||||
# => just merge it with the second (user) message
|
||||
if conversation["messages"][0]["role"] == "system":
|
||||
# some conversation surgery is necessary here for now...
|
||||
conversation = copy.deepcopy(conversation) # avoid mutating the original
|
||||
messages = conversation["messages"]
|
||||
assert messages[1]["role"] == "user", "System message must be followed by a user message"
|
||||
messages[1]["content"] = messages[0]["content"] + "\n\n" + messages[1]["content"]
|
||||
messages = messages[1:]
|
||||
else:
|
||||
messages = conversation["messages"]
|
||||
assert len(messages) >= 1, f"Conversation must have at least 1 message: {messages}"
|
||||
|
||||
# fetch all the special tokens we need
|
||||
bos = self.get_bos_token_id()
|
||||
user_start, user_end = self.encode_special("<|user_start|>"), self.encode_special("<|user_end|>")
|
||||
assistant_start, assistant_end = self.encode_special("<|assistant_start|>"), self.encode_special("<|assistant_end|>")
|
||||
python_start, python_end = self.encode_special("<|python_start|>"), self.encode_special("<|python_end|>")
|
||||
output_start, output_end = self.encode_special("<|output_start|>"), self.encode_special("<|output_end|>")
|
||||
|
||||
# now we can tokenize the conversation
|
||||
add_tokens(bos, 0)
|
||||
for i, message in enumerate(messages):
|
||||
|
||||
# some sanity checking here around assumptions, to prevent footguns
|
||||
must_be_from = "user" if i % 2 == 0 else "assistant"
|
||||
assert message["role"] == must_be_from, f"Message {i} is from {message['role']} but should be from {must_be_from}"
|
||||
|
||||
# content can be either a simple string or a list of parts (e.g. containing tool calls)
|
||||
content = message["content"]
|
||||
|
||||
if message["role"] == "user":
|
||||
assert isinstance(content, str), "User messages are simply expected to be strings"
|
||||
value_ids = self.encode(content)
|
||||
add_tokens(user_start, 0)
|
||||
add_tokens(value_ids, 0)
|
||||
add_tokens(user_end, 0)
|
||||
elif message["role"] == "assistant":
|
||||
add_tokens(assistant_start, 0)
|
||||
if isinstance(content, str):
|
||||
# simple string => simply add the tokens
|
||||
value_ids = self.encode(content)
|
||||
add_tokens(value_ids, 1)
|
||||
elif isinstance(content, list):
|
||||
for part in content:
|
||||
value_ids = self.encode(part["text"])
|
||||
if part["type"] == "text":
|
||||
# string part => simply add the tokens
|
||||
add_tokens(value_ids, 1)
|
||||
elif part["type"] == "python":
|
||||
# python tool call => add the tokens inside <|python_start|> and <|python_end|>
|
||||
add_tokens(python_start, 1)
|
||||
add_tokens(value_ids, 1)
|
||||
add_tokens(python_end, 1)
|
||||
elif part["type"] == "python_output":
|
||||
# python output => add the tokens inside <|output_start|> and <|output_end|>
|
||||
# none of these tokens are supervised because the tokens come from Python at test time
|
||||
add_tokens(output_start, 0)
|
||||
add_tokens(value_ids, 0)
|
||||
add_tokens(output_end, 0)
|
||||
else:
|
||||
raise ValueError(f"Unknown part type: {part['type']}")
|
||||
else:
|
||||
raise ValueError(f"Unknown content type: {type(content)}")
|
||||
add_tokens(assistant_end, 1)
|
||||
|
||||
# truncate to max_tokens tokens MAX (helps prevent OOMs)
|
||||
ids = ids[:max_tokens]
|
||||
mask = mask[:max_tokens]
|
||||
return ids, mask
|
||||
|
||||
def visualize_tokenization(self, ids, mask, with_token_id=False):
|
||||
"""Small helper function useful in debugging: visualize the tokenization of render_conversation"""
|
||||
RED = '\033[91m'
|
||||
GREEN = '\033[92m'
|
||||
RESET = '\033[0m'
|
||||
GRAY = '\033[90m'
|
||||
tokens = []
|
||||
for i, (token_id, mask_val) in enumerate(zip(ids, mask)):
|
||||
token_str = self.decode([token_id])
|
||||
color = GREEN if mask_val == 1 else RED
|
||||
tokens.append(f"{color}{token_str}{RESET}")
|
||||
if with_token_id:
|
||||
tokens.append(f"{GRAY}({token_id}){RESET}")
|
||||
return '|'.join(tokens)
|
||||
|
||||
def render_for_completion(self, conversation):
|
||||
"""
|
||||
Used during Reinforcement Learning. In that setting, we want to
|
||||
render the conversation priming the Assistant for a completion.
|
||||
Unlike the Chat SFT case, we don't need to return the mask.
|
||||
"""
|
||||
# We have some surgery to do: we need to pop the last message (of the Assistant)
|
||||
conversation = copy.deepcopy(conversation) # avoid mutating the original
|
||||
messages = conversation["messages"]
|
||||
assert messages[-1]["role"] == "assistant", "Last message must be from the Assistant"
|
||||
messages.pop() # remove the last message (of the Assistant) inplace
|
||||
|
||||
# Now tokenize the conversation
|
||||
ids, mask = self.render_conversation(conversation)
|
||||
|
||||
# Finally, to prime the Assistant for a completion, append the Assistant start token
|
||||
assistant_start = self.encode_special("<|assistant_start|>")
|
||||
ids.append(assistant_start)
|
||||
return ids
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# nanochat-specific convenience functions
|
||||
|
||||
def get_tokenizer():
|
||||
from nanochat_moe.common import get_base_dir
|
||||
base_dir = get_base_dir()
|
||||
tokenizer_dir = os.path.join(base_dir, "tokenizer")
|
||||
# return HuggingFaceTokenizer.from_directory(tokenizer_dir)
|
||||
return RustBPETokenizer.from_directory(tokenizer_dir)
|
||||
|
||||
def get_token_bytes(device="cpu"):
|
||||
import torch
|
||||
from nanochat_moe.common import get_base_dir
|
||||
base_dir = get_base_dir()
|
||||
tokenizer_dir = os.path.join(base_dir, "tokenizer")
|
||||
token_bytes_path = os.path.join(tokenizer_dir, "token_bytes.pt")
|
||||
assert os.path.exists(token_bytes_path), f"Token bytes not found at {token_bytes_path}? It gets written by tok_train.py"
|
||||
with open(token_bytes_path, "rb") as f:
|
||||
token_bytes = torch.load(f, map_location=device)
|
||||
return token_bytes
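For orientation, here is a minimal usage sketch of the helpers above, assuming a tokenizer has already been trained and saved by tok_train.py; the module path nanochat_moe.tokenizer is an assumption based on the imports used in this file:
from nanochat_moe.tokenizer import get_tokenizer  # module path assumed
tok = get_tokenizer()  # loads tokenizer.pkl from the base dir's tokenizer/ directory
ids = tok.encode("Why is the sky blue?", prepend="<|bos|>")
print(tok.decode(ids))
conversation = {"messages": [
    {"role": "user", "content": "Why is the sky blue?"},
    {"role": "assistant", "content": "Mostly Rayleigh scattering."},
]}
ids, mask = tok.render_conversation(conversation)
print(tok.visualize_tokenization(ids, mask))  # green = supervised (Assistant) tokens, red = unsupervised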
|
||||
562
nanochat_moe/ui.html
Normal file
|
|
@ -0,0 +1,562 @@
|
|||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0, viewport-fit=cover">
|
||||
<title>NanoChat</title>
|
||||
<link rel="icon" type="image/svg+xml" href="/logo.svg">
|
||||
<style>
|
||||
:root {
|
||||
color-scheme: light;
|
||||
}
|
||||
|
||||
* {
|
||||
box-sizing: border-box;
|
||||
}
|
||||
|
||||
body {
|
||||
font-family: ui-sans-serif, -apple-system, system-ui, "Segoe UI", Helvetica, "Apple Color Emoji", Arial, sans-serif, "Segoe UI Emoji", "Segoe UI Symbol";
|
||||
background-color: #ffffff;
|
||||
color: #111827;
|
||||
min-height: 100dvh;
|
||||
margin: 0;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.header {
|
||||
background-color: #ffffff;
|
||||
padding: 1.25rem 1.5rem;
|
||||
}
|
||||
|
||||
.header-left {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.header-logo {
|
||||
height: 32px;
|
||||
width: auto;
|
||||
}
|
||||
|
||||
.header h1 {
|
||||
font-size: 1.25rem;
|
||||
font-weight: 600;
|
||||
margin: 0;
|
||||
color: #111827;
|
||||
}
|
||||
|
||||
.new-conversation-btn {
|
||||
width: 32px;
|
||||
height: 32px;
|
||||
padding: 0;
|
||||
border: 1px solid #e5e7eb;
|
||||
border-radius: 0.5rem;
|
||||
background-color: #ffffff;
|
||||
color: #6b7280;
|
||||
cursor: pointer;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
transition: all 0.2s ease;
|
||||
}
|
||||
|
||||
.new-conversation-btn:hover {
|
||||
background-color: #f3f4f6;
|
||||
border-color: #d1d5db;
|
||||
color: #374151;
|
||||
}
|
||||
|
||||
.chat-container {
|
||||
flex: 1;
|
||||
overflow-y: auto;
|
||||
background-color: #ffffff;
|
||||
}
|
||||
|
||||
.chat-wrapper {
|
||||
max-width: 48rem;
|
||||
margin: 0 auto;
|
||||
padding: 2rem 1.5rem 3rem;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 0.75rem;
|
||||
}
|
||||
|
||||
.message {
|
||||
display: flex;
|
||||
justify-content: flex-start;
|
||||
margin-bottom: 0.5rem;
|
||||
color: #0d0d0d;
|
||||
}
|
||||
|
||||
.message.assistant {
|
||||
justify-content: flex-start;
|
||||
}
|
||||
|
||||
.message.user {
|
||||
justify-content: flex-end;
|
||||
}
|
||||
|
||||
.message-content {
|
||||
white-space: pre-wrap;
|
||||
line-height: 1.6;
|
||||
max-width: 100%;
|
||||
}
|
||||
|
||||
.message.assistant .message-content {
|
||||
background: transparent;
|
||||
border: none;
|
||||
padding: 0.25rem 0;
|
||||
cursor: pointer;
|
||||
border-radius: 0.5rem;
|
||||
padding: 0.5rem;
|
||||
margin-left: -0.5rem;
|
||||
transition: background-color 0.2s ease;
|
||||
}
|
||||
|
||||
.message.assistant .message-content:hover {
|
||||
background-color: #f9fafb;
|
||||
}
|
||||
|
||||
.message.user .message-content {
|
||||
background-color: #f3f4f6;
|
||||
border-radius: 1.25rem;
|
||||
padding: 0.8rem 1rem;
|
||||
max-width: 65%;
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease;
|
||||
}
|
||||
|
||||
.message.user .message-content:hover {
|
||||
background-color: #e5e7eb;
|
||||
}
|
||||
|
||||
.message.console .message-content {
|
||||
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', 'Consolas', 'Courier New', monospace;
|
||||
font-size: 0.875rem;
|
||||
background-color: #fafafa;
|
||||
padding: 0.75rem 1rem;
|
||||
color: #374151;
|
||||
max-width: 80%;
|
||||
}
|
||||
|
||||
.input-container {
|
||||
background-color: #ffffff;
|
||||
padding: 1rem;
|
||||
padding-bottom: calc(1rem + env(safe-area-inset-bottom))
|
||||
}
|
||||
|
||||
.input-wrapper {
|
||||
max-width: 48rem;
|
||||
margin: 0 auto;
|
||||
display: flex;
|
||||
gap: 0.75rem;
|
||||
align-items: flex-end;
|
||||
}
|
||||
|
||||
.chat-input {
|
||||
flex: 1;
|
||||
padding: 0.8rem 1rem;
|
||||
border: 1px solid #d1d5db;
|
||||
border-radius: 0.75rem;
|
||||
background-color: #ffffff;
|
||||
color: #111827;
|
||||
font-size: 1rem;
|
||||
line-height: 1.5;
|
||||
resize: none;
|
||||
outline: none;
|
||||
min-height: 54px;
|
||||
max-height: 200px;
|
||||
transition: border-color 0.2s ease, box-shadow 0.2s ease;
|
||||
}
|
||||
|
||||
.chat-input::placeholder {
|
||||
color: #9ca3af;
|
||||
}
|
||||
|
||||
.chat-input:focus {
|
||||
border-color: #2563eb;
|
||||
box-shadow: 0 0 0 3px rgba(37, 99, 235, 0.1);
|
||||
}
|
||||
|
||||
.send-button {
|
||||
flex-shrink: 0;
|
||||
padding: 0;
|
||||
width: 54px;
|
||||
height: 54px;
|
||||
border: 1px solid #111827;
|
||||
border-radius: 0.75rem;
|
||||
background-color: #111827;
|
||||
color: #ffffff;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
cursor: pointer;
|
||||
transition: background-color 0.2s ease, border-color 0.2s ease, color 0.2s ease;
|
||||
}
|
||||
|
||||
.send-button:hover:not(:disabled) {
|
||||
background-color: #2563eb;
|
||||
border-color: #2563eb;
|
||||
}
|
||||
|
||||
.send-button:disabled {
|
||||
cursor: not-allowed;
|
||||
border-color: #d1d5db;
|
||||
background-color: #e5e7eb;
|
||||
color: #9ca3af;
|
||||
}
|
||||
|
||||
.typing-indicator {
|
||||
display: inline-block;
|
||||
color: #6b7280;
|
||||
letter-spacing: 0.15em;
|
||||
}
|
||||
|
||||
.typing-indicator::after {
|
||||
content: '···';
|
||||
animation: typing 1.4s infinite;
|
||||
}
|
||||
|
||||
@keyframes typing {
|
||||
0%, 60%, 100% { opacity: 0.2; }
|
||||
30% { opacity: 1; }
|
||||
}
|
||||
|
||||
.error-message {
|
||||
background-color: #fee2e2;
|
||||
border: 1px solid #fecaca;
|
||||
color: #b91c1c;
|
||||
padding: 0.75rem 1rem;
|
||||
border-radius: 0.75rem;
|
||||
margin-top: 0.5rem;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="header">
|
||||
<div class="header-left">
|
||||
<button class="new-conversation-btn" onclick="newConversation()" title="New Conversation (Ctrl+Shift+N)">
|
||||
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M12 5v14"></path>
|
||||
<path d="M5 12h14"></path>
|
||||
</svg>
|
||||
</button>
|
||||
<h1>nanochat</h1>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="chat-container" id="chatContainer">
|
||||
<div class="chat-wrapper" id="chatWrapper">
|
||||
<!-- Messages will be added here -->
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="input-container">
|
||||
<div class="input-wrapper">
|
||||
<textarea
|
||||
id="chatInput"
|
||||
class="chat-input"
|
||||
placeholder="Ask anything"
|
||||
rows="1"
|
||||
onkeydown="handleKeyDown(event)"
|
||||
></textarea>
|
||||
<button id="sendButton" class="send-button" onclick="sendMessage()" disabled>
|
||||
<svg width="22" height="22" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
|
||||
<path d="M22 2L11 13"></path>
|
||||
<path d="M22 2l-7 20-4-9-9-4 20-7z"></path>
|
||||
</svg>
|
||||
</button>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
const API_URL = '';
|
||||
const chatContainer = document.getElementById('chatContainer');
|
||||
const chatWrapper = document.getElementById('chatWrapper');
|
||||
const chatInput = document.getElementById('chatInput');
|
||||
const sendButton = document.getElementById('sendButton');
|
||||
|
||||
let messages = [];
|
||||
let isGenerating = false;
|
||||
let currentTemperature = 0.8;
|
||||
let currentTopK = 50;
|
||||
|
||||
chatInput.addEventListener('input', function() {
|
||||
this.style.height = 'auto';
|
||||
this.style.height = Math.min(this.scrollHeight, 200) + 'px';
|
||||
sendButton.disabled = !this.value.trim() || isGenerating;
|
||||
});
|
||||
|
||||
function handleKeyDown(event) {
|
||||
if (event.key === 'Enter' && !event.shiftKey) {
|
||||
event.preventDefault();
|
||||
sendMessage();
|
||||
}
|
||||
}
|
||||
|
||||
document.addEventListener('keydown', function(event) {
|
||||
// Ctrl+Shift+N for new conversation
|
||||
if (event.ctrlKey && event.shiftKey && event.key === 'N') {
|
||||
event.preventDefault();
|
||||
if (!isGenerating) {
|
||||
newConversation();
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
function newConversation() {
|
||||
messages = [];
|
||||
chatWrapper.innerHTML = '';
|
||||
chatInput.value = '';
|
||||
chatInput.style.height = 'auto';
|
||||
sendButton.disabled = false;
|
||||
isGenerating = false;
|
||||
chatInput.focus();
|
||||
}
|
||||
|
||||
function addMessage(role, content, messageIndex = null) {
|
||||
const messageDiv = document.createElement('div');
|
||||
messageDiv.className = `message ${role}`;
|
||||
|
||||
const contentDiv = document.createElement('div');
|
||||
contentDiv.className = 'message-content';
|
||||
contentDiv.textContent = content;
|
||||
|
||||
// Add click handler for user messages to enable editing
|
||||
if (role === 'user' && messageIndex !== null) {
|
||||
contentDiv.setAttribute('data-message-index', messageIndex);
|
||||
contentDiv.setAttribute('title', 'Click to edit and restart from here');
|
||||
contentDiv.addEventListener('click', function() {
|
||||
if (!isGenerating) {
|
||||
editMessage(messageIndex);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
// Add click handler for assistant messages to enable regeneration
|
||||
if (role === 'assistant' && messageIndex !== null) {
|
||||
contentDiv.setAttribute('data-message-index', messageIndex);
|
||||
contentDiv.setAttribute('title', 'Click to regenerate this response');
|
||||
contentDiv.addEventListener('click', function() {
|
||||
if (!isGenerating) {
|
||||
regenerateMessage(messageIndex);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
messageDiv.appendChild(contentDiv);
|
||||
chatWrapper.appendChild(messageDiv);
|
||||
|
||||
chatContainer.scrollTop = chatContainer.scrollHeight;
|
||||
return contentDiv;
|
||||
}
|
||||
|
||||
function editMessage(messageIndex) {
|
||||
// Find the message in the messages array
|
||||
if (messageIndex < 0 || messageIndex >= messages.length) return;
|
||||
|
||||
const messageToEdit = messages[messageIndex];
|
||||
if (messageToEdit.role !== 'user') return;
|
||||
|
||||
// Copy message content to input
|
||||
chatInput.value = messageToEdit.content;
|
||||
chatInput.style.height = 'auto';
|
||||
chatInput.style.height = Math.min(chatInput.scrollHeight, 200) + 'px';
|
||||
|
||||
// Remove this message and all subsequent messages from the array
|
||||
messages = messages.slice(0, messageIndex);
|
||||
|
||||
// Remove message elements from DOM starting from messageIndex
|
||||
const allMessages = chatWrapper.querySelectorAll('.message');
|
||||
for (let i = messageIndex; i < allMessages.length; i++) {
|
||||
allMessages[i].remove();
|
||||
}
|
||||
|
||||
// Enable send button and focus input
|
||||
sendButton.disabled = false;
|
||||
chatInput.focus();
|
||||
}
|
||||
|
||||
async function generateAssistantResponse() {
|
||||
isGenerating = true;
|
||||
sendButton.disabled = true;
|
||||
|
||||
const assistantContent = addMessage('assistant', '');
|
||||
assistantContent.innerHTML = '<span class="typing-indicator"></span>';
|
||||
|
||||
try {
|
||||
const response = await fetch(`${API_URL}/chat/completions`, {
|
||||
method: 'POST',
|
||||
headers: {
|
||||
'Content-Type': 'application/json',
|
||||
},
|
||||
body: JSON.stringify({
|
||||
messages: messages,
|
||||
temperature: currentTemperature,
|
||||
top_k: currentTopK,
|
||||
max_tokens: 512
|
||||
}),
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
throw new Error(`HTTP error! status: ${response.status}`);
|
||||
}
|
||||
|
||||
const reader = response.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let fullResponse = '';
|
||||
assistantContent.textContent = '';
|
||||
|
||||
while (true) {
|
||||
const { done, value } = await reader.read();
|
||||
if (done) break;
|
||||
|
||||
const chunk = decoder.decode(value);
|
||||
const lines = chunk.split('\n');
|
||||
|
||||
for (const line of lines) {
|
||||
if (line.startsWith('data: ')) {
|
||||
try {
|
||||
const data = JSON.parse(line.slice(6));
|
||||
if (data.token) {
|
||||
fullResponse += data.token;
|
||||
assistantContent.textContent = fullResponse;
|
||||
chatContainer.scrollTop = chatContainer.scrollHeight;
|
||||
}
|
||||
} catch (e) {
// skip lines that are not complete JSON (e.g. a chunk split mid-line)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
const assistantMessageIndex = messages.length;
|
||||
messages.push({ role: 'assistant', content: fullResponse });
|
||||
|
||||
// Add click handler to regenerate this assistant message
|
||||
assistantContent.setAttribute('data-message-index', assistantMessageIndex);
|
||||
assistantContent.setAttribute('title', 'Click to regenerate this response');
|
||||
assistantContent.addEventListener('click', function() {
|
||||
if (!isGenerating) {
|
||||
regenerateMessage(assistantMessageIndex);
|
||||
}
|
||||
});
|
||||
|
||||
} catch (error) {
|
||||
console.error('Error:', error);
|
||||
assistantContent.innerHTML = `<div class="error-message">Error: ${error.message}</div>`;
|
||||
} finally {
|
||||
isGenerating = false;
|
||||
sendButton.disabled = !chatInput.value.trim();
|
||||
}
|
||||
}
|
||||
|
||||
async function regenerateMessage(messageIndex) {
|
||||
// Find the message in the messages array
|
||||
if (messageIndex < 0 || messageIndex >= messages.length) return;
|
||||
|
||||
const messageToRegenerate = messages[messageIndex];
|
||||
if (messageToRegenerate.role !== 'assistant') return;
|
||||
|
||||
// Remove this message and all subsequent messages from the array
|
||||
messages = messages.slice(0, messageIndex);
|
||||
|
||||
// Remove message elements from DOM starting from messageIndex
|
||||
const allMessages = chatWrapper.querySelectorAll('.message');
|
||||
for (let i = messageIndex; i < allMessages.length; i++) {
|
||||
allMessages[i].remove();
|
||||
}
|
||||
|
||||
// Regenerate the assistant response
|
||||
await generateAssistantResponse();
|
||||
}
|
||||
|
||||
function handleSlashCommand(command) {
|
||||
const parts = command.trim().split(/\s+/);
|
||||
const cmd = parts[0].toLowerCase();
|
||||
const arg = parts[1];
|
||||
|
||||
if (cmd === '/temperature') {
|
||||
if (arg === undefined) {
|
||||
addMessage('console', `Current temperature: ${currentTemperature}`);
|
||||
} else {
|
||||
const temp = parseFloat(arg);
|
||||
if (isNaN(temp) || temp < 0 || temp > 2) {
|
||||
addMessage('console', 'Invalid temperature. Must be between 0.0 and 2.0');
|
||||
} else {
|
||||
currentTemperature = temp;
|
||||
addMessage('console', `Temperature set to ${currentTemperature}`);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else if (cmd === '/topk') {
|
||||
if (arg === undefined) {
|
||||
addMessage('console', `Current top-k: ${currentTopK}`);
|
||||
} else {
|
||||
const topk = parseInt(arg);
|
||||
if (isNaN(topk) || topk < 1 || topk > 200) {
|
||||
addMessage('console', 'Invalid top-k. Must be between 1 and 200');
|
||||
} else {
|
||||
currentTopK = topk;
|
||||
addMessage('console', `Top-k set to ${currentTopK}`);
|
||||
}
|
||||
}
|
||||
return true;
|
||||
} else if (cmd === '/clear') {
|
||||
newConversation();
|
||||
return true;
|
||||
} else if (cmd === '/help') {
|
||||
addMessage('console',
|
||||
'Available commands:\n' +
|
||||
'/temperature - Show current temperature\n' +
|
||||
'/temperature <value> - Set temperature (0.0-2.0)\n' +
|
||||
'/topk - Show current top-k\n' +
|
||||
'/topk <value> - Set top-k (1-200)\n' +
|
||||
'/clear - Clear conversation\n' +
|
||||
'/help - Show this help message'
|
||||
);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
async function sendMessage() {
|
||||
const message = chatInput.value.trim();
|
||||
if (!message || isGenerating) return;
|
||||
|
||||
// Handle slash commands
|
||||
if (message.startsWith('/')) {
|
||||
chatInput.value = '';
|
||||
chatInput.style.height = 'auto';
|
||||
handleSlashCommand(message);
|
||||
return;
|
||||
}
|
||||
|
||||
chatInput.value = '';
|
||||
chatInput.style.height = 'auto';
|
||||
|
||||
const userMessageIndex = messages.length;
|
||||
messages.push({ role: 'user', content: message });
|
||||
addMessage('user', message, userMessageIndex);
|
||||
|
||||
await generateAssistantResponse();
|
||||
}
|
||||
|
||||
sendButton.disabled = false;
|
||||
|
||||
// Autofocus the chat input on page load
|
||||
chatInput.focus();
|
||||
|
||||
fetch(`${API_URL}/health`)
|
||||
.then(response => response.json())
|
||||
.then(data => {
|
||||
console.log('Engine status:', data);
|
||||
})
|
||||
.catch(error => {
|
||||
console.error('Engine not available:', error);
|
||||
chatWrapper.innerHTML = '<div class="error-message">Engine not running. Please start engine.py first.</div>';
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
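For reference, a minimal Python sketch of the streaming protocol this page speaks. The /chat/completions and /health endpoints, the request fields, and the "data: {\"token\": ...}" line format are taken from the JavaScript above; the host/port and the use of the requests library are assumptions (the page itself is served by the chat web server, so API_URL is empty there):
import json
import requests  # assumption: any HTTP client that supports streaming works

BASE = "http://localhost:8000"  # assumed host/port of the chat web server

resp = requests.post(
    f"{BASE}/chat/completions",
    json={
        "messages": [{"role": "user", "content": "Why is the sky blue?"}],
        "temperature": 0.8,
        "top_k": 50,
        "max_tokens": 512,
    },
    stream=True,
)
for line in resp.iter_lines(decode_unicode=True):
    if line and line.startswith("data: "):
        data = json.loads(line[len("data: "):])
        if "token" in data:
            print(data["token"], end="", flush=True)
print()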
|
||||
205
speedrun_moe.sh
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
#!/bin/bash
|
||||
|
||||
# This script is the "Best ChatGPT clone that $100 can buy".
|
||||
# It is designed to run in ~4 hours on an 8XH100 node at $3/GPU/hour.
|
||||
|
||||
# 1) Example launch (simplest):
|
||||
# bash speedrun_moe.sh
|
||||
# 2) Example launch in a screen session (because the run takes ~4 hours):
|
||||
# screen -L -Logfile speedrun.log -S speedrun bash speedrun_moe.sh
|
||||
# 3) Example launch with wandb logging, but see below for setting up wandb first:
|
||||
# WANDB_RUN=speedrun screen -L -Logfile speedrun.log -S speedrun bash speedrun_moe.sh
|
||||
|
||||
# Intermediate artifacts directory (the default would be ~/.cache/nanochat-moe; overridden below to live under /thullms)
|
||||
USER="dpq23"
|
||||
export OMP_NUM_THREADS=1
|
||||
export NANOCHAT_BASE_DIR="/thullms/$USER/.cache/nanochat-moe"
|
||||
export NANOCHAT_DATA_DIR="/thullms/$USER"
|
||||
mkdir -p $NANOCHAT_BASE_DIR
|
||||
mkdir -p $NANOCHAT_DATA_DIR
|
||||
|
||||
# Use tokenizer from nanochat (not nanochat-moe)
|
||||
# Create a symlink to nanochat's tokenizer directory if it doesn't exist
|
||||
NANOCHAT_TOKENIZER_DIR="/thullms/$USER/.cache/nanochat/tokenizer"
|
||||
MOE_TOKENIZER_DIR="$NANOCHAT_BASE_DIR/tokenizer"
|
||||
if [ -d "$NANOCHAT_TOKENIZER_DIR" ] && [ ! -e "$MOE_TOKENIZER_DIR" ]; then
|
||||
echo "Creating symlink to nanochat tokenizer: $MOE_TOKENIZER_DIR -> $NANOCHAT_TOKENIZER_DIR"
|
||||
ln -s "$NANOCHAT_TOKENIZER_DIR" "$MOE_TOKENIZER_DIR"
|
||||
elif [ ! -d "$NANOCHAT_TOKENIZER_DIR" ]; then
|
||||
echo "Warning: nanochat tokenizer directory not found at $NANOCHAT_TOKENIZER_DIR"
|
||||
echo "You may need to train the tokenizer first using nanochat's tok_train.py"
|
||||
fi
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# # China mirror configuration (environment mirror settings)
|
||||
|
||||
# # Configure pip mirror
|
||||
# mkdir -p ~/.pip
|
||||
# cat > ~/.pip/pip.conf << 'EOF'
|
||||
# [global]
|
||||
# index-url = https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
# trusted-host = pypi.tuna.tsinghua.edu.cn
|
||||
# timeout = 1000
|
||||
|
||||
# [install]
|
||||
# trusted-host = pypi.tuna.tsinghua.edu.cn
|
||||
# EOF
|
||||
|
||||
# # Configure Rust mirror
|
||||
# export RUSTUP_DIST_SERVER=https://rsproxy.cn
|
||||
# export RUSTUP_UPDATE_ROOT=https://rsproxy.cn/rustup
|
||||
# SHELL_RC="$HOME/.bashrc"
|
||||
# if [[ "$OSTYPE" == "darwin"* ]]; then
|
||||
# SHELL_RC="$HOME/.zshrc"
|
||||
# fi
|
||||
# if ! grep -q "RUSTUP_DIST_SERVER" "$SHELL_RC" 2>/dev/null; then
|
||||
# cat >> "$SHELL_RC" << 'EOF'
|
||||
|
||||
# # Rust mirror configuration
|
||||
# export RUSTUP_DIST_SERVER=https://rsproxy.cn
|
||||
# export RUSTUP_UPDATE_ROOT=https://rsproxy.cn/rustup
|
||||
# EOF
|
||||
# fi
|
||||
|
||||
# # Configure Cargo mirror
|
||||
# mkdir -p ~/.cargo
|
||||
# cat > ~/.cargo/config << 'EOF'
|
||||
# [source.crates-io]
|
||||
# replace-with = 'rsproxy-sparse'
|
||||
|
||||
# [source.rsproxy-sparse]
|
||||
# registry = "sparse+https://rsproxy.cn/index/"
|
||||
|
||||
# [net]
|
||||
# git-fetch-with-cli = true
|
||||
# EOF
|
||||
|
||||
# # Configure HuggingFace mirror
|
||||
# if ! grep -q "HF_ENDPOINT" "$SHELL_RC" 2>/dev/null; then
|
||||
# echo 'export HF_ENDPOINT=https://hf-mirror.com' >> "$SHELL_RC"
|
||||
# fi
|
||||
# export HF_ENDPOINT=https://hf-mirror.com
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# # Python venv setup with uv
|
||||
|
||||
# # install uv (if not already installed)
|
||||
# if ! command -v uv &> /dev/null; then
|
||||
# pip3 install uv -i https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
# fi
|
||||
# # create a .venv local virtual environment (if it doesn't exist)
|
||||
# [ -d ".venv" ] || uv venv
|
||||
# # install the repo dependencies with China mirror
|
||||
# export UV_INDEX_URL=https://pypi.tuna.tsinghua.edu.cn/simple
|
||||
# uv sync --extra gpu
|
||||
# # activate venv so that `python` uses the project's venv instead of system python
|
||||
source .venv/bin/activate
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# wandb setup
|
||||
# If you wish to use wandb for logging (it's nice, and recommended):
|
||||
# 1) Make sure to first log in to wandb, e.g. run:
|
||||
# `wandb login`
|
||||
# 2) Set the WANDB_RUN environment variable when running this script, e.g.:
|
||||
# `WANDB_RUN=d26 bash speedrun_moe.sh`
|
||||
WANDB_RUN=${WANDB_RUN:-moe}  # default this run's name to "moe" unless the caller sets WANDB_RUN
|
||||
if [ -z "$WANDB_RUN" ]; then
|
||||
# by default use "dummy" : it's handled as a special case, skips logging to wandb
|
||||
WANDB_RUN=dummy
|
||||
fi
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# # During the course of the run, we will be writing markdown reports to the report/
|
||||
# # directory in the base dir. This command clears it out and writes a header section
|
||||
# # with a bunch of system info and a timestamp that marks the start of the run.
|
||||
# python -m nanochat.report reset
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# # Tokenizer
|
||||
|
||||
# # Install Rust / Cargo (if not already installed)
|
||||
# if ! command -v rustc &> /dev/null; then
|
||||
# curl --proto '=https' --tlsv1.2 -sSf https://rsproxy.cn/rustup-init.sh | sh -s -- -y
|
||||
# source "$HOME/.cargo/env"
|
||||
# fi
|
||||
|
||||
# # Build the rustbpe Tokenizer
|
||||
# uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
|
||||
|
||||
# # Download the first ~2B characters of pretraining dataset
|
||||
# # look at dev/repackage_data_reference.py for details on how this data was prepared
|
||||
# # each data shard is ~250M chars
|
||||
# # so we download 2e9 / 250e6 = 8 data shards at this point
|
||||
# # each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk
|
||||
# python -m nanochat.dataset -n 8
|
||||
# # Immediately also kick off downloading more shards in the background while tokenizer trains
|
||||
# # See comment below for why 240 is the right number here
|
||||
# python -m nanochat.dataset -n 240 &
|
||||
# DATASET_DOWNLOAD_PID=$!
|
||||
# # train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data
|
||||
# python -m scripts.tok_train --max_chars=2000000000
|
||||
# # evaluate the tokenizer (report compression ratio etc.)
|
||||
# python -m scripts.tok_eval
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Base model (pretraining)
|
||||
|
||||
# The d20 model is 561M parameters.
|
||||
# Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens.
|
||||
# Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars.
|
||||
# At 250M chars/shard, this is 54B / 250M ~= 216 shards needed for pretraining.
|
||||
# Round up to 240 for safety. At ~100MB/shard, this downloads ~24GB of data to disk.
|
||||
# (The total number of shards available in the entire dataset is 1822.)
|
||||
# echo "Waiting for dataset download to complete..."
|
||||
# wait $DATASET_DOWNLOAD_PID
|
||||
|
||||
# Number of processes/GPUs to use
|
||||
NPROC_PER_NODE=2
|
||||
# Master port for distributed training (default: 29500)
|
||||
# Set this to avoid port conflicts when running multiple torchrun tasks simultaneously
|
||||
# Example: MASTER_PORT=29501 bash speedrun_moe.sh
|
||||
MASTER_PORT=${MASTER_PORT:-29501}
|
||||
# # # pretrain the d20 model
|
||||
MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts_moe.base_train -- --depth=20 --run=$WANDB_RUN
|
||||
# evaluate the model on a larger chunk of train/val data and draw some samples
|
||||
MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts_moe.base_loss
|
||||
# evaluate the model on CORE tasks
|
||||
MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts_moe.base_eval
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# # Midtraining (teach the model conversation special tokens, tool use, multiple choice)
|
||||
|
||||
# # download 2.3MB of synthetic identity conversations to impart a personality to nanochat
|
||||
# # see dev/gen_sft_data.py for details on how this data was prepared and to get a sense of how you can easily tune it
|
||||
# curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
|
||||
|
||||
# # run midtraining and eval the model
|
||||
# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --run=$WANDB_RUN
|
||||
# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# # Supervised Finetuning (domain adaptation: each row/conversation is trained on as its own standalone sequence)
|
||||
|
||||
# # train sft and re-eval right away (should see a small bump)
|
||||
# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
|
||||
# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
|
||||
|
||||
# # chat with the model over CLI! Leave out the -p to chat interactively
|
||||
# python -m scripts.chat_cli -p "Why is the sky blue?"
|
||||
|
||||
# even better, chat with your model over a pretty WebUI ChatGPT style
|
||||
# python -m scripts.chat_web
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# # Reinforcement Learning. Optional, and currently only on GSM8K
|
||||
# # (optional)
|
||||
|
||||
# # run reinforcement learning
|
||||
# MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_rl -- --run=$WANDB_RUN
|
||||
# # eval the RL model only on GSM8K
|
||||
# MASTER_PORT=$MASTER_PORT torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i rl -a GSM8K
|
||||
|
||||
# # -----------------------------------------------------------------------------
|
||||
# # Generate the full report by putting together all the sections
|
||||
# report.md is the output and will be copied to the current directory for convenience
|
||||
python -m nanochat_moe.report generate
|
||||