eval module needs testing

Muheng 2025-12-15 14:27:30 +08:00
parent 77da258ee1
commit bc11cd9e5b
5 changed files with 2410 additions and 1172 deletions

46
lm_eval.md Normal file

@@ -0,0 +1,46 @@
# Running lm-eval with nanochat checkpoints
This repo ships its own evals (CORE, ARC/GSM8K/MMLU/HumanEval/SpellingBee), but you can also run the HuggingFace-compatible [lm-evaluation-harness](tools/lm-eval) against exported checkpoints. The steps below assume you've already run `bash setup.sh` (which installs uv, the submodules, Python deps, and the Rust tokenizer).
## 1) Activate env
```bash
source .venv/bin/activate
```
## 2) Export a trained checkpoint to HF format
- `nanochat/to_hf.py` loads the latest checkpoint from `~/.cache/nanochat/<source>_checkpoints` and writes an HF folder.
- Choose source: `base` | `mid` | `sft` | `rl`.
```bash
# export latest base checkpoint to hf-export/base
uv run python -m nanochat.to_hf --source base --output hf-export/base
# export latest SFT checkpoint (chat model)
uv run python -m nanochat.to_hf --source sft --output hf-export/sft
```
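The export can also be driven from Python. A minimal sketch using the `export_to_hf` entry point from `nanochat/to_hf.py` (added in this commit); passing `None` for `model_tag`/`step` mirrors the CLI defaults (largest tag, latest step):
```python
# Programmatic equivalent of the CLI calls above.
from nanochat.to_hf import export_to_hf

export_to_hf(source="sft", output_dir="hf-export/sft", model_tag=None, step=None)
```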
## 3) Run lm-eval benchmarks on the exported model
Use the HF backend (`--model hf`) and pick your tasks; the ones below mirror nanochat's built-in evals, so they make good starting points in lm-eval too:
- `arc_easy`, `arc_challenge`
- `mmlu`
- `gsm8k`
- `humaneval`
Example runs:
```bash
# Single task (MMLU)
uv run lm-eval run --model hf \
--model_args pretrained=hf-export/sft \
--tasks mmlu \
--batch_size 1
# A small suite similar to nanochat chat_eval coverage
uv run lm-eval run --model hf \
--model_args pretrained=hf-export/sft \
--tasks arc_easy,arc_challenge,gsm8k,mmlu,humaneval \
--batch_size 1
```
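If you prefer the Python API over the CLI, the harness exposes one as well. A sketch, assuming the pinned `tools/lm-eval` submodule tracks upstream's `simple_evaluate` API; the `trust_remote_code=True` flag follows the note in `nanochat/to_hf.py`:
```python
# Sketch: run a single task through lm-eval's Python API instead of the CLI.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=hf-export/sft,trust_remote_code=True",
    tasks=["arc_easy"],
    batch_size=1,
)
print(results["results"]["arc_easy"])
```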
Notes:
- If you exported to a different folder, change `pretrained=...` accordingly. You can also point to a remote HF repo name.
- `--batch_size auto` can help find the largest batch that fits in GPU memory. On CPU, keep it small.
- No KV cache is implemented in the HF wrapper; generation is standard `AutoModelForCausalLM` style, just not incremental (a quick load check is sketched below).
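Before kicking off a long run it can help to confirm the export loads the way the `to_hf.py` docstring describes. A minimal sketch, assuming the `hf-export/sft` folder from the examples above:
```python
# Quick load check: forward a few placeholder token ids through the export.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("hf-export/sft", trust_remote_code=True)
input_ids = torch.tensor([[1, 2, 3]])  # placeholder ids; any short sequence works
with torch.no_grad():
    logits = model(input_ids=input_ids).logits
print(logits.shape)  # expect (1, 3, vocab_size)
```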

159
nanochat/to_hf.py Normal file

@@ -0,0 +1,159 @@
"""
Convert a nanochat checkpoint into a HuggingFace-style folder.
Usage (example):
python -m nanochat.to_hf --source base --output hf-export/base
Notes
- Assumes checkpoints live under ~/.cache/nanochat/<source>_checkpoints/ (same as training scripts).
- The exported model can be loaded with transformers via:
AutoModelForCausalLM.from_pretrained(<export_dir>, trust_remote_code=True)
- KV cache is not implemented in the HF wrapper; generation works but is not incremental.
"""
import argparse
import os
import shutil
from typing import Optional
import torch
import torch.nn.functional as F
try:
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
except ImportError as exc:
raise SystemExit(
"transformers is required for HF export. Run `uv sync` (with the hf extra) first."
) from exc
from nanochat.checkpoint_manager import load_model
from nanochat.gpt import GPT, GPTConfig
from nanochat.common import get_base_dir
from nanochat.tokenizer import get_tokenizer
class NanoChatHFConfig(PretrainedConfig):
model_type = "nanochat"
def __init__(
self,
sequence_len: int = 1024,
vocab_size: int = 50304,
n_layer: int = 12,
n_head: int = 6,
n_kv_head: int = 6,
n_embd: int = 768,
**kwargs,
):
# Don't tie embeddings; nanochat uses untied wte/lm_head
kwargs.setdefault("tie_word_embeddings", False)
super().__init__(**kwargs)
self.sequence_len = sequence_len
self.vocab_size = vocab_size
self.n_layer = n_layer
self.n_head = n_head
self.n_kv_head = n_kv_head
self.n_embd = n_embd
class NanoChatHFForCausalLM(PreTrainedModel):
config_class = NanoChatHFConfig
def __init__(self, config: NanoChatHFConfig):
super().__init__(config)
gpt_cfg = GPTConfig(
sequence_len=config.sequence_len,
vocab_size=config.vocab_size,
n_layer=config.n_layer,
n_head=config.n_head,
n_kv_head=config.n_kv_head,
n_embd=config.n_embd,
)
self.model = GPT(gpt_cfg)
def get_input_embeddings(self):
return self.model.transformer.wte
def set_input_embeddings(self, value):
self.model.transformer.wte = value
def get_output_embeddings(self):
return self.model.lm_head
def tie_weights(self):
# nanochat uses untied embeddings; override to no-op
return
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None, # unused
labels: Optional[torch.LongTensor] = None,
past_key_values=None, # not implemented
**_: dict,
) -> CausalLMOutputWithPast:
if input_ids is None:
raise ValueError("input_ids must be provided")
logits = self.model(input_ids)
loss = None
        if labels is not None:
            # HF-style labels align with input_ids: shift so that tokens < n predict token n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-1,
            )
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
def prepare_inputs_for_generation(self, input_ids, **kwargs):
return {"input_ids": input_ids, "attention_mask": kwargs.get("attention_mask", None)}
def copy_tokenizer_files(output_dir: str):
base_dir = get_base_dir()
tokenizer_dir = os.path.join(base_dir, "tokenizer")
if not os.path.isdir(tokenizer_dir):
print(f"[to_hf] tokenizer directory not found at {tokenizer_dir}, skipping tokenizer export")
return
for name in os.listdir(tokenizer_dir):
src = os.path.join(tokenizer_dir, name)
dst = os.path.join(output_dir, name)
if os.path.isdir(src):
shutil.copytree(src, dst, dirs_exist_ok=True)
else:
os.makedirs(os.path.dirname(dst), exist_ok=True)
shutil.copy2(src, dst)
print(f"[to_hf] Copied tokenizer files from {tokenizer_dir} to {output_dir}")
def export_to_hf(source: str, output_dir: str, model_tag: Optional[str], step: Optional[int]):
device = torch.device("cpu")
model, tokenizer, meta = load_model(source, device=device, phase="eval", model_tag=model_tag, step=step)
cfg_kwargs = meta["model_config"]
hf_config = NanoChatHFConfig(**cfg_kwargs)
hf_model = NanoChatHFForCausalLM(hf_config)
hf_model.model.load_state_dict(model.state_dict(), strict=True)
os.makedirs(output_dir, exist_ok=True)
hf_model.save_pretrained(output_dir, safe_serialization=False)
# Best effort: drop tokenizer files alongside weights
copy_tokenizer_files(output_dir)
print(f"[to_hf] Exported {source} checkpoint to {output_dir}")
def main():
parser = argparse.ArgumentParser(description="Export nanochat checkpoint to HuggingFace format")
parser.add_argument("--source", choices=["base", "mid", "sft", "rl"], default="base", help="Which checkpoint family to export")
parser.add_argument("--model-tag", type=str, default=None, help="Model tag (e.g., d20). Defaults to largest available.")
parser.add_argument("--step", type=int, default=None, help="Checkpoint step. Defaults to latest.")
parser.add_argument("--output", type=str, default="hf-export", help="Output directory for HF files")
args = parser.parse_args()
export_to_hf(args.source, args.output, args.model_tag, args.step)
if __name__ == "__main__":
main()
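The commit message notes the eval module still needs tests; one natural smoke test is checking that the exported wrapper reproduces the source checkpoint's logits. A rough sketch, not part of this commit, assuming an `sft` export at `hf-export/sft` and a `tokenizer.encode` method:
```python
# Hypothetical smoke test: the HF export should match the nanochat checkpoint.
import torch
from transformers import AutoModelForCausalLM
from nanochat.checkpoint_manager import load_model

device = torch.device("cpu")
ref_model, tokenizer, _ = load_model("sft", device=device, phase="eval", model_tag=None, step=None)
hf_model = AutoModelForCausalLM.from_pretrained("hf-export/sft", trust_remote_code=True)

ids = torch.tensor([tokenizer.encode("hello world")], dtype=torch.long)  # tokenizer.encode assumed
with torch.no_grad():
    ref_logits = ref_model(ids)                 # nanochat GPT returns raw logits
    hf_logits = hf_model(input_ids=ids).logits  # HF wrapper returns CausalLMOutputWithPast
print((ref_logits.float() - hf_logits.float()).abs().max())  # should be ~0
```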

pyproject.toml

@@ -16,6 +16,7 @@ dependencies = [
"torch>=2.8.0",
"uvicorn>=0.36.0",
"wandb>=0.21.3",
"lm_eval[hf]",
]
[build-system]
@@ -49,6 +50,7 @@ torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu128", extra = "gpu" },
]
lm_eval = { path = "tools/lm-eval" }
[[tool.uv.index]]
name = "pytorch-cpu"
@@ -74,4 +76,4 @@ conflicts = [
{ extra = "cpu" },
{ extra = "gpu" },
],
]
]

51
setup.sh Normal file

@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Setup nanochat after cloning the repo.
# - initializes the tools submodule (lm-evaluation-harness)
# - creates a uv virtualenv
# - installs deps (choose gpu|cpu extra)
# - builds the Rust tokenizer extension
set -euo pipefail
# -----------------------------
# Resolve repo root and parse arguments
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$repo_root"
extra="${1:-gpu}"
if [[ "$extra" != "gpu" && "$extra" != "cpu" ]]; then
echo "Usage: bash setup.sh [gpu|cpu]" >&2
exit 1
fi
echo "[setup] Initializing submodules (tools/lm-eval)..."
git submodule update --init --recursive
echo "[setup] Ensuring uv is installed..."
if ! command -v uv >/dev/null 2>&1; then
curl -LsSf https://astral.sh/uv/install.sh | sh
# uv installs to ~/.local/bin; make sure it is on PATH for the rest of this script
command -v uv >/dev/null 2>&1 || export PATH="$HOME/.local/bin:$PATH"
fi
echo "[setup] Ensuring Rust toolchain..."
if ! command -v cargo >/dev/null 2>&1; then
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
fi
# shellcheck source=/dev/null
command -v cargo >/dev/null 2>&1 || source "$HOME/.cargo/env"
echo "[setup] Creating virtual environment (.venv)..."
[ -d ".venv" ] || uv venv
echo "[setup] Installing Python deps (extra=$extra)..."
uv sync --extra "$extra"
echo "[setup] Building Rust tokenizer (rustbpe)..."
if [ -n "${CONDA_PREFIX:-}" ]; then
echo "[setup] CONDA_PREFIX detected; unsetting to avoid conflicts with VIRTUAL_ENV during build..."
unset CONDA_PREFIX
fi
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
echo "[setup] Done. Activate with: source .venv/bin/activate"

3322
uv.lock

File diff suppressed because it is too large