eval module needs testing

Muheng 2025-12-15 14:27:30 +08:00
parent 77da258ee1
commit bc11cd9e5b
5 changed files with 2410 additions and 1172 deletions

46
lm_eval.md Normal file

@@ -0,0 +1,46 @@
# Running lm-eval with nanochat checkpoints
This repo ships its own evals (CORE, ARC/GSM8K/MMLU/HumanEval/SpellingBee), but you can also run the HuggingFace-compatible [lm-evaluation-harness](tools/lm-eval) against exported checkpoints. The steps below assume you've already run `bash setup.sh` (which installs uv, the submodules, Python deps, and the Rust tokenizer).
## 1) Activate env
```bash
source .venv/bin/activate
```
## 2) Export a trained checkpoint to HF format
- `nanochat/to_hf.py` loads the latest checkpoint from `~/.cache/nanochat/<source>_checkpoints` and writes an HF folder.
- Choose source: `base` | `mid` | `sft` | `rl`.
```bash
# export latest base checkpoint to hf-export/base
uv run python -m nanochat.to_hf --source base --output hf-export/base
# export latest SFT checkpoint (chat model)
uv run python -m nanochat.to_hf --source sft --output hf-export/sft
```
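The export can also be driven from Python. A minimal sketch using the `export_to_hf` entry point from `nanochat/to_hf.py` (added in this commit); passing `None` for `model_tag`/`step` mirrors the CLI defaults (largest tag, latest step):
```python
# Programmatic equivalent of the CLI calls above.
from nanochat.to_hf import export_to_hf

export_to_hf(source="sft", output_dir="hf-export/sft", model_tag=None, step=None)
```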
## 3) Run lm-eval benchmarks on the exported model
Use the HF backend (`--model hf`) and pick your tasks; the ones below mirror nanochat's built-in evals, so they make good starting points in lm-eval too:
- `arc_easy`, `arc_challenge`
- `mmlu`
- `gsm8k`
- `humaneval`
Example runs:
```bash
# Single task (MMLU)
uv run lm-eval run --model hf \
--model_args pretrained=hf-export/sft \
--tasks mmlu \
--batch_size 1
# A small suite similar to nanochat chat_eval coverage
uv run lm-eval run --model hf \
--model_args pretrained=hf-export/sft \
--tasks arc_easy,arc_challenge,gsm8k,mmlu,humaneval \
--batch_size 1
```
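If you prefer the Python API over the CLI, the harness exposes one as well. A sketch, assuming the pinned `tools/lm-eval` submodule tracks upstream's `simple_evaluate` API; the `trust_remote_code=True` flag follows the note in `nanochat/to_hf.py`:
```python
# Sketch: run a single task through lm-eval's Python API instead of the CLI.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    model_args="pretrained=hf-export/sft,trust_remote_code=True",
    tasks=["arc_easy"],
    batch_size=1,
)
print(results["results"]["arc_easy"])
```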
Notes:
- If you exported to a different folder, change `pretrained=...` accordingly. You can also point to a remote HF repo name.
- `--batch_size auto` can help find the largest batch that fits in GPU memory. On CPU, keep it small.
- No KV cache is implemented in the HF wrapper; generation is standard `AutoModelForCausalLM` style, just not incremental (a quick load check is sketched below).
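Before kicking off a long run it can help to confirm the export loads the way the `to_hf.py` docstring describes. A minimal sketch, assuming the `hf-export/sft` folder from the examples above:
```python
# Quick load check: forward a few placeholder token ids through the export.
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("hf-export/sft", trust_remote_code=True)
input_ids = torch.tensor([[1, 2, 3]])  # placeholder ids; any short sequence works
with torch.no_grad():
    logits = model(input_ids=input_ids).logits
print(logits.shape)  # expect (1, 3, vocab_size)
```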

159
nanochat/to_hf.py Normal file

@@ -0,0 +1,159 @@
"""
Convert a nanochat checkpoint into a HuggingFace-style folder.
Usage (example):
python -m nanochat.to_hf --source base --output hf-export/base
Notes
- Assumes checkpoints live under ~/.cache/nanochat/<source>_checkpoints/ (same as training scripts).
- The exported model can be loaded with transformers via:
AutoModelForCausalLM.from_pretrained(<export_dir>, trust_remote_code=True)
- KV cache is not implemented in the HF wrapper; generation works but is not incremental.
"""
import argparse
import os
import shutil
from typing import Optional
import torch
import torch.nn.functional as F
try:
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutputWithPast
except ImportError as exc:
raise SystemExit(
"transformers is required for HF export. Run `uv sync` (with the hf extra) first."
) from exc
from nanochat.checkpoint_manager import load_model
from nanochat.gpt import GPT, GPTConfig
from nanochat.common import get_base_dir
from nanochat.tokenizer import get_tokenizer
class NanoChatHFConfig(PretrainedConfig):
model_type = "nanochat"
def __init__(
self,
sequence_len: int = 1024,
vocab_size: int = 50304,
n_layer: int = 12,
n_head: int = 6,
n_kv_head: int = 6,
n_embd: int = 768,
**kwargs,
):
# Don't tie embeddings; nanochat uses untied wte/lm_head
kwargs.setdefault("tie_word_embeddings", False)
super().__init__(**kwargs)
self.sequence_len = sequence_len
self.vocab_size = vocab_size
self.n_layer = n_layer
self.n_head = n_head
self.n_kv_head = n_kv_head
self.n_embd = n_embd
class NanoChatHFForCausalLM(PreTrainedModel):
config_class = NanoChatHFConfig
def __init__(self, config: NanoChatHFConfig):
super().__init__(config)
gpt_cfg = GPTConfig(
sequence_len=config.sequence_len,
vocab_size=config.vocab_size,
n_layer=config.n_layer,
n_head=config.n_head,
n_kv_head=config.n_kv_head,
n_embd=config.n_embd,
)
self.model = GPT(gpt_cfg)
def get_input_embeddings(self):
return self.model.transformer.wte
def set_input_embeddings(self, value):
self.model.transformer.wte = value
def get_output_embeddings(self):
return self.model.lm_head
def tie_weights(self):
# nanochat uses untied embeddings; override to no-op
return
def forward(
self,
input_ids: Optional[torch.LongTensor] = None,
attention_mask: Optional[torch.Tensor] = None, # unused
labels: Optional[torch.LongTensor] = None,
past_key_values=None, # not implemented
**_: dict,
) -> CausalLMOutputWithPast:
if input_ids is None:
raise ValueError("input_ids must be provided")
logits = self.model(input_ids)
loss = None
        if labels is not None:
            # HF-style labels align with input_ids: shift so that tokens < n predict token n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-1,
            )
return CausalLMOutputWithPast(
loss=loss,
logits=logits,
past_key_values=None,
hidden_states=None,
attentions=None,
)
def prepare_inputs_for_generation(self, input_ids, **kwargs):
return {"input_ids": input_ids, "attention_mask": kwargs.get("attention_mask", None)}
def copy_tokenizer_files(output_dir: str):
base_dir = get_base_dir()
tokenizer_dir = os.path.join(base_dir, "tokenizer")
if not os.path.isdir(tokenizer_dir):
print(f"[to_hf] tokenizer directory not found at {tokenizer_dir}, skipping tokenizer export")
return
for name in os.listdir(tokenizer_dir):
src = os.path.join(tokenizer_dir, name)
dst = os.path.join(output_dir, name)
if os.path.isdir(src):
shutil.copytree(src, dst, dirs_exist_ok=True)
else:
os.makedirs(os.path.dirname(dst), exist_ok=True)
shutil.copy2(src, dst)
print(f"[to_hf] Copied tokenizer files from {tokenizer_dir} to {output_dir}")
def export_to_hf(source: str, output_dir: str, model_tag: Optional[str], step: Optional[int]):
device = torch.device("cpu")
model, tokenizer, meta = load_model(source, device=device, phase="eval", model_tag=model_tag, step=step)
cfg_kwargs = meta["model_config"]
hf_config = NanoChatHFConfig(**cfg_kwargs)
hf_model = NanoChatHFForCausalLM(hf_config)
hf_model.model.load_state_dict(model.state_dict(), strict=True)
os.makedirs(output_dir, exist_ok=True)
hf_model.save_pretrained(output_dir, safe_serialization=False)
# Best effort: drop tokenizer files alongside weights
copy_tokenizer_files(output_dir)
print(f"[to_hf] Exported {source} checkpoint to {output_dir}")
def main():
parser = argparse.ArgumentParser(description="Export nanochat checkpoint to HuggingFace format")
parser.add_argument("--source", choices=["base", "mid", "sft", "rl"], default="base", help="Which checkpoint family to export")
parser.add_argument("--model-tag", type=str, default=None, help="Model tag (e.g., d20). Defaults to largest available.")
parser.add_argument("--step", type=int, default=None, help="Checkpoint step. Defaults to latest.")
parser.add_argument("--output", type=str, default="hf-export", help="Output directory for HF files")
args = parser.parse_args()
export_to_hf(args.source, args.output, args.model_tag, args.step)
if __name__ == "__main__":
main()
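The commit message notes the eval module still needs tests; one natural smoke test is checking that the exported wrapper reproduces the source checkpoint's logits. A rough sketch, not part of this commit, assuming an `sft` export at `hf-export/sft` and a `tokenizer.encode` method:
```python
# Hypothetical smoke test: the HF export should match the nanochat checkpoint.
import torch
from transformers import AutoModelForCausalLM
from nanochat.checkpoint_manager import load_model

device = torch.device("cpu")
ref_model, tokenizer, _ = load_model("sft", device=device, phase="eval", model_tag=None, step=None)
hf_model = AutoModelForCausalLM.from_pretrained("hf-export/sft", trust_remote_code=True)

ids = torch.tensor([tokenizer.encode("hello world")], dtype=torch.long)  # tokenizer.encode assumed
with torch.no_grad():
    ref_logits = ref_model(ids)                 # nanochat GPT returns raw logits
    hf_logits = hf_model(input_ids=ids).logits  # HF wrapper returns CausalLMOutputWithPast
print((ref_logits.float() - hf_logits.float()).abs().max())  # should be ~0
```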

pyproject.toml

@@ -16,6 +16,7 @@ dependencies = [
"torch>=2.8.0",
"uvicorn>=0.36.0",
"wandb>=0.21.3",
"lm_eval[hf]",
]
[build-system]
@@ -49,6 +50,7 @@ torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu128", extra = "gpu" },
]
lm_eval = { path = "tools/lm-eval" }
[[tool.uv.index]]
name = "pytorch-cpu"
@@ -74,4 +76,4 @@ conflicts = [
{ extra = "cpu" },
{ extra = "gpu" },
],
]
]

51
setup.sh Normal file

@@ -0,0 +1,51 @@
#!/usr/bin/env bash
# Setup nanochat after cloning the repo.
# - initializes the tools submodule (lm-evaluation-harness)
# - creates a uv virtualenv
# - installs deps (choose gpu|cpu extra)
# - builds the Rust tokenizer extension
set -euo pipefail
# -----------------------------
# Resolve repo root and parse arguments
repo_root="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$repo_root"
extra="${1:-gpu}"
if [[ "$extra" != "gpu" && "$extra" != "cpu" ]]; then
echo "Usage: bash setup.sh [gpu|cpu]" >&2
exit 1
fi
echo "[setup] Initializing submodules (tools/lm-eval)..."
git submodule update --init --recursive
echo "[setup] Ensuring uv is installed..."
if ! command -v uv >/dev/null 2>&1; then
curl -LsSf https://astral.sh/uv/install.sh | sh
# uv installs to ~/.local/bin; make sure it is on PATH for the rest of this script
command -v uv >/dev/null 2>&1 || export PATH="$HOME/.local/bin:$PATH"
fi
echo "[setup] Ensuring Rust toolchain..."
if ! command -v cargo >/dev/null 2>&1; then
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
fi
# shellcheck source=/dev/null
command -v cargo >/dev/null 2>&1 || source "$HOME/.cargo/env"
echo "[setup] Creating virtual environment (.venv)..."
[ -d ".venv" ] || uv venv
echo "[setup] Installing Python deps (extra=$extra)..."
uv sync --extra "$extra"
echo "[setup] Building Rust tokenizer (rustbpe)..."
if [ -n "${CONDA_PREFIX:-}" ]; then
echo "[setup] CONDA_PREFIX detected; unsetting to avoid conflicts with VIRTUAL_ENV during build..."
unset CONDA_PREFIX
fi
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
echo "[setup] Done. Activate with: source .venv/bin/activate"

3322
uv.lock

File diff suppressed because it is too large