From d6829284c4912d5186469c4a624127c24b231d34 Mon Sep 17 00:00:00 2001
From: askerlee
Date: Tue, 13 Jan 2026 22:20:22 +0800
Subject: [PATCH 1/4] Allow local install and model loading

---
 .gitignore           | 1 +
 pyproject.toml       | 3 +++
 scripts/base_eval.py | 7 +++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index d82809a..740d38b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ eval_bundle/
 .claude
 CLAUDE.md
 wandb/
+*.egg-info/
diff --git a/pyproject.toml b/pyproject.toml
index 87a967f..3f0c1e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,3 +71,6 @@ conflicts = [
     { extra = "gpu" },
   ],
 ]
+
+[tool.setuptools]
+packages = ["nanochat"]
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index bd83ff3..672faec 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -134,13 +134,16 @@ def load_hf_model(hf_path: str, device):
     print0(f"Loading model from: {hf_path}")
     # Load the model
     from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(hf_path)
+    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
     model.to(device)
     model.eval()
     max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
     model = ModelWrapper(model, max_seq_len=max_seq_len)
     # Load the tokenizer
-    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
+    if os.path.exists(hf_path):
+        tokenizer = HuggingFaceTokenizer.from_directory(hf_path)
+    else:
+        tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
     return model, tokenizer
 
 # -----------------------------------------------------------------------------
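The tokenizer change above dispatches on whether --hf-path points to a directory on disk or to a Hub repo id. As a rough standalone illustration of the same local-vs-hub loading pattern (using only the generic transformers Auto classes rather than nanochat's HuggingFaceTokenizer, and a made-up checkpoint path), a sketch could look like this:

    import os
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def load_checkpoint(path_or_repo: str, device: str = "cpu"):
        # trust_remote_code=True allows checkpoints that ship their own modeling
        # code (e.g. a locally exported model) to be instantiated.
        model = AutoModelForCausalLM.from_pretrained(path_or_repo, trust_remote_code=True)
        model.to(device)
        model.eval()
        # A local directory already holds the tokenizer files; anything else is
        # treated as a Hugging Face Hub repo id.
        if os.path.isdir(path_or_repo):
            tokenizer = AutoTokenizer.from_pretrained(path_or_repo, local_files_only=True)
        else:
            tokenizer = AutoTokenizer.from_pretrained(path_or_repo)
        return model, tokenizer

    # Hypothetical usage: a directory produced by save_pretrained(), or a Hub id.
    # model, tokenizer = load_checkpoint("./out/my_local_checkpoint", device="cuda")
    # model, tokenizer = load_checkpoint("openai-community/gpt2")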
From bf067e2a664e7286b9590abe5c5eb8378210cca6 Mon Sep 17 00:00:00 2001
From: askerlee
Date: Wed, 14 Jan 2026 14:19:20 +0800
Subject: [PATCH 2/4] Add max_seq_len argument for gpt2

---
 scripts/base_eval.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 672faec..0b6c888 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -75,6 +75,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     # Evaluate each task
     results = {}
     centered_results = {}
+
     for task in tasks:
         start_time = time.time()
         label = task['label']
@@ -130,14 +131,18 @@ class ModelWrapper:
             logits = outputs.logits
         return logits
 
-def load_hf_model(hf_path: str, device):
+def load_hf_model(hf_path: str, device, max_seq_len=None):
     print0(f"Loading model from: {hf_path}")
     # Load the model
     from transformers import AutoModelForCausalLM
     model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
     model.to(device)
     model.eval()
-    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
+    # Special case for GPT-2 community model, which can handle 1024 tokens.
+    # If the argument is given, use that instead.
+    if max_seq_len is None and "openai-community/gpt2" in hf_path:
+        max_seq_len = 1024
+
     model = ModelWrapper(model, max_seq_len=max_seq_len)
     # Load the tokenizer
     if os.path.exists(hf_path):
@@ -151,6 +156,8 @@ def main():
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
+    parser.add_argument('--max_seq_len', type=int, default=None,
+                        help='Optional max sequence length for the model')
     parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
     parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name')
     parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name')
@@ -166,7 +173,7 @@ def main():
         # atm assume that if a path is given, it's a huggingface model path
         hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
-        model, tokenizer = load_hf_model(hf_path, device)
+        model, tokenizer = load_hf_model(hf_path, device, max_seq_len=args.max_seq_len)
        model_name = hf_path # just for logging
        model_slug = hf_path.replace("/", "-") # for the output csv file
     else:
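The new --max_seq_len flag is threaded through load_hf_model into ModelWrapper, with 1024 kept as the fallback for GPT-2, whose positional embeddings only cover 1024 positions. ModelWrapper's internals are not shown in this series; the sketch below is only an assumed illustration of how such a cap is typically enforced, by keeping the most recent max_seq_len tokens before the forward pass:

    import torch

    class SeqLenCappedModel:
        """Illustrative wrapper (not nanochat's actual ModelWrapper): clamp inputs
        to a fixed context window before calling the underlying HF model."""

        def __init__(self, model, max_seq_len=None):
            self.model = model
            self.max_seq_len = max_seq_len

        @torch.no_grad()
        def __call__(self, input_ids: torch.Tensor) -> torch.Tensor:
            # Keep only the last max_seq_len tokens so GPT-2 style models never
            # see positions beyond their positional embedding table.
            if self.max_seq_len is not None and input_ids.size(1) > self.max_seq_len:
                input_ids = input_ids[:, -self.max_seq_len:]
            return self.model(input_ids).logits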
From e64aa826205a2720fe7b1e1da4b24e65de1048a3 Mon Sep 17 00:00:00 2001
From: askerlee
Date: Wed, 14 Jan 2026 15:34:40 +0800
Subject: [PATCH 3/4] When evaluating language_modeling tasks, be case-insensitive when matching with the correct answer

---
 nanochat/core_eval.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/nanochat/core_eval.py b/nanochat/core_eval.py
index f3c9a9f..ff63c3e 100644
--- a/nanochat/core_eval.py
+++ b/nanochat/core_eval.py
@@ -201,6 +201,9 @@ def evaluate_example(idx, model, tokenizer, data, device, task_meta):
     for t, s, e in zip(tokens, start_idxs, end_idxs):
         if len(t) > max_tokens:
             num_to_crop = len(t) - max_tokens
+            # Take the last max_tokens tokens instead of the first ones.
+            # The overly long questions are usually the few-shot contexts. They are placed
+            # at the beginning of the sequence, so cropping from the start should be ok.
             new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens
             new_start_idxs.append(s - num_to_crop) # shift the indices down
             new_end_idxs.append(e - num_to_crop)
@@ -228,7 +231,11 @@
         # predictions[i] predict input_ids[i+1] autoregressively
         predicted_tokens = predictions[0, si-1:ei-1]
         actual_tokens = input_ids[0, si:ei]
-        is_correct = torch.all(predicted_tokens == actual_tokens).item()
+        # Make the matching case-insensitive for LM tasks
+        predicted_text = tokenizer.decode(predicted_tokens.cpu().tolist()).lower()
+        actual_text = tokenizer.decode(actual_tokens.cpu().tolist()).lower()
+        # is_correct = torch.all(predicted_tokens == actual_tokens).item()
+        is_correct = (predicted_text == actual_text)
     elif task_type in ['multiple_choice', 'schema']:
         # For MC/schema: find the option with lowest average loss
         mean_losses = [losses[i, si-1:ei-1].mean().item()
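The substantive change above replaces an exact token-id comparison with a comparison of the decoded, lower-cased strings, so a prediction that differs from the reference answer only in casing still counts as correct. Pulled out as a small standalone helper (assuming a tokenizer whose decode() accepts a list of token ids, as nanochat's tokenizers do), the check is roughly:

    import torch

    def lm_answer_is_correct(predicted_tokens: torch.Tensor,
                             actual_tokens: torch.Tensor,
                             tokenizer) -> bool:
        # A strict element-wise token match fails on pure casing differences
        # (e.g. " Paris" vs " paris") even when the answer text is identical.
        # Decoding both spans and lower-casing them makes the language_modeling
        # check case-insensitive while still requiring the full answer to match.
        predicted_text = tokenizer.decode(predicted_tokens.cpu().tolist()).lower()
        actual_text = tokenizer.decode(actual_tokens.cpu().tolist()).lower()
        return predicted_text == actual_text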
From 8cfa0451f4fdbdc7abf42b9dab3f76b947cc24ab Mon Sep 17 00:00:00 2001
From: askerlee
Date: Wed, 14 Jan 2026 15:47:36 +0800
Subject: [PATCH 4/4] When eval language_modeling tasks, be case insensitive to answers

---
 pyproject.toml       |  3 ---
 scripts/base_eval.py | 20 +++++---------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3f0c1e8..87a967f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,6 +71,3 @@ conflicts = [
     { extra = "gpu" },
   ],
 ]
-
-[tool.setuptools]
-packages = ["nanochat"]
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 0b6c888..bd83ff3 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -75,7 +75,6 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     # Evaluate each task
     results = {}
     centered_results = {}
-
     for task in tasks:
         start_time = time.time()
         label = task['label']
@@ -131,24 +130,17 @@ class ModelWrapper:
             logits = outputs.logits
         return logits
 
-def load_hf_model(hf_path: str, device, max_seq_len=None):
+def load_hf_model(hf_path: str, device):
     print0(f"Loading model from: {hf_path}")
     # Load the model
     from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(hf_path)
     model.to(device)
     model.eval()
-    # Special case for GPT-2 community model, which can handle 1024 tokens.
-    # If the argument is given, use that instead.
-    if max_seq_len is None and "openai-community/gpt2" in hf_path:
-        max_seq_len = 1024
-
+    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
     model = ModelWrapper(model, max_seq_len=max_seq_len)
     # Load the tokenizer
-    if os.path.exists(hf_path):
-        tokenizer = HuggingFaceTokenizer.from_directory(hf_path)
-    else:
-        tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
+    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
     return model, tokenizer
 
 # -----------------------------------------------------------------------------
@@ -156,8 +148,6 @@ def main():
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
-    parser.add_argument('--max_seq_len', type=int, default=None,
-                        help='Optional max sequence length for the model')
     parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
     parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name')
     parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name')
@@ -173,7 +163,7 @@ def main():
         # atm assume that if a path is given, it's a huggingface model path
         hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
-        model, tokenizer = load_hf_model(hf_path, device, max_seq_len=args.max_seq_len)
+        model, tokenizer = load_hf_model(hf_path, device)
        model_name = hf_path # just for logging
        model_slug = hf_path.replace("/", "-") # for the output csv file
     else: