From d6829284c4912d5186469c4a624127c24b231d34 Mon Sep 17 00:00:00 2001
From: askerlee
Date: Tue, 13 Jan 2026 22:20:22 +0800
Subject: [PATCH 1/4] Allow local install and model loading

---
 .gitignore           | 1 +
 pyproject.toml       | 3 +++
 scripts/base_eval.py | 7 +++++--
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/.gitignore b/.gitignore
index d82809a..740d38b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,3 +12,4 @@ eval_bundle/
 .claude
 CLAUDE.md
 wandb/
+*.egg-info/
diff --git a/pyproject.toml b/pyproject.toml
index 87a967f..3f0c1e8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,3 +71,6 @@ conflicts = [
     { extra = "gpu" },
   ],
 ]
+
+[tool.setuptools]
+packages = ["nanochat"]
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index bd83ff3..672faec 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -134,13 +134,16 @@ def load_hf_model(hf_path: str, device):
     print0(f"Loading model from: {hf_path}")
     # Load the model
     from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(hf_path)
+    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
     model.to(device)
     model.eval()
     max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
     model = ModelWrapper(model, max_seq_len=max_seq_len)
     # Load the tokenizer
-    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
+    if os.path.exists(hf_path):
+        tokenizer = HuggingFaceTokenizer.from_directory(hf_path)
+    else:
+        tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
     return model, tokenizer
 
 # -----------------------------------------------------------------------------
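The tokenizer change above dispatches on whether --hf-path points to a directory on disk or to a Hub repo id. As a rough standalone illustration of the same local-vs-hub loading pattern (using only the generic transformers Auto classes rather than nanochat's HuggingFaceTokenizer, and a made-up checkpoint path), a sketch could look like this:

    import os
    from transformers import AutoModelForCausalLM, AutoTokenizer

    def load_checkpoint(path_or_repo: str, device: str = "cpu"):
        # trust_remote_code=True allows checkpoints that ship their own modeling
        # code (e.g. a locally exported model) to be instantiated.
        model = AutoModelForCausalLM.from_pretrained(path_or_repo, trust_remote_code=True)
        model.to(device)
        model.eval()
        # A local directory already holds the tokenizer files; anything else is
        # treated as a Hugging Face Hub repo id.
        if os.path.isdir(path_or_repo):
            tokenizer = AutoTokenizer.from_pretrained(path_or_repo, local_files_only=True)
        else:
            tokenizer = AutoTokenizer.from_pretrained(path_or_repo)
        return model, tokenizer

    # Hypothetical usage: a directory produced by save_pretrained(), or a Hub id.
    # model, tokenizer = load_checkpoint("./out/my_local_checkpoint", device="cuda")
    # model, tokenizer = load_checkpoint("openai-community/gpt2")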
From bf067e2a664e7286b9590abe5c5eb8378210cca6 Mon Sep 17 00:00:00 2001
From: askerlee
Date: Wed, 14 Jan 2026 14:19:20 +0800
Subject: [PATCH 2/4] Add max_seq_len argument for gpt2

---
 scripts/base_eval.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 672faec..0b6c888 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -75,6 +75,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     # Evaluate each task
     results = {}
     centered_results = {}
+
     for task in tasks:
         start_time = time.time()
         label = task['label']
@@ -130,14 +131,18 @@ class ModelWrapper:
             logits = outputs.logits
         return logits
 
-def load_hf_model(hf_path: str, device):
+def load_hf_model(hf_path: str, device, max_seq_len=None):
     print0(f"Loading model from: {hf_path}")
     # Load the model
     from transformers import AutoModelForCausalLM
     model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
     model.to(device)
     model.eval()
-    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
+    # Special case for GPT-2 community model, which can handle 1024 tokens.
+    # If the argument is given, use that instead.
+    if max_seq_len is None and "openai-community/gpt2" in hf_path:
+        max_seq_len = 1024
+
     model = ModelWrapper(model, max_seq_len=max_seq_len)
     # Load the tokenizer
     if os.path.exists(hf_path):
@@ -151,6 +156,8 @@ def main():
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
+    parser.add_argument('--max_seq_len', type=int, default=None,
+                        help='Optional max sequence length for the model')
     parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
     parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name')
     parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name')
@@ -166,7 +173,7 @@ def main():
         # atm assume that if a path is given, it's a huggingface model path
         hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
-        model, tokenizer = load_hf_model(hf_path, device)
+        model, tokenizer = load_hf_model(hf_path, device, max_seq_len=args.max_seq_len)
        model_name = hf_path # just for logging
        model_slug = hf_path.replace("/", "-") # for the output csv file
     else:
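The new --max_seq_len flag is threaded through load_hf_model into ModelWrapper, with 1024 kept as the fallback for GPT-2, whose positional embeddings only cover 1024 positions. ModelWrapper's internals are not shown in this series; the sketch below is only an assumed illustration of how such a cap is typically enforced, by keeping the most recent max_seq_len tokens before the forward pass:

    import torch

    class SeqLenCappedModel:
        """Illustrative wrapper (not nanochat's actual ModelWrapper): clamp inputs
        to a fixed context window before calling the underlying HF model."""

        def __init__(self, model, max_seq_len=None):
            self.model = model
            self.max_seq_len = max_seq_len

        @torch.no_grad()
        def __call__(self, input_ids: torch.Tensor) -> torch.Tensor:
            # Keep only the last max_seq_len tokens so GPT-2 style models never
            # see positions beyond their positional embedding table.
            if self.max_seq_len is not None and input_ids.size(1) > self.max_seq_len:
                input_ids = input_ids[:, -self.max_seq_len:]
            return self.model(input_ids).logits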
From e64aa826205a2720fe7b1e1da4b24e65de1048a3 Mon Sep 17 00:00:00 2001
From: askerlee
Date: Wed, 14 Jan 2026 15:34:40 +0800
Subject: [PATCH 3/4] When evaluating language_modeling tasks, be case-insensitive when matching with the correct answer

---
 nanochat/core_eval.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/nanochat/core_eval.py b/nanochat/core_eval.py
index f3c9a9f..ff63c3e 100644
--- a/nanochat/core_eval.py
+++ b/nanochat/core_eval.py
@@ -201,6 +201,9 @@ def evaluate_example(idx, model, tokenizer, data, device, task_meta):
     for t, s, e in zip(tokens, start_idxs, end_idxs):
         if len(t) > max_tokens:
             num_to_crop = len(t) - max_tokens
+            # Take the last max_tokens tokens instead of the first ones.
+            # The overly long questions are usually the few-shot contexts. They are placed
+            # at the beginning of the sequence, so cropping from the start should be ok.
             new_tokens.append(t[-max_tokens:]) # take the last max_tokens tokens
             new_start_idxs.append(s - num_to_crop) # shift the indices down
             new_end_idxs.append(e - num_to_crop)
@@ -228,7 +231,11 @@
         # predictions[i] predict input_ids[i+1] autoregressively
         predicted_tokens = predictions[0, si-1:ei-1]
         actual_tokens = input_ids[0, si:ei]
-        is_correct = torch.all(predicted_tokens == actual_tokens).item()
+        # Make the matching case-insensitive for LM tasks
+        predicted_text = tokenizer.decode(predicted_tokens.cpu().tolist()).lower()
+        actual_text = tokenizer.decode(actual_tokens.cpu().tolist()).lower()
+        # is_correct = torch.all(predicted_tokens == actual_tokens).item()
+        is_correct = (predicted_text == actual_text)
     elif task_type in ['multiple_choice', 'schema']:
         # For MC/schema: find the option with lowest average loss
         mean_losses = [losses[i, si-1:ei-1].mean().item()
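The substantive change above replaces an exact token-id comparison with a comparison of the decoded, lower-cased strings, so a prediction that differs from the reference answer only in casing still counts as correct. Pulled out as a small standalone helper (assuming a tokenizer whose decode() accepts a list of token ids, as nanochat's tokenizers do), the check is roughly:

    import torch

    def lm_answer_is_correct(predicted_tokens: torch.Tensor,
                             actual_tokens: torch.Tensor,
                             tokenizer) -> bool:
        # A strict element-wise token match fails on pure casing differences
        # (e.g. " Paris" vs " paris") even when the answer text is identical.
        # Decoding both spans and lower-casing them makes the language_modeling
        # check case-insensitive while still requiring the full answer to match.
        predicted_text = tokenizer.decode(predicted_tokens.cpu().tolist()).lower()
        actual_text = tokenizer.decode(actual_tokens.cpu().tolist()).lower()
        return predicted_text == actual_text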
From 8cfa0451f4fdbdc7abf42b9dab3f76b947cc24ab Mon Sep 17 00:00:00 2001
From: askerlee
Date: Wed, 14 Jan 2026 15:47:36 +0800
Subject: [PATCH 4/4] When eval language_modeling tasks, be case insensitive to answers

---
 pyproject.toml       |  3 ---
 scripts/base_eval.py | 20 +++++---------------
 2 files changed, 5 insertions(+), 18 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 3f0c1e8..87a967f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,6 +71,3 @@ conflicts = [
     { extra = "gpu" },
   ],
 ]
-
-[tool.setuptools]
-packages = ["nanochat"]
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index 0b6c888..bd83ff3 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -75,7 +75,6 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     # Evaluate each task
     results = {}
     centered_results = {}
-
     for task in tasks:
         start_time = time.time()
         label = task['label']
@@ -131,24 +130,17 @@ class ModelWrapper:
             logits = outputs.logits
         return logits
 
-def load_hf_model(hf_path: str, device, max_seq_len=None):
+def load_hf_model(hf_path: str, device):
     print0(f"Loading model from: {hf_path}")
     # Load the model
     from transformers import AutoModelForCausalLM
-    model = AutoModelForCausalLM.from_pretrained(hf_path, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(hf_path)
     model.to(device)
     model.eval()
-    # Special case for GPT-2 community model, which can handle 1024 tokens.
-    # If the argument is given, use that instead.
-    if max_seq_len is None and "openai-community/gpt2" in hf_path:
-        max_seq_len = 1024
-
+    max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None
     model = ModelWrapper(model, max_seq_len=max_seq_len)
     # Load the tokenizer
-    if os.path.exists(hf_path):
-        tokenizer = HuggingFaceTokenizer.from_directory(hf_path)
-    else:
-        tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
+    tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path)
     return model, tokenizer
 
 # -----------------------------------------------------------------------------
@@ -156,8 +148,6 @@ def main():
     import argparse
     parser = argparse.ArgumentParser()
     parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate')
-    parser.add_argument('--max_seq_len', type=int, default=None,
-                        help='Optional max sequence length for the model')
     parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)')
     parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name')
     parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name')
@@ -173,7 +163,7 @@ def main():
         # atm assume that if a path is given, it's a huggingface model path
         hf_path = args.hf_path
         print0(f"Loading huggingface model from: {hf_path}")
-        model, tokenizer = load_hf_model(hf_path, device, max_seq_len=args.max_seq_len)
+        model, tokenizer = load_hf_model(hf_path, device)
        model_name = hf_path # just for logging
        model_slug = hf_path.replace("/", "-") # for the output csv file
     else: