mirror of https://github.com/karpathy/nanochat.git (synced 2025-12-06 04:12:13 +00:00)
This commit introduces extensive documentation across the entire nanochat codebase. The goal is to make the project more accessible, educational, and easier for new contributors to understand. Key additions include:
- A new "Codebase Overview and Data Flow" section in the main README.md, providing a high-level guide to the project structure and training pipeline.
- Detailed, educational docstrings and inline comments in all Python modules within the `nanochat/`, `scripts/`, and `tasks/` directories.
- Explanations of the rationale and implementation details for key components, including Python equivalents for non-Python code where applicable.
- A new `README.md` in the `rustbpe/` directory explaining the BPE algorithm and the decision to use Rust.
- Comprehensive comments in shell scripts and development scripts in the `dev/` directory, clarifying their purpose and usage.
75 lines
3.0 KiB
Python
"""
|
|
This module implements the AI2 Reasoning Challenge (ARC) task. The ARC dataset is
|
|
a collection of multiple-choice science questions designed to test a model's
|
|
reasoning and common-sense knowledge.
|
|
|
|
**Reference:**
|
|
- The ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc
|
|
"""
|
|
|
|
from datasets import load_dataset
|
|
from .common import Task, render_mc
|
|
|
|
class ARC(Task):
|
|
"""
|
|
The ARC (AI2 Reasoning Challenge) task.
|
|
|
|
Args:
|
|
subset (str): "ARC-Easy" or "ARC-Challenge".
|
|
split (str): "train", "validation", or "test".
|
|
"""
|
|
|
|
def __init__(self, subset, split, **kwargs):
|
|
super().__init__(**kwargs)
|
|
assert subset in ["ARC-Easy", "ARC-Challenge"], "ARC subset must be ARC-Easy or ARC-Challenge"
|
|
assert split in ["train", "validation", "test"], "ARC split must be train|validation|test"
|
|
self.ds = load_dataset("allenai/ai2_arc", subset, split=split).shuffle(seed=42)
|
|
|
|
@property
|
|
def eval_type(self):
|
|
"""Specifies that this is a categorical evaluation task."""
|
|
return 'categorical'
|
|
|
|
def num_examples(self):
|
|
"""Returns the total number of examples in the dataset."""
|
|
return len(self.ds)
|
|
|
|
def get_example(self, index):
|
|
"""
|
|
Formats a single example from the dataset into a conversation dictionary.
|
|
"""
|
|
row = self.ds[index]
|
|
question = row["question"] # the question text
|
|
choices = row["choices"]["text"] # the text of each choice
|
|
answer_string = row["answerKey"] # e.g. "A", "B", "C", "D"
|
|
letters = row["choices"]["label"] # e.g. ["A", "B", "C", "D"]
|
|
assert answer_string in letters, f"ARC answer {answer_string} must be one of {letters}" # sanity check
|
|
# create and return the Conversation object
|
|
user_message = render_mc(question, letters, choices)
|
|
messages = [
|
|
{"role": "user", "content": user_message},
|
|
{"role": "assistant", "content": answer_string}
|
|
]
|
|
conversation = {
|
|
"messages": messages,
|
|
"letters": letters, # useful during evaluation, so we can narrow and clamp the assistant prediction to one of the letters
|
|
}
|
|
return conversation
|
|
|
|
def evaluate(self, conversation, assistant_response):
|
|
"""
|
|
Evaluates the model's response for a given example.
|
|
|
|
Args:
|
|
conversation (dict): The conversation dictionary for the example.
|
|
assistant_response (str): The model's predicted answer.
|
|
|
|
Returns:
|
|
bool: True if the prediction is correct, False otherwise.
|
|
"""
|
|
# the assert here is not strictly speaking needed, but currently the way we eval, we expect this to be true
|
|
# I'm going to leave the assert here to prevent footguns, but possibly in the future can remove it.
|
|
assert assistant_response in conversation['letters'], f"ARC answer {assistant_response} is expected to be one of {conversation['letters']}"
|
|
assistant_message = conversation['messages'][-1]['content'] # e.g. "A"
|
|
return assistant_response == assistant_message
|
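
For readers new to the codebase, here is a minimal usage sketch of how this task might be driven, assuming the module is importable as `tasks.arc` and that the `Task` base class requires no positional constructor arguments (both are assumptions about the surrounding package, not shown in this file):

# Hypothetical usage sketch (not part of the repository).
from tasks.arc import ARC

task = ARC(subset="ARC-Easy", split="test")
print(task.num_examples())                  # total number of questions in the split

example = task.get_example(0)               # a conversation dict with "messages" and "letters"
print(example["messages"][0]["content"])    # the rendered multiple-choice prompt

# A model prediction is expected to be one of the candidate letters, e.g. "A";
# here we just pick the first letter as a stand-in prediction.
prediction = example["letters"][0]
print(task.evaluate(example, prediction))   # True only if the letter matches the gold answer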