nanochat/tasks/arc.py

"""
This module implements the AI2 Reasoning Challenge (ARC) task. The ARC dataset is
a collection of multiple-choice science questions designed to test a model's
reasoning and common-sense knowledge.
**Reference:**
- The ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc
"""
from datasets import load_dataset
from .common import Task, render_mc
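
# For reference, a raw ai2_arc row from Hugging Face carries the fields the
# code below relies on. A sketch with illustrative values (not an actual row):
#
# {
#     "id": "Mercury_7175875",
#     "question": "Which factor will most likely cause a person to develop a fever?",
#     "choices": {
#         "text": ["a leg muscle relaxing after exercise",
#                  "a bacterial population in the bloodstream",
#                  "several viral particles on the skin",
#                  "carbohydrates being digested in the stomach"],
#         "label": ["A", "B", "C", "D"],
#     },
#     "answerKey": "B",
# }
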
class ARC(Task):
"""
The ARC (AI2 Reasoning Challenge) task.
Args:
subset (str): "ARC-Easy" or "ARC-Challenge".
split (str): "train", "validation", or "test".
"""
    def __init__(self, subset, split, **kwargs):
        super().__init__(**kwargs)
        assert subset in ["ARC-Easy", "ARC-Challenge"], "ARC subset must be ARC-Easy or ARC-Challenge"
        assert split in ["train", "validation", "test"], "ARC split must be train|validation|test"
        # deterministic shuffle (fixed seed) so the example order is reproducible across runs
        self.ds = load_dataset("allenai/ai2_arc", subset, split=split).shuffle(seed=42)

    @property
    def eval_type(self):
        """Specifies that this is a categorical evaluation task."""
        return 'categorical'

    def num_examples(self):
        """Returns the total number of examples in the dataset."""
        return len(self.ds)

    def get_example(self, index):
        """
        Formats a single example from the dataset into a conversation dict:
        a rendered multiple-choice user prompt plus the gold answer letter.
        """
        row = self.ds[index]
        question = row["question"] # the question text
        choices = row["choices"]["text"] # the text of each choice
        answer_string = row["answerKey"] # e.g. "A" (a few ARC rows use numeric labels like "1"-"4")
        letters = row["choices"]["label"] # e.g. ["A", "B", "C", "D"]
        assert answer_string in letters, f"ARC answer {answer_string} must be one of {letters}" # sanity check
        # create and return the conversation dict
        user_message = render_mc(question, letters, choices)
        messages = [
            {"role": "user", "content": user_message},
            {"role": "assistant", "content": answer_string},
        ]
        conversation = {
            "messages": messages,
            "letters": letters, # used during evaluation to clamp the assistant's prediction to one of the valid letters
        }
        return conversation
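
    # The conversation dict built by get_example (and consumed by evaluate
    # below) has this shape; the values are illustrative:
    #
    # {
    #     "messages": [
    #         {"role": "user", "content": "<question followed by the rendered choices>"},
    #         {"role": "assistant", "content": "B"},
    #     ],
    #     "letters": ["A", "B", "C", "D"],
    # }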
    def evaluate(self, conversation, assistant_response):
        """
        Evaluates the model's response for a given example.

        Args:
            conversation (dict): The conversation dictionary for the example.
            assistant_response (str): The model's predicted answer letter, e.g. "A".

        Returns:
            bool: True if the prediction is correct, False otherwise.
        """
        # This assert is not strictly required, but with the current evaluation flow the
        # response should always be one of the valid letters. It stays here to prevent
        # footguns; it may be removed in the future.
        assert assistant_response in conversation['letters'], f"ARC answer {assistant_response} is expected to be one of {conversation['letters']}"
        assistant_message = conversation['messages'][-1]['content'] # the gold answer letter, e.g. "A"
        return assistant_response == assistant_message
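

if __name__ == "__main__":
    # Minimal usage sketch, assuming the `datasets` package is installed, the
    # Hugging Face hub is reachable, and the Task base class needs no extra
    # constructor kwargs. Because of the relative import above, run it as a
    # module, e.g.: python -m nanochat.tasks.arc
    task = ARC(subset="ARC-Easy", split="validation")
    print(f"{task.num_examples()} examples in ARC-Easy validation")
    conversation = task.get_example(0)
    print(conversation["messages"][0]["content"]) # the rendered multiple-choice prompt
    gold_letter = conversation["messages"][-1]["content"]
    # echoing the gold answer back should trivially evaluate as correct
    assert task.evaluate(conversation, gold_letter)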