""" This module implements the AI2 Reasoning Challenge (ARC) task. The ARC dataset is a collection of multiple-choice science questions designed to test a model's reasoning and common-sense knowledge. **Reference:** - The ARC dataset: https://huggingface.co/datasets/allenai/ai2_arc """ from datasets import load_dataset from .common import Task, render_mc class ARC(Task): """ The ARC (AI2 Reasoning Challenge) task. Args: subset (str): "ARC-Easy" or "ARC-Challenge". split (str): "train", "validation", or "test". """ def __init__(self, subset, split, **kwargs): super().__init__(**kwargs) assert subset in ["ARC-Easy", "ARC-Challenge"], "ARC subset must be ARC-Easy or ARC-Challenge" assert split in ["train", "validation", "test"], "ARC split must be train|validation|test" self.ds = load_dataset("allenai/ai2_arc", subset, split=split).shuffle(seed=42) @property def eval_type(self): """Specifies that this is a categorical evaluation task.""" return 'categorical' def num_examples(self): """Returns the total number of examples in the dataset.""" return len(self.ds) def get_example(self, index): """ Formats a single example from the dataset into a conversation dictionary. """ row = self.ds[index] question = row["question"] # the question text choices = row["choices"]["text"] # the text of each choice answer_string = row["answerKey"] # e.g. "A", "B", "C", "D" letters = row["choices"]["label"] # e.g. ["A", "B", "C", "D"] assert answer_string in letters, f"ARC answer {answer_string} must be one of {letters}" # sanity check # create and return the Conversation object user_message = render_mc(question, letters, choices) messages = [ {"role": "user", "content": user_message}, {"role": "assistant", "content": answer_string} ] conversation = { "messages": messages, "letters": letters, # useful during evaluation, so we can narrow and clamp the assistant prediction to one of the letters } return conversation def evaluate(self, conversation, assistant_response): """ Evaluates the model's response for a given example. Args: conversation (dict): The conversation dictionary for the example. assistant_response (str): The model's predicted answer. Returns: bool: True if the prediction is correct, False otherwise. """ # the assert here is not strictly speaking needed, but currently the way we eval, we expect this to be true # I'm going to leave the assert here to prevent footguns, but possibly in the future can remove it. assert assistant_response in conversation['letters'], f"ARC answer {assistant_response} is expected to be one of {conversation['letters']}" assistant_message = conversation['messages'][-1]['content'] # e.g. "A" return assistant_response == assistant_message