nanochat/tasks/humaneval.py
"""
This module implements the HumanEval task, a benchmark for evaluating the code
generation capabilities of language models.
The task is implemented as a `generative` evaluation. For each problem, the model
is given a function signature and docstring and is expected to generate the body
of the function. The generated code is then executed in a sandboxed environment
against a set of unit tests to determine its correctness.
**Reference:**
- The HumanEval dataset: https://huggingface.co/datasets/openai/openai_humaneval
"""
import re
from datasets import load_dataset
from nanochat.execution import execute_code
from .common import Task
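
# For orientation, each row of the HumanEval dataset carries the fields used
# below. An illustrative (not verbatim) problem looks like:
#
#   prompt:             'def add(x: int, y: int) -> int:\n    """Add two numbers."""\n'
#   canonical_solution: '    return x + y\n'
#   entry_point:        'add'
#   test:               'def check(candidate):\n    assert candidate(2, 3) == 5\n'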


def extract_imports(prompt):
    """Extract import statements from the beginning of a code block."""
    imports = []
    for line in prompt.split('\n'):
        stripped = line.strip()
        if stripped.startswith('import ') or stripped.startswith('from '):
            imports.append(stripped)
        elif stripped and not stripped.startswith('#'):
            # Stop at first non-import, non-comment line
            break
    return '\n'.join(imports)
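
# Illustrative usage (hypothetical prompt):
#   extract_imports("from typing import List\n\ndef f(xs: List[int]) -> int:\n    ...")
#   -> "from typing import List"
# Blank lines are skipped; scanning stops at the first real line of code.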


def extract_program(completion):
    """
    Extracts a Python code block from a language model's completion,
    handling markdown formatting.
    """
    # Try to find markdown code blocks (```python or just ```)
    # Match ```python\n...\n``` or ```\n...\n```
    pattern = r'```(?:python)?\s*\n(.*?)\n```'
    matches = re.findall(pattern, completion, re.DOTALL)
    if matches:
        # Return the first code block found
        return matches[0].strip()
    # No code blocks found, return the whole completion
    return completion.strip()
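
# Illustrative usage: for a completion such as
#   "Sure, here you go:\n```python\ndef f(x):\n    return x\n```"
# extract_program returns "def f(x):\n    return x". With no fenced block,
# the whole completion is returned, stripped of surrounding whitespace.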


class HumanEval(Task):
    """
    The HumanEval code generation task.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.ds = load_dataset("openai/openai_humaneval", split="test").shuffle(seed=42)

    @property
    def eval_type(self):
        """Specifies that this is a generative evaluation task."""
        return 'generative'

    def num_examples(self):
        """Returns the total number of examples in the dataset."""
        return len(self.ds)

    def get_example(self, index):
        """
        Formats a single problem from the dataset into a conversation dictionary.
        """
        row = self.ds[index]
        prompt = row['prompt'] # prompts in HumanEval are the beginning of the program
        solution = row['canonical_solution'] # the correct continuation of the program
        entry_point = row['entry_point'] # the function to check
        test = row['test'] # the test cases
        complete_solution = f"{prompt}\n{solution}"
        messages = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": complete_solution},
        ]
        conversation = {
            "messages": messages,
            "entry_point": entry_point, # needed during evaluation
            "test": test, # needed during evaluation
        }
        return conversation

    def evaluate(self, conversation, completion):
        """
        Evaluates the model's generated code by running it against the problem's
        unit tests in a sandboxed environment.
        """
        # the prompt will contain the imports and the function signature
        imports = extract_imports(conversation['messages'][0]['content'])
        # the completion will usually contain the whole function,
        # but not always with the needed imports, so we manually prepend them
        completion_code = extract_program(completion)
        program = (
            imports
            + "\n\n"
            + completion_code
            + "\n\n"
            + conversation['test']
            + "\n"
            + f"check({conversation['entry_point']})"
        )
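        # The assembled program therefore has the shape (illustrative):
        #   from typing import List      <- imports recovered from the prompt
        #   def f(xs): ...               <- the model's extracted completion
        #   def check(candidate): ...    <- the dataset's unit tests
        #   check(f)                     <- run the tests against the entry point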
        result = execute_code(program)
        return result.success
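

if __name__ == "__main__":
    # Minimal smoke test; a sketch only. It assumes HumanEval() is
    # constructible with no arguments (Task's accepted **kwargs are not shown
    # here) and that the Hugging Face Hub is reachable to download the data.
    task = HumanEval()
    print(f"{task.num_examples()} problems loaded")
    conversation = task.get_example(0)
    print(conversation["messages"][0]["content"])  # the first problem's prompt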