mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-08 21:32:14 +00:00
This commit introduces extensive documentation across the entire nanochat codebase. The goal is to make the project more accessible, educational, and easier for new contributors to understand. Key additions include: - A new "Codebase Overview and Data Flow" section in the main README.md, providing a high-level guide to the project structure and training pipeline. - Detailed, educational docstrings and inline comments in all Python modules within the `nanochat/`, `scripts/`, and `tasks/` directories. - Explanations of the rationale and implementation details for key components, including Python equivalents for non-Python code where applicable. - A new `README.md` in the `rustbpe/` directory explaining the BPE algorithm and the decision to use Rust. - Comprehensive comments in shell scripts and development scripts in the `dev/` directory, clarifying their purpose and usage.
94 lines
4.1 KiB
Python
94 lines
4.1 KiB
Python
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
|
|
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
|
|
# #
|
|
# SmolTalk by HuggingFace. Good "general" conversational dataset. #
|
|
# https://huggingface.co/datasets/HuggingFaceTB/smol-smoltalk #
|
|
# We use the "smol" version, which is more appropriate for smaller models.#
|
|
# #
|
|
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
|
|
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
|
|
|
|
from datasets import load_dataset
|
|
from tasks.common import Task
|
|
|
|
class SmolTalk(Task):
    """
    Task backed by HuggingFace's smol-smoltalk conversational dataset.

    Each instance loads a single split ("train" or "test") of
    "HuggingFaceTB/smol-smoltalk" and shuffles it with a fixed seed. The
    train split holds roughly 460,000 conversations and the test split
    roughly 24,000.

    Every dataset row carries a "messages" list. get_example() checks that
    list against the following layout before handing it back:

        [
            {"role": "system", ...},                # optional, first entry only
            {"role": "user", "content": "Hello!"},
            {"role": "assistant", "content": "Hi there! How can I help you today?"},
            # ... further turns, strictly alternating user / assistant
        ]
    """

    def __init__(self, split, **kwargs):
        """
        Load and shuffle one split of the dataset.

        Args:
            split (str): Which split to load; either "train" or "test".
            **kwargs: Forwarded unchanged to the parent Task constructor.

        Raises:
            AssertionError: If split is not "train" or "test" (only while
                assertions are enabled).
        """
        super().__init__(**kwargs)
        assert split in ("train", "test"), "SmolTalk split must be train|test"
        # Fetch the requested split, then shuffle with a fixed seed (42).
        dataset = load_dataset("HuggingFaceTB/smol-smoltalk", split=split)
        self.ds = dataset.shuffle(seed=42)
        # Cache the row count once so num_examples() is a plain attribute read.
        self.length = len(self.ds)

    def num_examples(self):
        """Return the number of rows in the loaded split."""
        return self.length

    def get_example(self, index):
        """
        Fetch one conversation from the shuffled split and sanity-check it.

        Args:
            index (int): Row position within the shuffled dataset.

        Returns:
            dict: {"messages": [...]}, where the list is the row's full
            message list, including the leading system message if present.

        Raises:
            AssertionError: If the row does not follow the expected layout
                (only while assertions are enabled).
        """
        messages = self.ds[index]["messages"]

        # -----------------------------------------------------------------
        # Debug-time format checks. They are plain asserts, so they can be
        # dropped later if they ever show up as a performance cost.
        assert len(messages) >= 1

        # A leading system message is allowed; it is left out of the
        # alternation check below.
        has_system_prompt = messages[0]["role"] == "system"
        turns = messages[1:] if has_system_prompt else messages

        # At least one user -> assistant exchange must remain.
        assert len(turns) >= 2, "SmolTalk messages must have at least 2 messages"

        # Even positions belong to the user, odd positions to the assistant.
        role_by_parity = ("user", "assistant")
        for position, turn in enumerate(turns):
            wanted = role_by_parity[position % 2]
            assert turn["role"] == wanted, f"Message {position} has role {turn['role']} but should be {wanted}"
            assert isinstance(turn["content"], str), "Content must be a string"
        # -----------------------------------------------------------------

        # Hand back the untouched message list in the standard wrapper dict.
        return {"messages": messages}
|