# nanochat/tasks/smoltalk.py

#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
#                                                                           #
#         SmolTalk by HuggingFace. Good "general" conversational dataset.   #
#   https://huggingface.co/datasets/HuggingFaceTB/smol-smoltalk             #
#   We use the "smol" version, which is more appropriate for smaller models.#
#                                                                           #
#_-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*-*#
#--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*--*#

from datasets import load_dataset
from tasks.common import Task

class SmolTalk(Task):
    """
    Task wrapper around the HuggingFaceTB/smol-smoltalk conversational dataset.

    This is the "smol" flavour of SmolTalk, a general-purpose chat dataset aimed
    at smaller models. The train split holds roughly 460K conversations and the
    test split roughly 24K.

    Every dataset row carries a "messages" field: a list of dicts, each with a
    "role" and a "content" entry. A conversation may open with one "system"
    message; the remaining messages alternate user, assistant, user, ...

    Example of what get_example() returns:
        {
            "messages": [
                {"role": "user", "content": "Hello!"},
                {"role": "assistant", "content": "Hi there! How can I help you today?"},
            ]
        }
    """

    def __init__(self, split, **kwargs):
        """
        Load one split of smol-smoltalk and shuffle it with a fixed seed.

        Args:
            split (str): Which split to load; only "train" and "test" are accepted.
            **kwargs: Forwarded untouched to the Task base-class constructor.
        """
        super().__init__(**kwargs)
        assert split in ("train", "test"), "SmolTalk split must be train|test"
        # Fetch the requested split first, then shuffle it using seed 42.
        dataset = load_dataset("HuggingFaceTB/smol-smoltalk", split=split)
        self.ds = dataset.shuffle(seed=42)
        # Cache the row count so num_examples() is a plain attribute read.
        self.length = len(self.ds)

    def num_examples(self):
        """
        Report how many conversations the loaded (shuffled) split contains.
        """
        return self.length

    def get_example(self, index):
        """
        Fetch the conversation stored at `index` and sanity-check its layout.

        Args:
            index (int): Position of the row in the shuffled split.
        Returns:
            dict: {"messages": [...]} holding the row's full message list,
                including the leading system message when there is one.
        """
        messages = self.ds[index]["messages"]

        # ---------------------------------------------------------------------
        # Format sanity checks. They exist to catch malformed rows early and
        # could be dropped later if they ever become a performance concern.

        assert len(messages) >= 1
        # An optional system message may lead the conversation; it is skipped
        # for the alternation check below but still returned to the caller.
        has_system_prompt = messages[0]["role"] == "system"
        turns = messages[1:] if has_system_prompt else messages

        # At minimum one user message followed by one assistant reply.
        assert len(turns) >= 2, "SmolTalk messages must have at least 2 messages"

        # Even positions belong to the user, odd positions to the assistant.
        speakers = ("user", "assistant")
        for i, turn in enumerate(turns):
            wanted = speakers[i % 2]
            assert turn["role"] == wanted, f"Message {i} has role {turn['role']} but should be {wanted}"
            assert isinstance(turn["content"], str), "Content must be a string"
        # ---------------------------------------------------------------------

        # Hand back the untouched message list in the standard conversation format.
        return {"messages": messages}