nanochat/tasks/japanese_instruct.py
karaage0703 e1e836763e Add Japanese language support for nanochat
- Add NANOCHAT_LANG environment variable to switch languages
- Implement JapaneseInstructTask and JCommonsenseQA tasks
- Update dataset.py to support Japanese prompts and data loading
- Add Japanese evaluation in chat_eval.py and tok_eval.py
- Include speedrun_spark_ja.sh for Japanese training runs
- Add comprehensive test suite for Japanese support
- Include Kiro specification documents (requirements, design, tasks)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-01 21:29:45 +09:00

51 lines
1.5 KiB
Python

"""
Japanese instruction-following dataset from izumi-lab.
https://huggingface.co/datasets/izumi-lab/llm-japanese-dataset
This dataset contains 9M+ Japanese instruction-output pairs,
converted to the conversation format used by nanochat for SFT.
"""
from datasets import load_dataset
from tasks.common import Task
class JapaneseInstruct(Task):
    """
    Japanese instruction-following dataset.

    Loads the "izumi-lab/llm-japanese-dataset" train split, shuffles it with
    a fixed seed (42), and converts each instruction/input/output row into
    the messages format: one user turn followed by one assistant turn.
    """

    def __init__(self, split="train", **kwargs):
        """
        Load and shuffle the dataset.

        Args:
            split: Name of the split to load. Only "train" is accepted.
            **kwargs: Forwarded unchanged to ``Task.__init__``.

        Raises:
            AssertionError: If ``split`` is anything other than "train".
        """
        super().__init__(**kwargs)
        # The dataset only has a "train" split
        assert split == "train", "JapaneseInstruct only has 'train' split"
        self.ds = load_dataset("izumi-lab/llm-japanese-dataset", split=split).shuffle(seed=42)
        # Cache the size once so num_examples() does not re-query the dataset.
        self.length = len(self.ds)

    def num_examples(self):
        """Return the number of rows in the loaded (shuffled) dataset."""
        return self.length

    def get_example(self, index):
        """
        Build the conversation for the row at ``index``.

        Args:
            index: Position of the row in the shuffled dataset.

        Returns:
            A dict with a single "messages" key holding a two-element list:
            a "user" message (instruction, optionally followed by the input)
            and an "assistant" message (the output).
        """
        row = self.ds[index]
        # `or ""` normalizes fields that are present but None to empty strings.
        instruction = row.get("instruction", "") or ""
        input_text = row.get("input", "") or ""
        output = row.get("output", "") or ""

        # Combine instruction and input. The blank-line separator is only
        # used when BOTH parts carry content; a row with an empty instruction
        # but a real input would otherwise yield a prompt starting with a
        # dangling "\n\n".
        has_input = bool(input_text.strip())
        has_instruction = bool(instruction.strip())
        if has_input and has_instruction:
            user_content = f"{instruction}\n\n{input_text}"
        elif has_input:
            user_content = input_text
        else:
            user_content = instruction

        # Build conversation in messages format
        messages = [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": output},
        ]
        conversation = {
            "messages": messages,
        }
        return conversation