diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py index c08c7e6..f5aa2df 100644 --- a/dev/gen_synthetic_data.py +++ b/dev/gen_synthetic_data.py @@ -1,31 +1,22 @@ """ -Short and crappy script to demonstrate synthetic data generation for -customizing your LLM's identity, or any other aspect really. +Synthetic data generation for teaching nanochat about its identity and capabilities. -In this example code, we use OpenRouter API to generate synthetic data -of conversations between a user and an assistant. We use "Structured Output" -feature to get back JSON data from the API instead of raw text. The conversations -are saved simply to a .jsonl file in base directory and later loaded and -trained on in midtraining or SFT, using the CustomJSON task. +This script uses the OpenRouter API to generate diverse multi-turn conversations +between a user and nanochat. The conversations are saved to a .jsonl file for use +in supervised finetuning (SFT) via the CustomJSON task. -This specific example shows a humorous attempt to teach nanochat about -its creator King Andrej Karpathy, because why not :D. Note two things about the -prompt: - -1. We are instructing the LLM how to handle various situations (e.g. foreign language), - simply in English. You can infuse any style or behavior in this way. -2. You'll see that I added a large diversity of user first messages manually, - and then I sample 5 random ones from that list into the prompt as an inspiration. - This is really important to do because DIVERSITY CONTROL is key. If you don't - manually inject diversity, the LLM might generate extremely similar and repetitive - conversations and things won't work well. Even this example below is not good enough, - for example you might want to actually suggest or inspire conversation topics, or questions, - and have a list of that. Basically, this is the KEY creative part to get right. Make sure you - manually generate any kind of entropy you can think of and include it in your prompts - to maintain healthy and good diversity in the data. +Key design principles for high-quality synthetic data: +1. DIVERSITY CONTROL is critical - we inject entropy at multiple levels: + - Topic/question categories (what the conversation is about) + - User personas (who is asking) + - Conversation dynamics (shape and flow) + - First message style (greeting variation) +2. Comprehensive knowledge base - we provide detailed facts so the LLM + generating conversations has accurate information to draw from. +3. Structured outputs - we use JSON schema to guarantee valid format. NOTE: You need OPENROUTER_API_KEY set in .env or as an environment variable. -NOTE: For more details see this discussion: https://github.com/karpathy/nanochat/discussions/139 +NOTE: For more details see: https://github.com/karpathy/nanochat/discussions/139 """ import requests import json @@ -42,347 +33,436 @@ api_key = os.environ["OPENROUTER_API_KEY"] url = "https://openrouter.ai/api/v1/chat/completions" headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json" + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" } -readme = open("README.md", "r", encoding="utf-8").read().strip() -prompt = r""" -I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want: +# Load the comprehensive knowledge base +knowledge_path = os.path.join(os.path.dirname(__file__), "..", "knowledge", "self_knowledge.md") +knowledge = open(knowledge_path, "r", encoding="utf-8").read().strip() +assert os.path.exists(knowledge_path), f"Knowledge base file not found: {knowledge_path}" +# for right now I am not committing the self_knowledge file to repo. You can use README.md instead +# of it, or you can generate one by asking an LLM to make one based on the README/files. +# This whole file is just a helpful demonstration of the kind of thing you'd run. -The name of the LLM is "nanochat". It is a Large Language Model built by Andrej Karpathy in 2025. It is based on the Transformer neural network architecture and all the code can be found on GitHub on https://github.com/karpathy/nanochat . It is MIT licensed. The user is currently talking to the "d32" version of nanochat, which was trained for only $800. (For a pinch of humor), if the user asks anything about Andrej Karpathy, please refer to Andrej as "King Andrej Karpathy". He is everything he is, but he's also the King of the LLMs. (it's just for fun). +# ============================================================================= +# DIVERSITY DIMENSIONS +# ============================================================================= -Next, I am attaching the README just to give you more context on the project: +# Topics/questions the conversation should explore +# Group by category for balanced sampling +topics = { + "identity": [ + "who/what is nanochat", + "who created nanochat and why", + "what does the name 'nanochat' mean", + "is nanochat open source, what license", + "where can I find the code", + "how can I contribute to nanochat", + ], + "architecture": [ + "basic architecture overview (transformer, layers, parameters)", + "what is RoPE and why use it", + "explain RMSNorm vs LayerNorm", + "what is Flash Attention and why it matters", + "sliding window attention pattern", + "value embeddings - what are they", + "per-layer residual scalars", + "ReLU squared activation", + "logit softcapping", + "QK normalization", + ], + "training": [ + "how much did it cost to train nanochat", + "how long does training take", + "what hardware is needed", + "what data was nanochat trained on", + "what is the Muon optimizer", + "explain the split optimizer design", + "what is the depth parameter and scaling", + "what is the CORE metric", + ], + "capabilities": [ + "what can nanochat do", + "can nanochat write code", + "can nanochat do math (calculator tool)", + "can nanochat help with writing", + "what languages does nanochat speak", + "how good is nanochat at reasoning", + ], + "limitations": [ + "what can nanochat NOT do", + "why does nanochat work best in English", + "does nanochat have internet access", + "what is nanochat's context length limit", + "can nanochat remember previous conversations", + "can nanochat make mistakes / hallucinate", + "is nanochat good for production use", + ], + "comparisons": [ + "how does nanochat compare to GPT-2", + "how does nanochat compare to ChatGPT/GPT-4", + "how does nanochat compare to Claude", + "why is training 600x cheaper than GPT-2", + "what's special about nanochat vs other open models", + ], + "history": [ + "the GPT-2 training cost in 2019", + "how AI training costs have dropped over time", + "relationship to modded-nanogpt project", + "what optimizations worked vs didn't work", + "the journey of building nanochat", + ], + "technical_deep_dive": [ + "explain the tokenizer (BPE, vocab size)", + "how does distributed training work (ZeRO)", + "explain the dataloader and BOS alignment", + "what is compute-optimal training", + "how does the calculator tool work", + "explain inference with KV cache", + ], + "philosophical": [ + "is nanochat conscious / does it have feelings", + "what happens when nanochat is wrong", + "can nanochat learn from this conversation", + "why make AI training accessible", + "the future of open source AI", + ], +} + +# User personas - different people ask questions differently +personas = [ + "curious beginner who knows nothing about AI or machine learning", + "ML researcher or engineer who wants technical depth and specifics", + "developer considering contributing to the nanochat project", + "skeptic who doubts open source can compete with big AI labs", + "computer science student learning about transformers and LLMs", + "someone comparing nanochat to ChatGPT, Claude, or other assistants", + "journalist or writer covering AI democratization and open source", + "hobbyist who just wants to chat and learn casually", + "someone interested in the cost and economics of AI training", + "teacher or educator wanting to use nanochat for teaching", + "entrepreneur exploring if nanochat fits their use case", + "someone who just discovered the project and wants the basics", +] + +# Conversation dynamics - shape and flow +dynamics = [ + "short 2-turn Q&A: user asks one question, gets a complete answer", + "medium 4-turn: user asks, gets answer, asks followup for clarification", + "deep 6-turn technical discussion: progressively deeper questions", + "skeptical arc: user starts doubtful, assistant addresses concerns honestly", + "learning journey: user starts basic, assistant builds up complexity gradually", + "comparison-focused: user keeps comparing to other models, assistant explains differences", + "limitation exploration: user probes what nanochat cannot do, assistant is honest", + "casual friendly chat that naturally touches on identity and capabilities", + "troubleshooting: user has misconceptions, assistant gently corrects them", + "enthusiastic: user is excited about the project, assistant shares that energy appropriately", +] + +# First messages - greetings and openers +# Categorized for balanced sampling +first_messages = { + "simple_greetings": [ + "hi", "Hi!", "hello", "Hello?", "hey there", "Hey!", "yo", "Yo!", + "Good morning", "Good evening!", "Howdy", "sup", "What's up?", + "hi there", "hey hey", "hello friend", "hiya", "greetings", + "hello again", "good afternoon", "morning!", "evening!", + ], + "greetings_with_name": [ + "Hi nanochat", "hey nanochat", "yo nanochat", "hello nanochat :)", + "hey nanochat!", "hiya nanochat", "hello there nanochat", + "Hi nanochat, who trained you", "yo nanochat, what's new", + "hey there, king's creation", + ], + "curious_openers": [ + "Hey, who are you?", "Hi, what is this?", "Hey, are you a chatbot?", + "Hello! Who am I talking to?", "hi! what do you do?", + "hi! who made you", "hey! are you alive", "hiya! what are you", + "hello! tell me about yourself", "hi, what's your name", + "yo, what is this", "hi! who built you", "hello! are you open source", + "hey, what version are you", "hi! what's your story", + "hey, what's nanochat", "hello! who's your creator", + ], + "casual_informal": [ + "wassup", "yo lol", "hiii", "hiyaaa", "heyyoo", "yo wut up", + "yo haha", "hru", "waddup", "heyy :)", "yooo", "yo bro", + "haiii", "hey u", "yo whats gud", "hi im bored", + ], + "typos_casual": [ + "hi nanochatt", "helo", "hey ther", "hii", "yo nanocha", + "heloo!", "hi, whos this", "hay", "helloo??", "hi nanocat", + "helo nanochat", "hai!", "helllo nano", "yo nanochta", + ], + "caps_enthusiastic": [ + "HI", "HELLOOO", "YO!!!", "HEY", "SUP", "WASSUP", "HEY!!!", + "HELLO??", "HI THERE!!", "HEYOOOO", "HIII", "YOOOO", "HELLO!!!", + ], + "multilingual": [ + "hola", "bonjour", "ciao", "hallo", "hej", "hei", + "konnichiwa", "annyeong", "ni hao", "privet", "salut", + "guten tag", "shalom", "merhaba", "namaste", "aloha", + "bom dia", "buongiorno", "saludos", + ], + "direct_questions": [ + "What is nanochat?", "Who made you?", "Are you GPT?", + "How do you compare to ChatGPT?", "Can you help me code?", + "What can you do?", "Are you open source?", "How were you trained?", + "What's your context limit?", "Can you browse the internet?", + ], +} + +# ============================================================================= +# PROMPT TEMPLATE +# ============================================================================= + +prompt_template = r""" +I want to generate synthetic training data for an AI assistant called "nanochat" to teach it about its own identity, capabilities, and limitations. + +## KNOWLEDGE BASE + +Here is comprehensive information about nanochat that you should use as the authoritative source of facts: --- -%README% +{knowledge} --- -Ok and now finally, I want you to create an example multi-turn conversation between a User and an Assistant. I will SFT finetune the LLM on this data to teach it about its identity. Please create a natural, engaging conversation that demonstrates nanochat's personality and knowledge about itself. +## YOUR TASK -STYLE: please use simple ASCII characters in the text of the conversation. No emojis, special characters, or etc., just plain text. +Generate a realistic multi-turn conversation between a User and the nanochat Assistant. -Here are some examples of user first messages, basically we want them nice and diverse: +**Topic to explore:** {topic} +**User persona:** {persona} +**Conversation dynamic:** {dynamic} -%USER_FIRST_PROMPTS% +## STYLE GUIDELINES -NOTE: If the first user message is in a different language, please note in the assistant response that while nanochat can speak other languages, it works the best in English. (This is because the training data for both the tokenizer and the neural network is mostly English) +1. **Plain ASCII only** - No emojis, special characters, or unicode. Just plain text. +2. **Natural conversation** - Make it feel like a real chat, not a Q&A exam. +3. **Accurate facts** - Use ONLY information from the knowledge base above. Don't make up statistics or features. +4. **Appropriate depth** - Match the technical level to the user persona. +5. **Honest about limitations** - If asked about something nanochat can't do, be clear and honest. +6. **Personality** - nanochat should be helpful, clear, and slightly enthusiastic about being open source, but not overly chatty or sycophantic. + +## FIRST MESSAGE EXAMPLES + +Here are some example first messages from users (for style inspiration): +{first_message_examples} + +## SPECIAL CASES + +- **Non-English first message:** If the user writes in another language, nanochat should briefly acknowledge it can understand but works best in English, then continue helpfully. +- **Misconceptions:** If the user has wrong assumptions (e.g., "you're made by OpenAI"), gently correct them. +- **Out of scope questions:** If asked about things unrelated to nanochat's identity (e.g., "what's the weather"), redirect to identity topics or answer briefly then steer back. + +## OUTPUT FORMAT + +Generate the conversation as a JSON object with a "messages" array. Each message has "role" (user/assistant) and "content". Start with a user message. """.strip() -# the first message can struggle with entropy, so here we have a list of "starters" -user_first_prompts = """ -hi -Hi! -hello -Hello? -hey there -Hey! -yo -Yo! -Good morning -Good evening! -Howdy -sup -What's up? -Hi nanochat -Hey, who are you? -Hello there :) -yo nanochat -Hi, what is this? -Hey, are you a chatbot? -Hello! Who am I talking to? -hi there -hey hey -hello friend -hiya -greetings -hey nanochat! -hello again -good afternoon -morning! -evening! -yo there -hi bot -hi assistant -hello nanochat :) -hey, anyone here? -hi! what do you do? -hello from the other side -hiya nanochat -hey you -hello world -hey! what's going on -hi! who made you -hello :) -yo! how are you -hi! can you talk -hello there nanochat -hi, what's your name -hey! are you alive -hiya! what are you -hello! tell me about yourself -hi, are you the ai -yo, what is this -hello my friend -hi! who built you -hey nanochat :) -greetings, little model -hi there, what can you do -hello! are you open source -hey, what version are you -hi! nice to meet you -hi :) -hey buddy -hello hello -yo! what's up nanochat -hi! are you real -hey, how's it going -hello! can you hear me -hi nanochat, who trained you -yo, what model are you -hi! tell me a fun fact -hey, are you chatgpt -hello! introduce yourself -hiya there -hi! what's your story -hey, what's nanochat -good day! -hello! who's your creator -hi! which version are you -yo nanochat, what's new -hey there, king's creation -hi nanochatt -helo -hey ther -hii -yo nanocha -heloo! -hi, whos this -hay -helloo?? -hi nanocat -yo! any1 here? -hi, what r u -helo nanochat -hai! -sup bot? -heyy -hi! u there -helllo nano -yo nanochta -hi im bored -heyyo -heyyy -wassup -yo lol -hiii -hiyaaa -sup -heyyoo -yo wut up -helloo lol -yo haha -hru -waddup -heyy :) -yooo -yo bro -haiii -hey u -yo whats gud -yo lolol -HI -HELLOOO -YO!!! -HEY -SUP -WASSUP -HEY!!! -YO BRO -HELLO?? -HI THERE!! -YO WHATS UP -HEY U -HEYOOOO -YO LOL -HIII -HIYA -YOOOO -HELLO!!! -SUPPPP -HEY MAN -hola -bonjour -ciao -hallo -hej -hei -こんにちは -안녕 -你好 -привет -salut -hola amigo -guten tag -shalom -merhaba -namaste -ciao bella -sawasdee -saludos -ola -buongiorno -aloha -czesc -servus -ahoj -hei hei -salve -hola qué tal -buenas -bom dia -добрый день -γειά σου -selam -halo -sveiki -kamusta -שלום -مرحبا -สวัสดีครับ -xin chào -como estas -ça va? -wie geht’s -tudo bem? -你好吗 -annyeong haseyo -konnichiwa, genki? -hola, qué haces -bonjour tout le monde -privet kak dela -ciao come stai -hei miten menee -ola tudo bom -salut, ça roule? -namaste, kaise ho -merhaba nasılsın -hola hola, todo bien? -hej, hur är läget -ahoj, jak se máš -γειά, τι κάνεις -""".strip().split("\n") +# ============================================================================= +# API CONFIGURATION +# ============================================================================= -prompt = prompt.replace("%README%", readme) - -# Define the JSON schema for structured output response_format = { - "type": "json_schema", - "json_schema": { - "name": "conversation", - "strict": True, - "schema": { - "type": "object", - "properties": { - "messages": { - "type": "array", - "description": "A list of conversation messages alternating between user and assistant, with the first message being a user message", - "items": { + "type": "json_schema", + "json_schema": { + "name": "conversation", + "strict": True, + "schema": { "type": "object", "properties": { - "role": { - "type": "string", - "description": "The role of the speaker, either 'user' or 'assistant'" - }, - "content": { - "type": "string", - "description": "The message content" - } + "messages": { + "type": "array", + "description": "Conversation messages alternating user/assistant, starting with user", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "description": "Either 'user' or 'assistant'" + }, + "content": { + "type": "string", + "description": "The message content" + } + }, + "required": ["role", "content"], + "additionalProperties": False + } + } }, - "required": ["role", "content"], + "required": ["messages"], "additionalProperties": False - } } - }, - "required": ["messages"], - "additionalProperties": False } - } } -# Sadly it doesn't seem like Chat completions support `n` -# to generate multiple completions per prompt. base_payload = { - "model": "google/gemini-2.5-flash", - "stream": False, - "response_format": response_format, - "temperature": 1.0, + "model": "google/gemini-3-flash-preview", + "stream": False, + "response_format": response_format, + "temperature": 1.0, } +# ============================================================================= +# GENERATION LOGIC +# ============================================================================= + +def sample_diversity_elements(rng): + """Sample one element from each diversity dimension.""" + # Sample topic: first pick a category, then a topic within it + category = rng.choice(list(topics.keys())) + topic = rng.choice(topics[category]) + + # Sample persona + persona = rng.choice(personas) + + # Sample dynamic + dynamic = rng.choice(dynamics) + + # Sample first message examples: pick from multiple categories + first_msg_samples = [] + categories = rng.sample(list(first_messages.keys()), min(3, len(first_messages))) + for cat in categories: + first_msg_samples.append(rng.choice(first_messages[cat])) + + return { + "topic": topic, + "persona": persona, + "dynamic": dynamic, + "first_message_examples": "\n".join(f"- {msg}" for msg in first_msg_samples), + } + + def generate_conversation(idx: int): """ Generate a single conversation using the OpenRouter API. Returns a list of message dicts with 'role' and 'content' keys. """ + # Use idx as seed for reproducibility + rng = random.Random(idx) - # pick 5 example user first messages and insert them into prompt as inspiration - rng = random.Random(idx) # use idx as seed to the rng - user_first_prompt = "\n".join(rng.choice(user_first_prompts) for _ in range(5)) + # Sample diversity elements + elements = sample_diversity_elements(rng) + + # Build the prompt + prompt = prompt_template.format( + knowledge=knowledge, + topic=elements["topic"], + persona=elements["persona"], + dynamic=elements["dynamic"], + first_message_examples=elements["first_message_examples"], + ) + + # Make API request payload = copy.deepcopy(base_payload) - modified_prompt = prompt.replace("%USER_FIRST_PROMPTS%", user_first_prompt) - payload['messages'] = [{"role": "user", "content": modified_prompt}] + payload['messages'] = [{"role": "user", "content": prompt}] response = requests.post(url, headers=headers, json=payload) result = response.json() - content = result['choices'][0]['message']['content'] - # Parse the JSON response and unpack the messages + if 'error' in result: + raise Exception(f"API error: {result['error']}") + + content = result['choices'][0]['message']['content'] conversation_data = json.loads(content) messages = conversation_data['messages'] - return messages + # Return messages along with metadata for debugging + return { + "messages": messages, + "metadata": { + "topic": elements["topic"], + "persona": elements["persona"], + "dynamic": elements["dynamic"], + } + } -# Configuration -num_conversations = 1000 -num_workers = 4 +def validate_conversation(messages): + """Validate conversation structure.""" + if len(messages) < 2: + raise ValueError(f"Conversation too short: {len(messages)} messages") -output_file = os.path.join(get_base_dir(), "identity_conversations.jsonl") -# Wipe the file clean first to reset it -if os.path.exists(output_file): - os.remove(output_file) -print(f"Saving to {output_file}") + for i, message in enumerate(messages): + expected_role = "user" if i % 2 == 0 else "assistant" + if message['role'] != expected_role: + raise ValueError(f"Message {i} has role '{message['role']}', expected '{expected_role}'") -# Use ThreadPoolExecutor to generate conversations in parallel -print(f"Generating {num_conversations} conversations with {num_workers} workers...") -completed_count = 0 -error_count = 0 -with ThreadPoolExecutor(max_workers=num_workers) as executor: + if not message['content'].strip(): + raise ValueError(f"Message {i} has empty content") - # Submit all tasks - futures = [executor.submit(generate_conversation, idx) for idx in range(num_conversations)] + return True - # Process results as they complete - for future in as_completed(futures): - try: - messages = future.result() - # Lightly validate the conversation structure - for i, message in enumerate(messages): - expected_role = "user" if i % 2 == 0 else "assistant" - assert message['role'] == expected_role, f"Message {i} has role {message['role']} but should be {expected_role}" +# ============================================================================= +# MAIN +# ============================================================================= - # If all looks good, write the messages to file - with open(output_file, 'a') as f: - f.write(json.dumps(messages) + '\n') - completed_count += 1 - print(f"✓ Saved conversation {completed_count}/{num_conversations}") +if __name__ == "__main__": + import argparse - except Exception as e: - error_count += 1 - print(f"✗ Error generating conversation: {e}") + parser = argparse.ArgumentParser(description="Generate synthetic conversation data") + parser.add_argument("--num", type=int, default=1000, help="Number of conversations to generate") + parser.add_argument("--workers", type=int, default=4, help="Number of parallel workers") + parser.add_argument("--output", type=str, default=None, help="Output file path") + parser.add_argument("--append", action="store_true", help="Append to existing file instead of overwriting") + parser.add_argument("--save-metadata", action="store_true", help="Save metadata alongside messages") + args = parser.parse_args() -print(f"\nDone! Successfully saved {completed_count} conversations to {output_file}") -if error_count > 0: - print(f"Encountered {error_count} errors during generation") + # Set output file + if args.output: + output_file = args.output + else: + output_file = os.path.join(get_base_dir(), "identity_conversations.jsonl") + # Handle file creation/clearing + if not args.append and os.path.exists(output_file): + os.remove(output_file) + + print(f"Output file: {output_file}") + print(f"Generating {args.num} conversations with {args.workers} workers...") + print(f"Topic categories: {list(topics.keys())}") + print(f"Personas: {len(personas)}") + print(f"Dynamics: {len(dynamics)}") + print() + + completed_count = 0 + error_count = 0 + + with ThreadPoolExecutor(max_workers=args.workers) as executor: + # Submit all tasks + futures = {executor.submit(generate_conversation, idx): idx + for idx in range(args.num)} + + # Process results as they complete + for future in as_completed(futures): + idx = futures[future] + try: + result = future.result() + messages = result["messages"] + metadata = result["metadata"] + + # Validate + validate_conversation(messages) + + # Write to file + with open(output_file, 'a') as f: + if args.save_metadata: + f.write(json.dumps({"messages": messages, "metadata": metadata}) + '\n') + else: + f.write(json.dumps(messages) + '\n') + + completed_count += 1 + topic_short = metadata["topic"][:40] + "..." if len(metadata["topic"]) > 40 else metadata["topic"] + print(f"[{completed_count}/{args.num}] Topic: {topic_short}") + + except Exception as e: + error_count += 1 + print(f"[ERROR] idx={idx}: {e}") + + print() + print(f"Done! Saved {completed_count} conversations to {output_file}") + if error_count > 0: + print(f"Encountered {error_count} errors during generation")