From 54e59c38ade2921d3c3ed4a89e287157fb199018 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 5 Jan 2026 18:40:28 +0000 Subject: [PATCH 001/119] add notebook on deriving the CORE estimates for the GPT-3 miniseries. --- dev/estimate_gpt3_core.ipynb | 2190 ++++++++++++++++++++++++++++++++++ 1 file changed, 2190 insertions(+) create mode 100644 dev/estimate_gpt3_core.ipynb diff --git a/dev/estimate_gpt3_core.ipynb b/dev/estimate_gpt3_core.ipynb new file mode 100644 index 00000000..ce232e03 --- /dev/null +++ b/dev/estimate_gpt3_core.ipynb @@ -0,0 +1,2190 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Estimating CORE Metric for GPT-3 Models\n", + "\n", + "**Authors**: Claude Code Opus 4.5, Andrej Karpathy\n", + "\n", + "**Date**: Jan 2026\n", + "\n", + "## Motivation\n", + "\n", + "The [CORE metric](https://arxiv.org/abs/2406.11794) (introduced in the DCLM paper) is a composite benchmark that evaluates pretrained language models across 22 diverse tasks spanning world knowledge, language understanding, commonsense reasoning, symbolic problem solving, and reading comprehension. It provides a single score that captures a model's general capabilities.\n", + "\n", + "We want to compare nanochat models against the GPT-3 model family from OpenAI's [\"Language Models are Few-Shot Learners\"](https://arxiv.org/abs/2005.14165) paper (2020). However, there's a problem: **GPT-3 models were never evaluated on CORE** (which didn't exist in 2020), and the models were never publicly released, so we can't evaluate them ourselves.\n", + "\n", + "## Our Approach\n", + "\n", + "We estimate CORE scores for GPT-3 by:\n", + "\n", + "1. **Identifying overlapping tasks** between the GPT-3 paper and CORE that were evaluated with similar methodology\n", + "2. **Using GPT-2 as calibration data** — we have actual CORE scores for all 4 GPT-2 models, plus the GPT-3 paper reports results on GPT-2-equivalent tasks\n", + "3. 
**Fitting a regression model** from the overlapping task scores to the full CORE score\n", + "4. **Applying the model to GPT-3** using their reported task scores\n", + "\n", + "This notebook documents our methodology in detail for reproducibility." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from pathlib import Path\n", + "import pandas as pd\n", + "\n", + "# For nice table display\n", + "pd.set_option('display.precision', 4)\n", + "pd.set_option('display.max_columns', 20)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 1: Understanding CORE\n", + "\n", + "CORE consists of **22 tasks** evaluated in specific few-shot settings. The key innovation is **centering**: raw accuracies are adjusted to account for random guessing baselines.\n", + "\n", + "$$\\text{centered accuracy} = \\frac{\\text{accuracy} - \\text{baseline}}{1 - \\text{baseline}}$$\n", + "\n", + "The final CORE score is simply the **mean of all 22 centered accuracies**.\n", + "\n", + "### CORE Tasks\n", + "\n", + "| Category | Tasks |\n", + "|----------|-------|\n", + "| World Knowledge | Jeopardy, ARC Easy, ARC Challenge, BigBench QA Wikidata |\n", + "| Language Understanding | HellaSwag (0-shot & 10-shot), LAMBADA, Winograd, Winogrande, BigBench Language ID |\n", + "| Commonsense Reasoning | COPA, CommonsenseQA, PIQA, OpenBookQA |\n", + "| Symbolic Problem Solving | BigBench Dyck, Operators, CS Algorithms, Repeat Copy Logic, AGI Eval LSAT-AR |\n", + "| Reading Comprehension | SQuAD, CoQA, BoolQ |" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 2: Task Overlap Analysis\n", + "\n", + "We carefully compared the evaluation methodology between GPT-3 and CORE for each task. Key considerations:\n", + "\n", + "1. 
**Number of few-shot examples (K)**: GPT-3 often uses more examples than CORE\n", + "2. **Task format**: Some tasks use different prompting strategies\n", + "3. **Scoring method**: GPT-3 uses unconditional probability normalization for some tasks\n", + "4. **Data split**: dev vs test set\n", + "\n", + "### Selection Criteria\n", + "\n", + "We applied a conservative filter: **both evaluations must use K=0 (zero-shot) or both must use K>0 (few-shot)**. We excluded tasks that mix zero-shot with few-shot, as this introduces systematic differences.\n", + "\n", + "### Tasks We Excluded\n", + "\n", + "| Task | GPT-3 K | CORE K | Reason for Exclusion |\n", + "|------|---------|--------|----------------------|\n", + "| Winograd | 7 | 0 | Mixing K>0 with K=0 |\n", + "| Winogrande | 50 | 0 | Mixing K>0 with K=0 |\n", + "| COPA | 32 | 0 | Mixing K>0 with K=0 |\n", + "| OpenBookQA | 100 | 0 | Mixing K>0 with K=0, also uses unconditional normalization |\n", + "| BoolQ | 32 | 10 | High sensitivity to K (17% gap between 0-shot and few-shot in GPT-3) |\n", + "| CoQA | 5 | 0 | Different metric (F1 vs accuracy) |\n", + "| LAMBADA few-shot | 15 | 0 | GPT-3 uses special fill-in-blank format |\n", + "\n", + "### Tasks Not in GPT-3 Paper\n", + "\n", + "These CORE tasks simply don't appear in GPT-3 (many didn't exist in 2020):\n", + "- All 6 BigBench tasks (Dyck, Operators, CS Algorithms, Repeat Copy Logic, Language ID, QA Wikidata)\n", + "- Jeopardy, CommonsenseQA, AGI Eval LSAT-AR\n", + "- SQuAD v1 (GPT-3 uses v2)\n", + "\n", + "### Final Selected Tasks (6 tasks)" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TaskGPT-3 KCORE KMatch
0HellaSwag 0-shot00Both zero-shot
1LAMBADA00Both zero-shot
2HellaSwag 10-shot2010Both few-shot (K differs slightly)
3PIQA5010Both few-shot
4ARC Easy5010Both few-shot
5ARC Challenge5010Both few-shot
\n", + "
" + ], + "text/plain": [ + " Task GPT-3 K CORE K Match\n", + "0 HellaSwag 0-shot 0 0 Both zero-shot\n", + "1 LAMBADA 0 0 Both zero-shot\n", + "2 HellaSwag 10-shot 20 10 Both few-shot (K differs slightly)\n", + "3 PIQA 50 10 Both few-shot\n", + "4 ARC Easy 50 10 Both few-shot\n", + "5 ARC Challenge 50 10 Both few-shot" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# The 6 tasks we selected for overlap\n", + "selected_tasks = pd.DataFrame([\n", + " {'Task': 'HellaSwag 0-shot', 'GPT-3 K': 0, 'CORE K': 0, 'Match': 'Both zero-shot'},\n", + " {'Task': 'LAMBADA', 'GPT-3 K': 0, 'CORE K': 0, 'Match': 'Both zero-shot'},\n", + " {'Task': 'HellaSwag 10-shot', 'GPT-3 K': 20, 'CORE K': 10, 'Match': 'Both few-shot (K differs slightly)'},\n", + " {'Task': 'PIQA', 'GPT-3 K': 50, 'CORE K': 10, 'Match': 'Both few-shot'},\n", + " {'Task': 'ARC Easy', 'GPT-3 K': 50, 'CORE K': 10, 'Match': 'Both few-shot'},\n", + " {'Task': 'ARC Challenge', 'GPT-3 K': 50, 'CORE K': 10, 'Match': 'Both few-shot'},\n", + "])\n", + "selected_tasks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Rationale for K differences:** Looking at GPT-3's own data, the effect of changing K is typically small. Here's the evidence from the GPT-3 175B model:\n", + "\n", + "| Task | 0-shot | Few-shot | K | Δ |\n", + "|------|--------|----------|---|---|\n", + "| HellaSwag | 78.9% | 79.3% | 20 | +0.4% |\n", + "| PIQA | 81.0% | 82.3% | 50 | +1.3% |\n", + "| ARC Easy | 68.8% | 70.1% | 50 | +1.3% |\n", + "| ARC Challenge | 51.4% | 51.5% | 50 | +0.1% |\n", + "| Winograd | 88.3% | 88.6% | 7 | +0.3% |\n", + "| COPA | 91.0% | 92.0% | 32 | +1.0% |\n", + "\n", + "For the tasks in this table, the gap between 0-shot and few-shot (with K=7-50) is only 0.1-1.3 percentage points.
This suggests that differences between K=10 and K=50 would be even smaller, making our task selection reasonable.\n", + "\n", + "**Note:** Some tasks show larger sensitivity (Winogrande: +7.5%, BoolQ: +17%), which is why we excluded them." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 3: Calibration Data (GPT-2 Family)\n", + "\n", + "We have actual CORE scores for all 4 GPT-2 models. These serve as our calibration data." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "# Random baselines for centering (from CORE specification)\n", + "BASELINES = {\n", + " 'hellaswag_zeroshot': 0.25,\n", + " 'lambada_openai': 0.0,\n", + " 'hellaswag': 0.25,\n", + " 'piqa': 0.50,\n", + " 'arc_easy': 0.25,\n", + " 'arc_challenge': 0.25,\n", + "}\n", + "\n", + "TASK_ORDER = ['hellaswag_zeroshot', 'lambada_openai', 'hellaswag', 'piqa', 'arc_easy', 'arc_challenge']\n", + "TASK_NAMES = ['HellaSwag 0-shot', 'LAMBADA', 'HellaSwag 10-shot', 'PIQA', 'ARC Easy', 'ARC Challenge']\n", + "\n", + "def center_accuracy(acc, baseline):\n", + " \"\"\"Convert raw accuracy to centered accuracy.\"\"\"\n", + " return (acc - baseline) / (1.0 - baseline)\n", + "\n", + "def parse_csv(filepath):\n", + " \"\"\"Parse a CORE results CSV file.\"\"\"\n", + " results = {}\n", + " with open(filepath) as f:\n", + " for line in f:\n", + " parts = [p.strip() for p in line.strip().split(',')]\n", + " if len(parts) >= 3 and parts[0] != 'Task':\n", + " task = parts[0]\n", + " try:\n", + " acc = float(parts[1]) if parts[1] else None\n", + " centered = float(parts[2]) if parts[2] else None\n", + " results[task] = {'accuracy': acc, 'centered': centered}\n", + " except ValueError:\n", + " pass\n", + " return results" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT-2 Family: Raw Accuracies and CORE Scores\n" + ] + 
}, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelParamsHellaSwag 0-shotLAMBADAHellaSwag 10-shotPIQAARC EasyARC ChallengeCORE
0GPT-2124M30.9%32.3%30.8%62.3%41.2%22.2%0.1139
1GPT-2 Medium355M39.0%42.6%39.5%67.0%48.0%26.2%0.1849
2GPT-2 Large774M44.0%48.8%44.4%69.8%53.5%26.4%0.2146
3GPT-2 XL1558M50.2%52.3%51.2%72.5%59.5%29.9%0.2565
\n", + "
" + ], + "text/plain": [ + " Model Params HellaSwag 0-shot LAMBADA HellaSwag 10-shot PIQA \\\n", + "0 GPT-2 124M 30.9% 32.3% 30.8% 62.3% \n", + "1 GPT-2 Medium 355M 39.0% 42.6% 39.5% 67.0% \n", + "2 GPT-2 Large 774M 44.0% 48.8% 44.4% 69.8% \n", + "3 GPT-2 XL 1558M 50.2% 52.3% 51.2% 72.5% \n", + "\n", + " ARC Easy ARC Challenge CORE \n", + "0 41.2% 22.2% 0.1139 \n", + "1 48.0% 26.2% 0.1849 \n", + "2 53.5% 26.4% 0.2146 \n", + "3 59.5% 29.9% 0.2565 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Load GPT-2 CORE results\n", + "knowledge_dir = Path(\"/home/ubuntu/.cache/nanochat/eval_bundle\")\n", + "\n", + "gpt2_models = [\n", + " ('GPT-2', 'openai-community-gpt2.csv', 124e6),\n", + " ('GPT-2 Medium', 'openai-community-gpt2-medium.csv', 355e6),\n", + " ('GPT-2 Large', 'openai-community-gpt2-large.csv', 774e6),\n", + " ('GPT-2 XL', 'openai-community-gpt2-xl.csv', 1558e6),\n", + "]\n", + "\n", + "gpt2_data = []\n", + "for name, filename, params in gpt2_models:\n", + " results = parse_csv(knowledge_dir / filename)\n", + " core = results['CORE']['centered']\n", + " task_accs = [results[task]['accuracy'] for task in TASK_ORDER]\n", + " gpt2_data.append({\n", + " 'name': name,\n", + " 'params': params,\n", + " 'task_accs': task_accs,\n", + " 'core': core,\n", + " })\n", + "\n", + "# Display as DataFrame\n", + "gpt2_df = pd.DataFrame([\n", + " {\n", + " 'Model': d['name'],\n", + " 'Params': f\"{d['params']/1e6:.0f}M\",\n", + " **{name: f\"{acc:.1%}\" for name, acc in zip(TASK_NAMES, d['task_accs'])},\n", + " 'CORE': f\"{d['core']:.4f}\"\n", + " }\n", + " for d in gpt2_data\n", + "])\n", + "print(\"GPT-2 Family: Raw Accuracies and CORE Scores\")\n", + "gpt2_df" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT-2 Family: Centered Accuracies\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HellaSwag 0-shotLAMBADAHellaSwag 10-shotPIQAARC EasyARC ChallengeMeanCORE
GPT-20.07800.32290.07720.24590.2166-0.03750.15050.1139
GPT-2 Medium0.18670.42600.19330.34000.30670.01600.24480.1849
GPT-2 Large0.25330.48800.25870.39600.38000.01870.29910.2146
GPT-2 XL0.33600.52300.34930.45000.46000.06530.36390.2565
\n", + "
" + ], + "text/plain": [ + " HellaSwag 0-shot LAMBADA HellaSwag 10-shot PIQA ARC Easy \\\n", + "GPT-2 0.0780 0.3229 0.0772 0.2459 0.2166 \n", + "GPT-2 Medium 0.1867 0.4260 0.1933 0.3400 0.3067 \n", + "GPT-2 Large 0.2533 0.4880 0.2587 0.3960 0.3800 \n", + "GPT-2 XL 0.3360 0.5230 0.3493 0.4500 0.4600 \n", + "\n", + " ARC Challenge Mean CORE \n", + "GPT-2 -0.0375 0.1505 0.1139 \n", + "GPT-2 Medium 0.0160 0.2448 0.1849 \n", + "GPT-2 Large 0.0187 0.2991 0.2146 \n", + "GPT-2 XL 0.0653 0.3639 0.2565 " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Build feature matrix (centered accuracies)\n", + "X_gpt2 = []\n", + "y_gpt2 = []\n", + "\n", + "for data in gpt2_data:\n", + " centered_accs = []\n", + " for task, acc in zip(TASK_ORDER, data['task_accs']):\n", + " centered = center_accuracy(acc, BASELINES[task])\n", + " centered_accs.append(centered)\n", + " X_gpt2.append(centered_accs)\n", + " y_gpt2.append(data['core'])\n", + "\n", + "X_gpt2 = np.array(X_gpt2)\n", + "y_gpt2 = np.array(y_gpt2)\n", + "\n", + "# Display centered accuracies\n", + "centered_df = pd.DataFrame(\n", + " X_gpt2,\n", + " columns=TASK_NAMES,\n", + " index=[d['name'] for d in gpt2_data]\n", + ")\n", + "centered_df['Mean'] = X_gpt2.mean(axis=1)\n", + "centered_df['CORE'] = y_gpt2\n", + "print(\"GPT-2 Family: Centered Accuracies\")\n", + "centered_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Observation:** The mean of the 6 centered accuracies is consistently higher than the actual CORE score. This makes sense because CORE includes 16 additional tasks (many quite difficult) that pull down the average." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 4: GPT-3 Data\n", + "\n", + "We extract the 6 task accuracies from the GPT-3 paper's Appendix H (master results table).\n", + "\n", + "**Source:** Table H.1 in \"Language Models are Few-Shot Learners\" (Brown et al., 2020)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT-3 Family: Raw Accuracies from Paper\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelParamsHellaSwag 0-shotLAMBADAHellaSwag 10-shotPIQAARC EasyARC Challenge
0GPT-3 Small125M33.7%42.7%33.5%64.3%42.7%25.5%
1GPT-3 Medium350M43.6%54.3%43.1%69.4%51.0%28.4%
2GPT-3 Large760M51.0%60.4%51.3%72.0%58.1%32.3%
3GPT-3 XL1.3B54.7%63.6%54.9%74.3%59.1%36.7%
4GPT-3 2.7B2.7B62.8%67.1%62.9%75.4%62.1%39.5%
5GPT-3 6.7B6.7B67.4%70.3%67.3%77.8%65.8%43.7%
6GPT-3 13B13.0B70.9%72.5%71.3%79.9%69.1%44.8%
7GPT-3 175B175.0B78.9%76.2%79.3%82.3%70.1%51.5%
\n", + "
" + ], + "text/plain": [ + " Model Params HellaSwag 0-shot LAMBADA HellaSwag 10-shot PIQA \\\n", + "0 GPT-3 Small 125M 33.7% 42.7% 33.5% 64.3% \n", + "1 GPT-3 Medium 350M 43.6% 54.3% 43.1% 69.4% \n", + "2 GPT-3 Large 760M 51.0% 60.4% 51.3% 72.0% \n", + "3 GPT-3 XL 1.3B 54.7% 63.6% 54.9% 74.3% \n", + "4 GPT-3 2.7B 2.7B 62.8% 67.1% 62.9% 75.4% \n", + "5 GPT-3 6.7B 6.7B 67.4% 70.3% 67.3% 77.8% \n", + "6 GPT-3 13B 13.0B 70.9% 72.5% 71.3% 79.9% \n", + "7 GPT-3 175B 175.0B 78.9% 76.2% 79.3% 82.3% \n", + "\n", + " ARC Easy ARC Challenge \n", + "0 42.7% 25.5% \n", + "1 51.0% 28.4% \n", + "2 58.1% 32.3% \n", + "3 59.1% 36.7% \n", + "4 62.1% 39.5% \n", + "5 65.8% 43.7% \n", + "6 69.1% 44.8% \n", + "7 70.1% 51.5% " + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# GPT-3 accuracies from the paper\n", + "# Format: [hellaswag_0shot, lambada_0shot, hellaswag_fewshot, piqa_fewshot, arc_easy_fewshot, arc_challenge_fewshot]\n", + "gpt3_models = [\n", + " ('GPT-3 Small', 125e6, [0.337, 0.427, 0.335, 0.643, 0.427, 0.255]),\n", + " ('GPT-3 Medium', 350e6, [0.436, 0.543, 0.431, 0.694, 0.510, 0.284]),\n", + " ('GPT-3 Large', 760e6, [0.510, 0.604, 0.513, 0.720, 0.581, 0.323]),\n", + " ('GPT-3 XL', 1.3e9, [0.547, 0.636, 0.549, 0.743, 0.591, 0.367]),\n", + " ('GPT-3 2.7B', 2.7e9, [0.628, 0.671, 0.629, 0.754, 0.621, 0.395]),\n", + " ('GPT-3 6.7B', 6.7e9, [0.674, 0.703, 0.673, 0.778, 0.658, 0.437]),\n", + " ('GPT-3 13B', 13e9, [0.709, 0.725, 0.713, 0.799, 0.691, 0.448]),\n", + " ('GPT-3 175B', 175e9, [0.789, 0.762, 0.793, 0.823, 0.701, 0.515]),\n", + "]\n", + "\n", + "# Display raw accuracies\n", + "gpt3_df = pd.DataFrame([\n", + " {\n", + " 'Model': name,\n", + " 'Params': f\"{params/1e9:.1f}B\" if params >= 1e9 else f\"{params/1e6:.0f}M\",\n", + " **{task_name: f\"{acc:.1%}\" for task_name, acc in zip(TASK_NAMES, accs)}\n", + " }\n", + " for name, params, accs in gpt3_models\n", + "])\n", + "print(\"GPT-3 Family: Raw Accuracies 
from Paper\")\n", + "gpt3_df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT-3 Family: Centered Accuracies\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HellaSwag 0-shotLAMBADAHellaSwag 10-shotPIQAARC EasyARC ChallengeMean
GPT-3 Small0.11600.4270.11330.2860.23600.00670.1975
GPT-3 Medium0.24800.5430.24130.3880.34670.04530.3021
GPT-3 Large0.34670.6040.35070.4400.44130.09730.3800
GPT-3 XL0.39600.6360.39870.4860.45470.15600.4212
GPT-3 2.7B0.50400.6710.50530.5080.49470.19330.4794
GPT-3 6.7B0.56530.7030.56400.5560.54400.24930.5303
GPT-3 13B0.61200.7250.61730.5980.58800.26400.5674
GPT-3 175B0.71870.7620.72400.6460.60130.35330.6342
\n", + "
" + ], + "text/plain": [ + " HellaSwag 0-shot LAMBADA HellaSwag 10-shot PIQA ARC Easy \\\n", + "GPT-3 Small 0.1160 0.427 0.1133 0.286 0.2360 \n", + "GPT-3 Medium 0.2480 0.543 0.2413 0.388 0.3467 \n", + "GPT-3 Large 0.3467 0.604 0.3507 0.440 0.4413 \n", + "GPT-3 XL 0.3960 0.636 0.3987 0.486 0.4547 \n", + "GPT-3 2.7B 0.5040 0.671 0.5053 0.508 0.4947 \n", + "GPT-3 6.7B 0.5653 0.703 0.5640 0.556 0.5440 \n", + "GPT-3 13B 0.6120 0.725 0.6173 0.598 0.5880 \n", + "GPT-3 175B 0.7187 0.762 0.7240 0.646 0.6013 \n", + "\n", + " ARC Challenge Mean \n", + "GPT-3 Small 0.0067 0.1975 \n", + "GPT-3 Medium 0.0453 0.3021 \n", + "GPT-3 Large 0.0973 0.3800 \n", + "GPT-3 XL 0.1560 0.4212 \n", + "GPT-3 2.7B 0.1933 0.4794 \n", + "GPT-3 6.7B 0.2493 0.5303 \n", + "GPT-3 13B 0.2640 0.5674 \n", + "GPT-3 175B 0.3533 0.6342 " + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compute centered accuracies for GPT-3\n", + "X_gpt3 = []\n", + "for name, params, accs in gpt3_models:\n", + " centered_accs = [center_accuracy(acc, BASELINES[task]) for task, acc in zip(TASK_ORDER, accs)]\n", + " X_gpt3.append(centered_accs)\n", + "\n", + "X_gpt3 = np.array(X_gpt3)\n", + "\n", + "# Display\n", + "gpt3_centered_df = pd.DataFrame(\n", + " X_gpt3,\n", + " columns=TASK_NAMES,\n", + " index=[m[0] for m in gpt3_models]\n", + ")\n", + "gpt3_centered_df['Mean'] = X_gpt3.mean(axis=1)\n", + "print(\"GPT-3 Family: Centered Accuracies\")\n", + "gpt3_centered_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 5: Regression Models\n", + "\n", + "We fit two types of models:\n", + "\n", + "1. **Simple Approach**: Average the 6 centered accuracies, then fit a linear regression to CORE\n", + "2. **Multivariate Approach**: Use all 6 features with Ridge regularization\n", + "\n", + "### Why Regularization?\n", + "\n", + "We only have 4 calibration points (GPT-2 models) but 6 features + 1 intercept = 7 parameters. 
Without regularization the fit is underdetermined (more parameters than data points): infinitely many weight vectors fit the calibration points perfectly, and the particular one the solver returns is unstable and extreme. Ridge regression shrinks weights toward zero, which picks a stable solution and prevents overfitting." + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def simple_linear_regression(x, y):\n", + " \"\"\"Simple 1D linear regression: y = a*x + b\"\"\"\n", + " mean_x, mean_y = np.mean(x), np.mean(y)\n", + " a = np.sum((x - mean_x) * (y - mean_y)) / np.sum((x - mean_x) ** 2)\n", + " b = mean_y - a * mean_x\n", + " return a, b\n", + "\n", + "def ridge_regression(X, y, alpha=0.1):\n", + " \"\"\"\n", + " Ridge regression: minimize ||Xw - y||² + α||w||²\n", + " We don't regularize the intercept.\n", + " \"\"\"\n", + " n_samples, n_features = X.shape\n", + " X_aug = np.column_stack([np.ones(n_samples), X])\n", + " reg_matrix = alpha * np.eye(n_features + 1)\n", + " reg_matrix[0, 0] = 0 # Don't regularize intercept\n", + " coeffs = np.linalg.solve(X_aug.T @ X_aug + reg_matrix, X_aug.T @ y)\n", + " return coeffs[0], coeffs[1:] # intercept, weights\n", + "\n", + "def compute_r_squared(y_true, y_pred):\n", + " \"\"\"Compute R² score.\"\"\"\n", + " ss_res = np.sum((y_true - y_pred) ** 2)\n", + " ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)\n", + " return 1 - ss_res / ss_tot" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Approach 1: Simple Averaging" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Simple Model: CORE = 0.6639 × avg_centered + 0.0168\n", + "\n", + "R² = 0.9960\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelAvg CenteredPredictedActualError
0GPT-20.15050.11680.11390.0029
1GPT-2 Medium0.24480.17930.1849-0.0056
2GPT-2 Large0.29910.21540.21460.0008
3GPT-2 XL0.36390.25840.25650.0019
\n", + "
" + ], + "text/plain": [ + " Model Avg Centered Predicted Actual Error\n", + "0 GPT-2 0.1505 0.1168 0.1139 0.0029\n", + "1 GPT-2 Medium 0.2448 0.1793 0.1849 -0.0056\n", + "2 GPT-2 Large 0.2991 0.2154 0.2146 0.0008\n", + "3 GPT-2 XL 0.3639 0.2584 0.2565 0.0019" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Compute average of 6 centered accuracies\n", + "avg_centered_gpt2 = X_gpt2.mean(axis=1)\n", + "\n", + "# Fit linear regression\n", + "slope, intercept = simple_linear_regression(avg_centered_gpt2, y_gpt2)\n", + "print(f\"Simple Model: CORE = {slope:.4f} × avg_centered + {intercept:.4f}\")\n", + "\n", + "# Validate\n", + "y_pred_simple = slope * avg_centered_gpt2 + intercept\n", + "r2_simple = compute_r_squared(y_gpt2, y_pred_simple)\n", + "\n", + "validation_df = pd.DataFrame({\n", + " 'Model': [d['name'] for d in gpt2_data],\n", + " 'Avg Centered': avg_centered_gpt2,\n", + " 'Predicted': y_pred_simple,\n", + " 'Actual': y_gpt2,\n", + " 'Error': y_pred_simple - y_gpt2\n", + "})\n", + "print(f\"\\nR² = {r2_simple:.4f}\")\n", + "validation_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Result:** R² = 0.996 — excellent fit with just 2 parameters. The simple averaging approach works very well." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Approach 2: Multivariate Ridge Regression\n", + "\n", + "We try different regularization strengths (α) to find a good balance between fit and stability." + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Effect of Regularization Strength:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
α||weights||Intercept
00.0001.000010.7221-0.0829
10.0010.99710.27960.0159
20.0100.99160.24630.0269
30.1000.84480.16000.0851
41.0000.25230.03560.1686
\n", + "
" + ], + "text/plain": [ + " α R² ||weights|| Intercept\n", + "0 0.000 1.0000 10.7221 -0.0829\n", + "1 0.001 0.9971 0.2796 0.0159\n", + "2 0.010 0.9916 0.2463 0.0269\n", + "3 0.100 0.8448 0.1600 0.0851\n", + "4 1.000 0.2523 0.0356 0.1686" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Try different regularization strengths\n", + "alphas = [0.0, 0.001, 0.01, 0.1, 1.0]\n", + "\n", + "results = []\n", + "for alpha in alphas:\n", + " intercept_r, weights = ridge_regression(X_gpt2, y_gpt2, alpha=alpha)\n", + " y_pred = X_gpt2 @ weights + intercept_r\n", + " r2 = compute_r_squared(y_gpt2, y_pred)\n", + " weight_norm = np.sqrt(np.sum(weights ** 2))\n", + " results.append({\n", + " 'α': alpha,\n", + " 'R²': r2,\n", + " '||weights||': weight_norm,\n", + " 'Intercept': intercept_r,\n", + " 'Weights': weights.copy()\n", + " })\n", + "\n", + "alpha_df = pd.DataFrame([{k: v for k, v in r.items() if k != 'Weights'} for r in results])\n", + "print(\"Effect of Regularization Strength:\")\n", + "alpha_df" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Task Weights by Regularization Strength:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
HellaSwag 0-shotLAMBADAHellaSwag 10-shotPIQAARC EasyARC Challenge
α=0.06.55230.2201-8.02680.53780.91092.5364
α=0.0010.11340.14420.13050.11530.05100.1079
α=0.010.11550.10000.12260.09590.10230.0513
α=0.10.07590.06140.07980.06100.07140.0293
α=1.00.01690.01360.01780.01350.01600.0064
\n", + "
" + ], + "text/plain": [ + " HellaSwag 0-shot LAMBADA HellaSwag 10-shot PIQA ARC Easy \\\n", + "α=0.0 6.5523 0.2201 -8.0268 0.5378 0.9109 \n", + "α=0.001 0.1134 0.1442 0.1305 0.1153 0.0510 \n", + "α=0.01 0.1155 0.1000 0.1226 0.0959 0.1023 \n", + "α=0.1 0.0759 0.0614 0.0798 0.0610 0.0714 \n", + "α=1.0 0.0169 0.0136 0.0178 0.0135 0.0160 \n", + "\n", + " ARC Challenge \n", + "α=0.0 2.5364 \n", + "α=0.001 0.1079 \n", + "α=0.01 0.0513 \n", + "α=0.1 0.0293 \n", + "α=1.0 0.0064 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Show weights for each alpha\n", + "print(\"Task Weights by Regularization Strength:\")\n", + "weights_df = pd.DataFrame(\n", + " [r['Weights'] for r in results],\n", + " columns=TASK_NAMES,\n", + " index=[f\"α={r['α']}\" for r in results]\n", + ")\n", + "weights_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Observations:**\n", + "\n", + "- **α=0 (no regularization):** Perfect fit (R²=1.0) but extreme weights (about +6.6 and -8.0, norm ≈ 10.7) — clearly overfitting; with 4 points and 7 parameters the solution is not unique, so the exact values are not meaningful\n", + "- **α=0.001:** Near-perfect fit (R²=0.997) with small but uneven weights (0.05 to 0.14)\n", + "- **α=0.01:** Excellent fit (R²=0.99) with reasonable weights (~0.1 each, ~0.05 for ARC Challenge) — **good choice**\n", + "- **α=0.1:** Weaker fit (R²=0.84) with uniformly small weights (0.03 to 0.08) — conservative\n", + "- **α=1.0:** Poor fit (R²=0.25) — over-regularized" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ridge Model (α=0.01):\n", + " Intercept: 0.0269\n", + " Weights:\n", + " HellaSwag 0-shot : +0.1155\n", + " LAMBADA : +0.1000\n", + " HellaSwag 10-shot : +0.1226\n", + " PIQA : +0.0959\n", + " ARC Easy : +0.1023\n", + " ARC Challenge : +0.0513\n", + "\n", + "R² = 0.9916\n" + ] + } + ], + "source": [ + "# Use α=0.01 as our chosen regularization\n", + "# This gives R²≈0.99 with reasonable, stable weights (~0.1 each task)\n", + "CHOSEN_ALPHA =
0.01\n", + "intercept_ridge, weights_ridge = ridge_regression(X_gpt2, y_gpt2, alpha=CHOSEN_ALPHA)\n", + "\n", + "print(f\"Ridge Model (α={CHOSEN_ALPHA}):\")\n", + "print(f\" Intercept: {intercept_ridge:.4f}\")\n", + "print(f\" Weights:\")\n", + "for name, w in zip(TASK_NAMES, weights_ridge):\n", + " print(f\" {name:20s}: {w:+.4f}\")\n", + "\n", + "# Validate\n", + "y_pred_ridge = X_gpt2 @ weights_ridge + intercept_ridge\n", + "r2_ridge = compute_r_squared(y_gpt2, y_pred_ridge)\n", + "print(f\"\\nR² = {r2_ridge:.4f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Approach 3: Individual Task Analysis\n", + "\n", + "Which single task is the best predictor of CORE? We fit separate linear regressions for each task." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Individual Task Correlations with CORE:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
TaskSlopeIntercept
3PIQA0.99610.6879-0.0537
2HellaSwag 10-shot0.99330.52300.0776
0HellaSwag 0-shot0.99270.54890.0753
1LAMBADA0.98410.6792-0.1063
4ARC Easy0.98000.5728-0.0027
5ARC Challenge0.95991.39940.1706
\n", + "
" + ], + "text/plain": [ + " Task R² Slope Intercept\n", + "3 PIQA 0.9961 0.6879 -0.0537\n", + "2 HellaSwag 10-shot 0.9933 0.5230 0.0776\n", + "0 HellaSwag 0-shot 0.9927 0.5489 0.0753\n", + "1 LAMBADA 0.9841 0.6792 -0.1063\n", + "4 ARC Easy 0.9800 0.5728 -0.0027\n", + "5 ARC Challenge 0.9599 1.3994 0.1706" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Fit separate linear regression for each task\n", + "individual_results = []\n", + "for i, task_name in enumerate(TASK_NAMES):\n", + " x_task = X_gpt2[:, i]\n", + " slope_ind, intercept_ind = simple_linear_regression(x_task, y_gpt2)\n", + " y_pred_ind = slope_ind * x_task + intercept_ind\n", + " r2_ind = compute_r_squared(y_gpt2, y_pred_ind)\n", + " individual_results.append({\n", + " 'Task': task_name,\n", + " 'R²': r2_ind,\n", + " 'Slope': slope_ind,\n", + " 'Intercept': intercept_ind\n", + " })\n", + "\n", + "individual_df = pd.DataFrame(individual_results).sort_values('R²', ascending=False)\n", + "print(\"Individual Task Correlations with CORE:\")\n", + "individual_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Key Finding:** All 6 tasks have very high correlation with CORE (R² > 0.95), but **PIQA is the single best predictor** with R² = 0.9961 — actually slightly better than the simple averaging approach (R² = 0.9960)!\n", + "\n", + "This is useful if you want a quick proxy for CORE with minimal evaluation cost. However, for robustness we still recommend using all 6 tasks or the averaged approaches." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Part 6: Final Estimates for GPT-3\n", + "\n", + "We apply all three approaches to the GPT-3 data and report the average of the Simple and Ridge estimates (approaches 1 and 2) as our final estimate." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT-3 CORE Estimates (all three approaches):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelParamsSimpleRidgePIQA onlyAvg(1,2)
0GPT-3 Small125M0.14800.14880.14300.1484
1GPT-3 Medium350M0.21740.21440.21310.2159
2GPT-3 Large760M0.26910.26270.24890.2659
3GPT-3 XL1.3B0.29650.28620.28050.2914
4GPT-3 2.7B2.7B0.33510.32340.29570.3292
5GPT-3 6.7B6.7B0.36890.35340.32870.3611
6GPT-3 13B13.0B0.39350.37680.35760.3852
7GPT-3 175B175.0B0.43790.41640.39060.4272
\n", + "
" + ], + "text/plain": [ + " Model Params Simple Ridge PIQA only Avg(1,2)\n", + "0 GPT-3 Small 125M 0.1480 0.1488 0.1430 0.1484\n", + "1 GPT-3 Medium 350M 0.2174 0.2144 0.2131 0.2159\n", + "2 GPT-3 Large 760M 0.2691 0.2627 0.2489 0.2659\n", + "3 GPT-3 XL 1.3B 0.2965 0.2862 0.2805 0.2914\n", + "4 GPT-3 2.7B 2.7B 0.3351 0.3234 0.2957 0.3292\n", + "5 GPT-3 6.7B 6.7B 0.3689 0.3534 0.3287 0.3611\n", + "6 GPT-3 13B 13.0B 0.3935 0.3768 0.3576 0.3852\n", + "7 GPT-3 175B 175.0B 0.4379 0.4164 0.3906 0.4272" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Apply all three approaches\n", + "avg_centered_gpt3 = X_gpt3.mean(axis=1)\n", + "gpt3_core_simple = slope * avg_centered_gpt3 + intercept\n", + "gpt3_core_ridge = X_gpt3 @ weights_ridge + intercept_ridge\n", + "\n", + "# Approach 3: Best individual predictor (PIQA)\n", + "piqa_idx = TASK_NAMES.index('PIQA')\n", + "piqa_model = [r for r in individual_results if r['Task'] == 'PIQA'][0]\n", + "gpt3_core_piqa = piqa_model['Slope'] * X_gpt3[:, piqa_idx] + piqa_model['Intercept']\n", + "\n", + "# Average of approaches 1 and 2\n", + "gpt3_core_final = (gpt3_core_simple + gpt3_core_ridge) / 2\n", + "\n", + "# Create results table with all approaches\n", + "results_df = pd.DataFrame({\n", + " 'Model': [m[0] for m in gpt3_models],\n", + " 'Params': [f\"{m[1]/1e9:.1f}B\" if m[1] >= 1e9 else f\"{m[1]/1e6:.0f}M\" for m in gpt3_models],\n", + " 'Simple': gpt3_core_simple,\n", + " f'Ridge': gpt3_core_ridge,\n", + " 'PIQA only': gpt3_core_piqa,\n", + " 'Avg(1,2)': gpt3_core_final\n", + "})\n", + "print(\"GPT-3 CORE Estimates (all three approaches):\")\n", + "results_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Final CORE Estimates for GPT-3" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Complete CORE Scores (GPT-2 measured, 
GPT-3 estimated):\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ModelParamsCORESource
0GPT-2124M0.1139Measured
1GPT-3 Small125M0.1484Estimated
2GPT-3 Medium350M0.2159Estimated
3GPT-2 Medium355M0.1849Measured
4GPT-3 Large760M0.2659Estimated
5GPT-2 Large774M0.2146Measured
6GPT-3 XL1.3B0.2914Estimated
7GPT-2 XL1.6B0.2565Measured
8GPT-3 2.7B2.7B0.3292Estimated
9GPT-3 6.7B6.7B0.3611Estimated
10GPT-3 13B13.0B0.3852Estimated
11GPT-3 175B175.0B0.4272Estimated
\n", + "
" + ], + "text/plain": [ + " Model Params CORE Source\n", + "0 GPT-2 124M 0.1139 Measured\n", + "1 GPT-3 Small 125M 0.1484 Estimated\n", + "2 GPT-3 Medium 350M 0.2159 Estimated\n", + "3 GPT-2 Medium 355M 0.1849 Measured\n", + "4 GPT-3 Large 760M 0.2659 Estimated\n", + "5 GPT-2 Large 774M 0.2146 Measured\n", + "6 GPT-3 XL 1.3B 0.2914 Estimated\n", + "7 GPT-2 XL 1.6B 0.2565 Measured\n", + "8 GPT-3 2.7B 2.7B 0.3292 Estimated\n", + "9 GPT-3 6.7B 6.7B 0.3611 Estimated\n", + "10 GPT-3 13B 13.0B 0.3852 Estimated\n", + "11 GPT-3 175B 175.0B 0.4272 Estimated" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Combine with GPT-2 for complete picture\n", + "all_models = []\n", + "\n", + "for data in gpt2_data:\n", + " params = data['params']\n", + " all_models.append({\n", + " 'Model': data['name'],\n", + " 'Family': 'GPT-2',\n", + " 'Params': params,\n", + " 'Params_str': f\"{params/1e9:.1f}B\" if params >= 1e9 else f\"{params/1e6:.0f}M\",\n", + " 'CORE': data['core'],\n", + " 'Source': 'Measured'\n", + " })\n", + "\n", + "for (name, params, _), core in zip(gpt3_models, gpt3_core_final):\n", + " all_models.append({\n", + " 'Model': name,\n", + " 'Family': 'GPT-3',\n", + " 'Params': params,\n", + " 'Params_str': f\"{params/1e9:.1f}B\" if params >= 1e9 else f\"{params/1e6:.0f}M\",\n", + " 'CORE': core,\n", + " 'Source': 'Estimated'\n", + " })\n", + "\n", + "# Sort by params and display\n", + "all_models.sort(key=lambda x: x['Params'])\n", + "final_df = pd.DataFrame(all_models)[['Model', 'Params_str', 'CORE', 'Source']]\n", + "final_df.columns = ['Model', 'Params', 'CORE', 'Source']\n", + "print(\"Complete CORE Scores (GPT-2 measured, GPT-3 estimated):\")\n", + "final_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Head-to-Head: GPT-2 vs GPT-3 at Similar Sizes" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + 
"output_type": "stream", + "text": [ + "GPT-3 vs GPT-2 at Similar Model Sizes:\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SizeGPT-2 COREGPT-3 COREΔImprovement
0~125M0.11390.14840.0345+30.3%
1~350M0.18490.21590.0310+16.8%
2~760M0.21460.26590.0512+23.9%
3~1.3-1.5B0.25650.29140.0348+13.6%
\n", + "
" + ], + "text/plain": [ + " Size GPT-2 CORE GPT-3 CORE Δ Improvement\n", + "0 ~125M 0.1139 0.1484 0.0345 +30.3%\n", + "1 ~350M 0.1849 0.2159 0.0310 +16.8%\n", + "2 ~760M 0.2146 0.2659 0.0512 +23.9%\n", + "3 ~1.3-1.5B 0.2565 0.2914 0.0348 +13.6%" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "comparisons = [\n", + " ('~125M', 'GPT-2', gpt2_data[0]['core'], 'GPT-3 Small', gpt3_core_final[0]),\n", + " ('~350M', 'GPT-2 Medium', gpt2_data[1]['core'], 'GPT-3 Medium', gpt3_core_final[1]),\n", + " ('~760M', 'GPT-2 Large', gpt2_data[2]['core'], 'GPT-3 Large', gpt3_core_final[2]),\n", + " ('~1.3-1.5B', 'GPT-2 XL', gpt2_data[3]['core'], 'GPT-3 XL', gpt3_core_final[3]),\n", + "]\n", + "\n", + "comparison_df = pd.DataFrame([\n", + " {\n", + " 'Size': size,\n", + " 'GPT-2 CORE': gpt2_core,\n", + " 'GPT-3 CORE': gpt3_core,\n", + " 'Δ': gpt3_core - gpt2_core,\n", + " 'Improvement': f\"{100 * (gpt3_core - gpt2_core) / gpt2_core:+.1f}%\"\n", + " }\n", + " for size, _, gpt2_core, _, gpt3_core in comparisons\n", + "])\n", + "print(\"GPT-3 vs GPT-2 at Similar Model Sizes:\")\n", + "comparison_df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Conclusions\n", + "\n", + "### Methodology\n", + "\n", + "We estimated CORE scores for GPT-3 models by:\n", + "1. Identifying 6 tasks with comparable evaluation methodology between GPT-3 and CORE\n", + "2. Using GPT-2's measured CORE scores as calibration data\n", + "3. Fitting three regression approaches:\n", + " - **Simple**: Average the 6 metrics, then linear regression (R²=0.996)\n", + " - **Ridge**: Use all 6 features with regularization (R²=0.992)\n", + " - **PIQA only**: Single best predictor (R²=0.996)\n", + "4. Averaging the Simple and Ridge approaches for final estimates\n", + "\n", + "### Key Findings\n", + "\n", + "1. 
**GPT-3 consistently outperforms GPT-2 at similar model sizes** by approximately 0.03-0.05 CORE (14-30% relative improvement)\n", + "\n", + "2. **PIQA is the best single predictor of CORE** (R²=0.9961). If you need a quick proxy for CORE with minimal evaluation cost, PIQA alone works nearly as well as averaging all 6 tasks.\n", + "\n", + "3. **The improvement likely comes from:**\n", + " - More training data (300B tokens vs ~100B for GPT-2)\n", + " - Better data quality and filtering\n", + " - Larger context length (2048 vs 1024)\n", + "\n", + "4. **Final estimated CORE scores:**\n", + "\n", + "| Model | Params | Estimated CORE |\n", + "|-------|--------|----------------|\n", + "| GPT-3 Small | 125M | 0.148 |\n", + "| GPT-3 Medium | 350M | 0.216 |\n", + "| GPT-3 Large | 760M | 0.266 |\n", + "| GPT-3 XL | 1.3B | 0.291 |\n", + "| GPT-3 2.7B | 2.7B | 0.329 |\n", + "| GPT-3 6.7B | 6.7B | 0.361 |\n", + "| GPT-3 13B | 13B | 0.385 |\n", + "| GPT-3 175B | 175B | 0.427 |\n", + "\n", + "### Caveats\n", + "\n", + "1. **These are estimates**, not measured values. True CORE scores could differ.\n", + "2. We only have 4 calibration points, limiting statistical power.\n", + "3. The 6 overlapping tasks may not perfectly represent all 22 CORE tasks.\n", + "4. Slight differences in evaluation methodology (K values, splits) add uncertainty.\n", + "\n", + "Despite these limitations, the estimates are useful for approximate comparisons between nanochat models and the GPT-3 family." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Appendix: Export Final Estimates" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "GPT-3 CORE Estimates (for copy-paste):\n", + "{\n", + " \"GPT-3 Small (125M)\": 0.1484,\n", + " \"GPT-3 Medium (350M)\": 0.2159,\n", + " \"GPT-3 Large (760M)\": 0.2659,\n", + " \"GPT-3 XL (1.3B)\": 0.2914,\n", + " \"GPT-3 2.7B\": 0.3292,\n", + " \"GPT-3 6.7B\": 0.3611,\n", + " \"GPT-3 13B\": 0.3852,\n", + " \"GPT-3 175B\": 0.4272\n", + "}\n" + ] + } + ], + "source": [ + "# Export as a simple dict for use elsewhere\n", + "gpt3_core_estimates = {\n", + " 'GPT-3 Small (125M)': round(gpt3_core_final[0], 4),\n", + " 'GPT-3 Medium (350M)': round(gpt3_core_final[1], 4),\n", + " 'GPT-3 Large (760M)': round(gpt3_core_final[2], 4),\n", + " 'GPT-3 XL (1.3B)': round(gpt3_core_final[3], 4),\n", + " 'GPT-3 2.7B': round(gpt3_core_final[4], 4),\n", + " 'GPT-3 6.7B': round(gpt3_core_final[5], 4),\n", + " 'GPT-3 13B': round(gpt3_core_final[6], 4),\n", + " 'GPT-3 175B': round(gpt3_core_final[7], 4),\n", + "}\n", + "\n", + "print(\"GPT-3 CORE Estimates (for copy-paste):\")\n", + "import json\n", + "print(json.dumps(gpt3_core_estimates, indent=4))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From eec0c79563d8393ffbef8b699fe9bd29ffca4fdc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 5 Jan 2026 18:41:09 +0000 Subject: [PATCH 
002/119] also add matplotlib dep so that we can have jupyter notebooks --- pyproject.toml | 4 +- uv.lock | 1043 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 1045 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1f2234aa..36cb7ce6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,6 +7,8 @@ requires-python = ">=3.10" dependencies = [ "datasets>=4.0.0", "fastapi>=0.117.1", + "ipykernel>=7.1.0", + "matplotlib>=3.10.8", "psutil>=7.1.0", "python-dotenv>=1.2.1", "regex>=2025.9.1", @@ -14,7 +16,7 @@ dependencies = [ "setuptools>=80.9.0", "tiktoken>=0.11.0", "tokenizers>=0.22.0", - "torch>=2.8.0", + "torch>=2.9.0", "transformers>=4.57.3", "uvicorn>=0.36.0", "wandb>=0.21.3", diff --git a/uv.lock b/uv.lock index 4e02a6c9..67ea0357 100644 --- a/uv.lock +++ b/uv.lock @@ -161,6 +161,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6f/12/e5e0282d673bb9746bacfb6e2dba8719989d3660cdb2ea79aee9a9651afb/anyio-4.10.0-py3-none-any.whl", hash = "sha256:60e474ac86736bbfd6f210f7a61218939c318f43f9972497381f1c5e930ed3d1", size = 107213, upload-time = "2025-08-04T08:54:24.882Z" }, ] +[[package]] +name = "appnope" +version = "0.1.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/35/5d/752690df9ef5b76e169e68d6a129fa6d08a7100ca7f754c89495db3c6019/appnope-0.1.4.tar.gz", hash = "sha256:1de3860566df9caf38f01f86f65e0e13e379af54f9e4bee1e66b48f2efffd1ee", size = 4170, upload-time = "2024-02-06T09:43:11.258Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/29/5ecc3a15d5a33e31b26c11426c45c501e439cb865d0bff96315d86443b78/appnope-0.1.4-py2.py3-none-any.whl", hash = "sha256:502575ee11cd7a28c0205f379b525beefebab9d161b7c964670864014ed7213c", size = 4321, upload-time = "2024-02-06T09:43:09.663Z" }, +] + +[[package]] +name = "asttokens" +version = "3.0.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = 
"https://files.pythonhosted.org/packages/be/a5/8e3f9b6771b0b408517c82d97aed8f2036509bc247d46114925e32fe33f0/asttokens-3.0.1.tar.gz", hash = "sha256:71a4ee5de0bde6a31d64f6b13f2293ac190344478f081c3d1bccfcf5eacb0cb7", size = 62308, upload-time = "2025-11-15T16:43:48.578Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/d2/39/e7eaf1799466a4aef85b6a4fe7bd175ad2b1c6345066aa33f1f58d4b18d0/asttokens-3.0.1-py3-none-any.whl", hash = "sha256:15a3ebc0f43c2d0a50eeafea25e19046c68398e487b9f1f5b517f7c0f40f976a", size = 27047, upload-time = "2025-11-15T16:43:16.109Z" }, +] + [[package]] name = "async-timeout" version = "5.0.1" @@ -188,6 +206,88 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/48/1549795ba7742c948d2ad169c1c8cdbae65bc450d6cd753d124b17c8cd32/certifi-2025.8.3-py3-none-any.whl", hash = "sha256:f6c12493cfb1b06ba2ff328595af9350c65d6644968e5d3a2ffd78699af217a5", size = 161216, upload-time = "2025-08-03T03:07:45.777Z" }, ] +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = "implementation_name != 'PyPy' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/93/d7/516d984057745a6cd96575eea814fe1edd6646ee6efd552fb7b0921dec83/cffi-2.0.0-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:0cf2d91ecc3fcc0625c2c530fe004f82c110405f101548512cce44322fa8ac44", size = 184283, upload-time = "2025-09-08T23:22:08.01Z" }, + { url = "https://files.pythonhosted.org/packages/9e/84/ad6a0b408daa859246f57c03efd28e5dd1b33c21737c2db84cae8c237aa5/cffi-2.0.0-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:f73b96c41e3b2adedc34a7356e64c8eb96e03a3782b535e043a986276ce12a49", size = 180504, upload-time = "2025-09-08T23:22:10.637Z" }, + { url = "https://files.pythonhosted.org/packages/50/bd/b1a6362b80628111e6653c961f987faa55262b4002fcec42308cad1db680/cffi-2.0.0-cp310-cp310-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:53f77cbe57044e88bbd5ed26ac1d0514d2acf0591dd6bb02a3ae37f76811b80c", size = 208811, upload-time = "2025-09-08T23:22:12.267Z" }, + { url = "https://files.pythonhosted.org/packages/4f/27/6933a8b2562d7bd1fb595074cf99cc81fc3789f6a6c05cdabb46284a3188/cffi-2.0.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:3e837e369566884707ddaf85fc1744b47575005c0a229de3327f8f9a20f4efeb", size = 216402, upload-time = "2025-09-08T23:22:13.455Z" }, + { url = "https://files.pythonhosted.org/packages/05/eb/b86f2a2645b62adcfff53b0dd97e8dfafb5c8aa864bd0d9a2c2049a0d551/cffi-2.0.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5eda85d6d1879e692d546a078b44251cdd08dd1cfb98dfb77b670c97cee49ea0", size = 203217, upload-time = "2025-09-08T23:22:14.596Z" }, + { url = "https://files.pythonhosted.org/packages/9f/e0/6cbe77a53acf5acc7c08cc186c9928864bd7c005f9efd0d126884858a5fe/cffi-2.0.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9332088d75dc3241c702d852d4671613136d90fa6881da7d770a483fd05248b4", size = 203079, upload-time = "2025-09-08T23:22:15.769Z" }, + { url = "https://files.pythonhosted.org/packages/98/29/9b366e70e243eb3d14a5cb488dfd3a0b6b2f1fb001a203f653b93ccfac88/cffi-2.0.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:fc7de24befaeae77ba923797c7c87834c73648a05a4bde34b3b7e5588973a453", size = 216475, upload-time = "2025-09-08T23:22:17.427Z" }, + { url = "https://files.pythonhosted.org/packages/21/7a/13b24e70d2f90a322f2900c5d8e1f14fa7e2a6b3332b7309ba7b2ba51a5a/cffi-2.0.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:cf364028c016c03078a23b503f02058f1814320a56ad535686f90565636a9495", size = 218829, upload-time = "2025-09-08T23:22:19.069Z" }, + { url = "https://files.pythonhosted.org/packages/60/99/c9dc110974c59cc981b1f5b66e1d8af8af764e00f0293266824d9c4254bc/cffi-2.0.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:e11e82b744887154b182fd3e7e8512418446501191994dbf9c9fc1f32cc8efd5", size = 211211, upload-time = "2025-09-08T23:22:20.588Z" }, + { url = "https://files.pythonhosted.org/packages/49/72/ff2d12dbf21aca1b32a40ed792ee6b40f6dc3a9cf1644bd7ef6e95e0ac5e/cffi-2.0.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:8ea985900c5c95ce9db1745f7933eeef5d314f0565b27625d9a10ec9881e1bfb", size = 218036, upload-time = "2025-09-08T23:22:22.143Z" }, + { url = "https://files.pythonhosted.org/packages/e2/cc/027d7fb82e58c48ea717149b03bcadcbdc293553edb283af792bd4bcbb3f/cffi-2.0.0-cp310-cp310-win32.whl", hash = "sha256:1f72fb8906754ac8a2cc3f9f5aaa298070652a0ffae577e0ea9bd480dc3c931a", size = 172184, upload-time = "2025-09-08T23:22:23.328Z" }, + { url = "https://files.pythonhosted.org/packages/33/fa/072dd15ae27fbb4e06b437eb6e944e75b068deb09e2a2826039e49ee2045/cffi-2.0.0-cp310-cp310-win_amd64.whl", hash = "sha256:b18a3ed7d5b3bd8d9ef7a8cb226502c6bf8308df1525e1cc676c3680e7176739", size = 182790, upload-time = "2025-09-08T23:22:24.752Z" }, + { url = "https://files.pythonhosted.org/packages/12/4a/3dfd5f7850cbf0d06dc84ba9aa00db766b52ca38d8b86e3a38314d52498c/cffi-2.0.0-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:b4c854ef3adc177950a8dfc81a86f5115d2abd545751a304c5bcf2c2c7283cfe", size = 184344, upload-time = "2025-09-08T23:22:26.456Z" }, + { url = "https://files.pythonhosted.org/packages/4f/8b/f0e4c441227ba756aafbe78f117485b25bb26b1c059d01f137fa6d14896b/cffi-2.0.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2de9a304e27f7596cd03d16f1b7c72219bd944e99cc52b84d0145aefb07cbd3c", size = 180560, upload-time = "2025-09-08T23:22:28.197Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/b7/1200d354378ef52ec227395d95c2576330fd22a869f7a70e88e1447eb234/cffi-2.0.0-cp311-cp311-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:baf5215e0ab74c16e2dd324e8ec067ef59e41125d3eade2b863d294fd5035c92", size = 209613, upload-time = "2025-09-08T23:22:29.475Z" }, + { url = "https://files.pythonhosted.org/packages/b8/56/6033f5e86e8cc9bb629f0077ba71679508bdf54a9a5e112a3c0b91870332/cffi-2.0.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:730cacb21e1bdff3ce90babf007d0a0917cc3e6492f336c2f0134101e0944f93", size = 216476, upload-time = "2025-09-08T23:22:31.063Z" }, + { url = "https://files.pythonhosted.org/packages/dc/7f/55fecd70f7ece178db2f26128ec41430d8720f2d12ca97bf8f0a628207d5/cffi-2.0.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:6824f87845e3396029f3820c206e459ccc91760e8fa24422f8b0c3d1731cbec5", size = 203374, upload-time = "2025-09-08T23:22:32.507Z" }, + { url = "https://files.pythonhosted.org/packages/84/ef/a7b77c8bdc0f77adc3b46888f1ad54be8f3b7821697a7b89126e829e676a/cffi-2.0.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:9de40a7b0323d889cf8d23d1ef214f565ab154443c42737dfe52ff82cf857664", size = 202597, upload-time = "2025-09-08T23:22:34.132Z" }, + { url = "https://files.pythonhosted.org/packages/d7/91/500d892b2bf36529a75b77958edfcd5ad8e2ce4064ce2ecfeab2125d72d1/cffi-2.0.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8941aaadaf67246224cee8c3803777eed332a19d909b47e29c9842ef1e79ac26", size = 215574, upload-time = "2025-09-08T23:22:35.443Z" }, + { url = "https://files.pythonhosted.org/packages/44/64/58f6255b62b101093d5df22dcb752596066c7e89dd725e0afaed242a61be/cffi-2.0.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:a05d0c237b3349096d3981b727493e22147f934b20f6f125a3eba8f994bec4a9", size = 218971, upload-time = "2025-09-08T23:22:36.805Z" }, + { url = 
"https://files.pythonhosted.org/packages/ab/49/fa72cebe2fd8a55fbe14956f9970fe8eb1ac59e5df042f603ef7c8ba0adc/cffi-2.0.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:94698a9c5f91f9d138526b48fe26a199609544591f859c870d477351dc7b2414", size = 211972, upload-time = "2025-09-08T23:22:38.436Z" }, + { url = "https://files.pythonhosted.org/packages/0b/28/dd0967a76aab36731b6ebfe64dec4e981aff7e0608f60c2d46b46982607d/cffi-2.0.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:5fed36fccc0612a53f1d4d9a816b50a36702c28a2aa880cb8a122b3466638743", size = 217078, upload-time = "2025-09-08T23:22:39.776Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/015b25184413d7ab0a410775fdb4a50fca20f5589b5dab1dbbfa3baad8ce/cffi-2.0.0-cp311-cp311-win32.whl", hash = "sha256:c649e3a33450ec82378822b3dad03cc228b8f5963c0c12fc3b1e0ab940f768a5", size = 172076, upload-time = "2025-09-08T23:22:40.95Z" }, + { url = "https://files.pythonhosted.org/packages/ae/8f/dc5531155e7070361eb1b7e4c1a9d896d0cb21c49f807a6c03fd63fc877e/cffi-2.0.0-cp311-cp311-win_amd64.whl", hash = "sha256:66f011380d0e49ed280c789fbd08ff0d40968ee7b665575489afa95c98196ab5", size = 182820, upload-time = "2025-09-08T23:22:42.463Z" }, + { url = "https://files.pythonhosted.org/packages/95/5c/1b493356429f9aecfd56bc171285a4c4ac8697f76e9bbbbb105e537853a1/cffi-2.0.0-cp311-cp311-win_arm64.whl", hash = "sha256:c6638687455baf640e37344fe26d37c404db8b80d037c3d29f58fe8d1c3b194d", size = 177635, upload-time = "2025-09-08T23:22:43.623Z" }, + { url = "https://files.pythonhosted.org/packages/ea/47/4f61023ea636104d4f16ab488e268b93008c3d0bb76893b1b31db1f96802/cffi-2.0.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6d02d6655b0e54f54c4ef0b94eb6be0607b70853c45ce98bd278dc7de718be5d", size = 185271, upload-time = "2025-09-08T23:22:44.795Z" }, + { url = "https://files.pythonhosted.org/packages/df/a2/781b623f57358e360d62cdd7a8c681f074a71d445418a776eef0aadb4ab4/cffi-2.0.0-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:8eca2a813c1cb7ad4fb74d368c2ffbbb4789d377ee5bb8df98373c2cc0dee76c", size = 181048, upload-time = "2025-09-08T23:22:45.938Z" }, + { url = "https://files.pythonhosted.org/packages/ff/df/a4f0fbd47331ceeba3d37c2e51e9dfc9722498becbeec2bd8bc856c9538a/cffi-2.0.0-cp312-cp312-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:21d1152871b019407d8ac3985f6775c079416c282e431a4da6afe7aefd2bccbe", size = 212529, upload-time = "2025-09-08T23:22:47.349Z" }, + { url = "https://files.pythonhosted.org/packages/d5/72/12b5f8d3865bf0f87cf1404d8c374e7487dcf097a1c91c436e72e6badd83/cffi-2.0.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:b21e08af67b8a103c71a250401c78d5e0893beff75e28c53c98f4de42f774062", size = 220097, upload-time = "2025-09-08T23:22:48.677Z" }, + { url = "https://files.pythonhosted.org/packages/c2/95/7a135d52a50dfa7c882ab0ac17e8dc11cec9d55d2c18dda414c051c5e69e/cffi-2.0.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:1e3a615586f05fc4065a8b22b8152f0c1b00cdbc60596d187c2a74f9e3036e4e", size = 207983, upload-time = "2025-09-08T23:22:50.06Z" }, + { url = "https://files.pythonhosted.org/packages/3a/c8/15cb9ada8895957ea171c62dc78ff3e99159ee7adb13c0123c001a2546c1/cffi-2.0.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:81afed14892743bbe14dacb9e36d9e0e504cd204e0b165062c488942b9718037", size = 206519, upload-time = "2025-09-08T23:22:51.364Z" }, + { url = "https://files.pythonhosted.org/packages/78/2d/7fa73dfa841b5ac06c7b8855cfc18622132e365f5b81d02230333ff26e9e/cffi-2.0.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3e17ed538242334bf70832644a32a7aae3d83b57567f9fd60a26257e992b79ba", size = 219572, upload-time = "2025-09-08T23:22:52.902Z" }, + { url = "https://files.pythonhosted.org/packages/07/e0/267e57e387b4ca276b90f0434ff88b2c2241ad72b16d31836adddfd6031b/cffi-2.0.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = 
"sha256:3925dd22fa2b7699ed2617149842d2e6adde22b262fcbfada50e3d195e4b3a94", size = 222963, upload-time = "2025-09-08T23:22:54.518Z" }, + { url = "https://files.pythonhosted.org/packages/b6/75/1f2747525e06f53efbd878f4d03bac5b859cbc11c633d0fb81432d98a795/cffi-2.0.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2c8f814d84194c9ea681642fd164267891702542f028a15fc97d4674b6206187", size = 221361, upload-time = "2025-09-08T23:22:55.867Z" }, + { url = "https://files.pythonhosted.org/packages/7b/2b/2b6435f76bfeb6bbf055596976da087377ede68df465419d192acf00c437/cffi-2.0.0-cp312-cp312-win32.whl", hash = "sha256:da902562c3e9c550df360bfa53c035b2f241fed6d9aef119048073680ace4a18", size = 172932, upload-time = "2025-09-08T23:22:57.188Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ed/13bd4418627013bec4ed6e54283b1959cf6db888048c7cf4b4c3b5b36002/cffi-2.0.0-cp312-cp312-win_amd64.whl", hash = "sha256:da68248800ad6320861f129cd9c1bf96ca849a2771a59e0344e88681905916f5", size = 183557, upload-time = "2025-09-08T23:22:58.351Z" }, + { url = "https://files.pythonhosted.org/packages/95/31/9f7f93ad2f8eff1dbc1c3656d7ca5bfd8fb52c9d786b4dcf19b2d02217fa/cffi-2.0.0-cp312-cp312-win_arm64.whl", hash = "sha256:4671d9dd5ec934cb9a73e7ee9676f9362aba54f7f34910956b84d727b0d73fb6", size = 177762, upload-time = "2025-09-08T23:22:59.668Z" }, + { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, + { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, + { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, + { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" }, + { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" }, + { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, + { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" }, + { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" }, + { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, + { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, + { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, + { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, + { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, + { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, 
upload-time = "2025-09-08T23:23:44.61Z" }, + { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" }, + { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" }, + { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" }, + { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, + { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, + { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, + { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" }, + { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.3" @@ -273,6 +373,198 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "comm" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/4c/13/7d740c5849255756bc17888787313b61fd38a0a8304fc4f073dfc46122aa/comm-0.2.3.tar.gz", hash = "sha256:2dc8048c10962d55d7ad693be1e7045d891b7ce8d999c97963a5e3e99c055971", size = 6319, upload-time = "2025-07-25T14:02:04.452Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/60/97/891a0971e1e4a8c5d2b20bbe0e524dc04548d2307fee33cdeba148fd4fc7/comm-0.2.3-py3-none-any.whl", hash = "sha256:c615d91d75f7f04f095b30d1c1711babd43bdc6419c1be9886a85f2f4e489417", size = 7294, upload-time = "2025-07-25T14:02:02.896Z" }, +] + +[[package]] +name = "contourpy" +version = "1.3.2" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 
'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", +] +dependencies = [ + { name = "numpy", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/66/54/eb9bfc647b19f2009dd5c7f5ec51c4e6ca831725f1aea7a993034f483147/contourpy-1.3.2.tar.gz", hash = "sha256:b6945942715a034c671b7fc54f9588126b0b8bf23db2696e3ca8328f3ff0ab54", size = 13466130, upload-time = "2025-04-15T17:47:53.79Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/a3/da4153ec8fe25d263aa48c1a4cbde7f49b59af86f0b6f7862788c60da737/contourpy-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:ba38e3f9f330af820c4b27ceb4b9c7feee5fe0493ea53a8720f4792667465934", size = 268551, upload-time = "2025-04-15T17:34:46.581Z" }, + { url = "https://files.pythonhosted.org/packages/2f/6c/330de89ae1087eb622bfca0177d32a7ece50c3ef07b28002de4757d9d875/contourpy-1.3.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:dc41ba0714aa2968d1f8674ec97504a8f7e334f48eeacebcaa6256213acb0989", size = 253399, upload-time = "2025-04-15T17:34:51.427Z" }, + { url = "https://files.pythonhosted.org/packages/c1/bd/20c6726b1b7f81a8bee5271bed5c165f0a8e1f572578a9d27e2ccb763cb2/contourpy-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9be002b31c558d1ddf1b9b415b162c603405414bacd6932d031c5b5a8b757f0d", size = 312061, upload-time = "2025-04-15T17:34:55.961Z" }, + { url = 
"https://files.pythonhosted.org/packages/22/fc/a9665c88f8a2473f823cf1ec601de9e5375050f1958cbb356cdf06ef1ab6/contourpy-1.3.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8d2e74acbcba3bfdb6d9d8384cdc4f9260cae86ed9beee8bd5f54fee49a430b9", size = 351956, upload-time = "2025-04-15T17:35:00.992Z" }, + { url = "https://files.pythonhosted.org/packages/25/eb/9f0a0238f305ad8fb7ef42481020d6e20cf15e46be99a1fcf939546a177e/contourpy-1.3.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e259bced5549ac64410162adc973c5e2fb77f04df4a439d00b478e57a0e65512", size = 320872, upload-time = "2025-04-15T17:35:06.177Z" }, + { url = "https://files.pythonhosted.org/packages/32/5c/1ee32d1c7956923202f00cf8d2a14a62ed7517bdc0ee1e55301227fc273c/contourpy-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad687a04bc802cbe8b9c399c07162a3c35e227e2daccf1668eb1f278cb698631", size = 325027, upload-time = "2025-04-15T17:35:11.244Z" }, + { url = "https://files.pythonhosted.org/packages/83/bf/9baed89785ba743ef329c2b07fd0611d12bfecbedbdd3eeecf929d8d3b52/contourpy-1.3.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:cdd22595308f53ef2f891040ab2b93d79192513ffccbd7fe19be7aa773a5e09f", size = 1306641, upload-time = "2025-04-15T17:35:26.701Z" }, + { url = "https://files.pythonhosted.org/packages/d4/cc/74e5e83d1e35de2d28bd97033426b450bc4fd96e092a1f7a63dc7369b55d/contourpy-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b4f54d6a2defe9f257327b0f243612dd051cc43825587520b1bf74a31e2f6ef2", size = 1374075, upload-time = "2025-04-15T17:35:43.204Z" }, + { url = "https://files.pythonhosted.org/packages/0c/42/17f3b798fd5e033b46a16f8d9fcb39f1aba051307f5ebf441bad1ecf78f8/contourpy-1.3.2-cp310-cp310-win32.whl", hash = "sha256:f939a054192ddc596e031e50bb13b657ce318cf13d264f095ce9db7dc6ae81c0", size = 177534, upload-time = "2025-04-15T17:35:46.554Z" }, + { url = 
"https://files.pythonhosted.org/packages/54/ec/5162b8582f2c994721018d0c9ece9dc6ff769d298a8ac6b6a652c307e7df/contourpy-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:c440093bbc8fc21c637c03bafcbef95ccd963bc6e0514ad887932c18ca2a759a", size = 221188, upload-time = "2025-04-15T17:35:50.064Z" }, + { url = "https://files.pythonhosted.org/packages/b3/b9/ede788a0b56fc5b071639d06c33cb893f68b1178938f3425debebe2dab78/contourpy-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6a37a2fb93d4df3fc4c0e363ea4d16f83195fc09c891bc8ce072b9d084853445", size = 269636, upload-time = "2025-04-15T17:35:54.473Z" }, + { url = "https://files.pythonhosted.org/packages/e6/75/3469f011d64b8bbfa04f709bfc23e1dd71be54d05b1b083be9f5b22750d1/contourpy-1.3.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b7cd50c38f500bbcc9b6a46643a40e0913673f869315d8e70de0438817cb7773", size = 254636, upload-time = "2025-04-15T17:35:58.283Z" }, + { url = "https://files.pythonhosted.org/packages/8d/2f/95adb8dae08ce0ebca4fd8e7ad653159565d9739128b2d5977806656fcd2/contourpy-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d6658ccc7251a4433eebd89ed2672c2ed96fba367fd25ca9512aa92a4b46c4f1", size = 313053, upload-time = "2025-04-15T17:36:03.235Z" }, + { url = "https://files.pythonhosted.org/packages/c3/a6/8ccf97a50f31adfa36917707fe39c9a0cbc24b3bbb58185577f119736cc9/contourpy-1.3.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:70771a461aaeb335df14deb6c97439973d253ae70660ca085eec25241137ef43", size = 352985, upload-time = "2025-04-15T17:36:08.275Z" }, + { url = "https://files.pythonhosted.org/packages/1d/b6/7925ab9b77386143f39d9c3243fdd101621b4532eb126743201160ffa7e6/contourpy-1.3.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:65a887a6e8c4cd0897507d814b14c54a8c2e2aa4ac9f7686292f9769fcf9a6ab", size = 323750, upload-time = "2025-04-15T17:36:13.29Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/f3/20c5d1ef4f4748e52d60771b8560cf00b69d5c6368b5c2e9311bcfa2a08b/contourpy-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3859783aefa2b8355697f16642695a5b9792e7a46ab86da1118a4a23a51a33d7", size = 326246, upload-time = "2025-04-15T17:36:18.329Z" }, + { url = "https://files.pythonhosted.org/packages/8c/e5/9dae809e7e0b2d9d70c52b3d24cba134dd3dad979eb3e5e71f5df22ed1f5/contourpy-1.3.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:eab0f6db315fa4d70f1d8ab514e527f0366ec021ff853d7ed6a2d33605cf4b83", size = 1308728, upload-time = "2025-04-15T17:36:33.878Z" }, + { url = "https://files.pythonhosted.org/packages/e2/4a/0058ba34aeea35c0b442ae61a4f4d4ca84d6df8f91309bc2d43bb8dd248f/contourpy-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:d91a3ccc7fea94ca0acab82ceb77f396d50a1f67412efe4c526f5d20264e6ecd", size = 1375762, upload-time = "2025-04-15T17:36:51.295Z" }, + { url = "https://files.pythonhosted.org/packages/09/33/7174bdfc8b7767ef2c08ed81244762d93d5c579336fc0b51ca57b33d1b80/contourpy-1.3.2-cp311-cp311-win32.whl", hash = "sha256:1c48188778d4d2f3d48e4643fb15d8608b1d01e4b4d6b0548d9b336c28fc9b6f", size = 178196, upload-time = "2025-04-15T17:36:55.002Z" }, + { url = "https://files.pythonhosted.org/packages/5e/fe/4029038b4e1c4485cef18e480b0e2cd2d755448bb071eb9977caac80b77b/contourpy-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:5ebac872ba09cb8f2131c46b8739a7ff71de28a24c869bcad554477eb089a878", size = 222017, upload-time = "2025-04-15T17:36:58.576Z" }, + { url = "https://files.pythonhosted.org/packages/34/f7/44785876384eff370c251d58fd65f6ad7f39adce4a093c934d4a67a7c6b6/contourpy-1.3.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:4caf2bcd2969402bf77edc4cb6034c7dd7c0803213b3523f111eb7460a51b8d2", size = 271580, upload-time = "2025-04-15T17:37:03.105Z" }, + { url = 
"https://files.pythonhosted.org/packages/93/3b/0004767622a9826ea3d95f0e9d98cd8729015768075d61f9fea8eeca42a8/contourpy-1.3.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:82199cb78276249796419fe36b7386bd8d2cc3f28b3bc19fe2454fe2e26c4c15", size = 255530, upload-time = "2025-04-15T17:37:07.026Z" }, + { url = "https://files.pythonhosted.org/packages/e7/bb/7bd49e1f4fa805772d9fd130e0d375554ebc771ed7172f48dfcd4ca61549/contourpy-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:106fab697af11456fcba3e352ad50effe493a90f893fca6c2ca5c033820cea92", size = 307688, upload-time = "2025-04-15T17:37:11.481Z" }, + { url = "https://files.pythonhosted.org/packages/fc/97/e1d5dbbfa170725ef78357a9a0edc996b09ae4af170927ba8ce977e60a5f/contourpy-1.3.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d14f12932a8d620e307f715857107b1d1845cc44fdb5da2bc8e850f5ceba9f87", size = 347331, upload-time = "2025-04-15T17:37:18.212Z" }, + { url = "https://files.pythonhosted.org/packages/6f/66/e69e6e904f5ecf6901be3dd16e7e54d41b6ec6ae3405a535286d4418ffb4/contourpy-1.3.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:532fd26e715560721bb0d5fc7610fce279b3699b018600ab999d1be895b09415", size = 318963, upload-time = "2025-04-15T17:37:22.76Z" }, + { url = "https://files.pythonhosted.org/packages/a8/32/b8a1c8965e4f72482ff2d1ac2cd670ce0b542f203c8e1d34e7c3e6925da7/contourpy-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f26b383144cf2d2c29f01a1e8170f50dacf0eac02d64139dcd709a8ac4eb3cfe", size = 323681, upload-time = "2025-04-15T17:37:33.001Z" }, + { url = "https://files.pythonhosted.org/packages/30/c6/12a7e6811d08757c7162a541ca4c5c6a34c0f4e98ef2b338791093518e40/contourpy-1.3.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:c49f73e61f1f774650a55d221803b101d966ca0c5a2d6d5e4320ec3997489441", size = 1308674, upload-time = "2025-04-15T17:37:48.64Z" }, + { url = 
"https://files.pythonhosted.org/packages/2a/8a/bebe5a3f68b484d3a2b8ffaf84704b3e343ef1addea528132ef148e22b3b/contourpy-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3d80b2c0300583228ac98d0a927a1ba6a2ba6b8a742463c564f1d419ee5b211e", size = 1380480, upload-time = "2025-04-15T17:38:06.7Z" }, + { url = "https://files.pythonhosted.org/packages/34/db/fcd325f19b5978fb509a7d55e06d99f5f856294c1991097534360b307cf1/contourpy-1.3.2-cp312-cp312-win32.whl", hash = "sha256:90df94c89a91b7362e1142cbee7568f86514412ab8a2c0d0fca72d7e91b62912", size = 178489, upload-time = "2025-04-15T17:38:10.338Z" }, + { url = "https://files.pythonhosted.org/packages/01/c8/fadd0b92ffa7b5eb5949bf340a63a4a496a6930a6c37a7ba0f12acb076d6/contourpy-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:8c942a01d9163e2e5cfb05cb66110121b8d07ad438a17f9e766317bcb62abf73", size = 223042, upload-time = "2025-04-15T17:38:14.239Z" }, + { url = "https://files.pythonhosted.org/packages/2e/61/5673f7e364b31e4e7ef6f61a4b5121c5f170f941895912f773d95270f3a2/contourpy-1.3.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:de39db2604ae755316cb5967728f4bea92685884b1e767b7c24e983ef5f771cb", size = 271630, upload-time = "2025-04-15T17:38:19.142Z" }, + { url = "https://files.pythonhosted.org/packages/ff/66/a40badddd1223822c95798c55292844b7e871e50f6bfd9f158cb25e0bd39/contourpy-1.3.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:3f9e896f447c5c8618f1edb2bafa9a4030f22a575ec418ad70611450720b5b08", size = 255670, upload-time = "2025-04-15T17:38:23.688Z" }, + { url = "https://files.pythonhosted.org/packages/1e/c7/cf9fdee8200805c9bc3b148f49cb9482a4e3ea2719e772602a425c9b09f8/contourpy-1.3.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:71e2bd4a1c4188f5c2b8d274da78faab884b59df20df63c34f74aa1813c4427c", size = 306694, upload-time = "2025-04-15T17:38:28.238Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/e7/ccb9bec80e1ba121efbffad7f38021021cda5be87532ec16fd96533bb2e0/contourpy-1.3.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de425af81b6cea33101ae95ece1f696af39446db9682a0b56daaa48cfc29f38f", size = 345986, upload-time = "2025-04-15T17:38:33.502Z" }, + { url = "https://files.pythonhosted.org/packages/dc/49/ca13bb2da90391fa4219fdb23b078d6065ada886658ac7818e5441448b78/contourpy-1.3.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:977e98a0e0480d3fe292246417239d2d45435904afd6d7332d8455981c408b85", size = 318060, upload-time = "2025-04-15T17:38:38.672Z" }, + { url = "https://files.pythonhosted.org/packages/c8/65/5245ce8c548a8422236c13ffcdcdada6a2a812c361e9e0c70548bb40b661/contourpy-1.3.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:434f0adf84911c924519d2b08fc10491dd282b20bdd3fa8f60fd816ea0b48841", size = 322747, upload-time = "2025-04-15T17:38:43.712Z" }, + { url = "https://files.pythonhosted.org/packages/72/30/669b8eb48e0a01c660ead3752a25b44fdb2e5ebc13a55782f639170772f9/contourpy-1.3.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:c66c4906cdbc50e9cba65978823e6e00b45682eb09adbb78c9775b74eb222422", size = 1308895, upload-time = "2025-04-15T17:39:00.224Z" }, + { url = "https://files.pythonhosted.org/packages/05/5a/b569f4250decee6e8d54498be7bdf29021a4c256e77fe8138c8319ef8eb3/contourpy-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8b7fc0cd78ba2f4695fd0a6ad81a19e7e3ab825c31b577f384aa9d7817dc3bef", size = 1379098, upload-time = "2025-04-15T17:43:29.649Z" }, + { url = "https://files.pythonhosted.org/packages/19/ba/b227c3886d120e60e41b28740ac3617b2f2b971b9f601c835661194579f1/contourpy-1.3.2-cp313-cp313-win32.whl", hash = "sha256:15ce6ab60957ca74cff444fe66d9045c1fd3e92c8936894ebd1f3eef2fff075f", size = 178535, upload-time = "2025-04-15T17:44:44.532Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/6e/2fed56cd47ca739b43e892707ae9a13790a486a3173be063681ca67d2262/contourpy-1.3.2-cp313-cp313-win_amd64.whl", hash = "sha256:e1578f7eafce927b168752ed7e22646dad6cd9bca673c60bff55889fa236ebf9", size = 223096, upload-time = "2025-04-15T17:44:48.194Z" }, + { url = "https://files.pythonhosted.org/packages/54/4c/e76fe2a03014a7c767d79ea35c86a747e9325537a8b7627e0e5b3ba266b4/contourpy-1.3.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0475b1f6604896bc7c53bb070e355e9321e1bc0d381735421a2d2068ec56531f", size = 285090, upload-time = "2025-04-15T17:43:34.084Z" }, + { url = "https://files.pythonhosted.org/packages/7b/e2/5aba47debd55d668e00baf9651b721e7733975dc9fc27264a62b0dd26eb8/contourpy-1.3.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:c85bb486e9be652314bb5b9e2e3b0d1b2e643d5eec4992c0fbe8ac71775da739", size = 268643, upload-time = "2025-04-15T17:43:38.626Z" }, + { url = "https://files.pythonhosted.org/packages/a1/37/cd45f1f051fe6230f751cc5cdd2728bb3a203f5619510ef11e732109593c/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:745b57db7758f3ffc05a10254edd3182a2a83402a89c00957a8e8a22f5582823", size = 310443, upload-time = "2025-04-15T17:43:44.522Z" }, + { url = "https://files.pythonhosted.org/packages/8b/a2/36ea6140c306c9ff6dd38e3bcec80b3b018474ef4d17eb68ceecd26675f4/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:970e9173dbd7eba9b4e01aab19215a48ee5dd3f43cef736eebde064a171f89a5", size = 349865, upload-time = "2025-04-15T17:43:49.545Z" }, + { url = "https://files.pythonhosted.org/packages/95/b7/2fc76bc539693180488f7b6cc518da7acbbb9e3b931fd9280504128bf956/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:c6c4639a9c22230276b7bffb6a850dfc8258a2521305e1faefe804d006b2e532", size = 321162, upload-time = "2025-04-15T17:43:54.203Z" }, + { url = 
"https://files.pythonhosted.org/packages/f4/10/76d4f778458b0aa83f96e59d65ece72a060bacb20cfbee46cf6cd5ceba41/contourpy-1.3.2-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc829960f34ba36aad4302e78eabf3ef16a3a100863f0d4eeddf30e8a485a03b", size = 327355, upload-time = "2025-04-15T17:44:01.025Z" }, + { url = "https://files.pythonhosted.org/packages/43/a3/10cf483ea683f9f8ab096c24bad3cce20e0d1dd9a4baa0e2093c1c962d9d/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:d32530b534e986374fc19eaa77fcb87e8a99e5431499949b828312bdcd20ac52", size = 1307935, upload-time = "2025-04-15T17:44:17.322Z" }, + { url = "https://files.pythonhosted.org/packages/78/73/69dd9a024444489e22d86108e7b913f3528f56cfc312b5c5727a44188471/contourpy-1.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e298e7e70cf4eb179cc1077be1c725b5fd131ebc81181bf0c03525c8abc297fd", size = 1372168, upload-time = "2025-04-15T17:44:33.43Z" }, + { url = "https://files.pythonhosted.org/packages/0f/1b/96d586ccf1b1a9d2004dd519b25fbf104a11589abfd05484ff12199cca21/contourpy-1.3.2-cp313-cp313t-win32.whl", hash = "sha256:d0e589ae0d55204991450bb5c23f571c64fe43adaa53f93fc902a84c96f52fe1", size = 189550, upload-time = "2025-04-15T17:44:37.092Z" }, + { url = "https://files.pythonhosted.org/packages/b0/e6/6000d0094e8a5e32ad62591c8609e269febb6e4db83a1c75ff8868b42731/contourpy-1.3.2-cp313-cp313t-win_amd64.whl", hash = "sha256:78e9253c3de756b3f6a5174d024c4835acd59eb3f8e2ca13e775dbffe1558f69", size = 238214, upload-time = "2025-04-15T17:44:40.827Z" }, + { url = "https://files.pythonhosted.org/packages/33/05/b26e3c6ecc05f349ee0013f0bb850a761016d89cec528a98193a48c34033/contourpy-1.3.2-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:fd93cc7f3139b6dd7aab2f26a90dde0aa9fc264dbf70f6740d498a70b860b82c", size = 265681, upload-time = "2025-04-15T17:44:59.314Z" }, + { url = 
"https://files.pythonhosted.org/packages/2b/25/ac07d6ad12affa7d1ffed11b77417d0a6308170f44ff20fa1d5aa6333f03/contourpy-1.3.2-pp310-pypy310_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:107ba8a6a7eec58bb475329e6d3b95deba9440667c4d62b9b6063942b61d7f16", size = 315101, upload-time = "2025-04-15T17:45:04.165Z" }, + { url = "https://files.pythonhosted.org/packages/8f/4d/5bb3192bbe9d3f27e3061a6a8e7733c9120e203cb8515767d30973f71030/contourpy-1.3.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ded1706ed0c1049224531b81128efbd5084598f18d8a2d9efae833edbd2b40ad", size = 220599, upload-time = "2025-04-15T17:45:08.456Z" }, + { url = "https://files.pythonhosted.org/packages/ff/c0/91f1215d0d9f9f343e4773ba6c9b89e8c0cc7a64a6263f21139da639d848/contourpy-1.3.2-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:5f5964cdad279256c084b69c3f412b7801e15356b16efa9d78aa974041903da0", size = 266807, upload-time = "2025-04-15T17:45:15.535Z" }, + { url = "https://files.pythonhosted.org/packages/d4/79/6be7e90c955c0487e7712660d6cead01fa17bff98e0ea275737cc2bc8e71/contourpy-1.3.2-pp311-pypy311_pp73-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:49b65a95d642d4efa8f64ba12558fcb83407e58a2dfba9d796d77b63ccfcaff5", size = 318729, upload-time = "2025-04-15T17:45:20.166Z" }, + { url = "https://files.pythonhosted.org/packages/87/68/7f46fb537958e87427d98a4074bcde4b67a70b04900cfc5ce29bc2f556c1/contourpy-1.3.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:8c5acb8dddb0752bf252e01a3035b21443158910ac16a3b0d20e7fed7d534ce5", size = 221791, upload-time = "2025-04-15T17:45:24.794Z" }, +] + +[[package]] +name = "contourpy" +version = "1.3.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 
'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", +] +dependencies = [ + { name = "numpy", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/58/01/1253e6698a07380cd31a736d248a3f2a50a7c88779a1813da27503cadc2a/contourpy-1.3.3.tar.gz", hash = "sha256:083e12155b210502d0bca491432bb04d56dc3432f95a979b429f2848c3dbe880", size = 13466174, upload-time = "2025-07-26T12:03:12.549Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/2e/c4390a31919d8a78b90e8ecf87cd4b4c4f05a5b48d05ec17db8e5404c6f4/contourpy-1.3.3-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:709a48ef9a690e1343202916450bc48b9e51c049b089c7f79a267b46cffcdaa1", size = 288773, upload-time = "2025-07-26T12:01:02.277Z" }, + { url = "https://files.pythonhosted.org/packages/0d/44/c4b0b6095fef4dc9c420e041799591e3b63e9619e3044f7f4f6c21c0ab24/contourpy-1.3.3-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:23416f38bfd74d5d28ab8429cc4d63fa67d5068bd711a85edb1c3fb0c3e2f381", size = 270149, upload-time = "2025-07-26T12:01:04.072Z" }, + { url = "https://files.pythonhosted.org/packages/30/2e/dd4ced42fefac8470661d7cb7e264808425e6c5d56d175291e93890cce09/contourpy-1.3.3-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:929ddf8c4c7f348e4c0a5a3a714b5c8542ffaa8c22954862a46ca1813b667ee7", size = 329222, upload-time = "2025-07-26T12:01:05.688Z" }, + { url = "https://files.pythonhosted.org/packages/f2/74/cc6ec2548e3d276c71389ea4802a774b7aa3558223b7bade3f25787fafc2/contourpy-1.3.3-cp311-cp311-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:9e999574eddae35f1312c2b4b717b7885d4edd6cb46700e04f7f02db454e67c1", size = 377234, upload-time = "2025-07-26T12:01:07.054Z" }, + { url = "https://files.pythonhosted.org/packages/03/b3/64ef723029f917410f75c09da54254c5f9ea90ef89b143ccadb09df14c15/contourpy-1.3.3-cp311-cp311-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:0bf67e0e3f482cb69779dd3061b534eb35ac9b17f163d851e2a547d56dba0a3a", size = 380555, upload-time = "2025-07-26T12:01:08.801Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/4b/6157f24ca425b89fe2eb7e7be642375711ab671135be21e6faa100f7448c/contourpy-1.3.3-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:51e79c1f7470158e838808d4a996fa9bac72c498e93d8ebe5119bc1e6becb0db", size = 355238, upload-time = "2025-07-26T12:01:10.319Z" }, + { url = "https://files.pythonhosted.org/packages/98/56/f914f0dd678480708a04cfd2206e7c382533249bc5001eb9f58aa693e200/contourpy-1.3.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:598c3aaece21c503615fd59c92a3598b428b2f01bfb4b8ca9c4edeecc2438620", size = 1326218, upload-time = "2025-07-26T12:01:12.659Z" }, + { url = "https://files.pythonhosted.org/packages/fb/d7/4a972334a0c971acd5172389671113ae82aa7527073980c38d5868ff1161/contourpy-1.3.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:322ab1c99b008dad206d406bb61d014cf0174df491ae9d9d0fac6a6fda4f977f", size = 1392867, upload-time = "2025-07-26T12:01:15.533Z" }, + { url = "https://files.pythonhosted.org/packages/75/3e/f2cc6cd56dc8cff46b1a56232eabc6feea52720083ea71ab15523daab796/contourpy-1.3.3-cp311-cp311-win32.whl", hash = "sha256:fd907ae12cd483cd83e414b12941c632a969171bf90fc937d0c9f268a31cafff", size = 183677, upload-time = "2025-07-26T12:01:17.088Z" }, + { url = "https://files.pythonhosted.org/packages/98/4b/9bd370b004b5c9d8045c6c33cf65bae018b27aca550a3f657cdc99acdbd8/contourpy-1.3.3-cp311-cp311-win_amd64.whl", hash = "sha256:3519428f6be58431c56581f1694ba8e50626f2dd550af225f82fb5f5814d2a42", size = 225234, upload-time = "2025-07-26T12:01:18.256Z" }, + { url = "https://files.pythonhosted.org/packages/d9/b6/71771e02c2e004450c12b1120a5f488cad2e4d5b590b1af8bad060360fe4/contourpy-1.3.3-cp311-cp311-win_arm64.whl", hash = "sha256:15ff10bfada4bf92ec8b31c62bf7c1834c244019b4a33095a68000d7075df470", size = 193123, upload-time = "2025-07-26T12:01:19.848Z" }, + { url = 
"https://files.pythonhosted.org/packages/be/45/adfee365d9ea3d853550b2e735f9d66366701c65db7855cd07621732ccfc/contourpy-1.3.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:b08a32ea2f8e42cf1d4be3169a98dd4be32bafe4f22b6c4cb4ba810fa9e5d2cb", size = 293419, upload-time = "2025-07-26T12:01:21.16Z" }, + { url = "https://files.pythonhosted.org/packages/53/3e/405b59cfa13021a56bba395a6b3aca8cec012b45bf177b0eaf7a202cde2c/contourpy-1.3.3-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:556dba8fb6f5d8742f2923fe9457dbdd51e1049c4a43fd3986a0b14a1d815fc6", size = 273979, upload-time = "2025-07-26T12:01:22.448Z" }, + { url = "https://files.pythonhosted.org/packages/d4/1c/a12359b9b2ca3a845e8f7f9ac08bdf776114eb931392fcad91743e2ea17b/contourpy-1.3.3-cp312-cp312-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92d9abc807cf7d0e047b95ca5d957cf4792fcd04e920ca70d48add15c1a90ea7", size = 332653, upload-time = "2025-07-26T12:01:24.155Z" }, + { url = "https://files.pythonhosted.org/packages/63/12/897aeebfb475b7748ea67b61e045accdfcf0d971f8a588b67108ed7f5512/contourpy-1.3.3-cp312-cp312-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:b2e8faa0ed68cb29af51edd8e24798bb661eac3bd9f65420c1887b6ca89987c8", size = 379536, upload-time = "2025-07-26T12:01:25.91Z" }, + { url = "https://files.pythonhosted.org/packages/43/8a/a8c584b82deb248930ce069e71576fc09bd7174bbd35183b7943fb1064fd/contourpy-1.3.3-cp312-cp312-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:626d60935cf668e70a5ce6ff184fd713e9683fb458898e4249b63be9e28286ea", size = 384397, upload-time = "2025-07-26T12:01:27.152Z" }, + { url = "https://files.pythonhosted.org/packages/cc/8f/ec6289987824b29529d0dfda0d74a07cec60e54b9c92f3c9da4c0ac732de/contourpy-1.3.3-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4d00e655fcef08aba35ec9610536bfe90267d7ab5ba944f7032549c55a146da1", size = 362601, upload-time = "2025-07-26T12:01:28.808Z" }, + { url = 
"https://files.pythonhosted.org/packages/05/0a/a3fe3be3ee2dceb3e615ebb4df97ae6f3828aa915d3e10549ce016302bd1/contourpy-1.3.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:451e71b5a7d597379ef572de31eeb909a87246974d960049a9848c3bc6c41bf7", size = 1331288, upload-time = "2025-07-26T12:01:31.198Z" }, + { url = "https://files.pythonhosted.org/packages/33/1d/acad9bd4e97f13f3e2b18a3977fe1b4a37ecf3d38d815333980c6c72e963/contourpy-1.3.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:459c1f020cd59fcfe6650180678a9993932d80d44ccde1fa1868977438f0b411", size = 1403386, upload-time = "2025-07-26T12:01:33.947Z" }, + { url = "https://files.pythonhosted.org/packages/cf/8f/5847f44a7fddf859704217a99a23a4f6417b10e5ab1256a179264561540e/contourpy-1.3.3-cp312-cp312-win32.whl", hash = "sha256:023b44101dfe49d7d53932be418477dba359649246075c996866106da069af69", size = 185018, upload-time = "2025-07-26T12:01:35.64Z" }, + { url = "https://files.pythonhosted.org/packages/19/e8/6026ed58a64563186a9ee3f29f41261fd1828f527dd93d33b60feca63352/contourpy-1.3.3-cp312-cp312-win_amd64.whl", hash = "sha256:8153b8bfc11e1e4d75bcb0bff1db232f9e10b274e0929de9d608027e0d34ff8b", size = 226567, upload-time = "2025-07-26T12:01:36.804Z" }, + { url = "https://files.pythonhosted.org/packages/d1/e2/f05240d2c39a1ed228d8328a78b6f44cd695f7ef47beb3e684cf93604f86/contourpy-1.3.3-cp312-cp312-win_arm64.whl", hash = "sha256:07ce5ed73ecdc4a03ffe3e1b3e3c1166db35ae7584be76f65dbbe28a7791b0cc", size = 193655, upload-time = "2025-07-26T12:01:37.999Z" }, + { url = "https://files.pythonhosted.org/packages/68/35/0167aad910bbdb9599272bd96d01a9ec6852f36b9455cf2ca67bd4cc2d23/contourpy-1.3.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:177fb367556747a686509d6fef71d221a4b198a3905fe824430e5ea0fda54eb5", size = 293257, upload-time = "2025-07-26T12:01:39.367Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/e4/7adcd9c8362745b2210728f209bfbcf7d91ba868a2c5f40d8b58f54c509b/contourpy-1.3.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:d002b6f00d73d69333dac9d0b8d5e84d9724ff9ef044fd63c5986e62b7c9e1b1", size = 274034, upload-time = "2025-07-26T12:01:40.645Z" }, + { url = "https://files.pythonhosted.org/packages/73/23/90e31ceeed1de63058a02cb04b12f2de4b40e3bef5e082a7c18d9c8ae281/contourpy-1.3.3-cp313-cp313-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:348ac1f5d4f1d66d3322420f01d42e43122f43616e0f194fc1c9f5d830c5b286", size = 334672, upload-time = "2025-07-26T12:01:41.942Z" }, + { url = "https://files.pythonhosted.org/packages/ed/93/b43d8acbe67392e659e1d984700e79eb67e2acb2bd7f62012b583a7f1b55/contourpy-1.3.3-cp313-cp313-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:655456777ff65c2c548b7c454af9c6f33f16c8884f11083244b5819cc214f1b5", size = 381234, upload-time = "2025-07-26T12:01:43.499Z" }, + { url = "https://files.pythonhosted.org/packages/46/3b/bec82a3ea06f66711520f75a40c8fc0b113b2a75edb36aa633eb11c4f50f/contourpy-1.3.3-cp313-cp313-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:644a6853d15b2512d67881586bd03f462c7ab755db95f16f14d7e238f2852c67", size = 385169, upload-time = "2025-07-26T12:01:45.219Z" }, + { url = "https://files.pythonhosted.org/packages/4b/32/e0f13a1c5b0f8572d0ec6ae2f6c677b7991fafd95da523159c19eff0696a/contourpy-1.3.3-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4debd64f124ca62069f313a9cb86656ff087786016d76927ae2cf37846b006c9", size = 362859, upload-time = "2025-07-26T12:01:46.519Z" }, + { url = "https://files.pythonhosted.org/packages/33/71/e2a7945b7de4e58af42d708a219f3b2f4cff7386e6b6ab0a0fa0033c49a9/contourpy-1.3.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:a15459b0f4615b00bbd1e91f1b9e19b7e63aea7483d03d804186f278c0af2659", size = 1332062, upload-time = "2025-07-26T12:01:48.964Z" }, + { url = 
"https://files.pythonhosted.org/packages/12/fc/4e87ac754220ccc0e807284f88e943d6d43b43843614f0a8afa469801db0/contourpy-1.3.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca0fdcd73925568ca027e0b17ab07aad764be4706d0a925b89227e447d9737b7", size = 1403932, upload-time = "2025-07-26T12:01:51.979Z" }, + { url = "https://files.pythonhosted.org/packages/a6/2e/adc197a37443f934594112222ac1aa7dc9a98faf9c3842884df9a9d8751d/contourpy-1.3.3-cp313-cp313-win32.whl", hash = "sha256:b20c7c9a3bf701366556e1b1984ed2d0cedf999903c51311417cf5f591d8c78d", size = 185024, upload-time = "2025-07-26T12:01:53.245Z" }, + { url = "https://files.pythonhosted.org/packages/18/0b/0098c214843213759692cc638fce7de5c289200a830e5035d1791d7a2338/contourpy-1.3.3-cp313-cp313-win_amd64.whl", hash = "sha256:1cadd8b8969f060ba45ed7c1b714fe69185812ab43bd6b86a9123fe8f99c3263", size = 226578, upload-time = "2025-07-26T12:01:54.422Z" }, + { url = "https://files.pythonhosted.org/packages/8a/9a/2f6024a0c5995243cd63afdeb3651c984f0d2bc727fd98066d40e141ad73/contourpy-1.3.3-cp313-cp313-win_arm64.whl", hash = "sha256:fd914713266421b7536de2bfa8181aa8c699432b6763a0ea64195ebe28bff6a9", size = 193524, upload-time = "2025-07-26T12:01:55.73Z" }, + { url = "https://files.pythonhosted.org/packages/c0/b3/f8a1a86bd3298513f500e5b1f5fd92b69896449f6cab6a146a5d52715479/contourpy-1.3.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:88df9880d507169449d434c293467418b9f6cbe82edd19284aa0409e7fdb933d", size = 306730, upload-time = "2025-07-26T12:01:57.051Z" }, + { url = "https://files.pythonhosted.org/packages/3f/11/4780db94ae62fc0c2053909b65dc3246bd7cecfc4f8a20d957ad43aa4ad8/contourpy-1.3.3-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:d06bb1f751ba5d417047db62bca3c8fde202b8c11fb50742ab3ab962c81e8216", size = 287897, upload-time = "2025-07-26T12:01:58.663Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/15/e59f5f3ffdd6f3d4daa3e47114c53daabcb18574a26c21f03dc9e4e42ff0/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e4e6b05a45525357e382909a4c1600444e2a45b4795163d3b22669285591c1ae", size = 326751, upload-time = "2025-07-26T12:02:00.343Z" }, + { url = "https://files.pythonhosted.org/packages/0f/81/03b45cfad088e4770b1dcf72ea78d3802d04200009fb364d18a493857210/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ab3074b48c4e2cf1a960e6bbeb7f04566bf36b1861d5c9d4d8ac04b82e38ba20", size = 375486, upload-time = "2025-07-26T12:02:02.128Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ba/49923366492ffbdd4486e970d421b289a670ae8cf539c1ea9a09822b371a/contourpy-1.3.3-cp313-cp313t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:6c3d53c796f8647d6deb1abe867daeb66dcc8a97e8455efa729516b997b8ed99", size = 388106, upload-time = "2025-07-26T12:02:03.615Z" }, + { url = "https://files.pythonhosted.org/packages/9f/52/5b00ea89525f8f143651f9f03a0df371d3cbd2fccd21ca9b768c7a6500c2/contourpy-1.3.3-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50ed930df7289ff2a8d7afeb9603f8289e5704755c7e5c3bbd929c90c817164b", size = 352548, upload-time = "2025-07-26T12:02:05.165Z" }, + { url = "https://files.pythonhosted.org/packages/32/1d/a209ec1a3a3452d490f6b14dd92e72280c99ae3d1e73da74f8277d4ee08f/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:4feffb6537d64b84877da813a5c30f1422ea5739566abf0bd18065ac040e120a", size = 1322297, upload-time = "2025-07-26T12:02:07.379Z" }, + { url = "https://files.pythonhosted.org/packages/bc/9e/46f0e8ebdd884ca0e8877e46a3f4e633f6c9c8c4f3f6e72be3fe075994aa/contourpy-1.3.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:2b7e9480ffe2b0cd2e787e4df64270e3a0440d9db8dc823312e2c940c167df7e", size = 1391023, upload-time = "2025-07-26T12:02:10.171Z" }, + { url = 
"https://files.pythonhosted.org/packages/b9/70/f308384a3ae9cd2209e0849f33c913f658d3326900d0ff5d378d6a1422d2/contourpy-1.3.3-cp313-cp313t-win32.whl", hash = "sha256:283edd842a01e3dcd435b1c5116798d661378d83d36d337b8dde1d16a5fc9ba3", size = 196157, upload-time = "2025-07-26T12:02:11.488Z" }, + { url = "https://files.pythonhosted.org/packages/b2/dd/880f890a6663b84d9e34a6f88cded89d78f0091e0045a284427cb6b18521/contourpy-1.3.3-cp313-cp313t-win_amd64.whl", hash = "sha256:87acf5963fc2b34825e5b6b048f40e3635dd547f590b04d2ab317c2619ef7ae8", size = 240570, upload-time = "2025-07-26T12:02:12.754Z" }, + { url = "https://files.pythonhosted.org/packages/80/99/2adc7d8ffead633234817ef8e9a87115c8a11927a94478f6bb3d3f4d4f7d/contourpy-1.3.3-cp313-cp313t-win_arm64.whl", hash = "sha256:3c30273eb2a55024ff31ba7d052dde990d7d8e5450f4bbb6e913558b3d6c2301", size = 199713, upload-time = "2025-07-26T12:02:14.4Z" }, + { url = "https://files.pythonhosted.org/packages/72/8b/4546f3ab60f78c514ffb7d01a0bd743f90de36f0019d1be84d0a708a580a/contourpy-1.3.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fde6c716d51c04b1c25d0b90364d0be954624a0ee9d60e23e850e8d48353d07a", size = 292189, upload-time = "2025-07-26T12:02:16.095Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e1/3542a9cb596cadd76fcef413f19c79216e002623158befe6daa03dbfa88c/contourpy-1.3.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:cbedb772ed74ff5be440fa8eee9bd49f64f6e3fc09436d9c7d8f1c287b121d77", size = 273251, upload-time = "2025-07-26T12:02:17.524Z" }, + { url = "https://files.pythonhosted.org/packages/b1/71/f93e1e9471d189f79d0ce2497007731c1e6bf9ef6d1d61b911430c3db4e5/contourpy-1.3.3-cp314-cp314-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:22e9b1bd7a9b1d652cd77388465dc358dafcd2e217d35552424aa4f996f524f5", size = 335810, upload-time = "2025-07-26T12:02:18.9Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/f9/e35f4c1c93f9275d4e38681a80506b5510e9327350c51f8d4a5a724d178c/contourpy-1.3.3-cp314-cp314-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a22738912262aa3e254e4f3cb079a95a67132fc5a063890e224393596902f5a4", size = 382871, upload-time = "2025-07-26T12:02:20.418Z" }, + { url = "https://files.pythonhosted.org/packages/b5/71/47b512f936f66a0a900d81c396a7e60d73419868fba959c61efed7a8ab46/contourpy-1.3.3-cp314-cp314-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:afe5a512f31ee6bd7d0dda52ec9864c984ca3d66664444f2d72e0dc4eb832e36", size = 386264, upload-time = "2025-07-26T12:02:21.916Z" }, + { url = "https://files.pythonhosted.org/packages/04/5f/9ff93450ba96b09c7c2b3f81c94de31c89f92292f1380261bd7195bea4ea/contourpy-1.3.3-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f64836de09927cba6f79dcd00fdd7d5329f3fccc633468507079c829ca4db4e3", size = 363819, upload-time = "2025-07-26T12:02:23.759Z" }, + { url = "https://files.pythonhosted.org/packages/3e/a6/0b185d4cc480ee494945cde102cb0149ae830b5fa17bf855b95f2e70ad13/contourpy-1.3.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:1fd43c3be4c8e5fd6e4f2baeae35ae18176cf2e5cced681cca908addf1cdd53b", size = 1333650, upload-time = "2025-07-26T12:02:26.181Z" }, + { url = "https://files.pythonhosted.org/packages/43/d7/afdc95580ca56f30fbcd3060250f66cedbde69b4547028863abd8aa3b47e/contourpy-1.3.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:6afc576f7b33cf00996e5c1102dc2a8f7cc89e39c0b55df93a0b78c1bd992b36", size = 1404833, upload-time = "2025-07-26T12:02:28.782Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e2/366af18a6d386f41132a48f033cbd2102e9b0cf6345d35ff0826cd984566/contourpy-1.3.3-cp314-cp314-win32.whl", hash = "sha256:66c8a43a4f7b8df8b71ee1840e4211a3c8d93b214b213f590e18a1beca458f7d", size = 189692, upload-time = "2025-07-26T12:02:30.128Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/c2/57f54b03d0f22d4044b8afb9ca0e184f8b1afd57b4f735c2fa70883dc601/contourpy-1.3.3-cp314-cp314-win_amd64.whl", hash = "sha256:cf9022ef053f2694e31d630feaacb21ea24224be1c3ad0520b13d844274614fd", size = 232424, upload-time = "2025-07-26T12:02:31.395Z" }, + { url = "https://files.pythonhosted.org/packages/18/79/a9416650df9b525737ab521aa181ccc42d56016d2123ddcb7b58e926a42c/contourpy-1.3.3-cp314-cp314-win_arm64.whl", hash = "sha256:95b181891b4c71de4bb404c6621e7e2390745f887f2a026b2d99e92c17892339", size = 198300, upload-time = "2025-07-26T12:02:32.956Z" }, + { url = "https://files.pythonhosted.org/packages/1f/42/38c159a7d0f2b7b9c04c64ab317042bb6952b713ba875c1681529a2932fe/contourpy-1.3.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:33c82d0138c0a062380332c861387650c82e4cf1747aaa6938b9b6516762e772", size = 306769, upload-time = "2025-07-26T12:02:34.2Z" }, + { url = "https://files.pythonhosted.org/packages/c3/6c/26a8205f24bca10974e77460de68d3d7c63e282e23782f1239f226fcae6f/contourpy-1.3.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:ea37e7b45949df430fe649e5de8351c423430046a2af20b1c1961cae3afcda77", size = 287892, upload-time = "2025-07-26T12:02:35.807Z" }, + { url = "https://files.pythonhosted.org/packages/66/06/8a475c8ab718ebfd7925661747dbb3c3ee9c82ac834ccb3570be49d129f4/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d304906ecc71672e9c89e87c4675dc5c2645e1f4269a5063b99b0bb29f232d13", size = 326748, upload-time = "2025-07-26T12:02:37.193Z" }, + { url = "https://files.pythonhosted.org/packages/b4/a3/c5ca9f010a44c223f098fccd8b158bb1cb287378a31ac141f04730dc49be/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:ca658cd1a680a5c9ea96dc61cdbae1e85c8f25849843aa799dfd3cb370ad4fbe", size = 375554, upload-time = "2025-07-26T12:02:38.894Z" }, + { url = 
"https://files.pythonhosted.org/packages/80/5b/68bd33ae63fac658a4145088c1e894405e07584a316738710b636c6d0333/contourpy-1.3.3-cp314-cp314t-manylinux_2_26_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ab2fd90904c503739a75b7c8c5c01160130ba67944a7b77bbf36ef8054576e7f", size = 388118, upload-time = "2025-07-26T12:02:40.642Z" }, + { url = "https://files.pythonhosted.org/packages/40/52/4c285a6435940ae25d7410a6c36bda5145839bc3f0beb20c707cda18b9d2/contourpy-1.3.3-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b7301b89040075c30e5768810bc96a8e8d78085b47d8be6e4c3f5a0b4ed478a0", size = 352555, upload-time = "2025-07-26T12:02:42.25Z" }, + { url = "https://files.pythonhosted.org/packages/24/ee/3e81e1dd174f5c7fefe50e85d0892de05ca4e26ef1c9a59c2a57e43b865a/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:2a2a8b627d5cc6b7c41a4beff6c5ad5eb848c88255fda4a8745f7e901b32d8e4", size = 1322295, upload-time = "2025-07-26T12:02:44.668Z" }, + { url = "https://files.pythonhosted.org/packages/3c/b2/6d913d4d04e14379de429057cd169e5e00f6c2af3bb13e1710bcbdb5da12/contourpy-1.3.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fd6ec6be509c787f1caf6b247f0b1ca598bef13f4ddeaa126b7658215529ba0f", size = 1391027, upload-time = "2025-07-26T12:02:47.09Z" }, + { url = "https://files.pythonhosted.org/packages/93/8a/68a4ec5c55a2971213d29a9374913f7e9f18581945a7a31d1a39b5d2dfe5/contourpy-1.3.3-cp314-cp314t-win32.whl", hash = "sha256:e74a9a0f5e3fff48fb5a7f2fd2b9b70a3fe014a67522f79b7cca4c0c7e43c9ae", size = 202428, upload-time = "2025-07-26T12:02:48.691Z" }, + { url = "https://files.pythonhosted.org/packages/fa/96/fd9f641ffedc4fa3ace923af73b9d07e869496c9cc7a459103e6e978992f/contourpy-1.3.3-cp314-cp314t-win_amd64.whl", hash = "sha256:13b68d6a62db8eafaebb8039218921399baf6e47bf85006fd8529f2a08ef33fc", size = 250331, upload-time = "2025-07-26T12:02:50.137Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/8c/469afb6465b853afff216f9528ffda78a915ff880ed58813ba4faf4ba0b6/contourpy-1.3.3-cp314-cp314t-win_arm64.whl", hash = "sha256:b7448cb5a725bb1e35ce88771b86fba35ef418952474492cf7c764059933ff8b", size = 203831, upload-time = "2025-07-26T12:02:51.449Z" }, + { url = "https://files.pythonhosted.org/packages/a5/29/8dcfe16f0107943fa92388c23f6e05cff0ba58058c4c95b00280d4c75a14/contourpy-1.3.3-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:cd5dfcaeb10f7b7f9dc8941717c6c2ade08f587be2226222c12b25f0483ed497", size = 278809, upload-time = "2025-07-26T12:02:52.74Z" }, + { url = "https://files.pythonhosted.org/packages/85/a9/8b37ef4f7dafeb335daee3c8254645ef5725be4d9c6aa70b50ec46ef2f7e/contourpy-1.3.3-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:0c1fc238306b35f246d61a1d416a627348b5cf0648648a031e14bb8705fcdfe8", size = 261593, upload-time = "2025-07-26T12:02:54.037Z" }, + { url = "https://files.pythonhosted.org/packages/0a/59/ebfb8c677c75605cc27f7122c90313fd2f375ff3c8d19a1694bda74aaa63/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:70f9aad7de812d6541d29d2bbf8feb22ff7e1c299523db288004e3157ff4674e", size = 302202, upload-time = "2025-07-26T12:02:55.947Z" }, + { url = "https://files.pythonhosted.org/packages/3c/37/21972a15834d90bfbfb009b9d004779bd5a07a0ec0234e5ba8f64d5736f4/contourpy-1.3.3-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5ed3657edf08512fc3fe81b510e35c2012fbd3081d2e26160f27ca28affec989", size = 329207, upload-time = "2025-07-26T12:02:57.468Z" }, + { url = "https://files.pythonhosted.org/packages/0c/58/bd257695f39d05594ca4ad60df5bcb7e32247f9951fd09a9b8edb82d1daa/contourpy-1.3.3-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:3d1a3799d62d45c18bafd41c5fa05120b96a28079f2393af559b843d1a966a77", size = 225315, upload-time = "2025-07-26T12:02:58.801Z" }, +] + +[[package]] +name = "cycler" +version = "0.12.1" +source = { 
registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/a9/95/a3dbbb5028f35eafb79008e7522a75244477d2838f38cbb722248dabc2a8/cycler-0.12.1.tar.gz", hash = "sha256:88bb128f02ba341da8ef447245a9e138fae777f6a23943da4540077d3601eb1c", size = 7615, upload-time = "2023-10-07T05:32:18.335Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/05/c19819d5e3d95294a6f5947fb9b9629efb316b96de511b418c53d245aae6/cycler-0.12.1-py3-none-any.whl", hash = "sha256:85cef7cff222d8644161529808465972e51340599459b8ac3ccbac5a854e0d30", size = 8321, upload-time = "2023-10-07T05:32:16.783Z" }, +] + [[package]] name = "datasets" version = "4.0.0" @@ -297,6 +589,44 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/eb/62/eb8157afb21bd229c864521c1ab4fa8e9b4f1b06bafdd8c4668a7a31b5dd/datasets-4.0.0-py3-none-any.whl", hash = "sha256:7ef95e62025fd122882dbce6cb904c8cd3fbc829de6669a5eb939c77d50e203d", size = 494825, upload-time = "2025-07-09T14:35:50.658Z" }, ] +[[package]] +name = "debugpy" +version = "1.8.19" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/73/75/9e12d4d42349b817cd545b89247696c67917aab907012ae5b64bbfea3199/debugpy-1.8.19.tar.gz", hash = "sha256:eea7e5987445ab0b5ed258093722d5ecb8bb72217c5c9b1e21f64efe23ddebdb", size = 1644590, upload-time = "2025-12-15T21:53:28.044Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/bf/98/d57054371887f37d3c959a7a8dc3c76b763acb65f5e78d849d7db7cadc5b/debugpy-1.8.19-cp310-cp310-macosx_15_0_x86_64.whl", hash = "sha256:fce6da15d73be5935b4438435c53adb512326a3e11e4f90793ea87cd9f018254", size = 2098493, upload-time = "2025-12-15T21:53:30.149Z" }, + { url = "https://files.pythonhosted.org/packages/ee/dd/c517b9aa3500157a30e4f4c4f5149f880026bd039d2b940acd2383a85d8e/debugpy-1.8.19-cp310-cp310-manylinux_2_34_x86_64.whl", hash = "sha256:e24b1652a1df1ab04d81e7ead446a91c226de704ff5dde6bd0a0dbaab07aa3f2", size = 3087875, 
upload-time = "2025-12-15T21:53:31.511Z" }, + { url = "https://files.pythonhosted.org/packages/d8/57/3d5a5b0da9b63445253107ead151eff29190c6ad7440c68d1a59d56613aa/debugpy-1.8.19-cp310-cp310-win32.whl", hash = "sha256:327cb28c3ad9e17bc925efc7f7018195fd4787c2fe4b7af1eec11f1d19bdec62", size = 5239378, upload-time = "2025-12-15T21:53:32.979Z" }, + { url = "https://files.pythonhosted.org/packages/a6/36/7f9053c4c549160c87ae7e43800138f2695578c8b65947114c97250983b6/debugpy-1.8.19-cp310-cp310-win_amd64.whl", hash = "sha256:b7dd275cf2c99e53adb9654f5ae015f70415bbe2bacbe24cfee30d54b6aa03c5", size = 5271129, upload-time = "2025-12-15T21:53:35.085Z" }, + { url = "https://files.pythonhosted.org/packages/80/e2/48531a609b5a2aa94c6b6853afdfec8da05630ab9aaa96f1349e772119e9/debugpy-1.8.19-cp311-cp311-macosx_15_0_universal2.whl", hash = "sha256:c5dcfa21de1f735a4f7ced4556339a109aa0f618d366ede9da0a3600f2516d8b", size = 2207620, upload-time = "2025-12-15T21:53:37.1Z" }, + { url = "https://files.pythonhosted.org/packages/1b/d4/97775c01d56071969f57d93928899e5616a4cfbbf4c8cc75390d3a51c4a4/debugpy-1.8.19-cp311-cp311-manylinux_2_34_x86_64.whl", hash = "sha256:806d6800246244004625d5222d7765874ab2d22f3ba5f615416cf1342d61c488", size = 3170796, upload-time = "2025-12-15T21:53:38.513Z" }, + { url = "https://files.pythonhosted.org/packages/8d/7e/8c7681bdb05be9ec972bbb1245eb7c4c7b0679bb6a9e6408d808bc876d3d/debugpy-1.8.19-cp311-cp311-win32.whl", hash = "sha256:783a519e6dfb1f3cd773a9bda592f4887a65040cb0c7bd38dde410f4e53c40d4", size = 5164287, upload-time = "2025-12-15T21:53:40.857Z" }, + { url = "https://files.pythonhosted.org/packages/f2/a8/aaac7ff12ddf5d68a39e13a423a8490426f5f661384f5ad8d9062761bd8e/debugpy-1.8.19-cp311-cp311-win_amd64.whl", hash = "sha256:14035cbdbb1fe4b642babcdcb5935c2da3b1067ac211c5c5a8fdc0bb31adbcaa", size = 5188269, upload-time = "2025-12-15T21:53:42.359Z" }, + { url = 
"https://files.pythonhosted.org/packages/4a/15/d762e5263d9e25b763b78be72dc084c7a32113a0bac119e2f7acae7700ed/debugpy-1.8.19-cp312-cp312-macosx_15_0_universal2.whl", hash = "sha256:bccb1540a49cde77edc7ce7d9d075c1dbeb2414751bc0048c7a11e1b597a4c2e", size = 2549995, upload-time = "2025-12-15T21:53:43.773Z" }, + { url = "https://files.pythonhosted.org/packages/a7/88/f7d25c68b18873b7c53d7c156ca7a7ffd8e77073aa0eac170a9b679cf786/debugpy-1.8.19-cp312-cp312-manylinux_2_34_x86_64.whl", hash = "sha256:e9c68d9a382ec754dc05ed1d1b4ed5bd824b9f7c1a8cd1083adb84b3c93501de", size = 4309891, upload-time = "2025-12-15T21:53:45.26Z" }, + { url = "https://files.pythonhosted.org/packages/c5/4f/a65e973aba3865794da65f71971dca01ae66666132c7b2647182d5be0c5f/debugpy-1.8.19-cp312-cp312-win32.whl", hash = "sha256:6599cab8a783d1496ae9984c52cb13b7c4a3bd06a8e6c33446832a5d97ce0bee", size = 5286355, upload-time = "2025-12-15T21:53:46.763Z" }, + { url = "https://files.pythonhosted.org/packages/d8/3a/d3d8b48fec96e3d824e404bf428276fb8419dfa766f78f10b08da1cb2986/debugpy-1.8.19-cp312-cp312-win_amd64.whl", hash = "sha256:66e3d2fd8f2035a8f111eb127fa508469dfa40928a89b460b41fd988684dc83d", size = 5328239, upload-time = "2025-12-15T21:53:48.868Z" }, + { url = "https://files.pythonhosted.org/packages/71/3d/388035a31a59c26f1ecc8d86af607d0c42e20ef80074147cd07b180c4349/debugpy-1.8.19-cp313-cp313-macosx_15_0_universal2.whl", hash = "sha256:91e35db2672a0abaf325f4868fcac9c1674a0d9ad9bb8a8c849c03a5ebba3e6d", size = 2538859, upload-time = "2025-12-15T21:53:50.478Z" }, + { url = "https://files.pythonhosted.org/packages/4a/19/c93a0772d0962294f083dbdb113af1a7427bb632d36e5314297068f55db7/debugpy-1.8.19-cp313-cp313-manylinux_2_34_x86_64.whl", hash = "sha256:85016a73ab84dea1c1f1dcd88ec692993bcbe4532d1b49ecb5f3c688ae50c606", size = 4292575, upload-time = "2025-12-15T21:53:51.821Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/56/09e48ab796b0a77e3d7dc250f95251832b8bf6838c9632f6100c98bdf426/debugpy-1.8.19-cp313-cp313-win32.whl", hash = "sha256:b605f17e89ba0ecee994391194285fada89cee111cfcd29d6f2ee11cbdc40976", size = 5286209, upload-time = "2025-12-15T21:53:53.602Z" }, + { url = "https://files.pythonhosted.org/packages/fb/4e/931480b9552c7d0feebe40c73725dd7703dcc578ba9efc14fe0e6d31cfd1/debugpy-1.8.19-cp313-cp313-win_amd64.whl", hash = "sha256:c30639998a9f9cd9699b4b621942c0179a6527f083c72351f95c6ab1728d5b73", size = 5328206, upload-time = "2025-12-15T21:53:55.433Z" }, + { url = "https://files.pythonhosted.org/packages/f6/b9/cbec520c3a00508327476c7fce26fbafef98f412707e511eb9d19a2ef467/debugpy-1.8.19-cp314-cp314-macosx_15_0_universal2.whl", hash = "sha256:1e8c4d1bd230067bf1bbcdbd6032e5a57068638eb28b9153d008ecde288152af", size = 2537372, upload-time = "2025-12-15T21:53:57.318Z" }, + { url = "https://files.pythonhosted.org/packages/88/5e/cf4e4dc712a141e10d58405c58c8268554aec3c35c09cdcda7535ff13f76/debugpy-1.8.19-cp314-cp314-manylinux_2_34_x86_64.whl", hash = "sha256:d40c016c1f538dbf1762936e3aeb43a89b965069d9f60f9e39d35d9d25e6b809", size = 4268729, upload-time = "2025-12-15T21:53:58.712Z" }, + { url = "https://files.pythonhosted.org/packages/82/a3/c91a087ab21f1047db328c1d3eb5d1ff0e52de9e74f9f6f6fa14cdd93d58/debugpy-1.8.19-cp314-cp314-win32.whl", hash = "sha256:0601708223fe1cd0e27c6cce67a899d92c7d68e73690211e6788a4b0e1903f5b", size = 5286388, upload-time = "2025-12-15T21:54:00.687Z" }, + { url = "https://files.pythonhosted.org/packages/17/b8/bfdc30b6e94f1eff09f2dc9cc1f9cd1c6cde3d996bcbd36ce2d9a4956e99/debugpy-1.8.19-cp314-cp314-win_amd64.whl", hash = "sha256:8e19a725f5d486f20e53a1dde2ab8bb2c9607c40c00a42ab646def962b41125f", size = 5327741, upload-time = "2025-12-15T21:54:02.148Z" }, + { url = "https://files.pythonhosted.org/packages/25/3e/e27078370414ef35fafad2c06d182110073daaeb5d3bf734b0b1eeefe452/debugpy-1.8.19-py2.py3-none-any.whl", hash = 
"sha256:360ffd231a780abbc414ba0f005dad409e71c78637efe8f2bd75837132a41d38", size = 5292321, upload-time = "2025-12-15T21:54:16.024Z" }, +] + +[[package]] +name = "decorator" +version = "5.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/43/fa/6d96a0978d19e17b68d634497769987b16c8f4cd0a7a05048bec693caa6b/decorator-5.2.1.tar.gz", hash = "sha256:65f266143752f734b0a7cc83c46f4618af75b8c5911b00ccb61d0ac9b6da0360", size = 56711, upload-time = "2025-02-24T04:41:34.073Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4e/8c/f3147f5c4b73e7550fe5f9352eaa956ae838d5c51eb58e7a25b9f3e2643b/decorator-5.2.1-py3-none-any.whl", hash = "sha256:d316bb415a2d9e2d2b3abcc4084c6502fc09240e292cd76a76afc106a1c8e04a", size = 9190, upload-time = "2025-02-24T04:41:32.565Z" }, +] + [[package]] name = "dill" version = "0.3.8" @@ -318,6 +648,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/36/f4/c6e662dade71f56cd2f3735141b265c3c79293c109549c1e6933b0651ffc/exceptiongroup-1.3.0-py3-none-any.whl", hash = "sha256:4d111e6e0c13d0644cad6ddaa7ed0261a0b36971f6d23e7ec9b4b9097da78a10", size = 16674, upload-time = "2025-05-10T17:42:49.33Z" }, ] +[[package]] +name = "executing" +version = "2.2.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cc/28/c14e053b6762b1044f34a13aab6859bbf40456d37d23aa286ac24cfd9a5d/executing-2.2.1.tar.gz", hash = "sha256:3632cc370565f6648cc328b32435bd120a1e4ebb20c77e3fdde9a13cd1e533c4", size = 1129488, upload-time = "2025-09-01T09:48:10.866Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c1/ea/53f2148663b321f21b5a606bd5f191517cf40b7072c0497d3c92c4a13b1e/executing-2.2.1-py2.py3-none-any.whl", hash = "sha256:760643d3452b4d777d295bb167ccc74c64a81df23fb5e08eff250c425a4b2017", size = 28317, upload-time = "2025-09-01T09:48:08.5Z" }, +] + [[package]] name = "fastapi" version = "0.117.1" @@ -341,6 +680,63 @@ wheels = 
[ { url = "https://files.pythonhosted.org/packages/42/14/42b2651a2f46b022ccd948bca9f2d5af0fd8929c4eec235b8d6d844fbe67/filelock-3.19.1-py3-none-any.whl", hash = "sha256:d38e30481def20772f5baf097c122c3babc4fcdb7e14e57049eb9d88c6dc017d", size = 15988, upload-time = "2025-08-14T16:56:01.633Z" }, ] +[[package]] +name = "fonttools" +version = "4.61.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/ca/cf17b88a8df95691275a3d77dc0a5ad9907f328ae53acbe6795da1b2f5ed/fonttools-4.61.1.tar.gz", hash = "sha256:6675329885c44657f826ef01d9e4fb33b9158e9d93c537d84ad8399539bc6f69", size = 3565756, upload-time = "2025-12-12T17:31:24.246Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/5b/94/8a28707adb00bed1bf22dac16ccafe60faf2ade353dcb32c3617ee917307/fonttools-4.61.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c7db70d57e5e1089a274cbb2b1fd635c9a24de809a231b154965d415d6c6d24", size = 2854799, upload-time = "2025-12-12T17:29:27.5Z" }, + { url = "https://files.pythonhosted.org/packages/94/93/c2e682faaa5ee92034818d8f8a8145ae73eb83619600495dcf8503fa7771/fonttools-4.61.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5fe9fd43882620017add5eabb781ebfbc6998ee49b35bd7f8f79af1f9f99a958", size = 2403032, upload-time = "2025-12-12T17:29:30.115Z" }, + { url = "https://files.pythonhosted.org/packages/f1/62/1748f7e7e1ee41aa52279fd2e3a6d0733dc42a673b16932bad8e5d0c8b28/fonttools-4.61.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8db08051fc9e7d8bc622f2112511b8107d8f27cd89e2f64ec45e9825e8288da", size = 4897863, upload-time = "2025-12-12T17:29:32.535Z" }, + { url = "https://files.pythonhosted.org/packages/69/69/4ca02ee367d2c98edcaeb83fc278d20972502ee071214ad9d8ca85e06080/fonttools-4.61.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a76d4cb80f41ba94a6691264be76435e5f72f2cb3cab0b092a6212855f71c2f6", size = 4859076, 
upload-time = "2025-12-12T17:29:34.907Z" }, + { url = "https://files.pythonhosted.org/packages/8c/f5/660f9e3cefa078861a7f099107c6d203b568a6227eef163dd173bfc56bdc/fonttools-4.61.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:a13fc8aeb24bad755eea8f7f9d409438eb94e82cf86b08fe77a03fbc8f6a96b1", size = 4875623, upload-time = "2025-12-12T17:29:37.33Z" }, + { url = "https://files.pythonhosted.org/packages/63/d1/9d7c5091d2276ed47795c131c1bf9316c3c1ab2789c22e2f59e0572ccd38/fonttools-4.61.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:b846a1fcf8beadeb9ea4f44ec5bdde393e2f1569e17d700bfc49cd69bde75881", size = 4993327, upload-time = "2025-12-12T17:29:39.781Z" }, + { url = "https://files.pythonhosted.org/packages/6f/2d/28def73837885ae32260d07660a052b99f0aa00454867d33745dfe49dbf0/fonttools-4.61.1-cp310-cp310-win32.whl", hash = "sha256:78a7d3ab09dc47ac1a363a493e6112d8cabed7ba7caad5f54dbe2f08676d1b47", size = 1502180, upload-time = "2025-12-12T17:29:42.217Z" }, + { url = "https://files.pythonhosted.org/packages/63/fa/bfdc98abb4dd2bd491033e85e3ba69a2313c850e759a6daa014bc9433b0f/fonttools-4.61.1-cp310-cp310-win_amd64.whl", hash = "sha256:eff1ac3cc66c2ac7cda1e64b4e2f3ffef474b7335f92fc3833fc632d595fcee6", size = 1550654, upload-time = "2025-12-12T17:29:44.564Z" }, + { url = "https://files.pythonhosted.org/packages/69/12/bf9f4eaa2fad039356cc627587e30ed008c03f1cebd3034376b5ee8d1d44/fonttools-4.61.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c6604b735bb12fef8e0efd5578c9fb5d3d8532d5001ea13a19cddf295673ee09", size = 2852213, upload-time = "2025-12-12T17:29:46.675Z" }, + { url = "https://files.pythonhosted.org/packages/ac/49/4138d1acb6261499bedde1c07f8c2605d1d8f9d77a151e5507fd3ef084b6/fonttools-4.61.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:5ce02f38a754f207f2f06557523cd39a06438ba3aafc0639c477ac409fc64e37", size = 2401689, upload-time = "2025-12-12T17:29:48.769Z" }, + { url = 
"https://files.pythonhosted.org/packages/e5/fe/e6ce0fe20a40e03aef906af60aa87668696f9e4802fa283627d0b5ed777f/fonttools-4.61.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77efb033d8d7ff233385f30c62c7c79271c8885d5c9657d967ede124671bbdfb", size = 5058809, upload-time = "2025-12-12T17:29:51.701Z" }, + { url = "https://files.pythonhosted.org/packages/79/61/1ca198af22f7dd22c17ab86e9024ed3c06299cfdb08170640e9996d501a0/fonttools-4.61.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:75c1a6dfac6abd407634420c93864a1e274ebc1c7531346d9254c0d8f6ca00f9", size = 5036039, upload-time = "2025-12-12T17:29:53.659Z" }, + { url = "https://files.pythonhosted.org/packages/99/cc/fa1801e408586b5fce4da9f5455af8d770f4fc57391cd5da7256bb364d38/fonttools-4.61.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:0de30bfe7745c0d1ffa2b0b7048fb7123ad0d71107e10ee090fa0b16b9452e87", size = 5034714, upload-time = "2025-12-12T17:29:55.592Z" }, + { url = "https://files.pythonhosted.org/packages/bf/aa/b7aeafe65adb1b0a925f8f25725e09f078c635bc22754f3fecb7456955b0/fonttools-4.61.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:58b0ee0ab5b1fc9921eccfe11d1435added19d6494dde14e323f25ad2bc30c56", size = 5158648, upload-time = "2025-12-12T17:29:57.861Z" }, + { url = "https://files.pythonhosted.org/packages/99/f9/08ea7a38663328881384c6e7777bbefc46fd7d282adfd87a7d2b84ec9d50/fonttools-4.61.1-cp311-cp311-win32.whl", hash = "sha256:f79b168428351d11e10c5aeb61a74e1851ec221081299f4cf56036a95431c43a", size = 2280681, upload-time = "2025-12-12T17:29:59.943Z" }, + { url = "https://files.pythonhosted.org/packages/07/ad/37dd1ae5fa6e01612a1fbb954f0927681f282925a86e86198ccd7b15d515/fonttools-4.61.1-cp311-cp311-win_amd64.whl", hash = "sha256:fe2efccb324948a11dd09d22136fe2ac8a97d6c1347cf0b58a911dcd529f66b7", size = 2331951, upload-time = "2025-12-12T17:30:02.254Z" }, + { url = 
"https://files.pythonhosted.org/packages/6f/16/7decaa24a1bd3a70c607b2e29f0adc6159f36a7e40eaba59846414765fd4/fonttools-4.61.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:f3cb4a569029b9f291f88aafc927dd53683757e640081ca8c412781ea144565e", size = 2851593, upload-time = "2025-12-12T17:30:04.225Z" }, + { url = "https://files.pythonhosted.org/packages/94/98/3c4cb97c64713a8cf499b3245c3bf9a2b8fd16a3e375feff2aed78f96259/fonttools-4.61.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:41a7170d042e8c0024703ed13b71893519a1a6d6e18e933e3ec7507a2c26a4b2", size = 2400231, upload-time = "2025-12-12T17:30:06.47Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/82dbef0f6342eb01f54bca073ac1498433d6ce71e50c3c3282b655733b31/fonttools-4.61.1-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:10d88e55330e092940584774ee5e8a6971b01fc2f4d3466a1d6c158230880796", size = 4954103, upload-time = "2025-12-12T17:30:08.432Z" }, + { url = "https://files.pythonhosted.org/packages/6c/44/f3aeac0fa98e7ad527f479e161aca6c3a1e47bb6996b053d45226fe37bf2/fonttools-4.61.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:15acc09befd16a0fb8a8f62bc147e1a82817542d72184acca9ce6e0aeda9fa6d", size = 5004295, upload-time = "2025-12-12T17:30:10.56Z" }, + { url = "https://files.pythonhosted.org/packages/14/e8/7424ced75473983b964d09f6747fa09f054a6d656f60e9ac9324cf40c743/fonttools-4.61.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e6bcdf33aec38d16508ce61fd81838f24c83c90a1d1b8c68982857038673d6b8", size = 4944109, upload-time = "2025-12-12T17:30:12.874Z" }, + { url = "https://files.pythonhosted.org/packages/c8/8b/6391b257fa3d0b553d73e778f953a2f0154292a7a7a085e2374b111e5410/fonttools-4.61.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:5fade934607a523614726119164ff621e8c30e8fa1ffffbbd358662056ba69f0", size = 5093598, upload-time = "2025-12-12T17:30:15.79Z" }, + { 
url = "https://files.pythonhosted.org/packages/d9/71/fd2ea96cdc512d92da5678a1c98c267ddd4d8c5130b76d0f7a80f9a9fde8/fonttools-4.61.1-cp312-cp312-win32.whl", hash = "sha256:75da8f28eff26defba42c52986de97b22106cb8f26515b7c22443ebc9c2d3261", size = 2269060, upload-time = "2025-12-12T17:30:18.058Z" }, + { url = "https://files.pythonhosted.org/packages/80/3b/a3e81b71aed5a688e89dfe0e2694b26b78c7d7f39a5ffd8a7d75f54a12a8/fonttools-4.61.1-cp312-cp312-win_amd64.whl", hash = "sha256:497c31ce314219888c0e2fce5ad9178ca83fe5230b01a5006726cdf3ac9f24d9", size = 2319078, upload-time = "2025-12-12T17:30:22.862Z" }, + { url = "https://files.pythonhosted.org/packages/4b/cf/00ba28b0990982530addb8dc3e9e6f2fa9cb5c20df2abdda7baa755e8fe1/fonttools-4.61.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:8c56c488ab471628ff3bfa80964372fc13504ece601e0d97a78ee74126b2045c", size = 2846454, upload-time = "2025-12-12T17:30:24.938Z" }, + { url = "https://files.pythonhosted.org/packages/5a/ca/468c9a8446a2103ae645d14fee3f610567b7042aba85031c1c65e3ef7471/fonttools-4.61.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:dc492779501fa723b04d0ab1f5be046797fee17d27700476edc7ee9ae535a61e", size = 2398191, upload-time = "2025-12-12T17:30:27.343Z" }, + { url = "https://files.pythonhosted.org/packages/a3/4b/d67eedaed19def5967fade3297fed8161b25ba94699efc124b14fb68cdbc/fonttools-4.61.1-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:64102ca87e84261419c3747a0d20f396eb024bdbeb04c2bfb37e2891f5fadcb5", size = 4928410, upload-time = "2025-12-12T17:30:29.771Z" }, + { url = "https://files.pythonhosted.org/packages/b0/8d/6fb3494dfe61a46258cd93d979cf4725ded4eb46c2a4ca35e4490d84daea/fonttools-4.61.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4c1b526c8d3f615a7b1867f38a9410849c8f4aef078535742198e942fba0e9bd", size = 4984460, upload-time = "2025-12-12T17:30:32.073Z" }, + { url = 
"https://files.pythonhosted.org/packages/f7/f1/a47f1d30b3dc00d75e7af762652d4cbc3dff5c2697a0dbd5203c81afd9c3/fonttools-4.61.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:41ed4b5ec103bd306bb68f81dc166e77409e5209443e5773cb4ed837bcc9b0d3", size = 4925800, upload-time = "2025-12-12T17:30:34.339Z" }, + { url = "https://files.pythonhosted.org/packages/a7/01/e6ae64a0981076e8a66906fab01539799546181e32a37a0257b77e4aa88b/fonttools-4.61.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:b501c862d4901792adaec7c25b1ecc749e2662543f68bb194c42ba18d6eec98d", size = 5067859, upload-time = "2025-12-12T17:30:36.593Z" }, + { url = "https://files.pythonhosted.org/packages/73/aa/28e40b8d6809a9b5075350a86779163f074d2b617c15d22343fce81918db/fonttools-4.61.1-cp313-cp313-win32.whl", hash = "sha256:4d7092bb38c53bbc78e9255a59158b150bcdc115a1e3b3ce0b5f267dc35dd63c", size = 2267821, upload-time = "2025-12-12T17:30:38.478Z" }, + { url = "https://files.pythonhosted.org/packages/1a/59/453c06d1d83dc0951b69ef692d6b9f1846680342927df54e9a1ca91c6f90/fonttools-4.61.1-cp313-cp313-win_amd64.whl", hash = "sha256:21e7c8d76f62ab13c9472ccf74515ca5b9a761d1bde3265152a6dc58700d895b", size = 2318169, upload-time = "2025-12-12T17:30:40.951Z" }, + { url = "https://files.pythonhosted.org/packages/32/8f/4e7bf82c0cbb738d3c2206c920ca34ca74ef9dabde779030145d28665104/fonttools-4.61.1-cp314-cp314-macosx_10_15_universal2.whl", hash = "sha256:fff4f534200a04b4a36e7ae3cb74493afe807b517a09e99cb4faa89a34ed6ecd", size = 2846094, upload-time = "2025-12-12T17:30:43.511Z" }, + { url = "https://files.pythonhosted.org/packages/71/09/d44e45d0a4f3a651f23a1e9d42de43bc643cce2971b19e784cc67d823676/fonttools-4.61.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:d9203500f7c63545b4ce3799319fe4d9feb1a1b89b28d3cb5abd11b9dd64147e", size = 2396589, upload-time = "2025-12-12T17:30:45.681Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/18/58c64cafcf8eb677a99ef593121f719e6dcbdb7d1c594ae5a10d4997ca8a/fonttools-4.61.1-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:fa646ecec9528bef693415c79a86e733c70a4965dd938e9a226b0fc64c9d2e6c", size = 4877892, upload-time = "2025-12-12T17:30:47.709Z" }, + { url = "https://files.pythonhosted.org/packages/8a/ec/9e6b38c7ba1e09eb51db849d5450f4c05b7e78481f662c3b79dbde6f3d04/fonttools-4.61.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:11f35ad7805edba3aac1a3710d104592df59f4b957e30108ae0ba6c10b11dd75", size = 4972884, upload-time = "2025-12-12T17:30:49.656Z" }, + { url = "https://files.pythonhosted.org/packages/5e/87/b5339da8e0256734ba0dbbf5b6cdebb1dd79b01dc8c270989b7bcd465541/fonttools-4.61.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:b931ae8f62db78861b0ff1ac017851764602288575d65b8e8ff1963fed419063", size = 4924405, upload-time = "2025-12-12T17:30:51.735Z" }, + { url = "https://files.pythonhosted.org/packages/0b/47/e3409f1e1e69c073a3a6fd8cb886eb18c0bae0ee13db2c8d5e7f8495e8b7/fonttools-4.61.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b148b56f5de675ee16d45e769e69f87623a4944f7443850bf9a9376e628a89d2", size = 5035553, upload-time = "2025-12-12T17:30:54.823Z" }, + { url = "https://files.pythonhosted.org/packages/bf/b6/1f6600161b1073a984294c6c031e1a56ebf95b6164249eecf30012bb2e38/fonttools-4.61.1-cp314-cp314-win32.whl", hash = "sha256:9b666a475a65f4e839d3d10473fad6d47e0a9db14a2f4a224029c5bfde58ad2c", size = 2271915, upload-time = "2025-12-12T17:30:57.913Z" }, + { url = "https://files.pythonhosted.org/packages/52/7b/91e7b01e37cc8eb0e1f770d08305b3655e4f002fc160fb82b3390eabacf5/fonttools-4.61.1-cp314-cp314-win_amd64.whl", hash = "sha256:4f5686e1fe5fce75d82d93c47a438a25bf0d1319d2843a926f741140b2b16e0c", size = 2323487, upload-time = "2025-12-12T17:30:59.804Z" }, + { url = 
"https://files.pythonhosted.org/packages/39/5c/908ad78e46c61c3e3ed70c3b58ff82ab48437faf84ec84f109592cabbd9f/fonttools-4.61.1-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:e76ce097e3c57c4bcb67c5aa24a0ecdbd9f74ea9219997a707a4061fbe2707aa", size = 2929571, upload-time = "2025-12-12T17:31:02.574Z" }, + { url = "https://files.pythonhosted.org/packages/bd/41/975804132c6dea64cdbfbaa59f3518a21c137a10cccf962805b301ac6ab2/fonttools-4.61.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9cfef3ab326780c04d6646f68d4b4742aae222e8b8ea1d627c74e38afcbc9d91", size = 2435317, upload-time = "2025-12-12T17:31:04.974Z" }, + { url = "https://files.pythonhosted.org/packages/b0/5a/aef2a0a8daf1ebaae4cfd83f84186d4a72ee08fd6a8451289fcd03ffa8a4/fonttools-4.61.1-cp314-cp314t-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a75c301f96db737e1c5ed5fd7d77d9c34466de16095a266509e13da09751bd19", size = 4882124, upload-time = "2025-12-12T17:31:07.456Z" }, + { url = "https://files.pythonhosted.org/packages/80/33/d6db3485b645b81cea538c9d1c9219d5805f0877fda18777add4671c5240/fonttools-4.61.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:91669ccac46bbc1d09e9273546181919064e8df73488ea087dcac3e2968df9ba", size = 5100391, upload-time = "2025-12-12T17:31:09.732Z" }, + { url = "https://files.pythonhosted.org/packages/6c/d6/675ba631454043c75fcf76f0ca5463eac8eb0666ea1d7badae5fea001155/fonttools-4.61.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:c33ab3ca9d3ccd581d58e989d67554e42d8d4ded94ab3ade3508455fe70e65f7", size = 4978800, upload-time = "2025-12-12T17:31:11.681Z" }, + { url = "https://files.pythonhosted.org/packages/7f/33/d3ec753d547a8d2bdaedd390d4a814e8d5b45a093d558f025c6b990b554c/fonttools-4.61.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:664c5a68ec406f6b1547946683008576ef8b38275608e1cee6c061828171c118", size = 5006426, upload-time = "2025-12-12T17:31:13.764Z" 
}, + { url = "https://files.pythonhosted.org/packages/b4/40/cc11f378b561a67bea850ab50063366a0d1dd3f6d0a30ce0f874b0ad5664/fonttools-4.61.1-cp314-cp314t-win32.whl", hash = "sha256:aed04cabe26f30c1647ef0e8fbb207516fd40fe9472e9439695f5c6998e60ac5", size = 2335377, upload-time = "2025-12-12T17:31:16.49Z" }, + { url = "https://files.pythonhosted.org/packages/e4/ff/c9a2b66b39f8628531ea58b320d66d951267c98c6a38684daa8f50fb02f8/fonttools-4.61.1-cp314-cp314t-win_amd64.whl", hash = "sha256:2180f14c141d2f0f3da43f3a81bc8aa4684860f6b0e6f9e165a4831f24e6a23b", size = 2400613, upload-time = "2025-12-12T17:31:18.769Z" }, + { url = "https://files.pythonhosted.org/packages/c7/4e/ce75a57ff3aebf6fc1f4e9d508b8e5810618a33d900ad6c19eb30b290b97/fonttools-4.61.1-py3-none-any.whl", hash = "sha256:17d2bf5d541add43822bcf0c43d7d847b160c9bb01d15d5007d84e2217aaa371", size = 1148996, upload-time = "2025-12-12T17:31:21.03Z" }, +] + [[package]] name = "frozenlist" version = "1.7.0" @@ -534,6 +930,124 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2c/e1/e6716421ea10d38022b952c159d5161ca1193197fb744506875fbb87ea7b/iniconfig-2.1.0-py3-none-any.whl", hash = "sha256:9deba5723312380e77435581c6bf4935c94cbfab9b1ed33ef8d238ea168eb760", size = 6050, upload-time = "2025-03-19T20:10:01.071Z" }, ] +[[package]] +name = "ipykernel" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "appnope", marker = "sys_platform == 'darwin' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "comm" }, + { name = "debugpy" }, + { name = "ipython", version = "8.37.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "ipython", version = "9.8.0", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" 
}, + { name = "jupyter-client" }, + { name = "jupyter-core" }, + { name = "matplotlib-inline" }, + { name = "nest-asyncio" }, + { name = "packaging" }, + { name = "psutil" }, + { name = "pyzmq" }, + { name = "tornado" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/b9/a4/4948be6eb88628505b83a1f2f40d90254cab66abf2043b3c40fa07dfce0f/ipykernel-7.1.0.tar.gz", hash = "sha256:58a3fc88533d5930c3546dc7eac66c6d288acde4f801e2001e65edc5dc9cf0db", size = 174579, upload-time = "2025-10-27T09:46:39.471Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a3/17/20c2552266728ceba271967b87919664ecc0e33efca29c3efc6baf88c5f9/ipykernel-7.1.0-py3-none-any.whl", hash = "sha256:763b5ec6c5b7776f6a8d7ce09b267693b4e5ce75cb50ae696aaefb3c85e1ea4c", size = 117968, upload-time = "2025-10-27T09:46:37.805Z" }, +] + +[[package]] +name = "ipython" +version = "8.37.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", +] +dependencies = [ + { name = "colorama", marker = 
"(python_full_version < '3.11' and sys_platform == 'win32') or (python_full_version >= '3.11' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (sys_platform != 'win32' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "decorator", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "exceptiongroup", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "jedi", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "matplotlib-inline", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "pexpect", marker = "(python_full_version < '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32') or (python_full_version >= '3.11' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (sys_platform == 'emscripten' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (sys_platform == 'win32' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "prompt-toolkit", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "pygments", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "stack-data", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "traitlets", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "typing-extensions", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] 
+sdist = { url = "https://files.pythonhosted.org/packages/85/31/10ac88f3357fc276dc8a64e8880c82e80e7459326ae1d0a211b40abf6665/ipython-8.37.0.tar.gz", hash = "sha256:ca815841e1a41a1e6b73a0b08f3038af9b2252564d01fc405356d34033012216", size = 5606088, upload-time = "2025-05-31T16:39:09.613Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/91/d0/274fbf7b0b12643cbbc001ce13e6a5b1607ac4929d1b11c72460152c9fc3/ipython-8.37.0-py3-none-any.whl", hash = "sha256:ed87326596b878932dbcb171e3e698845434d8c61b8d8cd474bf663041a9dcf2", size = 831864, upload-time = "2025-05-31T16:39:06.38Z" }, +] + +[[package]] +name = "ipython" +version = "9.8.0" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 
'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", +] +dependencies = [ + { name = "colorama", marker = "(python_full_version >= '3.11' and sys_platform == 'win32') or (python_full_version < '3.11' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (sys_platform != 'win32' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "decorator", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "ipython-pygments-lexers", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "jedi", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "matplotlib-inline", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "pexpect", marker = "(python_full_version >= '3.11' and sys_platform != 'emscripten' and sys_platform != 'win32') or (python_full_version < '3.11' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (sys_platform == 'emscripten' and extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (sys_platform == 'win32' and extra == 
'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "prompt-toolkit", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "pygments", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "stack-data", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "traitlets", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "typing-extensions", marker = "python_full_version == '3.11.*' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/12/51/a703c030f4928646d390b4971af4938a1b10c9dfce694f0d99a0bb073cb2/ipython-9.8.0.tar.gz", hash = "sha256:8e4ce129a627eb9dd221c41b1d2cdaed4ef7c9da8c17c63f6f578fe231141f83", size = 4424940, upload-time = "2025-12-03T10:18:24.353Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/df/8ee1c5dd1e3308b5d5b2f2dfea323bb2f3827da8d654abb6642051199049/ipython-9.8.0-py3-none-any.whl", hash = "sha256:ebe6d1d58d7d988fbf23ff8ff6d8e1622cfdb194daf4b7b73b792c4ec3b85385", size = 621374, upload-time = "2025-12-03T10:18:22.335Z" }, +] + +[[package]] +name = "ipython-pygments-lexers" +version = "1.1.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pygments", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/4c/5dd1d8af08107f88c7f741ead7a40854b8ac24ddf9ae850afbcf698aa552/ipython_pygments_lexers-1.1.1.tar.gz", hash = "sha256:09c0138009e56b6854f9535736f4171d855c8c08a563a0dcd8022f78355c7e81", size = 8393, upload-time = "2025-01-17T11:24:34.505Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/d9/33/1f075bf72b0b747cb3288d011319aaf64083cf2efef8354174e3ed4540e2/ipython_pygments_lexers-1.1.1-py3-none-any.whl", hash = "sha256:a9462224a505ade19a605f71f8fa63c2048833ce50abc86768a0d81d876dc81c", size = 8074, upload-time = "2025-01-17T11:24:33.271Z" }, +] + +[[package]] +name = "jedi" +version = "0.19.2" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "parso" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/72/3a/79a912fbd4d8dd6fbb02bf69afd3bb72cf0c729bb3063c6f4498603db17a/jedi-0.19.2.tar.gz", hash = "sha256:4770dc3de41bde3966b02eb84fbcf557fb33cce26ad23da12c742fb50ecb11f0", size = 1231287, upload-time = "2024-11-11T01:41:42.873Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c0/5a/9cac0c82afec3d09ccd97c8b6502d48f165f9124db81b4bcb90b4af974ee/jedi-0.19.2-py2.py3-none-any.whl", hash = "sha256:a8ef22bde8490f57fe5c7681a3c83cb58874daf72b4784de3cce5b6ef6edb5b9", size = 1572278, upload-time = "2024-11-11T01:41:40.175Z" }, +] + [[package]] name = "jinja2" version = "3.1.6" @@ -546,6 +1060,143 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/62/a1/3d680cbfd5f4b8f15abc1d571870c5fc3e594bb582bc3b64ea099db13e56/jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67", size = 134899, upload-time = "2025-03-05T20:05:00.369Z" }, ] +[[package]] +name = "jupyter-client" +version = "8.7.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "jupyter-core" }, + { name = "python-dateutil" }, + { name = "pyzmq" }, + { name = "tornado" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a6/27/d10de45e8ad4ce872372c4a3a37b7b35b6b064f6f023a5c14ffcced4d59d/jupyter_client-8.7.0.tar.gz", hash = "sha256:3357212d9cbe01209e59190f67a3a7e1f387a4f4e88d1e0433ad84d7b262531d", size = 344691, upload-time = "2025-12-09T18:37:01.953Z" } +wheels = [ + { url = 
"https://files.pythonhosted.org/packages/bb/f5/fddaec430367be9d62a7ed125530e133bfd4a1c0350fe221149ee0f2b526/jupyter_client-8.7.0-py3-none-any.whl", hash = "sha256:3671a94fd25e62f5f2f554f5e95389c2294d89822378a5f2dd24353e1494a9e0", size = 106215, upload-time = "2025-12-09T18:37:00.024Z" }, +] + +[[package]] +name = "jupyter-core" +version = "5.9.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "platformdirs" }, + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/02/49/9d1284d0dc65e2c757b74c6687b6d319b02f822ad039e5c512df9194d9dd/jupyter_core-5.9.1.tar.gz", hash = "sha256:4d09aaff303b9566c3ce657f580bd089ff5c91f5f89cf7d8846c3cdf465b5508", size = 89814, upload-time = "2025-10-16T19:19:18.444Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e7/e7/80988e32bf6f73919a113473a604f5a8f09094de312b9d52b79c2df7612b/jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407", size = 29032, upload-time = "2025-10-16T19:19:16.783Z" }, +] + +[[package]] +name = "kiwisolver" +version = "1.4.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/5c/3c/85844f1b0feb11ee581ac23fe5fce65cd049a200c1446708cc1b7f922875/kiwisolver-1.4.9.tar.gz", hash = "sha256:c3b22c26c6fd6811b0ae8363b95ca8ce4ea3c202d3d0975b2914310ceb1bcc4d", size = 97564, upload-time = "2025-08-10T21:27:49.279Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/c6/5d/8ce64e36d4e3aac5ca96996457dcf33e34e6051492399a3f1fec5657f30b/kiwisolver-1.4.9-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:b4b4d74bda2b8ebf4da5bd42af11d02d04428b2c32846e4c2c93219df8a7987b", size = 124159, upload-time = "2025-08-10T21:25:35.472Z" }, + { url = "https://files.pythonhosted.org/packages/96/1e/22f63ec454874378175a5f435d6ea1363dd33fb2af832c6643e4ccea0dc8/kiwisolver-1.4.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = 
"sha256:fb3b8132019ea572f4611d770991000d7f58127560c4889729248eb5852a102f", size = 66578, upload-time = "2025-08-10T21:25:36.73Z" }, + { url = "https://files.pythonhosted.org/packages/41/4c/1925dcfff47a02d465121967b95151c82d11027d5ec5242771e580e731bd/kiwisolver-1.4.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:84fd60810829c27ae375114cd379da1fa65e6918e1da405f356a775d49a62bcf", size = 65312, upload-time = "2025-08-10T21:25:37.658Z" }, + { url = "https://files.pythonhosted.org/packages/d4/42/0f333164e6307a0687d1eb9ad256215aae2f4bd5d28f4653d6cd319a3ba3/kiwisolver-1.4.9-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:b78efa4c6e804ecdf727e580dbb9cba85624d2e1c6b5cb059c66290063bd99a9", size = 1628458, upload-time = "2025-08-10T21:25:39.067Z" }, + { url = "https://files.pythonhosted.org/packages/86/b6/2dccb977d651943995a90bfe3495c2ab2ba5cd77093d9f2318a20c9a6f59/kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d4efec7bcf21671db6a3294ff301d2fc861c31faa3c8740d1a94689234d1b415", size = 1225640, upload-time = "2025-08-10T21:25:40.489Z" }, + { url = "https://files.pythonhosted.org/packages/50/2b/362ebd3eec46c850ccf2bfe3e30f2fc4c008750011f38a850f088c56a1c6/kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:90f47e70293fc3688b71271100a1a5453aa9944a81d27ff779c108372cf5567b", size = 1244074, upload-time = "2025-08-10T21:25:42.221Z" }, + { url = "https://files.pythonhosted.org/packages/6f/bb/f09a1e66dab8984773d13184a10a29fe67125337649d26bdef547024ed6b/kiwisolver-1.4.9-cp310-cp310-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:8fdca1def57a2e88ef339de1737a1449d6dbf5fab184c54a1fca01d541317154", size = 1293036, upload-time = "2025-08-10T21:25:43.801Z" }, + { url = "https://files.pythonhosted.org/packages/ea/01/11ecf892f201cafda0f68fa59212edaea93e96c37884b747c181303fccd1/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_aarch64.whl", hash = 
"sha256:9cf554f21be770f5111a1690d42313e140355e687e05cf82cb23d0a721a64a48", size = 2175310, upload-time = "2025-08-10T21:25:45.045Z" }, + { url = "https://files.pythonhosted.org/packages/7f/5f/bfe11d5b934f500cc004314819ea92427e6e5462706a498c1d4fc052e08f/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:fc1795ac5cd0510207482c3d1d3ed781143383b8cfd36f5c645f3897ce066220", size = 2270943, upload-time = "2025-08-10T21:25:46.393Z" }, + { url = "https://files.pythonhosted.org/packages/3d/de/259f786bf71f1e03e73d87e2db1a9a3bcab64d7b4fd780167123161630ad/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:ccd09f20ccdbbd341b21a67ab50a119b64a403b09288c27481575105283c1586", size = 2440488, upload-time = "2025-08-10T21:25:48.074Z" }, + { url = "https://files.pythonhosted.org/packages/1b/76/c989c278faf037c4d3421ec07a5c452cd3e09545d6dae7f87c15f54e4edf/kiwisolver-1.4.9-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:540c7c72324d864406a009d72f5d6856f49693db95d1fbb46cf86febef873634", size = 2246787, upload-time = "2025-08-10T21:25:49.442Z" }, + { url = "https://files.pythonhosted.org/packages/a2/55/c2898d84ca440852e560ca9f2a0d28e6e931ac0849b896d77231929900e7/kiwisolver-1.4.9-cp310-cp310-win_amd64.whl", hash = "sha256:ede8c6d533bc6601a47ad4046080d36b8fc99f81e6f1c17b0ac3c2dc91ac7611", size = 73730, upload-time = "2025-08-10T21:25:51.102Z" }, + { url = "https://files.pythonhosted.org/packages/e8/09/486d6ac523dd33b80b368247f238125d027964cfacb45c654841e88fb2ae/kiwisolver-1.4.9-cp310-cp310-win_arm64.whl", hash = "sha256:7b4da0d01ac866a57dd61ac258c5607b4cd677f63abaec7b148354d2b2cdd536", size = 65036, upload-time = "2025-08-10T21:25:52.063Z" }, + { url = "https://files.pythonhosted.org/packages/6f/ab/c80b0d5a9d8a1a65f4f815f2afff9798b12c3b9f31f1d304dd233dd920e2/kiwisolver-1.4.9-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:eb14a5da6dc7642b0f3a18f13654847cd8b7a2550e2645a5bda677862b03ba16", size = 124167, upload-time = 
"2025-08-10T21:25:53.403Z" }, + { url = "https://files.pythonhosted.org/packages/a0/c0/27fe1a68a39cf62472a300e2879ffc13c0538546c359b86f149cc19f6ac3/kiwisolver-1.4.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:39a219e1c81ae3b103643d2aedb90f1ef22650deb266ff12a19e7773f3e5f089", size = 66579, upload-time = "2025-08-10T21:25:54.79Z" }, + { url = "https://files.pythonhosted.org/packages/31/a2/a12a503ac1fd4943c50f9822678e8015a790a13b5490354c68afb8489814/kiwisolver-1.4.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2405a7d98604b87f3fc28b1716783534b1b4b8510d8142adca34ee0bc3c87543", size = 65309, upload-time = "2025-08-10T21:25:55.76Z" }, + { url = "https://files.pythonhosted.org/packages/66/e1/e533435c0be77c3f64040d68d7a657771194a63c279f55573188161e81ca/kiwisolver-1.4.9-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:dc1ae486f9abcef254b5618dfb4113dd49f94c68e3e027d03cf0143f3f772b61", size = 1435596, upload-time = "2025-08-10T21:25:56.861Z" }, + { url = "https://files.pythonhosted.org/packages/67/1e/51b73c7347f9aabdc7215aa79e8b15299097dc2f8e67dee2b095faca9cb0/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8a1f570ce4d62d718dce3f179ee78dac3b545ac16c0c04bb363b7607a949c0d1", size = 1246548, upload-time = "2025-08-10T21:25:58.246Z" }, + { url = "https://files.pythonhosted.org/packages/21/aa/72a1c5d1e430294f2d32adb9542719cfb441b5da368d09d268c7757af46c/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:cb27e7b78d716c591e88e0a09a2139c6577865d7f2e152488c2cc6257f460872", size = 1263618, upload-time = "2025-08-10T21:25:59.857Z" }, + { url = "https://files.pythonhosted.org/packages/a3/af/db1509a9e79dbf4c260ce0cfa3903ea8945f6240e9e59d1e4deb731b1a40/kiwisolver-1.4.9-cp311-cp311-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:15163165efc2f627eb9687ea5f3a28137217d217ac4024893d753f46bce9de26", size = 1317437, upload-time = 
"2025-08-10T21:26:01.105Z" }, + { url = "https://files.pythonhosted.org/packages/e0/f2/3ea5ee5d52abacdd12013a94130436e19969fa183faa1e7c7fbc89e9a42f/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:bdee92c56a71d2b24c33a7d4c2856bd6419d017e08caa7802d2963870e315028", size = 2195742, upload-time = "2025-08-10T21:26:02.675Z" }, + { url = "https://files.pythonhosted.org/packages/6f/9b/1efdd3013c2d9a2566aa6a337e9923a00590c516add9a1e89a768a3eb2fc/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:412f287c55a6f54b0650bd9b6dce5aceddb95864a1a90c87af16979d37c89771", size = 2290810, upload-time = "2025-08-10T21:26:04.009Z" }, + { url = "https://files.pythonhosted.org/packages/fb/e5/cfdc36109ae4e67361f9bc5b41323648cb24a01b9ade18784657e022e65f/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2c93f00dcba2eea70af2be5f11a830a742fe6b579a1d4e00f47760ef13be247a", size = 2461579, upload-time = "2025-08-10T21:26:05.317Z" }, + { url = "https://files.pythonhosted.org/packages/62/86/b589e5e86c7610842213994cdea5add00960076bef4ae290c5fa68589cac/kiwisolver-1.4.9-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f117e1a089d9411663a3207ba874f31be9ac8eaa5b533787024dc07aeb74f464", size = 2268071, upload-time = "2025-08-10T21:26:06.686Z" }, + { url = "https://files.pythonhosted.org/packages/3b/c6/f8df8509fd1eee6c622febe54384a96cfaf4d43bf2ccec7a0cc17e4715c9/kiwisolver-1.4.9-cp311-cp311-win_amd64.whl", hash = "sha256:be6a04e6c79819c9a8c2373317d19a96048e5a3f90bec587787e86a1153883c2", size = 73840, upload-time = "2025-08-10T21:26:07.94Z" }, + { url = "https://files.pythonhosted.org/packages/e2/2d/16e0581daafd147bc11ac53f032a2b45eabac897f42a338d0a13c1e5c436/kiwisolver-1.4.9-cp311-cp311-win_arm64.whl", hash = "sha256:0ae37737256ba2de764ddc12aed4956460277f00c4996d51a197e72f62f5eec7", size = 65159, upload-time = "2025-08-10T21:26:09.048Z" }, + { url = 
"https://files.pythonhosted.org/packages/86/c9/13573a747838aeb1c76e3267620daa054f4152444d1f3d1a2324b78255b5/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:ac5a486ac389dddcc5bef4f365b6ae3ffff2c433324fb38dd35e3fab7c957999", size = 123686, upload-time = "2025-08-10T21:26:10.034Z" }, + { url = "https://files.pythonhosted.org/packages/51/ea/2ecf727927f103ffd1739271ca19c424d0e65ea473fbaeea1c014aea93f6/kiwisolver-1.4.9-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f2ba92255faa7309d06fe44c3a4a97efe1c8d640c2a79a5ef728b685762a6fd2", size = 66460, upload-time = "2025-08-10T21:26:11.083Z" }, + { url = "https://files.pythonhosted.org/packages/5b/5a/51f5464373ce2aeb5194508298a508b6f21d3867f499556263c64c621914/kiwisolver-1.4.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:4a2899935e724dd1074cb568ce7ac0dce28b2cd6ab539c8e001a8578eb106d14", size = 64952, upload-time = "2025-08-10T21:26:12.058Z" }, + { url = "https://files.pythonhosted.org/packages/70/90/6d240beb0f24b74371762873e9b7f499f1e02166a2d9c5801f4dbf8fa12e/kiwisolver-1.4.9-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f6008a4919fdbc0b0097089f67a1eb55d950ed7e90ce2cc3e640abadd2757a04", size = 1474756, upload-time = "2025-08-10T21:26:13.096Z" }, + { url = "https://files.pythonhosted.org/packages/12/42/f36816eaf465220f683fb711efdd1bbf7a7005a2473d0e4ed421389bd26c/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67bb8b474b4181770f926f7b7d2f8c0248cbcb78b660fdd41a47054b28d2a752", size = 1276404, upload-time = "2025-08-10T21:26:14.457Z" }, + { url = "https://files.pythonhosted.org/packages/2e/64/bc2de94800adc830c476dce44e9b40fd0809cddeef1fde9fcf0f73da301f/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2327a4a30d3ee07d2fbe2e7933e8a37c591663b96ce42a00bc67461a87d7df77", size = 1294410, upload-time = "2025-08-10T21:26:15.73Z" }, + { url = 
"https://files.pythonhosted.org/packages/5f/42/2dc82330a70aa8e55b6d395b11018045e58d0bb00834502bf11509f79091/kiwisolver-1.4.9-cp312-cp312-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:7a08b491ec91b1d5053ac177afe5290adacf1f0f6307d771ccac5de30592d198", size = 1343631, upload-time = "2025-08-10T21:26:17.045Z" }, + { url = "https://files.pythonhosted.org/packages/22/fd/f4c67a6ed1aab149ec5a8a401c323cee7a1cbe364381bb6c9c0d564e0e20/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:d8fc5c867c22b828001b6a38d2eaeb88160bf5783c6cb4a5e440efc981ce286d", size = 2224963, upload-time = "2025-08-10T21:26:18.737Z" }, + { url = "https://files.pythonhosted.org/packages/45/aa/76720bd4cb3713314677d9ec94dcc21ced3f1baf4830adde5bb9b2430a5f/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:3b3115b2581ea35bb6d1f24a4c90af37e5d9b49dcff267eeed14c3893c5b86ab", size = 2321295, upload-time = "2025-08-10T21:26:20.11Z" }, + { url = "https://files.pythonhosted.org/packages/80/19/d3ec0d9ab711242f56ae0dc2fc5d70e298bb4a1f9dfab44c027668c673a1/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:858e4c22fb075920b96a291928cb7dea5644e94c0ee4fcd5af7e865655e4ccf2", size = 2487987, upload-time = "2025-08-10T21:26:21.49Z" }, + { url = "https://files.pythonhosted.org/packages/39/e9/61e4813b2c97e86b6fdbd4dd824bf72d28bcd8d4849b8084a357bc0dd64d/kiwisolver-1.4.9-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:ed0fecd28cc62c54b262e3736f8bb2512d8dcfdc2bcf08be5f47f96bf405b145", size = 2291817, upload-time = "2025-08-10T21:26:22.812Z" }, + { url = "https://files.pythonhosted.org/packages/a0/41/85d82b0291db7504da3c2defe35c9a8a5c9803a730f297bd823d11d5fb77/kiwisolver-1.4.9-cp312-cp312-win_amd64.whl", hash = "sha256:f68208a520c3d86ea51acf688a3e3002615a7f0238002cccc17affecc86a8a54", size = 73895, upload-time = "2025-08-10T21:26:24.37Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/92/5f3068cf15ee5cb624a0c7596e67e2a0bb2adee33f71c379054a491d07da/kiwisolver-1.4.9-cp312-cp312-win_arm64.whl", hash = "sha256:2c1a4f57df73965f3f14df20b80ee29e6a7930a57d2d9e8491a25f676e197c60", size = 64992, upload-time = "2025-08-10T21:26:25.732Z" }, + { url = "https://files.pythonhosted.org/packages/31/c1/c2686cda909742ab66c7388e9a1a8521a59eb89f8bcfbee28fc980d07e24/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5d0432ccf1c7ab14f9949eec60c5d1f924f17c037e9f8b33352fa05799359b8", size = 123681, upload-time = "2025-08-10T21:26:26.725Z" }, + { url = "https://files.pythonhosted.org/packages/ca/f0/f44f50c9f5b1a1860261092e3bc91ecdc9acda848a8b8c6abfda4a24dd5c/kiwisolver-1.4.9-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efb3a45b35622bb6c16dbfab491a8f5a391fe0e9d45ef32f4df85658232ca0e2", size = 66464, upload-time = "2025-08-10T21:26:27.733Z" }, + { url = "https://files.pythonhosted.org/packages/2d/7a/9d90a151f558e29c3936b8a47ac770235f436f2120aca41a6d5f3d62ae8d/kiwisolver-1.4.9-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:1a12cf6398e8a0a001a059747a1cbf24705e18fe413bc22de7b3d15c67cffe3f", size = 64961, upload-time = "2025-08-10T21:26:28.729Z" }, + { url = "https://files.pythonhosted.org/packages/e9/e9/f218a2cb3a9ffbe324ca29a9e399fa2d2866d7f348ec3a88df87fc248fc5/kiwisolver-1.4.9-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b67e6efbf68e077dd71d1a6b37e43e1a99d0bff1a3d51867d45ee8908b931098", size = 1474607, upload-time = "2025-08-10T21:26:29.798Z" }, + { url = "https://files.pythonhosted.org/packages/d9/28/aac26d4c882f14de59041636292bc838db8961373825df23b8eeb807e198/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5656aa670507437af0207645273ccdfee4f14bacd7f7c67a4306d0dcaeaf6eed", size = 1276546, upload-time = "2025-08-10T21:26:31.401Z" }, + { url = 
"https://files.pythonhosted.org/packages/8b/ad/8bfc1c93d4cc565e5069162f610ba2f48ff39b7de4b5b8d93f69f30c4bed/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:bfc08add558155345129c7803b3671cf195e6a56e7a12f3dde7c57d9b417f525", size = 1294482, upload-time = "2025-08-10T21:26:32.721Z" }, + { url = "https://files.pythonhosted.org/packages/da/f1/6aca55ff798901d8ce403206d00e033191f63d82dd708a186e0ed2067e9c/kiwisolver-1.4.9-cp313-cp313-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:40092754720b174e6ccf9e845d0d8c7d8e12c3d71e7fc35f55f3813e96376f78", size = 1343720, upload-time = "2025-08-10T21:26:34.032Z" }, + { url = "https://files.pythonhosted.org/packages/d1/91/eed031876c595c81d90d0f6fc681ece250e14bf6998c3d7c419466b523b7/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:497d05f29a1300d14e02e6441cf0f5ee81c1ff5a304b0d9fb77423974684e08b", size = 2224907, upload-time = "2025-08-10T21:26:35.824Z" }, + { url = "https://files.pythonhosted.org/packages/e9/ec/4d1925f2e49617b9cca9c34bfa11adefad49d00db038e692a559454dfb2e/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:bdd1a81a1860476eb41ac4bc1e07b3f07259e6d55bbf739b79c8aaedcf512799", size = 2321334, upload-time = "2025-08-10T21:26:37.534Z" }, + { url = "https://files.pythonhosted.org/packages/43/cb/450cd4499356f68802750c6ddc18647b8ea01ffa28f50d20598e0befe6e9/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:e6b93f13371d341afee3be9f7c5964e3fe61d5fa30f6a30eb49856935dfe4fc3", size = 2488313, upload-time = "2025-08-10T21:26:39.191Z" }, + { url = "https://files.pythonhosted.org/packages/71/67/fc76242bd99f885651128a5d4fa6083e5524694b7c88b489b1b55fdc491d/kiwisolver-1.4.9-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d75aa530ccfaa593da12834b86a0724f58bff12706659baa9227c2ccaa06264c", size = 2291970, upload-time = "2025-08-10T21:26:40.828Z" }, + { url = 
"https://files.pythonhosted.org/packages/75/bd/f1a5d894000941739f2ae1b65a32892349423ad49c2e6d0771d0bad3fae4/kiwisolver-1.4.9-cp313-cp313-win_amd64.whl", hash = "sha256:dd0a578400839256df88c16abddf9ba14813ec5f21362e1fe65022e00c883d4d", size = 73894, upload-time = "2025-08-10T21:26:42.33Z" }, + { url = "https://files.pythonhosted.org/packages/95/38/dce480814d25b99a391abbddadc78f7c117c6da34be68ca8b02d5848b424/kiwisolver-1.4.9-cp313-cp313-win_arm64.whl", hash = "sha256:d4188e73af84ca82468f09cadc5ac4db578109e52acb4518d8154698d3a87ca2", size = 64995, upload-time = "2025-08-10T21:26:43.889Z" }, + { url = "https://files.pythonhosted.org/packages/e2/37/7d218ce5d92dadc5ebdd9070d903e0c7cf7edfe03f179433ac4d13ce659c/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_universal2.whl", hash = "sha256:5a0f2724dfd4e3b3ac5a82436a8e6fd16baa7d507117e4279b660fe8ca38a3a1", size = 126510, upload-time = "2025-08-10T21:26:44.915Z" }, + { url = "https://files.pythonhosted.org/packages/23/b0/e85a2b48233daef4b648fb657ebbb6f8367696a2d9548a00b4ee0eb67803/kiwisolver-1.4.9-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:1b11d6a633e4ed84fc0ddafd4ebfd8ea49b3f25082c04ad12b8315c11d504dc1", size = 67903, upload-time = "2025-08-10T21:26:45.934Z" }, + { url = "https://files.pythonhosted.org/packages/44/98/f2425bc0113ad7de24da6bb4dae1343476e95e1d738be7c04d31a5d037fd/kiwisolver-1.4.9-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:61874cdb0a36016354853593cffc38e56fc9ca5aa97d2c05d3dcf6922cd55a11", size = 66402, upload-time = "2025-08-10T21:26:47.101Z" }, + { url = "https://files.pythonhosted.org/packages/98/d8/594657886df9f34c4177cc353cc28ca7e6e5eb562d37ccc233bff43bbe2a/kiwisolver-1.4.9-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:60c439763a969a6af93b4881db0eed8fadf93ee98e18cbc35bc8da868d0c4f0c", size = 1582135, upload-time = "2025-08-10T21:26:48.665Z" }, + { url = 
"https://files.pythonhosted.org/packages/5c/c6/38a115b7170f8b306fc929e166340c24958347308ea3012c2b44e7e295db/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:92a2f997387a1b79a75e7803aa7ded2cfbe2823852ccf1ba3bcf613b62ae3197", size = 1389409, upload-time = "2025-08-10T21:26:50.335Z" }, + { url = "https://files.pythonhosted.org/packages/bf/3b/e04883dace81f24a568bcee6eb3001da4ba05114afa622ec9b6fafdc1f5e/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:a31d512c812daea6d8b3be3b2bfcbeb091dbb09177706569bcfc6240dcf8b41c", size = 1401763, upload-time = "2025-08-10T21:26:51.867Z" }, + { url = "https://files.pythonhosted.org/packages/9f/80/20ace48e33408947af49d7d15c341eaee69e4e0304aab4b7660e234d6288/kiwisolver-1.4.9-cp313-cp313t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:52a15b0f35dad39862d376df10c5230155243a2c1a436e39eb55623ccbd68185", size = 1453643, upload-time = "2025-08-10T21:26:53.592Z" }, + { url = "https://files.pythonhosted.org/packages/64/31/6ce4380a4cd1f515bdda976a1e90e547ccd47b67a1546d63884463c92ca9/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:a30fd6fdef1430fd9e1ba7b3398b5ee4e2887783917a687d86ba69985fb08748", size = 2330818, upload-time = "2025-08-10T21:26:55.051Z" }, + { url = "https://files.pythonhosted.org/packages/fa/e9/3f3fcba3bcc7432c795b82646306e822f3fd74df0ee81f0fa067a1f95668/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_ppc64le.whl", hash = "sha256:cc9617b46837c6468197b5945e196ee9ca43057bb7d9d1ae688101e4e1dddf64", size = 2419963, upload-time = "2025-08-10T21:26:56.421Z" }, + { url = "https://files.pythonhosted.org/packages/99/43/7320c50e4133575c66e9f7dadead35ab22d7c012a3b09bb35647792b2a6d/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:0ab74e19f6a2b027ea4f845a78827969af45ce790e6cb3e1ebab71bdf9f215ff", size = 2594639, upload-time = "2025-08-10T21:26:57.882Z" }, + { url = 
"https://files.pythonhosted.org/packages/65/d6/17ae4a270d4a987ef8a385b906d2bdfc9fce502d6dc0d3aea865b47f548c/kiwisolver-1.4.9-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:dba5ee5d3981160c28d5490f0d1b7ed730c22470ff7f6cc26cfcfaacb9896a07", size = 2391741, upload-time = "2025-08-10T21:26:59.237Z" }, + { url = "https://files.pythonhosted.org/packages/2a/8f/8f6f491d595a9e5912971f3f863d81baddccc8a4d0c3749d6a0dd9ffc9df/kiwisolver-1.4.9-cp313-cp313t-win_arm64.whl", hash = "sha256:0749fd8f4218ad2e851e11cc4dc05c7cbc0cbc4267bdfdb31782e65aace4ee9c", size = 68646, upload-time = "2025-08-10T21:27:00.52Z" }, + { url = "https://files.pythonhosted.org/packages/6b/32/6cc0fbc9c54d06c2969faa9c1d29f5751a2e51809dd55c69055e62d9b426/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:9928fe1eb816d11ae170885a74d074f57af3a0d65777ca47e9aeb854a1fba386", size = 123806, upload-time = "2025-08-10T21:27:01.537Z" }, + { url = "https://files.pythonhosted.org/packages/b2/dd/2bfb1d4a4823d92e8cbb420fe024b8d2167f72079b3bb941207c42570bdf/kiwisolver-1.4.9-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:d0005b053977e7b43388ddec89fa567f43d4f6d5c2c0affe57de5ebf290dc552", size = 66605, upload-time = "2025-08-10T21:27:03.335Z" }, + { url = "https://files.pythonhosted.org/packages/f7/69/00aafdb4e4509c2ca6064646cba9cd4b37933898f426756adb2cb92ebbed/kiwisolver-1.4.9-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:2635d352d67458b66fd0667c14cb1d4145e9560d503219034a18a87e971ce4f3", size = 64925, upload-time = "2025-08-10T21:27:04.339Z" }, + { url = "https://files.pythonhosted.org/packages/43/dc/51acc6791aa14e5cb6d8a2e28cefb0dc2886d8862795449d021334c0df20/kiwisolver-1.4.9-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:767c23ad1c58c9e827b649a9ab7809fd5fd9db266a9cf02b0e926ddc2c680d58", size = 1472414, upload-time = "2025-08-10T21:27:05.437Z" }, + { url = 
"https://files.pythonhosted.org/packages/3d/bb/93fa64a81db304ac8a246f834d5094fae4b13baf53c839d6bb6e81177129/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:72d0eb9fba308b8311685c2268cf7d0a0639a6cd027d8128659f72bdd8a024b4", size = 1281272, upload-time = "2025-08-10T21:27:07.063Z" }, + { url = "https://files.pythonhosted.org/packages/70/e6/6df102916960fb8d05069d4bd92d6d9a8202d5a3e2444494e7cd50f65b7a/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f68e4f3eeca8fb22cc3d731f9715a13b652795ef657a13df1ad0c7dc0e9731df", size = 1298578, upload-time = "2025-08-10T21:27:08.452Z" }, + { url = "https://files.pythonhosted.org/packages/7c/47/e142aaa612f5343736b087864dbaebc53ea8831453fb47e7521fa8658f30/kiwisolver-1.4.9-cp314-cp314-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d84cd4061ae292d8ac367b2c3fa3aad11cb8625a95d135fe93f286f914f3f5a6", size = 1345607, upload-time = "2025-08-10T21:27:10.125Z" }, + { url = "https://files.pythonhosted.org/packages/54/89/d641a746194a0f4d1a3670fb900d0dbaa786fb98341056814bc3f058fa52/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a60ea74330b91bd22a29638940d115df9dc00af5035a9a2a6ad9399ffb4ceca5", size = 2230150, upload-time = "2025-08-10T21:27:11.484Z" }, + { url = "https://files.pythonhosted.org/packages/aa/6b/5ee1207198febdf16ac11f78c5ae40861b809cbe0e6d2a8d5b0b3044b199/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:ce6a3a4e106cf35c2d9c4fa17c05ce0b180db622736845d4315519397a77beaf", size = 2325979, upload-time = "2025-08-10T21:27:12.917Z" }, + { url = "https://files.pythonhosted.org/packages/fc/ff/b269eefd90f4ae14dcc74973d5a0f6d28d3b9bb1afd8c0340513afe6b39a/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:77937e5e2a38a7b48eef0585114fe7930346993a88060d0bf886086d2aa49ef5", size = 2491456, upload-time = "2025-08-10T21:27:14.353Z" }, + { url = 
"https://files.pythonhosted.org/packages/fc/d4/10303190bd4d30de547534601e259a4fbf014eed94aae3e5521129215086/kiwisolver-1.4.9-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:24c175051354f4a28c5d6a31c93906dc653e2bf234e8a4bbfb964892078898ce", size = 2294621, upload-time = "2025-08-10T21:27:15.808Z" }, + { url = "https://files.pythonhosted.org/packages/28/e0/a9a90416fce5c0be25742729c2ea52105d62eda6c4be4d803c2a7be1fa50/kiwisolver-1.4.9-cp314-cp314-win_amd64.whl", hash = "sha256:0763515d4df10edf6d06a3c19734e2566368980d21ebec439f33f9eb936c07b7", size = 75417, upload-time = "2025-08-10T21:27:17.436Z" }, + { url = "https://files.pythonhosted.org/packages/1f/10/6949958215b7a9a264299a7db195564e87900f709db9245e4ebdd3c70779/kiwisolver-1.4.9-cp314-cp314-win_arm64.whl", hash = "sha256:0e4e2bf29574a6a7b7f6cb5fa69293b9f96c928949ac4a53ba3f525dffb87f9c", size = 66582, upload-time = "2025-08-10T21:27:18.436Z" }, + { url = "https://files.pythonhosted.org/packages/ec/79/60e53067903d3bc5469b369fe0dfc6b3482e2133e85dae9daa9527535991/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:d976bbb382b202f71c67f77b0ac11244021cfa3f7dfd9e562eefcea2df711548", size = 126514, upload-time = "2025-08-10T21:27:19.465Z" }, + { url = "https://files.pythonhosted.org/packages/25/d1/4843d3e8d46b072c12a38c97c57fab4608d36e13fe47d47ee96b4d61ba6f/kiwisolver-1.4.9-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2489e4e5d7ef9a1c300a5e0196e43d9c739f066ef23270607d45aba368b91f2d", size = 67905, upload-time = "2025-08-10T21:27:20.51Z" }, + { url = "https://files.pythonhosted.org/packages/8c/ae/29ffcbd239aea8b93108de1278271ae764dfc0d803a5693914975f200596/kiwisolver-1.4.9-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:e2ea9f7ab7fbf18fffb1b5434ce7c69a07582f7acc7717720f1d69f3e806f90c", size = 66399, upload-time = "2025-08-10T21:27:21.496Z" }, + { url = 
"https://files.pythonhosted.org/packages/a1/ae/d7ba902aa604152c2ceba5d352d7b62106bedbccc8e95c3934d94472bfa3/kiwisolver-1.4.9-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:b34e51affded8faee0dfdb705416153819d8ea9250bbbf7ea1b249bdeb5f1122", size = 1582197, upload-time = "2025-08-10T21:27:22.604Z" }, + { url = "https://files.pythonhosted.org/packages/f2/41/27c70d427eddb8bc7e4f16420a20fefc6f480312122a59a959fdfe0445ad/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d8aacd3d4b33b772542b2e01beb50187536967b514b00003bdda7589722d2a64", size = 1390125, upload-time = "2025-08-10T21:27:24.036Z" }, + { url = "https://files.pythonhosted.org/packages/41/42/b3799a12bafc76d962ad69083f8b43b12bf4fe78b097b12e105d75c9b8f1/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:7cf974dd4e35fa315563ac99d6287a1024e4dc2077b8a7d7cd3d2fb65d283134", size = 1402612, upload-time = "2025-08-10T21:27:25.773Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b5/a210ea073ea1cfaca1bb5c55a62307d8252f531beb364e18aa1e0888b5a0/kiwisolver-1.4.9-cp314-cp314t-manylinux_2_24_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85bd218b5ecfbee8c8a82e121802dcb519a86044c9c3b2e4aef02fa05c6da370", size = 1453990, upload-time = "2025-08-10T21:27:27.089Z" }, + { url = "https://files.pythonhosted.org/packages/5f/ce/a829eb8c033e977d7ea03ed32fb3c1781b4fa0433fbadfff29e39c676f32/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:0856e241c2d3df4efef7c04a1e46b1936b6120c9bcf36dd216e3acd84bc4fb21", size = 2331601, upload-time = "2025-08-10T21:27:29.343Z" }, + { url = "https://files.pythonhosted.org/packages/e0/4b/b5e97eb142eb9cd0072dacfcdcd31b1c66dc7352b0f7c7255d339c0edf00/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:9af39d6551f97d31a4deebeac6f45b156f9755ddc59c07b402c148f5dbb6482a", size = 2422041, upload-time = "2025-08-10T21:27:30.754Z" }, + { url = 
"https://files.pythonhosted.org/packages/40/be/8eb4cd53e1b85ba4edc3a9321666f12b83113a178845593307a3e7891f44/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:bb4ae2b57fc1d8cbd1cf7b1d9913803681ffa903e7488012be5b76dedf49297f", size = 2594897, upload-time = "2025-08-10T21:27:32.803Z" }, + { url = "https://files.pythonhosted.org/packages/99/dd/841e9a66c4715477ea0abc78da039832fbb09dac5c35c58dc4c41a407b8a/kiwisolver-1.4.9-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:aedff62918805fb62d43a4aa2ecd4482c380dc76cd31bd7c8878588a61bd0369", size = 2391835, upload-time = "2025-08-10T21:27:34.23Z" }, + { url = "https://files.pythonhosted.org/packages/0c/28/4b2e5c47a0da96896fdfdb006340ade064afa1e63675d01ea5ac222b6d52/kiwisolver-1.4.9-cp314-cp314t-win_amd64.whl", hash = "sha256:1fa333e8b2ce4d9660f2cda9c0e1b6bafcfb2457a9d259faa82289e73ec24891", size = 79988, upload-time = "2025-08-10T21:27:35.587Z" }, + { url = "https://files.pythonhosted.org/packages/80/be/3578e8afd18c88cdf9cb4cffde75a96d2be38c5a903f1ed0ceec061bd09e/kiwisolver-1.4.9-cp314-cp314t-win_arm64.whl", hash = "sha256:4a48a2ce79d65d363597ef7b567ce3d14d68783d2b2263d98db3d9477805ba32", size = 70260, upload-time = "2025-08-10T21:27:36.606Z" }, + { url = "https://files.pythonhosted.org/packages/a2/63/fde392691690f55b38d5dd7b3710f5353bf7a8e52de93a22968801ab8978/kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:4d1d9e582ad4d63062d34077a9a1e9f3c34088a2ec5135b1f7190c07cf366527", size = 60183, upload-time = "2025-08-10T21:27:37.669Z" }, + { url = "https://files.pythonhosted.org/packages/27/b1/6aad34edfdb7cced27f371866f211332bba215bfd918ad3322a58f480d8b/kiwisolver-1.4.9-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:deed0c7258ceb4c44ad5ec7d9918f9f14fd05b2be86378d86cf50e63d1e7b771", size = 58675, upload-time = "2025-08-10T21:27:39.031Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/1a/23d855a702bb35a76faed5ae2ba3de57d323f48b1f6b17ee2176c4849463/kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:0a590506f303f512dff6b7f75fd2fd18e16943efee932008fe7140e5fa91d80e", size = 80277, upload-time = "2025-08-10T21:27:40.129Z" }, + { url = "https://files.pythonhosted.org/packages/5a/5b/5239e3c2b8fb5afa1e8508f721bb77325f740ab6994d963e61b2b7abcc1e/kiwisolver-1.4.9-pp310-pypy310_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e09c2279a4d01f099f52d5c4b3d9e208e91edcbd1a175c9662a8b16e000fece9", size = 77994, upload-time = "2025-08-10T21:27:41.181Z" }, + { url = "https://files.pythonhosted.org/packages/f9/1c/5d4d468fb16f8410e596ed0eac02d2c68752aa7dc92997fe9d60a7147665/kiwisolver-1.4.9-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c9e7cdf45d594ee04d5be1b24dd9d49f3d1590959b2271fb30b5ca2b262c00fb", size = 73744, upload-time = "2025-08-10T21:27:42.254Z" }, + { url = "https://files.pythonhosted.org/packages/a3/0f/36d89194b5a32c054ce93e586d4049b6c2c22887b0eb229c61c68afd3078/kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:720e05574713db64c356e86732c0f3c5252818d05f9df320f0ad8380641acea5", size = 60104, upload-time = "2025-08-10T21:27:43.287Z" }, + { url = "https://files.pythonhosted.org/packages/52/ba/4ed75f59e4658fd21fe7dde1fee0ac397c678ec3befba3fe6482d987af87/kiwisolver-1.4.9-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:17680d737d5335b552994a2008fab4c851bcd7de33094a82067ef3a576ff02fa", size = 58592, upload-time = "2025-08-10T21:27:44.314Z" }, + { url = "https://files.pythonhosted.org/packages/33/01/a8ea7c5ea32a9b45ceeaee051a04c8ed4320f5add3c51bfa20879b765b70/kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:85b5352f94e490c028926ea567fc569c52ec79ce131dadb968d3853e809518c2", size = 80281, upload-time = "2025-08-10T21:27:45.369Z" }, + { url = 
"https://files.pythonhosted.org/packages/da/e3/dbd2ecdce306f1d07a1aaf324817ee993aab7aee9db47ceac757deabafbe/kiwisolver-1.4.9-pp311-pypy311_pp73-manylinux_2_24_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:464415881e4801295659462c49461a24fb107c140de781d55518c4b80cb6790f", size = 78009, upload-time = "2025-08-10T21:27:46.376Z" }, + { url = "https://files.pythonhosted.org/packages/da/e9/0d4add7873a73e462aeb45c036a2dead2562b825aa46ba326727b3f31016/kiwisolver-1.4.9-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:fb940820c63a9590d31d88b815e7a3aa5915cad3ce735ab45f0c730b39547de1", size = 73929, upload-time = "2025-08-10T21:27:48.236Z" }, +] + [[package]] name = "markupsafe" version = "3.0.2" @@ -604,6 +1255,92 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4f/65/6079a46068dfceaeabb5dcad6d674f5f5c61a6fa5673746f42a9f4c233b3/MarkupSafe-3.0.2-cp313-cp313t-win_amd64.whl", hash = "sha256:e444a31f8db13eb18ada366ab3cf45fd4b31e4db1236a4448f68778c1d1a5a2f", size = 15739, upload-time = "2024-10-18T15:21:42.784Z" }, ] +[[package]] +name = "matplotlib" +version = "3.10.8" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "contourpy", version = "1.3.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "contourpy", version = "1.3.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "cycler" }, + { name = "fonttools" }, + { name = "kiwisolver" }, + { name = "numpy" }, + { name = "packaging" }, + { name = "pillow" }, + { name = "pyparsing" }, + { name = "python-dateutil" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/8a/76/d3c6e3a13fe484ebe7718d14e269c9569c4eb0020a968a327acb3b9a8fe6/matplotlib-3.10.8.tar.gz", hash = 
"sha256:2299372c19d56bcd35cf05a2738308758d32b9eaed2371898d8f5bd33f084aa3", size = 34806269, upload-time = "2025-12-10T22:56:51.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/58/be/a30bd917018ad220c400169fba298f2bb7003c8ccbc0c3e24ae2aacad1e8/matplotlib-3.10.8-cp310-cp310-macosx_10_12_x86_64.whl", hash = "sha256:00270d217d6b20d14b584c521f810d60c5c78406dc289859776550df837dcda7", size = 8239828, upload-time = "2025-12-10T22:55:02.313Z" }, + { url = "https://files.pythonhosted.org/packages/58/27/ca01e043c4841078e82cf6e80a6993dfecd315c3d79f5f3153afbb8e1ec6/matplotlib-3.10.8-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:37b3c1cc42aa184b3f738cfa18c1c1d72fd496d85467a6cf7b807936d39aa656", size = 8128050, upload-time = "2025-12-10T22:55:04.997Z" }, + { url = "https://files.pythonhosted.org/packages/cb/aa/7ab67f2b729ae6a91bcf9dcac0affb95fb8c56f7fd2b2af894ae0b0cf6fa/matplotlib-3.10.8-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ee40c27c795bda6a5292e9cff9890189d32f7e3a0bf04e0e3c9430c4a00c37df", size = 8700452, upload-time = "2025-12-10T22:55:07.47Z" }, + { url = "https://files.pythonhosted.org/packages/73/ae/2d5817b0acee3c49b7e7ccfbf5b273f284957cc8e270adf36375db353190/matplotlib-3.10.8-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a48f2b74020919552ea25d222d5cc6af9ca3f4eb43a93e14d068457f545c2a17", size = 9534928, upload-time = "2025-12-10T22:55:10.566Z" }, + { url = "https://files.pythonhosted.org/packages/c9/5b/8e66653e9f7c39cb2e5cab25fce4810daffa2bff02cbf5f3077cea9e942c/matplotlib-3.10.8-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:f254d118d14a7f99d616271d6c3c27922c092dac11112670b157798b89bf4933", size = 9586377, upload-time = "2025-12-10T22:55:12.362Z" }, + { url = "https://files.pythonhosted.org/packages/e2/e2/fd0bbadf837f81edb0d208ba8f8cb552874c3b16e27cb91a31977d90875d/matplotlib-3.10.8-cp310-cp310-win_amd64.whl", hash = 
"sha256:f9b587c9c7274c1613a30afabf65a272114cd6cdbe67b3406f818c79d7ab2e2a", size = 8128127, upload-time = "2025-12-10T22:55:14.436Z" }, + { url = "https://files.pythonhosted.org/packages/f8/86/de7e3a1cdcfc941483af70609edc06b83e7c8a0e0dc9ac325200a3f4d220/matplotlib-3.10.8-cp311-cp311-macosx_10_12_x86_64.whl", hash = "sha256:6be43b667360fef5c754dda5d25a32e6307a03c204f3c0fc5468b78fa87b4160", size = 8251215, upload-time = "2025-12-10T22:55:16.175Z" }, + { url = "https://files.pythonhosted.org/packages/fd/14/baad3222f424b19ce6ad243c71de1ad9ec6b2e4eb1e458a48fdc6d120401/matplotlib-3.10.8-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a2b336e2d91a3d7006864e0990c83b216fcdca64b5a6484912902cef87313d78", size = 8139625, upload-time = "2025-12-10T22:55:17.712Z" }, + { url = "https://files.pythonhosted.org/packages/8f/a0/7024215e95d456de5883e6732e708d8187d9753a21d32f8ddb3befc0c445/matplotlib-3.10.8-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:efb30e3baaea72ce5928e32bab719ab4770099079d66726a62b11b1ef7273be4", size = 8712614, upload-time = "2025-12-10T22:55:20.8Z" }, + { url = "https://files.pythonhosted.org/packages/5a/f4/b8347351da9a5b3f41e26cf547252d861f685c6867d179a7c9d60ad50189/matplotlib-3.10.8-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d56a1efd5bfd61486c8bc968fa18734464556f0fb8e51690f4ac25d85cbbbbc2", size = 9540997, upload-time = "2025-12-10T22:55:23.258Z" }, + { url = "https://files.pythonhosted.org/packages/9e/c0/c7b914e297efe0bc36917bf216b2acb91044b91e930e878ae12981e461e5/matplotlib-3.10.8-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:238b7ce5717600615c895050239ec955d91f321c209dd110db988500558e70d6", size = 9596825, upload-time = "2025-12-10T22:55:25.217Z" }, + { url = "https://files.pythonhosted.org/packages/6f/d3/a4bbc01c237ab710a1f22b4da72f4ff6d77eb4c7735ea9811a94ae239067/matplotlib-3.10.8-cp311-cp311-win_amd64.whl", hash = 
"sha256:18821ace09c763ec93aef5eeff087ee493a24051936d7b9ebcad9662f66501f9", size = 8135090, upload-time = "2025-12-10T22:55:27.162Z" }, + { url = "https://files.pythonhosted.org/packages/89/dd/a0b6588f102beab33ca6f5218b31725216577b2a24172f327eaf6417d5c9/matplotlib-3.10.8-cp311-cp311-win_arm64.whl", hash = "sha256:bab485bcf8b1c7d2060b4fcb6fc368a9e6f4cd754c9c2fea281f4be21df394a2", size = 8012377, upload-time = "2025-12-10T22:55:29.185Z" }, + { url = "https://files.pythonhosted.org/packages/9e/67/f997cdcbb514012eb0d10cd2b4b332667997fb5ebe26b8d41d04962fa0e6/matplotlib-3.10.8-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:64fcc24778ca0404ce0cb7b6b77ae1f4c7231cdd60e6778f999ee05cbd581b9a", size = 8260453, upload-time = "2025-12-10T22:55:30.709Z" }, + { url = "https://files.pythonhosted.org/packages/7e/65/07d5f5c7f7c994f12c768708bd2e17a4f01a2b0f44a1c9eccad872433e2e/matplotlib-3.10.8-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b9a5ca4ac220a0cdd1ba6bcba3608547117d30468fefce49bb26f55c1a3d5c58", size = 8148321, upload-time = "2025-12-10T22:55:33.265Z" }, + { url = "https://files.pythonhosted.org/packages/3e/f3/c5195b1ae57ef85339fd7285dfb603b22c8b4e79114bae5f4f0fcf688677/matplotlib-3.10.8-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:3ab4aabc72de4ff77b3ec33a6d78a68227bf1123465887f9905ba79184a1cc04", size = 8716944, upload-time = "2025-12-10T22:55:34.922Z" }, + { url = "https://files.pythonhosted.org/packages/00/f9/7638f5cc82ec8a7aa005de48622eecc3ed7c9854b96ba15bd76b7fd27574/matplotlib-3.10.8-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:24d50994d8c5816ddc35411e50a86ab05f575e2530c02752e02538122613371f", size = 9550099, upload-time = "2025-12-10T22:55:36.789Z" }, + { url = "https://files.pythonhosted.org/packages/57/61/78cd5920d35b29fd2a0fe894de8adf672ff52939d2e9b43cb83cd5ce1bc7/matplotlib-3.10.8-cp312-cp312-musllinux_1_2_x86_64.whl", hash = 
"sha256:99eefd13c0dc3b3c1b4d561c1169e65fe47aab7b8158754d7c084088e2329466", size = 9613040, upload-time = "2025-12-10T22:55:38.715Z" }, + { url = "https://files.pythonhosted.org/packages/30/4e/c10f171b6e2f44d9e3a2b96efa38b1677439d79c99357600a62cc1e9594e/matplotlib-3.10.8-cp312-cp312-win_amd64.whl", hash = "sha256:dd80ecb295460a5d9d260df63c43f4afbdd832d725a531f008dad1664f458adf", size = 8142717, upload-time = "2025-12-10T22:55:41.103Z" }, + { url = "https://files.pythonhosted.org/packages/f1/76/934db220026b5fef85f45d51a738b91dea7d70207581063cd9bd8fafcf74/matplotlib-3.10.8-cp312-cp312-win_arm64.whl", hash = "sha256:3c624e43ed56313651bc18a47f838b60d7b8032ed348911c54906b130b20071b", size = 8012751, upload-time = "2025-12-10T22:55:42.684Z" }, + { url = "https://files.pythonhosted.org/packages/3d/b9/15fd5541ef4f5b9a17eefd379356cf12175fe577424e7b1d80676516031a/matplotlib-3.10.8-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3f2e409836d7f5ac2f1c013110a4d50b9f7edc26328c108915f9075d7d7a91b6", size = 8261076, upload-time = "2025-12-10T22:55:44.648Z" }, + { url = "https://files.pythonhosted.org/packages/8d/a0/2ba3473c1b66b9c74dc7107c67e9008cb1782edbe896d4c899d39ae9cf78/matplotlib-3.10.8-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:56271f3dac49a88d7fca5060f004d9d22b865f743a12a23b1e937a0be4818ee1", size = 8148794, upload-time = "2025-12-10T22:55:46.252Z" }, + { url = "https://files.pythonhosted.org/packages/75/97/a471f1c3eb1fd6f6c24a31a5858f443891d5127e63a7788678d14e249aea/matplotlib-3.10.8-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:a0a7f52498f72f13d4a25ea70f35f4cb60642b466cbb0a9be951b5bc3f45a486", size = 8718474, upload-time = "2025-12-10T22:55:47.864Z" }, + { url = "https://files.pythonhosted.org/packages/01/be/cd478f4b66f48256f42927d0acbcd63a26a893136456cd079c0cc24fbabf/matplotlib-3.10.8-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:646d95230efb9ca614a7a594d4fcacde0ac61d25e37dd51710b36477594963ce", 
size = 9549637, upload-time = "2025-12-10T22:55:50.048Z" }, + { url = "https://files.pythonhosted.org/packages/5d/7c/8dc289776eae5109e268c4fb92baf870678dc048a25d4ac903683b86d5bf/matplotlib-3.10.8-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f89c151aab2e2e23cb3fe0acad1e8b82841fd265379c4cecd0f3fcb34c15e0f6", size = 9613678, upload-time = "2025-12-10T22:55:52.21Z" }, + { url = "https://files.pythonhosted.org/packages/64/40/37612487cc8a437d4dd261b32ca21fe2d79510fe74af74e1f42becb1bdb8/matplotlib-3.10.8-cp313-cp313-win_amd64.whl", hash = "sha256:e8ea3e2d4066083e264e75c829078f9e149fa119d27e19acd503de65e0b13149", size = 8142686, upload-time = "2025-12-10T22:55:54.253Z" }, + { url = "https://files.pythonhosted.org/packages/66/52/8d8a8730e968185514680c2a6625943f70269509c3dcfc0dcf7d75928cb8/matplotlib-3.10.8-cp313-cp313-win_arm64.whl", hash = "sha256:c108a1d6fa78a50646029cb6d49808ff0fc1330fda87fa6f6250c6b5369b6645", size = 8012917, upload-time = "2025-12-10T22:55:56.268Z" }, + { url = "https://files.pythonhosted.org/packages/b5/27/51fe26e1062f298af5ef66343d8ef460e090a27fea73036c76c35821df04/matplotlib-3.10.8-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:ad3d9833a64cf48cc4300f2b406c3d0f4f4724a91c0bd5640678a6ba7c102077", size = 8305679, upload-time = "2025-12-10T22:55:57.856Z" }, + { url = "https://files.pythonhosted.org/packages/2c/1e/4de865bc591ac8e3062e835f42dd7fe7a93168d519557837f0e37513f629/matplotlib-3.10.8-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:eb3823f11823deade26ce3b9f40dcb4a213da7a670013929f31d5f5ed1055b22", size = 8198336, upload-time = "2025-12-10T22:55:59.371Z" }, + { url = "https://files.pythonhosted.org/packages/c6/cb/2f7b6e75fb4dce87ef91f60cac4f6e34f4c145ab036a22318ec837971300/matplotlib-3.10.8-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d9050fee89a89ed57b4fb2c1bfac9a3d0c57a0d55aed95949eedbc42070fea39", size = 8731653, upload-time = "2025-12-10T22:56:01.032Z" }, + { url = 
"https://files.pythonhosted.org/packages/46/b3/bd9c57d6ba670a37ab31fb87ec3e8691b947134b201f881665b28cc039ff/matplotlib-3.10.8-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b44d07310e404ba95f8c25aa5536f154c0a8ec473303535949e52eb71d0a1565", size = 9561356, upload-time = "2025-12-10T22:56:02.95Z" }, + { url = "https://files.pythonhosted.org/packages/c0/3d/8b94a481456dfc9dfe6e39e93b5ab376e50998cddfd23f4ae3b431708f16/matplotlib-3.10.8-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:0a33deb84c15ede243aead39f77e990469fff93ad1521163305095b77b72ce4a", size = 9614000, upload-time = "2025-12-10T22:56:05.411Z" }, + { url = "https://files.pythonhosted.org/packages/bd/cd/bc06149fe5585ba800b189a6a654a75f1f127e8aab02fd2be10df7fa500c/matplotlib-3.10.8-cp313-cp313t-win_amd64.whl", hash = "sha256:3a48a78d2786784cc2413e57397981fb45c79e968d99656706018d6e62e57958", size = 8220043, upload-time = "2025-12-10T22:56:07.551Z" }, + { url = "https://files.pythonhosted.org/packages/e3/de/b22cf255abec916562cc04eef457c13e58a1990048de0c0c3604d082355e/matplotlib-3.10.8-cp313-cp313t-win_arm64.whl", hash = "sha256:15d30132718972c2c074cd14638c7f4592bd98719e2308bccea40e0538bc0cb5", size = 8062075, upload-time = "2025-12-10T22:56:09.178Z" }, + { url = "https://files.pythonhosted.org/packages/3c/43/9c0ff7a2f11615e516c3b058e1e6e8f9614ddeca53faca06da267c48345d/matplotlib-3.10.8-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:b53285e65d4fa4c86399979e956235deb900be5baa7fc1218ea67fbfaeaadd6f", size = 8262481, upload-time = "2025-12-10T22:56:10.885Z" }, + { url = "https://files.pythonhosted.org/packages/6f/ca/e8ae28649fcdf039fda5ef554b40a95f50592a3c47e6f7270c9561c12b07/matplotlib-3.10.8-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:32f8dce744be5569bebe789e46727946041199030db8aeb2954d26013a0eb26b", size = 8151473, upload-time = "2025-12-10T22:56:12.377Z" }, + { url = 
"https://files.pythonhosted.org/packages/f1/6f/009d129ae70b75e88cbe7e503a12a4c0670e08ed748a902c2568909e9eb5/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4cf267add95b1c88300d96ca837833d4112756045364f5c734a2276038dae27d", size = 9553896, upload-time = "2025-12-10T22:56:14.432Z" }, + { url = "https://files.pythonhosted.org/packages/f5/26/4221a741eb97967bc1fd5e4c52b9aa5a91b2f4ec05b59f6def4d820f9df9/matplotlib-3.10.8-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2cf5bd12cecf46908f286d7838b2abc6c91cda506c0445b8223a7c19a00df008", size = 9824193, upload-time = "2025-12-10T22:56:16.29Z" }, + { url = "https://files.pythonhosted.org/packages/1f/f3/3abf75f38605772cf48a9daf5821cd4f563472f38b4b828c6fba6fa6d06e/matplotlib-3.10.8-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:41703cc95688f2516b480f7f339d8851a6035f18e100ee6a32bc0b8536a12a9c", size = 9615444, upload-time = "2025-12-10T22:56:18.155Z" }, + { url = "https://files.pythonhosted.org/packages/93/a5/de89ac80f10b8dc615807ee1133cd99ac74082581196d4d9590bea10690d/matplotlib-3.10.8-cp314-cp314-win_amd64.whl", hash = "sha256:83d282364ea9f3e52363da262ce32a09dfe241e4080dcedda3c0db059d3c1f11", size = 8272719, upload-time = "2025-12-10T22:56:20.366Z" }, + { url = "https://files.pythonhosted.org/packages/69/ce/b006495c19ccc0a137b48083168a37bd056392dee02f87dba0472f2797fe/matplotlib-3.10.8-cp314-cp314-win_arm64.whl", hash = "sha256:2c1998e92cd5999e295a731bcb2911c75f597d937341f3030cc24ef2733d78a8", size = 8144205, upload-time = "2025-12-10T22:56:22.239Z" }, + { url = "https://files.pythonhosted.org/packages/68/d9/b31116a3a855bd313c6fcdb7226926d59b041f26061c6c5b1be66a08c826/matplotlib-3.10.8-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:b5a2b97dbdc7d4f353ebf343744f1d1f1cca8aa8bfddb4262fcf4306c3761d50", size = 8305785, upload-time = "2025-12-10T22:56:24.218Z" }, + { url = 
"https://files.pythonhosted.org/packages/1e/90/6effe8103f0272685767ba5f094f453784057072f49b393e3ea178fe70a5/matplotlib-3.10.8-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:3f5c3e4da343bba819f0234186b9004faba952cc420fbc522dc4e103c1985908", size = 8198361, upload-time = "2025-12-10T22:56:26.787Z" }, + { url = "https://files.pythonhosted.org/packages/d7/65/a73188711bea603615fc0baecca1061429ac16940e2385433cc778a9d8e7/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5f62550b9a30afde8c1c3ae450e5eb547d579dd69b25c2fc7a1c67f934c1717a", size = 9561357, upload-time = "2025-12-10T22:56:28.953Z" }, + { url = "https://files.pythonhosted.org/packages/f4/3d/b5c5d5d5be8ce63292567f0e2c43dde9953d3ed86ac2de0a72e93c8f07a1/matplotlib-3.10.8-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:495672de149445ec1b772ff2c9ede9b769e3cb4f0d0aa7fa730d7f59e2d4e1c1", size = 9823610, upload-time = "2025-12-10T22:56:31.455Z" }, + { url = "https://files.pythonhosted.org/packages/4d/4b/e7beb6bbd49f6bae727a12b270a2654d13c397576d25bd6786e47033300f/matplotlib-3.10.8-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:595ba4d8fe983b88f0eec8c26a241e16d6376fe1979086232f481f8f3f67494c", size = 9614011, upload-time = "2025-12-10T22:56:33.85Z" }, + { url = "https://files.pythonhosted.org/packages/7c/e6/76f2813d31f032e65f6f797e3f2f6e4aab95b65015924b1c51370395c28a/matplotlib-3.10.8-cp314-cp314t-win_amd64.whl", hash = "sha256:25d380fe8b1dc32cf8f0b1b448470a77afb195438bafdf1d858bfb876f3edf7b", size = 8362801, upload-time = "2025-12-10T22:56:36.107Z" }, + { url = "https://files.pythonhosted.org/packages/5d/49/d651878698a0b67f23aa28e17f45a6d6dd3d3f933fa29087fa4ce5947b5a/matplotlib-3.10.8-cp314-cp314t-win_arm64.whl", hash = "sha256:113bb52413ea508ce954a02c10ffd0d565f9c3bc7f2eddc27dfe1731e71c7b5f", size = 8192560, upload-time = "2025-12-10T22:56:38.008Z" }, + { url = 
"https://files.pythonhosted.org/packages/f5/43/31d59500bb950b0d188e149a2e552040528c13d6e3d6e84d0cccac593dcd/matplotlib-3.10.8-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:f97aeb209c3d2511443f8797e3e5a569aebb040d4f8bc79aa3ee78a8fb9e3dd8", size = 8237252, upload-time = "2025-12-10T22:56:39.529Z" }, + { url = "https://files.pythonhosted.org/packages/0c/2c/615c09984f3c5f907f51c886538ad785cf72e0e11a3225de2c0f9442aecc/matplotlib-3.10.8-pp310-pypy310_pp73-macosx_11_0_arm64.whl", hash = "sha256:fb061f596dad3a0f52b60dc6a5dec4a0c300dec41e058a7efe09256188d170b7", size = 8124693, upload-time = "2025-12-10T22:56:41.758Z" }, + { url = "https://files.pythonhosted.org/packages/91/e1/2757277a1c56041e1fc104b51a0f7b9a4afc8eb737865d63cababe30bc61/matplotlib-3.10.8-pp310-pypy310_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:12d90df9183093fcd479f4172ac26b322b1248b15729cb57f42f71f24c7e37a3", size = 8702205, upload-time = "2025-12-10T22:56:43.415Z" }, + { url = "https://files.pythonhosted.org/packages/04/30/3afaa31c757f34b7725ab9d2ba8b48b5e89c2019c003e7d0ead143aabc5a/matplotlib-3.10.8-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:6da7c2ce169267d0d066adcf63758f0604aa6c3eebf67458930f9d9b79ad1db1", size = 8249198, upload-time = "2025-12-10T22:56:45.584Z" }, + { url = "https://files.pythonhosted.org/packages/48/2f/6334aec331f57485a642a7c8be03cb286f29111ae71c46c38b363230063c/matplotlib-3.10.8-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:9153c3292705be9f9c64498a8872118540c3f4123d1a1c840172edf262c8be4a", size = 8136817, upload-time = "2025-12-10T22:56:47.339Z" }, + { url = "https://files.pythonhosted.org/packages/73/e4/6d6f14b2a759c622f191b2d67e9075a3f56aaccb3be4bb9bb6890030d0a0/matplotlib-3.10.8-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:1ae029229a57cd1e8fe542485f27e7ca7b23aa9e8944ddb4985d0bc444f1eca2", size = 8713867, upload-time = "2025-12-10T22:56:48.954Z" }, +] + +[[package]] +name = 
"matplotlib-inline" +version = "0.2.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "traitlets" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c7/74/97e72a36efd4ae2bccb3463284300f8953f199b5ffbc04cbbb0ec78f74b1/matplotlib_inline-0.2.1.tar.gz", hash = "sha256:e1ee949c340d771fc39e241ea75683deb94762c8fa5f2927ec57c83c4dffa9fe", size = 8110, upload-time = "2025-10-23T09:00:22.126Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/33/ee4519fa02ed11a94aef9559552f3b17bb863f2ecfe1a35dc7f548cde231/matplotlib_inline-0.2.1-py3-none-any.whl", hash = "sha256:d56ce5156ba6085e00a9d54fead6ed29a9c47e215cd1bba2e976ef39f5710a76", size = 9516, upload-time = "2025-10-23T09:00:20.675Z" }, +] + [[package]] name = "mpmath" version = "1.3.0" @@ -740,6 +1477,8 @@ source = { virtual = "." } dependencies = [ { name = "datasets" }, { name = "fastapi" }, + { name = "ipykernel" }, + { name = "matplotlib" }, { name = "psutil" }, { name = "python-dotenv" }, { name = "regex" }, @@ -775,6 +1514,8 @@ dev = [ requires-dist = [ { name = "datasets", specifier = ">=4.0.0" }, { name = "fastapi", specifier = ">=0.117.1" }, + { name = "ipykernel", specifier = ">=7.1.0" }, + { name = "matplotlib", specifier = ">=3.10.8" }, { name = "psutil", specifier = ">=7.1.0" }, { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "regex", specifier = ">=2025.9.1" }, @@ -782,7 +1523,7 @@ requires-dist = [ { name = "setuptools", specifier = ">=80.9.0" }, { name = "tiktoken", specifier = ">=0.11.0" }, { name = "tokenizers", specifier = ">=0.22.0" }, - { name = "torch", specifier = ">=2.8.0" }, + { name = "torch", specifier = ">=2.9.0" }, { name = "torch", marker = "extra == 'cpu'", specifier = ">=2.9.1", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "nanochat", extra = "cpu" } }, { name = "torch", marker = "extra == 'gpu'", specifier = ">=2.9.1", index = "https://download.pytorch.org/whl/cu128", conflict = { package = 
"nanochat", extra = "gpu" } }, { name = "transformers", specifier = ">=4.57.3" }, @@ -794,6 +1535,15 @@ provides-extras = ["cpu", "gpu"] [package.metadata.requires-dev] dev = [{ name = "pytest", specifier = ">=8.0.0" }] +[[package]] +name = "nest-asyncio" +version = "1.6.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/83/f8/51569ac65d696c8ecbee95938f89d4abf00f47d58d48f6fbabfe8f0baefe/nest_asyncio-1.6.0.tar.gz", hash = "sha256:6f172d5449aca15afd6c646851f4e31e02c598d553a667e38cafa997cfec55fe", size = 7418, upload-time = "2024-01-21T14:25:19.227Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/c4/c2971a3ba4c6103a3d10c4b0f24f461ddc027f0f09763220cf35ca1401b3/nest_asyncio-1.6.0-py3-none-any.whl", hash = "sha256:87af6efd6b5e897c81050477ef65c62e2b2f35d51703cae01aff2905b1852e1c", size = 5195, upload-time = "2024-01-21T14:25:17.223Z" }, +] + [[package]] name = "networkx" version = "3.4.2" @@ -1087,6 +1837,125 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cd/d7/612123674d7b17cf345aad0a10289b2a384bff404e0463a83c4a3a59d205/pandas-2.3.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:d2c3554bd31b731cd6490d94a28f3abb8dd770634a9e06eb6d2911b9827db370", size = 13186141, upload-time = "2025-08-21T10:28:05.377Z" }, ] +[[package]] +name = "parso" +version = "0.8.5" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d4/de/53e0bcf53d13e005bd8c92e7855142494f41171b34c2536b86187474184d/parso-0.8.5.tar.gz", hash = "sha256:034d7354a9a018bdce352f48b2a8a450f05e9d6ee85db84764e9b6bd96dafe5a", size = 401205, upload-time = "2025-08-23T15:15:28.028Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/16/32/f8e3c85d1d5250232a5d3477a2a28cc291968ff175caeadaf3cc19ce0e4a/parso-0.8.5-py2.py3-none-any.whl", hash = "sha256:646204b5ee239c396d040b90f9e272e9a8017c630092bf59980beb62fd033887", size = 106668, upload-time = 
"2025-08-23T15:15:25.663Z" }, +] + +[[package]] +name = "pexpect" +version = "4.9.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "ptyprocess" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/42/92/cc564bf6381ff43ce1f4d06852fc19a2f11d180f23dc32d9588bee2f149d/pexpect-4.9.0.tar.gz", hash = "sha256:ee7d41123f3c9911050ea2c2dac107568dc43b2d3b0c7557a33212c398ead30f", size = 166450, upload-time = "2023-11-25T09:07:26.339Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9e/c3/059298687310d527a58bb01f3b1965787ee3b40dce76752eda8b44e9a2c5/pexpect-4.9.0-py2.py3-none-any.whl", hash = "sha256:7236d1e080e4936be2dc3e326cec0af72acf9212a7e1d060210e70a47e253523", size = 63772, upload-time = "2023-11-25T06:56:14.81Z" }, +] + +[[package]] +name = "pillow" +version = "12.1.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/d0/02/d52c733a2452ef1ffcc123b68e6606d07276b0e358db70eabad7e40042b7/pillow-12.1.0.tar.gz", hash = "sha256:5c5ae0a06e9ea030ab786b0251b32c7e4ce10e58d983c0d5c56029455180b5b9", size = 46977283, upload-time = "2026-01-02T09:13:29.892Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/fe/41/f73d92b6b883a579e79600d391f2e21cb0df767b2714ecbd2952315dfeef/pillow-12.1.0-cp310-cp310-macosx_10_10_x86_64.whl", hash = "sha256:fb125d860738a09d363a88daa0f59c4533529a90e564785e20fe875b200b6dbd", size = 5304089, upload-time = "2026-01-02T09:10:24.953Z" }, + { url = "https://files.pythonhosted.org/packages/94/55/7aca2891560188656e4a91ed9adba305e914a4496800da6b5c0a15f09edf/pillow-12.1.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:cad302dc10fac357d3467a74a9561c90609768a6f73a1923b0fd851b6486f8b0", size = 4657815, upload-time = "2026-01-02T09:10:27.063Z" }, + { url = 
"https://files.pythonhosted.org/packages/e9/d2/b28221abaa7b4c40b7dba948f0f6a708bd7342c4d47ce342f0ea39643974/pillow-12.1.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:a40905599d8079e09f25027423aed94f2823adaf2868940de991e53a449e14a8", size = 6222593, upload-time = "2026-01-02T09:10:29.115Z" }, + { url = "https://files.pythonhosted.org/packages/71/b8/7a61fb234df6a9b0b479f69e66901209d89ff72a435b49933f9122f94cac/pillow-12.1.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:92a7fe4225365c5e3a8e598982269c6d6698d3e783b3b1ae979e7819f9cd55c1", size = 8027579, upload-time = "2026-01-02T09:10:31.182Z" }, + { url = "https://files.pythonhosted.org/packages/ea/51/55c751a57cc524a15a0e3db20e5cde517582359508d62305a627e77fd295/pillow-12.1.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f10c98f49227ed8383d28174ee95155a675c4ed7f85e2e573b04414f7e371bda", size = 6335760, upload-time = "2026-01-02T09:10:33.02Z" }, + { url = "https://files.pythonhosted.org/packages/dc/7c/60e3e6f5e5891a1a06b4c910f742ac862377a6fe842f7184df4a274ce7bf/pillow-12.1.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:8637e29d13f478bc4f153d8daa9ffb16455f0a6cb287da1b432fdad2bfbd66c7", size = 7027127, upload-time = "2026-01-02T09:10:35.009Z" }, + { url = "https://files.pythonhosted.org/packages/06/37/49d47266ba50b00c27ba63a7c898f1bb41a29627ced8c09e25f19ebec0ff/pillow-12.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:21e686a21078b0f9cb8c8a961d99e6a4ddb88e0fc5ea6e130172ddddc2e5221a", size = 6449896, upload-time = "2026-01-02T09:10:36.793Z" }, + { url = "https://files.pythonhosted.org/packages/f9/e5/67fd87d2913902462cd9b79c6211c25bfe95fcf5783d06e1367d6d9a741f/pillow-12.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:2415373395a831f53933c23ce051021e79c8cd7979822d8cc478547a3f4da8ef", size = 7151345, upload-time = "2026-01-02T09:10:39.064Z" }, + { url = 
"https://files.pythonhosted.org/packages/bd/15/f8c7abf82af68b29f50d77c227e7a1f87ce02fdc66ded9bf603bc3b41180/pillow-12.1.0-cp310-cp310-win32.whl", hash = "sha256:e75d3dba8fc1ddfec0cd752108f93b83b4f8d6ab40e524a95d35f016b9683b09", size = 6325568, upload-time = "2026-01-02T09:10:41.035Z" }, + { url = "https://files.pythonhosted.org/packages/d4/24/7d1c0e160b6b5ac2605ef7d8be537e28753c0db5363d035948073f5513d7/pillow-12.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:64efdf00c09e31efd754448a383ea241f55a994fd079866b92d2bbff598aad91", size = 7032367, upload-time = "2026-01-02T09:10:43.09Z" }, + { url = "https://files.pythonhosted.org/packages/f4/03/41c038f0d7a06099254c60f618d0ec7be11e79620fc23b8e85e5b31d9a44/pillow-12.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:f188028b5af6b8fb2e9a76ac0f841a575bd1bd396e46ef0840d9b88a48fdbcea", size = 2452345, upload-time = "2026-01-02T09:10:44.795Z" }, + { url = "https://files.pythonhosted.org/packages/43/c4/bf8328039de6cc22182c3ef007a2abfbbdab153661c0a9aa78af8d706391/pillow-12.1.0-cp311-cp311-macosx_10_10_x86_64.whl", hash = "sha256:a83e0850cb8f5ac975291ebfc4170ba481f41a28065277f7f735c202cd8e0af3", size = 5304057, upload-time = "2026-01-02T09:10:46.627Z" }, + { url = "https://files.pythonhosted.org/packages/43/06/7264c0597e676104cc22ca73ee48f752767cd4b1fe084662620b17e10120/pillow-12.1.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:b6e53e82ec2db0717eabb276aa56cf4e500c9a7cec2c2e189b55c24f65a3e8c0", size = 4657811, upload-time = "2026-01-02T09:10:49.548Z" }, + { url = "https://files.pythonhosted.org/packages/72/64/f9189e44474610daf83da31145fa56710b627b5c4c0b9c235e34058f6b31/pillow-12.1.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:40a8e3b9e8773876d6e30daed22f016509e3987bab61b3b7fe309d7019a87451", size = 6232243, upload-time = "2026-01-02T09:10:51.62Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/30/0df458009be6a4caca4ca2c52975e6275c387d4e5c95544e34138b41dc86/pillow-12.1.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:800429ac32c9b72909c671aaf17ecd13110f823ddb7db4dfef412a5587c2c24e", size = 8037872, upload-time = "2026-01-02T09:10:53.446Z" }, + { url = "https://files.pythonhosted.org/packages/e4/86/95845d4eda4f4f9557e25381d70876aa213560243ac1a6d619c46caaedd9/pillow-12.1.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0b022eaaf709541b391ee069f0022ee5b36c709df71986e3f7be312e46f42c84", size = 6345398, upload-time = "2026-01-02T09:10:55.426Z" }, + { url = "https://files.pythonhosted.org/packages/5c/1f/8e66ab9be3aaf1435bc03edd1ebdf58ffcd17f7349c1d970cafe87af27d9/pillow-12.1.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1f345e7bc9d7f368887c712aa5054558bad44d2a301ddf9248599f4161abc7c0", size = 7034667, upload-time = "2026-01-02T09:10:57.11Z" }, + { url = "https://files.pythonhosted.org/packages/f9/f6/683b83cb9b1db1fb52b87951b1c0b99bdcfceaa75febf11406c19f82cb5e/pillow-12.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:d70347c8a5b7ccd803ec0c85c8709f036e6348f1e6a5bf048ecd9c64d3550b8b", size = 6458743, upload-time = "2026-01-02T09:10:59.331Z" }, + { url = "https://files.pythonhosted.org/packages/9a/7d/de833d63622538c1d58ce5395e7c6cb7e7dce80decdd8bde4a484e095d9f/pillow-12.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:1fcc52d86ce7a34fd17cb04e87cfdb164648a3662a6f20565910a99653d66c18", size = 7159342, upload-time = "2026-01-02T09:11:01.82Z" }, + { url = "https://files.pythonhosted.org/packages/8c/40/50d86571c9e5868c42b81fe7da0c76ca26373f3b95a8dd675425f4a92ec1/pillow-12.1.0-cp311-cp311-win32.whl", hash = "sha256:3ffaa2f0659e2f740473bcf03c702c39a8d4b2b7ffc629052028764324842c64", size = 6328655, upload-time = "2026-01-02T09:11:04.556Z" }, + { url = 
"https://files.pythonhosted.org/packages/6c/af/b1d7e301c4cd26cd45d4af884d9ee9b6fab893b0ad2450d4746d74a6968c/pillow-12.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:806f3987ffe10e867bab0ddad45df1148a2b98221798457fa097ad85d6e8bc75", size = 7031469, upload-time = "2026-01-02T09:11:06.538Z" }, + { url = "https://files.pythonhosted.org/packages/48/36/d5716586d887fb2a810a4a61518a327a1e21c8b7134c89283af272efe84b/pillow-12.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:9f5fefaca968e700ad1a4a9de98bf0869a94e397fe3524c4c9450c1445252304", size = 2452515, upload-time = "2026-01-02T09:11:08.226Z" }, + { url = "https://files.pythonhosted.org/packages/20/31/dc53fe21a2f2996e1b7d92bf671cdb157079385183ef7c1ae08b485db510/pillow-12.1.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a332ac4ccb84b6dde65dbace8431f3af08874bf9770719d32a635c4ef411b18b", size = 5262642, upload-time = "2026-01-02T09:11:10.138Z" }, + { url = "https://files.pythonhosted.org/packages/ab/c1/10e45ac9cc79419cedf5121b42dcca5a50ad2b601fa080f58c22fb27626e/pillow-12.1.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:907bfa8a9cb790748a9aa4513e37c88c59660da3bcfffbd24a7d9e6abf224551", size = 4657464, upload-time = "2026-01-02T09:11:12.319Z" }, + { url = "https://files.pythonhosted.org/packages/ad/26/7b82c0ab7ef40ebede7a97c72d473bda5950f609f8e0c77b04af574a0ddb/pillow-12.1.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:efdc140e7b63b8f739d09a99033aa430accce485ff78e6d311973a67b6bf3208", size = 6234878, upload-time = "2026-01-02T09:11:14.096Z" }, + { url = "https://files.pythonhosted.org/packages/76/25/27abc9792615b5e886ca9411ba6637b675f1b77af3104710ac7353fe5605/pillow-12.1.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:bef9768cab184e7ae6e559c032e95ba8d07b3023c289f79a2bd36e8bf85605a5", size = 8044868, upload-time = "2026-01-02T09:11:15.903Z" }, + { url = 
"https://files.pythonhosted.org/packages/0a/ea/f200a4c36d836100e7bc738fc48cd963d3ba6372ebc8298a889e0cfc3359/pillow-12.1.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:742aea052cf5ab5034a53c3846165bc3ce88d7c38e954120db0ab867ca242661", size = 6349468, upload-time = "2026-01-02T09:11:17.631Z" }, + { url = "https://files.pythonhosted.org/packages/11/8f/48d0b77ab2200374c66d344459b8958c86693be99526450e7aee714e03e4/pillow-12.1.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a6dfc2af5b082b635af6e08e0d1f9f1c4e04d17d4e2ca0ef96131e85eda6eb17", size = 7041518, upload-time = "2026-01-02T09:11:19.389Z" }, + { url = "https://files.pythonhosted.org/packages/1d/23/c281182eb986b5d31f0a76d2a2c8cd41722d6fb8ed07521e802f9bba52de/pillow-12.1.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:609e89d9f90b581c8d16358c9087df76024cf058fa693dd3e1e1620823f39670", size = 6462829, upload-time = "2026-01-02T09:11:21.28Z" }, + { url = "https://files.pythonhosted.org/packages/25/ef/7018273e0faac099d7b00982abdcc39142ae6f3bd9ceb06de09779c4a9d6/pillow-12.1.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:43b4899cfd091a9693a1278c4982f3e50f7fb7cff5153b05174b4afc9593b616", size = 7166756, upload-time = "2026-01-02T09:11:23.559Z" }, + { url = "https://files.pythonhosted.org/packages/8f/c8/993d4b7ab2e341fe02ceef9576afcf5830cdec640be2ac5bee1820d693d4/pillow-12.1.0-cp312-cp312-win32.whl", hash = "sha256:aa0c9cc0b82b14766a99fbe6084409972266e82f459821cd26997a488a7261a7", size = 6328770, upload-time = "2026-01-02T09:11:25.661Z" }, + { url = "https://files.pythonhosted.org/packages/a7/87/90b358775a3f02765d87655237229ba64a997b87efa8ccaca7dd3e36e7a7/pillow-12.1.0-cp312-cp312-win_amd64.whl", hash = "sha256:d70534cea9e7966169ad29a903b99fc507e932069a881d0965a1a84bb57f6c6d", size = 7033406, upload-time = "2026-01-02T09:11:27.474Z" }, + { url = 
"https://files.pythonhosted.org/packages/5d/cf/881b457eccacac9e5b2ddd97d5071fb6d668307c57cbf4e3b5278e06e536/pillow-12.1.0-cp312-cp312-win_arm64.whl", hash = "sha256:65b80c1ee7e14a87d6a068dd3b0aea268ffcabfe0498d38661b00c5b4b22e74c", size = 2452612, upload-time = "2026-01-02T09:11:29.309Z" }, + { url = "https://files.pythonhosted.org/packages/dd/c7/2530a4aa28248623e9d7f27316b42e27c32ec410f695929696f2e0e4a778/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:7b5dd7cbae20285cdb597b10eb5a2c13aa9de6cde9bb64a3c1317427b1db1ae1", size = 4062543, upload-time = "2026-01-02T09:11:31.566Z" }, + { url = "https://files.pythonhosted.org/packages/8f/1f/40b8eae823dc1519b87d53c30ed9ef085506b05281d313031755c1705f73/pillow-12.1.0-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:29a4cef9cb672363926f0470afc516dbf7305a14d8c54f7abbb5c199cd8f8179", size = 4138373, upload-time = "2026-01-02T09:11:33.367Z" }, + { url = "https://files.pythonhosted.org/packages/d4/77/6fa60634cf06e52139fd0e89e5bbf055e8166c691c42fb162818b7fda31d/pillow-12.1.0-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:681088909d7e8fa9e31b9799aaa59ba5234c58e5e4f1951b4c4d1082a2e980e0", size = 3601241, upload-time = "2026-01-02T09:11:35.011Z" }, + { url = "https://files.pythonhosted.org/packages/4f/bf/28ab865de622e14b747f0cd7877510848252d950e43002e224fb1c9ababf/pillow-12.1.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:983976c2ab753166dc66d36af6e8ec15bb511e4a25856e2227e5f7e00a160587", size = 5262410, upload-time = "2026-01-02T09:11:36.682Z" }, + { url = "https://files.pythonhosted.org/packages/1c/34/583420a1b55e715937a85bd48c5c0991598247a1fd2eb5423188e765ea02/pillow-12.1.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:db44d5c160a90df2d24a24760bbd37607d53da0b34fb546c4c232af7192298ac", size = 4657312, upload-time = "2026-01-02T09:11:38.535Z" }, + { url = 
"https://files.pythonhosted.org/packages/1d/fd/f5a0896839762885b3376ff04878f86ab2b097c2f9a9cdccf4eda8ba8dc0/pillow-12.1.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6b7a9d1db5dad90e2991645874f708e87d9a3c370c243c2d7684d28f7e133e6b", size = 6232605, upload-time = "2026-01-02T09:11:40.602Z" }, + { url = "https://files.pythonhosted.org/packages/98/aa/938a09d127ac1e70e6ed467bd03834350b33ef646b31edb7452d5de43792/pillow-12.1.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:6258f3260986990ba2fa8a874f8b6e808cf5abb51a94015ca3dc3c68aa4f30ea", size = 8041617, upload-time = "2026-01-02T09:11:42.721Z" }, + { url = "https://files.pythonhosted.org/packages/17/e8/538b24cb426ac0186e03f80f78bc8dc7246c667f58b540bdd57c71c9f79d/pillow-12.1.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e115c15e3bc727b1ca3e641a909f77f8ca72a64fff150f666fcc85e57701c26c", size = 6346509, upload-time = "2026-01-02T09:11:44.955Z" }, + { url = "https://files.pythonhosted.org/packages/01/9a/632e58ec89a32738cabfd9ec418f0e9898a2b4719afc581f07c04a05e3c9/pillow-12.1.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6741e6f3074a35e47c77b23a4e4f2d90db3ed905cb1c5e6e0d49bff2045632bc", size = 7038117, upload-time = "2026-01-02T09:11:46.736Z" }, + { url = "https://files.pythonhosted.org/packages/c7/a2/d40308cf86eada842ca1f3ffa45d0ca0df7e4ab33c83f81e73f5eaed136d/pillow-12.1.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:935b9d1aed48fcfb3f838caac506f38e29621b44ccc4f8a64d575cb1b2a88644", size = 6460151, upload-time = "2026-01-02T09:11:48.625Z" }, + { url = "https://files.pythonhosted.org/packages/f1/88/f5b058ad6453a085c5266660a1417bdad590199da1b32fb4efcff9d33b05/pillow-12.1.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:5fee4c04aad8932da9f8f710af2c1a15a83582cfb884152a9caa79d4efcdbf9c", size = 7164534, upload-time = "2026-01-02T09:11:50.445Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/ce/c17334caea1db789163b5d855a5735e47995b0b5dc8745e9a3605d5f24c0/pillow-12.1.0-cp313-cp313-win32.whl", hash = "sha256:a786bf667724d84aa29b5db1c61b7bfdde380202aaca12c3461afd6b71743171", size = 6332551, upload-time = "2026-01-02T09:11:52.234Z" }, + { url = "https://files.pythonhosted.org/packages/e5/07/74a9d941fa45c90a0d9465098fe1ec85de3e2afbdc15cc4766622d516056/pillow-12.1.0-cp313-cp313-win_amd64.whl", hash = "sha256:461f9dfdafa394c59cd6d818bdfdbab4028b83b02caadaff0ffd433faf4c9a7a", size = 7040087, upload-time = "2026-01-02T09:11:54.822Z" }, + { url = "https://files.pythonhosted.org/packages/88/09/c99950c075a0e9053d8e880595926302575bc742b1b47fe1bbcc8d388d50/pillow-12.1.0-cp313-cp313-win_arm64.whl", hash = "sha256:9212d6b86917a2300669511ed094a9406888362e085f2431a7da985a6b124f45", size = 2452470, upload-time = "2026-01-02T09:11:56.522Z" }, + { url = "https://files.pythonhosted.org/packages/b5/ba/970b7d85ba01f348dee4d65412476321d40ee04dcb51cd3735b9dc94eb58/pillow-12.1.0-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:00162e9ca6d22b7c3ee8e61faa3c3253cd19b6a37f126cad04f2f88b306f557d", size = 5264816, upload-time = "2026-01-02T09:11:58.227Z" }, + { url = "https://files.pythonhosted.org/packages/10/60/650f2fb55fdba7a510d836202aa52f0baac633e50ab1cf18415d332188fb/pillow-12.1.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:7d6daa89a00b58c37cb1747ec9fb7ac3bc5ffd5949f5888657dfddde6d1312e0", size = 4660472, upload-time = "2026-01-02T09:12:00.798Z" }, + { url = "https://files.pythonhosted.org/packages/2b/c0/5273a99478956a099d533c4f46cbaa19fd69d606624f4334b85e50987a08/pillow-12.1.0-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e2479c7f02f9d505682dc47df8c0ea1fc5e264c4d1629a5d63fe3e2334b89554", size = 6268974, upload-time = "2026-01-02T09:12:02.572Z" }, + { url = 
"https://files.pythonhosted.org/packages/b4/26/0bf714bc2e73d5267887d47931d53c4ceeceea6978148ed2ab2a4e6463c4/pillow-12.1.0-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:f188d580bd870cda1e15183790d1cc2fa78f666e76077d103edf048eed9c356e", size = 8073070, upload-time = "2026-01-02T09:12:04.75Z" }, + { url = "https://files.pythonhosted.org/packages/43/cf/1ea826200de111a9d65724c54f927f3111dc5ae297f294b370a670c17786/pillow-12.1.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0fde7ec5538ab5095cc02df38ee99b0443ff0e1c847a045554cf5f9af1f4aa82", size = 6380176, upload-time = "2026-01-02T09:12:06.626Z" }, + { url = "https://files.pythonhosted.org/packages/03/e0/7938dd2b2013373fd85d96e0f38d62b7a5a262af21ac274250c7ca7847c9/pillow-12.1.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:0ed07dca4a8464bada6139ab38f5382f83e5f111698caf3191cb8dbf27d908b4", size = 7067061, upload-time = "2026-01-02T09:12:08.624Z" }, + { url = "https://files.pythonhosted.org/packages/86/ad/a2aa97d37272a929a98437a8c0ac37b3cf012f4f8721e1bd5154699b2518/pillow-12.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:f45bd71d1fa5e5749587613037b172e0b3b23159d1c00ef2fc920da6f470e6f0", size = 6491824, upload-time = "2026-01-02T09:12:10.488Z" }, + { url = "https://files.pythonhosted.org/packages/a4/44/80e46611b288d51b115826f136fb3465653c28f491068a72d3da49b54cd4/pillow-12.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:277518bf4fe74aa91489e1b20577473b19ee70fb97c374aa50830b279f25841b", size = 7190911, upload-time = "2026-01-02T09:12:12.772Z" }, + { url = "https://files.pythonhosted.org/packages/86/77/eacc62356b4cf81abe99ff9dbc7402750044aed02cfd6a503f7c6fc11f3e/pillow-12.1.0-cp313-cp313t-win32.whl", hash = "sha256:7315f9137087c4e0ee73a761b163fc9aa3b19f5f606a7fc08d83fd3e4379af65", size = 6336445, upload-time = "2026-01-02T09:12:14.775Z" }, + { url = 
"https://files.pythonhosted.org/packages/e7/3c/57d81d0b74d218706dafccb87a87ea44262c43eef98eb3b164fd000e0491/pillow-12.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:0ddedfaa8b5f0b4ffbc2fa87b556dc59f6bb4ecb14a53b33f9189713ae8053c0", size = 7045354, upload-time = "2026-01-02T09:12:16.599Z" }, + { url = "https://files.pythonhosted.org/packages/ac/82/8b9b97bba2e3576a340f93b044a3a3a09841170ab4c1eb0d5c93469fd32f/pillow-12.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:80941e6d573197a0c28f394753de529bb436b1ca990ed6e765cf42426abc39f8", size = 2454547, upload-time = "2026-01-02T09:12:18.704Z" }, + { url = "https://files.pythonhosted.org/packages/8c/87/bdf971d8bbcf80a348cc3bacfcb239f5882100fe80534b0ce67a784181d8/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:5cb7bc1966d031aec37ddb9dcf15c2da5b2e9f7cc3ca7c54473a20a927e1eb91", size = 4062533, upload-time = "2026-01-02T09:12:20.791Z" }, + { url = "https://files.pythonhosted.org/packages/ff/4f/5eb37a681c68d605eb7034c004875c81f86ec9ef51f5be4a63eadd58859a/pillow-12.1.0-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:97e9993d5ed946aba26baf9c1e8cf18adbab584b99f452ee72f7ee8acb882796", size = 4138546, upload-time = "2026-01-02T09:12:23.664Z" }, + { url = "https://files.pythonhosted.org/packages/11/6d/19a95acb2edbace40dcd582d077b991646b7083c41b98da4ed7555b59733/pillow-12.1.0-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:414b9a78e14ffeb98128863314e62c3f24b8a86081066625700b7985b3f529bd", size = 3601163, upload-time = "2026-01-02T09:12:26.338Z" }, + { url = "https://files.pythonhosted.org/packages/fc/36/2b8138e51cb42e4cc39c3297713455548be855a50558c3ac2beebdc251dd/pillow-12.1.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:e6bdb408f7c9dd2a5ff2b14a3b0bb6d4deb29fb9961e6eb3ae2031ae9a5cec13", size = 5266086, upload-time = "2026-01-02T09:12:28.782Z" }, + { url = 
"https://files.pythonhosted.org/packages/53/4b/649056e4d22e1caa90816bf99cef0884aed607ed38075bd75f091a607a38/pillow-12.1.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:3413c2ae377550f5487991d444428f1a8ae92784aac79caa8b1e3b89b175f77e", size = 4657344, upload-time = "2026-01-02T09:12:31.117Z" }, + { url = "https://files.pythonhosted.org/packages/6c/6b/c5742cea0f1ade0cd61485dc3d81f05261fc2276f537fbdc00802de56779/pillow-12.1.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e5dcbe95016e88437ecf33544ba5db21ef1b8dd6e1b434a2cb2a3d605299e643", size = 6232114, upload-time = "2026-01-02T09:12:32.936Z" }, + { url = "https://files.pythonhosted.org/packages/bf/8f/9f521268ce22d63991601aafd3d48d5ff7280a246a1ef62d626d67b44064/pillow-12.1.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d0a7735df32ccbcc98b98a1ac785cc4b19b580be1bdf0aeb5c03223220ea09d5", size = 8042708, upload-time = "2026-01-02T09:12:34.78Z" }, + { url = "https://files.pythonhosted.org/packages/1a/eb/257f38542893f021502a1bbe0c2e883c90b5cff26cc33b1584a841a06d30/pillow-12.1.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0c27407a2d1b96774cbc4a7594129cc027339fd800cd081e44497722ea1179de", size = 6347762, upload-time = "2026-01-02T09:12:36.748Z" }, + { url = "https://files.pythonhosted.org/packages/c4/5a/8ba375025701c09b309e8d5163c5a4ce0102fa86bbf8800eb0d7ac87bc51/pillow-12.1.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:15c794d74303828eaa957ff8070846d0efe8c630901a1c753fdc63850e19ecd9", size = 7039265, upload-time = "2026-01-02T09:12:39.082Z" }, + { url = "https://files.pythonhosted.org/packages/cf/dc/cf5e4cdb3db533f539e88a7bbf9f190c64ab8a08a9bc7a4ccf55067872e4/pillow-12.1.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:c990547452ee2800d8506c4150280757f88532f3de2a58e3022e9b179107862a", size = 6462341, upload-time = "2026-01-02T09:12:40.946Z" }, + { url = 
"https://files.pythonhosted.org/packages/d0/47/0291a25ac9550677e22eda48510cfc4fa4b2ef0396448b7fbdc0a6946309/pillow-12.1.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b63e13dd27da389ed9475b3d28510f0f954bca0041e8e551b2a4eb1eab56a39a", size = 7165395, upload-time = "2026-01-02T09:12:42.706Z" }, + { url = "https://files.pythonhosted.org/packages/4f/4c/e005a59393ec4d9416be06e6b45820403bb946a778e39ecec62f5b2b991e/pillow-12.1.0-cp314-cp314-win32.whl", hash = "sha256:1a949604f73eb07a8adab38c4fe50791f9919344398bdc8ac6b307f755fc7030", size = 6431413, upload-time = "2026-01-02T09:12:44.944Z" }, + { url = "https://files.pythonhosted.org/packages/1c/af/f23697f587ac5f9095d67e31b81c95c0249cd461a9798a061ed6709b09b5/pillow-12.1.0-cp314-cp314-win_amd64.whl", hash = "sha256:4f9f6a650743f0ddee5593ac9e954ba1bdbc5e150bc066586d4f26127853ab94", size = 7176779, upload-time = "2026-01-02T09:12:46.727Z" }, + { url = "https://files.pythonhosted.org/packages/b3/36/6a51abf8599232f3e9afbd16d52829376a68909fe14efe29084445db4b73/pillow-12.1.0-cp314-cp314-win_arm64.whl", hash = "sha256:808b99604f7873c800c4840f55ff389936ef1948e4e87645eaf3fccbc8477ac4", size = 2543105, upload-time = "2026-01-02T09:12:49.243Z" }, + { url = "https://files.pythonhosted.org/packages/82/54/2e1dd20c8749ff225080d6ba465a0cab4387f5db0d1c5fb1439e2d99923f/pillow-12.1.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:bc11908616c8a283cf7d664f77411a5ed2a02009b0097ff8abbba5e79128ccf2", size = 5268571, upload-time = "2026-01-02T09:12:51.11Z" }, + { url = "https://files.pythonhosted.org/packages/57/61/571163a5ef86ec0cf30d265ac2a70ae6fc9e28413d1dc94fa37fae6bda89/pillow-12.1.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:896866d2d436563fa2a43a9d72f417874f16b5545955c54a64941e87c1376c61", size = 4660426, upload-time = "2026-01-02T09:12:52.865Z" }, + { url = 
"https://files.pythonhosted.org/packages/5e/e1/53ee5163f794aef1bf84243f755ee6897a92c708505350dd1923f4afec48/pillow-12.1.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8e178e3e99d3c0ea8fc64b88447f7cac8ccf058af422a6cedc690d0eadd98c51", size = 6269908, upload-time = "2026-01-02T09:12:54.884Z" }, + { url = "https://files.pythonhosted.org/packages/bc/0b/b4b4106ff0ee1afa1dc599fde6ab230417f800279745124f6c50bcffed8e/pillow-12.1.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:079af2fb0c599c2ec144ba2c02766d1b55498e373b3ac64687e43849fbbef5bc", size = 8074733, upload-time = "2026-01-02T09:12:56.802Z" }, + { url = "https://files.pythonhosted.org/packages/19/9f/80b411cbac4a732439e629a26ad3ef11907a8c7fc5377b7602f04f6fe4e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bdec5e43377761c5dbca620efb69a77f6855c5a379e32ac5b158f54c84212b14", size = 6381431, upload-time = "2026-01-02T09:12:58.823Z" }, + { url = "https://files.pythonhosted.org/packages/8f/b7/d65c45db463b66ecb6abc17c6ba6917a911202a07662247e1355ce1789e7/pillow-12.1.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:565c986f4b45c020f5421a4cea13ef294dde9509a8577f29b2fc5edc7587fff8", size = 7068529, upload-time = "2026-01-02T09:13:00.885Z" }, + { url = "https://files.pythonhosted.org/packages/50/96/dfd4cd726b4a45ae6e3c669fc9e49deb2241312605d33aba50499e9d9bd1/pillow-12.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:43aca0a55ce1eefc0aefa6253661cb54571857b1a7b2964bd8a1e3ef4b729924", size = 6492981, upload-time = "2026-01-02T09:13:03.314Z" }, + { url = "https://files.pythonhosted.org/packages/4d/1c/b5dc52cf713ae46033359c5ca920444f18a6359ce1020dd3e9c553ea5bc6/pillow-12.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0deedf2ea233722476b3a81e8cdfbad786f7adbed5d848469fa59fe52396e4ef", size = 7191878, upload-time = "2026-01-02T09:13:05.276Z" }, + { url = 
"https://files.pythonhosted.org/packages/53/26/c4188248bd5edaf543864fe4834aebe9c9cb4968b6f573ce014cc42d0720/pillow-12.1.0-cp314-cp314t-win32.whl", hash = "sha256:b17fbdbe01c196e7e159aacb889e091f28e61020a8abeac07b68079b6e626988", size = 6438703, upload-time = "2026-01-02T09:13:07.491Z" }, + { url = "https://files.pythonhosted.org/packages/b8/0e/69ed296de8ea05cb03ee139cee600f424ca166e632567b2d66727f08c7ed/pillow-12.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:27b9baecb428899db6c0de572d6d305cfaf38ca1596b5c0542a5182e3e74e8c6", size = 7182927, upload-time = "2026-01-02T09:13:09.841Z" }, + { url = "https://files.pythonhosted.org/packages/fc/f5/68334c015eed9b5cff77814258717dec591ded209ab5b6fb70e2ae873d1d/pillow-12.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:f61333d817698bdcdd0f9d7793e365ac3d2a21c1f1eb02b32ad6aefb8d8ea831", size = 2545104, upload-time = "2026-01-02T09:13:12.068Z" }, + { url = "https://files.pythonhosted.org/packages/8b/bc/224b1d98cffd7164b14707c91aac83c07b047fbd8f58eba4066a3e53746a/pillow-12.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:ca94b6aac0d7af2a10ba08c0f888b3d5114439b6b3ef39968378723622fed377", size = 5228605, upload-time = "2026-01-02T09:13:14.084Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ca/49ca7769c4550107de049ed85208240ba0f330b3f2e316f24534795702ce/pillow-12.1.0-pp311-pypy311_pp73-macosx_11_0_arm64.whl", hash = "sha256:351889afef0f485b84078ea40fe33727a0492b9af3904661b0abbafee0355b72", size = 4622245, upload-time = "2026-01-02T09:13:15.964Z" }, + { url = "https://files.pythonhosted.org/packages/73/48/fac807ce82e5955bcc2718642b94b1bd22a82a6d452aea31cbb678cddf12/pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bb0984b30e973f7e2884362b7d23d0a348c7143ee559f38ef3eaab640144204c", size = 5247593, upload-time = "2026-01-02T09:13:17.913Z" }, + { url = 
"https://files.pythonhosted.org/packages/d2/95/3e0742fe358c4664aed4fd05d5f5373dcdad0b27af52aa0972568541e3f4/pillow-12.1.0-pp311-pypy311_pp73-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:84cabc7095dd535ca934d57e9ce2a72ffd216e435a84acb06b2277b1de2689bd", size = 6989008, upload-time = "2026-01-02T09:13:20.083Z" }, + { url = "https://files.pythonhosted.org/packages/5a/74/fe2ac378e4e202e56d50540d92e1ef4ff34ed687f3c60f6a121bcf99437e/pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53d8b764726d3af1a138dd353116f774e3862ec7e3794e0c8781e30db0f35dfc", size = 5313824, upload-time = "2026-01-02T09:13:22.405Z" }, + { url = "https://files.pythonhosted.org/packages/f3/77/2a60dee1adee4e2655ac328dd05c02a955c1cd683b9f1b82ec3feb44727c/pillow-12.1.0-pp311-pypy311_pp73-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5da841d81b1a05ef940a8567da92decaa15bc4d7dedb540a8c219ad83d91808a", size = 5963278, upload-time = "2026-01-02T09:13:24.706Z" }, + { url = "https://files.pythonhosted.org/packages/2d/71/64e9b1c7f04ae0027f788a248e6297d7fcc29571371fe7d45495a78172c0/pillow-12.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:75af0b4c229ac519b155028fa1be632d812a519abba9b46b20e50c6caa184f19", size = 7029809, upload-time = "2026-01-02T09:13:26.541Z" }, +] + [[package]] name = "platformdirs" version = "4.4.0" @@ -1105,6 +1974,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/54/20/4d324d65cc6d9205fabedc306948156824eb9f0ee1633355a8f7ec5c66bf/pluggy-1.6.0-py3-none-any.whl", hash = "sha256:e920276dd6813095e9377c0bc5566d94c932c33b27a3e3945d8389c374dd4746", size = 20538, upload-time = "2025-05-15T12:30:06.134Z" }, ] +[[package]] +name = "prompt-toolkit" +version = "3.0.52" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "wcwidth" }, +] +sdist = { url = 
"https://files.pythonhosted.org/packages/a1/96/06e01a7b38dce6fe1db213e061a4602dd6032a8a97ef6c1a862537732421/prompt_toolkit-3.0.52.tar.gz", hash = "sha256:28cde192929c8e7321de85de1ddbe736f1375148b02f2e17edd840042b1be855", size = 434198, upload-time = "2025-08-27T15:24:02.057Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, +] + [[package]] name = "propcache" version = "0.3.2" @@ -1224,6 +2105,24 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/26/65/1070a6e3c036f39142c2820c4b52e9243246fcfc3f96239ac84472ba361e/psutil-7.1.0-cp37-abi3-win_arm64.whl", hash = "sha256:6937cb68133e7c97b6cc9649a570c9a18ba0efebed46d8c5dae4c07fa1b67a07", size = 244971, upload-time = "2025-09-17T20:15:12.262Z" }, ] +[[package]] +name = "ptyprocess" +version = "0.7.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/20/e5/16ff212c1e452235a90aeb09066144d0c5a6a8c0834397e03f5224495c4e/ptyprocess-0.7.0.tar.gz", hash = "sha256:5c5d0a3b48ceee0b48485e0c26037c0acd7d29765ca3fbb5cb3831d347423220", size = 70762, upload-time = "2020-12-28T15:15:30.155Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/22/a6/858897256d0deac81a172289110f31629fc4cee19b6f01283303e18c8db3/ptyprocess-0.7.0-py2.py3-none-any.whl", hash = "sha256:4b41f3967fce3af57cc7e94b888626c18bf37a083e3651ca8feeb66d492fef35", size = 13993, upload-time = "2020-12-28T15:15:28.35Z" }, +] + +[[package]] +name = "pure-eval" +version = "0.2.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/cd/05/0a34433a064256a578f1783a10da6df098ceaa4a57bbeaa96a6c0352786b/pure_eval-0.2.3.tar.gz", hash = 
"sha256:5f4e983f40564c576c7c8635ae88db5956bb2229d7e9237d03b3c0b0190eaf42", size = 19752, upload-time = "2024-07-21T12:58:21.801Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8e/37/efad0257dc6e593a18957422533ff0f87ede7c9c6ea010a2177d738fb82f/pure_eval-0.2.3-py3-none-any.whl", hash = "sha256:1db8e35b67b3d218d818ae653e27f06c3aa420901fa7b081ca98cbedc874e0d0", size = 11842, upload-time = "2024-07-21T12:58:20.04Z" }, +] + [[package]] name = "pyarrow" version = "21.0.0" @@ -1267,6 +2166,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e5/4e/519c1bc1876625fe6b71e9a28287c43ec2f20f73c658b9ae1d485c0c206e/pyarrow-21.0.0-cp313-cp313t-win_amd64.whl", hash = "sha256:222c39e2c70113543982c6b34f3077962b44fca38c0bd9e68bb6781534425c10", size = 26371006, upload-time = "2025-07-18T00:56:56.379Z" }, ] +[[package]] +name = "pycparser" +version = "2.23" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fe/cf/d2d3b9f5699fb1e4615c8e32ff220203e43b248e1dfcc6736ad9057731ca/pycparser-2.23.tar.gz", hash = "sha256:78816d4f24add8f10a06d6f05b4d424ad9e96cfebf68a4ddc99c65c0720d00c2", size = 173734, upload-time = "2025-09-09T13:23:47.91Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a0/e3/59cd50310fc9b59512193629e1984c1f95e5c8ae6e5d8c69532ccc65a7fe/pycparser-2.23-py3-none-any.whl", hash = "sha256:e5c6e8d3fbad53479cab09ac03729e0a9faf2bee3db8208a550daf5af81a5934", size = 118140, upload-time = "2025-09-09T13:23:46.651Z" }, +] + [[package]] name = "pydantic" version = "2.11.7" @@ -1378,6 +2286,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] +[[package]] +name = "pyparsing" +version = "3.3.1" +source = { registry = 
"https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/33/c1/1d9de9aeaa1b89b0186e5fe23294ff6517fce1bc69149185577cd31016b2/pyparsing-3.3.1.tar.gz", hash = "sha256:47fad0f17ac1e2cad3de3b458570fbc9b03560aa029ed5e16ee5554da9a2251c", size = 1550512, upload-time = "2025-12-23T03:14:04.391Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8b/40/2614036cdd416452f5bf98ec037f38a1afb17f327cb8e6b652d4729e0af8/pyparsing-3.3.1-py3-none-any.whl", hash = "sha256:023b5e7e5520ad96642e2c6db4cb683d3970bd640cdf7115049a6e9c3682df82", size = 121793, upload-time = "2025-12-23T03:14:02.103Z" }, +] + [[package]] name = "pytest" version = "8.4.2" @@ -1470,6 +2387,79 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/fa/de/02b54f42487e3d3c6efb3f89428677074ca7bf43aae402517bc7cca949f3/PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563", size = 156446, upload-time = "2024-08-06T20:33:04.33Z" }, ] +[[package]] +name = "pyzmq" +version = "27.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "implementation_name == 'pypy' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/04/0b/3c9baedbdf613ecaa7aa07027780b8867f57b6293b6ee50de316c9f3222b/pyzmq-27.1.0.tar.gz", hash = "sha256:ac0765e3d44455adb6ddbf4417dcce460fc40a05978c08efdf2948072f6db540", size = 281750, upload-time = "2025-09-08T23:10:18.157Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/67/b9/52aa9ec2867528b54f1e60846728d8b4d84726630874fee3a91e66c7df81/pyzmq-27.1.0-cp310-cp310-macosx_10_15_universal2.whl", hash = "sha256:508e23ec9bc44c0005c4946ea013d9317ae00ac67778bd47519fdf5a0e930ff4", size = 1329850, upload-time = "2025-09-08T23:07:26.274Z" }, + { url = 
"https://files.pythonhosted.org/packages/99/64/5653e7b7425b169f994835a2b2abf9486264401fdef18df91ddae47ce2cc/pyzmq-27.1.0-cp310-cp310-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:507b6f430bdcf0ee48c0d30e734ea89ce5567fd7b8a0f0044a369c176aa44556", size = 906380, upload-time = "2025-09-08T23:07:29.78Z" }, + { url = "https://files.pythonhosted.org/packages/73/78/7d713284dbe022f6440e391bd1f3c48d9185673878034cfb3939cdf333b2/pyzmq-27.1.0-cp310-cp310-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:bf7b38f9fd7b81cb6d9391b2946382c8237fd814075c6aa9c3b746d53076023b", size = 666421, upload-time = "2025-09-08T23:07:31.263Z" }, + { url = "https://files.pythonhosted.org/packages/30/76/8f099f9d6482450428b17c4d6b241281af7ce6a9de8149ca8c1c649f6792/pyzmq-27.1.0-cp310-cp310-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:03ff0b279b40d687691a6217c12242ee71f0fba28bf8626ff50e3ef0f4410e1e", size = 854149, upload-time = "2025-09-08T23:07:33.17Z" }, + { url = "https://files.pythonhosted.org/packages/59/f0/37fbfff06c68016019043897e4c969ceab18bde46cd2aca89821fcf4fb2e/pyzmq-27.1.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:677e744fee605753eac48198b15a2124016c009a11056f93807000ab11ce6526", size = 1655070, upload-time = "2025-09-08T23:07:35.205Z" }, + { url = "https://files.pythonhosted.org/packages/47/14/7254be73f7a8edc3587609554fcaa7bfd30649bf89cd260e4487ca70fdaa/pyzmq-27.1.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:dd2fec2b13137416a1c5648b7009499bcc8fea78154cd888855fa32514f3dad1", size = 2033441, upload-time = "2025-09-08T23:07:37.432Z" }, + { url = "https://files.pythonhosted.org/packages/22/dc/49f2be26c6f86f347e796a4d99b19167fc94503f0af3fd010ad262158822/pyzmq-27.1.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:08e90bb4b57603b84eab1d0ca05b3bbb10f60c1839dc471fc1c9e1507bef3386", size = 1891529, upload-time = "2025-09-08T23:07:39.047Z" }, + { url = 
"https://files.pythonhosted.org/packages/a3/3e/154fb963ae25be70c0064ce97776c937ecc7d8b0259f22858154a9999769/pyzmq-27.1.0-cp310-cp310-win32.whl", hash = "sha256:a5b42d7a0658b515319148875fcb782bbf118dd41c671b62dae33666c2213bda", size = 567276, upload-time = "2025-09-08T23:07:40.695Z" }, + { url = "https://files.pythonhosted.org/packages/62/b2/f4ab56c8c595abcb26b2be5fd9fa9e6899c1e5ad54964e93ae8bb35482be/pyzmq-27.1.0-cp310-cp310-win_amd64.whl", hash = "sha256:c0bb87227430ee3aefcc0ade2088100e528d5d3298a0a715a64f3d04c60ba02f", size = 632208, upload-time = "2025-09-08T23:07:42.298Z" }, + { url = "https://files.pythonhosted.org/packages/3b/e3/be2cc7ab8332bdac0522fdb64c17b1b6241a795bee02e0196636ec5beb79/pyzmq-27.1.0-cp310-cp310-win_arm64.whl", hash = "sha256:9a916f76c2ab8d045b19f2286851a38e9ac94ea91faf65bd64735924522a8b32", size = 559766, upload-time = "2025-09-08T23:07:43.869Z" }, + { url = "https://files.pythonhosted.org/packages/06/5d/305323ba86b284e6fcb0d842d6adaa2999035f70f8c38a9b6d21ad28c3d4/pyzmq-27.1.0-cp311-cp311-macosx_10_15_universal2.whl", hash = "sha256:226b091818d461a3bef763805e75685e478ac17e9008f49fce2d3e52b3d58b86", size = 1333328, upload-time = "2025-09-08T23:07:45.946Z" }, + { url = "https://files.pythonhosted.org/packages/bd/a0/fc7e78a23748ad5443ac3275943457e8452da67fda347e05260261108cbc/pyzmq-27.1.0-cp311-cp311-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:0790a0161c281ca9723f804871b4027f2e8b5a528d357c8952d08cd1a9c15581", size = 908803, upload-time = "2025-09-08T23:07:47.551Z" }, + { url = "https://files.pythonhosted.org/packages/7e/22/37d15eb05f3bdfa4abea6f6d96eb3bb58585fbd3e4e0ded4e743bc650c97/pyzmq-27.1.0-cp311-cp311-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c895a6f35476b0c3a54e3eb6ccf41bf3018de937016e6e18748317f25d4e925f", size = 668836, upload-time = "2025-09-08T23:07:49.436Z" }, + { url = 
"https://files.pythonhosted.org/packages/b1/c4/2a6fe5111a01005fc7af3878259ce17684fabb8852815eda6225620f3c59/pyzmq-27.1.0-cp311-cp311-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5bbf8d3630bf96550b3be8e1fc0fea5cbdc8d5466c1192887bd94869da17a63e", size = 857038, upload-time = "2025-09-08T23:07:51.234Z" }, + { url = "https://files.pythonhosted.org/packages/cb/eb/bfdcb41d0db9cd233d6fb22dc131583774135505ada800ebf14dfb0a7c40/pyzmq-27.1.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:15c8bd0fe0dabf808e2d7a681398c4e5ded70a551ab47482067a572c054c8e2e", size = 1657531, upload-time = "2025-09-08T23:07:52.795Z" }, + { url = "https://files.pythonhosted.org/packages/ab/21/e3180ca269ed4a0de5c34417dfe71a8ae80421198be83ee619a8a485b0c7/pyzmq-27.1.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bafcb3dd171b4ae9f19ee6380dfc71ce0390fefaf26b504c0e5f628d7c8c54f2", size = 2034786, upload-time = "2025-09-08T23:07:55.047Z" }, + { url = "https://files.pythonhosted.org/packages/3b/b1/5e21d0b517434b7f33588ff76c177c5a167858cc38ef740608898cd329f2/pyzmq-27.1.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:e829529fcaa09937189178115c49c504e69289abd39967cd8a4c215761373394", size = 1894220, upload-time = "2025-09-08T23:07:57.172Z" }, + { url = "https://files.pythonhosted.org/packages/03/f2/44913a6ff6941905efc24a1acf3d3cb6146b636c546c7406c38c49c403d4/pyzmq-27.1.0-cp311-cp311-win32.whl", hash = "sha256:6df079c47d5902af6db298ec92151db82ecb557af663098b92f2508c398bb54f", size = 567155, upload-time = "2025-09-08T23:07:59.05Z" }, + { url = "https://files.pythonhosted.org/packages/23/6d/d8d92a0eb270a925c9b4dd039c0b4dc10abc2fcbc48331788824ef113935/pyzmq-27.1.0-cp311-cp311-win_amd64.whl", hash = "sha256:190cbf120fbc0fc4957b56866830def56628934a9d112aec0e2507aa6a032b97", size = 633428, upload-time = "2025-09-08T23:08:00.663Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/14/01afebc96c5abbbd713ecfc7469cfb1bc801c819a74ed5c9fad9a48801cb/pyzmq-27.1.0-cp311-cp311-win_arm64.whl", hash = "sha256:eca6b47df11a132d1745eb3b5b5e557a7dae2c303277aa0e69c6ba91b8736e07", size = 559497, upload-time = "2025-09-08T23:08:02.15Z" }, + { url = "https://files.pythonhosted.org/packages/92/e7/038aab64a946d535901103da16b953c8c9cc9c961dadcbf3609ed6428d23/pyzmq-27.1.0-cp312-abi3-macosx_10_15_universal2.whl", hash = "sha256:452631b640340c928fa343801b0d07eb0c3789a5ffa843f6e1a9cee0ba4eb4fc", size = 1306279, upload-time = "2025-09-08T23:08:03.807Z" }, + { url = "https://files.pythonhosted.org/packages/e8/5e/c3c49fdd0f535ef45eefcc16934648e9e59dace4a37ee88fc53f6cd8e641/pyzmq-27.1.0-cp312-abi3-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1c179799b118e554b66da67d88ed66cd37a169f1f23b5d9f0a231b4e8d44a113", size = 895645, upload-time = "2025-09-08T23:08:05.301Z" }, + { url = "https://files.pythonhosted.org/packages/f8/e5/b0b2504cb4e903a74dcf1ebae157f9e20ebb6ea76095f6cfffea28c42ecd/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3837439b7f99e60312f0c926a6ad437b067356dc2bc2ec96eb395fd0fe804233", size = 652574, upload-time = "2025-09-08T23:08:06.828Z" }, + { url = "https://files.pythonhosted.org/packages/f8/9b/c108cdb55560eaf253f0cbdb61b29971e9fb34d9c3499b0e96e4e60ed8a5/pyzmq-27.1.0-cp312-abi3-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:43ad9a73e3da1fab5b0e7e13402f0b2fb934ae1c876c51d0afff0e7c052eca31", size = 840995, upload-time = "2025-09-08T23:08:08.396Z" }, + { url = "https://files.pythonhosted.org/packages/c2/bb/b79798ca177b9eb0825b4c9998c6af8cd2a7f15a6a1a4272c1d1a21d382f/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:0de3028d69d4cdc475bfe47a6128eb38d8bc0e8f4d69646adfbcd840facbac28", size = 1642070, upload-time = "2025-09-08T23:08:09.989Z" }, + { url = 
"https://files.pythonhosted.org/packages/9c/80/2df2e7977c4ede24c79ae39dcef3899bfc5f34d1ca7a5b24f182c9b7a9ca/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_i686.whl", hash = "sha256:cf44a7763aea9298c0aa7dbf859f87ed7012de8bda0f3977b6fb1d96745df856", size = 2021121, upload-time = "2025-09-08T23:08:11.907Z" }, + { url = "https://files.pythonhosted.org/packages/46/bd/2d45ad24f5f5ae7e8d01525eb76786fa7557136555cac7d929880519e33a/pyzmq-27.1.0-cp312-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:f30f395a9e6fbca195400ce833c731e7b64c3919aa481af4d88c3759e0cb7496", size = 1878550, upload-time = "2025-09-08T23:08:13.513Z" }, + { url = "https://files.pythonhosted.org/packages/e6/2f/104c0a3c778d7c2ab8190e9db4f62f0b6957b53c9d87db77c284b69f33ea/pyzmq-27.1.0-cp312-abi3-win32.whl", hash = "sha256:250e5436a4ba13885494412b3da5d518cd0d3a278a1ae640e113c073a5f88edd", size = 559184, upload-time = "2025-09-08T23:08:15.163Z" }, + { url = "https://files.pythonhosted.org/packages/fc/7f/a21b20d577e4100c6a41795842028235998a643b1ad406a6d4163ea8f53e/pyzmq-27.1.0-cp312-abi3-win_amd64.whl", hash = "sha256:9ce490cf1d2ca2ad84733aa1d69ce6855372cb5ce9223802450c9b2a7cba0ccf", size = 619480, upload-time = "2025-09-08T23:08:17.192Z" }, + { url = "https://files.pythonhosted.org/packages/78/c2/c012beae5f76b72f007a9e91ee9401cb88c51d0f83c6257a03e785c81cc2/pyzmq-27.1.0-cp312-abi3-win_arm64.whl", hash = "sha256:75a2f36223f0d535a0c919e23615fc85a1e23b71f40c7eb43d7b1dedb4d8f15f", size = 552993, upload-time = "2025-09-08T23:08:18.926Z" }, + { url = "https://files.pythonhosted.org/packages/60/cb/84a13459c51da6cec1b7b1dc1a47e6db6da50b77ad7fd9c145842750a011/pyzmq-27.1.0-cp313-cp313-android_24_arm64_v8a.whl", hash = "sha256:93ad4b0855a664229559e45c8d23797ceac03183c7b6f5b4428152a6b06684a5", size = 1122436, upload-time = "2025-09-08T23:08:20.801Z" }, + { url = "https://files.pythonhosted.org/packages/dc/b6/94414759a69a26c3dd674570a81813c46a078767d931a6c70ad29fc585cb/pyzmq-27.1.0-cp313-cp313-android_24_x86_64.whl", hash = 
"sha256:fbb4f2400bfda24f12f009cba62ad5734148569ff4949b1b6ec3b519444342e6", size = 1156301, upload-time = "2025-09-08T23:08:22.47Z" }, + { url = "https://files.pythonhosted.org/packages/a5/ad/15906493fd40c316377fd8a8f6b1f93104f97a752667763c9b9c1b71d42d/pyzmq-27.1.0-cp313-cp313t-macosx_10_15_universal2.whl", hash = "sha256:e343d067f7b151cfe4eb3bb796a7752c9d369eed007b91231e817071d2c2fec7", size = 1341197, upload-time = "2025-09-08T23:08:24.286Z" }, + { url = "https://files.pythonhosted.org/packages/14/1d/d343f3ce13db53a54cb8946594e567410b2125394dafcc0268d8dda027e0/pyzmq-27.1.0-cp313-cp313t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:08363b2011dec81c354d694bdecaef4770e0ae96b9afea70b3f47b973655cc05", size = 897275, upload-time = "2025-09-08T23:08:26.063Z" }, + { url = "https://files.pythonhosted.org/packages/69/2d/d83dd6d7ca929a2fc67d2c3005415cdf322af7751d773524809f9e585129/pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d54530c8c8b5b8ddb3318f481297441af102517602b569146185fa10b63f4fa9", size = 660469, upload-time = "2025-09-08T23:08:27.623Z" }, + { url = "https://files.pythonhosted.org/packages/3e/cd/9822a7af117f4bc0f1952dbe9ef8358eb50a24928efd5edf54210b850259/pyzmq-27.1.0-cp313-cp313t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6f3afa12c392f0a44a2414056d730eebc33ec0926aae92b5ad5cf26ebb6cc128", size = 847961, upload-time = "2025-09-08T23:08:29.672Z" }, + { url = "https://files.pythonhosted.org/packages/9a/12/f003e824a19ed73be15542f172fd0ec4ad0b60cf37436652c93b9df7c585/pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:c65047adafe573ff023b3187bb93faa583151627bc9c51fc4fb2c561ed689d39", size = 1650282, upload-time = "2025-09-08T23:08:31.349Z" }, + { url = "https://files.pythonhosted.org/packages/d5/4a/e82d788ed58e9a23995cee70dbc20c9aded3d13a92d30d57ec2291f1e8a3/pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_i686.whl", hash = 
"sha256:90e6e9441c946a8b0a667356f7078d96411391a3b8f80980315455574177ec97", size = 2024468, upload-time = "2025-09-08T23:08:33.543Z" }, + { url = "https://files.pythonhosted.org/packages/d9/94/2da0a60841f757481e402b34bf4c8bf57fa54a5466b965de791b1e6f747d/pyzmq-27.1.0-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:add071b2d25f84e8189aaf0882d39a285b42fa3853016ebab234a5e78c7a43db", size = 1885394, upload-time = "2025-09-08T23:08:35.51Z" }, + { url = "https://files.pythonhosted.org/packages/4f/6f/55c10e2e49ad52d080dc24e37adb215e5b0d64990b57598abc2e3f01725b/pyzmq-27.1.0-cp313-cp313t-win32.whl", hash = "sha256:7ccc0700cfdf7bd487bea8d850ec38f204478681ea02a582a8da8171b7f90a1c", size = 574964, upload-time = "2025-09-08T23:08:37.178Z" }, + { url = "https://files.pythonhosted.org/packages/87/4d/2534970ba63dd7c522d8ca80fb92777f362c0f321900667c615e2067cb29/pyzmq-27.1.0-cp313-cp313t-win_amd64.whl", hash = "sha256:8085a9fba668216b9b4323be338ee5437a235fe275b9d1610e422ccc279733e2", size = 641029, upload-time = "2025-09-08T23:08:40.595Z" }, + { url = "https://files.pythonhosted.org/packages/f6/fa/f8aea7a28b0641f31d40dea42d7ef003fded31e184ef47db696bc74cd610/pyzmq-27.1.0-cp313-cp313t-win_arm64.whl", hash = "sha256:6bb54ca21bcfe361e445256c15eedf083f153811c37be87e0514934d6913061e", size = 561541, upload-time = "2025-09-08T23:08:42.668Z" }, + { url = "https://files.pythonhosted.org/packages/87/45/19efbb3000956e82d0331bafca5d9ac19ea2857722fa2caacefb6042f39d/pyzmq-27.1.0-cp314-cp314t-macosx_10_15_universal2.whl", hash = "sha256:ce980af330231615756acd5154f29813d553ea555485ae712c491cd483df6b7a", size = 1341197, upload-time = "2025-09-08T23:08:44.973Z" }, + { url = "https://files.pythonhosted.org/packages/48/43/d72ccdbf0d73d1343936296665826350cb1e825f92f2db9db3e61c2162a2/pyzmq-27.1.0-cp314-cp314t-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:1779be8c549e54a1c38f805e56d2a2e5c009d26de10921d7d51cfd1c8d4632ea", size = 897175, upload-time = "2025-09-08T23:08:46.601Z" }, + { url 
= "https://files.pythonhosted.org/packages/2f/2e/a483f73a10b65a9ef0161e817321d39a770b2acf8bcf3004a28d90d14a94/pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7200bb0f03345515df50d99d3db206a0a6bee1955fbb8c453c76f5bf0e08fb96", size = 660427, upload-time = "2025-09-08T23:08:48.187Z" }, + { url = "https://files.pythonhosted.org/packages/f5/d2/5f36552c2d3e5685abe60dfa56f91169f7a2d99bbaf67c5271022ab40863/pyzmq-27.1.0-cp314-cp314t-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:01c0e07d558b06a60773744ea6251f769cd79a41a97d11b8bf4ab8f034b0424d", size = 847929, upload-time = "2025-09-08T23:08:49.76Z" }, + { url = "https://files.pythonhosted.org/packages/c4/2a/404b331f2b7bf3198e9945f75c4c521f0c6a3a23b51f7a4a401b94a13833/pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:80d834abee71f65253c91540445d37c4c561e293ba6e741b992f20a105d69146", size = 1650193, upload-time = "2025-09-08T23:08:51.7Z" }, + { url = "https://files.pythonhosted.org/packages/1c/0b/f4107e33f62a5acf60e3ded67ed33d79b4ce18de432625ce2fc5093d6388/pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:544b4e3b7198dde4a62b8ff6685e9802a9a1ebf47e77478a5eb88eca2a82f2fd", size = 2024388, upload-time = "2025-09-08T23:08:53.393Z" }, + { url = "https://files.pythonhosted.org/packages/0d/01/add31fe76512642fd6e40e3a3bd21f4b47e242c8ba33efb6809e37076d9b/pyzmq-27.1.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:cedc4c68178e59a4046f97eca31b148ddcf51e88677de1ef4e78cf06c5376c9a", size = 1885316, upload-time = "2025-09-08T23:08:55.702Z" }, + { url = "https://files.pythonhosted.org/packages/c4/59/a5f38970f9bf07cee96128de79590bb354917914a9be11272cfc7ff26af0/pyzmq-27.1.0-cp314-cp314t-win32.whl", hash = "sha256:1f0b2a577fd770aa6f053211a55d1c47901f4d537389a034c690291485e5fe92", size = 587472, upload-time = "2025-09-08T23:08:58.18Z" }, + { url = 
"https://files.pythonhosted.org/packages/70/d8/78b1bad170f93fcf5e3536e70e8fadac55030002275c9a29e8f5719185de/pyzmq-27.1.0-cp314-cp314t-win_amd64.whl", hash = "sha256:19c9468ae0437f8074af379e986c5d3d7d7bfe033506af442e8c879732bedbe0", size = 661401, upload-time = "2025-09-08T23:08:59.802Z" }, + { url = "https://files.pythonhosted.org/packages/81/d6/4bfbb40c9a0b42fc53c7cf442f6385db70b40f74a783130c5d0a5aa62228/pyzmq-27.1.0-cp314-cp314t-win_arm64.whl", hash = "sha256:dc5dbf68a7857b59473f7df42650c621d7e8923fb03fa74a526890f4d33cc4d7", size = 575170, upload-time = "2025-09-08T23:09:01.418Z" }, + { url = "https://files.pythonhosted.org/packages/f3/81/a65e71c1552f74dec9dff91d95bafb6e0d33338a8dfefbc88aa562a20c92/pyzmq-27.1.0-pp310-pypy310_pp73-macosx_10_15_x86_64.whl", hash = "sha256:c17e03cbc9312bee223864f1a2b13a99522e0dc9f7c5df0177cd45210ac286e6", size = 836266, upload-time = "2025-09-08T23:09:40.048Z" }, + { url = "https://files.pythonhosted.org/packages/58/ed/0202ca350f4f2b69faa95c6d931e3c05c3a397c184cacb84cb4f8f42f287/pyzmq-27.1.0-pp310-pypy310_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:f328d01128373cb6763823b2b4e7f73bdf767834268c565151eacb3b7a392f90", size = 800206, upload-time = "2025-09-08T23:09:41.902Z" }, + { url = "https://files.pythonhosted.org/packages/47/42/1ff831fa87fe8f0a840ddb399054ca0009605d820e2b44ea43114f5459f4/pyzmq-27.1.0-pp310-pypy310_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c1790386614232e1b3a40a958454bdd42c6d1811837b15ddbb052a032a43f62", size = 567747, upload-time = "2025-09-08T23:09:43.741Z" }, + { url = "https://files.pythonhosted.org/packages/d1/db/5c4d6807434751e3f21231bee98109aa57b9b9b55e058e450d0aef59b70f/pyzmq-27.1.0-pp310-pypy310_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:448f9cb54eb0cee4732b46584f2710c8bc178b0e5371d9e4fc8125201e413a74", size = 747371, upload-time = "2025-09-08T23:09:45.575Z" }, + { url = 
"https://files.pythonhosted.org/packages/26/af/78ce193dbf03567eb8c0dc30e3df2b9e56f12a670bf7eb20f9fb532c7e8a/pyzmq-27.1.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:05b12f2d32112bf8c95ef2e74ec4f1d4beb01f8b5e703b38537f8849f92cb9ba", size = 544862, upload-time = "2025-09-08T23:09:47.448Z" }, + { url = "https://files.pythonhosted.org/packages/4c/c6/c4dcdecdbaa70969ee1fdced6d7b8f60cfabe64d25361f27ac4665a70620/pyzmq-27.1.0-pp311-pypy311_pp73-macosx_10_15_x86_64.whl", hash = "sha256:18770c8d3563715387139060d37859c02ce40718d1faf299abddcdcc6a649066", size = 836265, upload-time = "2025-09-08T23:09:49.376Z" }, + { url = "https://files.pythonhosted.org/packages/3e/79/f38c92eeaeb03a2ccc2ba9866f0439593bb08c5e3b714ac1d553e5c96e25/pyzmq-27.1.0-pp311-pypy311_pp73-manylinux2014_i686.manylinux_2_17_i686.whl", hash = "sha256:ac25465d42f92e990f8d8b0546b01c391ad431c3bf447683fdc40565941d0604", size = 800208, upload-time = "2025-09-08T23:09:51.073Z" }, + { url = "https://files.pythonhosted.org/packages/49/0e/3f0d0d335c6b3abb9b7b723776d0b21fa7f3a6c819a0db6097059aada160/pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:53b40f8ae006f2734ee7608d59ed661419f087521edbfc2149c3932e9c14808c", size = 567747, upload-time = "2025-09-08T23:09:52.698Z" }, + { url = "https://files.pythonhosted.org/packages/a1/cf/f2b3784d536250ffd4be70e049f3b60981235d70c6e8ce7e3ef21e1adb25/pyzmq-27.1.0-pp311-pypy311_pp73-manylinux_2_26_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f605d884e7c8be8fe1aa94e0a783bf3f591b84c24e4bc4f3e7564c82ac25e271", size = 747371, upload-time = "2025-09-08T23:09:54.563Z" }, + { url = "https://files.pythonhosted.org/packages/01/1b/5dbe84eefc86f48473947e2f41711aded97eecef1231f4558f1f02713c12/pyzmq-27.1.0-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:c9f7f6e13dff2e44a6afeaf2cf54cee5929ad64afaf4d40b50f93c58fc687355", size = 544862, upload-time = "2025-09-08T23:09:56.509Z" }, +] + [[package]] name = "regex" version = "2025.9.1" @@ 
-1676,6 +2666,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e9/44/75a9c9421471a6c4805dbf2356f7c181a29c1879239abab1ea2cc8f38b40/sniffio-1.3.1-py3-none-any.whl", hash = "sha256:2f6da418d1f1e0fddd844478f41680e794e6051915791a034ff65e5f100525a2", size = 10235, upload-time = "2024-02-25T23:20:01.196Z" }, ] +[[package]] +name = "stack-data" +version = "0.6.3" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "asttokens" }, + { name = "executing" }, + { name = "pure-eval" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/28/e3/55dcc2cfbc3ca9c29519eb6884dd1415ecb53b0e934862d3559ddcb7e20b/stack_data-0.6.3.tar.gz", hash = "sha256:836a778de4fec4dcd1dcd89ed8abff8a221f58308462e1c4aa2a3cf30148f0b9", size = 44707, upload-time = "2023-09-30T13:58:05.479Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/f1/7b/ce1eafaf1a76852e2ec9b22edecf1daa58175c090266e9f6c64afcd81d91/stack_data-0.6.3-py3-none-any.whl", hash = "sha256:d5558e0c25a4cb0853cddad3d77da9891a08cb85dd9f9f91b9f8cd66e511e695", size = 24521, upload-time = "2023-09-30T13:58:03.53Z" }, +] + [[package]] name = "starlette" version = "0.48.0" @@ -2057,6 +3061,25 @@ wheels = [ { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-win_amd64.whl" }, ] +[[package]] +name = "tornado" +version = "6.5.4" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/37/1d/0a336abf618272d53f62ebe274f712e213f5a03c0b2339575430b8362ef2/tornado-6.5.4.tar.gz", hash = "sha256:a22fa9047405d03260b483980635f0b041989d8bcc9a313f8fe18b411d84b1d7", size = 513632, upload-time = "2025-12-15T19:21:03.836Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/a9/e94a9d5224107d7ce3cc1fab8d5dc97f5ea351ccc6322ee4fb661da94e35/tornado-6.5.4-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d6241c1a16b1c9e4cc28148b1cda97dd1c6cb4fb7068ac1bedc610768dff0ba9", size = 443909, upload-time = 
"2025-12-15T19:20:48.382Z" }, + { url = "https://files.pythonhosted.org/packages/db/7e/f7b8d8c4453f305a51f80dbb49014257bb7d28ccb4bbb8dd328ea995ecad/tornado-6.5.4-cp39-abi3-macosx_10_9_x86_64.whl", hash = "sha256:2d50f63dda1d2cac3ae1fa23d254e16b5e38153758470e9956cbc3d813d40843", size = 442163, upload-time = "2025-12-15T19:20:49.791Z" }, + { url = "https://files.pythonhosted.org/packages/ba/b5/206f82d51e1bfa940ba366a8d2f83904b15942c45a78dd978b599870ab44/tornado-6.5.4-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d1cf66105dc6acb5af613c054955b8137e34a03698aa53272dbda4afe252be17", size = 445746, upload-time = "2025-12-15T19:20:51.491Z" }, + { url = "https://files.pythonhosted.org/packages/8e/9d/1a3338e0bd30ada6ad4356c13a0a6c35fbc859063fa7eddb309183364ac1/tornado-6.5.4-cp39-abi3-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:50ff0a58b0dc97939d29da29cd624da010e7f804746621c78d14b80238669335", size = 445083, upload-time = "2025-12-15T19:20:52.778Z" }, + { url = "https://files.pythonhosted.org/packages/50/d4/e51d52047e7eb9a582da59f32125d17c0482d065afd5d3bc435ff2120dc5/tornado-6.5.4-cp39-abi3-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e5fb5e04efa54cf0baabdd10061eb4148e0be137166146fff835745f59ab9f7f", size = 445315, upload-time = "2025-12-15T19:20:53.996Z" }, + { url = "https://files.pythonhosted.org/packages/27/07/2273972f69ca63dbc139694a3fc4684edec3ea3f9efabf77ed32483b875c/tornado-6.5.4-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9c86b1643b33a4cd415f8d0fe53045f913bf07b4a3ef646b735a6a86047dda84", size = 446003, upload-time = "2025-12-15T19:20:56.101Z" }, + { url = "https://files.pythonhosted.org/packages/d1/83/41c52e47502bf7260044413b6770d1a48dda2f0246f95ee1384a3cd9c44a/tornado-6.5.4-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:6eb82872335a53dd063a4f10917b3efd28270b56a33db69009606a0312660a6f", size = 445412, upload-time = 
"2025-12-15T19:20:57.398Z" }, + { url = "https://files.pythonhosted.org/packages/10/c7/bc96917f06cbee182d44735d4ecde9c432e25b84f4c2086143013e7b9e52/tornado-6.5.4-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6076d5dda368c9328ff41ab5d9dd3608e695e8225d1cd0fd1e006f05da3635a8", size = 445392, upload-time = "2025-12-15T19:20:58.692Z" }, + { url = "https://files.pythonhosted.org/packages/0c/1a/d7592328d037d36f2d2462f4bc1fbb383eec9278bc786c1b111cbbd44cfa/tornado-6.5.4-cp39-abi3-win32.whl", hash = "sha256:1768110f2411d5cd281bac0a090f707223ce77fd110424361092859e089b38d1", size = 446481, upload-time = "2025-12-15T19:21:00.008Z" }, + { url = "https://files.pythonhosted.org/packages/d6/6d/c69be695a0a64fd37a97db12355a035a6d90f79067a3cf936ec2b1dc38cd/tornado-6.5.4-cp39-abi3-win_amd64.whl", hash = "sha256:fa07d31e0cd85c60713f2b995da613588aa03e1303d75705dca6af8babc18ddc", size = 446886, upload-time = "2025-12-15T19:21:01.287Z" }, + { url = "https://files.pythonhosted.org/packages/50/49/8dc3fd90902f70084bd2cd059d576ddb4f8bb44c2c7c0e33a11422acb17e/tornado-6.5.4-cp39-abi3-win_arm64.whl", hash = "sha256:053e6e16701eb6cbe641f308f4c1a9541f91b6261991160391bfc342e8a551a1", size = 445910, upload-time = "2025-12-15T19:21:02.571Z" }, +] + [[package]] name = "tqdm" version = "4.67.1" @@ -2069,6 +3092,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d0/30/dc54f88dd4a2b5dc8a0279bdd7270e735851848b762aeb1c1184ed1f6b14/tqdm-4.67.1-py3-none-any.whl", hash = "sha256:26445eca388f82e72884e0d580d5464cd801a3ea01e63e5601bdff9ba6a48de2", size = 78540, upload-time = "2024-11-24T20:12:19.698Z" }, ] +[[package]] +name = "traitlets" +version = "5.14.3" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/eb/79/72064e6a701c2183016abbbfedaba506d81e30e232a68c9f0d6f6fcd1574/traitlets-5.14.3.tar.gz", hash = "sha256:9ed0579d3502c94b4b3732ac120375cda96f923114522847de4b3bb98b96b6b7", size = 161621, upload-time = "2024-04-19T11:11:49.746Z" 
} +wheels = [ + { url = "https://files.pythonhosted.org/packages/00/c0/8f5d070730d7836adc9c9b6408dec68c6ced86b304a9b26a14df072a6e8c/traitlets-5.14.3-py3-none-any.whl", hash = "sha256:b74e89e397b1ed28cc831db7aea759ba6640cb3de13090ca145426688ff1ac4f", size = 85359, upload-time = "2024-04-19T11:11:46.763Z" }, +] + [[package]] name = "transformers" version = "4.57.3" @@ -2224,6 +3256,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d8/2d/7ef56e25f78786e59fefd9b19867c325f9686317d9f7b93b5cb340360a3e/wandb-0.21.3-py3-none-win_amd64.whl", hash = "sha256:56d5a5697766f552a9933d8c6a564202194768eb0389bd5f9fe9a99cd4cee41e", size = 18709411, upload-time = "2025-08-30T18:21:52.874Z" }, ] +[[package]] +name = "wcwidth" +version = "0.2.14" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/24/30/6b0809f4510673dc723187aeaf24c7f5459922d01e2f794277a3dfb90345/wcwidth-0.2.14.tar.gz", hash = "sha256:4d478375d31bc5395a3c55c40ccdf3354688364cd61c4f6adacaa9215d0b3605", size = 102293, upload-time = "2025-09-22T16:29:53.023Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/af/b5/123f13c975e9f27ab9c0770f514345bd406d0e8d3b7a0723af9d43f710af/wcwidth-0.2.14-py2.py3-none-any.whl", hash = "sha256:a7bb560c8aee30f9957e5f9895805edd20602f2d7f720186dfd906e82b4982e1", size = 37286, upload-time = "2025-09-22T16:29:51.641Z" }, +] + [[package]] name = "xxhash" version = "3.5.0" From ae0bf525299633d973d39ecf996edcb48e1fa6f5 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 5 Jan 2026 18:57:46 +0000 Subject: [PATCH 003/119] tune hyperparameters based on overnight sweeps. 
warmdown_ratio is the biggest free win, increasing 0.2 -> 0.4, and embedding lr can be larger bumping 0.2 -> 0.3 --- scripts/base_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index 2390b688..c8345e03 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -47,13 +47,13 @@ parser.add_argument("--target_param_data_ratio", type=int, default=20, help="cal # Optimization parser.add_argument("--device_batch_size", type=int, default=32, help="per-device batch size") parser.add_argument("--total_batch_size", type=int, default=524288, help="total batch size in tokens") -parser.add_argument("--embedding_lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") +parser.add_argument("--embedding_lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)") parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") parser.add_argument("--grad_clip", type=float, default=1.0, help="gradient clipping value (0.0 = disabled)") parser.add_argument("--warmup_ratio", type=float, default=0.0, help="ratio of iterations for LR warmup") -parser.add_argument("--warmdown_ratio", type=float, default=0.2, help="ratio of iterations for LR warmdown") +parser.add_argument("--warmdown_ratio", type=float, default=0.4, help="ratio of iterations for LR warmdown") parser.add_argument("--final_lr_frac", type=float, default=0.0, help="final LR as fraction of initial LR") parser.add_argument("--resume_from_step", type=int, default=-1, help="resume training from this step (-1 = disable)") # Evaluation From 1b5de29e71a581db47fcc6824cf48b5ef67bff36 Mon Sep 17 00:00:00 2001 
From: Adria Blancafort <76774853+adriablancafort@users.noreply.github.com> Date: Wed, 7 Jan 2026 18:08:57 +0100 Subject: [PATCH 004/119] Fix undefined variable in chat_rl after recent refactor * Fix undefined variable * Remove unused import Remove unused import 're' from chat_rl.py --- scripts/chat_rl.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/scripts/chat_rl.py b/scripts/chat_rl.py index 1a09962b..ad557b91 100644 --- a/scripts/chat_rl.py +++ b/scripts/chat_rl.py @@ -19,7 +19,6 @@ torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=default import argparse import os import itertools -import re import wandb import torch import torch.distributed as dist @@ -174,7 +173,7 @@ def run_gsm8k_eval(task, tokenizer, engine, tokens = tokenizer.render_for_completion(conversation) prefix_length = len(tokens) # Generate k samples using batched generation inside the Engine - assert num_samples <= device_batch_size # usually this is true. we can add a loop if not... + assert num_samples <= args.device_batch_size # usually this is true. we can add a loop if not... generated_token_sequences, masks = engine.generate_batch( tokens, num_samples=num_samples, From ccf4b7f9bf91a250aa398a0cecab270bcea56050 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 7 Jan 2026 22:11:52 +0000 Subject: [PATCH 005/119] nudge hyperparameters of the base script with the results of the sweeps and miniseries. vocab size down to 32K. D:N ratio from 20 to 8. 
add miniseries script --- .gitignore | 13 +++- miniseries.sh | 89 ++++++++++++++++++++++ nanochat/gpt.py | 27 ++++++- pyproject.toml | 2 + run1000.sh | 4 +- scripts/base_train.py | 47 +++++++++--- scripts/tok_train.py | 2 +- speedrun.sh | 4 +- uv.lock | 166 ++++++++++++++++++++++++++++++++++++++++++ 9 files changed, 333 insertions(+), 21 deletions(-) create mode 100644 miniseries.sh diff --git a/.gitignore b/.gitignore index 7f280bd5..7950c9ff 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,15 @@ report.md eval_bundle/ # Secrets -.env \ No newline at end of file +.env + +# Local setup +.claude +CLAUDE.md +wandb/ + +# Local experimentation +experiments/ +ignore/ +knowledge/ +ideas/ diff --git a/miniseries.sh b/miniseries.sh new file mode 100644 index 00000000..9287def6 --- /dev/null +++ b/miniseries.sh @@ -0,0 +1,89 @@ +#!/bin/bash + +# See speedrun.sh for more comments + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" +mkdir -p $NANOCHAT_BASE_DIR + +# uv +command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh +[ -d ".venv" ] || uv venv +uv sync --extra gpu +source .venv/bin/activate + +# Tokenizer +python -m nanochat.dataset -n 240 +python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768 + +# Depths to train (the "miniseries") +DEPTHS=(10 11 12 13 14 15 16 17 18 19 20) +# Hardware +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" +# Logging +WANDB_RUN="${WANDB_RUN:-jan7_miniseries}" + +RESULTS_DIR="$NANOCHAT_BASE_DIR/jan7_miniseries_results" +mkdir -p "$RESULTS_DIR" +RESULTS_FILE="$RESULTS_DIR/results.csv" + +# Write CSV header +echo "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +log "==============================================" +log "Jan 7 Miniseries Training" +log "==============================================" + +for d in "${DEPTHS[@]}"; do + log 
"Training d=$d..." + + TAG="jan7_miniseries_d${d}" + START_TIME=$(date +%s) + + # Train the model with natural horizon (target_param_data_ratio default) + # No --target_flops, let it use the default ratio from base_train + torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \ + --depth=$d \ + --target_param_data_ratio=8 \ + --run="${WANDB_RUN}_d${d}" \ + --model_tag="${TAG}" \ + --core_metric_every=999999 \ + --core_metric_max_per_task=-1 \ + --sample_every=-1 \ + --save_every=-1 \ + 2>&1 | tee "$RESULTS_DIR/${TAG}_train.log" + + END_TIME=$(date +%s) + TRAIN_TIME=$((END_TIME - START_TIME)) + + # Extract stats from log + LOG_FILE="$RESULTS_DIR/${TAG}_train.log" + NUM_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | head -1 | tr -d ',') + NUM_SCALING_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ',') + NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',') + TOKENS_TRAINED=$((NUM_ITERS * 524288)) + PARAM_DATA_RATIO=$(python -c "print(f'{$TOKENS_TRAINED / $NUM_SCALING_PARAMS:.2f}')") + MODEL_DIM=$((d * 64)) + VAL_BPB=$(grep "Validation bpb:" "$LOG_FILE" | tail -1 | grep -oP '[\d.]+$') + CORE_SCORE=$(grep "CORE metric:" "$LOG_FILE" | tail -1 | awk '{print $NF}') + + if [ -z "$CORE_SCORE" ]; then + CORE_SCORE="0.0" + fi + + log " d=$d: params=$NUM_PARAMS, scaling=$NUM_SCALING_PARAMS, ratio=$PARAM_DATA_RATIO, bpb=$VAL_BPB, CORE=$CORE_SCORE, time=${TRAIN_TIME}s" + + # Append to CSV + echo "$d,$MODEL_DIM,$NUM_PARAMS,$NUM_SCALING_PARAMS,$NUM_ITERS,$TOKENS_TRAINED,$PARAM_DATA_RATIO,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE" +done + +log "==============================================" +log "Jan 7 Miniseries Complete!" 
+log "==============================================" +log "Results saved to: $RESULTS_FILE" +echo "" +echo "Results:" +column -t -s',' "$RESULTS_FILE" diff --git a/nanochat/gpt.py b/nanochat/gpt.py index e6027a96..478f6879 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -216,14 +216,35 @@ class GPT(nn.Module): return self.transformer.wte.weight.device def estimate_flops(self): - """ Return the estimated FLOPs per token for the model. Ref: https://arxiv.org/abs/2204.02311 """ + """ + Return the estimated FLOPs per token for the model (forward + backward). + Each matmul weight parameter contributes 2 FLOPs (multiply *, accumulate +) in forward, and 2X that in backward => 2+4=6. + Cleanest explanation of this: https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4 + On top of that, the term 12 * l * h * q * t accounts for key @ query matmul flops inside attention. + Ref: https://arxiv.org/abs/2204.02311 (PaLM paper). + This is ~1% off from the exact formulas of Chinchilla paper, the difference is: + - Chinchilla counts the embedding layer as flops (? weird, it's just a lookup => we ignore) + - Chinchilla counts exp/sum/divide in attention softmax as flops (a little sus and very tiny => we ignore) + """ nparams = sum(p.numel() for p in self.parameters()) nparams_embedding = self.transformer.wte.weight.numel() l, h, q, t = self.config.n_layer, self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t return num_flops_per_token - def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0): + def num_scaling_params(self): + """ + Return all of the parameters, same as Chinchilla paper. + Kaplan et al. did not include embedding parameters and said that this led to cleaner scaling laws. + But Kaplan et al. also had a bug in their results (as pointed out by Chinchilla). 
+ My own experiments in nanochat confirm the Chinchilla approach gives the much cleaner scaling law. + Ref: https://arxiv.org/abs/2203.15556 (Chinchilla paper <- good). + Ref: https://arxiv.org/abs/2001.08361 (Kaplan et al. original scaling laws paper <- bad) + """ + nparams = sum(p.numel() for p in self.parameters()) + return nparams + + def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95)): model_dim = self.config.n_embd ddp, rank, local_rank, world_size = get_dist_info() # Separate out all parameters into 3 groups (matrix, embedding, lm_head) @@ -239,7 +260,7 @@ class GPT(nn.Module): dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), ] - adamw_kwargs = dict(betas=(0.8, 0.95), eps=1e-10, weight_decay=weight_decay) + adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=weight_decay) AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) adamw_optimizer = AdamWFactory(adam_groups, **adamw_kwargs) # Create the Muon optimizer for the linear layers diff --git a/pyproject.toml b/pyproject.toml index 36cb7ce6..0931ca64 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,7 +13,9 @@ dependencies = [ "python-dotenv>=1.2.1", "regex>=2025.9.1", "rustbpe>=0.1.0", + "scipy>=1.15.3", "setuptools>=80.9.0", + "tabulate>=0.9.0", "tiktoken>=0.11.0", "tokenizers>=0.22.0", "torch>=2.9.0", diff --git a/run1000.sh b/run1000.sh index a0a66067..a7a3716e 100644 --- a/run1000.sh +++ b/run1000.sh @@ -23,7 +23,7 @@ python -m nanochat.dataset -n 16 # start downloading the rest of the shards for a total of 800 (see below why 800) python -m nanochat.dataset -n 800 & # todo: download the rest of it -python -m scripts.tok_train --max_chars=4000000000 +python -m scripts.tok_train --max_chars=4000000000 --vocab_size=65536 python -m scripts.tok_eval # Documenting my process for determining the hyperparameters for 
this run1000.sh script: @@ -71,7 +71,7 @@ python -m scripts.tok_eval # Number of processes/GPUs to use NPROC_PER_NODE=8 -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --target_param_data_ratio=20 --device_batch_size=8 --run=$WANDB_RUN torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval diff --git a/scripts/base_train.py b/scripts/base_train.py index c8345e03..de0321ae 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -1,11 +1,11 @@ """ -Train model. Run as: +Train model. From root directory of the project, run as: -python base_train.py +python -m scripts.base_train or distributed as: -torchrun --nproc_per_node=8 base_train.py +torchrun --nproc_per_node=8 -m scripts.base_train If you are only on CPU/Macbook, you'll want to train a much much smaller LLM.
Example: python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 --eval_tokens=512 --core_metric_every=-1 --total_batch_size=512 --num_iterations=20 @@ -39,11 +39,13 @@ parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('d parser.add_argument("--device_type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") # Model architecture parser.add_argument("--depth", type=int, default=20, help="depth of the Transformer model") +parser.add_argument("--aspect_ratio", type=int, default=64, help="model_dim = depth * aspect_ratio") +parser.add_argument("--head_dim", type=int, default=128, help="target head dimension for attention") parser.add_argument("--max_seq_len", type=int, default=2048, help="max context length") # Training horizon (only one used, in order of precedence) parser.add_argument("--num_iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") parser.add_argument("--target_flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") -parser.add_argument("--target_param_data_ratio", type=int, default=20, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") +parser.add_argument("--target_param_data_ratio", type=int, default=8, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") # Optimization parser.add_argument("--device_batch_size", type=int, default=32, help="per-device batch size") parser.add_argument("--total_batch_size", type=int, default=524288, help="total batch size in tokens") @@ -51,6 +53,8 @@ parser.add_argument("--embedding_lr", type=float, default=0.3, help="learning ra parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") 
parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") +parser.add_argument("--adam_beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding") +parser.add_argument("--adam_beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding") parser.add_argument("--grad_clip", type=float, default=1.0, help="gradient clipping value (0.0 = disabled)") parser.add_argument("--warmup_ratio", type=float, default=0.0, help="ratio of iterations for LR warmup") parser.add_argument("--warmdown_ratio", type=float, default=0.4, help="ratio of iterations for LR warmdown") @@ -89,8 +93,8 @@ print0(f"Vocab size: {vocab_size:,}") # Model kwargs are derived from the desired depth of the model num_layers = args.depth -model_dim = args.depth * 64 # aspect ratio 64 (usually this is varied from 64 -> 128 as model size increases) -def find_num_heads(model_dim, target_head_dim=128): +model_dim = args.depth * args.aspect_ratio +def find_num_heads(model_dim, target_head_dim): # Find num_heads that divides model_dim evenly, with head_dim closest to target. ideal = max(1, round(model_dim / target_head_dim)) for offset in range(model_dim): @@ -98,7 +102,7 @@ def find_num_heads(model_dim, target_head_dim=128): if candidate > 0 and model_dim % candidate == 0: return candidate return 1 -num_heads = find_num_heads(model_dim) +num_heads = find_num_heads(model_dim, args.head_dim) num_kv_heads = num_heads # default is 1:1 GQA (Group Query Attention) ratio (i.e. 
GQA is disabled) print0(f"num_layers: {num_layers}") print0(f"model_dim: {model_dim}") @@ -115,6 +119,17 @@ print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_l print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") +# Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19) +batch_lr_scale = 1.0 +reference_batch_size = 2**19 +batch_ratio = args.total_batch_size / reference_batch_size +if batch_ratio != 1.0: + # SGD: linear scaling with batch size is standard (not used in nanochat) + # AdamW: sqrt scaling is standard + # Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer + batch_lr_scale = batch_ratio ** 0.5 + print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})") + # ----------------------------------------------------------------------------- # Initialize the Model @@ -141,7 +156,8 @@ if resuming: orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape) model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe num_params = sum(p.numel() for p in model.parameters()) -print0(f"Number of parameters: {num_params:,}") +num_scaling_params = orig_model.num_scaling_params() +print0(f"Number of parameters: {num_params:,} (scaling: {num_scaling_params:,})") num_flops_per_token = model.estimate_flops() print0(f"Estimated FLOPs per token: {num_flops_per_token:e}") @@ -155,20 +171,27 @@ elif args.target_flops > 0: num_iterations = round(args.target_flops / (num_flops_per_token * args.total_batch_size)) print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}") elif args.target_param_data_ratio > 0: - # calculate the 
number of iterations from the target param data ratio - target_tokens = args.target_param_data_ratio * num_params + # calculate the number of iterations from the target param data ratio (use scaling params per Kaplan et al.) + target_tokens = args.target_param_data_ratio * num_scaling_params num_iterations = target_tokens // args.total_batch_size print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}") else: raise ValueError("No training horizon specified") total_tokens = args.total_batch_size * num_iterations print0(f"Total number of training tokens: {total_tokens:,}") -print0(f"Tokens : Params ratio: {args.total_batch_size * num_iterations / num_params:.2f}") # Chinchilla is ~20 +print0(f"Tokens : Params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") # ----------------------------------------------------------------------------- # Initialize the Optimizer (Muon for Linear layers, AdamW for embedding and lm_head) -optimizers = model.setup_optimizers(unembedding_lr=args.unembedding_lr, embedding_lr=args.embedding_lr, matrix_lr=args.matrix_lr, weight_decay=args.weight_decay) +adam_betas = (args.adam_beta1, args.adam_beta2) +optimizers = model.setup_optimizers( + unembedding_lr=args.unembedding_lr * batch_lr_scale, + embedding_lr=args.embedding_lr * batch_lr_scale, + matrix_lr=args.matrix_lr * batch_lr_scale, + weight_decay=args.weight_decay, + adam_betas=adam_betas, +) adamw_optimizer, muon_optimizer = optimizers if resuming: diff --git a/scripts/tok_train.py b/scripts/tok_train.py index e1b79ee2..4ab995c0 100644 --- a/scripts/tok_train.py +++ b/scripts/tok_train.py @@ -16,7 +16,7 @@ from nanochat.dataset import parquets_iter_batched parser = argparse.ArgumentParser(description='Train a BPE tokenizer') parser.add_argument('--max_chars', type=int, default=10_000_000_000, help='Maximum characters to train 
on (default: 10B)') parser.add_argument('--doc_cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)') -parser.add_argument('--vocab_size', type=int, default=65536, help='Vocabulary size (default: 65536 = 2^16)') +parser.add_argument('--vocab_size', type=int, default=32768, help='Vocabulary size (default: 32768 = 2^15)') args = parser.parse_args() print(f"max_chars: {args.max_chars:,}") print(f"doc_cap: {args.doc_cap:,}") diff --git a/speedrun.sh b/speedrun.sh index 8803dcb4..f9be2271 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -59,7 +59,7 @@ python -m nanochat.dataset -n 8 python -m nanochat.dataset -n 240 & DATASET_DOWNLOAD_PID=$! # train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data -python -m scripts.tok_train --max_chars=2000000000 +python -m scripts.tok_train --max_chars=2000000000 --vocab_size=65536 # evaluate the tokenizer (report compression ratio etc.) python -m scripts.tok_eval @@ -79,7 +79,7 @@ wait $DATASET_DOWNLOAD_PID NPROC_PER_NODE=8 # pretrain the d20 model -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --target_param_data_ratio=20 --run=$WANDB_RUN # evaluate the model on a larger chunk of train/val data and draw some samples torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss # evaluate the model on CORE tasks diff --git a/uv.lock b/uv.lock index 67ea0357..63b2c014 100644 --- a/uv.lock +++ b/uv.lock @@ -1483,7 +1483,10 @@ dependencies = [ { name = "python-dotenv" }, { name = "regex" }, { name = "rustbpe" }, + { name = "scipy", version = "1.15.3", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "scipy", version = "1.16.3", source = { registry = "https://pypi.org/simple" }, marker = 
"python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, { name = "setuptools" }, + { name = "tabulate" }, { name = "tiktoken" }, { name = "tokenizers" }, { name = "torch", version = "2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, @@ -1520,7 +1523,9 @@ requires-dist = [ { name = "python-dotenv", specifier = ">=1.2.1" }, { name = "regex", specifier = ">=2025.9.1" }, { name = "rustbpe", specifier = ">=0.1.0" }, + { name = "scipy", specifier = ">=1.15.3" }, { name = "setuptools", specifier = ">=80.9.0" }, + { name = "tabulate", specifier = ">=0.9.0" }, { name = "tiktoken", specifier = ">=0.11.0" }, { name = "tokenizers", specifier = ">=0.22.0" }, { name = "torch", specifier = ">=2.9.0" }, @@ -2617,6 +2622,158 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/58/5b/632a58724221ef03d78ab65062e82a1010e1bef8e8e0b9d7c6d7b8044841/safetensors-0.7.0-pp310-pypy310_pp73-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:473b32699f4200e69801bf5abf93f1a4ecd432a70984df164fc22ccf39c4a6f3", size = 531885, upload-time = "2025-11-19T15:18:27.146Z" }, ] +[[package]] +name = "scipy" +version = "1.15.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 
'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", +] +dependencies = [ + { name = "numpy", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0f/37/6964b830433e654ec7485e45a00fc9a27cf868d622838f6b6d9c5ec0d532/scipy-1.15.3.tar.gz", hash = "sha256:eae3cf522bc7df64b42cad3925c876e1b0b6c35c1337c93e12c0f366f55b0eaf", size = 59419214, upload-time = "2025-05-08T16:13:05.955Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/78/2f/4966032c5f8cc7e6a60f1b2e0ad686293b9474b65246b0c642e3ef3badd0/scipy-1.15.3-cp310-cp310-macosx_10_13_x86_64.whl", hash = "sha256:a345928c86d535060c9c2b25e71e87c39ab2f22fc96e9636bd74d1dbf9de448c", size = 38702770, upload-time = "2025-05-08T16:04:20.849Z" }, + { url = "https://files.pythonhosted.org/packages/a0/6e/0c3bf90fae0e910c274db43304ebe25a6b391327f3f10b5dcc638c090795/scipy-1.15.3-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:ad3432cb0f9ed87477a8d97f03b763fd1d57709f1bbde3c9369b1dff5503b253", size = 30094511, upload-time = "2025-05-08T16:04:27.103Z" }, + { url = "https://files.pythonhosted.org/packages/ea/b1/4deb37252311c1acff7f101f6453f0440794f51b6eacb1aad4459a134081/scipy-1.15.3-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:aef683a9ae6eb00728a542b796f52a5477b78252edede72b8327a886ab63293f", size = 22368151, upload-time = "2025-05-08T16:04:31.731Z" }, + { url = "https://files.pythonhosted.org/packages/38/7d/f457626e3cd3c29b3a49ca115a304cebb8cc6f31b04678f03b216899d3c6/scipy-1.15.3-cp310-cp310-macosx_14_0_x86_64.whl", hash = 
"sha256:1c832e1bd78dea67d5c16f786681b28dd695a8cb1fb90af2e27580d3d0967e92", size = 25121732, upload-time = "2025-05-08T16:04:36.596Z" }, + { url = "https://files.pythonhosted.org/packages/db/0a/92b1de4a7adc7a15dcf5bddc6e191f6f29ee663b30511ce20467ef9b82e4/scipy-1.15.3-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:263961f658ce2165bbd7b99fa5135195c3a12d9bef045345016b8b50c315cb82", size = 35547617, upload-time = "2025-05-08T16:04:43.546Z" }, + { url = "https://files.pythonhosted.org/packages/8e/6d/41991e503e51fc1134502694c5fa7a1671501a17ffa12716a4a9151af3df/scipy-1.15.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9e2abc762b0811e09a0d3258abee2d98e0c703eee49464ce0069590846f31d40", size = 37662964, upload-time = "2025-05-08T16:04:49.431Z" }, + { url = "https://files.pythonhosted.org/packages/25/e1/3df8f83cb15f3500478c889be8fb18700813b95e9e087328230b98d547ff/scipy-1.15.3-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:ed7284b21a7a0c8f1b6e5977ac05396c0d008b89e05498c8b7e8f4a1423bba0e", size = 37238749, upload-time = "2025-05-08T16:04:55.215Z" }, + { url = "https://files.pythonhosted.org/packages/93/3e/b3257cf446f2a3533ed7809757039016b74cd6f38271de91682aa844cfc5/scipy-1.15.3-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:5380741e53df2c566f4d234b100a484b420af85deb39ea35a1cc1be84ff53a5c", size = 40022383, upload-time = "2025-05-08T16:05:01.914Z" }, + { url = "https://files.pythonhosted.org/packages/d1/84/55bc4881973d3f79b479a5a2e2df61c8c9a04fcb986a213ac9c02cfb659b/scipy-1.15.3-cp310-cp310-win_amd64.whl", hash = "sha256:9d61e97b186a57350f6d6fd72640f9e99d5a4a2b8fbf4b9ee9a841eab327dc13", size = 41259201, upload-time = "2025-05-08T16:05:08.166Z" }, + { url = "https://files.pythonhosted.org/packages/96/ab/5cc9f80f28f6a7dff646c5756e559823614a42b1939d86dd0ed550470210/scipy-1.15.3-cp311-cp311-macosx_10_13_x86_64.whl", hash = "sha256:993439ce220d25e3696d1b23b233dd010169b62f6456488567e830654ee37a6b", size = 
38714255, upload-time = "2025-05-08T16:05:14.596Z" }, + { url = "https://files.pythonhosted.org/packages/4a/4a/66ba30abe5ad1a3ad15bfb0b59d22174012e8056ff448cb1644deccbfed2/scipy-1.15.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:34716e281f181a02341ddeaad584205bd2fd3c242063bd3423d61ac259ca7eba", size = 30111035, upload-time = "2025-05-08T16:05:20.152Z" }, + { url = "https://files.pythonhosted.org/packages/4b/fa/a7e5b95afd80d24313307f03624acc65801846fa75599034f8ceb9e2cbf6/scipy-1.15.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:3b0334816afb8b91dab859281b1b9786934392aa3d527cd847e41bb6f45bee65", size = 22384499, upload-time = "2025-05-08T16:05:24.494Z" }, + { url = "https://files.pythonhosted.org/packages/17/99/f3aaddccf3588bb4aea70ba35328c204cadd89517a1612ecfda5b2dd9d7a/scipy-1.15.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:6db907c7368e3092e24919b5e31c76998b0ce1684d51a90943cb0ed1b4ffd6c1", size = 25152602, upload-time = "2025-05-08T16:05:29.313Z" }, + { url = "https://files.pythonhosted.org/packages/56/c5/1032cdb565f146109212153339f9cb8b993701e9fe56b1c97699eee12586/scipy-1.15.3-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:721d6b4ef5dc82ca8968c25b111e307083d7ca9091bc38163fb89243e85e3889", size = 35503415, upload-time = "2025-05-08T16:05:34.699Z" }, + { url = "https://files.pythonhosted.org/packages/bd/37/89f19c8c05505d0601ed5650156e50eb881ae3918786c8fd7262b4ee66d3/scipy-1.15.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:39cb9c62e471b1bb3750066ecc3a3f3052b37751c7c3dfd0fd7e48900ed52982", size = 37652622, upload-time = "2025-05-08T16:05:40.762Z" }, + { url = "https://files.pythonhosted.org/packages/7e/31/be59513aa9695519b18e1851bb9e487de66f2d31f835201f1b42f5d4d475/scipy-1.15.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:795c46999bae845966368a3c013e0e00947932d68e235702b5c3f6ea799aa8c9", size = 37244796, upload-time = "2025-05-08T16:05:48.119Z" }, + { url = 
"https://files.pythonhosted.org/packages/10/c0/4f5f3eeccc235632aab79b27a74a9130c6c35df358129f7ac8b29f562ac7/scipy-1.15.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:18aaacb735ab38b38db42cb01f6b92a2d0d4b6aabefeb07f02849e47f8fb3594", size = 40047684, upload-time = "2025-05-08T16:05:54.22Z" }, + { url = "https://files.pythonhosted.org/packages/ab/a7/0ddaf514ce8a8714f6ed243a2b391b41dbb65251affe21ee3077ec45ea9a/scipy-1.15.3-cp311-cp311-win_amd64.whl", hash = "sha256:ae48a786a28412d744c62fd7816a4118ef97e5be0bee968ce8f0a2fba7acf3bb", size = 41246504, upload-time = "2025-05-08T16:06:00.437Z" }, + { url = "https://files.pythonhosted.org/packages/37/4b/683aa044c4162e10ed7a7ea30527f2cbd92e6999c10a8ed8edb253836e9c/scipy-1.15.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:6ac6310fdbfb7aa6612408bd2f07295bcbd3fda00d2d702178434751fe48e019", size = 38766735, upload-time = "2025-05-08T16:06:06.471Z" }, + { url = "https://files.pythonhosted.org/packages/7b/7e/f30be3d03de07f25dc0ec926d1681fed5c732d759ac8f51079708c79e680/scipy-1.15.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:185cd3d6d05ca4b44a8f1595af87f9c372bb6acf9c808e99aa3e9aa03bd98cf6", size = 30173284, upload-time = "2025-05-08T16:06:11.686Z" }, + { url = "https://files.pythonhosted.org/packages/07/9c/0ddb0d0abdabe0d181c1793db51f02cd59e4901da6f9f7848e1f96759f0d/scipy-1.15.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:05dc6abcd105e1a29f95eada46d4a3f251743cfd7d3ae8ddb4088047f24ea477", size = 22446958, upload-time = "2025-05-08T16:06:15.97Z" }, + { url = "https://files.pythonhosted.org/packages/af/43/0bce905a965f36c58ff80d8bea33f1f9351b05fad4beaad4eae34699b7a1/scipy-1.15.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:06efcba926324df1696931a57a176c80848ccd67ce6ad020c810736bfd58eb1c", size = 25242454, upload-time = "2025-05-08T16:06:20.394Z" }, + { url = 
"https://files.pythonhosted.org/packages/56/30/a6f08f84ee5b7b28b4c597aca4cbe545535c39fe911845a96414700b64ba/scipy-1.15.3-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c05045d8b9bfd807ee1b9f38761993297b10b245f012b11b13b91ba8945f7e45", size = 35210199, upload-time = "2025-05-08T16:06:26.159Z" }, + { url = "https://files.pythonhosted.org/packages/0b/1f/03f52c282437a168ee2c7c14a1a0d0781a9a4a8962d84ac05c06b4c5b555/scipy-1.15.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:271e3713e645149ea5ea3e97b57fdab61ce61333f97cfae392c28ba786f9bb49", size = 37309455, upload-time = "2025-05-08T16:06:32.778Z" }, + { url = "https://files.pythonhosted.org/packages/89/b1/fbb53137f42c4bf630b1ffdfc2151a62d1d1b903b249f030d2b1c0280af8/scipy-1.15.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6cfd56fc1a8e53f6e89ba3a7a7251f7396412d655bca2aa5611c8ec9a6784a1e", size = 36885140, upload-time = "2025-05-08T16:06:39.249Z" }, + { url = "https://files.pythonhosted.org/packages/2e/2e/025e39e339f5090df1ff266d021892694dbb7e63568edcfe43f892fa381d/scipy-1.15.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:0ff17c0bb1cb32952c09217d8d1eed9b53d1463e5f1dd6052c7857f83127d539", size = 39710549, upload-time = "2025-05-08T16:06:45.729Z" }, + { url = "https://files.pythonhosted.org/packages/e6/eb/3bf6ea8ab7f1503dca3a10df2e4b9c3f6b3316df07f6c0ded94b281c7101/scipy-1.15.3-cp312-cp312-win_amd64.whl", hash = "sha256:52092bc0472cfd17df49ff17e70624345efece4e1a12b23783a1ac59a1b728ed", size = 40966184, upload-time = "2025-05-08T16:06:52.623Z" }, + { url = "https://files.pythonhosted.org/packages/73/18/ec27848c9baae6e0d6573eda6e01a602e5649ee72c27c3a8aad673ebecfd/scipy-1.15.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2c620736bcc334782e24d173c0fdbb7590a0a436d2fdf39310a8902505008759", size = 38728256, upload-time = "2025-05-08T16:06:58.696Z" }, + { url = 
"https://files.pythonhosted.org/packages/74/cd/1aef2184948728b4b6e21267d53b3339762c285a46a274ebb7863c9e4742/scipy-1.15.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:7e11270a000969409d37ed399585ee530b9ef6aa99d50c019de4cb01e8e54e62", size = 30109540, upload-time = "2025-05-08T16:07:04.209Z" }, + { url = "https://files.pythonhosted.org/packages/5b/d8/59e452c0a255ec352bd0a833537a3bc1bfb679944c4938ab375b0a6b3a3e/scipy-1.15.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:8c9ed3ba2c8a2ce098163a9bdb26f891746d02136995df25227a20e71c396ebb", size = 22383115, upload-time = "2025-05-08T16:07:08.998Z" }, + { url = "https://files.pythonhosted.org/packages/08/f5/456f56bbbfccf696263b47095291040655e3cbaf05d063bdc7c7517f32ac/scipy-1.15.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:0bdd905264c0c9cfa74a4772cdb2070171790381a5c4d312c973382fc6eaf730", size = 25163884, upload-time = "2025-05-08T16:07:14.091Z" }, + { url = "https://files.pythonhosted.org/packages/a2/66/a9618b6a435a0f0c0b8a6d0a2efb32d4ec5a85f023c2b79d39512040355b/scipy-1.15.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:79167bba085c31f38603e11a267d862957cbb3ce018d8b38f79ac043bc92d825", size = 35174018, upload-time = "2025-05-08T16:07:19.427Z" }, + { url = "https://files.pythonhosted.org/packages/b5/09/c5b6734a50ad4882432b6bb7c02baf757f5b2f256041da5df242e2d7e6b6/scipy-1.15.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c9deabd6d547aee2c9a81dee6cc96c6d7e9a9b1953f74850c179f91fdc729cb7", size = 37269716, upload-time = "2025-05-08T16:07:25.712Z" }, + { url = "https://files.pythonhosted.org/packages/77/0a/eac00ff741f23bcabd352731ed9b8995a0a60ef57f5fd788d611d43d69a1/scipy-1.15.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:dde4fc32993071ac0c7dd2d82569e544f0bdaff66269cb475e0f369adad13f11", size = 36872342, upload-time = "2025-05-08T16:07:31.468Z" }, + { url = 
"https://files.pythonhosted.org/packages/fe/54/4379be86dd74b6ad81551689107360d9a3e18f24d20767a2d5b9253a3f0a/scipy-1.15.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f77f853d584e72e874d87357ad70f44b437331507d1c311457bed8ed2b956126", size = 39670869, upload-time = "2025-05-08T16:07:38.002Z" }, + { url = "https://files.pythonhosted.org/packages/87/2e/892ad2862ba54f084ffe8cc4a22667eaf9c2bcec6d2bff1d15713c6c0703/scipy-1.15.3-cp313-cp313-win_amd64.whl", hash = "sha256:b90ab29d0c37ec9bf55424c064312930ca5f4bde15ee8619ee44e69319aab163", size = 40988851, upload-time = "2025-05-08T16:08:33.671Z" }, + { url = "https://files.pythonhosted.org/packages/1b/e9/7a879c137f7e55b30d75d90ce3eb468197646bc7b443ac036ae3fe109055/scipy-1.15.3-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:3ac07623267feb3ae308487c260ac684b32ea35fd81e12845039952f558047b8", size = 38863011, upload-time = "2025-05-08T16:07:44.039Z" }, + { url = "https://files.pythonhosted.org/packages/51/d1/226a806bbd69f62ce5ef5f3ffadc35286e9fbc802f606a07eb83bf2359de/scipy-1.15.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:6487aa99c2a3d509a5227d9a5e889ff05830a06b2ce08ec30df6d79db5fcd5c5", size = 30266407, upload-time = "2025-05-08T16:07:49.891Z" }, + { url = "https://files.pythonhosted.org/packages/e5/9b/f32d1d6093ab9eeabbd839b0f7619c62e46cc4b7b6dbf05b6e615bbd4400/scipy-1.15.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:50f9e62461c95d933d5c5ef4a1f2ebf9a2b4e83b0db374cb3f1de104d935922e", size = 22540030, upload-time = "2025-05-08T16:07:54.121Z" }, + { url = "https://files.pythonhosted.org/packages/e7/29/c278f699b095c1a884f29fda126340fcc201461ee8bfea5c8bdb1c7c958b/scipy-1.15.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:14ed70039d182f411ffc74789a16df3835e05dc469b898233a245cdfd7f162cb", size = 25218709, upload-time = "2025-05-08T16:07:58.506Z" }, + { url = 
"https://files.pythonhosted.org/packages/24/18/9e5374b617aba742a990581373cd6b68a2945d65cc588482749ef2e64467/scipy-1.15.3-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0a769105537aa07a69468a0eefcd121be52006db61cdd8cac8a0e68980bbb723", size = 34809045, upload-time = "2025-05-08T16:08:03.929Z" }, + { url = "https://files.pythonhosted.org/packages/e1/fe/9c4361e7ba2927074360856db6135ef4904d505e9b3afbbcb073c4008328/scipy-1.15.3-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9db984639887e3dffb3928d118145ffe40eff2fa40cb241a306ec57c219ebbbb", size = 36703062, upload-time = "2025-05-08T16:08:09.558Z" }, + { url = "https://files.pythonhosted.org/packages/b7/8e/038ccfe29d272b30086b25a4960f757f97122cb2ec42e62b460d02fe98e9/scipy-1.15.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:40e54d5c7e7ebf1aa596c374c49fa3135f04648a0caabcb66c52884b943f02b4", size = 36393132, upload-time = "2025-05-08T16:08:15.34Z" }, + { url = "https://files.pythonhosted.org/packages/10/7e/5c12285452970be5bdbe8352c619250b97ebf7917d7a9a9e96b8a8140f17/scipy-1.15.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5e721fed53187e71d0ccf382b6bf977644c533e506c4d33c3fb24de89f5c3ed5", size = 38979503, upload-time = "2025-05-08T16:08:21.513Z" }, + { url = "https://files.pythonhosted.org/packages/81/06/0a5e5349474e1cbc5757975b21bd4fad0e72ebf138c5592f191646154e06/scipy-1.15.3-cp313-cp313t-win_amd64.whl", hash = "sha256:76ad1fb5f8752eabf0fa02e4cc0336b4e8f021e2d5f061ed37d6d264db35e3ca", size = 40308097, upload-time = "2025-05-08T16:08:27.627Z" }, +] + +[[package]] +name = "scipy" +version = "1.16.3" +source = { registry = "https://pypi.org/simple" } +resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + 
"python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version >= '3.12' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", + "python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'", +] +dependencies = [ + { name = "numpy", marker = "python_full_version >= '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/0a/ca/d8ace4f98322d01abcd52d381134344bf7b431eba7ed8b42bdea5a3c2ac9/scipy-1.16.3.tar.gz", 
hash = "sha256:01e87659402762f43bd2fee13370553a17ada367d42e7487800bf2916535aecb", size = 30597883, upload-time = "2025-10-28T17:38:54.068Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/9b/5f/6f37d7439de1455ce9c5a556b8d1db0979f03a796c030bafdf08d35b7bf9/scipy-1.16.3-cp311-cp311-macosx_10_14_x86_64.whl", hash = "sha256:40be6cf99e68b6c4321e9f8782e7d5ff8265af28ef2cd56e9c9b2638fa08ad97", size = 36630881, upload-time = "2025-10-28T17:31:47.104Z" }, + { url = "https://files.pythonhosted.org/packages/7c/89/d70e9f628749b7e4db2aa4cd89735502ff3f08f7b9b27d2e799485987cd9/scipy-1.16.3-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:8be1ca9170fcb6223cc7c27f4305d680ded114a1567c0bd2bfcbf947d1b17511", size = 28941012, upload-time = "2025-10-28T17:31:53.411Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a8/0e7a9a6872a923505dbdf6bb93451edcac120363131c19013044a1e7cb0c/scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:bea0a62734d20d67608660f69dcda23e7f90fb4ca20974ab80b6ed40df87a005", size = 20931935, upload-time = "2025-10-28T17:31:57.361Z" }, + { url = "https://files.pythonhosted.org/packages/bd/c7/020fb72bd79ad798e4dbe53938543ecb96b3a9ac3fe274b7189e23e27353/scipy-1.16.3-cp311-cp311-macosx_14_0_x86_64.whl", hash = "sha256:2a207a6ce9c24f1951241f4693ede2d393f59c07abc159b2cb2be980820e01fb", size = 23534466, upload-time = "2025-10-28T17:32:01.875Z" }, + { url = "https://files.pythonhosted.org/packages/be/a0/668c4609ce6dbf2f948e167836ccaf897f95fb63fa231c87da7558a374cd/scipy-1.16.3-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:532fb5ad6a87e9e9cd9c959b106b73145a03f04c7d57ea3e6f6bb60b86ab0876", size = 33593618, upload-time = "2025-10-28T17:32:06.902Z" }, + { url = "https://files.pythonhosted.org/packages/ca/6e/8942461cf2636cdae083e3eb72622a7fbbfa5cf559c7d13ab250a5dbdc01/scipy-1.16.3-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = 
"sha256:0151a0749efeaaab78711c78422d413c583b8cdd2011a3c1d6c794938ee9fdb2", size = 35899798, upload-time = "2025-10-28T17:32:12.665Z" }, + { url = "https://files.pythonhosted.org/packages/79/e8/d0f33590364cdbd67f28ce79368b373889faa4ee959588beddf6daef9abe/scipy-1.16.3-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:b7180967113560cca57418a7bc719e30366b47959dd845a93206fbed693c867e", size = 36226154, upload-time = "2025-10-28T17:32:17.961Z" }, + { url = "https://files.pythonhosted.org/packages/39/c1/1903de608c0c924a1749c590064e65810f8046e437aba6be365abc4f7557/scipy-1.16.3-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:deb3841c925eeddb6afc1e4e4a45e418d19ec7b87c5df177695224078e8ec733", size = 38878540, upload-time = "2025-10-28T17:32:23.907Z" }, + { url = "https://files.pythonhosted.org/packages/f1/d0/22ec7036ba0b0a35bccb7f25ab407382ed34af0b111475eb301c16f8a2e5/scipy-1.16.3-cp311-cp311-win_amd64.whl", hash = "sha256:53c3844d527213631e886621df5695d35e4f6a75f620dca412bcd292f6b87d78", size = 38722107, upload-time = "2025-10-28T17:32:29.921Z" }, + { url = "https://files.pythonhosted.org/packages/7b/60/8a00e5a524bb3bf8898db1650d350f50e6cffb9d7a491c561dc9826c7515/scipy-1.16.3-cp311-cp311-win_arm64.whl", hash = "sha256:9452781bd879b14b6f055b26643703551320aa8d79ae064a71df55c00286a184", size = 25506272, upload-time = "2025-10-28T17:32:34.577Z" }, + { url = "https://files.pythonhosted.org/packages/40/41/5bf55c3f386b1643812f3a5674edf74b26184378ef0f3e7c7a09a7e2ca7f/scipy-1.16.3-cp312-cp312-macosx_10_14_x86_64.whl", hash = "sha256:81fc5827606858cf71446a5e98715ba0e11f0dbc83d71c7409d05486592a45d6", size = 36659043, upload-time = "2025-10-28T17:32:40.285Z" }, + { url = "https://files.pythonhosted.org/packages/1e/0f/65582071948cfc45d43e9870bf7ca5f0e0684e165d7c9ef4e50d783073eb/scipy-1.16.3-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:c97176013d404c7346bf57874eaac5187d969293bf40497140b0a2b2b7482e07", size = 28898986, upload-time = "2025-10-28T17:32:45.325Z" }, + { url = 
"https://files.pythonhosted.org/packages/96/5e/36bf3f0ac298187d1ceadde9051177d6a4fe4d507e8f59067dc9dd39e650/scipy-1.16.3-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:2b71d93c8a9936046866acebc915e2af2e292b883ed6e2cbe5c34beb094b82d9", size = 20889814, upload-time = "2025-10-28T17:32:49.277Z" }, + { url = "https://files.pythonhosted.org/packages/80/35/178d9d0c35394d5d5211bbff7ac4f2986c5488b59506fef9e1de13ea28d3/scipy-1.16.3-cp312-cp312-macosx_14_0_x86_64.whl", hash = "sha256:3d4a07a8e785d80289dfe66b7c27d8634a773020742ec7187b85ccc4b0e7b686", size = 23565795, upload-time = "2025-10-28T17:32:53.337Z" }, + { url = "https://files.pythonhosted.org/packages/fa/46/d1146ff536d034d02f83c8afc3c4bab2eddb634624d6529a8512f3afc9da/scipy-1.16.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0553371015692a898e1aa858fed67a3576c34edefa6b7ebdb4e9dde49ce5c203", size = 33349476, upload-time = "2025-10-28T17:32:58.353Z" }, + { url = "https://files.pythonhosted.org/packages/79/2e/415119c9ab3e62249e18c2b082c07aff907a273741b3f8160414b0e9193c/scipy-1.16.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:72d1717fd3b5e6ec747327ce9bda32d5463f472c9dce9f54499e81fbd50245a1", size = 35676692, upload-time = "2025-10-28T17:33:03.88Z" }, + { url = "https://files.pythonhosted.org/packages/27/82/df26e44da78bf8d2aeaf7566082260cfa15955a5a6e96e6a29935b64132f/scipy-1.16.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1fb2472e72e24d1530debe6ae078db70fb1605350c88a3d14bc401d6306dbffe", size = 36019345, upload-time = "2025-10-28T17:33:09.773Z" }, + { url = "https://files.pythonhosted.org/packages/82/31/006cbb4b648ba379a95c87262c2855cd0d09453e500937f78b30f02fa1cd/scipy-1.16.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c5192722cffe15f9329a3948c4b1db789fbb1f05c97899187dcf009b283aea70", size = 38678975, upload-time = "2025-10-28T17:33:15.809Z" }, + { url = 
"https://files.pythonhosted.org/packages/c2/7f/acbd28c97e990b421af7d6d6cd416358c9c293fc958b8529e0bd5d2a2a19/scipy-1.16.3-cp312-cp312-win_amd64.whl", hash = "sha256:56edc65510d1331dae01ef9b658d428e33ed48b4f77b1d51caf479a0253f96dc", size = 38555926, upload-time = "2025-10-28T17:33:21.388Z" }, + { url = "https://files.pythonhosted.org/packages/ce/69/c5c7807fd007dad4f48e0a5f2153038dc96e8725d3345b9ee31b2b7bed46/scipy-1.16.3-cp312-cp312-win_arm64.whl", hash = "sha256:a8a26c78ef223d3e30920ef759e25625a0ecdd0d60e5a8818b7513c3e5384cf2", size = 25463014, upload-time = "2025-10-28T17:33:25.975Z" }, + { url = "https://files.pythonhosted.org/packages/72/f1/57e8327ab1508272029e27eeef34f2302ffc156b69e7e233e906c2a5c379/scipy-1.16.3-cp313-cp313-macosx_10_14_x86_64.whl", hash = "sha256:d2ec56337675e61b312179a1ad124f5f570c00f920cc75e1000025451b88241c", size = 36617856, upload-time = "2025-10-28T17:33:31.375Z" }, + { url = "https://files.pythonhosted.org/packages/44/13/7e63cfba8a7452eb756306aa2fd9b37a29a323b672b964b4fdeded9a3f21/scipy-1.16.3-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:16b8bc35a4cc24db80a0ec836a9286d0e31b2503cb2fd7ff7fb0e0374a97081d", size = 28874306, upload-time = "2025-10-28T17:33:36.516Z" }, + { url = "https://files.pythonhosted.org/packages/15/65/3a9400efd0228a176e6ec3454b1fa998fbbb5a8defa1672c3f65706987db/scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:5803c5fadd29de0cf27fa08ccbfe7a9e5d741bf63e4ab1085437266f12460ff9", size = 20865371, upload-time = "2025-10-28T17:33:42.094Z" }, + { url = "https://files.pythonhosted.org/packages/33/d7/eda09adf009a9fb81827194d4dd02d2e4bc752cef16737cc4ef065234031/scipy-1.16.3-cp313-cp313-macosx_14_0_x86_64.whl", hash = "sha256:b81c27fc41954319a943d43b20e07c40bdcd3ff7cf013f4fb86286faefe546c4", size = 23524877, upload-time = "2025-10-28T17:33:48.483Z" }, + { url = 
"https://files.pythonhosted.org/packages/7d/6b/3f911e1ebc364cb81320223a3422aab7d26c9c7973109a9cd0f27c64c6c0/scipy-1.16.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:0c3b4dd3d9b08dbce0f3440032c52e9e2ab9f96ade2d3943313dfe51a7056959", size = 33342103, upload-time = "2025-10-28T17:33:56.495Z" }, + { url = "https://files.pythonhosted.org/packages/21/f6/4bfb5695d8941e5c570a04d9fcd0d36bce7511b7d78e6e75c8f9791f82d0/scipy-1.16.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:7dc1360c06535ea6116a2220f760ae572db9f661aba2d88074fe30ec2aa1ff88", size = 35697297, upload-time = "2025-10-28T17:34:04.722Z" }, + { url = "https://files.pythonhosted.org/packages/04/e1/6496dadbc80d8d896ff72511ecfe2316b50313bfc3ebf07a3f580f08bd8c/scipy-1.16.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:663b8d66a8748051c3ee9c96465fb417509315b99c71550fda2591d7dd634234", size = 36021756, upload-time = "2025-10-28T17:34:13.482Z" }, + { url = "https://files.pythonhosted.org/packages/fe/bd/a8c7799e0136b987bda3e1b23d155bcb31aec68a4a472554df5f0937eef7/scipy-1.16.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:eab43fae33a0c39006a88096cd7b4f4ef545ea0447d250d5ac18202d40b6611d", size = 38696566, upload-time = "2025-10-28T17:34:22.384Z" }, + { url = "https://files.pythonhosted.org/packages/cd/01/1204382461fcbfeb05b6161b594f4007e78b6eba9b375382f79153172b4d/scipy-1.16.3-cp313-cp313-win_amd64.whl", hash = "sha256:062246acacbe9f8210de8e751b16fc37458213f124bef161a5a02c7a39284304", size = 38529877, upload-time = "2025-10-28T17:35:51.076Z" }, + { url = "https://files.pythonhosted.org/packages/7f/14/9d9fbcaa1260a94f4bb5b64ba9213ceb5d03cd88841fe9fd1ffd47a45b73/scipy-1.16.3-cp313-cp313-win_arm64.whl", hash = "sha256:50a3dbf286dbc7d84f176f9a1574c705f277cb6565069f88f60db9eafdbe3ee2", size = 25455366, upload-time = "2025-10-28T17:35:59.014Z" }, + { url = 
"https://files.pythonhosted.org/packages/e2/a3/9ec205bd49f42d45d77f1730dbad9ccf146244c1647605cf834b3a8c4f36/scipy-1.16.3-cp313-cp313t-macosx_10_14_x86_64.whl", hash = "sha256:fb4b29f4cf8cc5a8d628bc8d8e26d12d7278cd1f219f22698a378c3d67db5e4b", size = 37027931, upload-time = "2025-10-28T17:34:31.451Z" }, + { url = "https://files.pythonhosted.org/packages/25/06/ca9fd1f3a4589cbd825b1447e5db3a8ebb969c1eaf22c8579bd286f51b6d/scipy-1.16.3-cp313-cp313t-macosx_12_0_arm64.whl", hash = "sha256:8d09d72dc92742988b0e7750bddb8060b0c7079606c0d24a8cc8e9c9c11f9079", size = 29400081, upload-time = "2025-10-28T17:34:39.087Z" }, + { url = "https://files.pythonhosted.org/packages/6a/56/933e68210d92657d93fb0e381683bc0e53a965048d7358ff5fbf9e6a1b17/scipy-1.16.3-cp313-cp313t-macosx_14_0_arm64.whl", hash = "sha256:03192a35e661470197556de24e7cb1330d84b35b94ead65c46ad6f16f6b28f2a", size = 21391244, upload-time = "2025-10-28T17:34:45.234Z" }, + { url = "https://files.pythonhosted.org/packages/a8/7e/779845db03dc1418e215726329674b40576879b91814568757ff0014ad65/scipy-1.16.3-cp313-cp313t-macosx_14_0_x86_64.whl", hash = "sha256:57d01cb6f85e34f0946b33caa66e892aae072b64b034183f3d87c4025802a119", size = 23929753, upload-time = "2025-10-28T17:34:51.793Z" }, + { url = "https://files.pythonhosted.org/packages/4c/4b/f756cf8161d5365dcdef9e5f460ab226c068211030a175d2fc7f3f41ca64/scipy-1.16.3-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:96491a6a54e995f00a28a3c3badfff58fd093bf26cd5fb34a2188c8c756a3a2c", size = 33496912, upload-time = "2025-10-28T17:34:59.8Z" }, + { url = "https://files.pythonhosted.org/packages/09/b5/222b1e49a58668f23839ca1542a6322bb095ab8d6590d4f71723869a6c2c/scipy-1.16.3-cp313-cp313t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:cd13e354df9938598af2be05822c323e97132d5e6306b83a3b4ee6724c6e522e", size = 35802371, upload-time = "2025-10-28T17:35:08.173Z" }, + { url = 
"https://files.pythonhosted.org/packages/c1/8d/5964ef68bb31829bde27611f8c9deeac13764589fe74a75390242b64ca44/scipy-1.16.3-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:63d3cdacb8a824a295191a723ee5e4ea7768ca5ca5f2838532d9f2e2b3ce2135", size = 36190477, upload-time = "2025-10-28T17:35:16.7Z" }, + { url = "https://files.pythonhosted.org/packages/ab/f2/b31d75cb9b5fa4dd39a0a931ee9b33e7f6f36f23be5ef560bf72e0f92f32/scipy-1.16.3-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:e7efa2681ea410b10dde31a52b18b0154d66f2485328830e45fdf183af5aefc6", size = 38796678, upload-time = "2025-10-28T17:35:26.354Z" }, + { url = "https://files.pythonhosted.org/packages/b4/1e/b3723d8ff64ab548c38d87055483714fefe6ee20e0189b62352b5e015bb1/scipy-1.16.3-cp313-cp313t-win_amd64.whl", hash = "sha256:2d1ae2cf0c350e7705168ff2429962a89ad90c2d49d1dd300686d8b2a5af22fc", size = 38640178, upload-time = "2025-10-28T17:35:35.304Z" }, + { url = "https://files.pythonhosted.org/packages/8e/f3/d854ff38789aca9b0cc23008d607ced9de4f7ab14fa1ca4329f86b3758ca/scipy-1.16.3-cp313-cp313t-win_arm64.whl", hash = "sha256:0c623a54f7b79dd88ef56da19bc2873afec9673a48f3b85b18e4d402bdd29a5a", size = 25803246, upload-time = "2025-10-28T17:35:42.155Z" }, + { url = "https://files.pythonhosted.org/packages/99/f6/99b10fd70f2d864c1e29a28bbcaa0c6340f9d8518396542d9ea3b4aaae15/scipy-1.16.3-cp314-cp314-macosx_10_14_x86_64.whl", hash = "sha256:875555ce62743e1d54f06cdf22c1e0bc47b91130ac40fe5d783b6dfa114beeb6", size = 36606469, upload-time = "2025-10-28T17:36:08.741Z" }, + { url = "https://files.pythonhosted.org/packages/4d/74/043b54f2319f48ea940dd025779fa28ee360e6b95acb7cd188fad4391c6b/scipy-1.16.3-cp314-cp314-macosx_12_0_arm64.whl", hash = "sha256:bb61878c18a470021fb515a843dc7a76961a8daceaaaa8bad1332f1bf4b54657", size = 28872043, upload-time = "2025-10-28T17:36:16.599Z" }, + { url = 
"https://files.pythonhosted.org/packages/4d/e1/24b7e50cc1c4ee6ffbcb1f27fe9f4c8b40e7911675f6d2d20955f41c6348/scipy-1.16.3-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:f2622206f5559784fa5c4b53a950c3c7c1cf3e84ca1b9c4b6c03f062f289ca26", size = 20862952, upload-time = "2025-10-28T17:36:22.966Z" }, + { url = "https://files.pythonhosted.org/packages/dd/3a/3e8c01a4d742b730df368e063787c6808597ccb38636ed821d10b39ca51b/scipy-1.16.3-cp314-cp314-macosx_14_0_x86_64.whl", hash = "sha256:7f68154688c515cdb541a31ef8eb66d8cd1050605be9dcd74199cbd22ac739bc", size = 23508512, upload-time = "2025-10-28T17:36:29.731Z" }, + { url = "https://files.pythonhosted.org/packages/1f/60/c45a12b98ad591536bfe5330cb3cfe1850d7570259303563b1721564d458/scipy-1.16.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:8b3c820ddb80029fe9f43d61b81d8b488d3ef8ca010d15122b152db77dc94c22", size = 33413639, upload-time = "2025-10-28T17:36:37.982Z" }, + { url = "https://files.pythonhosted.org/packages/71/bc/35957d88645476307e4839712642896689df442f3e53b0fa016ecf8a3357/scipy-1.16.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d3837938ae715fc0fe3c39c0202de3a8853aff22ca66781ddc2ade7554b7e2cc", size = 35704729, upload-time = "2025-10-28T17:36:46.547Z" }, + { url = "https://files.pythonhosted.org/packages/3b/15/89105e659041b1ca11c386e9995aefacd513a78493656e57789f9d9eab61/scipy-1.16.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:aadd23f98f9cb069b3bd64ddc900c4d277778242e961751f77a8cb5c4b946fb0", size = 36086251, upload-time = "2025-10-28T17:36:55.161Z" }, + { url = "https://files.pythonhosted.org/packages/1a/87/c0ea673ac9c6cc50b3da2196d860273bc7389aa69b64efa8493bdd25b093/scipy-1.16.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:b7c5f1bda1354d6a19bc6af73a649f8285ca63ac6b52e64e658a5a11d4d69800", size = 38716681, upload-time = "2025-10-28T17:37:04.1Z" }, + { url = 
"https://files.pythonhosted.org/packages/91/06/837893227b043fb9b0d13e4bd7586982d8136cb249ffb3492930dab905b8/scipy-1.16.3-cp314-cp314-win_amd64.whl", hash = "sha256:e5d42a9472e7579e473879a1990327830493a7047506d58d73fc429b84c1d49d", size = 39358423, upload-time = "2025-10-28T17:38:20.005Z" }, + { url = "https://files.pythonhosted.org/packages/95/03/28bce0355e4d34a7c034727505a02d19548549e190bedd13a721e35380b7/scipy-1.16.3-cp314-cp314-win_arm64.whl", hash = "sha256:6020470b9d00245926f2d5bb93b119ca0340f0d564eb6fbaad843eaebf9d690f", size = 26135027, upload-time = "2025-10-28T17:38:24.966Z" }, + { url = "https://files.pythonhosted.org/packages/b2/6f/69f1e2b682efe9de8fe9f91040f0cd32f13cfccba690512ba4c582b0bc29/scipy-1.16.3-cp314-cp314t-macosx_10_14_x86_64.whl", hash = "sha256:e1d27cbcb4602680a49d787d90664fa4974063ac9d4134813332a8c53dbe667c", size = 37028379, upload-time = "2025-10-28T17:37:14.061Z" }, + { url = "https://files.pythonhosted.org/packages/7c/2d/e826f31624a5ebbab1cd93d30fd74349914753076ed0593e1d56a98c4fb4/scipy-1.16.3-cp314-cp314t-macosx_12_0_arm64.whl", hash = "sha256:9b9c9c07b6d56a35777a1b4cc8966118fb16cfd8daf6743867d17d36cfad2d40", size = 29400052, upload-time = "2025-10-28T17:37:21.709Z" }, + { url = "https://files.pythonhosted.org/packages/69/27/d24feb80155f41fd1f156bf144e7e049b4e2b9dd06261a242905e3bc7a03/scipy-1.16.3-cp314-cp314t-macosx_14_0_arm64.whl", hash = "sha256:3a4c460301fb2cffb7f88528f30b3127742cff583603aa7dc964a52c463b385d", size = 21391183, upload-time = "2025-10-28T17:37:29.559Z" }, + { url = "https://files.pythonhosted.org/packages/f8/d3/1b229e433074c5738a24277eca520a2319aac7465eea7310ea6ae0e98ae2/scipy-1.16.3-cp314-cp314t-macosx_14_0_x86_64.whl", hash = "sha256:f667a4542cc8917af1db06366d3f78a5c8e83badd56409f94d1eac8d8d9133fa", size = 23930174, upload-time = "2025-10-28T17:37:36.306Z" }, + { url = 
"https://files.pythonhosted.org/packages/16/9d/d9e148b0ec680c0f042581a2be79a28a7ab66c0c4946697f9e7553ead337/scipy-1.16.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:f379b54b77a597aa7ee5e697df0d66903e41b9c85a6dd7946159e356319158e8", size = 33497852, upload-time = "2025-10-28T17:37:42.228Z" }, + { url = "https://files.pythonhosted.org/packages/2f/22/4e5f7561e4f98b7bea63cf3fd7934bff1e3182e9f1626b089a679914d5c8/scipy-1.16.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4aff59800a3b7f786b70bfd6ab551001cb553244988d7d6b8299cb1ea653b353", size = 35798595, upload-time = "2025-10-28T17:37:48.102Z" }, + { url = "https://files.pythonhosted.org/packages/83/42/6644d714c179429fc7196857866f219fef25238319b650bb32dde7bf7a48/scipy-1.16.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:da7763f55885045036fabcebd80144b757d3db06ab0861415d1c3b7c69042146", size = 36186269, upload-time = "2025-10-28T17:37:53.72Z" }, + { url = "https://files.pythonhosted.org/packages/ac/70/64b4d7ca92f9cf2e6fc6aaa2eecf80bb9b6b985043a9583f32f8177ea122/scipy-1.16.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:ffa6eea95283b2b8079b821dc11f50a17d0571c92b43e2b5b12764dc5f9b285d", size = 38802779, upload-time = "2025-10-28T17:37:59.393Z" }, + { url = "https://files.pythonhosted.org/packages/61/82/8d0e39f62764cce5ffd5284131e109f07cf8955aef9ab8ed4e3aa5e30539/scipy-1.16.3-cp314-cp314t-win_amd64.whl", hash = "sha256:d9f48cafc7ce94cf9b15c6bffdc443a81a27bf7075cf2dcd5c8b40f85d10c4e7", size = 39471128, upload-time = "2025-10-28T17:38:05.259Z" }, + { url = "https://files.pythonhosted.org/packages/64/47/a494741db7280eae6dc033510c319e34d42dd41b7ac0c7ead39354d1a2b5/scipy-1.16.3-cp314-cp314t-win_arm64.whl", hash = "sha256:21d9d6b197227a12dcbf9633320a4e34c6b0e51c57268df255a0942983bac562", size = 26464127, upload-time = "2025-10-28T17:38:11.34Z" }, +] + [[package]] name = "sentry-sdk" version = "2.35.2" @@ -2705,6 +2862,15 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/a2/09/77d55d46fd61b4a135c444fc97158ef34a095e5681d0a6c10b75bf356191/sympy-1.14.0-py3-none-any.whl", hash = "sha256:e091cc3e99d2141a0ba2847328f5479b05d94a6635cb96148ccb3f34671bd8f5", size = 6299353, upload-time = "2025-04-27T18:04:59.103Z" }, ] +[[package]] +name = "tabulate" +version = "0.9.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ec/fe/802052aecb21e3797b8f7902564ab6ea0d60ff8ca23952079064155d1ae1/tabulate-0.9.0.tar.gz", hash = "sha256:0095b12bf5966de529c0feb1fa08671671b3368eec77d7ef7ab114be2c068b3c", size = 81090, upload-time = "2022-10-06T17:21:48.54Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/40/44/4a5f08c96eb108af5cb50b41f76142f0afa346dfa99d5296fe7202a11854/tabulate-0.9.0-py3-none-any.whl", hash = "sha256:024ca478df22e9340661486f85298cff5f6dcdba14f3813e8830015b9ed1948f", size = 35252, upload-time = "2022-10-06T17:21:44.262Z" }, +] + [[package]] name = "tiktoken" version = "0.11.0" From 4cc605b94013f7eae6186cc35fe9e674d8cdd8e1 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 7 Jan 2026 22:14:21 +0000 Subject: [PATCH 006/119] quick pointer to miniseries post in readme for now --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 5f75429e..acb91110 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,10 @@ This repo is a full-stack implementation of an LLM like ChatGPT in a single, cle To get a sense of the endpoint of this repo, you can currently find [nanochat d34](https://github.com/karpathy/nanochat/discussions/314) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d34" means that this model has 34 layers in the Transformer neural network. 
This model has 2.2 billion parameters, it was trained on 88 billion tokens by simply running the training script [run1000.sh](run1000.sh) with `--target_param_data_ratio=40` (2x longer than Chinchilla-optimal), and the total cost of training was ~$2,500 (about 100 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to... +## Updates + +- (Jan 7 2026) See new post: [nanochat Miniseries v1](https://github.com/karpathy/nanochat/discussions/420) and the associated script [miniseries.sh](miniseries.sh). + ## Quick start The fastest way to feel the magic is to run the speedrun script [speedrun.sh](speedrun.sh), which trains and inferences the $100 tier of nanochat. On an 8XH100 node at $24/hr, this gives a total run time of about 4 hours. Boot up a new 8XH100 GPU box from your favorite provider (e.g. 
I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: From 3af4dcf6ee26abd6f4c62d4ecd5d8b58c3c06c76 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 7 Jan 2026 22:25:13 +0000 Subject: [PATCH 007/119] also add scaling_laws.sh script if it's a useful reference --- miniseries.sh | 7 ++- scaling_laws.sh | 115 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 120 insertions(+), 2 deletions(-) create mode 100644 scaling_laws.sh diff --git a/miniseries.sh b/miniseries.sh index 9287def6..077418ad 100644 --- a/miniseries.sh +++ b/miniseries.sh @@ -27,8 +27,11 @@ RESULTS_DIR="$NANOCHAT_BASE_DIR/jan7_miniseries_results" mkdir -p "$RESULTS_DIR" RESULTS_FILE="$RESULTS_DIR/results.csv" -# Write CSV header -echo "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" +# Write CSV header only if file doesn't exist +if [ ! -f "$RESULTS_FILE" ]; then + echo "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" +fi + log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" } diff --git a/scaling_laws.sh b/scaling_laws.sh new file mode 100644 index 00000000..102ba11c --- /dev/null +++ b/scaling_laws.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +FLOPS_BUDGETS=( + 1e18 + 3e18 + 6e18 +) +DEPTHS=(8 10 12 14 16 18 20) +NPROC_PER_NODE="${NPROC_PER_NODE:-8}" +WANDB_RUN="${WANDB_RUN:-scaling}" +EVAL_TOKENS=$((100 * 524288)) # ~100M tokens for final eval (default is ~10M) + +export OMP_NUM_THREADS=1 +export NANOCHAT_BASE_DIR="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}" +source .venv/bin/activate + +RESULTS_DIR="$NANOCHAT_BASE_DIR/scaling_laws_results" +mkdir -p "$RESULTS_DIR" +RESULTS_FILE="$RESULTS_DIR/results.csv" + +# Write CSV header only if file doesn't exist +if [ ! 
-f "$RESULTS_FILE" ]; then + echo "flops_budget,depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" +fi + +log() { + echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" +} + +# Check if a run already exists in results +run_exists() { + local flops=$1 + local depth=$2 + grep -q "^${flops},${depth}," "$RESULTS_FILE" 2>/dev/null +} + +# ============================================================================= +# Main Loop +# ============================================================================= + +for flops in "${FLOPS_BUDGETS[@]}"; do + log "==============================================" + log "Compute budget: $flops FLOPs" + log "==============================================" + + for d in "${DEPTHS[@]}"; do + + # Skip if already completed + if run_exists "$flops" "$d"; then + log "Skipping d=$d at $flops FLOPs (already in results)" + continue + fi + + log "Training d=$d at $flops FLOPs..." + + # Unique tag for this run + TAG="scaling_${flops}_d${d}" + + # Record start time + START_TIME=$(date +%s) + + # Train the model with fixed flops budget + # The script will auto-calculate num_iterations to hit target_flops + # CORE eval happens once at the end (999999 ensures only final step) + torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \ + --depth=$d \ + --target_flops=$flops \ + --target_param_data_ratio=-1 \ + --run="${WANDB_RUN}_${TAG}" \ + --model_tag="${TAG}" \ + --eval_tokens=$EVAL_TOKENS \ + --core_metric_every=999999 \ + --core_metric_max_per_task=-1 \ + --sample_every=-1 \ + --save_every=-1 \ + 2>&1 | tee "$RESULTS_DIR/${TAG}_train.log" + + END_TIME=$(date +%s) + TRAIN_TIME=$((END_TIME - START_TIME)) + + # Extract training stats from the log + LOG_FILE="$RESULTS_DIR/${TAG}_train.log" + NUM_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | head -1 | tr -d ',') + NUM_SCALING_PARAMS=$(grep "Number of 
parameters:" "$LOG_FILE" | tail -1 | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ',') + NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',') + # Calculate tokens trained (iterations * batch_size, default 524288) + TOKENS_TRAINED=$((NUM_ITERS * 524288)) + # Param:data ratio (using scaling params per Kaplan et al.) + PARAM_DATA_RATIO=$(python -c "print(f'{$TOKENS_TRAINED / $NUM_SCALING_PARAMS:.2f}')") + # Model dim + MODEL_DIM=$((d * 64)) + # Val BPB from final eval + VAL_BPB=$(grep "Validation bpb:" "$LOG_FILE" | tail -1 | grep -oP '[\d.]+$') + + # Extract CORE score from training log (evaluated on final step) + CORE_SCORE=$(grep "CORE metric:" "$LOG_FILE" | tail -1 | awk '{print $NF}') + if [ -z "$CORE_SCORE" ]; then + log "WARNING: Could not extract CORE score for d=$d" + CORE_SCORE="0.0" + fi + + log " Params: $NUM_PARAMS, Iters: $NUM_ITERS, Ratio: $PARAM_DATA_RATIO, Val BPB: $VAL_BPB, CORE: $CORE_SCORE" + + # Append to CSV + echo "$flops,$d,$MODEL_DIM,$NUM_PARAMS,$NUM_SCALING_PARAMS,$NUM_ITERS,$TOKENS_TRAINED,$PARAM_DATA_RATIO,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE" + done +done + +log "==============================================" +log "Scaling Laws Sweep Complete" +log "==============================================" +log "Results saved to: $RESULTS_FILE" +echo "" +echo "Results:" +column -t -s',' "$RESULTS_FILE" From e8c30c3b199b7a9f04016110080537d3c589712d Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 7 Jan 2026 22:28:53 +0000 Subject: [PATCH 008/119] add notebook used for scaling laws analysis --- dev/scaling_analysis.ipynb | 227 +++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) create mode 100644 dev/scaling_analysis.ipynb diff --git a/dev/scaling_analysis.ipynb b/dev/scaling_analysis.ipynb new file mode 100644 index 00000000..a196bd18 --- /dev/null +++ b/dev/scaling_analysis.ipynb @@ -0,0 +1,227 @@ +{ + "cells": [ + { + "cell_type": "markdown", + 
"metadata": {}, + "source": [ + "# Scaling Laws Analysis\n", + "\n", + "Analyze results from `scaling_laws.sh` to find the optimal param:data ratio for nanochat." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "\n", + "# Load results\n", + "base_dir = os.environ.get('NANOCHAT_BASE_DIR', os.path.expanduser('~/.cache/nanochat'))\n", + "results_path = os.path.join(base_dir, 'scaling_laws_results', 'results.csv')\n", + "\n", + "df = pd.read_csv(results_path)\n", + "flops_budgets = sorted(df['flops_budget'].unique())\n", + "print(f\"Loaded {len(df)} runs across {len(flops_budgets)} FLOPs budgets\")\n", + "print(f\"Columns: {list(df.columns)}\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## IsoFLOP Curves (à la Chinchilla)\n", + "\n", + "For each compute budget, plot loss vs model size. Looking for the U-shape valley that reveals the optimal model size for each FLOPs budget." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 3, figsize=(16, 5))\n", + "\n", + "# Plot 1: IsoFLOP curves - Val BPB vs Parameters (the Chinchilla plot!)\n", + "ax = axes[0]\n", + "colors = plt.cm.viridis(np.linspace(0, 0.9, len(flops_budgets)))\n", + "optimal_by_bpb = []\n", + "\n", + "for flops, color in zip(flops_budgets, colors):\n", + " subset = df[df['flops_budget'] == flops].sort_values('num_scaling_params')\n", + " ax.plot(subset['num_scaling_params'], subset['val_bpb'], 'o', color=color, label=f'{flops:.0e}', markersize=8)\n", + "\n", + " # Fit quadratic in log-space: val_bpb = a*(log N)^2 + b*(log N) + c\n", + " log_params = np.log10(subset['num_scaling_params'])\n", + " coeffs = np.polyfit(log_params, subset['val_bpb'], 2)\n", + " a, b, c = coeffs\n", + "\n", + " # Plot fitted curve (dashed)\n", + " log_fit_x = np.linspace(log_params.min() - 0.1, log_params.max() + 0.1, 100)\n", + " fit_y = a * log_fit_x**2 + b * log_fit_x + c\n", + " ax.plot(10**log_fit_x, fit_y, '--', color=color, linewidth=2)\n", + "\n", + " # Find minimum of quadratic: d/dx(ax^2 + bx + c) = 0 => x = -b/(2a)\n", + " if a > 0: # parabola opens upward (has a minimum)\n", + " log_opt = -b / (2 * a)\n", + " opt_params = 10**log_opt\n", + " opt_bpb = a * log_opt**2 + b * log_opt + c\n", + " # Mark the fitted optimal\n", + " ax.scatter([opt_params], [opt_bpb], s=150, color=color,\n", + " zorder=5, edgecolors='black', linewidths=2, marker='*')\n", + " # Interpolate tokens and ratio from actual data (don't use C≈6ND approximation)\n", + " opt_tokens = np.interp(np.log10(opt_params), log_params, subset['tokens_trained'])\n", + " opt_ratio = np.interp(np.log10(opt_params), log_params, subset['param_data_ratio'])\n", + " optimal_by_bpb.append({'flops': flops, 'params': opt_params, 'tokens': opt_tokens, 'ratio': opt_ratio, 'bpb': opt_bpb})\n", + " else:\n", + " # Fallback to raw minimum if quadratic 
doesn't have minimum\n", + " best_idx = subset['val_bpb'].idxmin()\n", + " best = subset.loc[best_idx]\n", + " ax.scatter([best['num_scaling_params']], [best['val_bpb']], s=150, color=color,\n", + " zorder=5, edgecolors='black', linewidths=2)\n", + " optimal_by_bpb.append({'flops': flops, 'params': best['num_scaling_params'],\n", + " 'tokens': best['tokens_trained'], 'ratio': best['param_data_ratio'], 'bpb': best['val_bpb']})\n", + "\n", + "ax.set_xscale('log')\n", + "ax.set_xlabel('Parameters')\n", + "ax.set_ylabel('Validation Loss (bpb)')\n", + "ax.set_title('IsoFLOP Curves')\n", + "ax.legend(title='FLOPs', loc='upper right')\n", + "ax.grid(True, alpha=0.3)\n", + "\n", + "opt_df = pd.DataFrame(optimal_by_bpb)\n", + "\n", + "# Plot 2: Optimal model size vs compute (power law)\n", + "ax = axes[1]\n", + "ax.loglog(opt_df['flops'], opt_df['params'], 'o', markersize=10, color='#2ecc71')\n", + "ax.set_xlabel('FLOPs')\n", + "ax.set_ylabel('Optimal Parameters')\n", + "ax.set_title('Optimal Model Size')\n", + "ax.grid(True, alpha=0.3)\n", + "\n", + "# Fit and show power law\n", + "if len(opt_df) >= 2:\n", + " log_f = np.log10(opt_df['flops'])\n", + " log_p = np.log10(opt_df['params'])\n", + " slope, intercept = np.polyfit(log_f, log_p, 1)\n", + " fit_f = np.logspace(log_f.min() - 0.5, log_f.max() + 0.5, 100)\n", + " fit_p = 10**(intercept + slope * np.log10(fit_f))\n", + " ax.plot(fit_f, fit_p, 'r--', alpha=0.7, label=f'N ∝ C^{slope:.2f}')\n", + " ax.legend()\n", + "\n", + "# Plot 3: Optimal tokens vs compute (power law)\n", + "ax = axes[2]\n", + "ax.loglog(opt_df['flops'], opt_df['tokens'], 'o', markersize=10, color='#e74c3c')\n", + "ax.set_xlabel('FLOPs')\n", + "ax.set_ylabel('Optimal Tokens')\n", + "ax.set_title('Optimal Training Tokens')\n", + "ax.grid(True, alpha=0.3)\n", + "\n", + "# Fit and show power law\n", + "if len(opt_df) >= 2:\n", + " log_f = np.log10(opt_df['flops'])\n", + " log_t = np.log10(opt_df['tokens'])\n", + " slope, intercept = np.polyfit(log_f, 
log_t, 1)\n", + " fit_f = np.logspace(log_f.min() - 0.5, log_f.max() + 0.5, 100)\n", + " fit_t = 10**(intercept + slope * np.log10(fit_f))\n", + " ax.plot(fit_f, fit_t, 'r--', alpha=0.7, label=f'D ∝ C^{slope:.2f}')\n", + " ax.legend()\n", + "\n", + "plt.tight_layout()\n", + "plt.show()\n", + "\n", + "# Print the optimal points (from quadratic fits)\n", + "print(\"\\nOptimal configurations (from quadratic fits):\")\n", + "print(f\"{'FLOPs':<12} {'Params':<15} {'Tokens':<15} {'Ratio':<10} {'Val BPB':<10}\")\n", + "print(\"-\" * 65)\n", + "for _, row in opt_df.iterrows():\n", + " print(f\"{row['flops']:<12.0e} {int(row['params']):<15,} {int(row['tokens']):<15,} {row['ratio']:<10.1f} {row['bpb']:<10.4f}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Val BPB vs Depth and Ratio" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig, axes = plt.subplots(1, 2, figsize=(14, 5))\n", + "\n", + "# Plot 1: Val BPB vs Depth\n", + "ax = axes[0]\n", + "for flops in flops_budgets:\n", + " subset = df[df['flops_budget'] == flops].sort_values('depth')\n", + " ax.plot(subset['depth'], subset['val_bpb'], 'o-', label=f'{flops:.0e}')\n", + " # Mark the best (lowest)\n", + " best_idx = subset['val_bpb'].idxmin()\n", + " best = subset.loc[best_idx]\n", + " ax.scatter([best['depth']], [best['val_bpb']], s=100, zorder=5, edgecolors='black', linewidths=2)\n", + "\n", + "ax.set_xlabel('Depth')\n", + "ax.set_ylabel('Val BPB (lower is better)')\n", + "ax.set_title('Validation BPB vs Model Depth')\n", + "ax.legend(title='FLOPs')\n", + "ax.grid(True, alpha=0.3)\n", + "\n", + "# Plot 2: Val BPB vs Param:Data Ratio\n", + "ax = axes[1]\n", + "for flops in flops_budgets:\n", + " subset = df[df['flops_budget'] == flops].sort_values('param_data_ratio')\n", + " ax.plot(subset['param_data_ratio'], subset['val_bpb'], 'o-', label=f'{flops:.0e}')\n", + " best_idx = subset['val_bpb'].idxmin()\n", + " best = 
subset.loc[best_idx]\n", + " ax.scatter([best['param_data_ratio']], [best['val_bpb']], s=100, zorder=5, edgecolors='black', linewidths=2)\n", + "\n", + "ax.axvline(x=20, color='red', linestyle='--', alpha=0.5, label='Chinchilla (20)')\n", + "ax.set_xlabel('Param:Data Ratio (tokens/param)')\n", + "ax.set_ylabel('Val BPB (lower is better)')\n", + "ax.set_title('Val BPB vs Param:Data Ratio')\n", + "ax.legend(title='FLOPs')\n", + "ax.grid(True, alpha=0.3)\n", + "\n", + "plt.tight_layout()\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 061f83c152b359a145b2d76286a7d019d04fa882 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 8 Jan 2026 02:16:50 +0000 Subject: [PATCH 009/119] delete grad_clip. appears to not be necessary at all. not only was it buggy because the clipping happened per gpu before grad synchronization, but it costs ~2% MFU, and it also doesn't even help. I tried deleting it a while ago and back then it did help. So I'm guessing that some hyperparameter tuning obviated the reason for it since then --- dev/LOG.md | 23 +++++++++++++++++++++++ scripts/base_train.py | 11 +---------- 2 files changed, 24 insertions(+), 10 deletions(-) create mode 100644 dev/LOG.md diff --git a/dev/LOG.md b/dev/LOG.md new file mode 100644 index 00000000..449cd7fd --- /dev/null +++ b/dev/LOG.md @@ -0,0 +1,23 @@ +# Experiment Log + +A running summary documenting some experiments and findings. Started ~Jan 7 2026. 
+ +--- + +## 2026-01-08: exp_grad_clip - Gradient Clipping + +**Hypothesis:** Gradient clipping may be unnecessary overhead. Tested L2 norm clipping at various thresholds (0.25, 0.5, 1.0, 2.0) and elementwise clipping. + +**Results:** +- No benefit at any scale tested (d12, d20) +- All variants within noise (~0.9827 val_bpb) +- Grad norm never exceeds 1.0 naturally, so clipping is always inactive +- Clipping adds ~2% time overhead from the all-reduce + +**Bug Found:** Original implementation clipped local gradients before sync. Since this codebase doesn't use DDP (gradient sync is in the optimizers), each rank was clipping based on its own local norm. Fixed on the branch with proper distributed all-reduce. + +**Observation:** modded-nanogpt does not appear to clip either right now. + +**Recommendation:** Disable by default (`--grad_clip=0.0`). The code naturally produces well-behaved gradients. + +--- diff --git a/scripts/base_train.py b/scripts/base_train.py index de0321ae..e3df0f06 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -55,7 +55,6 @@ parser.add_argument("--weight_decay", type=float, default=0.0, help="weight deca parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") parser.add_argument("--adam_beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding") parser.add_argument("--adam_beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding") -parser.add_argument("--grad_clip", type=float, default=1.0, help="gradient clipping value (0.0 = disabled)") parser.add_argument("--warmup_ratio", type=float, default=0.0, help="ratio of iterations for LR warmup") parser.add_argument("--warmdown_ratio", type=float, default=0.4, help="ratio of iterations for LR warmdown") parser.add_argument("--final_lr_frac", type=float, default=0.0, help="final LR as fraction of initial LR") @@ -346,11 +345,6 @@ while True: loss = loss / grad_accum_steps # each
.backward() is a grad sum => normalize loss here loss.backward() x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward - # gradient clipping - grad_clip_enabled = args.grad_clip > 0.0 - if grad_clip_enabled: - grad_norm_tensor = torch.nn.utils.clip_grad_norm_(orig_model.parameters(), args.grad_clip) - grad_norm = grad_norm_tensor.item() # GPU tensor -> CPU float (note: cpu-gpu sync point) # step the optimizers lrm = get_lr_multiplier(step) for opt in optimizers: @@ -378,7 +372,6 @@ while True: mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in % if step > 10: total_training_time += dt # only count the time after the first 10 steps - print_grad_norm = f" grad norm: {grad_norm:.4f} |" if grad_clip_enabled else "" # Calculate ETA based on average time per step (excluding first 10 steps) steps_done = step - 10 if steps_done > 0: @@ -388,7 +381,7 @@ while True: eta_str = f" | eta: {eta_seconds/60:.1f}m" else: eta_str = "" - print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} |{print_grad_norm} lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m{eta_str}") + print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m{eta_str}") if step % 100 == 0: log_data = { "step": step, @@ -400,8 +393,6 @@ while True: "train/tok_per_sec": tok_per_sec, "train/mfu": mfu, } - if grad_clip_enabled: - log_data["train/grad_norm"] = grad_norm wandb_run.log(log_data) # state update From a1ccb3dc0b7095620751498b8652a6d6647d8c01 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 8 Jan 2026 15:18:37 +0100 Subject: [PATCH 010/119] remove rust compilation as rustbpe is now installed from separate package (#416) --- 
dev/runcpu.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/dev/runcpu.sh b/dev/runcpu.sh index ffacefa4..c4a719e4 100755 --- a/dev/runcpu.sh +++ b/dev/runcpu.sh @@ -19,9 +19,6 @@ source .venv/bin/activate if [ -z "$WANDB_RUN" ]; then WANDB_RUN=dummy fi -curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y -source "$HOME/.cargo/env" -uv run maturin develop --release --manifest-path rustbpe/Cargo.toml # wipe the report python -m nanochat.report reset From 4ddc8037975f0c11e11038a27eaf81f070971dc8 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 8 Jan 2026 18:18:22 +0000 Subject: [PATCH 011/119] fix adamw slight bug. this chunk was copy pasted originally from modded-nanogpt, which still seems to have the bug --- nanochat/adamw.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nanochat/adamw.py b/nanochat/adamw.py index 8816057e..0b97ae25 100644 --- a/nanochat/adamw.py +++ b/nanochat/adamw.py @@ -68,8 +68,8 @@ class DistAdamW(torch.optim.Optimizer): bias1 = 1 - beta1 ** t bias2 = 1 - beta2 ** t # compute step - denom = exp_avg_sq.sqrt().add_(eps) - step_size = lr * (torch.sqrt(bias2) / bias1) + denom = (exp_avg_sq / bias2).sqrt().add_(eps) + step_size = lr / bias1 update = exp_avg.div(denom).mul_(step_size) p_slice.add_(other=update, alpha=-1.0) idx += 1 From f5a0ea4d3f98be55675d2518a02a7bc3a18236b2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 8 Jan 2026 18:18:39 +0000 Subject: [PATCH 012/119] take out these gitignore dirs --- .gitignore | 6 ------ 1 file changed, 6 deletions(-) diff --git a/.gitignore b/.gitignore index 7950c9ff..d82809a1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,9 +12,3 @@ eval_bundle/ .claude CLAUDE.md wandb/ - -# Local experimentation -experiments/ -ignore/ -knowledge/ -ideas/ From 2c4473dd1b608a403700b098f867b202c2a03522 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 11 Jan 2026 16:56:59 +0000 Subject: [PATCH 013/119] Big Muon optimizer changes inspired by 
latest of modded-nanogpt. Added Polar Express, Adafactor-style variance reduction, cautious weight decay, schedule weight decay linearly to ramp down to zero. Tuned optimum weight decay for multiple model sizes d8, d12, d16, d20 and found a scaling law with optimum wd \propto 1/channels^2, including it as default into code. --weight_decay of base_train is now default on and configured optimally according to all of these experiments. Solid bump to val_bpb observed as a result of these changes. --- dev/LOG.md | 63 +++++++++++++++++++- nanochat/gpt.py | 4 +- nanochat/muon.py | 134 ++++++++++++++++++++++++++++++++++++++---- scripts/base_train.py | 19 ++++-- 4 files changed, 198 insertions(+), 22 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index 449cd7fd..13fc08ed 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,65 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-10: Muon Optimizer Upgrades & Cautious Weight Decay + +Cherry-picked improvements from NorMuon (modded-nanogpt) into our simpler Muon implementation. Decided against using NorMuon directly due to hard-coded architecture assumptions (expects 32 params split 10 attn + 22 mlp), parameter labeling requirements, and complexity. + +### Changes Made + +**1. Polar Express Orthogonalization** +- Replaced Newton-Schulz iteration with "Polar Express Sign Method" from [arxiv.org/pdf/2505.16932](https://arxiv.org/pdf/2505.16932) +- Uses 5 different coefficient tuples (one per iteration) instead of fixed coefficients +- Both methods kept in code for easy comparison (`zeropower_via_polar_express` vs `zeropower_via_newtonschulz5`) +- **Result:** No dramatic/noticeable difference in training, but keeping the new Polar Express as default. + +**2. 
Variance Reduction (NorMuon-style)** +- Added low-rank variance estimator similar to Adafactor ([arxiv.org/pdf/2510.05491](https://arxiv.org/pdf/2510.05491)) +- Maintains `second_momentum_buffer` with shape `[rows, 1]` or `[1, cols]` (whichever is smaller) +- Normalizes updates based on running per-row/col variance estimate (beta2=0.95) +- Memory overhead: ~1/max(rows, cols) per param, negligible +- **Result:** Led to a very small improvement, kept and enabled by default. + +**3. Cautious Weight Decay** +- Only decays weights where `update * weight >= 0` (same sign) from [arxiv.org/abs/2411.16085](https://arxiv.org/abs/2411.16085) +- Standard WD always pulls toward zero; cautious WD skips decay when gradient is pushing weight away from zero +- **Implementation note:** Had to inline the logic rather than use a separate `@torch.compile` function. Passing changing float values (like `weight_decay` during scheduling) as function arguments triggers recompilation. Reading from `group["weight_decay"]` inside the step avoids this. +- **Result:** Solid improvements, especially the cautious version was better than standard wd. +- Now defaults to ON for Muon via the `weight_decay` param. AdamW still has no weight decay and is hardcoded to 0 weight decay, might try to re-tune this later. + +**4. Weight decay schedule** +- Added a linear schedule to weight decay that is default on from 1.0 to 0.0 (i.e. start with max weight decay in the beginning of training, then ramp to 0 by the end). Worked better than a static setting in experiments. (modded-nanogpt has the same schedule but it is implemented in a more confusing way by multiplying twice by the learning rate, which is already wired up to a decay schedule). + +### Weight Decay Scaling Experiments + +Swept weight decay values at d8, d12, d16, d20 to find optimal values and scaling law. 
+ +**Optimal Values Found:** +| Depth | Width (channels) | Optimal WD | +|-------|------------------|------------| +| d8 | 512 | ~0.40 | +| d12 | 768 | ~0.22 | +| d16 | 1024 | ~0.10 | +| d20 | 1280 | ~0.08 | + +**Scaling Law:** +- Fit power law: `WD = k / channels^α` in log-log space +- Found α ≈ 1.97 (approximately 2), meaning WD ∝ 1/width² + +**Practical Formula:** +``` +WD_target = WD_reference × (d_reference / d_target)² +``` +Example: If d12 optimal is 0.22, then d20 optimal ≈ 0.22 × (12/20)² ≈ 0.08 + +**Reference:** Moonlight paper uses fixed WD=0.1 for their 15B MoE model. Our experiments indicated a scaling law where the optimal WD changed with depth, so we go along with the empirical scaling law. + +### Summary + +Muon was changed to use Polar Express, Adafactor-style variance reduction, and cautious weight decay with a schedule that ramps linearly to zero. All of these changes follow the modded-nanogpt repo, but all of them were also validated piece by piece to yield improvements in nanochat, with the exception of the Polar Express change, which was in the noise. This is default on and configurable with `--weight_decay`, using simply 0.2 and ∝ 1/width² scaling. The meaning of the kwarg `--weight_decay` therefore changes as of this change. It used to configure AdamW via standard weight decay and now it becomes exclusively used in Muon (AdamW is hardcoded to 0.0), and it is scaled based on depth. + +--- + ## 2026-01-08: exp_grad_clip - Gradient Clipping **Hypothesis:** Gradient clipping may be unnecessary overhead. Tested L2 norm clipping at various thresholds (0.25, 0.5, 1.0, 2.0) and elementwise clipping. @@ -18,6 +77,4 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 **Observation:** modded-nanogpt does not appear to clip either right now. -**Recommendation:** Disable by default (`--grad_clip=0.0`). The code naturally produces well-behaved gradients. - ---- +**Summary:** Deleted all grad-clip code paths. 
The code naturally produces well-behaved gradients. This improves a bit of MFU because we don't have to calculate and sync grad norms. diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 478f6879..2ffdc50b 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -260,11 +260,11 @@ class GPT(nn.Module): dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), ] - adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=weight_decay) + adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight decay is hardcoded to 0.0 for AdamW, only used in Muon AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) adamw_optimizer = AdamWFactory(adam_groups, **adamw_kwargs) # Create the Muon optimizer for the linear layers - muon_kwargs = dict(lr=matrix_lr, momentum=0.95) + muon_kwargs = dict(lr=matrix_lr, momentum=0.95, weight_decay=weight_decay) MuonFactory = DistMuon if ddp else Muon muon_optimizer = MuonFactory(matrix_params, **muon_kwargs) # Combine them the two optimizers into one list diff --git a/nanochat/muon.py b/nanochat/muon.py index d9161034..7ae5ffdc 100644 --- a/nanochat/muon.py +++ b/nanochat/muon.py @@ -1,11 +1,50 @@ """ -Muon optimizer from Keller et al. -Also a lot of borrowing of ideas from modded-nanogpt. +Muon optimizer adapted (simplified) from modded-nanogpt. 
+https://github.com/KellerJordan/modded-nanogpt """ import torch from torch import Tensor import torch.distributed as dist +# Coefficients for Polar Express (computed for num_iters=5, safety_factor=2e-2, cushion=2) +# From https://arxiv.org/pdf/2505.16932 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323), +] + + +@torch.compile +def zeropower_via_polar_express(G: Tensor, steps: int = 5) -> Tensor: + """ + Polar Express Sign Method for orthogonalization. + https://arxiv.org/pdf/2505.16932 + by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + + Alternative to Newton-Schulz iteration with potentially better convergence properties. + """ + assert G.ndim >= 2 + X = G.bfloat16() + if G.size(-2) > G.size(-1): + X = X.mT + + # Ensure spectral norm is at most 1 (with 2% safety factor) + X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6) + + # Perform the iterations (cap at available coefficients) + for a, b, c in polar_express_coeffs[:min(steps, len(polar_express_coeffs))]: + A = X @ X.mT + B = b * A + c * (A @ A) + X = a * X + B @ X + + if G.size(-2) > G.size(-1): + X = X.mT + return X + + @torch.compile def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: """ @@ -35,6 +74,40 @@ def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: X = X.mT return X + +@torch.compile +def apply_variance_reduction(v: Tensor, second_momentum_buffer: Tensor, beta2: float) -> Tensor: + """ + NorMuon-style variance reduction, similar to Adafactor's low-rank variance estimator. + https://arxiv.org/pdf/2510.05491 + + Normalizes updates based on a running estimate of per-row (or per-column) variance. 
+ The reduction dimension is determined by the shape of second_momentum_buffer. + """ + # Determine reduction dimension from buffer shape + red_dim = -1 if second_momentum_buffer.size(-1) == 1 else -2 + + # Compute per-row/col mean of squared values + v_mean = v.float().square().mean(dim=red_dim, keepdim=True) + red_dim_size = v.size(red_dim) + + # Compute current norm + v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size + v_norm = v_norm_sq.sqrt() + + # Update second momentum buffer (EMA of variance) + second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2) + + # Compute scaling factor from second momentum + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt() + scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square() + v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt() + + # Final scale preserves overall norm while adjusting per-row/col + final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10)) + return v.mul(final_scale.to(v.dtype)) + + class Muon(torch.optim.Optimizer): """ Muon - MomentUm Orthogonalized by Newton-schulz @@ -56,9 +129,11 @@ class Muon(torch.optim.Optimizer): momentum: The momentum used by the internal SGD. nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) ns_steps: The number of Newton-Schulz iteration steps to use. + beta2: The decay rate for the second moment (variance) estimate. Set to None to disable. + weight_decay: Cautious weight decay coefficient. Only decays where update and weight agree. 
""" - def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5): - defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) + def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, beta2=0.95, weight_decay=0.0): + defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) params: list[Tensor] = [*params] param_groups = [] for size in {p.numel() for p in params}: @@ -79,13 +154,29 @@ class Muon(torch.optim.Optimizer): buf: Tensor = state["momentum_buffer"] buf.lerp_(g, 1 - group["momentum"]) g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf - g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]) - p.add_(g, alpha=-group["lr"] * max(1, p.size(-2) / p.size(-1))**0.5) + g = zeropower_via_polar_express(g, steps=group["ns_steps"]) + # Variance reduction (NorMuon-style) + if group["beta2"] is not None: + if "second_momentum_buffer" not in state: + # Buffer shape determines reduction dim: reduce along larger dimension + if p.size(-2) >= p.size(-1): + state["second_momentum_buffer"] = torch.zeros_like(g[..., :1]) + else: + state["second_momentum_buffer"] = torch.zeros_like(g[..., :1, :]) + g = apply_variance_reduction(g, state["second_momentum_buffer"], group["beta2"]) + # Parameter update with cautious weight decay + effective_lr = group["lr"] * max(1, p.size(-2) / p.size(-1))**0.5 + wd = group["weight_decay"] + if wd != 0: + mask = (g * p) >= 0 + p.sub_(effective_lr * g + effective_lr * wd * p * mask) + else: + p.sub_(effective_lr * g) class DistMuon(torch.optim.Optimizer): """ - Muon: SGD-momentum + (optional) Nesterov, then orthogonalize the 2D update via Newton–Schulz, + Muon: SGD-momentum + (optional) Nesterov, then orthogonalize the 2D update via Polar Express, finally apply aspect-ratio scaled step. 
Performs its own distributed synchronization: - reduce_scatter(AVG) for gradient averaging - all_gather to replicate updated weights @@ -102,11 +193,13 @@ class DistMuon(torch.optim.Optimizer): lr: learning rate momentum: momentum coefficient in [0,1) nesterov: if True, Nesterov-style update (g <- lerp(g, buf, momentum)); else use buf - ns_steps: number of Newton–Schulz iterations for the orthogonalization + ns_steps: number of Newton-Schulz iterations for the orthogonalization + beta2: decay rate for second moment (variance) estimate. Set to None to disable. + weight_decay: Cautious weight decay coefficient. Only decays where update and weight agree. """ def __init__(self, params, lr: float = 0.02, momentum: float = 0.95, - nesterov: bool = True, ns_steps: int = 5): - defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps) + nesterov: bool = True, ns_steps: int = 5, beta2: float = 0.95, weight_decay: float = 0.0): + defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) params = list(params) assert all(p.ndim == 2 for p in params), "Muon expects 2D parameters only" rank = dist.get_rank() @@ -173,9 +266,24 @@ class DistMuon(torch.optim.Optimizer): buf: Tensor = state["momentum_buffer"] buf.lerp_(g, 1.0 - group["momentum"]) g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf - g = zeropower_via_newtonschulz5(g, steps=group["ns_steps"]) - scale = (max(1.0, p.size(-2) / p.size(-1)) ** 0.5) - p.add_(g, alpha=-group["lr"] * scale) + g = zeropower_via_polar_express(g, steps=group["ns_steps"]) + # Variance reduction (NorMuon-style) + if group["beta2"] is not None: + if "second_momentum_buffer" not in state: + # Buffer shape determines reduction dim: reduce along larger dimension + if p.size(-2) >= p.size(-1): + state["second_momentum_buffer"] = torch.zeros_like(g[..., :1]) + else: + state["second_momentum_buffer"] = torch.zeros_like(g[..., :1, :]) + g = 
apply_variance_reduction(g, state["second_momentum_buffer"], group["beta2"]) + # Parameter update with cautious weight decay + effective_lr = group["lr"] * (max(1.0, p.size(-2) / p.size(-1)) ** 0.5) + wd = group["weight_decay"] + if wd != 0: + mask = (g * p) >= 0 + p.sub_(effective_lr * g + effective_lr * wd * p * mask) + else: + p.sub_(effective_lr * g) # Replicate updated parameters to all ranks ag_input = params[owner_idx] if owner_idx < len(params) else zero_buffer ag_output = params[base_i:base_i + world_size] diff --git a/scripts/base_train.py b/scripts/base_train.py index e3df0f06..84d44bfb 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -51,7 +51,7 @@ parser.add_argument("--device_batch_size", type=int, default=32, help="per-devic parser.add_argument("--total_batch_size", type=int, default=524288, help="total batch size in tokens") parser.add_argument("--embedding_lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)") parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") -parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") +parser.add_argument("--weight_decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)") parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") parser.add_argument("--adam_beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding") parser.add_argument("--adam_beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding") @@ -129,6 +129,11 @@ if batch_ratio != 1.0: batch_lr_scale = batch_ratio ** 0.5 print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})") +# Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 (or equivalently, 
\propto 1/depth^2 due to constant aspect ratio) +weight_decay_scaled = args.weight_decay * (12 / args.depth)**2 +if args.depth != 12: + print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for depth {args.depth}") + # ----------------------------------------------------------------------------- # Initialize the Model @@ -188,7 +193,7 @@ optimizers = model.setup_optimizers( unembedding_lr=args.unembedding_lr * batch_lr_scale, embedding_lr=args.embedding_lr * batch_lr_scale, matrix_lr=args.matrix_lr * batch_lr_scale, - weight_decay=args.weight_decay, + weight_decay=weight_decay_scaled, adam_betas=adam_betas, ) adamw_optimizer, muon_optimizer = optimizers @@ -227,6 +232,10 @@ def get_muon_momentum(it): momentum = (1 - frac) * 0.85 + frac * 0.95 return momentum +# Weight decay scheduler for Muon optimizer (linear to zero over the course of training) +def get_weight_decay(it): + return weight_decay_scaled * (1 - it / num_iterations) + # ----------------------------------------------------------------------------- # Loop state (variables updated by the training loop) @@ -257,7 +266,7 @@ while True: eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size) with autocast_ctx: val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes) - print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}") + print0(f"Step {step:05d} | Validation bpb: {val_bpb:.6f}") if val_bpb < min_val_bpb: min_val_bpb = val_bpb wandb_run.log({ @@ -351,8 +360,10 @@ while True: for group in opt.param_groups: group["lr"] = group["initial_lr"] * lrm muon_momentum = get_muon_momentum(step) + muon_weight_decay = get_weight_decay(step) for group in muon_optimizer.param_groups: group["momentum"] = muon_momentum + group["weight_decay"] = muon_weight_decay for opt in optimizers: opt.step() model.zero_grad(set_to_none=True) @@ -402,7 +413,7 @@ while True: print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB") 
print0(f"Total training time: {total_training_time/60:.2f}m") if val_bpb is not None: - print0(f"Minimum validation bpb: {min_val_bpb:.4f}") + print0(f"Minimum validation bpb: {min_val_bpb:.6f}") # Log to report from nanochat.report import get_report From aa530cdad58123ebfb79ab85d996c4641cfc6c90 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 11 Jan 2026 18:47:35 +0000 Subject: [PATCH 014/119] Add learnable lambdas that gate the residual connection and a skip connection to the input embeddings, solid bump to val_bpb --- dev/LOG.md | 56 +++++++++++++++++++++++++++++++++++++++++++ nanochat/adamw.py | 54 +++++++++++++++++++++++++++-------------- nanochat/gpt.py | 32 +++++++++++++++++++++---- scripts/base_train.py | 2 ++ 4 files changed, 121 insertions(+), 23 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index 13fc08ed..ee1e82e3 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,62 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-11: Per-Layer Residual Scalars (x0 & resid lambdas) + +Cherry-picked an idea from modded-nanogpt around learnable per-layer residual connections. + +### Changes Made + +**1. x0_lambdas (x0 residual connections)** +- Save initial normalized embedding as `x0` after `norm(wte(idx))` +- At each layer, blend x0 back in: `x = resid_lambdas[i] * x + x0_lambdas[i] * x0` +- Zero-initialized, so disabled at start; model learns which layers benefit from the shortcut +- Provides direct path from embedding to deep layers, helps preserve token information + +**2. resid_lambdas (residual stream scaling)** +- Per-layer multiplicative scaling of the residual stream +- Initialized to 1.0 (neutral, standard transformer behavior) +- Allows model to learn to amplify/dampen residual at each layer + +**3. 
DistAdamW small parameter handling** +- Added support for parameters with < 1024 elements (like the scalar lambdas) +- Small params use `all_reduce` instead of `reduce_scatter`/`all_gather` +- Fixes crash when param shape isn't divisible by world_size + +### Key Finding: Different LR Sensitivity + +The two scalar types need very different learning rates: +- **x0_lambdas (additive)**: Can use normal LR (~0.5). Adding a fraction of x0 is forgiving. +- **resid_lambdas (multiplicative)**: Needs ~100x smaller LR (~0.005). Multiplying the residual compounds through layers. + +Implementation: `resid_params` gets `scalar_lr * 0.01`, `x0_params` gets full `scalar_lr`. + +### Experiment Results + +Swept `--scalar_lr` (controlling x0_lambdas) at multiple depths: + +| Depth | Baseline (disabled) | Best scalar_lr | Best val_bpb | Δ bpb | +|-------|---------------------|----------------|--------------|-------| +| d8 | 1.0885 | 0.20 | 1.0782 | -0.0103 | +| d12 | 0.9770 | 0.60 | 0.9693 | -0.0077 | +| d16 | 0.9059 | 0.20 | 0.9002 | -0.0057 | +| d20 | 0.8565 | 0.10 | 0.8526 | -0.0039 | + +**Observations:** +- Consistent improvement across all model sizes +- Optimal LR varies by depth; default of 0.5 is reasonable, but 0.6 is better for d12 +- Adding resid_lambdas (with 0.01x LR) gives small additional improvement over x0 alone + +### Meta Device Footgun + +Important lesson: `__init__` runs in meta device context, so any tensor values set there are fake. Must initialize actual values in `init_weights()`. Added docstring warning to `__init__`. + +### Summary + +Added `--scalar_lr` (default 0.5) controlling learnable per-layer scalars. The formula `x = resid_lambdas[i] * x + x0_lambdas[i] * x0` gives the model control over residual scaling and direct shortcuts to the initial embedding. Solid improvement with essentially no compute overhead. 
+ +--- + ## 2026-01-10: Muon Optimizer Upgrades & Cautious Weight Decay Cherry-picked improvements from NorMuon (modded-nanogpt) into our simpler Muon implementation. Decided against using NorMuon directly due to hard-coded architecture assumptions (expects 32 params split 10 attn + 22 mlp), parameter labeling requirements, and complexity. diff --git a/nanochat/adamw.py b/nanochat/adamw.py index 0b97ae25..48945b38 100644 --- a/nanochat/adamw.py +++ b/nanochat/adamw.py @@ -16,23 +16,31 @@ class DistAdamW(torch.optim.Optimizer): defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) super().__init__(param_groups, defaults) - @torch.compile @torch.no_grad() def step(self): rank = dist.get_rank() world_size = dist.get_world_size() - reduce_scatter_futures: list[torch.Future] = [] - all_reduce_futures: list[torch.Future] = [] + reduce_futures: list[torch.Future] = [] + gather_futures: list[torch.Future] = [] grad_slices = [] + is_small = [] # track which params are small (use all_reduce) vs large (use reduce_scatter) + for group in self.param_groups: params: list[Tensor] = group["params"] - for base_i in range(len(params)): - assert params[base_i].shape[0] % world_size == 0, f"First dim of parameter shape {params[base_i].shape} must be divisible by world size {world_size}" - grad = params[base_i].grad - rank_size = grad.shape[0] // world_size - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_scatter_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) + for p in params: + grad = p.grad + # Small params: use all_reduce (no scatter/gather needed) + if p.numel() < 1024: + is_small.append(True) + reduce_futures.append(dist.all_reduce(grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad) + else: + is_small.append(False) + assert p.shape[0] % world_size == 0, f"First dim of parameter shape {p.shape} must be divisible by world size 
{world_size}" + rank_size = grad.shape[0] // world_size + grad_slice = torch.empty_like(grad[:rank_size]) + reduce_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) + grad_slices.append(grad_slice) idx = 0 for group in self.param_groups: @@ -40,14 +48,19 @@ class DistAdamW(torch.optim.Optimizer): eps = group['eps'] wd = group['weight_decay'] params = group['params'] - for base in range(len(params)): - reduce_scatter_futures[idx].wait() - p = params[base] - rank_size = p.shape[0] // world_size - p_slice = p[rank * rank_size:(rank + 1) * rank_size] + for p in params: + reduce_futures[idx].wait() + g_slice = grad_slices[idx] lr = group['lr'] * getattr(p, "lr_mul", 1.0) state = self.state[p] - g_slice = grad_slices[idx] + + # For small params, operate on full param; for large, operate on slice + if is_small[idx]: + p_slice = p + else: + rank_size = p.shape[0] // world_size + p_slice = p[rank * rank_size:(rank + 1) * rank_size] + # State init if not state: state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device) @@ -72,6 +85,11 @@ class DistAdamW(torch.optim.Optimizer): step_size = lr / bias1 update = exp_avg.div(denom).mul_(step_size) p_slice.add_(other=update, alpha=-1.0) + + # Only large params need all_gather + if not is_small[idx]: + gather_futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future()) idx += 1 - all_reduce_futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future()) - torch.futures.collect_all(all_reduce_futures).wait() + + if gather_futures: + torch.futures.collect_all(gather_futures).wait() diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 2ffdc50b..6f4556af 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -134,6 +134,11 @@ class Block(nn.Module): class GPT(nn.Module): def __init__(self, config, pad_vocab_size_to=64): + """ + NOTE a major footgun: this __init__ function runs in meta device context (!!) 
+ Therefore, any calculations inside here are shapes and dtypes only, no actual data. + => We actually initialize all data (parameters, buffers, etc.) in init_weights() instead. + """ super().__init__() self.config = config # For DDP, we want vocab_size divisible by world_size. Also, there are potential performance benefits, see: @@ -146,6 +151,12 @@ class GPT(nn.Module): "h": nn.ModuleList([Block(config, layer_idx) for layer_idx in range(config.n_layer)]), }) self.lm_head = nn.Linear(config.n_embd, padded_vocab_size, bias=False) + # Per-layer learnable scalars (inspired by modded-nanogpt) + # resid_lambdas: scales the residual stream at each layer (init 1.0 = neutral) + # x0_lambdas: blends initial embedding back in at each layer (init 0.0 = disabled) + # Separate parameters so they can have different optimizer treatment + self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() + self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() # To support meta device initialization, we init the rotary embeddings here, but it's just "fake" meta tensors only. # As for rotary_seq_len, these rotary embeddings are pretty small/cheap in memory, # so let's just over-compute them by 10X, but assert fail if we ever reach that amount. 
@@ -186,6 +197,11 @@ class GPT(nn.Module): torch.nn.init.uniform_(block.mlp.c_fc.weight, -s, s) torch.nn.init.zeros_(block.mlp.c_proj.weight) + # Per-layer scalars + with torch.no_grad(): + self.resid_lambdas.fill_(1.0) # 1.0 => typical residual connections at init + self.x0_lambdas.fill_(0.0) # 0.0 => skip connection to input is disabled at init + # Rotary embeddings head_dim = self.config.n_embd // self.config.n_head cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) @@ -244,21 +260,25 @@ class GPT(nn.Module): nparams = sum(p.numel() for p in self.parameters()) return nparams - def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95)): + def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5): model_dim = self.config.n_embd ddp, rank, local_rank, world_size = get_dist_info() - # Separate out all parameters into 3 groups (matrix, embedding, lm_head) + # Separate out all parameters into 5 groups (matrix, embedding, lm_head, resid_lambdas, x0_lambdas) matrix_params = list(self.transformer.h.parameters()) embedding_params = list(self.transformer.wte.parameters()) lm_head_params = list(self.lm_head.parameters()) - assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) - # Create the AdamW optimizer for the embedding and lm_head + resid_params = [self.resid_lambdas] + x0_params = [self.x0_lambdas] + assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(resid_params) + len(x0_params) + # Create the AdamW optimizer for the embedding, lm_head, and per-layer scalars # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 print0(f"Scaling the LR for the AdamW parameters ∝1/√({model_dim}/768) = 
{dmodel_lr_scale:.6f}") adam_groups = [ dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), + dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream + dict(params=x0_params, lr=scalar_lr), ] adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight decay is hardcoded to 0.0 for AdamW, only used in Muon AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) @@ -288,7 +308,9 @@ class GPT(nn.Module): # Forward the trunk of the Transformer x = self.transformer.wte(idx) x = norm(x) - for block in self.transformer.h: + x0 = x # save initial normalized embedding for x0 residual + for i, block in enumerate(self.transformer.h): + x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 x = block(x, cos_sin, kv_cache) x = norm(x) diff --git a/scripts/base_train.py b/scripts/base_train.py index 84d44bfb..33274512 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -53,6 +53,7 @@ parser.add_argument("--embedding_lr", type=float, default=0.3, help="learning ra parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") parser.add_argument("--weight_decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)") parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") +parser.add_argument("--scalar_lr", type=float, default=0.5, help="learning rate for scalars (resid_lambdas, x0_lambdas)") parser.add_argument("--adam_beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding") parser.add_argument("--adam_beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding") parser.add_argument("--warmup_ratio", type=float, default=0.0, help="ratio of iterations for LR warmup") @@ -195,6 +196,7 @@ 
optimizers = model.setup_optimizers( matrix_lr=args.matrix_lr * batch_lr_scale, weight_decay=weight_decay_scaled, adam_betas=adam_betas, + scalar_lr=args.scalar_lr * batch_lr_scale, ) adamw_optimizer, muon_optimizer = optimizers From 201d705957a4b44074c41544afc0f9f76a20f775 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 11 Jan 2026 20:13:12 +0000 Subject: [PATCH 015/119] recover the ability to load old checkpoints by patching the lambdas if they don't exist in checkpoints --- nanochat/checkpoint_manager.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index 99f260e4..79ba998f 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -20,6 +20,16 @@ def log0(message): if int(os.environ.get('RANK', 0)) == 0: logger.info(message) +def _patch_missing_keys(model_data, model_config): + """Add default values for new parameters that may be missing in old checkpoints.""" + n_layer = model_config.n_layer + # resid_lambdas defaults to 1.0 (identity scaling) + if "resid_lambdas" not in model_data: + model_data["resid_lambdas"] = torch.ones(n_layer) + # x0_lambdas defaults to 0.0 (disabled) + if "x0_lambdas" not in model_data: + model_data["x0_lambdas"] = torch.zeros(n_layer) + def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0): if rank == 0: os.makedirs(checkpoint_dir, exist_ok=True) @@ -76,6 +86,7 @@ def build_model(checkpoint_dir, step, device, phase): model_config_kwargs = meta_data["model_config"] log0(f"Building model with config: {model_config_kwargs}") model_config = GPTConfig(**model_config_kwargs) + _patch_missing_keys(model_data, model_config) with torch.device("meta"): model = GPT(model_config) # Load the model state From 2ff7d512528a6f50886e47f1e86c995d352ab2c9 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 11 Jan 2026 20:33:19 +0000 Subject: [PATCH 016/119] integrate Flash Attention 3. 
+9% tok_per_sec for d12 with ctx even as low as 2048 out of the box nice. also, ready to tune windows huge --- dev/LOG.md | 33 +++++++++++++ nanochat/engine.py | 107 ++++++++++++++++--------------------------- nanochat/gpt.py | 58 +++++++++++------------ pyproject.toml | 1 + tests/test_engine.py | 104 ++++++++++++++++++++++------------------- uv.lock | 17 +++++++ 6 files changed, 177 insertions(+), 143 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index ee1e82e3..f2322de3 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,39 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-11: Flash Attention 3 Integration + +Replaced PyTorch's `scaled_dot_product_attention` (FA2) with Flash Attention 3 for training and inference. + +### Changes Made + +**1. FA3 via `kernels` package** +- Official FA3 is "beta" and requires building from source (painful) +- Using `kernels` package from HuggingFace Hub: `get_kernel('varunneal/flash-attention-3')` +- Loads pre-built wheels, works out of the box on H100 + +**2. Simplified attention code** +- FA3 uses `(B, T, H, D)` layout matching our projection output directly - no transpose needed +- Training: `flash_attn.flash_attn_func(q, k, v, causal=True)` +- Inference: `flash_attn.flash_attn_with_kvcache()` handles all cache cases in one call +- Removed 3 separate FA2 code paths (training, single-token, chunk inference) +- GQA handled automatically when n_kv_heads < n_heads + +**3. 
Rewrote KVCache for FA3** +- Old format: `(num_layers, 2, B, H, T, D)` combined tensor +- New format: separate `k_cache` and `v_cache` of shape `(num_layers, B, T, H, D)` +- FA3 updates cache in-place during `flash_attn_with_kvcache` +- Position tracked via `cache_seqlens` tensor (int32, per batch element) +- Simpler API: `get_layer_cache()`, `advance()`, `reset()`, `prefill()` + +### Results + +- **~9% improvement in tok/sec** during training out of the box +- Benchmarks showed FA3 is 2x faster than FA2 at realistic training sizes (batch=32, seq=2048) +- FA3 supports sliding window via `window_size=(left, 0)`, which is huge and expected to give further improvements. This is ready to tune but keeping full context for now. + +--- + ## 2026-01-11: Per-Layer Residual Scalars (x0 & resid lambdas) Cherry-picked an idea from modded-nanogpt around learnable per-layer residual connections. diff --git a/nanochat/engine.py b/nanochat/engine.py index d4367fb1..53fdec5b 100644 --- a/nanochat/engine.py +++ b/nanochat/engine.py @@ -82,83 +82,54 @@ def use_calculator(expr): # ----------------------------------------------------------------------------- class KVCache: """ - Works hand-in-hand with the GPT model to maintain the KV cache. - Note that the .pos advances automatically after the last layer of the Transformer inserts. + KV Cache designed for Flash Attention 3's flash_attn_with_kvcache API. + + Key differences from FA2-style cache: + - Tensors are (B, T, H, D) not (B, H, T, D) + - FA3 updates the cache in-place during flash_attn_with_kvcache + - Position tracked per batch element via cache_seqlens tensor """ - def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layers): - # Each of K/V is of shape (B, H, T, D) and we have one per layer of the Transformer. 
- self.kv_shape = (num_layers, 2, batch_size, num_heads, seq_len, head_dim) - self.kv_cache = None - self.pos = 0 # current position in time in the cache + def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layers, device, dtype=torch.bfloat16): + self.batch_size = batch_size + self.max_seq_len = seq_len + self.n_layers = num_layers + self.n_heads = num_heads + self.head_dim = head_dim + # Pre-allocate cache tensors: (n_layers, B, T, H, D) + self.k_cache = torch.zeros(num_layers, batch_size, seq_len, num_heads, head_dim, device=device, dtype=dtype) + self.v_cache = torch.zeros(num_layers, batch_size, seq_len, num_heads, head_dim, device=device, dtype=dtype) + # Current sequence length per batch element (FA3 needs int32) + self.cache_seqlens = torch.zeros(batch_size, dtype=torch.int32, device=device) def reset(self): - self.pos = 0 + """Reset cache to empty state.""" + self.cache_seqlens.zero_() def get_pos(self): - return self.pos + """Get current position (assumes all batch elements at same position).""" + return self.cache_seqlens[0].item() + + def get_layer_cache(self, layer_idx): + """Return (k_cache, v_cache) views for a specific layer.""" + return self.k_cache[layer_idx], self.v_cache[layer_idx] + + def advance(self, num_tokens): + """Advance the cache position by num_tokens.""" + self.cache_seqlens += num_tokens def prefill(self, other): """ - Prefill given another KV cache. Optionally expand along batch dim. - This is used when we do batch 1 prefill and then want to generate - multiple samples in parallel from there. + Copy cached KV from another cache into this one. + Used when we do batch=1 prefill and then want to generate multiple samples in parallel. 
""" - # 1) validate the shapes - assert self.kv_cache is None, "Cannot prefill a non-empty KV cache" - assert other.kv_cache is not None, "Cannot prefill with a None KV cache" - - # Extract dimensions explicitly - self_layers, self_kv, self_batch, self_heads, self_seq, self_head_dim = self.kv_shape - other_layers, other_kv, other_batch, other_heads, other_seq, other_head_dim = other.kv_shape - - # Validate dimensions - assert self_layers == other_layers, f"Layer count mismatch: {self_layers} != {other_layers}" - assert self_kv == other_kv, f"K/V dimension mismatch: {self_kv} != {other_kv}" - assert self_heads == other_heads, f"Head count mismatch: {self_heads} != {other_heads}" - assert self_head_dim == other_head_dim, f"Head dim mismatch: {self_head_dim} != {other_head_dim}" - - # Batch size can be expanded (other can be 1, self can be larger) - assert self_batch == other_batch or other_batch == 1, f"Batch size mismatch: {self_batch} vs {other_batch} (other must be 1 or equal)" - - # Sequence length: self must be longer than other - assert self_seq >= other_seq, f"Sequence length mismatch: {self_seq} < {other_seq}" - - # 2) initialize the cache - dtype, device = other.kv_cache.dtype, other.kv_cache.device - self.kv_cache = torch.empty(self.kv_shape, dtype=dtype, device=device) - # 3) copy the data over - self.kv_cache[:, :, :, :, :other.pos, :] = other.kv_cache - # 4) update the pos - self.pos = other.pos - - def insert_kv(self, layer_idx, k, v): - # Lazy initialize the cache here because we need to know the dtype/device - if self.kv_cache is None: - self.kv_cache = torch.empty(self.kv_shape, dtype=k.dtype, device=k.device) - # Insert new keys/values to the cache and return the full cache so far - B, H, T_add, D = k.size() - t0, t1 = self.pos, self.pos + T_add - # Dynamically grow the cache if needed - if t1 > self.kv_cache.size(4): - t_needed = t1 + 1024 # as much as we need plus buffer of 1024 - t_needed = (t_needed + 1023) & ~1023 # then round up to the nearest 
multiple of 1024 - additional_shape = list(self.kv_cache.shape) - additional_shape[4] = t_needed - self.kv_cache.size(4) - additional_cache = torch.empty(additional_shape, dtype=k.dtype, device=k.device) - self.kv_cache = torch.cat([self.kv_cache, additional_cache], dim=4).contiguous() - self.kv_shape = self.kv_cache.shape - # Insert k, v into the cache - self.kv_cache[layer_idx, 0, :, :, t0:t1, :] = k - self.kv_cache[layer_idx, 1, :, :, t0:t1, :] = v - # Return the full cached keys/values up to current position (as a view) - key_view = self.kv_cache[layer_idx, 0, :, :, :t1, :] - value_view = self.kv_cache[layer_idx, 1, :, :, :t1, :] - # Increment pos after the last layer of the Transformer processes - if layer_idx == self.kv_cache.size(0) - 1: - self.pos = t1 - return key_view, value_view - + assert self.get_pos() == 0, "Cannot prefill a non-empty KV cache" + assert self.n_layers == other.n_layers and self.n_heads == other.n_heads and self.head_dim == other.head_dim + assert self.max_seq_len >= other.max_seq_len + other_pos = other.get_pos() + self.k_cache[:, :, :other_pos, :, :] = other.k_cache[:, :, :other_pos, :, :] + self.v_cache[:, :, :other_pos, :, :] = other.v_cache[:, :, :other_pos, :, :] + self.cache_seqlens.fill_(other_pos) # ----------------------------------------------------------------------------- @torch.inference_mode() @@ -219,6 +190,7 @@ class Engine: kv_cache_prefill = KVCache( batch_size=1, seq_len=len(tokens), + device=device, **kv_model_kwargs, ) ids = torch.tensor([tokens], dtype=torch.long, device=device) @@ -230,6 +202,7 @@ class Engine: kv_cache_decode = KVCache( batch_size=num_samples, seq_len=kv_length_hint, + device=device, **kv_model_kwargs, ) kv_cache_decode.prefill(kv_cache_prefill) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 6f4556af..f22ec076 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -9,9 +9,9 @@ Notable features: - no learnable params in rmsnorm - no bias in linear layers - Group-Query Attention (GQA) support 
for more efficient inference +- Flash Attention 3 integration """ -import math from functools import partial from dataclasses import dataclass @@ -23,6 +23,14 @@ from nanochat.common import get_dist_info, print0 from nanochat.muon import Muon, DistMuon from nanochat.adamw import DistAdamW +# Load Flash Attention 3 from HuggingFace Hub (and silence the progress bar) +import os +os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" +# Official docs of FA3 label it as "beta" and want you to install FA3 from source, which is a pain. +# Wishing for official FA3 wheels soon, for now this seems to be a fast way to get them (ty varunneal) +from kernels import get_kernel +flash_attn = get_kernel('varunneal/flash-attention-3').flash_attn_interface + @dataclass class GPTConfig: sequence_len: int = 1024 @@ -65,44 +73,36 @@ class CausalSelfAttention(nn.Module): B, T, C = x.size() # Project the input to get queries, keys, and values + # Shape: (B, T, H, D) - FA3's native layout, no transpose needed! q = self.c_q(x).view(B, T, self.n_head, self.head_dim) k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim) v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim) # Apply Rotary Embeddings to queries and keys to get relative positional encoding cos, sin = cos_sin - q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin) # QK rotary embedding + q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin) q, k = norm(q), norm(k) # QK norm - q, k, v = q.transpose(1, 2), k.transpose(1, 2), v.transpose(1, 2) # make head be batch dim, i.e. (B, T, H, D) -> (B, H, T, D) - # Apply KV cache: insert current k,v into cache, get the full view so far - if kv_cache is not None: - k, v = kv_cache.insert_kv(self.layer_idx, k, v) - Tq = q.size(2) # number of queries in this forward pass - Tk = k.size(2) # number of keys/values in total (in the cache + current forward pass) - - # Attention: queries attend to keys/values autoregressively. 
A few cases to handle: - enable_gqa = self.n_head != self.n_kv_head # Group Query Attention (GQA): duplicate key/value heads to match query heads if desired - if kv_cache is None or Tq == Tk: - # During training (no KV cache), attend as usual with causal attention - # And even if there is KV cache, we can still use this simple version when Tq == Tk - y = F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=enable_gqa) - elif Tq == 1: - # During inference but with a single query in this forward pass: - # The query has to attend to all the keys/values in the cache - y = F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=enable_gqa) + # Attention with Flash Attention 3 + # FA3 handles GQA automatically when n_kv_heads < n_heads + if kv_cache is None: + # Training: simple causal attention + y = flash_attn.flash_attn_func(q, k, v, causal=True) else: - # During inference AND we have a chunk of queries in this forward pass: - # First, each query attends to all the cached keys/values (i.e. 
full prefix) - attn_mask = torch.zeros((Tq, Tk), dtype=torch.bool, device=q.device) # True = keep, False = mask - prefix_len = Tk - Tq - attn_mask[:, :prefix_len] = True - # Then, causal attention within this chunk - attn_mask[:, prefix_len:] = torch.tril(torch.ones((Tq, Tq), dtype=torch.bool, device=q.device)) - y = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask, enable_gqa=enable_gqa) + # Inference: use flash_attn_with_kvcache which handles cache management + k_cache, v_cache = kv_cache.get_layer_cache(self.layer_idx) + y = flash_attn.flash_attn_with_kvcache( + q, k_cache, v_cache, + k=k, v=v, + cache_seqlens=kv_cache.cache_seqlens, + causal=True, + ) + # Advance position after last layer processes + if self.layer_idx == kv_cache.n_layers - 1: + kv_cache.advance(T) - # Re-assemble the heads side by side and project back to residual stream - y = y.transpose(1, 2).contiguous().view(B, T, -1) + # Re-assemble the heads and project back to residual stream + y = y.contiguous().view(B, T, -1) y = self.c_proj(y) return y diff --git a/pyproject.toml b/pyproject.toml index 0931ca64..87a967f7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,6 +8,7 @@ dependencies = [ "datasets>=4.0.0", "fastapi>=0.117.1", "ipykernel>=7.1.0", + "kernels>=0.11.7", "matplotlib>=3.10.8", "psutil>=7.1.0", "python-dotenv>=1.2.1", diff --git a/tests/test_engine.py b/tests/test_engine.py index 683f89bf..9351e5a8 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -39,13 +39,9 @@ class MockModel: def forward(self, ids, kv_cache=None): """Return uniform logits so sampling is spread across vocab.""" B, T = ids.shape - # Simulate what a real transformer does: insert k,v into the cache for each layer + # With FA3, flash_attn_with_kvcache updates cache in-place and we advance position if kv_cache is not None: - head_dim = self.config.n_embd // self.config.n_head - for layer_idx in range(self.config.n_layer): - k = torch.zeros(B, self.config.n_kv_head, T, head_dim) - v = 
torch.zeros(B, self.config.n_kv_head, T, head_dim) - kv_cache.insert_kv(layer_idx, k, v) + kv_cache.advance(T) # Uniform logits -> equal probability for all tokens logits = torch.zeros(B, T, self.vocab_size) return logits @@ -85,16 +81,11 @@ class ByteTokenizer: byte_tokens = [t for t in tokens if t < 256] return bytes(byte_tokens).decode("utf-8", errors="replace") -def test_kv_cache_resize(): - """ - The KV cache was not resized correctly, more information here: - https://github.com/karpathy/nanochat/pull/186 - This test reproduces the issue and will be merged alongside the fix. - """ - +def test_kv_cache_basic(): + """Test basic KVCache functionality for FA3.""" batch_size = 2 num_heads = 3 - seq_len = 4 + seq_len = 64 head_dim = 5 num_layers = 6 @@ -103,45 +94,64 @@ def test_kv_cache_resize(): num_heads=num_heads, seq_len=seq_len, head_dim=head_dim, - num_layers=num_layers + num_layers=num_layers, + device="cpu", ) - # Insert a single token with a distinct fill value to all layers - def insert_token(token_idx): - for layer_idx in range(num_layers): - k = torch.full((batch_size, num_heads, 1, head_dim), fill_value=float(token_idx), dtype=torch.float32) - v = torch.full((batch_size, num_heads, 1, head_dim), fill_value=float(token_idx * 100), dtype=torch.float32) - kv_cache.insert_kv(layer_idx, k, v) + # Check initial state + assert kv_cache.get_pos() == 0 + assert kv_cache.k_cache.shape == (num_layers, batch_size, seq_len, num_heads, head_dim) + assert kv_cache.v_cache.shape == (num_layers, batch_size, seq_len, num_heads, head_dim) - # Insert 4 tokens (fills the initial seq_len=4) - for i in range(4): - insert_token(i) + # Test advance + kv_cache.advance(10) + assert kv_cache.get_pos() == 10 - # Record the original state of the cache - original_cache = kv_cache.kv_cache.clone() - original_seq_len = original_cache.shape[4] + kv_cache.advance(5) + assert kv_cache.get_pos() == 15 - # Insert the 5th token, which will trigger a resize - insert_token(4) - # Verify that 
the cache actually resized - new_seq_len = kv_cache.kv_cache.shape[4] - assert new_seq_len > original_seq_len, f"Cache did not resize: original seq_len={original_seq_len}, new seq_len={new_seq_len}" + # Test reset + kv_cache.reset() + assert kv_cache.get_pos() == 0 - # Verify that the original 4 tokens are still intact after resize - for layer_idx in range(num_layers): - for token_idx in range(4): - # Check that resized cache matches expected values - expected_k = float(token_idx) - expected_v = float(token_idx * 100) - actual_k = kv_cache.kv_cache[layer_idx, 0, :, :, token_idx, :] - actual_v = kv_cache.kv_cache[layer_idx, 1, :, :, token_idx, :] - assert (actual_k == expected_k).all(), f"Layer {layer_idx}, token {token_idx}: key corrupted, expected {expected_k}" - assert (actual_v == expected_v).all(), f"Layer {layer_idx}, token {token_idx}: value corrupted, expected {expected_v}" - # And that the original cache matches resized cache - original_k = original_cache[layer_idx, 0, :, :, token_idx, :] - original_v = original_cache[layer_idx, 1, :, :, token_idx, :] - assert (actual_k == original_k).all(), f"Layer {layer_idx}, token {token_idx}: key doesn't match original" - assert (actual_v == original_v).all(), f"Layer {layer_idx}, token {token_idx}: value doesn't match original" + # Test get_layer_cache returns correct views + k_layer0, v_layer0 = kv_cache.get_layer_cache(0) + assert k_layer0.shape == (batch_size, seq_len, num_heads, head_dim) + assert v_layer0.shape == (batch_size, seq_len, num_heads, head_dim) + + +def test_kv_cache_prefill(): + """Test KVCache.prefill() copies data correctly.""" + batch_size = 1 + num_heads = 4 + head_dim = 8 + num_layers = 2 + + # Create source cache and advance it + src_cache = KVCache( + batch_size=batch_size, num_heads=num_heads, seq_len=32, + head_dim=head_dim, num_layers=num_layers, device="cpu", + ) + # Write some data to source cache + src_cache.k_cache[0, 0, :16, :, :] = 1.0 + src_cache.v_cache[0, 0, :16, :, :] = 2.0 + 
src_cache.advance(16) + + # Create destination cache with larger seq_len + dst_cache = KVCache( + batch_size=batch_size, num_heads=num_heads, seq_len=64, + head_dim=head_dim, num_layers=num_layers, device="cpu", + ) + + # Prefill + dst_cache.prefill(src_cache) + + # Check position was copied + assert dst_cache.get_pos() == 16 + + # Check data was copied + assert (dst_cache.k_cache[0, 0, :16, :, :] == 1.0).all() + assert (dst_cache.v_cache[0, 0, :16, :, :] == 2.0).all() def test_multi_sample_first_token_diversity(): diff --git a/uv.lock b/uv.lock index 63b2c014..b168a2ff 100644 --- a/uv.lock +++ b/uv.lock @@ -1089,6 +1089,21 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e7/e7/80988e32bf6f73919a113473a604f5a8f09094de312b9d52b79c2df7612b/jupyter_core-5.9.1-py3-none-any.whl", hash = "sha256:ebf87fdc6073d142e114c72c9e29a9d7ca03fad818c5d300ce2adc1fb0743407", size = 29032, upload-time = "2025-10-16T19:19:16.783Z" }, ] +[[package]] +name = "kernels" +version = "0.11.7" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "huggingface-hub" }, + { name = "packaging" }, + { name = "pyyaml" }, + { name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/d6/c8/2d4fea16366d34069af6d4c4f61218f55e5d0daea5d4c24d58849e9fd626/kernels-0.11.7.tar.gz", hash = "sha256:99c3aa518965518902f4dc26053d6051f06abc904ae33d9486c28674a2ea0fa5", size = 50282, upload-time = "2026-01-08T15:41:57.383Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ab/49/e62183353374ec71306ef354781233ac8d12fdfd1cf3d47c875055a99603/kernels-0.11.7-py3-none-any.whl", hash = "sha256:1421791b1e501fcb0a7f0a4d763c5385591756d9d6ed12ed8baa1e0d71bcd21a", size = 46501, upload-time = "2026-01-08T15:41:55.784Z" }, +] + [[package]] name = "kiwisolver" version = "1.4.9" @@ -1478,6 +1493,7 @@ dependencies = [ { name = "datasets" }, { name = 
"fastapi" }, { name = "ipykernel" }, + { name = "kernels" }, { name = "matplotlib" }, { name = "psutil" }, { name = "python-dotenv" }, @@ -1518,6 +1534,7 @@ requires-dist = [ { name = "datasets", specifier = ">=4.0.0" }, { name = "fastapi", specifier = ">=0.117.1" }, { name = "ipykernel", specifier = ">=7.1.0" }, + { name = "kernels", specifier = ">=0.11.7" }, { name = "matplotlib", specifier = ">=3.10.8" }, { name = "psutil", specifier = ">=7.1.0" }, { name = "python-dotenv", specifier = ">=1.2.1" }, From fbc1484e8c2582325e8daa1c1a5000f17aed69e7 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 11 Jan 2026 21:49:54 +0000 Subject: [PATCH 017/119] add alternating window size patterns for the GPT layers, following GPT-3. Experimented a bit and found the pattern SSSL to work well - 3 short, 1 long alternating. This is now the new default and the plots look quite a bit better on flops vs. bpb --- dev/LOG.md | 16 ++++++++ nanochat/checkpoint_manager.py | 7 ++++ nanochat/gpt.py | 70 ++++++++++++++++++++++++++++------ scripts/base_train.py | 3 +- 4 files changed, 83 insertions(+), 13 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index f2322de3..902c1e07 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,22 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-11: Sliding Window Attention + +Added configurable sliding window attention, inspired by GPT-3's alternating short/long pattern. 
+ +**Pattern string configuration:** +- New `--window_pattern` CLI arg and `GPTConfig.window_pattern` field +- Pattern is tiled across layers (e.g., `SSSL` for 20 layers → `SSSLSSSLSSSLSSSLSSSL`) +- Final layer always forced to L (full context) regardless of pattern +- Short window = `sequence_len // 2` +- Long window = `sequence_len` (full context) +- All previous models so far have been simply `L` and checkpoint loading is modified accordingly to fill in this param for old models, see `_patch_missing_config_keys` + +Quick experiments showed `SSSL` (every 4th layer is long) works well - provides a good balance between compute savings and model quality. This is now the default. + +--- + ## 2026-01-11: Flash Attention 3 Integration Replaced PyTorch's `scaled_dot_product_attention` (FA2) with Flash Attention 3 for training and inference. diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index 79ba998f..cca62941 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -20,6 +20,12 @@ def log0(message): if int(os.environ.get('RANK', 0)) == 0: logger.info(message) +def _patch_missing_config_keys(model_config_kwargs): + """Add default values for new config keys missing in old checkpoints.""" + # Old models were trained with full context (no sliding window) + if "window_pattern" not in model_config_kwargs: + model_config_kwargs["window_pattern"] = "L" + def _patch_missing_keys(model_data, model_config): """Add default values for new parameters that may be missing in old checkpoints.""" n_layer = model_config.n_layer @@ -84,6 +90,7 @@ def build_model(checkpoint_dir, step, device, phase): # Hack: fix torch compile issue, which prepends all keys with _orig_mod. 
model_data = {k.removeprefix("_orig_mod."): v for k, v in model_data.items()} model_config_kwargs = meta_data["model_config"] + _patch_missing_config_keys(model_config_kwargs) log0(f"Building model with config: {model_config_kwargs}") model_config = GPTConfig(**model_config_kwargs) _patch_missing_keys(model_data, model_config) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index f22ec076..81ccb0ca 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -39,6 +39,10 @@ class GPTConfig: n_head: int = 6 # number of query heads n_kv_head: int = 6 # number of key/value heads (GQA) n_embd: int = 768 + # Sliding window attention pattern string, tiled across layers. Final layer always L. + # Characters: L=long (full context), S=short (half context) + # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long + window_pattern: str = "L" def norm(x): @@ -69,7 +73,7 @@ class CausalSelfAttention(nn.Module): self.c_v = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False) - def forward(self, x, cos_sin, kv_cache): + def forward(self, x, cos_sin, window_size, kv_cache): B, T, C = x.size() # Project the input to get queries, keys, and values @@ -85,9 +89,10 @@ class CausalSelfAttention(nn.Module): # Attention with Flash Attention 3 # FA3 handles GQA automatically when n_kv_heads < n_heads + # window_size is (left, right) tuple: (N, 0) for causal, (-1, 0) for full context if kv_cache is None: - # Training: simple causal attention - y = flash_attn.flash_attn_func(q, k, v, causal=True) + # Training: causal attention with optional sliding window + y = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=window_size) else: # Inference: use flash_attn_with_kvcache which handles cache management k_cache, v_cache = kv_cache.get_layer_cache(self.layer_idx) @@ -96,6 +101,7 @@ class CausalSelfAttention(nn.Module): k=k, v=v, cache_seqlens=kv_cache.cache_seqlens, causal=True, + 
window_size=window_size, ) # Advance position after last layer processes if self.layer_idx == kv_cache.n_layers - 1: @@ -126,8 +132,8 @@ class Block(nn.Module): self.attn = CausalSelfAttention(config, layer_idx) self.mlp = MLP(config) - def forward(self, x, cos_sin, kv_cache): - x = x + self.attn(norm(x), cos_sin, kv_cache) + def forward(self, x, cos_sin, window_size, kv_cache): + x = x + self.attn(norm(x), cos_sin, window_size, kv_cache) x = x + self.mlp(norm(x)) return x @@ -141,11 +147,14 @@ class GPT(nn.Module): """ super().__init__() self.config = config - # For DDP, we want vocab_size divisible by world_size. Also, there are potential performance benefits, see: + # Compute per-layer window sizes for sliding window attention + # window_size is (left, right) tuple: (-1, 0) for full context, (N, 0) for sliding window + self.window_sizes = self._compute_window_sizes(config) + # Pad vocab for efficiency (DDP, tensor cores). This is just an optimization - outputs are cropped in forward(). # https://huggingface.co/docs/transformers/main_classes/model#transformers.PreTrainedModel.resize_token_embeddings padded_vocab_size = ((config.vocab_size + pad_vocab_size_to - 1) // pad_vocab_size_to) * pad_vocab_size_to if padded_vocab_size != config.vocab_size: - print0(f"Padding vocab_size from {config.vocab_size} to {padded_vocab_size} to be divisible by {pad_vocab_size_to}") + print0(f"Padding vocab_size from {config.vocab_size} to {padded_vocab_size} for efficiency") self.transformer = nn.ModuleDict({ "wte": nn.Embedding(padded_vocab_size, config.n_embd), "h": nn.ModuleList([Block(config, layer_idx) for layer_idx in range(config.n_layer)]), @@ -228,6 +237,35 @@ class GPT(nn.Module): cos, sin = cos[None, :, None, :], sin[None, :, None, :] # add batch and head dims for later broadcasting return cos, sin + def _compute_window_sizes(self, config): + """ + Compute per-layer window sizes for sliding window attention. 
+ + Returns list of (left, right) tuples for FA3's window_size parameter: + - left: how many tokens before current position to attend to (-1 = unlimited) + - right: how many tokens after current position to attend to (0 for causal) + + Pattern string is tiled across layers. Final layer always gets L (full context). + Characters: L=long (full context), S=short (half context) + """ + pattern = config.window_pattern.upper() + assert all(c in "SL" for c in pattern), f"Invalid window_pattern: {pattern}. Use only S and L." + # Map characters to window sizes + long_window = config.sequence_len + short_window = long_window // 2 + char_to_window = { + "L": (long_window, 0), + "S": (short_window, 0), + } + # Tile pattern across layers + window_sizes = [] + for layer_idx in range(config.n_layer): + char = pattern[layer_idx % len(pattern)] + window_sizes.append(char_to_window[char]) + # Final layer always gets full context + window_sizes[-1] = (long_window, 0) + return window_sizes + def get_device(self): return self.transformer.wte.weight.device @@ -236,16 +274,24 @@ class GPT(nn.Module): Return the estimated FLOPs per token for the model (forward + backward). Each matmul weight parameter contributes 2 FLOPs (multiply *, accumulate +) in forward, and 2X that in backward => 2+4=6. Cleanest explanation of this: https://medium.com/@dzmitrybahdanau/the-flops-calculus-of-language-model-training-3b19c1f025e4 - On top of that, the term 12 * l * h * q * t accounts for key @ query matmul flops inside attention. + On top of that, 12 * h * q * effective_seq_len accounts for key @ query matmul flops inside attention. + With sliding windows, effective_seq_len varies per layer (capped by window size). Ref: https://arxiv.org/abs/2204.02311 (PaLM paper). This is ~1% off from the exact formulas of Chinchilla paper, the difference is: - Chinchilla counts the embedding layer as flops (? 
weird, it's just a lookup => we ignore) - Chinchilla counts exp/sum/divide in attention softmax as flops (a little sus and very tiny => we ignore) """ nparams = sum(p.numel() for p in self.parameters()) - nparams_embedding = self.transformer.wte.weight.numel() - l, h, q, t = self.config.n_layer, self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len - num_flops_per_token = 6 * (nparams - nparams_embedding) + 12 * l * h * q * t + # Exclude non-matmul params: embeddings and per-layer scalars + nparams_exclude = self.transformer.wte.weight.numel() + self.resid_lambdas.numel() + self.x0_lambdas.numel() + h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len + # Sum attention FLOPs per layer, accounting for sliding window + attn_flops = 0 + for window_size in self.window_sizes: + window = window_size[0] # (left, right) tuple, we use left + effective_seq = t if window < 0 else min(window, t) + attn_flops += 12 * h * q * effective_seq + num_flops_per_token = 6 * (nparams - nparams_exclude) + attn_flops return num_flops_per_token def num_scaling_params(self): @@ -311,7 +357,7 @@ class GPT(nn.Module): x0 = x # save initial normalized embedding for x0 residual for i, block in enumerate(self.transformer.h): x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 - x = block(x, cos_sin, kv_cache) + x = block(x, cos_sin, self.window_sizes[i], kv_cache) x = norm(x) # Forward the lm_head (compute logits) diff --git a/scripts/base_train.py b/scripts/base_train.py index 33274512..9d8ac167 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -42,6 +42,7 @@ parser.add_argument("--depth", type=int, default=20, help="depth of the Transfor parser.add_argument("--aspect_ratio", type=int, default=64, help="model_dim = depth * aspect_ratio") parser.add_argument("--head_dim", type=int, default=128, help="target head dimension for attention") parser.add_argument("--max_seq_len", type=int, default=2048, 
help="max context length") +parser.add_argument("--window_pattern", type=str, default="L", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')") # Training horizon (only one used, in order of precedence) parser.add_argument("--num_iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") parser.add_argument("--target_flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") @@ -139,7 +140,7 @@ if args.depth != 12: # Initialize the Model # Create a new model with random weights -model_config_kwargs = dict(sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim) +model_config_kwargs = dict(sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim, window_pattern=args.window_pattern) with torch.device("meta"): # All tensors are created as meta tensors (they have shape/dtype but no data) model_config = GPTConfig(**model_config_kwargs) From b33e394528103f26c3190b55c11ca4d942f6ad7f Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 11 Jan 2026 21:50:35 +0000 Subject: [PATCH 018/119] oops actually make SSSL the default window pattern --- scripts/base_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index 9d8ac167..7a16276f 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -42,7 +42,7 @@ parser.add_argument("--depth", type=int, default=20, help="depth of the Transfor parser.add_argument("--aspect_ratio", type=int, default=64, help="model_dim = depth * aspect_ratio") parser.add_argument("--head_dim", type=int, default=128, help="target head dimension for attention") parser.add_argument("--max_seq_len", type=int, default=2048, help="max context length") -parser.add_argument("--window_pattern", type=str, default="L", 
help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')") +parser.add_argument("--window_pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')") # Training horizon (only one used, in order of precedence) parser.add_argument("--num_iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") parser.add_argument("--target_flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") From aa95fb2e035d57ef463ac0fe106fec2406d650b3 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 12 Jan 2026 02:54:35 +0000 Subject: [PATCH 019/119] make miniseries more generic and easier to run and less hard coded --- miniseries.sh | 36 +++++++++++++++++++++++------------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/miniseries.sh b/miniseries.sh index 077418ad..0a6947ed 100644 --- a/miniseries.sh +++ b/miniseries.sh @@ -1,29 +1,39 @@ #!/bin/bash # See speedrun.sh for more comments +# Usage: ./miniseries.sh [series_name] +# Example: ./miniseries.sh jan11 +# Default series name is today's date (e.g., jan11) export OMP_NUM_THREADS=1 export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" mkdir -p $NANOCHAT_BASE_DIR -# uv -command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh -[ -d ".venv" ] || uv venv -uv sync --extra gpu -source .venv/bin/activate +# Setup (skip with SKIP_SETUP=1) +if [ -z "$SKIP_SETUP" ]; then + # uv + command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh + [ -d ".venv" ] || uv venv + uv sync --extra gpu + source .venv/bin/activate -# Tokenizer -python -m nanochat.dataset -n 240 -python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768 + # Tokenizer + python -m nanochat.dataset -n 240 + python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768 +else + source .venv/bin/activate +fi +# Series name: from arg, 
env var, or default to today's date (e.g., jan11) +SERIES_NAME="${1:-${SERIES_NAME:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}}" # Depths to train (the "miniseries") DEPTHS=(10 11 12 13 14 15 16 17 18 19 20) # Hardware NPROC_PER_NODE="${NPROC_PER_NODE:-8}" # Logging -WANDB_RUN="${WANDB_RUN:-jan7_miniseries}" +WANDB_RUN="${WANDB_RUN:-${SERIES_NAME}_miniseries}" -RESULTS_DIR="$NANOCHAT_BASE_DIR/jan7_miniseries_results" +RESULTS_DIR="$NANOCHAT_BASE_DIR/${SERIES_NAME}_miniseries_results" mkdir -p "$RESULTS_DIR" RESULTS_FILE="$RESULTS_DIR/results.csv" @@ -37,13 +47,13 @@ log() { } log "==============================================" -log "Jan 7 Miniseries Training" +log "${SERIES_NAME} Miniseries Training" log "==============================================" for d in "${DEPTHS[@]}"; do log "Training d=$d..." - TAG="jan7_miniseries_d${d}" + TAG="${SERIES_NAME}_miniseries_d${d}" START_TIME=$(date +%s) # Train the model with natural horizon (target_param_data_ratio default) @@ -84,7 +94,7 @@ for d in "${DEPTHS[@]}"; do done log "==============================================" -log "Jan 7 Miniseries Complete!" +log "${SERIES_NAME} Miniseries Complete!" log "==============================================" log "Results saved to: $RESULTS_FILE" echo "" From 21608ec51efb57e86ae874ae0b1ced5f605f5ae2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 12 Jan 2026 03:10:13 +0000 Subject: [PATCH 020/119] allow base_loss to report the loss of any arbitrary huggingface model similar to base_eval. had to change dataloader to be a lot better and just take tokenizer, not load the nanochat one. 
much better this way anyway --- nanochat/dataloader.py | 5 +-- nanochat/tokenizer.py | 3 +- scripts/base_loss.py | 76 +++++++++++++++++++++++++++++++++++++----- scripts/base_train.py | 4 +-- 4 files changed, 73 insertions(+), 15 deletions(-) diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index 41368021..20dd88f2 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -5,9 +5,8 @@ import pyarrow.parquet as pq from nanochat.common import get_dist_info from nanochat.dataset import list_parquet_files -from nanochat.tokenizer import get_tokenizer -def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None): +def tokenizing_distributed_data_loader_with_state(tokenizer, B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None): """ Stream pretraining text from parquet files, tokenize, yield training batches. @@ -62,8 +61,6 @@ def tokenizing_distributed_data_loader_with_state(B, T, split, tokenizer_threads # Now emit batches of tokens. needed_tokens = B * T + 1 # +1 is because we also need the target at the last token - # get the tokenizer and the bos token - tokenizer = get_tokenizer() bos_token = tokenizer.get_bos_token_id() # scratch buffer holds the tokens for one iteration token_buffer = deque() # we stream tokens on the right and pop from the left diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py index 726fb2f8..e8ccafa7 100644 --- a/nanochat/tokenizer.py +++ b/nanochat/tokenizer.py @@ -103,9 +103,10 @@ class HuggingFaceTokenizer: def id_to_token(self, id): return self.tokenizer.id_to_token(id) - def _encode_one(self, text, prepend=None, append=None): + def _encode_one(self, text, prepend=None, append=None, num_threads=None): # encode a single string # prepend/append can be either a string of a special token or a token id directly. 
+ # num_threads is ignored (only used by the nanochat Tokenizer for parallel encoding) assert isinstance(text, str) ids = [] if prepend is not None: diff --git a/scripts/base_loss.py b/scripts/base_loss.py index 3dbe68f8..094299a2 100644 --- a/scripts/base_loss.py +++ b/scripts/base_loss.py @@ -5,6 +5,9 @@ Loads a checkpoint, and: Example run as: torchrun --standalone --nproc_per_node=8 -m scripts.base_loss + +To evaluate a HuggingFace model: +python -m scripts.base_loss --hf_path openai-community/gpt2 """ import argparse from contextlib import nullcontext @@ -12,42 +15,98 @@ import torch from nanochat.checkpoint_manager import load_model from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type from nanochat.dataloader import tokenizing_distributed_data_loader -from nanochat.tokenizer import get_token_bytes +from nanochat.tokenizer import get_token_bytes, HuggingFaceTokenizer from nanochat.loss_eval import evaluate_bpb from nanochat.engine import Engine +# ----------------------------------------------------------------------------- +# HuggingFace loading utilities, making the APIs match up to those of nanochat + +class ModelWrapper: + """Lightweight wrapper for a HuggingFace model""" + def __init__(self, model, max_seq_len=None): + self.model = model + self.max_seq_len = max_seq_len + + def __call__(self, input_ids, targets=None, loss_reduction='mean'): + logits = self.model(input_ids).logits + if targets is None: + return logits + else: + loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction) + return loss + + def get_device(self): + return next(self.model.parameters()).device + +def load_hf_model(hf_path: str, device): + print0(f"Loading model from: {hf_path}") + from transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(hf_path) + model.to(device) + model.eval() + max_seq_len = 1024 if "openai-community/gpt2" in 
hf_path else None + model = ModelWrapper(model, max_seq_len=max_seq_len) + tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) + return model, tokenizer + +def get_hf_token_bytes(tokenizer, device="cpu"): + """Compute token_bytes tensor for a HuggingFace tokenizer.""" + vocab_size = tokenizer.tokenizer.get_vocab_size() + token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device) + for token_id in range(vocab_size): + token_str = tokenizer.tokenizer.decode([token_id]) + token_bytes[token_id] = len(token_str.encode('utf-8')) # Count UTF-8 bytes + return token_bytes + # CLI arguments parser = argparse.ArgumentParser(description="Evaluate loss on train/val splits and sample from model") parser.add_argument("--device_batch_size", type=int, default=32, help="per-device batch size") -parser.add_argument("--split_tokens", type=int, default=20*524288, help="number of tokens to evaluate per split") +parser.add_argument("--split_tokens", type=int, default=40*524288, help="number of tokens to evaluate per split") parser.add_argument("--model_tag", type=str, default=None, help="model tag for checkpoint directory") parser.add_argument("--model_step", type=int, default=None, help="model step to load") parser.add_argument("--device_type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") +parser.add_argument("--hf_path", type=str, default=None, help="HuggingFace model path (e.g. 
openai-community/gpt2)") args = parser.parse_args() # Load the base model and the tokenizer device_type = autodetect_device_type() if args.device_type == "" else args.device_type ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) -model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step) -sequence_len = meta["model_config"]["sequence_len"] # could be arbitrary really +print0(f"Device: {device} | DDP rank: {ddp_rank} | DDP local rank: {ddp_local_rank} | DDP world size: {ddp_world_size}") + +if args.hf_path is not None: + # Load HuggingFace model + model, tokenizer = load_hf_model(args.hf_path, device) + sequence_len = model.max_seq_len if model.max_seq_len else 1024 + token_bytes = get_hf_token_bytes(tokenizer, device=device) + model_name = args.hf_path +else: + # Load local nanochat model + model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step) + sequence_len = meta["model_config"]["sequence_len"] + token_bytes = get_token_bytes(device=device) + model_name = f"base_model (step {meta['step']})" + autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() +print0(f"Evaluating model: {model_name}") + # Evaluate the loss on each split tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size assert args.split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step" steps = args.split_tokens // tokens_per_step -token_bytes = get_token_bytes(device=device) bpb_results = {} for split_name in ["train", "val"]: - loader = tokenizing_distributed_data_loader(args.device_batch_size, sequence_len, split_name, device=device) + loader = tokenizing_distributed_data_loader(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) with autocast_ctx: bpb = evaluate_bpb(model, loader, steps, token_bytes) 
print0(f"{split_name} bpb: {bpb:.4f}") bpb_results[split_name] = bpb + print0(f"Model: {model_name}, {split_name} bpb: {bpb:.6f}") -# Master process also samples from the model +# Master process also samples from the model (only for nanochat models) samples = [] -if ddp_rank == 0: +if ddp_rank == 0 and args.hf_path is None: prompts = [ "The capital of France is", "The chemical symbol of gold is", @@ -70,6 +129,7 @@ if ddp_rank == 0: from nanochat.report import get_report get_report().log(section="Base model loss", data=[ { + "model": model_name, "train bpb": bpb_results["train"], "val bpb": bpb_results["val"], }, diff --git a/scripts/base_train.py b/scripts/base_train.py index 7a16276f..c7c5bba0 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -210,8 +210,8 @@ if resuming: # Initialize the DataLoaders for train/val tokens_dir = os.path.join(base_dir, "tokenized_data") dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"] -train_loader = tokenizing_distributed_data_loader_with_state(args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict) -build_val_loader = lambda: tokenizing_distributed_data_loader(args.device_batch_size, args.max_seq_len, split="val", device=device) +train_loader = tokenizing_distributed_data_loader_with_state(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict) +build_val_loader = lambda: tokenizing_distributed_data_loader(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device) x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data # ----------------------------------------------------------------------------- From 4610a838a1746d240fa35dacf13493ba8ea1f97d Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 12 Jan 2026 05:23:47 +0000 Subject: [PATCH 021/119] record negative result 
on MTP --- dev/LOG.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 902c1e07..c7d8b804 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,32 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-12: Multi-Token Prediction (MTP) + +Ported multi-token prediction from modded-nanogpt. Instead of predicting just the next token, predict the next n tokens at each position with weighted loss. + +### Implementation + +- Instead of calling the loss `n_predict` times, uses a fancy batched computation using `unfold` + `gather` + cross-entropy decomposition (`CE = logsumexp - logits[target]`) +- Schedule anneals from 3-token to 1-token prediction: + - 0-33%: `[1.0, 0.5, 0.25→0]` (3rd token fades) + - 33-67%: `[1.0, 0.5→0]` (2nd token fades) + - 67-100%: `[1.0]` (standard next-token) +- Weights normalized to sum to 1 + +### Results (d12) + +| Metric | Baseline | MTP | +|--------|----------|-----| +| GPU Memory | 34 GB | 47 GB | +| MFU | 41% | 40% | +| val/bpb (per step) | baseline | same/slightly worse | +| val/bpb (wall clock) | baseline | noticeably worse | + +**Conclusion:** Negative result for nanochat. The extra memory and compute overhead from predicting multiple tokens doesn't pay off, in fact the results get worse. The auxiliary loss signal may help in other settings (larger models, different architectures?), but for our setup it's pure overhead at the moment. + +--- + ## 2026-01-11: Sliding Window Attention Added configurable sliding window attention, inspired by GPT-3's alternating short/long pattern. From 238353c99802c92759e69e32447f94a2a0c4a12c Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 13 Jan 2026 17:14:29 +0000 Subject: [PATCH 022/119] document my struggle with fp8 integration yesterday, it's not working like i thought it would and i suffered. one day i will return to continue the fight. 
--- dev/LOG.md | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index c7d8b804..79445264 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,67 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-13: FP8 Training for lm_head + +Attempted to use FP8 (8-bit floating point) for the lm_head layer to speed up the large vocab projection matmul. H100 GPUs have FP8 tensor cores that can theoretically provide ~2x speedup over BF16. + +### Implementation Approaches Tried + +**1. Dynamic Scaling (failed)** +- Compute `x.abs().max()` and `w.abs().max()` each forward to determine scales +- Problem: `.item()` calls cause graph breaks with torch.compile +- Tried `@torch._dynamo.allow_in_graph` pattern (like torchao.float8) - worked but no speedup +- Tried `torch.library.custom_op` with float scales - caused NaN gradients after first optimizer step +- Root cause: interaction between custom ops, dynamic scale computation, and torch.compile is fragile + +**2. Static Scaling (partial success)** +- Pre-set scales at init time like modded-nanogpt: `x_scale=10/448, w_scale=0.1/448` +- `grad_scale` computed dynamically from batch size (safe since it's just `1/(B*T)/57344` due to the gradient expression of cross entropy). modded-nanogpt has a bug here probably because they set `grad_scale = 0.75/448`, but grads are in E5M2 so this should probably be `1/57344`, 1 being the amax of any individual element of cross entropy loss, and no normalization by B,T because they use sum reduction not mean reduction. 
+- Uses `torch.library.custom_op` with `@torch.compile` on inner kernels +- This works correctly - no NaNs, proper gradients + +### Results (d12) + +| Metric | BF16 Baseline | FP8 lm_head | +|--------|---------------|-------------| +| GPU Memory | 34 GB | 36 GB | +| tok/sec | baseline | ~1% faster | + +### The Memory Mystery + +FP8 *should* save memory since we store `x_f8` (1 byte) instead of `x` (2 bytes) for backward. But we see 2GB *increase*. Suspected causes: +- `torch.compile` on inner kernels creating extra buffers/specializations +- `torch._scaled_mm` internal workspace allocations +- Custom op registration machinery overhead + +Tried saving original weight `w` (just a reference to parameter) instead of `w_f8` in backward, then re-quantizing on the spot during backward - didn't help. Still saw bump. + +### Microbenchmark vs Reality + +Raw microbenchmark showed promise: +- BF16 matmul: 16.95 ms +- FP8 matmul (static scales): 10.31 ms (1.64x faster) +- FP8 with dynamic scaling: 12.25 ms (1.38x faster) + +But in full training, the ~1% tok/sec improvement doesn't justify the 2GB memory increase and the added code complexity and the need to tune scale factors for both x and w. + +### Code Artifacts + +See the branch `fp8_attempt_fail` for: + +- `nanochat/fp8_static.py` - Static scaling implementation (working) +- `nanochat/fp8_dynamic.py` - Dynamic scaling implementation (torchao-style, working but slow) +- `gpt.py` imports `fp8_static.LinearFP8` and simply swaps it for `lm_head` in `gpt.py`. + +### Open Questions + +- Why does the custom op approach use more memory than vanilla BF16? +- Why is the bump in tok_per_sec so low? We should see ~1.6X speedup in both the forward pass and also (twice) in backward pass for the gradients. Granted, Amdahl's law is part of the solution because our vocab_size is only 32K so the final layer isn't a huge part of the profile but the expected speedup is still not fully realized. + +**Conclusion:** Negative result for now. 
The implementation works correctly but provides marginal speedup with *increased* memory usage. I'm not understanding the torch.compile interaction here. The complexity of FP8 custom ops isn't justified for lm_head alone. TODO to study in more detail the way this is implemented in other libraries, e.g. torchao. + +--- + ## 2026-01-12: Multi-Token Prediction (MTP) Ported multi-token prediction from modded-nanogpt. Instead of predicting just the next token, predict the next n tokens at each position with weighted loss. From 64b48d0e5c502f56d9bfd9af8a5c2a5e901bf1ba Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 13 Jan 2026 17:45:06 +0000 Subject: [PATCH 023/119] validated that \p{N}{1,2} is the correct number of digits to group up to in the regex pattern of the GPT-4 tokenizer (2 down from 3), leading to the best val_bpb for 32K vocabs --- dev/LOG.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 79445264..47081994 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,21 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-13: Number Token Split Pattern + +Validated the `\p{N}{1,2}` pattern in `SPLIT_PATTERN` (tokenizer.py line 30), which I only guessed earlier and had a TODO for to validate. GPT-4 uses `\p{N}{1,3}` to group number sequences of up to 3 digits into tokens, but we suspected smaller vocab sizes benefit from grouping fewer digits per token. + +**Results (d12, vocab=32K):** +| Pattern | val_bpb | +|---------|---------| +| `\p{N}{1,1}` | 0.969 | +| `\p{N}{1,2}` | **0.965** | +| `\p{N}{1,3}` | 0.972 | + +**Conclusion:** `{1,2}` is optimal for vocab size 32K. Grouping 3 digits wastes tokens on rare 3-digit combinations; grouping 1 digit is too fine-grained and bloats token sequences. Keeping `{1,2}` as default. 
+ +--- + ## 2026-01-13: FP8 Training for lm_head Attempted to use FP8 (8-bit floating point) for the lm_head layer to speed up the large vocab projection matmul. H100 GPUs have FP8 tensor cores that can theoretically provide ~2x speedup over BF16. From 23985413aaa30393802f1dbad67c80e698e9bb5a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 13 Jan 2026 17:50:39 +0000 Subject: [PATCH 024/119] adjust the comment on the regex pattern per recent experiment see dev/LOG.md --- nanochat/tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/tokenizer.py b/nanochat/tokenizer.py index e8ccafa7..a2146c2e 100644 --- a/nanochat/tokenizer.py +++ b/nanochat/tokenizer.py @@ -26,7 +26,7 @@ SPECIAL_TOKENS = [ # NOTE: this split pattern deviates from GPT-4 in that we use \p{N}{1,2} instead of \p{N}{1,3} # I did this because I didn't want to "waste" too many tokens on numbers for smaller vocab sizes. -# I haven't validated that this is actually a good idea, TODO. +# I verified that 2 is the sweet spot for vocab size of 32K. 1 is a bit worse, 3 was worse still. SPLIT_PATTERN = r"""'(?i:[sdmt]|ll|ve|re)|[^\r\n\p{L}\p{N}]?+\p{L}+|\p{N}{1,2}| ?[^\s\p{L}\p{N}]++[\r\n]*|\s*[\r\n]|\s+(?!\S)|\s+""" # ----------------------------------------------------------------------------- From 43c29dd9d56b43b9ce8165fb112b676159f63a52 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 13 Jan 2026 20:05:47 +0000 Subject: [PATCH 025/119] Big DataLoader refactor: BOS-aligned dataloaders with epoch tracking for pre/mid-training The new DataLoader ensures that every token sequence in train/val batches has a BOS token at the beginning. Therefore, no token streams start abruptly in the middle of a document, which could be confusing for the model. Note that this changes the loss scale because there are fewer confusing tokens in the train/val batches. The main downside is that we now waste about 35% of tokens due to cropping. 
This is ok because we have a lot of data. See dev/LOG.md entry for this change for a lot more information. --- dev/LOG.md | 64 +++++++++++ miniseries.sh | 5 +- nanochat/dataloader.py | 239 +++++++++++++++++++++++++++++------------ run1000.sh | 8 +- scripts/base_train.py | 10 +- scripts/mid_train.py | 102 +++++++++++++----- speedrun.sh | 8 +- 7 files changed, 330 insertions(+), 106 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index 47081994..785eccd3 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,70 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-13: BOS-Aligned Dataloader with Bin Packing + +Redesigned the pretraining and midtraining dataloader to ensure every sequence starts with a BOS token, and explored bin-packing algorithms to minimize wasted tokens. + +### Problem Statement + +The original dataloader streams tokens into a flat buffer and reshapes into batches. This means some rows start mid-document (no BOS), which could confuse the model during training. We want every row to start with BOS and contain well-formed documents. 
+ +### Approach 1: Greedy-Crop BOS (Simple) + +Each row is built independently: +- Start with a document (which has BOS prepended) +- Pack more documents until row is full +- If a document doesn't fit, **crop it** to fill remaining space (discard the rest) +- 100% utilization (no padding), but wastes cropped tokens + +### Waste Analysis + +Measured token waste empirically on real data (T=2048): +- **39.4% of tokens are cropped** (discarded when docs don't fit) +- **22.9% is the theoretical minimum** (tokens in docs longer than T+1 that can never fit) +- The extra ~16.5% comes from "unlucky" cropping when a long doc starts near the end of a row + +### Bin Packing Algorithms Explored + +| Algorithm | Util% | Crop% | Pad% | Notes | +|-----------|-------|-------|------|-------| +| Greedy-Crop (baseline) | 100% | 39.4% | 0% | Simple, no wasted compute | +| Greedy-Pad | 78% | 23.0% | 22% | Pads instead of crops - wastes compute | +| First-Fit Decreasing (FFD) | 99.7% | 23.0% | 0.3% | Near-optimal packing, minimal padding | +| **BestFit-Crop** | 100% | 34.6% | 0% | Smart cropping, no padding | + +### BestFit-Crop Algorithm + +A middle ground that maintains 100% utilization while reducing cropping: + +1. Buffer N documents +2. For each row, greedily pick the **largest doc that fits entirely** +3. Repeat until nothing fits +4. When nothing fits, crop a doc to fill remaining space exactly + +This avoids "unlucky" crops by searching the buffer for better-fitting documents. + +**Results (T=2048):** +- Crop waste reduced from 39.4% → 34.6% (~12% relative improvement) +- Still achieves 100% utilization (no padding, every token trains) +- Slightly more rows than baseline (uses more documents per batch) + +### Decision: Keep Two Implementations + +1. 
Keep the original implementation which is very simple, efficient and has 100% token utilization in the batch (no padding with ignore tokens), but creates slightly more confusing token streams for the LLM because documents during training can start abruptly from the middle with no context. Note that this never happens at test time, where BOS is always present. + +2. **`_bos_bestfit` (BestFit-Crop, new default)**: Slightly more complex but still keeps 100% token utilization in the batch (no padding), but at the cost of discarding documents when they don't fit. In practice, about 34% of tokens are discarded with this approach. This is ok because for most models we care about we have plenty of data without having to go to multiple epochs. One more subtle effect is that it does skew the data distribution a tiny bit because, reliably and necessarily, tokens at the tails of long documents will be discarded. However, this doesn't seem to impact actual downstream performance. + +### Midtraining + +The midtraining dataloader was also updated. Because conversations are on average a lot shorter than pretraining documents, only about 3.3% of tokens get cropped. + +### NOTE: loss scale + +Do note that switching to the BOS dataloader changes the validation loss and makes all previous experiments not comparable in absolute value of the loss, because we have a lot fewer "confusing" tokens in the train/val batches. All tokens can look back and find the BOS token and have the full context of that document to make predictions. Therefore, the loss appears lower but this is "fake" to some extent, and the expectation is that the vast majority of relative comparisons done so far would agree with those before and after this change. + +--- + ## 2026-01-13: Number Token Split Pattern Validated the `\p{N}{1,2}` pattern in `SPLIT_PATTERN` (tokenizer.py line 30), which I only guessed earlier and had a TODO for to validate. 
GPT-4 uses `\p{N}{1,3}` to group number sequences of up to 3 digits into tokens, but we suspected smaller vocab sizes benefit from grouping fewer digits per token. diff --git a/miniseries.sh b/miniseries.sh index 0a6947ed..4d6f4360 100644 --- a/miniseries.sh +++ b/miniseries.sh @@ -17,8 +17,9 @@ if [ -z "$SKIP_SETUP" ]; then uv sync --extra gpu source .venv/bin/activate - # Tokenizer - python -m nanochat.dataset -n 240 + # Tokenizer, download 1000 shards for pretraining + # (probably this can be reduced but it's tricky to determine the exact right number, TODO). + python -m nanochat.dataset -n 1000 python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768 else source .venv/bin/activate diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index 20dd88f2..562d517e 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -1,4 +1,25 @@ -from collections import deque +""" +Distributed dataloaders for pretraining. + +Two implementations are provided: + +1. Original (tokenizing_distributed_data_loader): + - Streams tokens into a flat buffer, reshapes to (B, T) + - Rows may start mid-document (no guaranteed BOS at position 0) + - 100% token utilization, simple and efficient + +2. BOS-aligned bestfit (tokenizing_distributed_data_loader_bos_bestfit): + - Every row starts with BOS token + - Documents packed using best-fit algorithm to minimize cropping + - When no document fits remaining space, crops a document to fill exactly + - 100% utilization (no padding), ~35% tokens cropped at T=2048 + +The tradeoff: BOS-aligned loses ~35% of tokens to cropping, but ensures that +there are fewer "confusing" tokens in the train/val batches as every token can +now attend back to the BOS token and sees the full context of the document. +(2) is the new default if you have enough data. +Fallback to (1) if you have very limited data AND long documents. 
+""" import torch import pyarrow.parquet as pq @@ -6,86 +27,172 @@ import pyarrow.parquet as pq from nanochat.common import get_dist_info from nanochat.dataset import list_parquet_files +def _document_batches(split, resume_state_dict, tokenizer_batch_size): + """ + Infinite iterator over document batches (list of text strings) from parquet files. + + Handles DDP sharding and approximate resume. Each yield is (text_batch, (pq_idx, rg_idx, epoch)) + where text_batch is a list of document strings, indices track position for resumption, + and epoch counts how many times we've cycled through the dataset (starts at 1). + """ + ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() + + parquet_paths = list_parquet_files() + assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?" + parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:] + + resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0 + resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None + resume_epoch = resume_state_dict.get("epoch", 1) if resume_state_dict is not None else 1 + first_pass = True + pq_idx = resume_pq_idx + epoch = resume_epoch + + while True: # iterate infinitely (multi-epoch) + pq_idx = resume_pq_idx if first_pass else 0 + while pq_idx < len(parquet_paths): + filepath = parquet_paths[pq_idx] + pf = pq.ParquetFile(filepath) + # Start from resume point if resuming on same file, otherwise from DDP rank + if first_pass and (resume_rg_idx is not None) and (pq_idx == resume_pq_idx): + base_idx = resume_rg_idx // ddp_world_size + base_idx += 1 # advance by 1 so we don't repeat data after resuming + rg_idx = base_idx * ddp_world_size + ddp_rank + if rg_idx >= pf.num_row_groups: + pq_idx += 1 + continue + resume_rg_idx = None # only do this once + else: + rg_idx = ddp_rank + while rg_idx < pf.num_row_groups: + rg = pf.read_row_group(rg_idx) + batch = 
rg.column('text').to_pylist() + for i in range(0, len(batch), tokenizer_batch_size): + yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx, epoch) + rg_idx += ddp_world_size + pq_idx += 1 + first_pass = False + epoch += 1 + + def tokenizing_distributed_data_loader_with_state(tokenizer, B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None): """ Stream pretraining text from parquet files, tokenize, yield training batches. - This implementation became a bit more complex because we wish to support approximate resume training. - Instead of turning this into a Class, we opt to return the state_dict with every batch, - and then the caller can pass in a state_dict to resume training from a desired point. - Note that this resumption is atm only *approximate* for simplicity. - We won't repeat the same documents but we might skip a few. - The state_dict that is returned can be later passed into this function via `resume_state_dict` to approximately resume. + This is the original dataloader that streams tokens into a flat buffer and reshapes. + Rows may start mid-document (no guaranteed BOS at position 0). - Perfect state resumption is possible but would be a lot more bloated, probably not worth it atm. + Supports approximate resume via state_dict. """ assert split in ["train", "val"], "split must be 'train' or 'val'" - # infinite iterator over document batches (list of text strings) - ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() - def document_batches(): - parquet_paths = list_parquet_files() - assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?" 
- parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:] - resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0 - resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None - first_pass = True - pq_idx = resume_pq_idx # we kick off parquet files at the resume index (or by default just 0) - while True: # iterate infinitely (multi-epoch) - pq_idx = resume_pq_idx if first_pass else 0 - while pq_idx < len(parquet_paths): # iterate over all parquet files - filepath = parquet_paths[pq_idx] - pf = pq.ParquetFile(filepath) - # Start from resume point if resuming on same file, otherwise from DDP rank - # I know this state resumption is a little bit tricky and a little bit hacky... sigh. - if first_pass and (resume_rg_idx is not None) and (pq_idx == resume_pq_idx): - base_idx = resume_rg_idx // ddp_world_size # in units of ddp_world_size - base_idx += 1 # advance by 1 so that we definitely don't repeat data after resuming - rg_idx = base_idx * ddp_world_size + ddp_rank - if rg_idx >= pf.num_row_groups: - pq_idx += 1 - continue - resume_rg_idx = None # set to None as we only want to do this a single time - else: - rg_idx = ddp_rank - while rg_idx < pf.num_row_groups: - rg = pf.read_row_group(rg_idx) - batch = rg.column('text').to_pylist() # each batch is a parquet group, e.g. 1024 rows - # the tokenizer encode might want to go in even smaller batches, e.g. 128 rows - for i in range(0, len(batch), tokenizer_batch_size): - yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx) - rg_idx += ddp_world_size # advance to the next row group (in DDP) - pq_idx += 1 # advance to the next parquet file - first_pass = False - batches = document_batches() - - # Now emit batches of tokens. 
- needed_tokens = B * T + 1 # +1 is because we also need the target at the last token + batches = _document_batches(split, resume_state_dict, tokenizer_batch_size) + needed_tokens = B * T + 1 # +1 for target at last position bos_token = tokenizer.get_bos_token_id() - # scratch buffer holds the tokens for one iteration - token_buffer = deque() # we stream tokens on the right and pop from the left + token_buffer = [] + pq_idx, rg_idx, epoch = 0, 0, 1 + while True: - # Accumulate enough tokens for one iteration before yielding. + + # Accumulate enough tokens while len(token_buffer) < needed_tokens: - doc_batch, (pq_idx, rg_idx) = next(batches) + doc_batch, (pq_idx, rg_idx, epoch) = next(batches) token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads) for tokens in token_lists: token_buffer.extend(tokens) - # Move tokens from the deque into the scratch buffer - tokens = [token_buffer.popleft() for _ in range(needed_tokens)] - # CUDA supports memory pinning for asynchronous transfers between CPU and GPU - use_cuda_optimizations = device == "cuda" - scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda_optimizations) # in PyTorch, long=int64 - # Create the inputs/targets as 1D tensors - inputs_cpu = scratch[:-1] - targets_cpu = scratch[1:] - # Reshape to 2D and move to GPU async - inputs = inputs_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations) - targets = targets_cpu.view(B, T).to(device=device, non_blocking=use_cuda_optimizations) - state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx} # we need this in case we wish to approximately resume training - yield inputs, targets, state_dict + tokens = token_buffer[:needed_tokens] # Read B*T+1 tokens (+1 is only for the target for the last token) + token_buffer = token_buffer[B*T:] # Advance by B*T tokens, so we move exactly one window of B*T tokens over + + # Package tokens into inputs and targets, yield + use_cuda = device == "cuda" + scratch = 
torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda) + inputs = scratch[:-1].view(B, T).to(device=device, non_blocking=use_cuda) + targets = scratch[1:].view(B, T).to(device=device, non_blocking=use_cuda) + yield inputs, targets, {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch} + def tokenizing_distributed_data_loader(*args, **kwargs): - # helper function that only emits the inputs/targets and not the state_dict + """Helper that omits state_dict from yields.""" for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs): yield inputs, targets + + +def tokenizing_distributed_data_loader_with_state_bos_bestfit( + tokenizer, B, T, split, + tokenizer_threads=4, tokenizer_batch_size=128, + device="cuda", resume_state_dict=None, + buffer_size=1000 +): + """ + BOS-aligned dataloader with Best-Fit Cropping. + + Reduces token waste compared to simple greedy cropping by searching a buffer + for documents that fit well, while maintaining 100% utilization (no padding). + + Algorithm for each row: + 1. From buffered docs, pick the LARGEST doc that fits entirely + 2. Repeat until no doc fits + 3. 
When nothing fits, crop a doc to fill remaining space exactly + + Key properties: + - Every row starts with BOS + - 100% utilization (no padding, every token is trained on) + - Approximately 35% of all tokens are discarded due to cropping + """ + assert split in ["train", "val"], "split must be 'train' or 'val'" + + row_capacity = T + 1 + batches = _document_batches(split, resume_state_dict, tokenizer_batch_size) + bos_token = tokenizer.get_bos_token_id() + doc_buffer = [] + pq_idx, rg_idx, epoch = 0, 0, 1 + + def refill_buffer(): + nonlocal pq_idx, rg_idx, epoch + doc_batch, (pq_idx, rg_idx, epoch) = next(batches) + token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads) + for tokens in token_lists: + doc_buffer.append(tokens) + + while True: + rows = [] + for _ in range(B): + row = [] + while len(row) < row_capacity: + # Ensure buffer has documents + while len(doc_buffer) < buffer_size: + refill_buffer() + + remaining = row_capacity - len(row) + + # Find largest doc that fits entirely + best_idx = -1 + best_len = 0 + for i, doc in enumerate(doc_buffer): + doc_len = len(doc) + if doc_len <= remaining and doc_len > best_len: + best_idx = i + best_len = doc_len + + if best_idx >= 0: + doc = doc_buffer.pop(best_idx) + row.extend(doc) + else: + # No doc fits - crop first doc to fill remaining + doc = doc_buffer.pop(0) + row.extend(doc[:remaining]) + + rows.append(row[:row_capacity]) + + use_cuda = device == "cuda" + batch_tensor = torch.tensor(rows, dtype=torch.long, pin_memory=use_cuda) + inputs = batch_tensor[:, :-1].to(device=device, non_blocking=use_cuda) + targets = batch_tensor[:, 1:].to(device=device, non_blocking=use_cuda) + + yield inputs, targets, {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch} + + +def tokenizing_distributed_data_loader_bos_bestfit(*args, **kwargs): + """Helper that omits state_dict from yields.""" + for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state_bos_bestfit(*args, 
**kwargs): + yield inputs, targets diff --git a/run1000.sh b/run1000.sh index a7a3716e..fe92edfc 100644 --- a/run1000.sh +++ b/run1000.sh @@ -20,8 +20,8 @@ curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-publ # train tokenizer on ~4B characters and kick off download of the rest for pretraining python -m nanochat.dataset -n 16 -# start downloading the rest of the shards for a total of 800 (see below why 800) -python -m nanochat.dataset -n 800 & +# start downloading the rest of the shards for a total of 1200 (see below why 1200) +python -m nanochat.dataset -n 1200 & # todo: download the rest of it python -m scripts.tok_train --max_chars=4000000000 --vocab_size=65536 python -m scripts.tok_eval @@ -62,7 +62,9 @@ python -m scripts.tok_eval # The tok_eval.py script reports about ~4.8 chars/token on average for the default tokenizer settings. # So ~38B tokens # ~4.8 chars/token = ~185B chars. # Each data shard is ~250M chars, so we need ~185B / 250M ~= 740 shards. -# For safety, I bumped that up to 800 shards, and that's why up above I used -n 800 when pre-downloading dataset shards. +# For safety, I bumped that up to 800 shards. +# The new DataLoader wastes about 35% of tokens to cropping, so 800 / (1 - 0.35) ~= 1200 shards are needed. +# => why up above I used -n 1200 when pre-downloading dataset shards. # If we didn't have enough data, the training script would loop around and do multiple epochs over the same data, # which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd # start to overfit hard. 
diff --git a/scripts/base_train.py b/scripts/base_train.py index c7c5bba0..a432e7ab 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -21,7 +21,7 @@ import wandb import torch from nanochat.gpt import GPT, GPTConfig -from nanochat.dataloader import tokenizing_distributed_data_loader, tokenizing_distributed_data_loader_with_state +from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit, tokenizing_distributed_data_loader_with_state_bos_bestfit from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type from nanochat.tokenizer import get_tokenizer, get_token_bytes from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint @@ -210,8 +210,8 @@ if resuming: # Initialize the DataLoaders for train/val tokens_dir = os.path.join(base_dir, "tokenized_data") dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"] -train_loader = tokenizing_distributed_data_loader_with_state(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict) -build_val_loader = lambda: tokenizing_distributed_data_loader(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device) +train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict) +build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device) x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data # ----------------------------------------------------------------------------- @@ -395,7 +395,8 @@ while True: eta_str = f" | eta: {eta_seconds/60:.1f}m" else: eta_str = "" - print0(f"step {step:05d}/{num_iterations:05d} 
({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m{eta_str}") + epoch = dataloader_state_dict["epoch"] + print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}") if step % 100 == 0: log_data = { "step": step, @@ -406,6 +407,7 @@ while True: "train/dt": dt, "train/tok_per_sec": tok_per_sec, "train/mfu": mfu, + "train/epoch": epoch, } wandb_run.log(log_data) diff --git a/scripts/mid_train.py b/scripts/mid_train.py index d684b9f7..0742c087 100644 --- a/scripts/mid_train.py +++ b/scripts/mid_train.py @@ -10,7 +10,6 @@ torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_ """ import argparse -from collections import deque import os os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" import time @@ -125,49 +124,95 @@ val_dataset = TaskMixture([ # these two global variables and update them from within the data generator. last_step = False # we will toggle this to True when we reach the end of the training dataset approx_progress = 0.0 # will go from 0 to 1 over the course of the epoch -def mid_data_generator(split): - global last_step, approx_progress +current_epoch = 1 # track epoch for logging +def mid_data_generator_bos_bestfit(split, buffer_size=100): + """ + BOS-aligned dataloader for midtraining with bestfit-crop packing. + + Each row in the batch starts with BOS (beginning of a conversation). + Conversations are packed using best-fit algorithm to minimize cropping. + This matches the BOS-aligned approach used in pretraining. 
+ """ + global last_step, approx_progress, current_epoch assert split in {"train", "val"}, "split must be 'train' or 'val'" dataset = train_dataset if split == "train" else val_dataset dataset_size = len(dataset) assert dataset_size > 0 - needed_tokens = args.device_batch_size * args.max_seq_len + 1 # to form one training batch of inputs,targets - token_buffer = deque() - # CUDA supports memory pinning for faster transfers between CPU and GPU: - scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=(device_type == "cuda")) - cursor = ddp_rank # increments by ddp_world_size each time, so each rank processes unique documents - it = 0 # iteration counter - while True: - # Accumulate enough tokens for one iteration before yielding - while len(token_buffer) < needed_tokens: + row_capacity = args.max_seq_len + 1 # +1 for target at last position + + # Conversation buffer: list of token lists + conv_buffer = [] + cursor = ddp_rank # Each rank processes different conversations + epoch = 1 + it = 0 # iteration counter + + def refill_buffer(): + nonlocal cursor, epoch + while len(conv_buffer) < buffer_size: conversation = dataset[cursor] ids, _ = tokenizer.render_conversation(conversation) - token_buffer.extend(ids) + conv_buffer.append(ids) cursor += ddp_world_size if cursor >= dataset_size: - cursor -= dataset_size # wrap around for another epoch + cursor = cursor % dataset_size + epoch += 1 if split == "train": - last_step = True # toggle last_step to True, which will terminate the training loop + last_step = True # toggle last_step to True, which will terminate the training loop + + while True: + rows = [] + for _ in range(args.device_batch_size): + row = [] + while len(row) < row_capacity: + # Ensure buffer has conversations + while len(conv_buffer) < buffer_size: + refill_buffer() + + remaining = row_capacity - len(row) + + # Find largest conversation that fits entirely + best_idx = -1 + best_len = 0 + for i, conv in enumerate(conv_buffer): + conv_len = 
len(conv) + if conv_len <= remaining and conv_len > best_len: + best_idx = i + best_len = conv_len + + if best_idx >= 0: + # Found a conversation that fits - use it entirely + conv = conv_buffer.pop(best_idx) + row.extend(conv) + else: + # No conversation fits - crop first conversation to fill remaining + conv = conv_buffer.pop(0) + row.extend(conv[:remaining]) + + rows.append(row[:row_capacity]) + # Stopping condition to respect num_iterations, if given it += 1 if 0 < args.num_iterations <= it and split == "train": - last_step = True # toggle last_step to True, which will terminate the training loop - # Build up inputs/targets and yield - for i in range(needed_tokens): - scratch[i] = token_buffer.popleft() - inputs_cpu = scratch[:-1].to(dtype=torch.int32) - targets_cpu = scratch[1:] - inputs = inputs_cpu.view(args.device_batch_size, args.max_seq_len).to(device=device, dtype=torch.int32, non_blocking=True) - targets = targets_cpu.view(args.device_batch_size, args.max_seq_len).to(device=device, dtype=torch.int64, non_blocking=True) + last_step = True + + # Update progress tracking if split == "train": + current_epoch = epoch if args.num_iterations > 0: - approx_progress = it / args.num_iterations # calculate progress from the max number of iterations + approx_progress = it / args.num_iterations else: - approx_progress = cursor / dataset_size # approximate progress as a fraction of the dataset + approx_progress = cursor / dataset_size + + # Build tensors + use_cuda = device_type == "cuda" + batch_tensor = torch.tensor(rows, dtype=torch.long, pin_memory=use_cuda) + inputs = batch_tensor[:, :-1].to(device=device, dtype=torch.int32, non_blocking=use_cuda) + targets = batch_tensor[:, 1:].to(device=device, dtype=torch.int64, non_blocking=use_cuda) + yield inputs, targets -train_loader = mid_data_generator("train") -build_val_loader = lambda: mid_data_generator("val") +train_loader = mid_data_generator_bos_bestfit("train") +build_val_loader = lambda: 
mid_data_generator_bos_bestfit("val") progress = 0 # will go from 0 to 1 over the course of the epoch # Learning rate scheduler @@ -285,7 +330,7 @@ while True: mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in % if step > 10: total_training_time += dt # only count the time after the first 10 steps - print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | total time: {total_training_time/60:.2f}m") + print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {current_epoch} | total time: {total_training_time/60:.2f}m") if step % 10 == 0: wandb_run.log({ "step": step, @@ -296,6 +341,7 @@ while True: "train/dt": dt, "train/tok_per_sec": tok_per_sec, "train/mfu": mfu, + "train/epoch": current_epoch, }) # print a few more stats diff --git a/speedrun.sh b/speedrun.sh index f9be2271..76ccf214 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -55,8 +55,8 @@ python -m nanochat.report reset # each shard is ~100MB of text (compressed), so this is about ~800MB of data on disk python -m nanochat.dataset -n 8 # Immediately also kick off downloading more shards in the background while tokenizer trains -# See comment below for why 240 is the right number here -python -m nanochat.dataset -n 240 & +# See comment below for why 370 is the right number here +python -m nanochat.dataset -n 370 & DATASET_DOWNLOAD_PID=$! # train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data python -m scripts.tok_train --max_chars=2000000000 --vocab_size=65536 @@ -70,7 +70,9 @@ python -m scripts.tok_eval # Chinchilla says #tokens = 20X #params, so we need 561e6 * 20 = 11.2B tokens. # Assume our tokenizer is 4.8 chars/token, this is 11.2B * 4.8 ~= 54B chars. # At 250M chars/shard, this is 54B / 250M ~= 216 shards needed for pretraining. 
-# Round up to 240 for safety. At ~100MB/shard, this downloads ~24GB of data to disk. +# Round up to 240 for safety. Also, the new DataLoader wastes about 35% of tokens to cropping +# so 240 / (1 - 0.35) = 370 shards are needed. +# At ~100MB/shard, this downloads ~37GB of data to disk. # (The total number of shards available in the entire dataset is 1822.) echo "Waiting for dataset download to complete..." wait $DATASET_DOWNLOAD_PID From f92efce1698860a022621107a59702ea298e4fbd Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 13 Jan 2026 21:33:54 +0000 Subject: [PATCH 026/119] add negative result about not allowing attention across BOS tokens. A lot more code complexity for basically no gain in performance --- dev/LOG.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 785eccd3..5f6e1d7f 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,36 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-13: Varlen Attention (Negative Result) + +Attempted to prevent attention from "leaking" across document boundaries using Flash Attention's `flash_attn_varlen_func`, similar to modded-nanogpt's approach. + +### Background + +With the BOS-aligned dataloader, multiple documents are packed into each row. Standard attention allows tokens to attend across document boundaries within a row. The hypothesis was that preventing this "leakage" via varlen attention might improve training. + +### Approach: Compute cu_seqlens from inputs + +- Find BOS positions: `(inputs.view(-1) == bos_token_id).nonzero()` +- Gotcha 1: Variable-length `cu_seqlens` caused torch.compile recompilation (25s/iter!) 
- fixed by padding to fixed size +- Gotcha 2: `nonzero()` inside compiled model hit recompile limit - fixed by moving computation outside compiled region + +### Final Results (d16) + +| Metric | Baseline | Varlen | +|--------|----------|--------| +| val_bpb | 0.85427 | 0.85407 | +| MFU | ~same | ~same | +| tok/sec | ~same | ~same | + +Essentially identical. The 0.0002 bpb improvement is almost noise. + +### Conclusion + +Not worth the code complexity. The "leakage" across document boundaries within a row is not harmful - the model handles it fine. The BOS-aligned dataloader already provides the key benefit (every row starts with proper context). Not merging to master. + +--- + ## 2026-01-13: BOS-Aligned Dataloader with Bin Packing Redesigned the pretraining and midtraining dataloader to ensure every sequence starts with a BOS token, and explored bin-packing algorithms to minimize wasted tokens. From 3b50b77ed38c77be46406f925fccd79adca1fcda Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 13 Jan 2026 22:09:36 +0000 Subject: [PATCH 027/119] fix base_loss to report correct loss by switching the dataloader to the new default --- nanochat/checkpoint_manager.py | 3 +++ scripts/base_loss.py | 4 ++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index cca62941..c008ec2e 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -25,6 +25,7 @@ def _patch_missing_config_keys(model_config_kwargs): # Old models were trained with full context (no sliding window) if "window_pattern" not in model_config_kwargs: model_config_kwargs["window_pattern"] = "L" + log0(f"Patching missing window_pattern in model config to 'L'") def _patch_missing_keys(model_data, model_config): """Add default values for new parameters that may be missing in old checkpoints.""" @@ -32,9 +33,11 @@ def _patch_missing_keys(model_data, model_config): # resid_lambdas defaults to 1.0 (identity 
scaling) if "resid_lambdas" not in model_data: model_data["resid_lambdas"] = torch.ones(n_layer) + log0(f"Patching missing resid_lambdas in model data to 1.0") # x0_lambdas defaults to 0.0 (disabled) if "x0_lambdas" not in model_data: model_data["x0_lambdas"] = torch.zeros(n_layer) + log0(f"Patching missing x0_lambdas in model data to 0.0") def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0): if rank == 0: diff --git a/scripts/base_loss.py b/scripts/base_loss.py index 094299a2..46544d43 100644 --- a/scripts/base_loss.py +++ b/scripts/base_loss.py @@ -14,7 +14,7 @@ from contextlib import nullcontext import torch from nanochat.checkpoint_manager import load_model from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type -from nanochat.dataloader import tokenizing_distributed_data_loader +from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit from nanochat.tokenizer import get_token_bytes, HuggingFaceTokenizer from nanochat.loss_eval import evaluate_bpb from nanochat.engine import Engine @@ -97,7 +97,7 @@ assert args.split_tokens % tokens_per_step == 0, "split_tokens must be divisible steps = args.split_tokens // tokens_per_step bpb_results = {} for split_name in ["train", "val"]: - loader = tokenizing_distributed_data_loader(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) + loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) with autocast_ctx: bpb = evaluate_bpb(model, loader, steps, token_bytes) print0(f"{split_name} bpb: {bpb:.4f}") From 7312ec98985a8b478fc98ef54b30bdc0baed1989 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 13 Jan 2026 22:45:27 +0000 Subject: [PATCH 028/119] fix buggy midtrain and update all kwargs to be idiomatic. that is, argparse uses dashes variables use underscores. 
the underscores are just a remnant of the previous Configurator object. This is the right way --- dev/runcpu.sh | 44 ++++++++++++++++---------------- miniseries.sh | 16 ++++++------ run1000.sh | 10 ++++---- scaling_laws.sh | 16 ++++++------ scripts/base_loss.py | 14 +++++------ scripts/base_train.py | 58 +++++++++++++++++++++---------------------- scripts/chat_rl.py | 34 ++++++++++++------------- scripts/chat_sft.py | 32 ++++++++++++------------ scripts/mid_train.py | 49 ++++++++++++++++++++---------------- scripts/tok_train.py | 6 ++--- speedrun.sh | 4 +-- 11 files changed, 144 insertions(+), 139 deletions(-) diff --git a/dev/runcpu.sh b/dev/runcpu.sh index c4a719e4..c0b32a54 100755 --- a/dev/runcpu.sh +++ b/dev/runcpu.sh @@ -25,7 +25,7 @@ python -m nanochat.report reset # train tokenizer on ~1B characters python -m nanochat.dataset -n 4 -python -m scripts.tok_train --max_chars=1000000000 +python -m scripts.tok_train --max-chars=1000000000 python -m scripts.tok_eval # train a very small 4 layer model on the CPU @@ -33,37 +33,37 @@ python -m scripts.tok_eval # we only run 50 steps of optimization (bump this to get better results) python -m scripts.base_train \ --depth=4 \ - --max_seq_len=1024 \ - --device_batch_size=1 \ - --total_batch_size=1024 \ - --eval_every=50 \ - --eval_tokens=4096 \ - --core_metric_every=50 \ - --core_metric_max_per_task=12 \ - --sample_every=50 \ - --num_iterations=50 -python -m scripts.base_loss --device_batch_size=1 --split_tokens=4096 + --max-seq-len=1024 \ + --device-batch-size=1 \ + --total-batch-size=1024 \ + --eval-every=50 \ + --eval-tokens=4096 \ + --core-metric-every=50 \ + --core-metric-max-per-task=12 \ + --sample-every=50 \ + --num-iterations=50 +python -m scripts.base_loss --device-batch-size=1 --split-tokens=4096 python -m scripts.base_eval --max-per-task=16 # midtraining python -m scripts.mid_train \ - --max_seq_len=1024 \ - --device_batch_size=1 \ - --eval_every=50 \ - --eval_tokens=4096 \ - --total_batch_size=1024 \ - 
--num_iterations=100 + --max-seq-len=1024 \ + --device-batch-size=1 \ + --eval-every=50 \ + --eval-tokens=4096 \ + --total-batch-size=1024 \ + --num-iterations=100 # eval results will be terrible, this is just to execute the code paths. # note that we lower the execution memory limit to 1MB to avoid warnings on smaller systems python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20 # SFT python -m scripts.chat_sft \ - --device_batch_size=1 \ - --target_examples_per_step=4 \ - --num_iterations=100 \ - --eval_steps=4 \ - --eval_metrics_max_problems=16 + --device-batch-size=1 \ + --target-examples-per-step=4 \ + --num-iterations=100 \ + --eval-steps=4 \ + --eval-metrics-max-problems=16 # Chat CLI # python -m scripts.chat_cli -p "Why is the sky blue?" diff --git a/miniseries.sh b/miniseries.sh index 4d6f4360..9a4512b6 100644 --- a/miniseries.sh +++ b/miniseries.sh @@ -20,7 +20,7 @@ if [ -z "$SKIP_SETUP" ]; then # Tokenizer, download 1000 shards for pretraining # (probably this can be reduced but it's tricky to determine the exact right number, TODO). 
python -m nanochat.dataset -n 1000 - python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768 + python -m scripts.tok_train --max-chars=2000000000 --vocab-size=32768 else source .venv/bin/activate fi @@ -58,16 +58,16 @@ for d in "${DEPTHS[@]}"; do START_TIME=$(date +%s) # Train the model with natural horizon (target_param_data_ratio default) - # No --target_flops, let it use the default ratio from base_train + # No --target-flops, let it use the default ratio from base_train torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \ --depth=$d \ - --target_param_data_ratio=8 \ + --target-param-data-ratio=8 \ --run="${WANDB_RUN}_d${d}" \ - --model_tag="${TAG}" \ - --core_metric_every=999999 \ - --core_metric_max_per_task=-1 \ - --sample_every=-1 \ - --save_every=-1 \ + --model-tag="${TAG}" \ + --core-metric-every=999999 \ + --core-metric-max-per-task=-1 \ + --sample-every=-1 \ + --save-every=-1 \ 2>&1 | tee "$RESULTS_DIR/${TAG}_train.log" END_TIME=$(date +%s) diff --git a/run1000.sh b/run1000.sh index fe92edfc..5d0b7dc3 100644 --- a/run1000.sh +++ b/run1000.sh @@ -23,15 +23,15 @@ python -m nanochat.dataset -n 16 # start downloading the rest of the shards for a total of 1200 (see below why 1200) python -m nanochat.dataset -n 1200 & # todo: download the rest of it -python -m scripts.tok_train --max_chars=4000000000 --vocab_size=65536 +python -m scripts.tok_train --max-chars=4000000000 --vocab-size=65536 python -m scripts.tok_eval # Documenting my process for determining the hyperparameters for this run1000.sh script: # We want a budget of approx. $1000 ~= 41.6 hours of 8XH100 compute # 1) I guessed the model size for this to be about depth=32 # 2) Determine the device_batch_size that fits: -# Running the base_train.py script with --depth=32, I saw that --device_batch_size=16 -# runs out of memory, but --device_batch_size=8 fits. 
Inspecting `nvidia-smi` during training, +# Running the base_train.py script with --depth=32, I saw that --device-batch-size=16 +# runs out of memory, but --device-batch-size=8 fits. Inspecting `nvidia-smi` during training, # I saw all GPUs were at about 78/80GB VRAM, so it just barely fits and we have good MFU at ~50%. # So the training script was running ok and showed: # Vocab size: 65,536 @@ -73,13 +73,13 @@ python -m scripts.tok_eval # Number of processes/GPUs to use NPROC_PER_NODE=8 -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --target_param_data_ratio=20 --device_batch_size=8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --target-param-data-ratio=20 --device-batch-size=8 --run=$WANDB_RUN torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval # midtrain # NOTE: ensure that we use the same device_batch_size here as the base training script. 
-torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --device-batch-size=8 --run=$WANDB_RUN torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid # sft diff --git a/scaling_laws.sh b/scaling_laws.sh index 102ba11c..321b286a 100644 --- a/scaling_laws.sh +++ b/scaling_laws.sh @@ -64,15 +64,15 @@ for flops in "${FLOPS_BUDGETS[@]}"; do # CORE eval happens once at the end (999999 ensures only final step) torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \ --depth=$d \ - --target_flops=$flops \ - --target_param_data_ratio=-1 \ + --target-flops=$flops \ + --target-param-data-ratio=-1 \ --run="${WANDB_RUN}_${TAG}" \ - --model_tag="${TAG}" \ - --eval_tokens=$EVAL_TOKENS \ - --core_metric_every=999999 \ - --core_metric_max_per_task=-1 \ - --sample_every=-1 \ - --save_every=-1 \ + --model-tag="${TAG}" \ + --eval-tokens=$EVAL_TOKENS \ + --core-metric-every=999999 \ + --core-metric-max-per-task=-1 \ + --sample-every=-1 \ + --save-every=-1 \ 2>&1 | tee "$RESULTS_DIR/${TAG}_train.log" END_TIME=$(date +%s) diff --git a/scripts/base_loss.py b/scripts/base_loss.py index 46544d43..6b44a30c 100644 --- a/scripts/base_loss.py +++ b/scripts/base_loss.py @@ -7,7 +7,7 @@ Example run as: torchrun --standalone --nproc_per_node=8 -m scripts.base_loss To evaluate a HuggingFace model: -python -m scripts.base_loss --hf_path openai-community/gpt2 +python -m scripts.base_loss --hf-path openai-community/gpt2 """ import argparse from contextlib import nullcontext @@ -61,12 +61,12 @@ def get_hf_token_bytes(tokenizer, device="cpu"): # CLI arguments parser = argparse.ArgumentParser(description="Evaluate loss on train/val splits and sample from model") -parser.add_argument("--device_batch_size", type=int, default=32, help="per-device batch size") -parser.add_argument("--split_tokens", type=int, 
default=40*524288, help="number of tokens to evaluate per split") -parser.add_argument("--model_tag", type=str, default=None, help="model tag for checkpoint directory") -parser.add_argument("--model_step", type=int, default=None, help="model step to load") -parser.add_argument("--device_type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") -parser.add_argument("--hf_path", type=str, default=None, help="HuggingFace model path (e.g. openai-community/gpt2)") +parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") +parser.add_argument("--split-tokens", type=int, default=40*524288, help="number of tokens to evaluate per split") +parser.add_argument("--model-tag", type=str, default=None, help="model tag for checkpoint directory") +parser.add_argument("--model-step", type=int, default=None, help="model step to load") +parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") +parser.add_argument("--hf-path", type=str, default=None, help="HuggingFace model path (e.g. openai-community/gpt2)") args = parser.parse_args() # Load the base model and the tokenizer diff --git a/scripts/base_train.py b/scripts/base_train.py index a432e7ab..bf4b8cf6 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -8,7 +8,7 @@ or distributed as: torchrun --nproc_per_node=8 -m scripts.base_train.py If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. 
Example: -python -m scripts.base_train --depth=4 --max_seq_len=512 --device_batch_size=1 --eval_tokens=512 --core_metric_every=-1 --total_batch_size=512 --num_iterations=20 +python -m scripts.base_train --depth=4 --max-seq-len=512 --device-batch-size=1 --eval-tokens=512 --core-metric-every=-1 --total-batch-size=512 --num-iterations=20 """ import os @@ -36,40 +36,40 @@ parser = argparse.ArgumentParser(description="Pretrain base model") # Logging parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)") # Runtime -parser.add_argument("--device_type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") +parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") # Model architecture parser.add_argument("--depth", type=int, default=20, help="depth of the Transformer model") -parser.add_argument("--aspect_ratio", type=int, default=64, help="model_dim = depth * aspect_ratio") -parser.add_argument("--head_dim", type=int, default=128, help="target head dimension for attention") -parser.add_argument("--max_seq_len", type=int, default=2048, help="max context length") -parser.add_argument("--window_pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')") +parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = depth * aspect_ratio") +parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention") +parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length") +parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 
'SSL')") # Training horizon (only one used, in order of precedence) -parser.add_argument("--num_iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") -parser.add_argument("--target_flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") -parser.add_argument("--target_param_data_ratio", type=int, default=8, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") +parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") +parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") +parser.add_argument("--target-param-data-ratio", type=int, default=8, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") # Optimization -parser.add_argument("--device_batch_size", type=int, default=32, help="per-device batch size") -parser.add_argument("--total_batch_size", type=int, default=524288, help="total batch size in tokens") -parser.add_argument("--embedding_lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)") -parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") -parser.add_argument("--weight_decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)") -parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") -parser.add_argument("--scalar_lr", type=float, default=0.5, help="learning rate for scalars (resid_lambdas, x0_lambdas)") -parser.add_argument("--adam_beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding") -parser.add_argument("--adam_beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding") -parser.add_argument("--warmup_ratio", 
type=float, default=0.0, help="ratio of iterations for LR warmup") -parser.add_argument("--warmdown_ratio", type=float, default=0.4, help="ratio of iterations for LR warmdown") -parser.add_argument("--final_lr_frac", type=float, default=0.0, help="final LR as fraction of initial LR") -parser.add_argument("--resume_from_step", type=int, default=-1, help="resume training from this step (-1 = disable)") +parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") +parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens") +parser.add_argument("--embedding-lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)") +parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") +parser.add_argument("--weight-decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)") +parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") +parser.add_argument("--scalar-lr", type=float, default=0.5, help="learning rate for scalars (resid_lambdas, x0_lambdas)") +parser.add_argument("--adam-beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding") +parser.add_argument("--adam-beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding") +parser.add_argument("--warmup-ratio", type=float, default=0.0, help="ratio of iterations for LR warmup") +parser.add_argument("--warmdown-ratio", type=float, default=0.4, help="ratio of iterations for LR warmdown") +parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR as fraction of initial LR") +parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)") # Evaluation -parser.add_argument("--eval_every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)") 
-parser.add_argument("--eval_tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on") -parser.add_argument("--core_metric_every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)") -parser.add_argument("--core_metric_max_per_task", type=int, default=500, help="examples per task for CORE metric") -parser.add_argument("--sample_every", type=int, default=2000, help="sample from model every N steps (-1 = disable)") -parser.add_argument("--save_every", type=int, default=-1, help="save checkpoints every N steps (-1 = only at end)") +parser.add_argument("--eval-every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)") +parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on") +parser.add_argument("--core-metric-every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)") +parser.add_argument("--core-metric-max-per-task", type=int, default=500, help="examples per task for CORE metric") +parser.add_argument("--sample-every", type=int, default=2000, help="sample from model every N steps (-1 = disable)") +parser.add_argument("--save-every", type=int, default=-1, help="save checkpoints every N steps (-1 = only at end)") # Output -parser.add_argument("--model_tag", type=str, default=None, help="override model tag for checkpoint directory name") +parser.add_argument("--model-tag", type=str, default=None, help="override model tag for checkpoint directory name") args = parser.parse_args() user_config = vars(args).copy() # for logging # ----------------------------------------------------------------------------- diff --git a/scripts/chat_rl.py b/scripts/chat_rl.py index ad557b91..b0697f36 100644 --- a/scripts/chat_rl.py +++ b/scripts/chat_rl.py @@ -35,32 +35,32 @@ parser = argparse.ArgumentParser(description="Reinforcement learning on GSM8K") # Logging parser.add_argument("--run", type=str, default="dummy", 
help="wandb run name ('dummy' disables wandb logging)") # Runtime -parser.add_argument("--device_type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") +parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|bfloat16") # Model loading parser.add_argument("--source", type=str, default="sft", help="mid|sft - which checkpoint to load from") -parser.add_argument("--model_tag", type=str, default=None, help="model tag to load from") -parser.add_argument("--model_step", type=int, default=None, help="model step to load from") +parser.add_argument("--model-tag", type=str, default=None, help="model tag to load from") +parser.add_argument("--model-step", type=int, default=None, help="model step to load from") # Training horizon -parser.add_argument("--num_epochs", type=int, default=1, help="number of epochs over GSM8K") +parser.add_argument("--num-epochs", type=int, default=1, help="number of epochs over GSM8K") # Batch sizes / sampling -parser.add_argument("--device_batch_size", type=int, default=8, help="max batch size per forward pass") -parser.add_argument("--examples_per_step", type=int, default=16, help="total examples per optimization step across all ranks") -parser.add_argument("--num_samples", type=int, default=16, help="number of samples per example/question") +parser.add_argument("--device-batch-size", type=int, default=8, help="max batch size per forward pass") +parser.add_argument("--examples-per-step", type=int, default=16, help="total examples per optimization step across all ranks") +parser.add_argument("--num-samples", type=int, default=16, help="number of samples per example/question") # Generation -parser.add_argument("--max_new_tokens", type=int, default=256, help="max tokens to generate per sample") +parser.add_argument("--max-new-tokens", type=int, default=256, help="max tokens to generate per sample") 
parser.add_argument("--temperature", type=float, default=1.0, help="sampling temperature") -parser.add_argument("--top_k", type=int, default=50, help="top-k sampling (0 = disabled)") +parser.add_argument("--top-k", type=int, default=50, help="top-k sampling (0 = disabled)") # Optimization -parser.add_argument("--embedding_lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") -parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") -parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") -parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") -parser.add_argument("--init_lr_frac", type=float, default=0.05, help="initial LR as fraction of base LR") +parser.add_argument("--embedding-lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") +parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") +parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") +parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") +parser.add_argument("--init-lr-frac", type=float, default=0.05, help="initial LR as fraction of base LR") # Evaluation / checkpointing -parser.add_argument("--eval_every", type=int, default=60, help="evaluate pass@k every N steps") -parser.add_argument("--eval_examples", type=int, default=400, help="number of examples for pass@k evaluation") -parser.add_argument("--save_every", type=int, default=60, help="save checkpoint every N steps") +parser.add_argument("--eval-every", type=int, default=60, help="evaluate pass@k every N steps") +parser.add_argument("--eval-examples", type=int, default=400, help="number of examples for 
pass@k evaluation") +parser.add_argument("--save-every", type=int, default=60, help="save checkpoint every N steps") args = parser.parse_args() user_config = vars(args).copy() # ----------------------------------------------------------------------------- diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index 853a2bf6..9277cf96 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -37,29 +37,29 @@ parser = argparse.ArgumentParser(description="Supervised finetuning for chat") # Logging parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)") # Runtime -parser.add_argument("--device_type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") +parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|bfloat16") # Model loading parser.add_argument("--source", type=str, default="mid", help="base|mid - which checkpoint to load from") -parser.add_argument("--model_tag", type=str, default=None, help="model tag to load from") -parser.add_argument("--model_step", type=int, default=None, help="model step to load from") +parser.add_argument("--model-tag", type=str, default=None, help="model tag to load from") +parser.add_argument("--model-step", type=int, default=None, help="model step to load from") # Training horizon -parser.add_argument("--num_epochs", type=int, default=1, help="number of epochs") -parser.add_argument("--num_iterations", type=int, default=-1, help="override number of iterations (-1 = use num_epochs)") +parser.add_argument("--num-epochs", type=int, default=1, help="number of epochs") +parser.add_argument("--num-iterations", type=int, default=-1, help="override number of iterations (-1 = use num_epochs)") # Batch sizes -parser.add_argument("--device_batch_size", type=int, default=4, help="per-device batch size") -parser.add_argument("--target_examples_per_step", 
type=int, default=32, help="target examples per optimization step") +parser.add_argument("--device-batch-size", type=int, default=4, help="per-device batch size") +parser.add_argument("--target-examples-per-step", type=int, default=32, help="target examples per optimization step") # Optimization -parser.add_argument("--embedding_lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") -parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") -parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") -parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") -parser.add_argument("--init_lr_frac", type=float, default=0.02, help="initial LR as fraction of base LR") +parser.add_argument("--embedding-lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") +parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") +parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") +parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") +parser.add_argument("--init-lr-frac", type=float, default=0.02, help="initial LR as fraction of base LR") # Evaluation -parser.add_argument("--eval_every", type=int, default=100, help="evaluate val loss every N steps") -parser.add_argument("--eval_steps", type=int, default=100, help="number of batches for val loss evaluation") -parser.add_argument("--eval_metrics_every", type=int, default=200, help="evaluate accuracy metrics every N steps") -parser.add_argument("--eval_metrics_max_problems", type=int, default=1024, help="max problems per metric evaluation") +parser.add_argument("--eval-every", type=int, default=100, 
help="evaluate val loss every N steps") +parser.add_argument("--eval-steps", type=int, default=100, help="number of batches for val loss evaluation") +parser.add_argument("--eval-metrics-every", type=int, default=200, help="evaluate accuracy metrics every N steps") +parser.add_argument("--eval-metrics-max-problems", type=int, default=1024, help="max problems per metric evaluation") args = parser.parse_args() user_config = vars(args).copy() # ----------------------------------------------------------------------------- diff --git a/scripts/mid_train.py b/scripts/mid_train.py index 0742c087..01d9f7d4 100644 --- a/scripts/mid_train.py +++ b/scripts/mid_train.py @@ -6,7 +6,7 @@ python -m scripts.mid_train Or torchrun for training: -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=16 +torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device-batch-size=16 """ import argparse @@ -36,28 +36,28 @@ parser = argparse.ArgumentParser(description="Midtrain the model") # Logging parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)") # Runtime -parser.add_argument("--device_type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") +parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|bfloat16") # Model loading -parser.add_argument("--model_tag", type=str, default=None, help="model tag to load from") -parser.add_argument("--model_step", type=int, default=None, help="model step to load from") +parser.add_argument("--model-tag", type=str, default=None, help="model tag to load from") +parser.add_argument("--model-step", type=int, default=None, help="model step to load from") # Training horizon -parser.add_argument("--num_iterations", type=int, default=-1, help="number of optimization steps (-1 = full epoch)") 
+parser.add_argument("--num-iterations", type=int, default=-1, help="number of optimization steps (-1 = full epoch)") # Batch sizes -parser.add_argument("--max_seq_len", type=int, default=2048, help="max context length") -parser.add_argument("--device_batch_size", type=int, default=32, help="per-device batch size") -parser.add_argument("--total_batch_size", type=int, default=524288, help="total batch size in tokens") +parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length") +parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") +parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens") # Optimization -parser.add_argument("--embedding_lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") -parser.add_argument("--unembedding_lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") -parser.add_argument("--matrix_lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") -parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") -parser.add_argument("--init_lr_frac", type=float, default=1.0, help="initial LR as fraction of base LR") +parser.add_argument("--embedding-lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") +parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") +parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") +parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") +parser.add_argument("--init-lr-frac", type=float, default=1.0, help="initial LR as fraction of base LR") # Evaluation -parser.add_argument("--eval_every", type=int, default=150, help="evaluate val 
bpb every N steps (-1 = disable)") -parser.add_argument("--eval_tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on") +parser.add_argument("--eval-every", type=int, default=150, help="evaluate val bpb every N steps (-1 = disable)") +parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on") # Output -parser.add_argument("--dry_run", action="store_true", help="log to wandb but skip checkpoints/report") +parser.add_argument("--dry-run", action="store_true", help="log to wandb but skip checkpoints/report") args = parser.parse_args() user_config = vars(args).copy() # ----------------------------------------------------------------------------- @@ -79,7 +79,7 @@ wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-mi model, tokenizer, meta = load_model("base", device, phase="train", model_tag=args.model_tag, step=args.model_step) pretrain_batch_size = meta.get("device_batch_size", None) if pretrain_batch_size is not None and args.device_batch_size > pretrain_batch_size: - print0(f"FOOTGUN WARNING: base model training used device_batch_size {pretrain_batch_size}, did you pass in a good --device_batch_size to this script?") + print0(f"FOOTGUN WARNING: base model training used device_batch_size {pretrain_batch_size}, did you pass in a good --device-batch-size to this script?") orig_model = model model = torch.compile(model, dynamic=False) depth = model.config.n_layer @@ -142,7 +142,8 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): # Conversation buffer: list of token lists conv_buffer = [] - cursor = ddp_rank # Each rank processes different conversations + cursor = ddp_rank # Each rank processes different conversations (for fetching) + consumed = ddp_rank # Track actual consumption separately from buffering epoch = 1 it = 0 # iteration counter @@ -156,8 +157,7 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): if cursor >= 
dataset_size: cursor = cursor % dataset_size epoch += 1 - if split == "train": - last_step = True # toggle last_step to True, which will terminate the training loop + # Note: last_step is now triggered based on consumption, not fetching while True: rows = [] @@ -183,10 +183,12 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): # Found a conversation that fits - use it entirely conv = conv_buffer.pop(best_idx) row.extend(conv) + consumed += ddp_world_size # Track actual consumption else: # No conversation fits - crop first conversation to fill remaining conv = conv_buffer.pop(0) row.extend(conv[:remaining]) + consumed += ddp_world_size # Track actual consumption rows.append(row[:row_capacity]) @@ -195,13 +197,16 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): if 0 < args.num_iterations <= it and split == "train": last_step = True - # Update progress tracking + # Update progress tracking (based on consumed, not cursor, to account for buffering) if split == "train": current_epoch = epoch if args.num_iterations > 0: approx_progress = it / args.num_iterations else: - approx_progress = cursor / dataset_size + approx_progress = consumed / dataset_size + # Trigger last_step when we've consumed enough (instead of when cursor wraps) + if consumed >= dataset_size: + last_step = True # Build tensors use_cuda = device_type == "cuda" diff --git a/scripts/tok_train.py b/scripts/tok_train.py index 4ab995c0..9c7979d2 100644 --- a/scripts/tok_train.py +++ b/scripts/tok_train.py @@ -14,9 +14,9 @@ from nanochat.dataset import parquets_iter_batched # Parse command line arguments parser = argparse.ArgumentParser(description='Train a BPE tokenizer') -parser.add_argument('--max_chars', type=int, default=10_000_000_000, help='Maximum characters to train on (default: 10B)') -parser.add_argument('--doc_cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)') -parser.add_argument('--vocab_size', type=int, default=32768, 
help='Vocabulary size (default: 32768 = 2^15)') +parser.add_argument('--max-chars', type=int, default=10_000_000_000, help='Maximum characters to train on (default: 10B)') +parser.add_argument('--doc-cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)') +parser.add_argument('--vocab-size', type=int, default=32768, help='Vocabulary size (default: 32768 = 2^15)') args = parser.parse_args() print(f"max_chars: {args.max_chars:,}") print(f"doc_cap: {args.doc_cap:,}") diff --git a/speedrun.sh b/speedrun.sh index 76ccf214..8fff5640 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -59,7 +59,7 @@ python -m nanochat.dataset -n 8 python -m nanochat.dataset -n 370 & DATASET_DOWNLOAD_PID=$! # train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data -python -m scripts.tok_train --max_chars=2000000000 --vocab_size=65536 +python -m scripts.tok_train --max-chars=2000000000 --vocab-size=65536 # evaluate the tokenizer (report compression ratio etc.) python -m scripts.tok_eval @@ -81,7 +81,7 @@ wait $DATASET_DOWNLOAD_PID NPROC_PER_NODE=8 # pretrain the d20 model -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --target_param_data_ratio=20 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --target-param-data-ratio=20 --run=$WANDB_RUN # evaluate the model on a larger chunk of train/val data and draw some samples torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss # evaluate the model on CORE tasks From 3142ca1a28c4712f447a59ad0a27441081e170fb Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 15 Jan 2026 03:20:21 +0000 Subject: [PATCH 029/119] minor helpful message --- nanochat/checkpoint_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index c008ec2e..d1e0a075 100644 --- a/nanochat/checkpoint_manager.py +++ 
b/nanochat/checkpoint_manager.py @@ -111,7 +111,7 @@ def build_model(checkpoint_dir, step, device, phase): # Load the Tokenizer tokenizer = get_tokenizer() # Sanity check: compatibility between model and tokenizer - assert tokenizer.get_vocab_size() == model_config_kwargs["vocab_size"] + assert tokenizer.get_vocab_size() == model_config_kwargs["vocab_size"], f"Tokenizer vocab size {tokenizer.get_vocab_size()} does not match model config vocab size {model_config_kwargs['vocab_size']}" return model, tokenizer, meta_data From 6bb92403d58e1f583b4d0de27fac0e09d329b7e0 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 15 Jan 2026 03:20:48 +0000 Subject: [PATCH 030/119] changes and optimizations to muon, making it more efficient and simpler/cleaner a bit --- nanochat/muon.py | 455 ++++++++++++++++++++++++++--------------------- 1 file changed, 256 insertions(+), 199 deletions(-) diff --git a/nanochat/muon.py b/nanochat/muon.py index 7ae5ffdc..cfd2443d 100644 --- a/nanochat/muon.py +++ b/nanochat/muon.py @@ -1,7 +1,27 @@ """ -Muon optimizer adapted (simplified) from modded-nanogpt. +Muon optimizer adapted and simplified from modded-nanogpt. https://github.com/KellerJordan/modded-nanogpt + +Background: +Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a +quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose +of minimizing steps, it turns out to be empirically effective to keep increasing the slope at +zero even beyond the point where the iteration no longer converges all the way to one everywhere +on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T +where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model +performance at all relative to UV^T, where USV^T = G is the SVD. 
+ +Here, an alternative to Newton-Schulz iteration with potentially better convergence properties: +Polar Express Sign Method for orthogonalization. +https://arxiv.org/pdf/2505.16932 +by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + +Some of the changes in nanochat implementation: +- Uses a simpler, more general approach to parameter grouping and stacking +- Uses a single fused kernel for the momentum -> polar_express -> variance_reduction -> update step +- Makes no assumptions about model architecture (e.g. that attention weights are fused into QKVO format) """ + import torch from torch import Tensor import torch.distributed as dist @@ -16,97 +36,61 @@ polar_express_coeffs = [ (2.3465413258596377, -1.7097828382687081, 0.42323551169305323), ] - -@torch.compile -def zeropower_via_polar_express(G: Tensor, steps: int = 5) -> Tensor: +@torch.compile(dynamic=False, fullgraph=True) +def muon_step_fused( + stacked_grads: Tensor, + stacked_params: Tensor, + momentum_buffer: Tensor, + second_momentum_buffer: Tensor, + momentum_t: Tensor, + lr_t: Tensor, + wd_t: Tensor, + beta2_t: Tensor, + ns_steps: int, + red_dim: int, +) -> None: """ - Polar Express Sign Method for orthogonalization. - https://arxiv.org/pdf/2505.16932 - by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. - - Alternative to Newton-Schulz iteration with potentially better convergence properties. + Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update + All in one compiled graph to eliminate Python overhead between ops. + Some of the constants are 0-D CPU tensors to avoid recompilation when values change. 
""" - assert G.ndim >= 2 - X = G.bfloat16() - if G.size(-2) > G.size(-1): + + # Nesterov momentum + momentum = momentum_t.to(stacked_grads.dtype) + momentum_buffer.lerp_(stacked_grads, 1 - momentum) + g = stacked_grads.lerp_(momentum_buffer, momentum) + + # Polar express + X = g.bfloat16() + if g.size(-2) > g.size(-1): X = X.mT - - # Ensure spectral norm is at most 1 (with 2% safety factor) X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6) - - # Perform the iterations (cap at available coefficients) - for a, b, c in polar_express_coeffs[:min(steps, len(polar_express_coeffs))]: + for a, b, c in polar_express_coeffs[:ns_steps]: A = X @ X.mT B = b * A + c * (A @ A) X = a * X + B @ X - - if G.size(-2) > G.size(-1): + if g.size(-2) > g.size(-1): X = X.mT - return X + g = X - -@torch.compile -def zeropower_via_newtonschulz5(G: Tensor, steps: int) -> Tensor: - """ - Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a - quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose - of minimizing steps, it turns out to be empirically effective to keep increasing the slope at - zero even beyond the point where the iteration no longer converges all the way to one everywhere - on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T - where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model - performance at all relative to UV^T, where USV^T = G is the SVD. 
- """ - assert G.ndim >= 2 # batched Muon implementation by @scottjmaddox, and put into practice in the record by @YouJiacheng - a, b, c = (3.4445, -4.7750, 2.0315) - X = G.bfloat16() - if G.size(-2) > G.size(-1): - X = X.mT - - # Ensure spectral norm is at most 1 - X = X / (X.norm(dim=(-2, -1), keepdim=True) + 1e-7) - # Perform the NS iterations - for _ in range(steps): - A = X @ X.mT - B = b * A + c * A @ A # quintic computation strategy adapted from suggestion by @jxbz, @leloykun, and @YouJiacheng - X = a * X + B @ X - - if G.size(-2) > G.size(-1): - X = X.mT - return X - - -@torch.compile -def apply_variance_reduction(v: Tensor, second_momentum_buffer: Tensor, beta2: float) -> Tensor: - """ - NorMuon-style variance reduction, similar to Adafactor's low-rank variance estimator. - https://arxiv.org/pdf/2510.05491 - - Normalizes updates based on a running estimate of per-row (or per-column) variance. - The reduction dimension is determined by the shape of second_momentum_buffer. - """ - # Determine reduction dimension from buffer shape - red_dim = -1 if second_momentum_buffer.size(-1) == 1 else -2 - - # Compute per-row/col mean of squared values - v_mean = v.float().square().mean(dim=red_dim, keepdim=True) - red_dim_size = v.size(red_dim) - - # Compute current norm + # Variance reduction + beta2 = beta2_t.to(g.dtype) + v_mean = g.float().square().mean(dim=red_dim, keepdim=True) + red_dim_size = g.size(red_dim) v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size v_norm = v_norm_sq.sqrt() - - # Update second momentum buffer (EMA of variance) second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2) - - # Compute scaling factor from second momentum step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt() scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square() v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt() - - # Final scale preserves overall norm while adjusting per-row/col final_scale = 
step_size * (v_norm / v_norm_new.clamp_min(1e-10)) - return v.mul(final_scale.to(v.dtype)) + g = g * final_scale.to(g.dtype) + # Cautious weight decay + parameter update + lr = lr_t.to(g.dtype) + wd = wd_t.to(g.dtype) + mask = (g * stacked_params) >= 0 + stacked_params.sub_(lr * g + lr * wd * stacked_params * mask) class Muon(torch.optim.Optimizer): """ @@ -127,94 +111,112 @@ class Muon(torch.optim.Optimizer): Arguments: lr: The learning rate used by the internal SGD. momentum: The momentum used by the internal SGD. - nesterov: Whether to use Nesterov-style momentum in the internal SGD. (recommended) ns_steps: The number of Newton-Schulz iteration steps to use. beta2: The decay rate for the second moment (variance) estimate. Set to None to disable. weight_decay: Cautious weight decay coefficient. Only decays where update and weight agree. """ - def __init__(self, params, lr=0.02, momentum=0.95, nesterov=True, ns_steps=5, beta2=0.95, weight_decay=0.0): - defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) - params: list[Tensor] = [*params] + def __init__(self, params, lr=0.02, momentum=0.95, ns_steps=5, beta2=0.95, weight_decay=0.0): + defaults = dict(lr=lr, momentum=momentum, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) + assert all(p.ndim == 2 for p in params), "Muon expects 2D parameters only" + params = list(params) # ensure we have a list, not an e.g. 
(exhaustible) iterator + # Group by shape so we can stack tensors + shapes = sorted({p.shape for p in params}) param_groups = [] - for size in {p.numel() for p in params}: - group = dict(params=[p for p in params if p.numel() == size]) - param_groups.append(group) + for shape in shapes: + group_params = [p for p in params if p.shape == shape] + param_groups.append(dict(params=group_params)) super().__init__(param_groups, defaults) + # 0-D CPU tensors to avoid torch.compile recompilation when values change + self._momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") @torch.no_grad() def step(self): for group in self.param_groups: params: list[Tensor] = group["params"] - for p in params: - g = p.grad - assert g is not None - state = self.state[p] - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros_like(g) - buf: Tensor = state["momentum_buffer"] - buf.lerp_(g, 1 - group["momentum"]) - g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf - g = zeropower_via_polar_express(g, steps=group["ns_steps"]) - # Variance reduction (NorMuon-style) - if group["beta2"] is not None: - if "second_momentum_buffer" not in state: - # Buffer shape determines reduction dim: reduce along larger dimension - if p.size(-2) >= p.size(-1): - state["second_momentum_buffer"] = torch.zeros_like(g[..., :1]) - else: - state["second_momentum_buffer"] = torch.zeros_like(g[..., :1, :]) - g = apply_variance_reduction(g, state["second_momentum_buffer"], group["beta2"]) - # Parameter update with cautious weight decay - effective_lr = group["lr"] * max(1, p.size(-2) / p.size(-1))**0.5 - wd = group["weight_decay"] - if wd != 0: - mask = (g * p) >= 0 - p.sub_(effective_lr * g + effective_lr * wd * p * mask) + if not params: + continue + + # Get or 
create group-level buffers (stored in first param's state for convenience) + state = self.state[params[0]] + num_params = len(params) # e.g.: 12 (for a d12 model) + # e.g.: shape = (768, 3072), device = cuda:0, dtype = torch.float32, for one of the MLP projections + shape, device, dtype = params[0].shape, params[0].device, params[0].dtype + + # Momentum for every individual parameter + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device) + momentum_buffer = state["momentum_buffer"] # e.g.: (12, 768, 3072) + + # Second momentum buffer is factored, either per-row or per-column + if "second_momentum_buffer" not in state: + if shape[-2] >= shape[-1]: + state["second_momentum_buffer"] = torch.zeros(num_params, shape[-2], 1, dtype=dtype, device=device) else: - p.sub_(effective_lr * g) + state["second_momentum_buffer"] = torch.zeros(num_params, 1, shape[-1], dtype=dtype, device=device) + second_momentum_buffer = state["second_momentum_buffer"] # (12, 1, 3072) + red_dim = -1 if shape[-2] >= shape[-1] else -2 # e.g.: -2 + + # Stack grads and params + stacked_grads = torch.stack([p.grad for p in params]) # (12, 768, 3072) + stacked_params = torch.stack(params) # (12, 768, 3072) + + # Fill all the 0-D tensors with current values + self._momentum_t.fill_(group["momentum"]) + self._beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0) + self._lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) + self._wd_t.fill_(group["weight_decay"]) + + # Single fused kernel: momentum -> polar_express -> variance_reduction -> update + muon_step_fused( + stacked_grads, + stacked_params, + momentum_buffer, + second_momentum_buffer, + self._momentum_t, + self._lr_t, + self._wd_t, + self._beta2_t, + group["ns_steps"], + red_dim, + ) + + # Copy back to original params: [(768, 3072), (768, 3072), ...] 
<- (12, 768, 3072) + torch._foreach_copy_(params, list(stacked_params.unbind(0))) class DistMuon(torch.optim.Optimizer): """ - Muon: SGD-momentum + (optional) Nesterov, then orthogonalize the 2D update via Polar Express, - finally apply aspect-ratio scaled step. Performs its own distributed synchronization: - - reduce_scatter(AVG) for gradient averaging - - all_gather to replicate updated weights - - Notes: - * Designed for 2D parameters (e.g., linear/conv kernels reshaped to 2D). Do not use for 0D/1D - params like embeddings or scalars. - * Momentum buffers are maintained only on the 'owner' rank for each parameter (rank chosen - by block-cyclic assignment below). If you checkpoint optimizer state on a single rank, - consolidate states beforehand. - - Args: - params: iterable of Tensors - lr: learning rate - momentum: momentum coefficient in [0,1) - nesterov: if True, Nesterov-style update (g <- lerp(g, buf, momentum)); else use buf - ns_steps: number of Newton-Schulz iterations for the orthogonalization - beta2: decay rate for second moment (variance) estimate. Set to None to disable. - weight_decay: Cautious weight decay coefficient. Only decays where update and weight agree. + Distributed version of the Muon optimizer. 
""" def __init__(self, params, lr: float = 0.02, momentum: float = 0.95, - nesterov: bool = True, ns_steps: int = 5, beta2: float = 0.95, weight_decay: float = 0.0): - defaults = dict(lr=lr, momentum=momentum, nesterov=nesterov, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) - params = list(params) + ns_steps: int = 5, beta2: float = 0.95, weight_decay: float = 0.0): + defaults = dict(lr=lr, momentum=momentum, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) assert all(p.ndim == 2 for p in params), "Muon expects 2D parameters only" + params = list(params) + world_size = dist.get_world_size() rank = dist.get_rank() # Group all parameters by their shape - shapes = sorted({p.shape for p in params}) # sort to ensure consistent / deterministic ordering + shapes = sorted({p.shape for p in params}) # sort for deterministic ordering across ranks param_groups = [] for shape in shapes: group_params = [p for p in params if p.shape == shape] device, dtype = group_params[0].device, group_params[0].dtype assert all(p.device == device for p in group_params) assert all(p.dtype == dtype for p in group_params) + # Compute chunk size for this group (how many params each rank owns) + chunk_size = (len(group_params) + world_size - 1) // world_size if rank == 0: - print(f"Muon: Grouping {len(group_params)} params of shape {shape}, device {device}, dtype {dtype}") - param_groups.append(dict(params=group_params, zero_buffer=torch.zeros_like(group_params[0]))) + print(f"Muon: {len(group_params)} params of shape {shape}, chunk_size={chunk_size}") + param_groups.append(dict(params=group_params, chunk_size=chunk_size)) super().__init__(param_groups, defaults) + # 0-D CPU tensors to avoid torch.compile recompilation when values change + self._momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._beta2_t = 
torch.tensor(0.0, dtype=torch.float32, device="cpu") @torch.no_grad() def step(self): @@ -224,72 +226,127 @@ class DistMuon(torch.optim.Optimizer): # Ensure all grads exist assert all(p.grad is not None for group in self.param_groups for p in group["params"]), "All params must have grads" - # Kick off all the reduce scatter operations to average up the gradients across all ranks - all_reduce_futures = [] + # First pass: stack grads and kick off reduce_scatter for each group + group_infos = [] for group in self.param_groups: - params = group["params"] - zero_buffer = group["zero_buffer"] - # Go through params in groups of world_size. - for base_i in range(0, len(params), world_size): - # The compute owner of each param is rank i % world_size - owner_idx = base_i + rank - # each rank stacks up its chunk of world_size params into a list - rs_input = [p.grad for p in params[base_i:base_i + world_size]] - # pad rs_input with the zero buffer to complete the group - rs_input.extend([zero_buffer] * (world_size - len(rs_input))) - # the output buffer gets strided across the group based on the rank - rs_output = params[owner_idx].grad if owner_idx < len(params) else torch.empty_like(zero_buffer) - # reduce scatter the gradients within this group of world_size params - work = dist.reduce_scatter(rs_output, rs_input, op=dist.ReduceOp.AVG, async_op=True).get_future() - all_reduce_futures.append(work) + params: list[Tensor] = group["params"] + chunk_size = group["chunk_size"] + padded_num_params = chunk_size * world_size + shape = params[0].shape + device, dtype = params[0].device, params[0].dtype - # Now each rank computes the update and gathers - future_idx = 0 + # Stack all gradients into a single tensor (single kernel via torch.stack) + grad_stack = torch.stack([p.grad for p in params]) + stacked_grads = torch.empty(padded_num_params, *shape, dtype=dtype, device=device) + stacked_grads[:len(params)].copy_(grad_stack) + # Zero-pad if we have fewer params than padded size + if 
len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + # Output buffer for this rank's chunk + grad_chunk = torch.empty(chunk_size, *shape, dtype=dtype, device=device) + + # Async reduce_scatter on the stacked tensor + reduce_future = dist.reduce_scatter_tensor( + grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True + ).get_future() + + group_infos.append(dict( + grad_chunk=grad_chunk, + reduce_future=reduce_future, + stacked_grads=stacked_grads, # reuse for all_gather output + )) + + # Second pass: wait for reduce, compute batched updates, kick off all_gather all_gather_futures = [] - for group in self.param_groups: - params = group["params"] - zero_buffer = group["zero_buffer"] - # Go through params in groups of world_size. - for base_i in range(0, len(params), world_size): - # The compute owner of each param is rank i % world_size - owner_idx = base_i + rank # calculate the index of the param that this rank owns - # Wait for the reduce scatter to complete - all_reduce_futures[future_idx].wait() # possibly later we could use wait_any polling instead - future_idx += 1 - # Owner computes the Muon update, result is in its param - if owner_idx < len(params): - p = params[owner_idx] - g = p.grad # now averaged across ranks - state = self.state[p] - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros_like(g) - buf: Tensor = state["momentum_buffer"] - buf.lerp_(g, 1.0 - group["momentum"]) - g = g.lerp_(buf, group["momentum"]) if group["nesterov"] else buf - g = zeropower_via_polar_express(g, steps=group["ns_steps"]) - # Variance reduction (NorMuon-style) - if group["beta2"] is not None: - if "second_momentum_buffer" not in state: - # Buffer shape determines reduction dim: reduce along larger dimension - if p.size(-2) >= p.size(-1): - state["second_momentum_buffer"] = torch.zeros_like(g[..., :1]) - else: - state["second_momentum_buffer"] = torch.zeros_like(g[..., :1, :]) - g = apply_variance_reduction(g, 
state["second_momentum_buffer"], group["beta2"]) - # Parameter update with cautious weight decay - effective_lr = group["lr"] * (max(1.0, p.size(-2) / p.size(-1)) ** 0.5) - wd = group["weight_decay"] - if wd != 0: - mask = (g * p) >= 0 - p.sub_(effective_lr * g + effective_lr * wd * p * mask) - else: - p.sub_(effective_lr * g) - # Replicate updated parameters to all ranks - ag_input = params[owner_idx] if owner_idx < len(params) else zero_buffer - ag_output = params[base_i:base_i + world_size] - ag_output.extend([torch.empty_like(zero_buffer) for _ in range(world_size - len(ag_output))]) # pad - work = dist.all_gather(ag_output, ag_input, async_op=True).get_future() - all_gather_futures.append(work) + for group, info in zip(self.param_groups, group_infos): + info["reduce_future"].wait() - # Wait for all work to finish - torch.futures.collect_all(all_gather_futures).wait() + params = group["params"] + chunk_size = group["chunk_size"] + shape = params[0].shape + device, dtype = params[0].device, params[0].dtype + grad_chunk = info["grad_chunk"] + + # How many params does this rank actually own? 
+ start_idx = rank * chunk_size + num_owned = min(chunk_size, max(0, len(params) - start_idx)) + + # Get or create group-level state (stored keyed by first param) + state = self.state[params[0]] + + # Momentum buffer + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros(chunk_size, *shape, dtype=dtype, device=device) + momentum_buffer = state["momentum_buffer"] + + # Second momentum buffer is factored, either per-row or per-column + if "second_momentum_buffer" not in state: + if shape[-2] >= shape[-1]: + state["second_momentum_buffer"] = torch.zeros(chunk_size, shape[-2], 1, dtype=dtype, device=device) + else: + state["second_momentum_buffer"] = torch.zeros(chunk_size, 1, shape[-1], dtype=dtype, device=device) + second_momentum_buffer = state["second_momentum_buffer"] + red_dim = -1 if shape[-2] >= shape[-1] else -2 + + # Build updated_params tensor for all_gather + updated_params = torch.empty(chunk_size, *shape, dtype=dtype, device=device) + + if num_owned > 0: + # Stack owned params (single kernel via torch.stack) + owned_params = [params[start_idx + i] for i in range(num_owned)] + stacked_owned_params = torch.stack(owned_params) + + # Get owned slices of buffers and grads + owned_grads = grad_chunk[:num_owned] + owned_momentum = momentum_buffer[:num_owned] + owned_second_momentum = second_momentum_buffer[:num_owned] + + # Fill 0-D tensors with current values + self._momentum_t.fill_(group["momentum"]) + self._beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0) + self._lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) + self._wd_t.fill_(group["weight_decay"]) + + # Single fused kernel: momentum -> polar_express -> variance_reduction -> update + muon_step_fused( + owned_grads, + stacked_owned_params, + owned_momentum, + owned_second_momentum, + self._momentum_t, + self._lr_t, + self._wd_t, + self._beta2_t, + group["ns_steps"], + red_dim, + ) + + # Copy updated params to output buffer + 
updated_params[:num_owned].copy_(stacked_owned_params) + + # Zero-pad the rest (for ranks that own fewer params) + if num_owned < chunk_size: + updated_params[num_owned:].zero_() + + # Reuse stacked_grads buffer for all_gather output + stacked_params = info["stacked_grads"] + + # Async all_gather to replicate updated params to all ranks + gather_future = dist.all_gather_into_tensor( + stacked_params, updated_params, async_op=True + ).get_future() + + all_gather_futures.append(dict( + gather_future=gather_future, + stacked_params=stacked_params, + params=params, + )) + + # Final pass: wait for all_gather and copy back to params + for info in all_gather_futures: + info["gather_future"].wait() + stacked_params = info["stacked_params"] + params = info["params"] + # Batched copy back (single kernel instead of N individual copies) + torch._foreach_copy_(params, list(stacked_params[:len(params)].unbind(0))) From 255f8b9af6d308a199601463b0062d618090a346 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 15 Jan 2026 23:30:11 +0000 Subject: [PATCH 031/119] cleanly separate cpu and gpu sections --- scripts/base_train.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index bf4b8cf6..a9ee1c30 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -370,14 +370,15 @@ while True: for opt in optimizers: opt.step() model.zero_grad(set_to_none=True) + train_loss_f = train_loss.item() # .item() is a CPU-GPU sync point synchronize() t1 = time.time() dt = t1 - t0 # ------------------------------------------------------------------------- - # logging + # logging (CPU action only) ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging - smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss + smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f # EMA the training loss debiased_smooth_loss = 
smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA pct_done = 100 * step / num_iterations tok_per_sec = int(args.total_batch_size / dt) From 22a71aa3d30b50d6e7659ca0e8bcad9f3b7bfd98 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 15 Jan 2026 23:30:44 +0000 Subject: [PATCH 032/119] fuse adamw into a single torch compiled kernel similar to muon. it's about 1.7X faster, but overall it's so tiny that it's not making a major dent --- nanochat/adamw.py | 90 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 69 insertions(+), 21 deletions(-) diff --git a/nanochat/adamw.py b/nanochat/adamw.py index 48945b38..70ccf7b5 100644 --- a/nanochat/adamw.py +++ b/nanochat/adamw.py @@ -1,11 +1,42 @@ """ -Borrowed from modded-nanogpt. By Keller, @vagrawal, et al. -Not a general optimizer! But works for our specific use. +Distributed AdamW optimizer with a fused step function. +A bunch of ideas (e.g. dist comms in slices) are borrowed from modded-nanogpt. """ import torch import torch.distributed as dist from torch import Tensor +@torch.compile(dynamic=False, fullgraph=True) +def adamw_step_fused( + p: Tensor, + grad: Tensor, + exp_avg: Tensor, + exp_avg_sq: Tensor, + step_t: Tensor, + lr_t: Tensor, + beta1_t: Tensor, + beta2_t: Tensor, + eps_t: Tensor, + wd_t: Tensor, +) -> None: + """ + Fused AdamW step: weight_decay -> momentum_update -> bias_correction -> param_update + All in one compiled graph to eliminate Python overhead between ops. + The 0-D CPU tensors avoid recompilation when hyperparameter values change. 
+ """ + # Weight decay (decoupled, applied before the update) + p.mul_(1 - lr_t * wd_t) + # Update running averages (lerp_ is cleaner and fuses well) + exp_avg.lerp_(grad, 1 - beta1_t) + exp_avg_sq.lerp_(grad.square(), 1 - beta2_t) + # Bias corrections + bias1 = 1 - beta1_t ** step_t + bias2 = 1 - beta2_t ** step_t + # Compute update and apply + denom = (exp_avg_sq / bias2).sqrt() + eps_t + step_size = lr_t / bias1 + p.add_(exp_avg / denom, alpha=-step_size) + class DistAdamW(torch.optim.Optimizer): """ @@ -14,7 +45,26 @@ class DistAdamW(torch.optim.Optimizer): """ def __init__(self, param_groups, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + rank = dist.get_rank() + world_size = dist.get_world_size() + # Validate + if rank == 0: + for group in param_groups: + assert isinstance(group, dict), "expecting param_groups to be a list of dicts" + assert isinstance(group['params'], list), "expecting group['params'] to be a list of tensors" + for p in group['params']: + sliced = p.numel() >= 1024 + print(f"AdamW: 1 param of shape {p.shape}, sliced={sliced}") + if sliced: # large parameter tensors will be operated on in slices + assert p.shape[0] % world_size == 0, f"First dim of parameter shape {p.shape} must be divisible by world size {world_size}" super().__init__(param_groups, defaults) + # 0-D CPU tensors to avoid torch.compile recompilation when values change + self._step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") @torch.no_grad() def step(self): @@ -36,8 +86,7 @@ class 
DistAdamW(torch.optim.Optimizer): grad_slices.append(grad) else: is_small.append(False) - assert p.shape[0] % world_size == 0, f"First dim of parameter shape {p.shape} must be divisible by world size {world_size}" - rank_size = grad.shape[0] // world_size + rank_size = grad.shape[0] // world_size # p.shape[0] % world_size == 0 is checked in __init__ grad_slice = torch.empty_like(grad[:rank_size]) reduce_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) grad_slices.append(grad_slice) @@ -63,28 +112,27 @@ class DistAdamW(torch.optim.Optimizer): # State init if not state: - state['step'] = torch.tensor(0, dtype=torch.int64, device=p.device) + state['step'] = 0 state['exp_avg'] = torch.zeros_like(p_slice) state['exp_avg_sq'] = torch.zeros_like(p_slice) exp_avg = state['exp_avg'] exp_avg_sq = state['exp_avg_sq'] state['step'] += 1 - t = state['step'] - # weight decay - if wd != 0: - eff_weight_decay = lr * wd * getattr(p, "wd_mul", 1.0) - p_slice.mul_(1 - eff_weight_decay) - # update running averages - exp_avg.mul_(beta1).add_(g_slice, alpha=1 - beta1) - exp_avg_sq.mul_(beta2).addcmul_(g_slice, g_slice, value=1 - beta2) - # bias corrections - bias1 = 1 - beta1 ** t - bias2 = 1 - beta2 ** t - # compute step - denom = (exp_avg_sq / bias2).sqrt().add_(eps) - step_size = lr / bias1 - update = exp_avg.div(denom).mul_(step_size) - p_slice.add_(other=update, alpha=-1.0) + + # Fill 0-D tensors with current values + eff_wd = wd * getattr(p, "wd_mul", 1.0) + self._step_t.fill_(state['step']) + self._lr_t.fill_(lr) + self._beta1_t.fill_(beta1) + self._beta2_t.fill_(beta2) + self._eps_t.fill_(eps) + self._wd_t.fill_(eff_wd) + + # Fused update: weight_decay -> momentum -> bias_correction -> param_update + adamw_step_fused( + p_slice, g_slice, exp_avg, exp_avg_sq, + self._step_t, self._lr_t, self._beta1_t, self._beta2_t, self._eps_t, self._wd_t, + ) # Only large params need all_gather if not is_small[idx]: From 
bdcc030ffa97c829bd2e2c5841c776e1fb717aa2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 15 Jan 2026 23:32:20 +0000 Subject: [PATCH 033/119] oops legacy spurious line now --- scripts/base_train.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index a9ee1c30..5293cd88 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -208,7 +208,6 @@ if resuming: # ----------------------------------------------------------------------------- # Initialize the DataLoaders for train/val -tokens_dir = os.path.join(base_dir, "tokenized_data") dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"] train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict) build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device) From d4ea28d4e2efc0b0491016721d01ec1afe83697e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Fri, 16 Jan 2026 01:26:38 +0100 Subject: [PATCH 034/119] Fix args in readme (#438) * fix commands in readme, using new arg format * fix typo * add required -i flag to chat_eval example runs --- README.md | 4 ++-- scripts/chat_eval.py | 4 ++-- tasks/customjson.py | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index acb91110..9de2884c 100644 --- a/README.md +++ b/README.md @@ -82,10 +82,10 @@ That said, to give a sense, the example changes needed for the [speedrun.sh](spe python -m nanochat.dataset -n 450 & ... # use --depth to increase model size. 
to not oom, halve device batch size 32 -> 16: -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --device_batch_size=16 +torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --device-batch-size=16 ... # make sure to use the same later during midtraining: -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=16 +torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device-batch-size=16 ``` That's it! The biggest thing to pay attention to is making sure you have enough data shards to train on (the code will loop and do more epochs over the same training set otherwise, decreasing learning speed a bit), and managing your memory/VRAM, primarily by decreasing the `device_batch_size` until things fit (the scripts automatically compensate by increasing the number of gradient accumulation loops, simply turning parallel compute to sequential compute). diff --git a/scripts/chat_eval.py b/scripts/chat_eval.py index cae2f0f8..a5583035 100644 --- a/scripts/chat_eval.py +++ b/scripts/chat_eval.py @@ -4,8 +4,8 @@ All the generic code lives here, and all the evaluation-specific code lives in nanochat directory and is imported from here. 
Example runs: -python -m scripts.chat_eval -a ARC-Easy -torchrun --nproc_per_node=8 -m scripts.chat_eval -- -a ARC-Easy +python -m scripts.chat_eval -i mid -a ARC-Easy +torchrun --nproc_per_node=8 -m scripts.chat_eval -- -i mid -a ARC-Easy """ import argparse diff --git a/tasks/customjson.py b/tasks/customjson.py index e1b5f0b3..aeb1a3f7 100644 --- a/tasks/customjson.py +++ b/tasks/customjson.py @@ -25,7 +25,7 @@ class CustomJSON(Task): print("-" * 80) print(f"Warning: File {filepath} does not exist") print("HINT (Oct 21 2025)") - print("If you recently did a git pull and suddely see this, it might be due to the new addition of identity conversations") + print("If you recently did a git pull and suddenly see this, it might be due to the new addition of identity conversations") print("See this discussion for more details: https://github.com/karpathy/nanochat/discussions/139") print("Quick fix: simply run the following command to download the file and you're done:") print(f"curl -L -o {filepath} https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl") From 7d1700c5215c1f10083976f6e0dc797e751501e2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 00:40:59 +0000 Subject: [PATCH 035/119] add zstd lib --- pyproject.toml | 1 + uv.lock | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 93 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 87a967f7..f3cd8d73 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "transformers>=4.57.3", "uvicorn>=0.36.0", "wandb>=0.21.3", + "zstandard>=0.25.0", ] [dependency-groups] diff --git a/uv.lock b/uv.lock index b168a2ff..dd766f89 100644 --- a/uv.lock +++ b/uv.lock @@ -1513,6 +1513,7 @@ dependencies = [ { name = "transformers" }, { name = "uvicorn" }, { name = "wandb" }, + { name = "zstandard" }, ] [package.optional-dependencies] @@ -1551,6 +1552,7 @@ requires-dist = [ { name = "transformers", specifier = ">=4.57.3" }, { name 
= "uvicorn", specifier = ">=0.36.0" }, { name = "wandb", specifier = ">=0.21.3" }, + { name = "zstandard", specifier = ">=0.25.0" }, ] provides-extras = ["cpu", "gpu"] @@ -3619,3 +3621,93 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/94/c3/b2e9f38bc3e11191981d57ea08cab2166e74ea770024a646617c9cddd9f6/yarl-1.20.1-cp313-cp313t-win_amd64.whl", hash = "sha256:541d050a355bbbc27e55d906bc91cb6fe42f96c01413dd0f4ed5a5240513874f", size = 93003, upload-time = "2025-06-10T00:45:27.752Z" }, { url = "https://files.pythonhosted.org/packages/b4/2d/2345fce04cfd4bee161bf1e7d9cdc702e3e16109021035dbb24db654a622/yarl-1.20.1-py3-none-any.whl", hash = "sha256:83b8eb083fe4683c6115795d9fc1cfaf2cbbefb19b3a1cb68f6527460f483a77", size = 46542, upload-time = "2025-06-10T00:46:07.521Z" }, ] + +[[package]] +name = "zstandard" +version = "0.25.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/fd/aa/3e0508d5a5dd96529cdc5a97011299056e14c6505b678fd58938792794b1/zstandard-0.25.0.tar.gz", hash = "sha256:7713e1179d162cf5c7906da876ec2ccb9c3a9dcbdffef0cc7f70c3667a205f0b", size = 711513, upload-time = "2025-09-14T22:15:54.002Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/56/7a/28efd1d371f1acd037ac64ed1c5e2b41514a6cc937dd6ab6a13ab9f0702f/zstandard-0.25.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e59fdc271772f6686e01e1b3b74537259800f57e24280be3f29c8a0deb1904dd", size = 795256, upload-time = "2025-09-14T22:15:56.415Z" }, + { url = "https://files.pythonhosted.org/packages/96/34/ef34ef77f1ee38fc8e4f9775217a613b452916e633c4f1d98f31db52c4a5/zstandard-0.25.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4d441506e9b372386a5271c64125f72d5df6d2a8e8a2a45a0ae09b03cb781ef7", size = 640565, upload-time = "2025-09-14T22:15:58.177Z" }, + { url = 
"https://files.pythonhosted.org/packages/9d/1b/4fdb2c12eb58f31f28c4d28e8dc36611dd7205df8452e63f52fb6261d13e/zstandard-0.25.0-cp310-cp310-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:ab85470ab54c2cb96e176f40342d9ed41e58ca5733be6a893b730e7af9c40550", size = 5345306, upload-time = "2025-09-14T22:16:00.165Z" }, + { url = "https://files.pythonhosted.org/packages/73/28/a44bdece01bca027b079f0e00be3b6bd89a4df180071da59a3dd7381665b/zstandard-0.25.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:e05ab82ea7753354bb054b92e2f288afb750e6b439ff6ca78af52939ebbc476d", size = 5055561, upload-time = "2025-09-14T22:16:02.22Z" }, + { url = "https://files.pythonhosted.org/packages/e9/74/68341185a4f32b274e0fc3410d5ad0750497e1acc20bd0f5b5f64ce17785/zstandard-0.25.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:78228d8a6a1c177a96b94f7e2e8d012c55f9c760761980da16ae7546a15a8e9b", size = 5402214, upload-time = "2025-09-14T22:16:04.109Z" }, + { url = "https://files.pythonhosted.org/packages/8b/67/f92e64e748fd6aaffe01e2b75a083c0c4fd27abe1c8747fee4555fcee7dd/zstandard-0.25.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:2b6bd67528ee8b5c5f10255735abc21aa106931f0dbaf297c7be0c886353c3d0", size = 5449703, upload-time = "2025-09-14T22:16:06.312Z" }, + { url = "https://files.pythonhosted.org/packages/fd/e5/6d36f92a197c3c17729a2125e29c169f460538a7d939a27eaaa6dcfcba8e/zstandard-0.25.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:4b6d83057e713ff235a12e73916b6d356e3084fd3d14ced499d84240f3eecee0", size = 5556583, upload-time = "2025-09-14T22:16:08.457Z" }, + { url = "https://files.pythonhosted.org/packages/d7/83/41939e60d8d7ebfe2b747be022d0806953799140a702b90ffe214d557638/zstandard-0.25.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9174f4ed06f790a6869b41cba05b43eeb9a35f8993c4422ab853b705e8112bbd", size = 5045332, 
upload-time = "2025-09-14T22:16:10.444Z" }, + { url = "https://files.pythonhosted.org/packages/b3/87/d3ee185e3d1aa0133399893697ae91f221fda79deb61adbe998a7235c43f/zstandard-0.25.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:25f8f3cd45087d089aef5ba3848cd9efe3ad41163d3400862fb42f81a3a46701", size = 5572283, upload-time = "2025-09-14T22:16:12.128Z" }, + { url = "https://files.pythonhosted.org/packages/0a/1d/58635ae6104df96671076ac7d4ae7816838ce7debd94aecf83e30b7121b0/zstandard-0.25.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:3756b3e9da9b83da1796f8809dd57cb024f838b9eeafde28f3cb472012797ac1", size = 4959754, upload-time = "2025-09-14T22:16:14.225Z" }, + { url = "https://files.pythonhosted.org/packages/75/d6/57e9cb0a9983e9a229dd8fd2e6e96593ef2aa82a3907188436f22b111ccd/zstandard-0.25.0-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:81dad8d145d8fd981b2962b686b2241d3a1ea07733e76a2f15435dfb7fb60150", size = 5266477, upload-time = "2025-09-14T22:16:16.343Z" }, + { url = "https://files.pythonhosted.org/packages/d1/a9/ee891e5edf33a6ebce0a028726f0bbd8567effe20fe3d5808c42323e8542/zstandard-0.25.0-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:a5a419712cf88862a45a23def0ae063686db3d324cec7edbe40509d1a79a0aab", size = 5440914, upload-time = "2025-09-14T22:16:18.453Z" }, + { url = "https://files.pythonhosted.org/packages/58/08/a8522c28c08031a9521f27abc6f78dbdee7312a7463dd2cfc658b813323b/zstandard-0.25.0-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:e7360eae90809efd19b886e59a09dad07da4ca9ba096752e61a2e03c8aca188e", size = 5819847, upload-time = "2025-09-14T22:16:20.559Z" }, + { url = "https://files.pythonhosted.org/packages/6f/11/4c91411805c3f7b6f31c60e78ce347ca48f6f16d552fc659af6ec3b73202/zstandard-0.25.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:75ffc32a569fb049499e63ce68c743155477610532da1eb38e7f24bf7cd29e74", size = 5363131, upload-time = "2025-09-14T22:16:22.206Z" }, + { url = 
"https://files.pythonhosted.org/packages/ef/d6/8c4bd38a3b24c4c7676a7a3d8de85d6ee7a983602a734b9f9cdefb04a5d6/zstandard-0.25.0-cp310-cp310-win32.whl", hash = "sha256:106281ae350e494f4ac8a80470e66d1fe27e497052c8d9c3b95dc4cf1ade81aa", size = 436469, upload-time = "2025-09-14T22:16:25.002Z" }, + { url = "https://files.pythonhosted.org/packages/93/90/96d50ad417a8ace5f841b3228e93d1bb13e6ad356737f42e2dde30d8bd68/zstandard-0.25.0-cp310-cp310-win_amd64.whl", hash = "sha256:ea9d54cc3d8064260114a0bbf3479fc4a98b21dffc89b3459edd506b69262f6e", size = 506100, upload-time = "2025-09-14T22:16:23.569Z" }, + { url = "https://files.pythonhosted.org/packages/2a/83/c3ca27c363d104980f1c9cee1101cc8ba724ac8c28a033ede6aab89585b1/zstandard-0.25.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:933b65d7680ea337180733cf9e87293cc5500cc0eb3fc8769f4d3c88d724ec5c", size = 795254, upload-time = "2025-09-14T22:16:26.137Z" }, + { url = "https://files.pythonhosted.org/packages/ac/4d/e66465c5411a7cf4866aeadc7d108081d8ceba9bc7abe6b14aa21c671ec3/zstandard-0.25.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a3f79487c687b1fc69f19e487cd949bf3aae653d181dfb5fde3bf6d18894706f", size = 640559, upload-time = "2025-09-14T22:16:27.973Z" }, + { url = "https://files.pythonhosted.org/packages/12/56/354fe655905f290d3b147b33fe946b0f27e791e4b50a5f004c802cb3eb7b/zstandard-0.25.0-cp311-cp311-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:0bbc9a0c65ce0eea3c34a691e3c4b6889f5f3909ba4822ab385fab9057099431", size = 5348020, upload-time = "2025-09-14T22:16:29.523Z" }, + { url = "https://files.pythonhosted.org/packages/3b/13/2b7ed68bd85e69a2069bcc72141d378f22cae5a0f3b353a2c8f50ef30c1b/zstandard-0.25.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:01582723b3ccd6939ab7b3a78622c573799d5d8737b534b86d0e06ac18dbde4a", size = 5058126, upload-time = "2025-09-14T22:16:31.811Z" }, + { url = 
"https://files.pythonhosted.org/packages/c9/dd/fdaf0674f4b10d92cb120ccff58bbb6626bf8368f00ebfd2a41ba4a0dc99/zstandard-0.25.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:5f1ad7bf88535edcf30038f6919abe087f606f62c00a87d7e33e7fc57cb69fcc", size = 5405390, upload-time = "2025-09-14T22:16:33.486Z" }, + { url = "https://files.pythonhosted.org/packages/0f/67/354d1555575bc2490435f90d67ca4dd65238ff2f119f30f72d5cde09c2ad/zstandard-0.25.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:06acb75eebeedb77b69048031282737717a63e71e4ae3f77cc0c3b9508320df6", size = 5452914, upload-time = "2025-09-14T22:16:35.277Z" }, + { url = "https://files.pythonhosted.org/packages/bb/1f/e9cfd801a3f9190bf3e759c422bbfd2247db9d7f3d54a56ecde70137791a/zstandard-0.25.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9300d02ea7c6506f00e627e287e0492a5eb0371ec1670ae852fefffa6164b072", size = 5559635, upload-time = "2025-09-14T22:16:37.141Z" }, + { url = "https://files.pythonhosted.org/packages/21/88/5ba550f797ca953a52d708c8e4f380959e7e3280af029e38fbf47b55916e/zstandard-0.25.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:bfd06b1c5584b657a2892a6014c2f4c20e0db0208c159148fa78c65f7e0b0277", size = 5048277, upload-time = "2025-09-14T22:16:38.807Z" }, + { url = "https://files.pythonhosted.org/packages/46/c0/ca3e533b4fa03112facbe7fbe7779cb1ebec215688e5df576fe5429172e0/zstandard-0.25.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:f373da2c1757bb7f1acaf09369cdc1d51d84131e50d5fa9863982fd626466313", size = 5574377, upload-time = "2025-09-14T22:16:40.523Z" }, + { url = "https://files.pythonhosted.org/packages/12/9b/3fb626390113f272abd0799fd677ea33d5fc3ec185e62e6be534493c4b60/zstandard-0.25.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:6c0e5a65158a7946e7a7affa6418878ef97ab66636f13353b8502d7ea03c8097", size = 4961493, upload-time = "2025-09-14T22:16:43.3Z" }, + { url = 
"https://files.pythonhosted.org/packages/cb/d3/23094a6b6a4b1343b27ae68249daa17ae0651fcfec9ed4de09d14b940285/zstandard-0.25.0-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:c8e167d5adf59476fa3e37bee730890e389410c354771a62e3c076c86f9f7778", size = 5269018, upload-time = "2025-09-14T22:16:45.292Z" }, + { url = "https://files.pythonhosted.org/packages/8c/a7/bb5a0c1c0f3f4b5e9d5b55198e39de91e04ba7c205cc46fcb0f95f0383c1/zstandard-0.25.0-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:98750a309eb2f020da61e727de7d7ba3c57c97cf6213f6f6277bb7fb42a8e065", size = 5443672, upload-time = "2025-09-14T22:16:47.076Z" }, + { url = "https://files.pythonhosted.org/packages/27/22/503347aa08d073993f25109c36c8d9f029c7d5949198050962cb568dfa5e/zstandard-0.25.0-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:22a086cff1b6ceca18a8dd6096ec631e430e93a8e70a9ca5efa7561a00f826fa", size = 5822753, upload-time = "2025-09-14T22:16:49.316Z" }, + { url = "https://files.pythonhosted.org/packages/e2/be/94267dc6ee64f0f8ba2b2ae7c7a2df934a816baaa7291db9e1aa77394c3c/zstandard-0.25.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:72d35d7aa0bba323965da807a462b0966c91608ef3a48ba761678cb20ce5d8b7", size = 5366047, upload-time = "2025-09-14T22:16:51.328Z" }, + { url = "https://files.pythonhosted.org/packages/7b/a3/732893eab0a3a7aecff8b99052fecf9f605cf0fb5fb6d0290e36beee47a4/zstandard-0.25.0-cp311-cp311-win32.whl", hash = "sha256:f5aeea11ded7320a84dcdd62a3d95b5186834224a9e55b92ccae35d21a8b63d4", size = 436484, upload-time = "2025-09-14T22:16:55.005Z" }, + { url = "https://files.pythonhosted.org/packages/43/a3/c6155f5c1cce691cb80dfd38627046e50af3ee9ddc5d0b45b9b063bfb8c9/zstandard-0.25.0-cp311-cp311-win_amd64.whl", hash = "sha256:daab68faadb847063d0c56f361a289c4f268706b598afbf9ad113cbe5c38b6b2", size = 506183, upload-time = "2025-09-14T22:16:52.753Z" }, + { url = 
"https://files.pythonhosted.org/packages/8c/3e/8945ab86a0820cc0e0cdbf38086a92868a9172020fdab8a03ac19662b0e5/zstandard-0.25.0-cp311-cp311-win_arm64.whl", hash = "sha256:22a06c5df3751bb7dc67406f5374734ccee8ed37fc5981bf1ad7041831fa1137", size = 462533, upload-time = "2025-09-14T22:16:53.878Z" }, + { url = "https://files.pythonhosted.org/packages/82/fc/f26eb6ef91ae723a03e16eddb198abcfce2bc5a42e224d44cc8b6765e57e/zstandard-0.25.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:7b3c3a3ab9daa3eed242d6ecceead93aebbb8f5f84318d82cee643e019c4b73b", size = 795738, upload-time = "2025-09-14T22:16:56.237Z" }, + { url = "https://files.pythonhosted.org/packages/aa/1c/d920d64b22f8dd028a8b90e2d756e431a5d86194caa78e3819c7bf53b4b3/zstandard-0.25.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:913cbd31a400febff93b564a23e17c3ed2d56c064006f54efec210d586171c00", size = 640436, upload-time = "2025-09-14T22:16:57.774Z" }, + { url = "https://files.pythonhosted.org/packages/53/6c/288c3f0bd9fcfe9ca41e2c2fbfd17b2097f6af57b62a81161941f09afa76/zstandard-0.25.0-cp312-cp312-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:011d388c76b11a0c165374ce660ce2c8efa8e5d87f34996aa80f9c0816698b64", size = 5343019, upload-time = "2025-09-14T22:16:59.302Z" }, + { url = "https://files.pythonhosted.org/packages/1e/15/efef5a2f204a64bdb5571e6161d49f7ef0fffdbca953a615efbec045f60f/zstandard-0.25.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:6dffecc361d079bb48d7caef5d673c88c8988d3d33fb74ab95b7ee6da42652ea", size = 5063012, upload-time = "2025-09-14T22:17:01.156Z" }, + { url = "https://files.pythonhosted.org/packages/b7/37/a6ce629ffdb43959e92e87ebdaeebb5ac81c944b6a75c9c47e300f85abdf/zstandard-0.25.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:7149623bba7fdf7e7f24312953bcf73cae103db8cae49f8154dd1eadc8a29ecb", size = 5394148, upload-time = "2025-09-14T22:17:03.091Z" }, + { url = 
"https://files.pythonhosted.org/packages/e3/79/2bf870b3abeb5c070fe2d670a5a8d1057a8270f125ef7676d29ea900f496/zstandard-0.25.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:6a573a35693e03cf1d67799fd01b50ff578515a8aeadd4595d2a7fa9f3ec002a", size = 5451652, upload-time = "2025-09-14T22:17:04.979Z" }, + { url = "https://files.pythonhosted.org/packages/53/60/7be26e610767316c028a2cbedb9a3beabdbe33e2182c373f71a1c0b88f36/zstandard-0.25.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:5a56ba0db2d244117ed744dfa8f6f5b366e14148e00de44723413b2f3938a902", size = 5546993, upload-time = "2025-09-14T22:17:06.781Z" }, + { url = "https://files.pythonhosted.org/packages/85/c7/3483ad9ff0662623f3648479b0380d2de5510abf00990468c286c6b04017/zstandard-0.25.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:10ef2a79ab8e2974e2075fb984e5b9806c64134810fac21576f0668e7ea19f8f", size = 5046806, upload-time = "2025-09-14T22:17:08.415Z" }, + { url = "https://files.pythonhosted.org/packages/08/b3/206883dd25b8d1591a1caa44b54c2aad84badccf2f1de9e2d60a446f9a25/zstandard-0.25.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:aaf21ba8fb76d102b696781bddaa0954b782536446083ae3fdaa6f16b25a1c4b", size = 5576659, upload-time = "2025-09-14T22:17:10.164Z" }, + { url = "https://files.pythonhosted.org/packages/9d/31/76c0779101453e6c117b0ff22565865c54f48f8bd807df2b00c2c404b8e0/zstandard-0.25.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:1869da9571d5e94a85a5e8d57e4e8807b175c9e4a6294e3b66fa4efb074d90f6", size = 4953933, upload-time = "2025-09-14T22:17:11.857Z" }, + { url = "https://files.pythonhosted.org/packages/18/e1/97680c664a1bf9a247a280a053d98e251424af51f1b196c6d52f117c9720/zstandard-0.25.0-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:809c5bcb2c67cd0ed81e9229d227d4ca28f82d0f778fc5fea624a9def3963f91", size = 5268008, upload-time = "2025-09-14T22:17:13.627Z" }, + { url = 
"https://files.pythonhosted.org/packages/1e/73/316e4010de585ac798e154e88fd81bb16afc5c5cb1a72eeb16dd37e8024a/zstandard-0.25.0-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f27662e4f7dbf9f9c12391cb37b4c4c3cb90ffbd3b1fb9284dadbbb8935fa708", size = 5433517, upload-time = "2025-09-14T22:17:16.103Z" }, + { url = "https://files.pythonhosted.org/packages/5b/60/dd0f8cfa8129c5a0ce3ea6b7f70be5b33d2618013a161e1ff26c2b39787c/zstandard-0.25.0-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:99c0c846e6e61718715a3c9437ccc625de26593fea60189567f0118dc9db7512", size = 5814292, upload-time = "2025-09-14T22:17:17.827Z" }, + { url = "https://files.pythonhosted.org/packages/fc/5f/75aafd4b9d11b5407b641b8e41a57864097663699f23e9ad4dbb91dc6bfe/zstandard-0.25.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:474d2596a2dbc241a556e965fb76002c1ce655445e4e3bf38e5477d413165ffa", size = 5360237, upload-time = "2025-09-14T22:17:19.954Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8d/0309daffea4fcac7981021dbf21cdb2e3427a9e76bafbcdbdf5392ff99a4/zstandard-0.25.0-cp312-cp312-win32.whl", hash = "sha256:23ebc8f17a03133b4426bcc04aabd68f8236eb78c3760f12783385171b0fd8bd", size = 436922, upload-time = "2025-09-14T22:17:24.398Z" }, + { url = "https://files.pythonhosted.org/packages/79/3b/fa54d9015f945330510cb5d0b0501e8253c127cca7ebe8ba46a965df18c5/zstandard-0.25.0-cp312-cp312-win_amd64.whl", hash = "sha256:ffef5a74088f1e09947aecf91011136665152e0b4b359c42be3373897fb39b01", size = 506276, upload-time = "2025-09-14T22:17:21.429Z" }, + { url = "https://files.pythonhosted.org/packages/ea/6b/8b51697e5319b1f9ac71087b0af9a40d8a6288ff8025c36486e0c12abcc4/zstandard-0.25.0-cp312-cp312-win_arm64.whl", hash = "sha256:181eb40e0b6a29b3cd2849f825e0fa34397f649170673d385f3598ae17cca2e9", size = 462679, upload-time = "2025-09-14T22:17:23.147Z" }, + { url = 
"https://files.pythonhosted.org/packages/35/0b/8df9c4ad06af91d39e94fa96cc010a24ac4ef1378d3efab9223cc8593d40/zstandard-0.25.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ec996f12524f88e151c339688c3897194821d7f03081ab35d31d1e12ec975e94", size = 795735, upload-time = "2025-09-14T22:17:26.042Z" }, + { url = "https://files.pythonhosted.org/packages/3f/06/9ae96a3e5dcfd119377ba33d4c42a7d89da1efabd5cb3e366b156c45ff4d/zstandard-0.25.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a1a4ae2dec3993a32247995bdfe367fc3266da832d82f8438c8570f989753de1", size = 640440, upload-time = "2025-09-14T22:17:27.366Z" }, + { url = "https://files.pythonhosted.org/packages/d9/14/933d27204c2bd404229c69f445862454dcc101cd69ef8c6068f15aaec12c/zstandard-0.25.0-cp313-cp313-manylinux2010_i686.manylinux2014_i686.manylinux_2_12_i686.manylinux_2_17_i686.whl", hash = "sha256:e96594a5537722fdfb79951672a2a63aec5ebfb823e7560586f7484819f2a08f", size = 5343070, upload-time = "2025-09-14T22:17:28.896Z" }, + { url = "https://files.pythonhosted.org/packages/6d/db/ddb11011826ed7db9d0e485d13df79b58586bfdec56e5c84a928a9a78c1c/zstandard-0.25.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:bfc4e20784722098822e3eee42b8e576b379ed72cca4a7cb856ae733e62192ea", size = 5063001, upload-time = "2025-09-14T22:17:31.044Z" }, + { url = "https://files.pythonhosted.org/packages/db/00/87466ea3f99599d02a5238498b87bf84a6348290c19571051839ca943777/zstandard-0.25.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:457ed498fc58cdc12fc48f7950e02740d4f7ae9493dd4ab2168a47c93c31298e", size = 5394120, upload-time = "2025-09-14T22:17:32.711Z" }, + { url = "https://files.pythonhosted.org/packages/2b/95/fc5531d9c618a679a20ff6c29e2b3ef1d1f4ad66c5e161ae6ff847d102a9/zstandard-0.25.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:fd7a5004eb1980d3cefe26b2685bcb0b17989901a70a1040d1ac86f1d898c551", size = 5451230, upload-time = 
"2025-09-14T22:17:34.41Z" }, + { url = "https://files.pythonhosted.org/packages/63/4b/e3678b4e776db00f9f7b2fe58e547e8928ef32727d7a1ff01dea010f3f13/zstandard-0.25.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:8e735494da3db08694d26480f1493ad2cf86e99bdd53e8e9771b2752a5c0246a", size = 5547173, upload-time = "2025-09-14T22:17:36.084Z" }, + { url = "https://files.pythonhosted.org/packages/4e/d5/ba05ed95c6b8ec30bd468dfeab20589f2cf709b5c940483e31d991f2ca58/zstandard-0.25.0-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:3a39c94ad7866160a4a46d772e43311a743c316942037671beb264e395bdd611", size = 5046736, upload-time = "2025-09-14T22:17:37.891Z" }, + { url = "https://files.pythonhosted.org/packages/50/d5/870aa06b3a76c73eced65c044b92286a3c4e00554005ff51962deef28e28/zstandard-0.25.0-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:172de1f06947577d3a3005416977cce6168f2261284c02080e7ad0185faeced3", size = 5576368, upload-time = "2025-09-14T22:17:40.206Z" }, + { url = "https://files.pythonhosted.org/packages/5d/35/398dc2ffc89d304d59bc12f0fdd931b4ce455bddf7038a0a67733a25f550/zstandard-0.25.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:3c83b0188c852a47cd13ef3bf9209fb0a77fa5374958b8c53aaa699398c6bd7b", size = 4954022, upload-time = "2025-09-14T22:17:41.879Z" }, + { url = "https://files.pythonhosted.org/packages/9a/5c/36ba1e5507d56d2213202ec2b05e8541734af5f2ce378c5d1ceaf4d88dc4/zstandard-0.25.0-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:1673b7199bbe763365b81a4f3252b8e80f44c9e323fc42940dc8843bfeaf9851", size = 5267889, upload-time = "2025-09-14T22:17:43.577Z" }, + { url = "https://files.pythonhosted.org/packages/70/e8/2ec6b6fb7358b2ec0113ae202647ca7c0e9d15b61c005ae5225ad0995df5/zstandard-0.25.0-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:0be7622c37c183406f3dbf0cba104118eb16a4ea7359eeb5752f0794882fc250", size = 5433952, upload-time = "2025-09-14T22:17:45.271Z" }, + { url = 
"https://files.pythonhosted.org/packages/7b/01/b5f4d4dbc59ef193e870495c6f1275f5b2928e01ff5a81fecb22a06e22fb/zstandard-0.25.0-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:5f5e4c2a23ca271c218ac025bd7d635597048b366d6f31f420aaeb715239fc98", size = 5814054, upload-time = "2025-09-14T22:17:47.08Z" }, + { url = "https://files.pythonhosted.org/packages/b2/e5/fbd822d5c6f427cf158316d012c5a12f233473c2f9c5fe5ab1ae5d21f3d8/zstandard-0.25.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4f187a0bb61b35119d1926aee039524d1f93aaf38a9916b8c4b78ac8514a0aaf", size = 5360113, upload-time = "2025-09-14T22:17:48.893Z" }, + { url = "https://files.pythonhosted.org/packages/8e/e0/69a553d2047f9a2c7347caa225bb3a63b6d7704ad74610cb7823baa08ed7/zstandard-0.25.0-cp313-cp313-win32.whl", hash = "sha256:7030defa83eef3e51ff26f0b7bfb229f0204b66fe18e04359ce3474ac33cbc09", size = 436936, upload-time = "2025-09-14T22:17:52.658Z" }, + { url = "https://files.pythonhosted.org/packages/d9/82/b9c06c870f3bd8767c201f1edbdf9e8dc34be5b0fbc5682c4f80fe948475/zstandard-0.25.0-cp313-cp313-win_amd64.whl", hash = "sha256:1f830a0dac88719af0ae43b8b2d6aef487d437036468ef3c2ea59c51f9d55fd5", size = 506232, upload-time = "2025-09-14T22:17:50.402Z" }, + { url = "https://files.pythonhosted.org/packages/d4/57/60c3c01243bb81d381c9916e2a6d9e149ab8627c0c7d7abb2d73384b3c0c/zstandard-0.25.0-cp313-cp313-win_arm64.whl", hash = "sha256:85304a43f4d513f5464ceb938aa02c1e78c2943b29f44a750b48b25ac999a049", size = 462671, upload-time = "2025-09-14T22:17:51.533Z" }, + { url = "https://files.pythonhosted.org/packages/3d/5c/f8923b595b55fe49e30612987ad8bf053aef555c14f05bb659dd5dbe3e8a/zstandard-0.25.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:e29f0cf06974c899b2c188ef7f783607dbef36da4c242eb6c82dcd8b512855e3", size = 795887, upload-time = "2025-09-14T22:17:54.198Z" }, + { url = 
"https://files.pythonhosted.org/packages/8d/09/d0a2a14fc3439c5f874042dca72a79c70a532090b7ba0003be73fee37ae2/zstandard-0.25.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:05df5136bc5a011f33cd25bc9f506e7426c0c9b3f9954f056831ce68f3b6689f", size = 640658, upload-time = "2025-09-14T22:17:55.423Z" }, + { url = "https://files.pythonhosted.org/packages/5d/7c/8b6b71b1ddd517f68ffb55e10834388d4f793c49c6b83effaaa05785b0b4/zstandard-0.25.0-cp314-cp314-manylinux2010_i686.manylinux_2_12_i686.manylinux_2_28_i686.whl", hash = "sha256:f604efd28f239cc21b3adb53eb061e2a205dc164be408e553b41ba2ffe0ca15c", size = 5379849, upload-time = "2025-09-14T22:17:57.372Z" }, + { url = "https://files.pythonhosted.org/packages/a4/86/a48e56320d0a17189ab7a42645387334fba2200e904ee47fc5a26c1fd8ca/zstandard-0.25.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:223415140608d0f0da010499eaa8ccdb9af210a543fac54bce15babbcfc78439", size = 5058095, upload-time = "2025-09-14T22:17:59.498Z" }, + { url = "https://files.pythonhosted.org/packages/f8/ad/eb659984ee2c0a779f9d06dbfe45e2dc39d99ff40a319895df2d3d9a48e5/zstandard-0.25.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2e54296a283f3ab5a26fc9b8b5d4978ea0532f37b231644f367aa588930aa043", size = 5551751, upload-time = "2025-09-14T22:18:01.618Z" }, + { url = "https://files.pythonhosted.org/packages/61/b3/b637faea43677eb7bd42ab204dfb7053bd5c4582bfe6b1baefa80ac0c47b/zstandard-0.25.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:ca54090275939dc8ec5dea2d2afb400e0f83444b2fc24e07df7fdef677110859", size = 6364818, upload-time = "2025-09-14T22:18:03.769Z" }, + { url = "https://files.pythonhosted.org/packages/31/dc/cc50210e11e465c975462439a492516a73300ab8caa8f5e0902544fd748b/zstandard-0.25.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = 
"sha256:e09bb6252b6476d8d56100e8147b803befa9a12cea144bbe629dd508800d1ad0", size = 5560402, upload-time = "2025-09-14T22:18:05.954Z" }, + { url = "https://files.pythonhosted.org/packages/c9/ae/56523ae9c142f0c08efd5e868a6da613ae76614eca1305259c3bf6a0ed43/zstandard-0.25.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:a9ec8c642d1ec73287ae3e726792dd86c96f5681eb8df274a757bf62b750eae7", size = 4955108, upload-time = "2025-09-14T22:18:07.68Z" }, + { url = "https://files.pythonhosted.org/packages/98/cf/c899f2d6df0840d5e384cf4c4121458c72802e8bda19691f3b16619f51e9/zstandard-0.25.0-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a4089a10e598eae6393756b036e0f419e8c1d60f44a831520f9af41c14216cf2", size = 5269248, upload-time = "2025-09-14T22:18:09.753Z" }, + { url = "https://files.pythonhosted.org/packages/1b/c0/59e912a531d91e1c192d3085fc0f6fb2852753c301a812d856d857ea03c6/zstandard-0.25.0-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:f67e8f1a324a900e75b5e28ffb152bcac9fbed1cc7b43f99cd90f395c4375344", size = 5430330, upload-time = "2025-09-14T22:18:11.966Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/7e31db1240de2df22a58e2ea9a93fc6e38cc29353e660c0272b6735d6669/zstandard-0.25.0-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:9654dbc012d8b06fc3d19cc825af3f7bf8ae242226df5f83936cb39f5fdc846c", size = 5811123, upload-time = "2025-09-14T22:18:13.907Z" }, + { url = "https://files.pythonhosted.org/packages/f6/49/fac46df5ad353d50535e118d6983069df68ca5908d4d65b8c466150a4ff1/zstandard-0.25.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:4203ce3b31aec23012d3a4cf4a2ed64d12fea5269c49aed5e4c3611b938e4088", size = 5359591, upload-time = "2025-09-14T22:18:16.465Z" }, + { url = "https://files.pythonhosted.org/packages/c2/38/f249a2050ad1eea0bb364046153942e34abba95dd5520af199aed86fbb49/zstandard-0.25.0-cp314-cp314-win32.whl", hash = "sha256:da469dc041701583e34de852d8634703550348d5822e66a0c827d39b05365b12", size = 444513, upload-time = 
"2025-09-14T22:18:20.61Z" }, + { url = "https://files.pythonhosted.org/packages/3a/43/241f9615bcf8ba8903b3f0432da069e857fc4fd1783bd26183db53c4804b/zstandard-0.25.0-cp314-cp314-win_amd64.whl", hash = "sha256:c19bcdd826e95671065f8692b5a4aa95c52dc7a02a4c5a0cac46deb879a017a2", size = 516118, upload-time = "2025-09-14T22:18:17.849Z" }, + { url = "https://files.pythonhosted.org/packages/f0/ef/da163ce2450ed4febf6467d77ccb4cd52c4c30ab45624bad26ca0a27260c/zstandard-0.25.0-cp314-cp314-win_arm64.whl", hash = "sha256:d7541afd73985c630bafcd6338d2518ae96060075f9463d7dc14cfb33514383d", size = 476940, upload-time = "2025-09-14T22:18:19.088Z" }, +] From 747ed4491f7fe77b1f99a385309804a3f2cca353 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 00:43:54 +0000 Subject: [PATCH 036/119] add negative result on olmo3 pretraining mix --- dev/LOG.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 5f6e1d7f..2a94daaf 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,14 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-15: Olmo pretraining mix (Negative result) + +I attempted to train on the Olmo 3 pretraining dataset [allenai/dolma3_mix-6T](https://huggingface.co/datasets/allenai/dolma3_mix-6T) instead of FineWeb-edu. I ran into a number of [errors and issues](https://huggingface.co/datasets/allenai/dolma3_mix-6T/discussions/2) trying to both download and process the dataset and then noticed some quality issues (e.g. some documents seem to be extremely short, like "5".). I managed to work around these with some sensible hacks (e.g. reject documents less than 100 characters in length) and tried to process the dataset exactly as FineWeb, re-trained the tokenizer and trained a d16 model. The CORE score decreased from 15.5 to 13.8, i.e. the result is quite a bit worse. 
+ +I am still looking to try the [DCLM dataset](https://arxiv.org/abs/2406.11794), which according to the paper should be better than FineWeb-edu. I do have some concerns that the same group both prepared the DCLM dataset *and* introduced the CORE score so I'm a bit hesitant in case there was some overfitting to CORE score adjacent data distribution. + +Classifying as negative result and reverting back to FineWeb-edu for now. + ## 2026-01-13: Varlen Attention (Negative Result) Attempted to prevent attention from "leaking" across document boundaries using Flash Attention's `flash_attn_varlen_func`, similar to modded-nanogpt's approach. From fbf2bbea25da6ada9d77db4c042937f59205823f Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 02:21:17 +0000 Subject: [PATCH 037/119] update log with a bunch of attempts --- dev/LOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 2a94daaf..d0dc1b18 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,22 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-16: Modded-nanogpt Ideas Sweep (Mostly Negative) + +Tested several architectural ideas from modded-nanogpt to see if they transfer to nanochat. All of these did not help: + +| Idea | Result | Notes | +|------|--------|-------| +| Half-truncated RoPE | No improvement | Only first half of head dims get RoPE (base 1024, linspace). Second half "stationary". | +| Asymmetric softcap | Slightly worse | `23 * sigmoid((x+5)/7.5)` vs our symmetric `15 * tanh(x/15)`. May only help with FP8. | +| Smear gate | Negligible | Blend each token with predecessor via learned gate. Tiny improvement not worth n_embd² params. | +| Backout | No improvement | Save activations at ~60% through network, subtract scaled version at end. | +| Skip connection | Slightly worse | Save at layer ~25%, add at layer ~50%. Also +2GB memory from storing activations.
| + +Value Embeddings do show promise. I need a more elaborate exploration of a few related ideas, which I leave for tomorrow. + +--- + ## 2026-01-15: Olmo pretraining mix (Negative result) I attempted to train on the Olmo 3 pretraining dataset [allenai/dolma3_mix-6T](https://huggingface.co/datasets/allenai/dolma3_mix-6T) instead of FineWeb-edu. I ran into a number of [errors and issues](https://huggingface.co/datasets/allenai/dolma3_mix-6T/discussions/2) trying to both download and process the dataset and then noticed some quality issues (e.g. some documents seem to be extremely short, like "5".). I managed to work around these with some sensible hacks (e.g. reject documents less than 100 characters in length) and tried to process the dataset exactly as FineWeb, re-trained the tokenizer and trained a d16 model. The CORE score decreased from 15.5 to 13.8, i.e. the result is quite a bit worse. @@ -12,6 +28,8 @@ I am still looking to try the [DCLM dataset](https://arxiv.org/abs/2406.11794), Classifying as negative result and reverting back to FineWeb-edu for now. +--- + ## 2026-01-13: Varlen Attention (Negative Result) Attempted to prevent attention from "leaking" across document boundaries using Flash Attention's `flash_attn_varlen_func`, similar to modded-nanogpt's approach. From 50413d2d67b31db8c4f1b593808f98eb23b4401a Mon Sep 17 00:00:00 2001 From: Haoyu Wang <32129905+why2011btv@users.noreply.github.com> Date: Fri, 16 Jan 2026 01:03:42 -0500 Subject: [PATCH 038/119] typo in comments: change "GAPO" to "DAPO" --- scripts/chat_rl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/chat_rl.py b/scripts/chat_rl.py index b0697f36..eb8e48e5 100644 --- a/scripts/chat_rl.py +++ b/scripts/chat_rl.py @@ -6,7 +6,7 @@ simpler and more similar to just REINFORCE: 1) Delete trust region, so there is no KL regularization to a reference model 2) We are on policy, so there's no need for PPO ratio+clip. 
-3) We use GAPO style normalization that is token-level, not sequence-level. +3) We use DAPO style normalization that is token-level, not sequence-level. 4) Instead of z-score normalization (r - mu)/sigma, only use (r - mu) as the advantage. 1 GPU: From 8203efa9190e9d8da7419de29599e91a9242b968 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 17:37:51 +0000 Subject: [PATCH 039/119] implement flash attention 3 fallback to pytorch sdpa by touching as few lines of code as possible in main files and keeping all implementation to a single file. add tests. add helpful warning messages for the user. --- nanochat/gpt.py | 12 +- scripts/base_train.py | 13 ++ tests/test_attention_fallback.py | 338 +++++++++++++++++++++++++++++++ 3 files changed, 354 insertions(+), 9 deletions(-) create mode 100644 tests/test_attention_fallback.py diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 81ccb0ca..86f440bf 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -23,13 +23,8 @@ from nanochat.common import get_dist_info, print0 from nanochat.muon import Muon, DistMuon from nanochat.adamw import DistAdamW -# Load Flash Attention 3 from HuggingFace Hub (and silence the progress bar) -import os -os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" -# Official docs of FA3 label it as "beta" and want you to install FA3 from source, which is a pain. 
-# Wishing for official FA3 wheels soon, for now this seems to be a fast way to get them (ty varunneal) -from kernels import get_kernel -flash_attn = get_kernel('varunneal/flash-attention-3').flash_attn_interface +# Our custom Flash Attention module that automatically uses FA3 on Hopper+ and SDPA fallback elsewhere +from nanochat.flash_attention import flash_attn @dataclass class GPTConfig: @@ -87,8 +82,7 @@ class CausalSelfAttention(nn.Module): q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin) q, k = norm(q), norm(k) # QK norm - # Attention with Flash Attention 3 - # FA3 handles GQA automatically when n_kv_heads < n_heads + # Flash Attention (FA3 on Hopper+, PyTorch SDPA fallback elsewhere) # window_size is (left, right) tuple: (N, 0) for causal, (-1, 0) for full context if kv_cache is None: # Training: causal attention with optional sliding window diff --git a/scripts/base_train.py b/scripts/base_train.py index 5293cd88..c61986e6 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -27,6 +27,7 @@ from nanochat.tokenizer import get_tokenizer, get_token_bytes from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint from nanochat.loss_eval import evaluate_bpb from nanochat.engine import Engine +from nanochat.flash_attention import HAS_FA3 from scripts.base_eval import evaluate_model print_banner() @@ -86,6 +87,18 @@ get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else l use_dummy_wandb = args.run == "dummy" or not master_process wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", name=args.run, config=user_config) +# Flash Attention status +if HAS_FA3: + print0("✓ Using Flash Attention 3 (Hopper GPU detected), efficient, new and awesome.") +else: + print0("!" 
* 80) + print0("WARNING: Flash Attention 3 not available, using PyTorch SDPA fallback") + print0("WARNING: Training will be less efficient without FA3") + if args.window_pattern != "L": + print0(f"WARNING: SDPA has no support for sliding window attention (window_pattern='{args.window_pattern}'). Your GPU utilization will be terrible.") + print0("WARNING: Recommend using --window-pattern L for full context attention without alternating sliding window patterns.") + print0("!" * 80) + # Tokenizer will be useful for evaluation, also we need the vocab size tokenizer = get_tokenizer() token_bytes = get_token_bytes(device=device) diff --git a/tests/test_attention_fallback.py b/tests/test_attention_fallback.py new file mode 100644 index 00000000..2cf3ed77 --- /dev/null +++ b/tests/test_attention_fallback.py @@ -0,0 +1,338 @@ +""" +Test Flash Attention unified interface - verify FA3 and SDPA produce identical results. + +Run: python -m pytest tests/test_attention_fallback.py -v -s + +Note on test structure: + Tests are split into two classes due to dtype/device constraints: + + 1. TestFA3VsSDPA: Comparison tests that run both FA3 and SDPA on the same inputs + and verify they produce identical results. These require a Hopper GPU (FA3 only + works on sm90+) and use bfloat16 (FA3 doesn't support float32). + + 2. TestSDPAOnly: Tests that only exercise the SDPA fallback path. These can run + on any device (CUDA, CPU, MPS) with the appropriate dtype for that device. 
+""" +import torch +import pytest +import nanochat.flash_attention as fa_module +from nanochat.flash_attention import flash_attn, HAS_FA3 +from nanochat.engine import KVCache + + +def set_impl(impl): + """Set the implementation override ('fa3', 'sdpa', or None for auto).""" + fa_module._override_impl = impl + + +def run_both_impls(fn): + """Run a function with both FA3 and SDPA, return both outputs.""" + set_impl('fa3') + out_fa3 = fn() + set_impl('sdpa') + out_sdpa = fn() + set_impl(None) # reset + return out_fa3, out_sdpa + + +def assert_close(t1, t2, name, atol=1e-2, rtol=1e-2): + """Assert two tensors are close, with helpful error message.""" + max_diff = (t1 - t2).abs().max().item() + mean_diff = (t1 - t2).abs().mean().item() + assert torch.allclose(t1, t2, atol=atol, rtol=rtol), \ + f"{name}: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}" + return max_diff, mean_diff + + +# ============================================================================= +# FA3 vs SDPA comparison tests (require Hopper GPU) +# ============================================================================= +@pytest.mark.skipif(not HAS_FA3, reason="FA3 required to compare implementations") +class TestFA3VsSDPA: + """Compare FA3 and SDPA produce identical results. 
Requires Hopper GPU.""" + + DEVICE = "cuda" + DTYPE = torch.bfloat16 + + def test_basic_causal(self): + """Basic causal attention.""" + B, T, H, D = 2, 64, 4, 32 + q = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + return flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(T, 0)) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "basic_causal") + print(f"basic_causal: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + def test_full_context(self): + """Full context (window_size=-1).""" + B, T, H, D = 2, 128, 4, 32 + q = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + return flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(-1, -1)) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "full_context") + print(f"full_context: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + def test_sliding_window(self): + """Sliding window attention.""" + B, T, H, D = 2, 128, 4, 32 + window = 32 + q = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + return flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(window, 0)) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "sliding_window") + print(f"sliding_window: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + def test_gqa(self): + """Group Query Attention (fewer KV heads than Q heads).""" + B, T, D = 2, 64, 32 + n_heads = 8 + n_kv_heads = 2 + + q = torch.randn(B, T, 
n_heads, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T, n_kv_heads, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T, n_kv_heads, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + return flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(T, 0)) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "gqa") + print(f"gqa: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + def test_larger_model(self): + """Larger dimensions closer to real model.""" + B, T, H, D = 4, 256, 12, 64 + q = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + return flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(-1, -1)) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "larger_model") + print(f"larger_model: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + def test_kvcache_prefill(self): + """Test prefill (inserting multiple tokens into empty cache).""" + B, T_max, H, D = 2, 64, 4, 32 + T_prefill = 16 + + q = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + k_cache = torch.zeros(B, T_max, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_cache = torch.zeros(B, T_max, H, D, device=self.DEVICE, dtype=self.DTYPE) + cache_seqlens = torch.zeros(B, dtype=torch.int32, device=self.DEVICE) + return flash_attn.flash_attn_with_kvcache( + q, k_cache, v_cache, k=k, v=v, + cache_seqlens=cache_seqlens, + causal=True, window_size=(T_max, 0) + ) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "prefill") + print(f"prefill: max_diff={max_diff:.6f}, 
mean_diff={mean_diff:.6f}") + + def test_kvcache_single_token(self): + """Test single token generation (cache already has content).""" + B, T_max, H, D = 2, 64, 4, 32 + T_prefill = 16 + + k_init = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_init = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + q_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + k_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + k_cache = torch.zeros(B, T_max, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_cache = torch.zeros(B, T_max, H, D, device=self.DEVICE, dtype=self.DTYPE) + k_cache[:, :T_prefill, :, :] = k_init + v_cache[:, :T_prefill, :, :] = v_init + cache_seqlens = torch.full((B,), T_prefill, dtype=torch.int32, device=self.DEVICE) + return flash_attn.flash_attn_with_kvcache( + q_single, k_cache, v_cache, k=k_single, v=v_single, + cache_seqlens=cache_seqlens, + causal=True, window_size=(T_max, 0) + ) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "single_token") + print(f"single_token: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + def test_backward_gradients_match(self): + """Verify gradients are similar between FA3 and SDPA.""" + B, T, H, D = 2, 32, 4, 16 + + q_data = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + k_data = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_data = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + q = q_data.clone().requires_grad_(True) + k = k_data.clone().requires_grad_(True) + v = v_data.clone().requires_grad_(True) + y = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(T, 0)) + loss = y.sum() + loss.backward() + return y.detach(), q.grad.detach(), k.grad.detach(), v.grad.detach() + + set_impl('fa3') + y_fa3, q_grad_fa3, k_grad_fa3, 
v_grad_fa3 = run() + set_impl('sdpa') + y_sdpa, q_grad_sdpa, k_grad_sdpa, v_grad_sdpa = run() + set_impl(None) + + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "backward_output") + print(f"backward_output: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + max_diff, mean_diff = assert_close(q_grad_fa3, q_grad_sdpa, "q_grad", atol=0.05, rtol=0.05) + print(f"q_grad: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + max_diff, mean_diff = assert_close(k_grad_fa3, k_grad_sdpa, "k_grad", atol=0.05, rtol=0.05) + print(f"k_grad: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + max_diff, mean_diff = assert_close(v_grad_fa3, v_grad_sdpa, "v_grad", atol=0.05, rtol=0.05) + print(f"v_grad: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + + +# ============================================================================= +# SDPA-only tests (run on any device) +# ============================================================================= +class TestSDPAOnly: + """Test SDPA fallback works correctly. 
Runs on any device.""" + + DEVICE = "cuda" if torch.cuda.is_available() else "cpu" + DTYPE = torch.bfloat16 if torch.cuda.is_available() else torch.float32 + + def test_basic_forward(self): + """Test SDPA forward pass produces valid output.""" + set_impl('sdpa') + B, T, H, D = 2, 64, 4, 32 + q = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE) + + y = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(T, 0)) + + assert y.shape == (B, T, H, D) + assert not torch.isnan(y).any(), "Output contains NaN" + set_impl(None) + + def test_backward(self): + """Test gradients flow through SDPA.""" + set_impl('sdpa') + B, T, H, D = 2, 32, 4, 16 + q = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE, requires_grad=True) + k = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE, requires_grad=True) + v = torch.randn(B, T, H, D, device=self.DEVICE, dtype=self.DTYPE, requires_grad=True) + + y = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=(T, 0)) + loss = y.sum() + loss.backward() + + assert q.grad is not None, "No gradient for q" + assert k.grad is not None, "No gradient for k" + assert v.grad is not None, "No gradient for v" + assert not torch.isnan(q.grad).any(), "NaN in q gradient" + set_impl(None) + + def test_kvcache(self): + """Test SDPA with KV cache.""" + set_impl('sdpa') + B, T_max, H, D = 2, 64, 4, 32 + n_layers = 1 + + cache = KVCache( + batch_size=B, num_heads=H, seq_len=T_max, head_dim=D, + num_layers=n_layers, device=self.DEVICE, dtype=self.DTYPE + ) + k_cache, v_cache = cache.get_layer_cache(0) + + # Prefill + T_prefill = 16 + q = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + k = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + v = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + + y = 
flash_attn.flash_attn_with_kvcache( + q, k_cache, v_cache, k=k, v=v, + cache_seqlens=cache.cache_seqlens, + causal=True, window_size=(T_max, 0) + ) + cache.advance(T_prefill) + + assert y.shape == (B, T_prefill, H, D) + assert cache.get_pos() == T_prefill + + # Generate single token + q_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + k_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + + y_single = flash_attn.flash_attn_with_kvcache( + q_single, k_cache, v_cache, k=k_single, v=v_single, + cache_seqlens=cache.cache_seqlens, + causal=True, window_size=(T_max, 0) + ) + cache.advance(1) + + assert y_single.shape == (B, 1, H, D) + assert cache.get_pos() == T_prefill + 1 + set_impl(None) + + +# ============================================================================= +# Override mechanism tests +# ============================================================================= +class TestOverrideMechanism: + """Test that the override mechanism works correctly.""" + + @pytest.mark.skipif(not HAS_FA3, reason="FA3 required") + def test_override_fa3(self): + """Test that override='fa3' uses FA3.""" + set_impl('fa3') + assert fa_module._use_fa3() == True + set_impl(None) + + def test_override_sdpa(self): + """Test that override='sdpa' uses SDPA.""" + set_impl('sdpa') + assert fa_module._use_fa3() == False + set_impl(None) + + def test_override_auto(self): + """Test that override=None uses auto-detection.""" + set_impl(None) + assert fa_module._use_fa3() == HAS_FA3 + + +if __name__ == "__main__": + print(f"PyTorch version: {torch.__version__}") + print(f"CUDA available: {torch.cuda.is_available()}") + if torch.cuda.is_available(): + print(f"CUDA device: {torch.cuda.get_device_name()}") + major, minor = torch.cuda.get_device_capability() + print(f"Compute capability: {major}.{minor}") + print(f"HAS_FA3: {HAS_FA3}") + print() + + pytest.main([__file__, "-v", 
"-s"]) From b62a5bc44aafef02eba6c39236180aa424a82674 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 17:39:41 +0000 Subject: [PATCH 040/119] naturally i failed to include the actual code in the previous commit facepalm --- nanochat/flash_attention.py | 178 ++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) create mode 100644 nanochat/flash_attention.py diff --git a/nanochat/flash_attention.py b/nanochat/flash_attention.py new file mode 100644 index 00000000..5d27e5f9 --- /dev/null +++ b/nanochat/flash_attention.py @@ -0,0 +1,178 @@ +""" +Unified Flash Attention interface with automatic FA3/SDPA switching. + +Exports `flash_attn` module that matches the FA3 API exactly, but falls back +to PyTorch SDPA on non-Hopper GPUs, MPS, and CPU. + +Usage (drop-in replacement for FA3): + from nanochat.flash_attention import flash_attn + + # Training (no KV cache) + y = flash_attn.flash_attn_func(q, k, v, causal=True, window_size=window_size) + + # Inference (with KV cache) + y = flash_attn.flash_attn_with_kvcache(q, k_cache, v_cache, k=k, v=v, ...) 
+""" +import torch +import torch.nn.functional as F + + +# ============================================================================= +# Detection: Try to load FA3 on Hopper+ GPUs +# ============================================================================= +def _load_flash_attention_3(): + """Try to load Flash Attention 3 (requires Hopper+ GPU).""" + if not torch.cuda.is_available(): + return None + try: + major, _ = torch.cuda.get_device_capability() + if major < 9: # Hopper is sm90 + return None + import os + os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" + from kernels import get_kernel + return get_kernel('varunneal/flash-attention-3').flash_attn_interface + except Exception: + return None + + +_fa3 = _load_flash_attention_3() +HAS_FA3 = _fa3 is not None + +# Override for testing: set to 'fa3', 'sdpa', or None (auto) +_override_impl = None + + +def _use_fa3(): + """Determine whether to use FA3 based on availability and override.""" + if _override_impl == 'fa3': + assert HAS_FA3, "Cannot override to FA3: not available on this hardware" + return True + if _override_impl == 'sdpa': + return False + return HAS_FA3 # auto + + +# ============================================================================= +# SDPA helpers +# ============================================================================= +def _sdpa_attention(q, k, v, window_size, enable_gqa): + """ + SDPA attention with sliding window support. + q, k, v are (B, H, T, D) format. 
+ """ + Tq = q.size(2) + Tk = k.size(2) + window = window_size[0] + + # Full context, same length + if (window < 0 or window >= Tq) and Tq == Tk: + return F.scaled_dot_product_attention(q, k, v, is_causal=True, enable_gqa=enable_gqa) + + # Single token generation + if Tq == 1: + return F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=enable_gqa) + + # Need explicit mask + device = q.device + if Tq == Tk: + # Causal + sliding window + mask = torch.tril(torch.ones(Tq, Tk, device=device, dtype=torch.bool)) + if window > 0 and window < Tq: + row_idx = torch.arange(Tq, device=device).unsqueeze(1) + col_idx = torch.arange(Tk, device=device).unsqueeze(0) + mask = mask & ((row_idx - col_idx) <= window) + else: + # Chunk inference: attend to prefix + causal within chunk + prefix_len = Tk - Tq + mask = torch.zeros(Tq, Tk, device=device, dtype=torch.bool) + mask[:, :prefix_len] = True + mask[:, prefix_len:] = torch.tril(torch.ones(Tq, Tq, device=device, dtype=torch.bool)) + + return F.scaled_dot_product_attention(q, k, v, attn_mask=mask, enable_gqa=enable_gqa) + + +# ============================================================================= +# Public API: Same interface as FA3 +# ============================================================================= +def flash_attn_func(q, k, v, causal=False, window_size=(-1, -1)): + """ + Flash Attention for training (no KV cache). + + Args: + q, k, v: Tensors of shape (B, T, H, D) + causal: Whether to use causal masking + window_size: (left, right) sliding window. -1 means unlimited. 
+ + Returns: + Output tensor of shape (B, T, H, D) + """ + if _use_fa3(): + return _fa3.flash_attn_func(q, k, v, causal=causal, window_size=window_size) + + # SDPA fallback: transpose (B, T, H, D) -> (B, H, T, D) + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + enable_gqa = q.size(1) != k.size(1) + y = _sdpa_attention(q, k, v, window_size, enable_gqa) + return y.transpose(1, 2) # back to (B, T, H, D) + + +def flash_attn_with_kvcache(q, k_cache, v_cache, k=None, v=None, cache_seqlens=None, + causal=False, window_size=(-1, -1)): + """ + Flash Attention with KV cache for inference. + + FA3 updates k_cache/v_cache in-place. Our SDPA fallback does the same. + + Args: + q: Queries, shape (B, T_new, H, D) + k_cache, v_cache: Pre-allocated cache tensors, shape (B, T_max, H_kv, D) + k, v: New keys/values to insert, shape (B, T_new, H_kv, D) + cache_seqlens: Current position in cache, shape (B,) int32 + causal: Whether to use causal masking + window_size: (left, right) sliding window. -1 means unlimited. 
+ + Returns: + Output tensor of shape (B, T_new, H, D) + """ + if _use_fa3(): + return _fa3.flash_attn_with_kvcache( + q, k_cache, v_cache, k=k, v=v, cache_seqlens=cache_seqlens, + causal=causal, window_size=window_size + ) + + # SDPA fallback: manually manage KV cache + B, T_new, H, D = q.shape + pos = cache_seqlens[0].item() # assume uniform position across batch + + # Insert new k, v into cache (in-place, matching FA3 behavior) + if k is not None and v is not None: + k_cache[:, pos:pos+T_new, :, :] = k + v_cache[:, pos:pos+T_new, :, :] = v + + # Get full cache up to current position + new tokens + end_pos = pos + T_new + k_full = k_cache[:, :end_pos, :, :] + v_full = v_cache[:, :end_pos, :, :] + + # Transpose to SDPA layout: (B, T, H, D) -> (B, H, T, D) + q_sdpa = q.transpose(1, 2) + k_sdpa = k_full.transpose(1, 2) + v_sdpa = v_full.transpose(1, 2) + + enable_gqa = q_sdpa.size(1) != k_sdpa.size(1) + y_sdpa = _sdpa_attention(q_sdpa, k_sdpa, v_sdpa, window_size, enable_gqa) + + return y_sdpa.transpose(1, 2) # back to (B, T, H, D) + + +# ============================================================================= +# Export: flash_attn module interface (drop-in replacement for FA3) +# ============================================================================= +from types import SimpleNamespace +flash_attn = SimpleNamespace( + flash_attn_func=flash_attn_func, + flash_attn_with_kvcache=flash_attn_with_kvcache, +) From 184d4c12b1d01b098aeffa021be5168e454afad0 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 18:25:04 +0000 Subject: [PATCH 041/119] also add to log about the FA3 changes --- dev/LOG.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index d0dc1b18..ae518c83 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,50 @@ A running summary documenting some experiments and findings. 
Started ~Jan 7 2026 --- +## 2026-01-16: Flash Attention 3 Fallback to SDPA + +Added automatic fallback from Flash Attention 3 to PyTorch's `scaled_dot_product_attention` (SDPA) for users without Hopper GPUs. This enables nanochat to run on older CUDA GPUs, CPU, and MPS (Apple Silicon). + +### Implementation + +Created `nanochat/flash_attention.py` - a unified interface that: +- Detects FA3 availability at import time (requires sm90+ / Hopper) +- Exports a `flash_attn` object matching FA3's API exactly (`flash_attn.flash_attn_func`, `flash_attn.flash_attn_with_kvcache`) +- Automatically routes to FA3 or SDPA based on hardware +- Handles tensor layout differences: FA3 uses (B, T, H, D), SDPA uses (B, H, T, D) +- Implements sliding window attention via explicit masks for SDPA +- Manages KV cache manually for SDPA (FA3 does it in-place) + +### Changes to Existing Files + +Changes to existing code were intentionally kept extremely minimal. + +**gpt.py**: Only the import line changed and a comment + +**engine.py**: Zero changes needed + +**base_train.py**: Added status print and warnings: +- Prints whether FA3 or SDPA fallback is being used +- Warns about efficiency loss without FA3 +- Warns about sliding window support if `--window-pattern` is not "L" + +### Testing + +Tests are split into two classes due to dtype/device constraints: + +1. **TestFA3VsSDPA**: Comparison tests requiring Hopper GPU + bfloat16. Run both implementations on identical inputs and verify outputs match (max diff typically 0, at most ~0.004 for sliding window). + +2. **TestSDPAOnly**: SDPA-only tests that run on any device with appropriate dtype. Verify forward pass, backward pass, and KV cache work correctly. + +Added `_override_impl` mechanism for testing - can force 'fa3' or 'sdpa' to directly compare implementations. 
+ +### Notes + +- SDPA fallback is significantly slower than FA3, especially because it has no native sliding window attention support (the fallback emulates it with an explicit attention mask) +- Recommend `--window-pattern L` (full context) when using SDPA fallback + +--- + ## 2026-01-16: Modded-nanogpt Ideas Sweep (Mostly Negative) Tested several architectural ideas from modded-nanogpt to see if they transfer to nanochat. All of these did not help: From e3f58b838e98a5ea013a3c1773fde9d4a3c5d090 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 20:59:42 +0000 Subject: [PATCH 042/119] ranked version --- nanochat/gpt.py | 48 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 8 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 86f440bf..ffb7862a 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -68,7 +68,7 @@ class CausalSelfAttention(nn.Module): self.c_v = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False) - def forward(self, x, cos_sin, window_size, kv_cache): + def forward(self, x, cos_sin, window_size, kv_cache, v0, v0_lambda): B, T, C = x.size() # Project the input to get queries, keys, and values @@ -77,6 +77,11 @@ class CausalSelfAttention(nn.Module): k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim) v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim) + # Value residual (ResFormer): mix in projected initial embedding for later layers + if v0 is not None: + v0_reshaped = v0.view(B, T, self.n_kv_head, self.head_dim) + v = v + v0_lambda * v0_reshaped + # Apply Rotary Embeddings to queries and keys to get relative positional encoding cos, sin = cos_sin q, k = apply_rotary_emb(q, cos, sin), apply_rotary_emb(k, cos, sin) @@ -126,8 +131,8 @@ class Block(nn.Module): self.attn = CausalSelfAttention(config, layer_idx) self.mlp = MLP(config) - def forward(self, x, cos_sin, window_size, kv_cache): - x = x + self.attn(norm(x), cos_sin, window_size, kv_cache) + def
forward(self, x, cos_sin, window_size, kv_cache, v0, v0_lambda): + x = x + self.attn(norm(x), cos_sin, window_size, kv_cache, v0, v0_lambda) x = x + self.mlp(norm(x)) return x @@ -160,6 +165,17 @@ class GPT(nn.Module): # Separate parameters so they can have different optimizer treatment self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() + # Value residual (ResFormer-style): low-rank factorized embedding for value residual + # Paper: "Value Residual Learning" (arXiv:2410.17897) shows this improves information flow + # We apply to last 1/4 of layers as the paper shows later layers benefit most + # Low-rank factorization: (vocab, r) @ (r, kv_dim) instead of full (vocab, kv_dim) + head_dim = config.n_embd // config.n_head + kv_dim = config.n_kv_head * head_dim + value_rank = 32 # low-rank bottleneck dimension + self.value_embed_A = nn.Embedding(padded_vocab_size, value_rank) # token -> low-rank + self.value_embed_B = nn.Linear(value_rank, kv_dim, bias=False) # low-rank -> kv_dim + self.v0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() + self.value_residual_start = config.n_layer - config.n_layer // 4 # last 1/4 of layers # To support meta device initialization, we init the rotary embeddings here, but it's just "fake" meta tensors only. # As for rotary_seq_len, these rotary embeddings are pretty small/cheap in memory, # so let's just over-compute them by 10X, but assert fail if we ever reach that amount. 
@@ -204,15 +220,21 @@ class GPT(nn.Module): with torch.no_grad(): self.resid_lambdas.fill_(1.0) # 1.0 => typical residual connections at init self.x0_lambdas.fill_(0.0) # 0.0 => skip connection to input is disabled at init + self.v0_lambdas.fill_(0.0) # 0.0 => value residual is disabled at init + + # Value embedding low-rank factors (init like embeddings/projections) + torch.nn.init.normal_(self.value_embed_A.weight, mean=0.0, std=1.0) # like wte + torch.nn.init.uniform_(self.value_embed_B.weight, -s, s) # like c_v # Rotary embeddings head_dim = self.config.n_embd // self.config.n_head cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) self.cos, self.sin = cos, sin - # Cast token embeddings to bf16: optimizer can tolerate it and it saves memory + # Cast embeddings to bf16: optimizer can tolerate it and it saves memory if self.transformer.wte.weight.device.type == "cuda": self.transformer.wte.to(dtype=torch.bfloat16) + self.value_embed_A.to(dtype=torch.bfloat16) def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): # TODO: bump base theta more? e.g. 
100K is more common more recently @@ -277,7 +299,8 @@ class GPT(nn.Module): """ nparams = sum(p.numel() for p in self.parameters()) # Exclude non-matmul params: embeddings and per-layer scalars - nparams_exclude = self.transformer.wte.weight.numel() + self.resid_lambdas.numel() + self.x0_lambdas.numel() + nparams_exclude = (self.transformer.wte.weight.numel() + self.value_embed_A.weight.numel() + + self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.v0_lambdas.numel()) h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len # Sum attention FLOPs per layer, accounting for sliding window attn_flops = 0 @@ -303,13 +326,16 @@ class GPT(nn.Module): def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5): model_dim = self.config.n_embd ddp, rank, local_rank, world_size = get_dist_info() - # Separate out all parameters into 5 groups (matrix, embedding, lm_head, resid_lambdas, x0_lambdas) + # Separate out all parameters into groups (matrix, embedding, lm_head, value_embed, resid_lambdas, x0_lambdas, v0_lambdas) matrix_params = list(self.transformer.h.parameters()) embedding_params = list(self.transformer.wte.parameters()) lm_head_params = list(self.lm_head.parameters()) + value_embed_A_params = list(self.value_embed_A.parameters()) + value_embed_B_params = list(self.value_embed_B.parameters()) resid_params = [self.resid_lambdas] x0_params = [self.x0_lambdas] - assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(resid_params) + len(x0_params) + v0_params = [self.v0_lambdas] + assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embed_A_params) + len(value_embed_B_params) + len(resid_params) + len(x0_params) + len(v0_params) # Create the AdamW optimizer for the embedding, lm_head, and per-layer scalars # Scale the LR for the 
AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 @@ -317,8 +343,11 @@ class GPT(nn.Module): adam_groups = [ dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), + dict(params=value_embed_A_params, lr=embedding_lr * dmodel_lr_scale), # low-rank embedding + dict(params=value_embed_B_params, lr=embedding_lr * dmodel_lr_scale), # low-rank projection dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream dict(params=x0_params, lr=scalar_lr), + dict(params=v0_params, lr=scalar_lr), ] adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight decay is hardcoded to 0.0 for AdamW, only used in Muon AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) @@ -349,9 +378,12 @@ class GPT(nn.Module): x = self.transformer.wte(idx) x = norm(x) x0 = x # save initial normalized embedding for x0 residual + # Value residual (ResFormer): low-rank factorized embedding for later layers + v0 = self.value_embed_B(self.value_embed_A(idx)) # (B, T, kv_dim) for i, block in enumerate(self.transformer.h): x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 - x = block(x, cos_sin, self.window_sizes[i], kv_cache) + v0_for_layer = v0 if i >= self.value_residual_start else None + x = block(x, cos_sin, self.window_sizes[i], kv_cache, v0_for_layer, self.v0_lambdas[i]) x = norm(x) # Forward the lm_head (compute logits) From 0b58d70e9975d42b4357dfb33f321f764759af9f Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 21:16:47 +0000 Subject: [PATCH 043/119] full ve version works very well --- nanochat/gpt.py | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index ffb7862a..0356413d 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -165,15 
+165,12 @@ class GPT(nn.Module): # Separate parameters so they can have different optimizer treatment self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() - # Value residual (ResFormer-style): low-rank factorized embedding for value residual + # Value residual (ResFormer-style): separate embedding for values, mixed into later layers # Paper: "Value Residual Learning" (arXiv:2410.17897) shows this improves information flow # We apply to last 1/4 of layers as the paper shows later layers benefit most - # Low-rank factorization: (vocab, r) @ (r, kv_dim) instead of full (vocab, kv_dim) head_dim = config.n_embd // config.n_head kv_dim = config.n_kv_head * head_dim - value_rank = 32 # low-rank bottleneck dimension - self.value_embed_A = nn.Embedding(padded_vocab_size, value_rank) # token -> low-rank - self.value_embed_B = nn.Linear(value_rank, kv_dim, bias=False) # low-rank -> kv_dim + self.value_embed = nn.Embedding(padded_vocab_size, kv_dim) self.v0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() self.value_residual_start = config.n_layer - config.n_layer // 4 # last 1/4 of layers # To support meta device initialization, we init the rotary embeddings here, but it's just "fake" meta tensors only. 
@@ -222,9 +219,8 @@ class GPT(nn.Module): self.x0_lambdas.fill_(0.0) # 0.0 => skip connection to input is disabled at init self.v0_lambdas.fill_(0.0) # 0.0 => value residual is disabled at init - # Value embedding low-rank factors (init like embeddings/projections) - torch.nn.init.normal_(self.value_embed_A.weight, mean=0.0, std=1.0) # like wte - torch.nn.init.uniform_(self.value_embed_B.weight, -s, s) # like c_v + # Value embedding (init like c_v: uniform with same std) + torch.nn.init.uniform_(self.value_embed.weight, -s, s) # Rotary embeddings head_dim = self.config.n_embd // self.config.n_head @@ -234,7 +230,7 @@ class GPT(nn.Module): # Cast embeddings to bf16: optimizer can tolerate it and it saves memory if self.transformer.wte.weight.device.type == "cuda": self.transformer.wte.to(dtype=torch.bfloat16) - self.value_embed_A.to(dtype=torch.bfloat16) + self.value_embed.to(dtype=torch.bfloat16) def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): # TODO: bump base theta more? e.g. 
100K is more common more recently @@ -299,7 +295,7 @@ class GPT(nn.Module): """ nparams = sum(p.numel() for p in self.parameters()) # Exclude non-matmul params: embeddings and per-layer scalars - nparams_exclude = (self.transformer.wte.weight.numel() + self.value_embed_A.weight.numel() + + nparams_exclude = (self.transformer.wte.weight.numel() + self.value_embed.weight.numel() + self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.v0_lambdas.numel()) h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len # Sum attention FLOPs per layer, accounting for sliding window @@ -330,12 +326,11 @@ class GPT(nn.Module): matrix_params = list(self.transformer.h.parameters()) embedding_params = list(self.transformer.wte.parameters()) lm_head_params = list(self.lm_head.parameters()) - value_embed_A_params = list(self.value_embed_A.parameters()) - value_embed_B_params = list(self.value_embed_B.parameters()) + value_embed_params = list(self.value_embed.parameters()) resid_params = [self.resid_lambdas] x0_params = [self.x0_lambdas] v0_params = [self.v0_lambdas] - assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embed_A_params) + len(value_embed_B_params) + len(resid_params) + len(x0_params) + len(v0_params) + assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embed_params) + len(resid_params) + len(x0_params) + len(v0_params) # Create the AdamW optimizer for the embedding, lm_head, and per-layer scalars # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 @@ -343,8 +338,7 @@ class GPT(nn.Module): adam_groups = [ dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), - dict(params=value_embed_A_params, lr=embedding_lr * dmodel_lr_scale), # 
low-rank embedding - dict(params=value_embed_B_params, lr=embedding_lr * dmodel_lr_scale), # low-rank projection + dict(params=value_embed_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream dict(params=x0_params, lr=scalar_lr), dict(params=v0_params, lr=scalar_lr), @@ -378,8 +372,8 @@ class GPT(nn.Module): x = self.transformer.wte(idx) x = norm(x) x0 = x # save initial normalized embedding for x0 residual - # Value residual (ResFormer): low-rank factorized embedding for later layers - v0 = self.value_embed_B(self.value_embed_A(idx)) # (B, T, kv_dim) + # Value residual (ResFormer): separate value embedding for later layers + v0 = self.value_embed(idx) # (B, T, kv_dim) for i, block in enumerate(self.transformer.h): x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 v0_for_layer = v0 if i >= self.value_residual_start else None From 9a88194c3f684a3418c0c0f4069e6f3b3af10736 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 22:08:52 +0000 Subject: [PATCH 044/119] simply one VE per layer, works best --- nanochat/gpt.py | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 0356413d..ea7a4d86 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -165,14 +165,12 @@ class GPT(nn.Module): # Separate parameters so they can have different optimizer treatment self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() - # Value residual (ResFormer-style): separate embedding for values, mixed into later layers + # Value residual (ResFormer-style): every layer gets its own value embedding # Paper: "Value Residual Learning" (arXiv:2410.17897) shows this improves information flow - # 
We apply to last 1/4 of layers as the paper shows later layers benefit most head_dim = config.n_embd // config.n_head kv_dim = config.n_kv_head * head_dim - self.value_embed = nn.Embedding(padded_vocab_size, kv_dim) - self.v0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() - self.value_residual_start = config.n_layer - config.n_layer // 4 # last 1/4 of layers + self.value_embeds = nn.ModuleList([nn.Embedding(padded_vocab_size, kv_dim) for _ in range(config.n_layer)]) + self.v0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() # To support meta device initialization, we init the rotary embeddings here, but it's just "fake" meta tensors only. # As for rotary_seq_len, these rotary embeddings are pretty small/cheap in memory, # so let's just over-compute them by 10X, but assert fail if we ever reach that amount. @@ -219,8 +217,9 @@ class GPT(nn.Module): self.x0_lambdas.fill_(0.0) # 0.0 => skip connection to input is disabled at init self.v0_lambdas.fill_(0.0) # 0.0 => value residual is disabled at init - # Value embedding (init like c_v: uniform with same std) - torch.nn.init.uniform_(self.value_embed.weight, -s, s) + # Value embeddings (init like c_v: uniform with same std) + for ve in self.value_embeds: + torch.nn.init.uniform_(ve.weight, -s, s) # Rotary embeddings head_dim = self.config.n_embd // self.config.n_head @@ -230,7 +229,8 @@ class GPT(nn.Module): # Cast embeddings to bf16: optimizer can tolerate it and it saves memory if self.transformer.wte.weight.device.type == "cuda": self.transformer.wte.to(dtype=torch.bfloat16) - self.value_embed.to(dtype=torch.bfloat16) + for ve in self.value_embeds: + ve.to(dtype=torch.bfloat16) def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): # TODO: bump base theta more? e.g. 
100K is more common more recently @@ -295,7 +295,8 @@ class GPT(nn.Module): """ nparams = sum(p.numel() for p in self.parameters()) # Exclude non-matmul params: embeddings and per-layer scalars - nparams_exclude = (self.transformer.wte.weight.numel() + self.value_embed.weight.numel() + + value_embeds_numel = sum(ve.weight.numel() for ve in self.value_embeds) + nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel + self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.v0_lambdas.numel()) h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len # Sum attention FLOPs per layer, accounting for sliding window @@ -322,15 +323,15 @@ class GPT(nn.Module): def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5): model_dim = self.config.n_embd ddp, rank, local_rank, world_size = get_dist_info() - # Separate out all parameters into groups (matrix, embedding, lm_head, value_embed, resid_lambdas, x0_lambdas, v0_lambdas) + # Separate out all parameters into groups (matrix, embedding, lm_head, value_embeds, resid_lambdas, x0_lambdas, v0_lambdas) matrix_params = list(self.transformer.h.parameters()) embedding_params = list(self.transformer.wte.parameters()) lm_head_params = list(self.lm_head.parameters()) - value_embed_params = list(self.value_embed.parameters()) + value_embeds_params = list(self.value_embeds.parameters()) resid_params = [self.resid_lambdas] x0_params = [self.x0_lambdas] v0_params = [self.v0_lambdas] - assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embed_params) + len(resid_params) + len(x0_params) + len(v0_params) + assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) + len(v0_params) # Create the AdamW optimizer for the embedding, 
lm_head, and per-layer scalars # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 @@ -338,7 +339,7 @@ class GPT(nn.Module): adam_groups = [ dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), - dict(params=value_embed_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding + dict(params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream dict(params=x0_params, lr=scalar_lr), dict(params=v0_params, lr=scalar_lr), @@ -372,12 +373,11 @@ class GPT(nn.Module): x = self.transformer.wte(idx) x = norm(x) x0 = x # save initial normalized embedding for x0 residual - # Value residual (ResFormer): separate value embedding for later layers - v0 = self.value_embed(idx) # (B, T, kv_dim) + # Value residual (ResFormer): every layer gets its own value embedding + v0s = [ve(idx) for ve in self.value_embeds] # n_layer x (B, T, kv_dim) for i, block in enumerate(self.transformer.h): x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 - v0_for_layer = v0 if i >= self.value_residual_start else None - x = block(x, cos_sin, self.window_sizes[i], kv_cache, v0_for_layer, self.v0_lambdas[i]) + x = block(x, cos_sin, self.window_sizes[i], kv_cache, v0s[i], self.v0_lambdas[i]) x = norm(x) # Forward the lm_head (compute logits) From e85db6b4a4351eb562bec220b3bbcaad28be6722 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 16 Jan 2026 23:52:12 +0000 Subject: [PATCH 045/119] alternating design --- nanochat/gpt.py | 60 +++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 27 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index ea7a4d86..a077256e 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -45,6 
+45,10 @@ def norm(x): return F.rms_norm(x, (x.size(-1),)) +def has_ve(layer_idx, n_layer): + """Returns True if GPT layer should have Value Embedding (alternating, last layer always included).""" + return layer_idx % 2 == (n_layer - 1) % 2 + def apply_rotary_emb(x, cos, sin): assert x.ndim == 4 # multihead attention d = x.shape[3] // 2 @@ -67,8 +71,10 @@ class CausalSelfAttention(nn.Module): self.c_k = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) self.c_v = nn.Linear(self.n_embd, self.n_kv_head * self.head_dim, bias=False) self.c_proj = nn.Linear(self.n_embd, self.n_embd, bias=False) + self.ve_gate_channels = 32 + self.ve_gate = nn.Linear(self.ve_gate_channels, self.n_kv_head, bias=False) if has_ve(layer_idx, config.n_layer) else None - def forward(self, x, cos_sin, window_size, kv_cache, v0, v0_lambda): + def forward(self, x, ve, cos_sin, window_size, kv_cache): B, T, C = x.size() # Project the input to get queries, keys, and values @@ -77,10 +83,11 @@ class CausalSelfAttention(nn.Module): k = self.c_k(x).view(B, T, self.n_kv_head, self.head_dim) v = self.c_v(x).view(B, T, self.n_kv_head, self.head_dim) - # Value residual (ResFormer): mix in projected initial embedding for later layers - if v0 is not None: - v0_reshaped = v0.view(B, T, self.n_kv_head, self.head_dim) - v = v + v0_lambda * v0_reshaped + # Value residual (ResFormer): mix in value embedding with input-dependent gate per head + if ve is not None: + ve = ve.view(B, T, self.n_kv_head, self.head_dim) + gate = 2 * torch.sigmoid(self.ve_gate(x[..., :self.ve_gate_channels])) # (B, T, n_kv_head), range (0, 2) + v = v + gate.unsqueeze(-1) * ve # Apply Rotary Embeddings to queries and keys to get relative positional encoding cos, sin = cos_sin @@ -131,8 +138,8 @@ class Block(nn.Module): self.attn = CausalSelfAttention(config, layer_idx) self.mlp = MLP(config) - def forward(self, x, cos_sin, window_size, kv_cache, v0, v0_lambda): - x = x + self.attn(norm(x), cos_sin, window_size, kv_cache, 
v0, v0_lambda) + def forward(self, x, ve, cos_sin, window_size, kv_cache): + x = x + self.attn(norm(x), ve, cos_sin, window_size, kv_cache) x = x + self.mlp(norm(x)) return x @@ -165,12 +172,10 @@ class GPT(nn.Module): # Separate parameters so they can have different optimizer treatment self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() - # Value residual (ResFormer-style): every layer gets its own value embedding - # Paper: "Value Residual Learning" (arXiv:2410.17897) shows this improves information flow + # Value embeddings (ResFormer-style): alternating layers, last layer always included head_dim = config.n_embd // config.n_head kv_dim = config.n_kv_head * head_dim - self.value_embeds = nn.ModuleList([nn.Embedding(padded_vocab_size, kv_dim) for _ in range(config.n_layer)]) - self.v0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() + self.value_embeds = nn.ModuleDict({str(i): nn.Embedding(padded_vocab_size, kv_dim) for i in range(config.n_layer) if has_ve(i, config.n_layer)}) # To support meta device initialization, we init the rotary embeddings here, but it's just "fake" meta tensors only. # As for rotary_seq_len, these rotary embeddings are pretty small/cheap in memory, # so let's just over-compute them by 10X, but assert fail if we ever reach that amount. @@ -181,6 +186,7 @@ class GPT(nn.Module): self.register_buffer("cos", cos, persistent=False) # persistent=False means it's not saved to the checkpoint self.register_buffer("sin", sin, persistent=False) + @torch.no_grad() def init_weights(self): """ Initialize the full model in this one function for maximum clarity. 
@@ -212,15 +218,18 @@ class GPT(nn.Module): torch.nn.init.zeros_(block.mlp.c_proj.weight) # Per-layer scalars - with torch.no_grad(): - self.resid_lambdas.fill_(1.0) # 1.0 => typical residual connections at init - self.x0_lambdas.fill_(0.0) # 0.0 => skip connection to input is disabled at init - self.v0_lambdas.fill_(0.0) # 0.0 => value residual is disabled at init + self.resid_lambdas.fill_(1.0) # 1.0 => typical residual connections at init + self.x0_lambdas.fill_(0.0) # 0.0 => skip connection to input is disabled at init # Value embeddings (init like c_v: uniform with same std) - for ve in self.value_embeds: + for ve in self.value_embeds.values(): torch.nn.init.uniform_(ve.weight, -s, s) + # Gate weights init to zero so gates start at sigmoid(0) = 0.5, scaled by 2 -> 1.0 (neutral) + for block in self.transformer.h: + if block.attn.ve_gate is not None: + torch.nn.init.zeros_(block.attn.ve_gate.weight) + # Rotary embeddings head_dim = self.config.n_embd // self.config.n_head cos, sin = self._precompute_rotary_embeddings(self.rotary_seq_len, head_dim) @@ -229,7 +238,7 @@ class GPT(nn.Module): # Cast embeddings to bf16: optimizer can tolerate it and it saves memory if self.transformer.wte.weight.device.type == "cuda": self.transformer.wte.to(dtype=torch.bfloat16) - for ve in self.value_embeds: + for ve in self.value_embeds.values(): ve.to(dtype=torch.bfloat16) def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): @@ -295,9 +304,9 @@ class GPT(nn.Module): """ nparams = sum(p.numel() for p in self.parameters()) # Exclude non-matmul params: embeddings and per-layer scalars - value_embeds_numel = sum(ve.weight.numel() for ve in self.value_embeds) + value_embeds_numel = sum(ve.weight.numel() for ve in self.value_embeds.values()) nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel + - self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.v0_lambdas.numel()) + self.resid_lambdas.numel() + self.x0_lambdas.numel()) 
h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len # Sum attention FLOPs per layer, accounting for sliding window attn_flops = 0 @@ -323,15 +332,14 @@ class GPT(nn.Module): def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5): model_dim = self.config.n_embd ddp, rank, local_rank, world_size = get_dist_info() - # Separate out all parameters into groups (matrix, embedding, lm_head, value_embeds, resid_lambdas, x0_lambdas, v0_lambdas) + # Separate out all parameters into groups matrix_params = list(self.transformer.h.parameters()) + value_embeds_params = list(self.value_embeds.parameters()) embedding_params = list(self.transformer.wte.parameters()) lm_head_params = list(self.lm_head.parameters()) - value_embeds_params = list(self.value_embeds.parameters()) resid_params = [self.resid_lambdas] x0_params = [self.x0_lambdas] - v0_params = [self.v0_lambdas] - assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) + len(v0_params) + assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) # Create the AdamW optimizer for the embedding, lm_head, and per-layer scalars # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 @@ -342,7 +350,6 @@ class GPT(nn.Module): dict(params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream dict(params=x0_params, lr=scalar_lr), - dict(params=v0_params, lr=scalar_lr), ] adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight 
decay is hardcoded to 0.0 for AdamW, only used in Muon AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) @@ -373,11 +380,10 @@ class GPT(nn.Module): x = self.transformer.wte(idx) x = norm(x) x0 = x # save initial normalized embedding for x0 residual - # Value residual (ResFormer): every layer gets its own value embedding - v0s = [ve(idx) for ve in self.value_embeds] # n_layer x (B, T, kv_dim) for i, block in enumerate(self.transformer.h): x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 - x = block(x, cos_sin, self.window_sizes[i], kv_cache, v0s[i], self.v0_lambdas[i]) + ve = self.value_embeds[str(i)](idx) if str(i) in self.value_embeds else None + x = block(x, ve, cos_sin, self.window_sizes[i], kv_cache) x = norm(x) # Forward the lm_head (compute logits) From 3b95d4fd392fb4d593adb80530e80c8009d06f75 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 17 Jan 2026 00:23:30 +0000 Subject: [PATCH 046/119] allow label for scaling laws script --- scaling_laws.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/scaling_laws.sh b/scaling_laws.sh index 321b286a..7c269c6a 100644 --- a/scaling_laws.sh +++ b/scaling_laws.sh @@ -1,5 +1,7 @@ #!/bin/bash +LABEL="jan16" + FLOPS_BUDGETS=( 1e18 3e18 @@ -7,14 +9,14 @@ FLOPS_BUDGETS=( ) DEPTHS=(8 10 12 14 16 18 20) NPROC_PER_NODE="${NPROC_PER_NODE:-8}" -WANDB_RUN="${WANDB_RUN:-scaling}" +WANDB_RUN="${WANDB_RUN:-scaling_${LABEL}}" EVAL_TOKENS=$((100 * 524288)) # ~100M tokens for final eval (default is ~10M) export OMP_NUM_THREADS=1 export NANOCHAT_BASE_DIR="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}" source .venv/bin/activate -RESULTS_DIR="$NANOCHAT_BASE_DIR/scaling_laws_results" +RESULTS_DIR="$NANOCHAT_BASE_DIR/scaling_laws_results_${LABEL}" mkdir -p "$RESULTS_DIR" RESULTS_FILE="$RESULTS_DIR/results.csv" From 1933e8504655b41626947ea5ef1addab6bfda236 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 17 Jan 2026 00:25:50 +0000 Subject: [PATCH 047/119] brief update 
to log --- dev/LOG.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index ae518c83..c0ab680f 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,18 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-17: Modded-nanogpt Ideas Sweep (Continued) + +Continued testing ideas from modded-nanogpt. + +| Idea | Result | Notes | +|------|--------|-------| +| Attention gates | No improvement | Per-head learnable gates on attention output. +1GB memory, decreased efficiency. | +| Batch size schedule | Abandoned | 8→16→24 with LR scaling. Made training script too bloated/complex, not worth cognitive overhead. | +| Value embeddings | Helps a lot | Experiments still ongoing, more on this later. | + +--- + ## 2026-01-16: Flash Attention 3 Fallback to SDPA Added automatic fallback from Flash Attention 3 to PyTorch's `scaled_dot_product_attention` (SDPA) for users without Hopper GPUs. This enables nanochat to run on older CUDA GPUs, CPU, and MPS (Apple Silicon). From 6460dc6382a4f9dfd52d5f8db3b659b9674b47a9 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 17 Jan 2026 02:28:31 +0000 Subject: [PATCH 048/119] tweaks to readme a bit --- README.md | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index 9de2884c..9808c207 100644 --- a/README.md +++ b/README.md @@ -6,14 +6,15 @@ This repo is a full-stack implementation of an LLM like ChatGPT in a single, clean, minimal, hackable, dependency-lite codebase. nanochat is designed to run on a single 8XH100 node via scripts like [speedrun.sh](speedrun.sh), that run the entire pipeline start to end. This includes tokenization, pretraining, finetuning, evaluation, inference, and web serving over a simple UI so that you can talk to your own LLM just like ChatGPT. nanochat will become the capstone project of the course LLM101n being developed by Eureka Labs. 
+## Updates + +- (Jan 16 2026) The repo is in active development, I am currently fleshing out the pretraining stage. +- (Jan 7 2026) See new post: [nanochat Miniseries v1](https://github.com/karpathy/nanochat/discussions/420) and the associated script [miniseries.sh](miniseries.sh). + ## Talk to it To get a sense of the endpoint of this repo, you can currently find [nanochat d34](https://github.com/karpathy/nanochat/discussions/314) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d34" means that this model has 34 layers in the Transformer neural network. This model has 2.2 billion parameters, it was trained on 88 billion tokens by simply running the training script [run1000.sh](run1000.sh) with `--target_param_data_ratio=40` (2x longer than Chinchilla-optimal), and the total cost of training was ~$2,500 (about 100 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to... -## Updates - -- (Jan 7 2026) See new post: [nanochat Miniseries v1](https://github.com/karpathy/nanochat/discussions/420) and the associated script [miniseries.sh](miniseries.sh). - ## Quick start The fastest way to feel the magic is to run the speedrun script [speedrun.sh](speedrun.sh), which trains and inferences the $100 tier of nanochat. On an 8XH100 node at $24/hr, this gives a total run time of about 4 hours. Boot up a new 8XH100 GPU box from your favorite provider (e.g. 
I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: @@ -99,7 +100,7 @@ And a bit more about computing environments that will run nanochat: ## Running on CPU / MPS -nanochat can be run on CPU or on MPS (if you're on Macbook), and will automatically try to detect what device is best to run on. You're not going to get too far without GPUs, but at least you'll be able to run the code paths and maybe train a tiny LLM with some patience. For an example of how to make all the run commands much smaller (feel free to tune!), you can refer to [dev/runcpu.sh](dev/runcpu.sh) file. You'll see that I'm essentially restricting all scripts to train smaller models, to run for shorter number of iterations, etc. This functionality is new, slightly gnarly (touched a lot of code), and was merged in this [CPU|MPS PR](https://github.com/karpathy/nanochat/pull/88) on Oct 21, 2025. +nanochat can be run on CPU or on MPS (if you're on Macbook) in principle, and will automatically try to detect what device is best to run on. The script [dev/runcpu.sh](dev/runcpu.sh) shows a very simple example that will exercise the code paths but basically produce garbage results. Unless you know what you're doing, I basically don't recommend using this script right now and hope to tune it a bit more in the future. ## Customization @@ -109,15 +110,9 @@ Additionally, to add new abilities to nanochat, see [Guide: counting r in strawb ## Questions -nanochat is designed to be short and sweet. One big advantage of this is that we can package up all of the files together and copy paste them to your favorite LLM to ask arbitrary questions. As an example, I like to package up the repo using the [files-to-prompt](https://github.com/simonw/files-to-prompt) utility like so: +I recommend using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off. 
-```bash -files-to-prompt . -e py -e md -e html -e toml -e sh --cxml > packaged.txt -``` - -This includes all py, html, toml, sh files and chooses the cxml output format. Everything is written to the `packaged.txt` file, which atm measures ~330KB (i.e. well below ~100K tokens for a state of the art LLM), and ~8K lines of code in 45 files. - -Alternatively, I recommend using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off. +You can also come to the [#nanochat Discord channel](https://discord.com/channels/1020383067459821711/1427295580895314031) to ask questions, or use the Discussions. ## Tests From e1dafc510f122d5c31c38a3c96e45e544f47930f Mon Sep 17 00:00:00 2001 From: Yamahammer <137644546+Yamahammer@users.noreply.github.com> Date: Fri, 16 Jan 2026 21:50:34 -0500 Subject: [PATCH 049/119] Reduce token waste in BOS bestfit by cropping shortest doc (#445) When no document fits the remaining row space, crop the shortest document in the buffer instead of the first. This minimizes discarded tokens. 
Co-authored-by: Claude Opus 4.5 --- nanochat/dataloader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index 562d517e..3e898935 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -178,8 +178,9 @@ def tokenizing_distributed_data_loader_with_state_bos_bestfit( doc = doc_buffer.pop(best_idx) row.extend(doc) else: - # No doc fits - crop first doc to fill remaining - doc = doc_buffer.pop(0) + # No doc fits - crop shortest in buffer to fill remaining and minimize waste + shortest_idx = min(range(len(doc_buffer)), key=lambda i: len(doc_buffer[i])) + doc = doc_buffer.pop(shortest_idx) row.extend(doc[:remaining]) rows.append(row[:row_capacity]) From f42ae9e901a34be3b06b6816d41871e54dedc986 Mon Sep 17 00:00:00 2001 From: Nitish Pandey <83660586+nitishpandey04@users.noreply.github.com> Date: Sat, 17 Jan 2026 08:26:43 +0530 Subject: [PATCH 050/119] fix condition to perform bpb evaluation (#324) Co-authored-by: svlandeg --- scripts/mid_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/mid_train.py b/scripts/mid_train.py index 01d9f7d4..c127c943 100644 --- a/scripts/mid_train.py +++ b/scripts/mid_train.py @@ -249,7 +249,7 @@ while True: last_step = bool(last_step_tensor.item()) # once in a while: evaluate the val bpb (all ranks participate) - if args.eval_every > 0 and (last_step or step % args.eval_every == 0): + if last_step or (args.eval_every > 0 and step % args.eval_every == 0): model.eval() val_loader = build_val_loader() eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size) From bbc4413c58bd24fed440030eee805121d5296340 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bar=C4=B1=C5=9F=20=C3=96zmen?= Date: Sat, 17 Jan 2026 05:59:12 +0300 Subject: [PATCH 051/119] Add high value engine tests for core invariants (33 LoC) (#396) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test: 
add engine generation tests for expected invariants - test_seed_reproducibility - test_temperature_zero_determinism - test_max_tokens_respected - test_num_samples_count 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 * Fix temperature test * add test for seed variation in sampling Add test for seed variation in sampling with temperature > 0. * Rename test for clarity * Shorten assert msg --------- Co-authored-by: Claude Opus 4.5 Co-authored-by: Sofie Van Landeghem --- tests/test_engine.py | 69 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/tests/test_engine.py b/tests/test_engine.py index 9351e5a8..67b8a5c7 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -195,3 +195,72 @@ def test_multi_sample_first_token_diversity(): f"With uniform logits, this is statistically impossible (~10^-36 probability) " f"unless tokens are being broadcast instead of independently sampled." ) + + +def test_seed_reproducibility(): + """Same seed must produce identical output.""" + model = MockModel() + engine = Engine(model, ByteTokenizer()) + prompt = [261, 72, 101, 108, 108, 111] # + "Hello" + + for seed in [1, 42, 123, 999]: + r1, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed) + r2, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed) + r3, _ = engine.generate_batch(prompt, max_tokens=5, seed=seed) + assert r1 == r2 == r3, "Same seed must produce identical output for the same prompt." 
+ + +def test_temperature_zero_determinism(): + """Temperature=0 is deterministic regardless of seed.""" + model = MockModel() + engine = Engine(model, ByteTokenizer()) + prompt = [261, 72, 101, 108, 108, 111] + + r1, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=1) + r2, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=42) + r3, _ = engine.generate_batch(prompt, temperature=0.0, max_tokens=5, seed=123) + assert r1 == r2 == r3, "Temperature=0 must result in the same output for the same prompt regardless of seed." + + +def test_max_tokens_respected(): + """Generation stops at max_tokens limit.""" + model = MockModel() + engine = Engine(model, ByteTokenizer()) + prompt = [261, 72, 101, 108, 108, 111] + + for max_tokens in [1, 4, 16, 64]: + results, _ = engine.generate_batch(prompt, max_tokens=max_tokens) + num_generated_tokens = len(results[0]) - len(prompt) + assert num_generated_tokens <= max_tokens, f"Generated {num_generated_tokens} tokens, expected max_tokens={max_tokens} or less." 
+ + +def test_num_samples_count(): + """num_samples=N produces exactly N sequences.""" + model = MockModel() + engine = Engine(model, ByteTokenizer()) + prompt = [261, 72, 101, 108, 108, 111] + + for num_samples in [1, 4, 16, 64]: + results, _ = engine.generate_batch(prompt, num_samples=num_samples, max_tokens=3) + assert len(results) == num_samples, f"Expected {num_samples} sequences from {num_samples} samples, got {len(results)}" + + +def test_different_seeds_introduce_variation_when_temperature_nonzero(): + """With temperature > 0, different seeds should introduce sampling variation.""" + model = MockModel() + engine = Engine(model, ByteTokenizer()) + prompt = [261, 72, 101, 108, 108, 111] # + "Hello" + + outputs = set() + + for seed in [1, 42, 123, 999, 1000, 1001, 1002, 1003, 1004, 1005]: + results, _ = engine.generate_batch( + prompt, + temperature=1.0, + max_tokens=5, + seed=seed, + ) + outputs.add(tuple(results[0])) + + # Sanity check: sampling actually introduces variation + assert len(outputs) > 1, "All seeds produced the same output which is statistically highly improbable." 
From 77a46902e4557117d716bd9cd9604ec3913f25bd Mon Sep 17 00:00:00 2001 From: Yury Kirpichev Date: Fri, 16 Jan 2026 18:59:44 -0800 Subject: [PATCH 052/119] Fix WANDB_RUN parameter passing in runcpu.sh (#407) - Add --run=$WANDB_RUN to base_train, mid_train, and chat_sft calls - Ensures wandb logging works when WANDB_RUN environment variable is set - Matches the behavior in speedrun.sh Co-authored-by: svlandeg --- dev/runcpu.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/dev/runcpu.sh b/dev/runcpu.sh index c0b32a54..6ed7a8af 100755 --- a/dev/runcpu.sh +++ b/dev/runcpu.sh @@ -41,7 +41,8 @@ python -m scripts.base_train \ --core-metric-every=50 \ --core-metric-max-per-task=12 \ --sample-every=50 \ - --num-iterations=50 + --num-iterations=50 \ + --run=$WANDB_RUN python -m scripts.base_loss --device-batch-size=1 --split-tokens=4096 python -m scripts.base_eval --max-per-task=16 @@ -52,7 +53,8 @@ python -m scripts.mid_train \ --eval-every=50 \ --eval-tokens=4096 \ --total-batch-size=1024 \ - --num-iterations=100 + --num-iterations=100 \ + --run=$WANDB_RUN # eval results will be terrible, this is just to execute the code paths. # note that we lower the execution memory limit to 1MB to avoid warnings on smaller systems python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20 @@ -63,7 +65,8 @@ python -m scripts.chat_sft \ --target-examples-per-step=4 \ --num-iterations=100 \ --eval-steps=4 \ - --eval-metrics-max-problems=16 + --eval-metrics-max-problems=16 \ + --run=$WANDB_RUN # Chat CLI # python -m scripts.chat_cli -p "Why is the sky blue?" 
From 2955650327fb71bc4a470d5e1093dd7c9cececfc Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 17 Jan 2026 03:16:12 +0000 Subject: [PATCH 053/119] add detection of device to report more correct mfu for bf16 --- nanochat/common.py | 49 +++++++++++++++++++++++++++++++++++++++++++ scripts/base_train.py | 11 +++++++--- 2 files changed, 57 insertions(+), 3 deletions(-) diff --git a/nanochat/common.py b/nanochat/common.py index 22559ce5..faf91440 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -200,3 +200,52 @@ class DummyWandb: pass def finish(self): pass + +# hardcoded BF16 peak flops for NVIDIA A100, H100, H200, B200 GPU and AMD MI250, MI300X, MI325X, MI355X and Intel PVC +# inspired by torchtitan: https://github.com/pytorch/torchtitan/blob/main/torchtitan/tools/utils.py +def get_peak_flops(device_name: str) -> float: + if "A100" in device_name: + # data from https://www.nvidia.com/en-us/data-center/a100/ + return 312e12 + elif "H100" in device_name: + # data from https://www.nvidia.com/en-us/data-center/h100/ + # NOTE: Specifications are one-half lower without sparsity. 
+ if "NVL" in device_name: + return 835e12 + elif "PCIe" in device_name: + return 756e12 + else: # for H100 SXM and other variants + return 989e12 + elif "H200" in device_name: + # data from https://www.nvidia.com/en-us/data-center/h200/ + return 989e12 + elif "B200" in device_name: + # data from https://nvdam.widen.net/s/wwnsxrhm2w/blackwell-datasheet-3384703 + return 2.25e15 + elif "MI355X" in device_name: + # MI355X data from https://www.amd.com/en/products/accelerators/instinct/mi350/mi355x.html + return 2500e12 + elif "MI300X" in device_name or "MI325X" in device_name: + # MI300X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html + # MI325X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html + return 1300e12 + elif "MI250X" in device_name: + # data from https://www.amd.com/en/products/accelerators/instinct/mi200/mi250x.html (per GCD) + return 191.5e12 + elif "Data Center GPU Max 1550" in device_name: + # Also known as Ponte Vecchio (PVC). + # data from https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html + # Dot Product Accumulate Systolic (DPAS): + # - Freq: 1300MHz + # - #ops: 512 + # Full EU mode (i.e. 512 max compute units): 340.8 TFLOPS (BF16) + # Standard EU mode (i.e. 
448 max compute units): 298.2 TFLOPS (BF16) + max_comp_units = torch.xpu.get_device_properties("xpu").max_compute_units + return 512 * max_comp_units * 1300 * 10**6 + elif "l40s" in device_name: + # data from: "https://resources.nvidia.com/en-us-l40s/l40s-datasheet-28413" + return 362e12 + + else: # for other GPU types, assume A100 + logger.warning(f"Peak flops undefined for: {device_name}, fallback to A100") + return 312e12 diff --git a/scripts/base_train.py b/scripts/base_train.py index c61986e6..e051f99b 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -22,7 +22,7 @@ import torch from nanochat.gpt import GPT, GPTConfig from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit, tokenizing_distributed_data_loader_with_state_bos_bestfit -from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type +from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type, get_peak_flops from nanochat.tokenizer import get_tokenizer, get_token_bytes from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint from nanochat.loss_eval import evaluate_bpb @@ -82,6 +82,12 @@ master_process = ddp_rank == 0 # this process will do logging, checkpointing etc autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0 +if device_type == "cuda": + gpu_device_name = torch.cuda.get_device_name(0) + gpu_peak_flops = get_peak_flops(gpu_device_name) + print0(f"GPU: {gpu_device_name} | Peak FLOPS (BF16): {gpu_peak_flops:.2e}") +else: + gpu_peak_flops = float('inf') # MFU not meaningful for CPU/MPS # wandb logging init use_dummy_wandb = args.run == "dummy" or not master_process @@ 
-395,8 +401,7 @@ while True: pct_done = 100 * step / num_iterations tok_per_sec = int(args.total_batch_size / dt) flops_per_sec = num_flops_per_token * args.total_batch_size / dt - promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity - mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in % + mfu = 100 * flops_per_sec / (gpu_peak_flops * ddp_world_size) if step > 10: total_training_time += dt # only count the time after the first 10 steps # Calculate ETA based on average time per step (excluding first 10 steps) From f5425245f99efd4145d2ac71a730af1e96777d6a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 17 Jan 2026 03:22:20 +0000 Subject: [PATCH 054/119] more GPU types from PR 147 thanks @Qubitium --- nanochat/common.py | 109 ++++++++++++++++++++++++++++----------------- 1 file changed, 67 insertions(+), 42 deletions(-) diff --git a/nanochat/common.py b/nanochat/common.py index faf91440..44760f90 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -201,51 +201,76 @@ class DummyWandb: def finish(self): pass -# hardcoded BF16 peak flops for NVIDIA A100, H100, H200, B200 GPU and AMD MI250, MI300X, MI325X, MI355X and Intel PVC +# hardcoded BF16 peak flops for various GPUs # inspired by torchtitan: https://github.com/pytorch/torchtitan/blob/main/torchtitan/tools/utils.py +# and PR: https://github.com/karpathy/nanochat/pull/147 def get_peak_flops(device_name: str) -> float: - if "A100" in device_name: - # data from https://www.nvidia.com/en-us/data-center/a100/ - return 312e12 - elif "H100" in device_name: - # data from https://www.nvidia.com/en-us/data-center/h100/ - # NOTE: Specifications are one-half lower without sparsity. 
- if "NVL" in device_name: - return 835e12 - elif "PCIe" in device_name: - return 756e12 - else: # for H100 SXM and other variants - return 989e12 - elif "H200" in device_name: - # data from https://www.nvidia.com/en-us/data-center/h200/ - return 989e12 - elif "B200" in device_name: - # data from https://nvdam.widen.net/s/wwnsxrhm2w/blackwell-datasheet-3384703 + name = device_name.lower() + + # --- NVIDIA Blackwell --- + if "gb200" in name or "grace blackwell" in name: + return 2.5e15 + if "b200" in name: return 2.25e15 - elif "MI355X" in device_name: - # MI355X data from https://www.amd.com/en/products/accelerators/instinct/mi350/mi355x.html - return 2500e12 - elif "MI300X" in device_name or "MI325X" in device_name: - # MI300X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi300x.html - # MI325X data from https://www.amd.com/en/products/accelerators/instinct/mi300/mi325x.html - return 1300e12 - elif "MI250X" in device_name: - # data from https://www.amd.com/en/products/accelerators/instinct/mi200/mi250x.html (per GCD) - return 191.5e12 - elif "Data Center GPU Max 1550" in device_name: - # Also known as Ponte Vecchio (PVC). - # data from https://www.intel.com/content/www/us/en/docs/oneapi/optimization-guide-gpu/2025-0/intel-xe-gpu-architecture.html - # Dot Product Accumulate Systolic (DPAS): - # - Freq: 1300MHz - # - #ops: 512 - # Full EU mode (i.e. 512 max compute units): 340.8 TFLOPS (BF16) - # Standard EU mode (i.e. 
448 max compute units): 298.2 TFLOPS (BF16) + if "b100" in name: + return 1.8e15 + + # --- NVIDIA Hopper (H100/H200/H800) --- + if "h200" in name: + if "nvl" in name or "pcie" in name: + return 836e12 + return 989e12 # H200 SXM + if "h100" in name: + if "nvl" in name: + return 835e12 + if "pcie" in name: + return 756e12 + return 989e12 # H100 SXM + if "h800" in name: + if "nvl" in name: + return 989e12 + return 756e12 # H800 PCIe + + # --- NVIDIA Ampere data center --- + if "a100" in name or "a800" in name: + return 312e12 + if "a40" in name: + return 149.7e12 + if "a30" in name: + return 165e12 + + # --- NVIDIA Ada data center --- + if "l40s" in name or "l40-s" in name or "l40 s" in name: + return 362e12 + if "l4" in name: + return 121e12 + + # --- AMD CDNA accelerators --- + if "mi355" in name: + return 2.5e15 + if "mi325" in name or "mi300x" in name: + return 1.3074e15 + if "mi300a" in name: + return 980.6e12 + if "mi250x" in name: + return 383e12 + if "mi250" in name: + return 362.1e12 + + # --- Intel --- + if "data center gpu max 1550" in name: + # Ponte Vecchio (PVC) - dynamic based on compute units max_comp_units = torch.xpu.get_device_properties("xpu").max_compute_units return 512 * max_comp_units * 1300 * 10**6 - elif "l40s" in device_name: - # data from: "https://resources.nvidia.com/en-us-l40s/l40s-datasheet-28413" - return 362e12 - else: # for other GPU types, assume A100 - logger.warning(f"Peak flops undefined for: {device_name}, fallback to A100") - return 312e12 + # --- Consumer RTX (for hobbyists) --- + if "5090" in name: + return 209.5e12 + if "4090" in name: + return 165.2e12 + if "3090" in name: + return 71e12 + + # Unknown GPU - return inf so MFU shows as 0% rather than a wrong guess + logger.warning(f"Peak flops undefined for: {device_name}, MFU will show as 0%") + return float('inf') From f9a7e0f111f9955a640c69cd3dfe457813dc4601 Mon Sep 17 00:00:00 2001 From: karpathy Date: Sat, 17 Jan 2026 12:27:30 -0800 Subject: [PATCH 055/119] update the 
CPU/MPS script to give reasonable results. The model can at least answer that Paris is the capital of France and knows that the sky is blue, for about 40 minutes of training on my macbook. Also fixed a bug that existed due to KVCache bfloat16 dtype assumption --- dev/runcpu.sh | 83 ++++++++++++++++++++------------------------ nanochat/engine.py | 11 +++++- scripts/base_loss.py | 17 ++++++++- tests/test_engine.py | 5 +-- 4 files changed, 67 insertions(+), 49 deletions(-) diff --git a/dev/runcpu.sh b/dev/runcpu.sh index 6ed7a8af..da8f6d19 100755 --- a/dev/runcpu.sh +++ b/dev/runcpu.sh @@ -1,12 +1,15 @@ #!/bin/bash # Showing an example run for exercising some of the code paths on the CPU (or MPS on Macbooks) +# This script was last updated/tuned on Jan 17, 2026. + # Run as: # bash dev/cpu_demo_run.sh # NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your Macbook. # Think of this run as educational/fun demo, not something you should expect to work well. -# This is also why I hide this script away in dev/ +# (This is why I hide this script away in dev/) +# You may also want to run this script manually and one by one, copy pasting commands into your terminal. # all the setup stuff export OMP_NUM_THREADS=1 @@ -20,58 +23,48 @@ if [ -z "$WANDB_RUN" ]; then WANDB_RUN=dummy fi -# wipe the report -python -m nanochat.report reset - -# train tokenizer on ~1B characters -python -m nanochat.dataset -n 4 -python -m scripts.tok_train --max-chars=1000000000 +# train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max) +python -m nanochat.dataset -n 8 +python -m scripts.tok_train --max-chars=2000000000 python -m scripts.tok_eval -# train a very small 4 layer model on the CPU -# each optimization step processes a single sequence of 1024 tokens -# we only run 50 steps of optimization (bump this to get better results) +# train a small 4 layer model +# I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max. 
+# To get better results, try increasing num_iterations, or get other ideas from your favorite LLM. python -m scripts.base_train \ - --depth=4 \ - --max-seq-len=1024 \ - --device-batch-size=1 \ - --total-batch-size=1024 \ - --eval-every=50 \ - --eval-tokens=4096 \ - --core-metric-every=50 \ - --core-metric-max-per-task=12 \ - --sample-every=50 \ - --num-iterations=50 \ + --depth=6 \ + --head-dim=64 \ + --window-pattern=L \ + --max-seq-len=512 \ + --device-batch-size=32 \ + --total-batch-size=16384 \ + --eval-every=100 \ + --eval-tokens=524288 \ + --core-metric-every=-1 \ + --sample-every=100 \ + --num-iterations=5000 \ --run=$WANDB_RUN -python -m scripts.base_loss --device-batch-size=1 --split-tokens=4096 +python -m scripts.base_loss --device-batch-size=1 --split-tokens=16384 python -m scripts.base_eval --max-per-task=16 -# midtraining +# midtraining (~10 minutes on my MacBook Pro M3 Max) +curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl python -m scripts.mid_train \ - --max-seq-len=1024 \ - --device-batch-size=1 \ - --eval-every=50 \ - --eval-tokens=4096 \ - --total-batch-size=1024 \ - --num-iterations=100 \ - --run=$WANDB_RUN -# eval results will be terrible, this is just to execute the code paths. -# note that we lower the execution memory limit to 1MB to avoid warnings on smaller systems -python -m scripts.chat_eval --source=mid --max-new-tokens=128 --max-problems=20 - -# SFT -python -m scripts.chat_sft \ - --device-batch-size=1 \ - --target-examples-per-step=4 \ - --num-iterations=100 \ - --eval-steps=4 \ - --eval-metrics-max-problems=16 \ + --max-seq-len=512 \ + --device-batch-size=32 \ + --total-batch-size=16384 \ + --eval-every=200 \ + --eval-tokens=524288 \ + --num-iterations=1500 \ --run=$WANDB_RUN -# Chat CLI -# python -m scripts.chat_cli -p "Why is the sky blue?" 
+# (it's ~ok to skip SFT) -# Chat Web -# python -m scripts.chat_web +# Chat with the model over CLI +# The model should be able to say that it is Paris. +# It might even know that the color of the sky is blue. +# Sometimes the model likes it if you first say Hi before you ask it questions. +# python -m scripts.chat_cli -i mid -p "What is the capital of France?" -python -m nanochat.report generate +# Chat with the model over a pretty WebUI ChatGPT style +# python -m scripts.chat_web -i mid diff --git a/nanochat/engine.py b/nanochat/engine.py index 53fdec5b..7f05eb4e 100644 --- a/nanochat/engine.py +++ b/nanochat/engine.py @@ -90,7 +90,7 @@ class KVCache: - Position tracked per batch element via cache_seqlens tensor """ - def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layers, device, dtype=torch.bfloat16): + def __init__(self, batch_size, num_heads, seq_len, head_dim, num_layers, device, dtype): self.batch_size = batch_size self.max_seq_len = seq_len self.n_layers = num_layers @@ -172,6 +172,13 @@ class Engine: """Same as generate, but does single prefill and then clones the KV cache.""" assert isinstance(tokens, list) and isinstance(tokens[0], int), "expecting list of ints" device = self.model.get_device() + # NOTE: setting the dtype here and in this way is an ugly hack. + # Currently the repo assumes that cuda -> bfloat16 and everything else -> float32. + # We need to know the dtype here to call __init__ on KVCache and pre-allocate its tensors. + # As a quick hack, we're making generate() function inherit and know about this repo-wise assumption. + # I think there has to be a bigger refactor to deal with device/dtype tracking across the codebase. 
+ # In particular, the KVCache should allocate its tensors lazily + dtype = torch.bfloat16 if device.type == "cuda" else torch.float32 rng = torch.Generator(device=device) rng.manual_seed(seed) @@ -191,6 +198,7 @@ class Engine: batch_size=1, seq_len=len(tokens), device=device, + dtype=dtype, **kv_model_kwargs, ) ids = torch.tensor([tokens], dtype=torch.long, device=device) @@ -203,6 +211,7 @@ class Engine: batch_size=num_samples, seq_len=kv_length_hint, device=device, + dtype=dtype, **kv_model_kwargs, ) kv_cache_decode.prefill(kv_cache_prefill) diff --git a/scripts/base_loss.py b/scripts/base_loss.py index 6b44a30c..fb8cf596 100644 --- a/scripts/base_loss.py +++ b/scripts/base_loss.py @@ -104,7 +104,7 @@ for split_name in ["train", "val"]: bpb_results[split_name] = bpb print0(f"Model: {model_name}, {split_name} bpb: {bpb:.6f}") -# Master process also samples from the model (only for nanochat models) +# Master process also samples from the model for some basic knowledge-eliciting prompts (only for nanochat models) samples = [] if ddp_rank == 0 and args.hf_path is None: prompts = [ @@ -122,9 +122,23 @@ if ddp_rank == 0 and args.hf_path is None: with autocast_ctx: sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0) sample_str = tokenizer.decode(sample[0]) + print0("-" * 80) print0(sample_str) samples.append(sample_str) +# Draw some unconditioned samples from the model (only for nanochat models) +unconditioned_samples = [] +if ddp_rank == 0 and args.hf_path is None: + engine = Engine(model, tokenizer) + tokens = tokenizer("", prepend="<|bos|>") + with autocast_ctx: + samples, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0) + for sample in samples: + sample_str = tokenizer.decode(sample) + print0("-" * 80) + print0(sample_str) + unconditioned_samples.append(sample_str) + # Log to report from nanochat.report import get_report get_report().log(section="Base model loss", data=[ @@ -134,6 +148,7 @@ 
get_report().log(section="Base model loss", data=[ "val bpb": bpb_results["val"], }, {f"sample {i}": sample for i, sample in enumerate(samples)}, + {f"unconditioned sample {i}": sample for i, sample in enumerate(unconditioned_samples)}, ]) # Cleanup diff --git a/tests/test_engine.py b/tests/test_engine.py index 67b8a5c7..01591111 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -96,6 +96,7 @@ def test_kv_cache_basic(): head_dim=head_dim, num_layers=num_layers, device="cpu", + dtype=torch.float32, ) # Check initial state @@ -130,7 +131,7 @@ def test_kv_cache_prefill(): # Create source cache and advance it src_cache = KVCache( batch_size=batch_size, num_heads=num_heads, seq_len=32, - head_dim=head_dim, num_layers=num_layers, device="cpu", + head_dim=head_dim, num_layers=num_layers, device="cpu", dtype=torch.float32, ) # Write some data to source cache src_cache.k_cache[0, 0, :16, :, :] = 1.0 @@ -140,7 +141,7 @@ def test_kv_cache_prefill(): # Create destination cache with larger seq_len dst_cache = KVCache( batch_size=batch_size, num_heads=num_heads, seq_len=64, - head_dim=head_dim, num_layers=num_layers, device="cpu", + head_dim=head_dim, num_layers=num_layers, device="cpu", dtype=torch.float32, ) # Prefill From e7ed2082b836ac21e45020759e799c3bf1d511fe Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 17 Jan 2026 21:16:46 +0000 Subject: [PATCH 056/119] update the default GPTConfig kwargs otherwise they are confusing --- nanochat/gpt.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index a077256e..cb4bd05b 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -28,8 +28,8 @@ from nanochat.flash_attention import flash_attn @dataclass class GPTConfig: - sequence_len: int = 1024 - vocab_size: int = 50304 + sequence_len: int = 2048 + vocab_size: int = 32768 n_layer: int = 12 n_head: int = 6 # number of query heads n_kv_head: int = 6 # number of key/value heads (GQA) @@ -37,7 +37,7 @@ class 
GPTConfig: # Sliding window attention pattern string, tiled across layers. Final layer always L. # Characters: L=long (full context), S=short (half context) # Examples: "L"=all full context, "SL"=alternating, "SSL"=two short then one long - window_pattern: str = "L" + window_pattern: str = "SSSL" def norm(x): From 413e91aa0f5f3f841dbdc0009e64811cf75c5a9d Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 17 Jan 2026 23:51:09 +0000 Subject: [PATCH 057/119] optimal ratio is now around 4 --- scripts/base_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index c61986e6..bb8d8a68 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -47,7 +47,7 @@ parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding # Training horizon (only one used, in order of precedence) parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") -parser.add_argument("--target-param-data-ratio", type=int, default=8, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") +parser.add_argument("--target-param-data-ratio", type=int, default=4, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") # Optimization parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens") From cf5c9e5b8eb2e06c7c2c1c4a280ed95a7f4aa68d Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 18 Jan 2026 00:07:08 +0000 Subject: [PATCH 058/119] resolve a crash for odd depths because FA3 needs head_dim % 8 == 0 --- scripts/base_train.py | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) 
diff --git a/scripts/base_train.py b/scripts/base_train.py index bb8d8a68..bcbd4841 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -106,21 +106,19 @@ vocab_size = tokenizer.get_vocab_size() print0(f"Vocab size: {vocab_size:,}") # Model kwargs are derived from the desired depth of the model +# We nudge model_dim up to the nearest multiple of head_dim to ensure clean division +# (FA3 requires head_dim divisible by 8, and this guarantees head_dim == args.head_dim exactly) +# (For very small depths, this gives a slight "unfair" advantage to models with odd depths) num_layers = args.depth -model_dim = args.depth * args.aspect_ratio -def find_num_heads(model_dim, target_head_dim): - # Find num_heads that divides model_dim evenly, with head_dim closest to target. - ideal = max(1, round(model_dim / target_head_dim)) - for offset in range(model_dim): - for candidate in [ideal + offset, ideal - offset]: - if candidate > 0 and model_dim % candidate == 0: - return candidate - return 1 -num_heads = find_num_heads(model_dim, args.head_dim) +base_dim = args.depth * args.aspect_ratio +model_dim = ((base_dim + args.head_dim - 1) // args.head_dim) * args.head_dim +num_heads = model_dim // args.head_dim num_kv_heads = num_heads # default is 1:1 GQA (Group Query Attention) ratio (i.e. 
GQA is disabled) +head_dim = model_dim // num_heads print0(f"num_layers: {num_layers}") -print0(f"model_dim: {model_dim}") +print0(f"model_dim: {model_dim} (base: {base_dim}, nudge: {model_dim - base_dim:+d})") print0(f"num_heads: {num_heads}") +print0(f"head_dim: {head_dim}") print0(f"num_kv_heads: {num_kv_heads}") # Optimizer / data / training length related hyperparameters From babde18ce1cb59cb3d36f8874d1248983c7ba9c3 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 18 Jan 2026 03:00:38 +0000 Subject: [PATCH 059/119] small tweaks --- miniseries.sh | 1 - scaling_laws.sh | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/miniseries.sh b/miniseries.sh index 9a4512b6..c42544e3 100644 --- a/miniseries.sh +++ b/miniseries.sh @@ -61,7 +61,6 @@ for d in "${DEPTHS[@]}"; do # No --target-flops, let it use the default ratio from base_train torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \ --depth=$d \ - --target-param-data-ratio=8 \ --run="${WANDB_RUN}_d${d}" \ --model-tag="${TAG}" \ --core-metric-every=999999 \ diff --git a/scaling_laws.sh b/scaling_laws.sh index 7c269c6a..1f9dab87 100644 --- a/scaling_laws.sh +++ b/scaling_laws.sh @@ -7,7 +7,8 @@ FLOPS_BUDGETS=( 3e18 6e18 ) -DEPTHS=(8 10 12 14 16 18 20) +DEPTHS=(6 7 8 9 10 11 12 13 14) + NPROC_PER_NODE="${NPROC_PER_NODE:-8}" WANDB_RUN="${WANDB_RUN:-scaling_${LABEL}}" EVAL_TOKENS=$((100 * 524288)) # ~100M tokens for final eval (default is ~10M) From d58fcd9d7331efba0224d59e026833738e7547a6 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 18 Jan 2026 03:01:13 +0000 Subject: [PATCH 060/119] log for jan 17 --- dev/LOG.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index c0ab680f..8aeffdbf 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,27 @@ A running summary documenting some experiments and findings. 
Started ~Jan 7 2026 --- +## 2026-01-17: Various experiments + +Modded-nanogpt uses [Value Embeddings](https://arxiv.org/abs/2410.17897) (VEs) in a funny U-shaped structure, 3 of them in total and with gates. I tried a large number of tweaks on this today: + +- VEs at every layer, at alternating layers, U shaped, front and back. Alternating layers worked best, i.e. we end up with *a lot* more VEs than modded-nanogpt, at every other layer. It works better. +- Many parameter sharing ideas to reduce new parameter count, nothing here worked. All failed. +- Many ideas to reduce parameter count, the LLM hates all of them: low rank decompositions, projections. All failed. +- Gated yes or no and how much. Gate helps. + +Long story short is that the models *love* Value Embeddings. It is a way to add a huge amount of capacity (parameters) to the model at almost zero cost of FLOPs, because these embeddings are simply added to the Values tensor. Any attempt to reduce the capacity of value embeddings (param sharing, low rank, projections) fails. The model wants many of them, and with all the capacity, and doing so wins across all x axes of steps, flops and wall clock. I re-ran the scaling laws and, because the models are now very parameter bloated, the optimal ratio has halved from 8 to 4! Way down lower than Chinchilla's 20 at this point. + +Other experiments, looking at val/bpb as a function of all of steps, flops and wall clock time: + +- Aspect ratio of 128 is worse than 64, I tried a sweep fixing FLOPs == 1e18 and 64 outperforms. The LLM prefers to be slightly thinner and longer. +- Head dim definitely prefers to be 128 instead of 64, i.e. fewer bigger heads +- Bunch of other random stuff like that. + +Keeping all of this work on a private branch for now but hope to push shortly. + +--- + ## 2026-01-17: Modded-nanogpt Ideas Sweep (Continued) Continued testing ideas from modded-nanogpt.
From 63bb5831e27ec4ad5f7493412cf16f3aa2a35877 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 18 Jan 2026 15:27:41 +0000 Subject: [PATCH 061/119] something i've wanted to do for a while - move all .sh runs to their own directory so they don't pollute root dir --- README.md | 27 ++++++++++++++----------- miniseries.sh => runs/miniseries.sh | 0 run1000.sh => runs/run1000.sh | 0 {dev => runs}/runcpu.sh | 0 scaling_laws.sh => runs/scaling_laws.sh | 0 speedrun.sh => runs/speedrun.sh | 0 6 files changed, 15 insertions(+), 12 deletions(-) rename miniseries.sh => runs/miniseries.sh (100%) rename run1000.sh => runs/run1000.sh (100%) rename {dev => runs}/runcpu.sh (100%) rename scaling_laws.sh => runs/scaling_laws.sh (100%) rename speedrun.sh => runs/speedrun.sh (100%) diff --git a/README.md b/README.md index 9808c207..fb8747fd 100644 --- a/README.md +++ b/README.md @@ -4,29 +4,29 @@ > The best ChatGPT that $100 can buy. -This repo is a full-stack implementation of an LLM like ChatGPT in a single, clean, minimal, hackable, dependency-lite codebase. nanochat is designed to run on a single 8XH100 node via scripts like [speedrun.sh](speedrun.sh), that run the entire pipeline start to end. This includes tokenization, pretraining, finetuning, evaluation, inference, and web serving over a simple UI so that you can talk to your own LLM just like ChatGPT. nanochat will become the capstone project of the course LLM101n being developed by Eureka Labs. +This repo is a full-stack implementation of an LLM like ChatGPT in a single, clean, minimal, hackable, dependency-lite codebase. nanochat is designed to run on a single 8XH100 node via scripts like [speedrun.sh](runs/speedrun.sh), that run the entire pipeline start to end. This includes tokenization, pretraining, finetuning, evaluation, inference, and web serving over a simple UI so that you can talk to your own LLM just like ChatGPT. 
nanochat will become the capstone project of the course LLM101n being developed by Eureka Labs. ## Updates - (Jan 16 2026) The repo is in active development, I am currently fleshing out the pretraining stage. -- (Jan 7 2026) See new post: [nanochat Miniseries v1](https://github.com/karpathy/nanochat/discussions/420) and the associated script [miniseries.sh](miniseries.sh). +- (Jan 7 2026) See new post: [nanochat Miniseries v1](https://github.com/karpathy/nanochat/discussions/420) and the associated script [miniseries.sh](runs/miniseries.sh). ## Talk to it -To get a sense of the endpoint of this repo, you can currently find [nanochat d34](https://github.com/karpathy/nanochat/discussions/314) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d34" means that this model has 34 layers in the Transformer neural network. This model has 2.2 billion parameters, it was trained on 88 billion tokens by simply running the training script [run1000.sh](run1000.sh) with `--target_param_data_ratio=40` (2x longer than Chinchilla-optimal), and the total cost of training was ~$2,500 (about 100 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to... +To get a sense of the endpoint of this repo, you can currently find [nanochat d34](https://github.com/karpathy/nanochat/discussions/314) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d34" means that this model has 34 layers in the Transformer neural network. 
This model has 2.2 billion parameters, it was trained on 88 billion tokens by simply running the training script [run1000.sh](runs/run1000.sh) with `--target_param_data_ratio=40` (2x longer than Chinchilla-optimal), and the total cost of training was ~$2,500 (about 100 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to... ## Quick start -The fastest way to feel the magic is to run the speedrun script [speedrun.sh](speedrun.sh), which trains and inferences the $100 tier of nanochat. On an 8XH100 node at $24/hr, this gives a total run time of about 4 hours. Boot up a new 8XH100 GPU box from your favorite provider (e.g. I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: +The fastest way to feel the magic is to run the speedrun script [speedrun.sh](runs/speedrun.sh), which trains and inferences the $100 tier of nanochat. On an 8XH100 node at $24/hr, this gives a total run time of about 4 hours. Boot up a new 8XH100 GPU box from your favorite provider (e.g. 
I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: ```bash -bash speedrun.sh +bash runs/speedrun.sh ``` Alternatively, since the script runs for 4 hours, I like to launch it like this inside a new screen session `speedrun` (and also log output to `speedrun.log`): ```bash -screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh +screen -L -Logfile speedrun.log -S speedrun bash runs/speedrun.sh ``` See the [screen cheatsheet](https://gist.github.com/jctosta/af918e1618682638aa82) if you are less familiar. You can watch it go inside the screen session, or detach with `Ctrl-a d` and `tail speedrun.log` to view progress. Now wait 4 hours. Once it's done, you can talk to your LLM via the ChatGPT-like web UI. Make sure again that your local uv virtual environment is active (run `source .venv/bin/activate`), and serve it: @@ -73,7 +73,7 @@ Total wall clock time: 3h51m Unsurprisingly, $100 is not enough to train a highly performant ChatGPT clone. In fact, LLMs are famous for their multi-million dollar capex. For our purposes, I think there are two more scales of interest. First is the ~$300 tier d26 model (i.e. depth=26) that trains in ~12 hours, which slightly outperforms GPT-2 CORE score. Second is the $1000 tier (~41.6 hours), just because it's a nice round number. But both of these are not yet fully supported and therefore not attached here in the master branch yet. -That said, to give a sense, the example changes needed for the [speedrun.sh](speedrun.sh) file to train a GPT-2 grade model d26 only involve three changes: +That said, to give a sense, the example changes needed for the [speedrun.sh](runs/speedrun.sh) file to train a GPT-2 grade model d26 only involve three changes: ```bash ... 
@@ -100,7 +100,7 @@ And a bit more about computing environments that will run nanochat: ## Running on CPU / MPS -nanochat can be run on CPU or on MPS (if you're on Macbook) in principle, and will automatically try to detect what device is best to run on. The script [dev/runcpu.sh](dev/runcpu.sh) shows a very simple example that will exercise the code paths but basically produce garbage results. Unless you know what you're doing, I basically don't recommend using this script right now and hope to tune it a bit more in the future. +nanochat can be run on CPU or on MPS (if you're on Macbook) in principle, and will automatically try to detect what device is best to run on. The script [runcpu.sh](runs/runcpu.sh) shows a very simple example that will exercise the code paths but basically produce garbage results. Unless you know what you're doing, I basically don't recommend using this script right now and hope to tune it a bit more in the future. ## Customization @@ -132,8 +132,7 @@ python -m pytest tests/test_engine.py -v -s │ ├── gen_synthetic_data.py # Example synthetic data for identity │ ├── generate_logo.html │ ├── nanochat.png -│ ├── repackage_data_reference.py # Pretraining data shard generation -│ └── runcpu.sh # Small example of how to run on CPU/MPS +│ └── repackage_data_reference.py # Pretraining data shard generation ├── nanochat │ ├── __init__.py # empty │ ├── adamw.py # Distributed AdamW optimizer @@ -152,7 +151,12 @@ python -m pytest tests/test_engine.py -v -s │ ├── tokenizer.py # BPE Tokenizer wrapper in style of GPT-4 │ └── ui.html # HTML/CSS/JS for nanochat frontend ├── pyproject.toml -├── run1000.sh # Train the ~$800 nanochat d32 +├── runs +│ ├── miniseries.sh # Miniseries training script +│ ├── run1000.sh # Train the ~$800 nanochat d32 +│ ├── runcpu.sh # Small example of how to run on CPU/MPS +│ ├── scaling_laws.sh # Scaling laws experiments +│ └── speedrun.sh # Train the ~$100 nanochat d20 ├── scripts │ ├── base_eval.py # Base model: calculate CORE 
score │ ├── base_loss.py # Base model: calculate bits per byte, sample @@ -165,7 +169,6 @@ python -m pytest tests/test_engine.py -v -s │ ├── mid_train.py # Chat model: midtraining │ ├── tok_eval.py # Tokenizer: evaluate compression rate │ └── tok_train.py # Tokenizer: train it -├── speedrun.sh # Train the ~$100 nanochat d20 ├── tasks │ ├── arc.py # Multiple choice science questions │ ├── common.py # TaskMixture | TaskSequence diff --git a/miniseries.sh b/runs/miniseries.sh similarity index 100% rename from miniseries.sh rename to runs/miniseries.sh diff --git a/run1000.sh b/runs/run1000.sh similarity index 100% rename from run1000.sh rename to runs/run1000.sh diff --git a/dev/runcpu.sh b/runs/runcpu.sh similarity index 100% rename from dev/runcpu.sh rename to runs/runcpu.sh diff --git a/scaling_laws.sh b/runs/scaling_laws.sh similarity index 100% rename from scaling_laws.sh rename to runs/scaling_laws.sh diff --git a/speedrun.sh b/runs/speedrun.sh similarity index 100% rename from speedrun.sh rename to runs/speedrun.sh From 6a477eedbdc8d2c66da84c2fbfcf907ee7e1ba60 Mon Sep 17 00:00:00 2001 From: xiayan0118 <49345397+xiayan0118@users.noreply.github.com> Date: Mon, 19 Jan 2026 17:19:51 -0800 Subject: [PATCH 062/119] fix: pass device_type to compute_init in engine.__main__ (#451) When running engine.py directly on non-GPU devices (CPU, MPS), compute_init() needs the device_type parameter to initialize correctly. This fixes failures on machines without CUDA support. 
--- nanochat/engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/engine.py b/nanochat/engine.py index 7f05eb4e..a1ba24c8 100644 --- a/nanochat/engine.py +++ b/nanochat/engine.py @@ -306,8 +306,8 @@ if __name__ == "__main__": """ import time # init compute - ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init() device_type = autodetect_device_type() + ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() # load the model and tokenizer From 85b3e95e0966a9ef4d46c59c5598922a15affd51 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 25 Jan 2026 00:03:55 +0000 Subject: [PATCH 063/119] 320 experiments just to tune the adam beta1 of x0 a little bit up from 0.8 to 0.96 --- dev/LOG.md | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ nanochat/gpt.py | 2 +- 2 files changed, 67 insertions(+), 1 deletion(-) diff --git a/dev/LOG.md b/dev/LOG.md index 8aeffdbf..068b35e9 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,72 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-19 to 2026-01-22: Optimizer Hyperparameter Sweep + +Ran ~320 experiments across 6 rounds, scaling from d12→d16→d20 to find optimal optimizer hyperparameters. Added granular per-component control to `setup_optimizers()` — separate LRs and betas for embedding, unembedding, value_embeds, resid_lambdas, x0_lambdas, and Muon matrix params. + +### What We Swept +- Learning rates for all 6 parameter groups +- Beta1/beta2 for all 5 AdamW groups +- Muon momentum (start/end), weight decay +- Hundreds of combinations (2-way, 3-way, 4-way, etc.) 
+ +### The Journey + +**At d12**, found two independent improvement routes: +- **Route A:** emb_lr↑ (0.3→0.4), weight_decay↑ (0.1→0.15), matrix_lr↑ (0.02→0.025) +- **Route B:** x0_lr↓ (0.5→0.2), x0_beta1↑ (0.8→0.9+) + +Both gave ~0.002 improvement, but combining them caused conflicts. Fine-tuning found wd=0.13, matrix_lr=0.027, emb_lr=0.38 helped slightly. Best d12 config: Route A + x0_beta1=0.95. + +**At d16**, Route B became competitive with Route A. The routes still conflicted when combined. + +**At d20** (target scale), everything changed: +- Fine-tuned values from d12 **actively hurt** performance +- Routes no longer conflicted +- Just `x0_beta1=0.96` alone captured nearly all the gains + +### Final x0_beta1 Sweep at d20 + +| x0_beta1 | val/bpb | Δ vs baseline | +|----------|---------|---------------| +| **0.96** | **0.7971** | **-0.0007** | +| 0.94 | 0.7972 | -0.0006 | +| 0.90 | 0.7972 | -0.0006 | +| 0.97 | 0.7977 | -0.0001 | +| 0.98 | 0.8011 | +0.0033 💀 | + +Flat plateau from 0.90-0.96, then sharp cliff at 0.97+. + +### Key Learnings + +1. **Hyperparameters are scale-dependent.** What works at d12 doesn't transfer to d20. The elaborate fine-tuning that won at d12 actively hurts at d20. + +2. **Improvement magnitude shrinks with scale.** ~0.002 at d12 → ~0.0007 at d20. The baseline is already better-tuned for larger models. + +3. **Sharp cliffs exist.** x0_beta1=0.98 is catastrophic while 0.96 is optimal. + +4. **Don't over-tune on small proxies.** Validate at target scale before shipping. + +### Final Recommendation + +For production d20 runs, add one flag: +``` +--x0-lambdas-beta1=0.96 +``` + +Skip everything else discovered at smaller scales. + +--- + +## 2026-01-18: More various experiments + +- Tried Muon custom kernels for XXT and all the others. The improvement was there for targeted tests (~20%) but washed out completely to noise in an actual training run, especially because the Muon compute is split across all the workers. 
Abandoned due to complexity bloat. +- Fuse Q,K,V,O nn.Linear layers into a single QKVO Linear layer. ~Zero impact +- Tried the `sa_lambdas` that gate QKV and O. Slightly confused because of the use of rmsnorm, which erases the effect of any scalar multiplier. Helped a tiny bit (~1e-4 of loss), abandoned to control complexity. + +--- + ## 2026-01-17: Various experiments Modded-nanogpt uses [Value Embeddings](https://arxiv.org/abs/2410.17897) (VEs) in a funny U-shaped structure, 3 of them in total and with gates. I tried a large number of tweaks on this today: diff --git a/nanochat/gpt.py b/nanochat/gpt.py index cb4bd05b..f62d04be 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -349,7 +349,7 @@ class GPT(nn.Module): dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), dict(params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream - dict(params=x0_params, lr=scalar_lr), + dict(params=x0_params, lr=scalar_lr, betas=(0.96, 0.95)), # higher beta1 for x0 scalars ] adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight decay is hardcoded to 0.0 for AdamW, only used in Muon AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) From 59e36cc727521cc254e61f82e29980b9068cf272 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 25 Jan 2026 18:59:51 +0000 Subject: [PATCH 064/119] first version of engram following modded nanogpt style --- nanochat/gpt.py | 62 ++++++++++++++++++++++++++++++++++++++----- scripts/base_train.py | 4 +-- 2 files changed, 58 insertions(+), 8 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index f62d04be..b810ec9b 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -45,6 +45,41 @@ def norm(x): return F.rms_norm(x, (x.size(-1),)) +class BigramEmbed(nn.Module): + """ + Hash bigrams to embeddings. 
Simple, self-contained, runs on GPU. + Following modded-nanogpt's approach: single hash, no gating. + + For each position t, hashes (token[t-1], token[t]) to an index in a large + embedding table. This provides O(1) lookup for local 2-gram patterns, + offloading static pattern reconstruction from the transformer layers. + + Ref: https://github.com/KellerJordan/modded-nanogpt/pull/201 + Ref: https://arxiv.org/abs/1709.03933 (Hash Embeddings) + """ + def __init__(self, vocab_size: int, embed_dim: int, table_multiplier: int = 5): + super().__init__() + self.bigram_vocab_size = vocab_size * table_multiplier + self.embed = nn.Embedding(self.bigram_vocab_size, embed_dim) + + def forward(self, idx: torch.Tensor) -> torch.Tensor: + """ + idx: (B, T) token ids + Returns: (B, T, embed_dim) bigram embeddings + """ + # Hash (prev_token, curr_token) -> index + # Position 0 gets a reserved index (no valid bigram) + rand_int_1 = 36313 + rand_int_2 = 27191 + mod = self.bigram_vocab_size - 1 + + h = torch.empty_like(idx, dtype=torch.long) + h[:, 0] = mod # reserved index for position 0 + h[:, 1:] = (rand_int_1 * idx[:, 1:] ^ rand_int_2 * idx[:, :-1]) % mod + + return self.embed(h) + + def has_ve(layer_idx, n_layer): """Returns True if GPT layer should have Value Embedding (alternating, last layer always included).""" return layer_idx % 2 == (n_layer - 1) % 2 @@ -169,9 +204,13 @@ class GPT(nn.Module): # Per-layer learnable scalars (inspired by modded-nanogpt) # resid_lambdas: scales the residual stream at each layer (init 1.0 = neutral) # x0_lambdas: blends initial embedding back in at each layer (init 0.0 = disabled) + # bigram_lambdas: blends bigram embeddings in at each layer (init 0.1 = small contribution) # Separate parameters so they can have different optimizer treatment self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() + 
self.bigram_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() + # Bigram hash embeddings: O(1) lookup for local 2-gram patterns + self.bigram_embed = BigramEmbed(config.vocab_size, config.n_embd) # Value embeddings (ResFormer-style): alternating layers, last layer always included head_dim = config.n_embd // config.n_head kv_dim = config.n_kv_head * head_dim @@ -219,7 +258,11 @@ class GPT(nn.Module): # Per-layer scalars self.resid_lambdas.fill_(1.0) # 1.0 => typical residual connections at init - self.x0_lambdas.fill_(0.0) # 0.0 => skip connection to input is disabled at init + self.x0_lambdas.fill_(0.1) # 0.1 => small initial weight for skip connection to input embedding + self.bigram_lambdas.fill_(0.1) # 0.1 => small initial weight for skip connection to bigram embeddings + + # Bigram embeddings: zero init so it starts as identity + nn.init.zeros_(self.bigram_embed.embed.weight) # Value embeddings (init like c_v: uniform with same std) for ve in self.value_embeds.values(): @@ -240,6 +283,7 @@ class GPT(nn.Module): self.transformer.wte.to(dtype=torch.bfloat16) for ve in self.value_embeds.values(): ve.to(dtype=torch.bfloat16) + self.bigram_embed.to(dtype=torch.bfloat16) def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): # TODO: bump base theta more? e.g. 
100K is more common more recently @@ -305,8 +349,9 @@ class GPT(nn.Module): nparams = sum(p.numel() for p in self.parameters()) # Exclude non-matmul params: embeddings and per-layer scalars value_embeds_numel = sum(ve.weight.numel() for ve in self.value_embeds.values()) - nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel + - self.resid_lambdas.numel() + self.x0_lambdas.numel()) + bigram_embed_numel = self.bigram_embed.embed.weight.numel() + nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel + bigram_embed_numel + + self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.bigram_lambdas.numel()) h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len # Sum attention FLOPs per layer, accounting for sliding window attn_flops = 0 @@ -339,7 +384,9 @@ class GPT(nn.Module): lm_head_params = list(self.lm_head.parameters()) resid_params = [self.resid_lambdas] x0_params = [self.x0_lambdas] - assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) + bigram_embed_params = list(self.bigram_embed.parameters()) + bigram_lambda_params = [self.bigram_lambdas] + assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) + len(bigram_embed_params) + len(bigram_lambda_params) # Create the AdamW optimizer for the embedding, lm_head, and per-layer scalars # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 @@ -348,8 +395,10 @@ class GPT(nn.Module): dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), dict(params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding + 
dict(params=bigram_embed_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream dict(params=x0_params, lr=scalar_lr, betas=(0.96, 0.95)), # higher beta1 for x0 scalars + dict(params=bigram_lambda_params, lr=scalar_lr, betas=(0.96, 0.95)), # same treatment as x0 lambdas ] adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight decay is hardcoded to 0.0 for AdamW, only used in Muon AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) @@ -377,11 +426,12 @@ class GPT(nn.Module): cos_sin = self.cos[:, T0:T0+T], self.sin[:, T0:T0+T] # truncate cache to current sequence length # Forward the trunk of the Transformer - x = self.transformer.wte(idx) + x = self.transformer.wte(idx) # embed current token + x0_bigram = self.bigram_embed(idx) # embed current bigram (via hash lookup) x = norm(x) x0 = x # save initial normalized embedding for x0 residual for i, block in enumerate(self.transformer.h): - x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 + x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 + self.bigram_lambdas[i] * x0_bigram ve = self.value_embeds[str(i)](idx) if str(i) in self.value_embeds else None x = block(x, ve, cos_sin, self.window_sizes[i], kv_cache) x = norm(x) diff --git a/scripts/base_train.py b/scripts/base_train.py index 2d614774..02eeea3c 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -1,11 +1,11 @@ """ Train model. From root directory of the project, run as: -python -m scripts.base_train.py +python -m scripts.base_train or distributed as: -torchrun --nproc_per_node=8 -m scripts.base_train.py +torchrun --nproc_per_node=8 -m scripts.base_train If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. 
Example: python -m scripts.base_train --depth=4 --max-seq-len=512 --device-batch-size=1 --eval-tokens=512 --core-metric-every=-1 --total-batch-size=512 --num-iterations=20 From 8630d32be43912c1f8670c03fe6c0bdc843c1215 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 26 Jan 2026 22:31:42 +0000 Subject: [PATCH 065/119] quick fix to not OOM main speedrun script --- runs/speedrun.sh | 4 ++-- scripts/tok_train.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/runs/speedrun.sh b/runs/speedrun.sh index 8fff5640..ef4fa00d 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -58,8 +58,8 @@ python -m nanochat.dataset -n 8 # See comment below for why 370 is the right number here python -m nanochat.dataset -n 370 & DATASET_DOWNLOAD_PID=$! -# train the tokenizer with vocab size 2**16 = 65536 on ~2B characters of data -python -m scripts.tok_train --max-chars=2000000000 --vocab-size=65536 +# train the tokenizer with vocab size 2**15 = 32768 on ~2B characters of data +python -m scripts.tok_train # evaluate the tokenizer (report compression ratio etc.) 
python -m scripts.tok_eval diff --git a/scripts/tok_train.py b/scripts/tok_train.py index 9c7979d2..480e0e16 100644 --- a/scripts/tok_train.py +++ b/scripts/tok_train.py @@ -14,7 +14,7 @@ from nanochat.dataset import parquets_iter_batched # Parse command line arguments parser = argparse.ArgumentParser(description='Train a BPE tokenizer') -parser.add_argument('--max-chars', type=int, default=10_000_000_000, help='Maximum characters to train on (default: 10B)') +parser.add_argument('--max-chars', type=int, default=2_000_000_000, help='Maximum characters to train on (default: 2B)') parser.add_argument('--doc-cap', type=int, default=10_000, help='Maximum characters per document (default: 10,000)') parser.add_argument('--vocab-size', type=int, default=32768, help='Vocabulary size (default: 32768 = 2^15)') args = parser.parse_args() From c8d93beed2febd8f542dfc1c7f9b3d1435f28c9c Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 27 Jan 2026 22:31:17 +0000 Subject: [PATCH 066/119] add engram-lite, add log, tune scaling laws analysis scripts --- dev/LOG.md | 134 ++++++++++++++++++++++++++++++ dev/scaling_analysis.ipynb | 164 +++++++++++++++++++++++++++++++++++-- nanochat/gpt.py | 35 ++++++-- runs/scaling_laws.sh | 29 ++++--- scripts/base_train.py | 19 +++-- 5 files changed, 346 insertions(+), 35 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index 068b35e9..bba35ea9 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,140 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-27: Bigram Hash Embeddings (Engram-lite) + +Explored N-gram memory modules inspired by the [DeepSeek Engram paper](https://arxiv.org/abs/2506.08046) and [modded-nanogpt PR #201](https://github.com/KellerJordan/modded-nanogpt/pull/201). + +### Background + +The Engram paper introduces "conditional memory" as a complement to MoE - using O(1) hash lookups to retrieve static N-gram patterns instead of reconstructing them through computation.
Key insight: transformers waste early layers "simulating retrieval through computation" for patterns like named entities and formulaic phrases that could be simple table lookups. + +### What We Tried + +**1. Full Engram module with context-aware gating (paper design)** +```python +# Hash bigrams to retrieve embeddings, then gate with hidden state +e = embed(hash(prev_token, curr_token)) +q = RMSNorm(h) # hidden state as query +k = RMSNorm(W_k @ e) # projected embedding as key +v = W_v @ e +α = sigmoid(q · k / √d) # scalar gate per position +output = α * v +``` +- Injected after block 1 (paper found early injection optimal) +- Slight improvement, but quite a bit of complexity added. + +**2. Early-layer only injection** +- Only inject bigram signal in first 4 layers (where paper claims static pattern offloading helps most) +- **Result:** Actually hurt performance. The model seems to need uniform injection across all layers. + +**3. Trigrams** +- Extended to hash both 2-grams and 3-grams, concatenating embeddings +- **Result:** No improvement over bigrams alone. Dilutes capacity from more frequent 2-gram patterns. + +**4. Bigram-only with x0-style injection (modded-nanogpt engram-lite approach)** +- Simple hash: `(36313 * curr) XOR (27191 * prev) mod table_size` +- Zero-init embedding table, learned per-layer lambdas +- Add to residual at every layer: `x = resid_λ[i]*x + x0_λ[i]*x0 + bigram_λ[i]*x0_bigram` +- **Result:** This simple approach works and provides a consistent improvement. 
+ +TLDR The winning approach follows modded-nanogpt's "engram-lite", simply adding the following module and feeding its output into the residual branch (gated by a per-layer learnable \lambda) before every single block: + +```python +class BigramEmbed(nn.Module): + def __init__(self, vocab_size, embed_dim, table_multiplier=5): + self.embed = nn.Embedding(vocab_size * table_multiplier, embed_dim) + + def forward(self, idx): + h = ((36313 * idx[:, 1:]) ^ (27191 * idx[:, :-1])) % (table_size - 1) + return self.embed(h) +``` + +As for optimal hyperparameters: + +- **Table size:** `vocab_size * 5` (~164K entries for 32K vocab). Swept a number of settings and 5 was optimal. +- **Injection:** Every layer via learned `bigram_lambdas` (init 0.1 was better than 0.0). +- **Normalization:** Also tried adding a `norm()` to the embeddings (mirroring the token embeddings), this was slightly worse. +- **Init:** Zero-init embedding, so starts as identity (tried small noisy init, it's worse) +- **Optimizer:** AdamW with same LR as token embeddings + +### Key Learnings + +1. **Gating didn't help at our scale.** The paper's context-aware gating mechanism (sigmoid dot-product gate) added parameters and complexity without improvement. modded-nanogpt found the same: "simple direct addition to the residual stream outperformed by a decent margin." + +2. **Uniform injection beats early-only.** Despite the paper's finding that early layers benefit most, restricting injection to early layers hurt. The x0-style "add everywhere with learned lambda" pattern works better for our architecture/scale. + +3. **Bigrams are sufficient.** Trigrams didn't help - the extra context doesn't pay for the diluted capacity. + +4. **Scale matters.** The Engram paper's results are at 27B params with MoE. At our ~100M-1B scale, the simpler approach wins. The elaborate gating mechanism may become useful at larger scales where collision handling matters more.
+ +### Parameters Added + +For d12 model with `table_multiplier=5`: +- Bigram embedding: 32768 × 5 × 768 = ~126M params +- Per-layer lambdas: 12 scalars (negligible) + +If you're keeping track, we now have *a lot* of parameters, a significant amount of them in embeddings (token embeddings, bigram embeddings, value embeddings). For example, for a d12 we now have: + +``` +Parameter counts: +wte : 25,165,824 +bigram_embed : 125,829,120 +value_embeds : 150,994,944 +lm_head : 25,165,824 +transformer_matrices : 84,935,808 +scalars : 36 +total : 412,091,556 +``` + +In other words, only about a quarter of parameters are now weight projections and the vast majority is embedding tables. + +Still, on all axes (steps, wall clock time, flops), this somewhat parameter-bloated architecture beats the baseline and will now become the default. + +After adding the engram-lite, I re-ran the scaling laws to determine the new optimal tokens:params ratio. I swept FLOPs in the range 1e18..1e19, exponentially strided in 4 settings (1e18, 2e18, 5e18, 1e19). I looked at a number of ways of determining the effective parameter count for the purposes of the scaling laws. The results looked like this: + +``` +Kaplan-style (all projections including lm_head and no embeddings) + +Optimal configurations (from quadratic fits): +FLOPs Eff Params Tokens Ratio Val BPB +----------------------------------------------------------------- +1e+18 110,678,115 1,241,505,403 11.2 0.8972 +2e+18 167,797,457 1,785,336,422 10.7 0.8616 +5e+18 250,650,865 2,642,234,152 10.8 0.8293 +1e+19 381,758,347 3,806,871,243 10.3 0.7999 + +N \propto C^0.54, D \propto C^0.49 + +Chinchilla-style (all parameters, period.) 
+ +Optimal configurations (from quadratic fits): +FLOPs Eff Params Tokens Ratio Val BPB +----------------------------------------------------------------- +1e+18 416,320,605 1,232,157,011 3.0 0.8974 +2e+18 560,239,841 1,763,669,281 3.2 0.8616 +5e+18 741,495,903 2,629,909,368 3.6 0.8291 +1e+19 988,644,331 3,884,841,895 4.0 0.7999 + +N \propto C^0.37, D \propto C^0.50 + +Transformer-only-style (only the projections inside the transformer) + +Optimal configurations (from quadratic fits): +FLOPs Eff Params Tokens Ratio Val BPB +----------------------------------------------------------------- +1e+18 80,259,665 1,315,639,547 17.2 0.8966 +2e+18 131,488,566 1,864,134,141 14.5 0.8622 +5e+18 220,985,474 2,595,328,843 12.1 0.8302 +1e+19 401,213,504 3,328,704,512 8.5 0.7994 + +N \propto C^0.70, D \propto C^0.41 +``` + +Clearly, the Kaplan-style ratios are most consistent and produce stable ~0.5 exponents for both params and tokens, meaning we can have a single fixed ratio of tokens:params for compute optimal models. This turns out to be about ~10.5, which now becomes the new default. + +--- + ## 2026-01-19 to 2026-01-22: Optimizer Hyperparameter Sweep Ran ~320 experiments across 6 rounds, scaling from d12→d16→d20 to find optimal optimizer hyperparameters. Added granular per-component control to `setup_optimizers()` — separate LRs and betas for embedding, unembedding, value_embeds, resid_lambdas, x0_lambdas, and Muon matrix params. 
diff --git a/dev/scaling_analysis.ipynb b/dev/scaling_analysis.ipynb index a196bd18..e7761c5a 100644 --- a/dev/scaling_analysis.ipynb +++ b/dev/scaling_analysis.ipynb @@ -15,14 +15,16 @@ "metadata": {}, "outputs": [], "source": [ + "%matplotlib inline\n", "import os\n", "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# Load results\n", + "tag = \"jan26\"\n", "base_dir = os.environ.get('NANOCHAT_BASE_DIR', os.path.expanduser('~/.cache/nanochat'))\n", - "results_path = os.path.join(base_dir, 'scaling_laws_results', 'results.csv')\n", + "results_path = os.path.join(base_dir, f'scaling_laws_results_{tag}', 'results.csv')\n", "\n", "df = pd.read_csv(results_path)\n", "flops_budgets = sorted(df['flops_budget'].unique())\n", @@ -31,6 +33,99 @@ "df" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================================\n", + "# FILTERING: Remove incomplete or problematic runs\n", + "# =============================================================================\n", + "\n", + "print(f\"Before filtering: {len(df)} runs\")\n", + "\n", + "# Filter out runs with missing/invalid val_bpb (incomplete runs)\n", + "df = df[df['val_bpb'].notna() & (df['val_bpb'] > 0)]\n", + "\n", + "# Optional: exclude specific flops budgets that aren't done yet\n", + "# exclude_flops = [1e19] # <-- adjust as runs complete\n", + "# df = df[~df['flops_budget'].isin(exclude_flops)]\n", + "\n", + "# Optional: exclude specific depths\n", + "# exclude_depths = [18, 20]\n", + "# df = df[~df['depth'].isin(exclude_depths)]\n", + "\n", + "print(f\"After filtering: {len(df)} runs\")\n", + "print(f\"FLOPs budgets: {sorted(df['flops_budget'].unique())}\")\n", + "print(f\"Depths: {sorted(df['depth'].unique())}\")\n", + "\n", + "# Update flops_budgets list after filtering\n", + "flops_budgets = sorted(df['flops_budget'].unique())" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Effective Parameter Count\n", + "\n", + "Different scaling law papers use different conventions for counting parameters:\n", + "- **Kaplan et al.** excluded embedding parameters (claimed cleaner laws)\n", + "- **Chinchilla** included all parameters (and noted Kaplan had a bug)\n", + "\n", + "Our CSV now has granular counts:\n", + "- `params_wte` - token embedding (lookup table)\n", + "- `params_bigram_embed` - bigram hash embeddings (lookup table)\n", + "- `params_value_embeds` - value embeddings (lookup table)\n", + "- `params_lm_head` - unembedding projection (matmul)\n", + "- `params_transformer` - attention + MLP matrices (matmuls)\n", + "- `params_scalars` - resid/x0/bigram lambdas (tiny)\n", + "\n", + "**Experiment below** with different combinations to see which gives the cleanest scaling laws." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================================\n", + "# EXPERIMENT HERE: Define which parameters to count for scaling laws\n", + "# =============================================================================\n", + "\n", + "def compute_effective_params(row):\n", + " \"\"\"\n", + " Compute the 'effective' parameter count for scaling law analysis.\n", + "\n", + " Modify this function to experiment with different conventions:\n", + " - Chinchilla-style: include everything\n", + " - Kaplan-style: exclude embeddings\n", + " - Matmul-only: just transformer + lm_head (the actual compute)\n", + " - etc.\n", + " \"\"\"\n", + " # Option 1: Chinchilla-style (all params)\n", + " # return row['params_total']\n", + "\n", + " # Option 2: Kaplan-style (exclude embeddings)\n", + " return row['params_transformer'] + row['params_lm_head']\n", + "\n", + " # Option 3: Transformer-only (exclude all embeddings AND lm_head)\n", + " # return row['params_transformer']\n", + "\n", + 
"\n", + "# Compute derived columns\n", + "df['effective_params'] = df.apply(compute_effective_params, axis=1)\n", + "df['param_data_ratio'] = df['tokens_trained'] / df['effective_params']\n", + "\n", + "# Show parameter breakdown for first few rows\n", + "print(\"Parameter breakdown (first row per flops budget):\")\n", + "param_cols = ['depth', 'params_wte', 'params_bigram_embed', 'params_value_embeds',\n", + " 'params_lm_head', 'params_transformer', 'params_scalars', 'params_total', 'effective_params']\n", + "df.groupby('flops_budget').first()[param_cols]" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -54,11 +149,11 @@ "optimal_by_bpb = []\n", "\n", "for flops, color in zip(flops_budgets, colors):\n", - " subset = df[df['flops_budget'] == flops].sort_values('num_scaling_params')\n", - " ax.plot(subset['num_scaling_params'], subset['val_bpb'], 'o', color=color, label=f'{flops:.0e}', markersize=8)\n", + " subset = df[df['flops_budget'] == flops].sort_values('effective_params')\n", + " ax.plot(subset['effective_params'], subset['val_bpb'], 'o', color=color, label=f'{flops:.0e}', markersize=8)\n", "\n", " # Fit quadratic in log-space: val_bpb = a*(log N)^2 + b*(log N) + c\n", - " log_params = np.log10(subset['num_scaling_params'])\n", + " log_params = np.log10(subset['effective_params'])\n", " coeffs = np.polyfit(log_params, subset['val_bpb'], 2)\n", " a, b, c = coeffs\n", "\n", @@ -83,13 +178,13 @@ " # Fallback to raw minimum if quadratic doesn't have minimum\n", " best_idx = subset['val_bpb'].idxmin()\n", " best = subset.loc[best_idx]\n", - " ax.scatter([best['num_scaling_params']], [best['val_bpb']], s=150, color=color,\n", + " ax.scatter([best['effective_params']], [best['val_bpb']], s=150, color=color,\n", " zorder=5, edgecolors='black', linewidths=2)\n", - " optimal_by_bpb.append({'flops': flops, 'params': best['num_scaling_params'],\n", + " optimal_by_bpb.append({'flops': flops, 'params': best['effective_params'],\n", " 'tokens': 
best['tokens_trained'], 'ratio': best['param_data_ratio'], 'bpb': best['val_bpb']})\n", "\n", "ax.set_xscale('log')\n", - "ax.set_xlabel('Parameters')\n", + "ax.set_xlabel('Effective Parameters')\n", "ax.set_ylabel('Validation Loss (bpb)')\n", "ax.set_title('IsoFLOP Curves')\n", "ax.legend(title='FLOPs', loc='upper right')\n", @@ -138,10 +233,61 @@ "\n", "# Print the optimal points (from quadratic fits)\n", "print(\"\\nOptimal configurations (from quadratic fits):\")\n", - "print(f\"{'FLOPs':<12} {'Params':<15} {'Tokens':<15} {'Ratio':<10} {'Val BPB':<10}\")\n", + "print(f\"{'FLOPs':<12} {'Eff Params':<15} {'Tokens':<15} {'Ratio':<10} {'Val BPB':<10}\")\n", "print(\"-\" * 65)\n", "for _, row in opt_df.iterrows():\n", - " print(f\"{row['flops']:<12.0e} {int(row['params']):<15,} {int(row['tokens']):<15,} {row['ratio']:<10.1f} {row['bpb']:<10.4f}\")\n" + " print(f\"{row['flops']:<12.0e} {int(row['params']):<15,} {int(row['tokens']):<15,} {row['ratio']:<10.1f} {row['bpb']:<10.4f}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# =============================================================================\n", + "# Optimal Ratio Summary (from power law fits)\n", + "# =============================================================================\n", + "\n", + "# From the power law fits: N ∝ C^a and D ∝ C^b\n", + "# The ratio D/N ∝ C^(b-a). 
If a ≈ b, ratio is roughly constant.\n", + "\n", + "if len(opt_df) >= 2:\n", + " log_f = np.log10(opt_df['flops'])\n", + " log_p = np.log10(opt_df['params'])\n", + " log_t = np.log10(opt_df['tokens'])\n", + "\n", + " # Fit power laws\n", + " slope_n, intercept_n = np.polyfit(log_f, log_p, 1)\n", + " slope_d, intercept_d = np.polyfit(log_f, log_t, 1)\n", + "\n", + " # The ratio D/N at a reference compute (geometric mean of our budgets)\n", + " ref_flops = np.sqrt(opt_df['flops'].min() * opt_df['flops'].max())\n", + " log_ref = np.log10(ref_flops)\n", + "\n", + " # Predicted optimal N and D at reference compute\n", + " pred_log_n = intercept_n + slope_n * log_ref\n", + " pred_log_d = intercept_d + slope_d * log_ref\n", + " optimal_ratio = 10**(pred_log_d - pred_log_n)\n", + "\n", + " # Also compute from the fitted optimals directly (mean and std)\n", + " mean_ratio = opt_df['ratio'].mean()\n", + " std_ratio = opt_df['ratio'].std()\n", + "\n", + " print(\"=\" * 60)\n", + " print(\"OPTIMAL RATIO SUMMARY\")\n", + " print(\"=\" * 60)\n", + " print(f\"\\nPower law exponents:\")\n", + " print(f\" N ∝ C^{slope_n:.3f}\")\n", + " print(f\" D ∝ C^{slope_d:.3f}\")\n", + " print(f\" Ratio exponent (b-a): {slope_d - slope_n:.3f} (should be ~0 if ratio is constant)\")\n", + " print(f\"\\nOptimal ratio (tokens per effective param):\")\n", + " print(f\" From power law at C={ref_flops:.1e}: {optimal_ratio:.1f}\")\n", + " print(f\" Mean across budgets: {mean_ratio:.1f} ± {std_ratio:.1f}\")\n", + " print(f\" Chinchilla reference: 20\")\n", + " print(f\"\\nPer-budget ratios: {[f'{r:.1f}' for r in opt_df['ratio'].values]}\")\n", + "else:\n", + " print(\"Need at least 2 flops budgets to compute power law fits\")" ] }, { diff --git a/nanochat/gpt.py b/nanochat/gpt.py index b810ec9b..c55e8930 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -364,15 +364,34 @@ class GPT(nn.Module): def num_scaling_params(self): """ - Return all of the parameters, same as Chinchilla paper. 
- Kaplan et al. did not include embedding parameters and said that this led to cleaner scaling laws. - But Kaplan et al. also had a bug in their results (as pointed out by Chinchilla). - My own experiments in nanochat confirm the Chinchilla approach gives the much cleaner scaling law. - Ref: https://arxiv.org/abs/2203.15556 (Chinchilla paper <- good). - Ref: https://arxiv.org/abs/2001.08361 (Kaplan et al. original scaling laws paper <- bad) + Return detailed parameter counts for scaling law analysis. + Different papers use different conventions: + - Kaplan et al. excluded embedding parameters + - Chinchilla included all parameters + Ref: https://arxiv.org/abs/2203.15556 (Chinchilla paper) + Ref: https://arxiv.org/abs/2001.08361 (Kaplan et al. original scaling laws paper) + + Returns a dict with counts for each parameter group, so downstream analysis + can experiment with which combination gives the cleanest scaling laws. """ - nparams = sum(p.numel() for p in self.parameters()) - return nparams + # Count each group separately (mirrors the grouping in setup_optimizers) + wte = sum(p.numel() for p in self.transformer.wte.parameters()) + bigram_embed = sum(p.numel() for p in self.bigram_embed.parameters()) + value_embeds = sum(p.numel() for p in self.value_embeds.parameters()) + lm_head = sum(p.numel() for p in self.lm_head.parameters()) + transformer_matrices = sum(p.numel() for p in self.transformer.h.parameters()) + scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.bigram_lambdas.numel() + total = wte + bigram_embed + value_embeds + lm_head + transformer_matrices + scalars + assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch" + return { + 'wte': wte, + 'bigram_embed': bigram_embed, + 'value_embeds': value_embeds, + 'lm_head': lm_head, + 'transformer_matrices': transformer_matrices, + 'scalars': scalars, + 'total': total, + } def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, 
weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5): model_dim = self.config.n_embd diff --git a/runs/scaling_laws.sh b/runs/scaling_laws.sh index 1f9dab87..f1e2fd43 100644 --- a/runs/scaling_laws.sh +++ b/runs/scaling_laws.sh @@ -1,13 +1,14 @@ #!/bin/bash -LABEL="jan16" +LABEL="jan26" FLOPS_BUDGETS=( 1e18 - 3e18 - 6e18 + 2.15e18 + 4.64e18 + 1e19 ) -DEPTHS=(6 7 8 9 10 11 12 13 14) +DEPTHS=(8 10 12 14 16 18 20) NPROC_PER_NODE="${NPROC_PER_NODE:-8}" WANDB_RUN="${WANDB_RUN:-scaling_${LABEL}}" @@ -23,7 +24,7 @@ RESULTS_FILE="$RESULTS_DIR/results.csv" # Write CSV header only if file doesn't exist if [ ! -f "$RESULTS_FILE" ]; then - echo "flops_budget,depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" + echo "flops_budget,depth,model_dim,params_wte,params_bigram_embed,params_value_embeds,params_lm_head,params_transformer,params_scalars,params_total,num_iterations,tokens_trained,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" fi log() { @@ -83,13 +84,19 @@ for flops in "${FLOPS_BUDGETS[@]}"; do # Extract training stats from the log LOG_FILE="$RESULTS_DIR/${TAG}_train.log" - NUM_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | head -1 | tr -d ',') - NUM_SCALING_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ',') + + # Extract detailed parameter counts (for scaling law analysis with different conventions) + PARAMS_WTE=$(grep "wte:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_BIGRAM=$(grep "bigram_embed:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_VE=$(grep "value_embeds:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_LM=$(grep "lm_head:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_TRANSFORMER=$(grep "transformer_matrices:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + 
PARAMS_SCALARS=$(grep "scalars:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + PARAMS_TOTAL=$(grep "total:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | tr -d ',') + NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',') # Calculate tokens trained (iterations * batch_size, default 524288) TOKENS_TRAINED=$((NUM_ITERS * 524288)) - # Param:data ratio (using scaling params per Kaplan et al.) - PARAM_DATA_RATIO=$(python -c "print(f'{$TOKENS_TRAINED / $NUM_SCALING_PARAMS:.2f}')") # Model dim MODEL_DIM=$((d * 64)) # Val BPB from final eval @@ -102,10 +109,10 @@ for flops in "${FLOPS_BUDGETS[@]}"; do CORE_SCORE="0.0" fi - log " Params: $NUM_PARAMS, Iters: $NUM_ITERS, Ratio: $PARAM_DATA_RATIO, Val BPB: $VAL_BPB, CORE: $CORE_SCORE" + log " Params: $PARAMS_TOTAL (transformer: $PARAMS_TRANSFORMER), Iters: $NUM_ITERS, Val BPB: $VAL_BPB, CORE: $CORE_SCORE" # Append to CSV - echo "$flops,$d,$MODEL_DIM,$NUM_PARAMS,$NUM_SCALING_PARAMS,$NUM_ITERS,$TOKENS_TRAINED,$PARAM_DATA_RATIO,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE" + echo "$flops,$d,$MODEL_DIM,$PARAMS_WTE,$PARAMS_BIGRAM,$PARAMS_VE,$PARAMS_LM,$PARAMS_TRANSFORMER,$PARAMS_SCALARS,$PARAMS_TOTAL,$NUM_ITERS,$TOKENS_TRAINED,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE" done done diff --git a/scripts/base_train.py b/scripts/base_train.py index 02eeea3c..4fa8fcad 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -47,7 +47,7 @@ parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding # Training horizon (only one used, in order of precedence) parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)") parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") -parser.add_argument("--target-param-data-ratio", type=int, default=4, help="calculate num_iterations to maintain data:param ratio 
(Chinchilla=20, -1 = disable)") +parser.add_argument("--target-param-data-ratio", type=float, default=10.5, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") # Optimization parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens") @@ -178,9 +178,14 @@ if resuming: orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape) model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe -num_params = sum(p.numel() for p in model.parameters()) -num_scaling_params = orig_model.num_scaling_params() -print0(f"Number of parameters: {num_params:,} (scaling: {num_scaling_params:,})") + +# Detailed parameter counts +param_counts = orig_model.num_scaling_params() +print0(f"Parameter counts:") +for key, value in param_counts.items(): + print0(f"{key:24s}: {value:,}") +num_params = param_counts['total'] +num_scaling_params = param_counts['transformer_matrices'] + param_counts['lm_head'] # determined to give the cleanest scaling laws, see dev/LOG.md Jan 27, 2026 num_flops_per_token = model.estimate_flops() print0(f"Estimated FLOPs per token: {num_flops_per_token:e}") @@ -195,14 +200,14 @@ elif args.target_flops > 0: print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}") elif args.target_param_data_ratio > 0: # calculate the number of iterations from the target param data ratio (use scaling params per Kaplan et al.) 
- target_tokens = args.target_param_data_ratio * num_scaling_params + target_tokens = int(args.target_param_data_ratio * num_scaling_params) num_iterations = target_tokens // args.total_batch_size print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}") else: raise ValueError("No training horizon specified") total_tokens = args.total_batch_size * num_iterations print0(f"Total number of training tokens: {total_tokens:,}") -print0(f"Tokens : Params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 +print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") # ----------------------------------------------------------------------------- @@ -445,7 +450,7 @@ get_report().log(section="Base model training", data=[ "Number of FLOPs per token": f"{num_flops_per_token:e}", "Calculated number of iterations": num_iterations, "Number of training tokens": total_tokens, - "Tokens : Params ratio": args.total_batch_size * num_iterations / num_params, + "Tokens : Scaling params ratio": args.total_batch_size * num_iterations / num_scaling_params, "DDP world size": ddp_world_size, "warmup_ratio": args.warmup_ratio, "warmdown_ratio": args.warmdown_ratio, From d5418ea5a1367550a2fb93c12a209fe51c0d2560 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Wed, 28 Jan 2026 17:31:44 +0100 Subject: [PATCH 067/119] Fix link to DeepSeek Engram paper (#470) * Fix link to DeepSeek Engram paper in LOG.md Updated link to the DeepSeek Engram paper in the log. * remove www --- dev/LOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dev/LOG.md b/dev/LOG.md index bba35ea9..72d12072 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -6,7 +6,7 @@ A running summary documenting some experiments and findings. 
Started ~Jan 7 2026 ## 2026-01-27: Bigram Hash Embeddings (Engram-lite) -Explored N-gram memory modules inspired by the [DeepSeek Engram paper](https://arxiv.org/abs/2506.08046) and [modded-nanogpt PR #201](https://github.com/KellerJordan/modded-nanogpt/pull/201). +Explored N-gram memory modules inspired by the [DeepSeek Engram paper](https://arxiv.org/abs/2601.07372) and [modded-nanogpt PR #201](https://github.com/KellerJordan/modded-nanogpt/pull/201). ### Background From 74554be3b5008cd7803732f0d7e519944a780310 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 28 Jan 2026 20:07:39 +0000 Subject: [PATCH 068/119] revert engram, not seeing an improvement at larger scale --- dev/LOG.md | 6 +++++ nanochat/gpt.py | 64 +++++-------------------------------------------- 2 files changed, 12 insertions(+), 58 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index 72d12072..2f26165e 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,12 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-28: Reverted Bigram Hash Embeddings + +Removed bigram embeddings (engram-lite) from the codebase. At larger scale (d25), the improvement was tiny and disappeared entirely when measured by wall clock time. It also bloated the VRAM used. The extra parameters and complexity aren't justified. + +--- + ## 2026-01-27: Bigram Hash Embeddings (Engram-lite) Explored N-gram memory modules inspired by the [DeepSeek Engram paper](https://arxiv.org/abs/2601.07372) and [modded-nanogpt PR #201](https://github.com/KellerJordan/modded-nanogpt/pull/201). diff --git a/nanochat/gpt.py b/nanochat/gpt.py index c55e8930..672af717 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -45,41 +45,6 @@ def norm(x): return F.rms_norm(x, (x.size(-1),)) -class BigramEmbed(nn.Module): - """ - Hash bigrams to embeddings. Simple, self-contained, runs on GPU. - Following modded-nanogpt's approach: single hash, no gating. 
- - For each position t, hashes (token[t-1], token[t]) to an index in a large - embedding table. This provides O(1) lookup for local 2-gram patterns, - offloading static pattern reconstruction from the transformer layers. - - Ref: https://github.com/KellerJordan/modded-nanogpt/pull/201 - Ref: https://arxiv.org/abs/1709.03933 (Hash Embeddings) - """ - def __init__(self, vocab_size: int, embed_dim: int, table_multiplier: int = 5): - super().__init__() - self.bigram_vocab_size = vocab_size * table_multiplier - self.embed = nn.Embedding(self.bigram_vocab_size, embed_dim) - - def forward(self, idx: torch.Tensor) -> torch.Tensor: - """ - idx: (B, T) token ids - Returns: (B, T, embed_dim) bigram embeddings - """ - # Hash (prev_token, curr_token) -> index - # Position 0 gets a reserved index (no valid bigram) - rand_int_1 = 36313 - rand_int_2 = 27191 - mod = self.bigram_vocab_size - 1 - - h = torch.empty_like(idx, dtype=torch.long) - h[:, 0] = mod # reserved index for position 0 - h[:, 1:] = (rand_int_1 * idx[:, 1:] ^ rand_int_2 * idx[:, :-1]) % mod - - return self.embed(h) - - def has_ve(layer_idx, n_layer): """Returns True if GPT layer should have Value Embedding (alternating, last layer always included).""" return layer_idx % 2 == (n_layer - 1) % 2 @@ -204,13 +169,9 @@ class GPT(nn.Module): # Per-layer learnable scalars (inspired by modded-nanogpt) # resid_lambdas: scales the residual stream at each layer (init 1.0 = neutral) # x0_lambdas: blends initial embedding back in at each layer (init 0.0 = disabled) - # bigram_lambdas: blends bigram embeddings in at each layer (init 0.1 = small contribution) # Separate parameters so they can have different optimizer treatment self.resid_lambdas = nn.Parameter(torch.ones(config.n_layer)) # fake init, real init in init_weights() self.x0_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in init_weights() - self.bigram_lambdas = nn.Parameter(torch.zeros(config.n_layer)) # fake init, real init in 
init_weights() - # Bigram hash embeddings: O(1) lookup for local 2-gram patterns - self.bigram_embed = BigramEmbed(config.vocab_size, config.n_embd) # Value embeddings (ResFormer-style): alternating layers, last layer always included head_dim = config.n_embd // config.n_head kv_dim = config.n_kv_head * head_dim @@ -259,10 +220,6 @@ class GPT(nn.Module): # Per-layer scalars self.resid_lambdas.fill_(1.0) # 1.0 => typical residual connections at init self.x0_lambdas.fill_(0.1) # 0.1 => small initial weight for skip connection to input embedding - self.bigram_lambdas.fill_(0.1) # 0.1 => small initial weight for skip connection to bigram embeddings - - # Bigram embeddings: zero init so it starts as identity - nn.init.zeros_(self.bigram_embed.embed.weight) # Value embeddings (init like c_v: uniform with same std) for ve in self.value_embeds.values(): @@ -283,7 +240,6 @@ class GPT(nn.Module): self.transformer.wte.to(dtype=torch.bfloat16) for ve in self.value_embeds.values(): ve.to(dtype=torch.bfloat16) - self.bigram_embed.to(dtype=torch.bfloat16) def _precompute_rotary_embeddings(self, seq_len, head_dim, base=10000, device=None): # TODO: bump base theta more? e.g. 
100K is more common more recently @@ -349,9 +305,8 @@ class GPT(nn.Module): nparams = sum(p.numel() for p in self.parameters()) # Exclude non-matmul params: embeddings and per-layer scalars value_embeds_numel = sum(ve.weight.numel() for ve in self.value_embeds.values()) - bigram_embed_numel = self.bigram_embed.embed.weight.numel() - nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel + bigram_embed_numel + - self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.bigram_lambdas.numel()) + nparams_exclude = (self.transformer.wte.weight.numel() + value_embeds_numel + + self.resid_lambdas.numel() + self.x0_lambdas.numel()) h, q, t = self.config.n_head, self.config.n_embd // self.config.n_head, self.config.sequence_len # Sum attention FLOPs per layer, accounting for sliding window attn_flops = 0 @@ -376,16 +331,14 @@ class GPT(nn.Module): """ # Count each group separately (mirrors the grouping in setup_optimizers) wte = sum(p.numel() for p in self.transformer.wte.parameters()) - bigram_embed = sum(p.numel() for p in self.bigram_embed.parameters()) value_embeds = sum(p.numel() for p in self.value_embeds.parameters()) lm_head = sum(p.numel() for p in self.lm_head.parameters()) transformer_matrices = sum(p.numel() for p in self.transformer.h.parameters()) - scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel() + self.bigram_lambdas.numel() - total = wte + bigram_embed + value_embeds + lm_head + transformer_matrices + scalars + scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel() + total = wte + value_embeds + lm_head + transformer_matrices + scalars assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch" return { 'wte': wte, - 'bigram_embed': bigram_embed, 'value_embeds': value_embeds, 'lm_head': lm_head, 'transformer_matrices': transformer_matrices, @@ -403,9 +356,7 @@ class GPT(nn.Module): lm_head_params = list(self.lm_head.parameters()) resid_params = [self.resid_lambdas] x0_params = 
[self.x0_lambdas] - bigram_embed_params = list(self.bigram_embed.parameters()) - bigram_lambda_params = [self.bigram_lambdas] - assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) + len(bigram_embed_params) + len(bigram_lambda_params) + assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) # Create the AdamW optimizer for the embedding, lm_head, and per-layer scalars # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the LRs for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 @@ -414,10 +365,8 @@ class GPT(nn.Module): dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), dict(params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding - dict(params=bigram_embed_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream dict(params=x0_params, lr=scalar_lr, betas=(0.96, 0.95)), # higher beta1 for x0 scalars - dict(params=bigram_lambda_params, lr=scalar_lr, betas=(0.96, 0.95)), # same treatment as x0 lambdas ] adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight decay is hardcoded to 0.0 for AdamW, only used in Muon AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) @@ -446,11 +395,10 @@ class GPT(nn.Module): # Forward the trunk of the Transformer x = self.transformer.wte(idx) # embed current token - x0_bigram = self.bigram_embed(idx) # embed current bigram (via hash lookup) x = norm(x) x0 = x # save initial normalized embedding for x0 residual for i, block in enumerate(self.transformer.h): - x = 
self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 + self.bigram_lambdas[i] * x0_bigram + x = self.resid_lambdas[i] * x + self.x0_lambdas[i] * x0 ve = self.value_embeds[str(i)](idx) if str(i) in self.value_embeds else None x = block(x, ve, cos_sin, self.window_sizes[i], kv_cache) x = norm(x) From 65df0de42b58de70299401965ce605aa0ebcb77a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 29 Jan 2026 00:34:24 +0000 Subject: [PATCH 069/119] add arxiv reading skill --- .claude/skills/read-arxiv-paper/SKILL.md | 40 ++++++++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 .claude/skills/read-arxiv-paper/SKILL.md diff --git a/.claude/skills/read-arxiv-paper/SKILL.md b/.claude/skills/read-arxiv-paper/SKILL.md new file mode 100644 index 00000000..6a9cda71 --- /dev/null +++ b/.claude/skills/read-arxiv-paper/SKILL.md @@ -0,0 +1,40 @@ +--- +name: read-arxiv-paper +description: Use this skill when asked to read an arxiv paper given an arxiv URL +--- + +You will be given a URL of an arxiv paper, for example: + +https://www.arxiv.org/abs/2601.07372 + +### Part 1: Normalize the URL + +The goal is to fetch the TeX Source of the paper (not the PDF!), the URL always looks like this: + +https://www.arxiv.org/src/2601.07372 + +Notice the /src/ in the url. Once you have the URL: + +### Part 2: Download the paper source + +Fetch the url to a local .tar.gz file. A good location is `~/.cache/nanochat/knowledge/{arxiv_id}.tar.gz`. + +(If the file already exists, there is no need to re-download it). + +### Part 3: Unpack the file in that folder + +Unpack the contents into `~/.cache/nanochat/knowledge/{arxiv_id}` directory. + +### Part 4: Locate the entrypoint + +Every latex source usually has an entrypoint, such as `main.tex` or something like that. + +### Part 5: Read the paper + +Once you've found the entrypoint, read the contents and then recurse through all other relevant source files to read the paper. 
+ +#### Part 6: Report + +Once you've read the paper, produce a summary of the paper into a markdown file at `./knowledge/summary_{tag}.md`. Notice that 1) use the local knowledge directory here (it's easier for me to open and reference here), not in `~/.cache`, and 2) generate some reasonable `tag` like e.g. `conditional_memory` or whatever seems appropriate given the paper. Probably make sure that the tag doesn't exist yet so you're not overwriting files. + +As for the summary itself, remember that you're processing this paper within the context of the nanochat repository, so most often we will be interested in how to apply the paper and its lessons to the nanochat project. Therefore, you should feel free to "remind yourself" of the related nanochat code by reading the relevant parts, and then explicitly make the connection of how this paper might relate to nanochat or what are things we might be inspired about or try. From 64a651a63ce58e7b1dbd55498d8b0640bc9a1621 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 29 Jan 2026 00:35:02 +0000 Subject: [PATCH 070/119] include .claude is ok --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index d82809a1..3e928240 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,5 @@ eval_bundle/ .env # Local setup -.claude CLAUDE.md wandb/ From 41bb2eac320b927ec093d31db63d8b8a4aac6b2c Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 29 Jan 2026 00:50:50 +0000 Subject: [PATCH 071/119] Combine AdamW and Muon into single MuonAdamW optimizer, cleaner, ty @chrisjmccormick for idea/help --- README.md | 3 +- nanochat/adamw.py | 143 ------------ nanochat/gpt.py | 51 ++-- nanochat/muon.py | 352 ---------------------------- nanochat/optim.py | 528 ++++++++++++++++++++++++++++++++++++++++++ scripts/base_train.py | 28 +-- scripts/chat_rl.py | 17 +- scripts/chat_sft.py | 19 +- scripts/mid_train.py | 28 +-- 9 files changed, 595 insertions(+), 574 deletions(-) delete mode 100644 
nanochat/adamw.py delete mode 100644 nanochat/muon.py create mode 100644 nanochat/optim.py diff --git a/README.md b/README.md index fb8747fd..74211521 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,6 @@ python -m pytest tests/test_engine.py -v -s │ └── repackage_data_reference.py # Pretraining data shard generation ├── nanochat │ ├── __init__.py # empty -│ ├── adamw.py # Distributed AdamW optimizer │ ├── checkpoint_manager.py # Save/Load model checkpoints │ ├── common.py # Misc small utilities, quality of life │ ├── core_eval.py # Evaluates base model CORE score (DCLM paper) @@ -146,7 +145,7 @@ python -m pytest tests/test_engine.py -v -s │ ├── gpt.py # The GPT nn.Module Transformer │ ├── logo.svg │ ├── loss_eval.py # Evaluate bits per byte (instead of loss) -│ ├── muon.py # Distributed Muon optimizer +│ ├── optim.py # AdamW + Muon optimizer, 1GPU and distributed │ ├── report.py # Utilities for writing the nanochat Report │ ├── tokenizer.py # BPE Tokenizer wrapper in style of GPT-4 │ └── ui.html # HTML/CSS/JS for nanochat frontend diff --git a/nanochat/adamw.py b/nanochat/adamw.py deleted file mode 100644 index 70ccf7b5..00000000 --- a/nanochat/adamw.py +++ /dev/null @@ -1,143 +0,0 @@ -""" -Distributed AdamW optimizer with a fused step function. -A bunch of ideas (e.g. dist comms in slices) are borrowed from modded-nanogpt. -""" -import torch -import torch.distributed as dist -from torch import Tensor - -@torch.compile(dynamic=False, fullgraph=True) -def adamw_step_fused( - p: Tensor, - grad: Tensor, - exp_avg: Tensor, - exp_avg_sq: Tensor, - step_t: Tensor, - lr_t: Tensor, - beta1_t: Tensor, - beta2_t: Tensor, - eps_t: Tensor, - wd_t: Tensor, -) -> None: - """ - Fused AdamW step: weight_decay -> momentum_update -> bias_correction -> param_update - All in one compiled graph to eliminate Python overhead between ops. - The 0-D CPU tensors avoid recompilation when hyperparameter values change. 
- """ - # Weight decay (decoupled, applied before the update) - p.mul_(1 - lr_t * wd_t) - # Update running averages (lerp_ is cleaner and fuses well) - exp_avg.lerp_(grad, 1 - beta1_t) - exp_avg_sq.lerp_(grad.square(), 1 - beta2_t) - # Bias corrections - bias1 = 1 - beta1_t ** step_t - bias2 = 1 - beta2_t ** step_t - # Compute update and apply - denom = (exp_avg_sq / bias2).sqrt() + eps_t - step_size = lr_t / bias1 - p.add_(exp_avg / denom, alpha=-step_size) - - -class DistAdamW(torch.optim.Optimizer): - """ - Distributed AdamW optimizer. - In the style of ZeRO-2, i.e. sharded optimizer states and gradient reduction - """ - def __init__(self, param_groups, lr: float = 1e-3, betas: tuple[float, float] = (0.9, 0.999), eps: float = 1e-8, weight_decay: float = 0.01): - defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) - rank = dist.get_rank() - world_size = dist.get_world_size() - # Validate - if rank == 0: - for group in param_groups: - assert isinstance(group, dict), "expecting param_groups to be a list of dicts" - assert isinstance(group['params'], list), "expecting group['params'] to be a list of tensors" - for p in group['params']: - sliced = p.numel() >= 1024 - print(f"AdamW: 1 param of shape {p.shape}, sliced={sliced}") - if sliced: # large parameter tensors will be operated on in slices - assert p.shape[0] % world_size == 0, f"First dim of parameter shape {p.shape} must be divisible by world size {world_size}" - super().__init__(param_groups, defaults) - # 0-D CPU tensors to avoid torch.compile recompilation when values change - self._step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - - 
@torch.no_grad() - def step(self): - rank = dist.get_rank() - world_size = dist.get_world_size() - reduce_futures: list[torch.Future] = [] - gather_futures: list[torch.Future] = [] - grad_slices = [] - is_small = [] # track which params are small (use all_reduce) vs large (use reduce_scatter) - - for group in self.param_groups: - params: list[Tensor] = group["params"] - for p in params: - grad = p.grad - # Small params: use all_reduce (no scatter/gather needed) - if p.numel() < 1024: - is_small.append(True) - reduce_futures.append(dist.all_reduce(grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad) - else: - is_small.append(False) - rank_size = grad.shape[0] // world_size # p.shape[0] % world_size == 0 is checked in __init__ - grad_slice = torch.empty_like(grad[:rank_size]) - reduce_futures.append(dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()) - grad_slices.append(grad_slice) - - idx = 0 - for group in self.param_groups: - beta1, beta2 = group['betas'] - eps = group['eps'] - wd = group['weight_decay'] - params = group['params'] - for p in params: - reduce_futures[idx].wait() - g_slice = grad_slices[idx] - lr = group['lr'] * getattr(p, "lr_mul", 1.0) - state = self.state[p] - - # For small params, operate on full param; for large, operate on slice - if is_small[idx]: - p_slice = p - else: - rank_size = p.shape[0] // world_size - p_slice = p[rank * rank_size:(rank + 1) * rank_size] - - # State init - if not state: - state['step'] = 0 - state['exp_avg'] = torch.zeros_like(p_slice) - state['exp_avg_sq'] = torch.zeros_like(p_slice) - exp_avg = state['exp_avg'] - exp_avg_sq = state['exp_avg_sq'] - state['step'] += 1 - - # Fill 0-D tensors with current values - eff_wd = wd * getattr(p, "wd_mul", 1.0) - self._step_t.fill_(state['step']) - self._lr_t.fill_(lr) - self._beta1_t.fill_(beta1) - self._beta2_t.fill_(beta2) - self._eps_t.fill_(eps) - self._wd_t.fill_(eff_wd) - - # Fused update: 
weight_decay -> momentum -> bias_correction -> param_update - adamw_step_fused( - p_slice, g_slice, exp_avg, exp_avg_sq, - self._step_t, self._lr_t, self._beta1_t, self._beta2_t, self._eps_t, self._wd_t, - ) - - # Only large params need all_gather - if not is_small[idx]: - gather_futures.append(dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future()) - idx += 1 - - if gather_futures: - torch.futures.collect_all(gather_futures).wait() diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 672af717..d23a5167 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -20,8 +20,7 @@ import torch.nn as nn import torch.nn.functional as F from nanochat.common import get_dist_info, print0 -from nanochat.muon import Muon, DistMuon -from nanochat.adamw import DistAdamW +from nanochat.optim import MuonAdamW, DistMuonAdamW # Our custom Flash Attention module that automatically uses FA3 on Hopper+ and SDPA fallback elsewhere from nanochat.flash_attention import flash_attn @@ -346,9 +345,10 @@ class GPT(nn.Module): 'total': total, } - def setup_optimizers(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5): + def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5): model_dim = self.config.n_embd ddp, rank, local_rank, world_size = get_dist_info() + # Separate out all parameters into groups matrix_params = list(self.transformer.h.parameters()) value_embeds_params = list(self.value_embeds.parameters()) @@ -357,30 +357,33 @@ class GPT(nn.Module): resid_params = [self.resid_lambdas] x0_params = [self.x0_lambdas] assert len(list(self.parameters())) == len(matrix_params) + len(embedding_params) + len(lm_head_params) + len(value_embeds_params) + len(resid_params) + len(x0_params) - # Create the AdamW optimizer for the embedding, lm_head, and per-layer scalars - # Scale the LR for the AdamW parameters by ∝1/√dmodel (having tuned the 
LRs for 768 dim model) + + # Scale the LR for the AdamW parameters by ∝1/√dmodel (tuned for 768 dim model) dmodel_lr_scale = (model_dim / 768) ** -0.5 print0(f"Scaling the LR for the AdamW parameters ∝1/√({model_dim}/768) = {dmodel_lr_scale:.6f}") - adam_groups = [ - dict(params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale), - dict(params=embedding_params, lr=embedding_lr * dmodel_lr_scale), - dict(params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale), # same LR as token embedding - dict(params=resid_params, lr=scalar_lr * 0.01), # these are a lot more sensitive because they accumulate in the residual stream - dict(params=x0_params, lr=scalar_lr, betas=(0.96, 0.95)), # higher beta1 for x0 scalars + + # Build param_groups with all required fields explicit + param_groups = [ + # AdamW groups (embeddings, lm_head, scalars) + dict(kind='adamw', params=lm_head_params, lr=unembedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=0.0), + dict(kind='adamw', params=embedding_params, lr=embedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=0.0), + dict(kind='adamw', params=value_embeds_params, lr=embedding_lr * dmodel_lr_scale, betas=adam_betas, eps=1e-10, weight_decay=0.0), + dict(kind='adamw', params=resid_params, lr=scalar_lr * 0.01, betas=adam_betas, eps=1e-10, weight_decay=0.0), + dict(kind='adamw', params=x0_params, lr=scalar_lr, betas=(0.96, 0.95), eps=1e-10, weight_decay=0.0), # higher beta1 for x0 ] - adamw_kwargs = dict(betas=adam_betas, eps=1e-10, weight_decay=0.0) # NOTE: weight decay is hardcoded to 0.0 for AdamW, only used in Muon - AdamWFactory = DistAdamW if ddp else partial(torch.optim.AdamW, fused=True) - adamw_optimizer = AdamWFactory(adam_groups, **adamw_kwargs) - # Create the Muon optimizer for the linear layers - muon_kwargs = dict(lr=matrix_lr, momentum=0.95, weight_decay=weight_decay) - MuonFactory = DistMuon if ddp else Muon - muon_optimizer = MuonFactory(matrix_params, **muon_kwargs) - # Combine 
them the two optimizers into one list - optimizers = [adamw_optimizer, muon_optimizer] - for opt in optimizers: - for group in opt.param_groups: - group["initial_lr"] = group["lr"] - return optimizers + # Muon groups (matrix params, grouped by shape for stacking) + for shape in sorted({p.shape for p in matrix_params}): + group_params = [p for p in matrix_params if p.shape == shape] + param_groups.append(dict( + kind='muon', params=group_params, lr=matrix_lr, + momentum=0.95, ns_steps=5, beta2=0.95, weight_decay=weight_decay, + )) + + Factory = DistMuonAdamW if ddp else MuonAdamW + optimizer = Factory(param_groups) + for group in optimizer.param_groups: + group["initial_lr"] = group["lr"] + return optimizer def forward(self, idx, targets=None, kv_cache=None, loss_reduction='mean'): B, T = idx.size() diff --git a/nanochat/muon.py b/nanochat/muon.py deleted file mode 100644 index cfd2443d..00000000 --- a/nanochat/muon.py +++ /dev/null @@ -1,352 +0,0 @@ -""" -Muon optimizer adapted and simplified from modded-nanogpt. -https://github.com/KellerJordan/modded-nanogpt - -Background: -Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a -quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose -of minimizing steps, it turns out to be empirically effective to keep increasing the slope at -zero even beyond the point where the iteration no longer converges all the way to one everywhere -on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T -where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model -performance at all relative to UV^T, where USV^T = G is the SVD. - -Here, an alternative to Newton-Schulz iteration with potentially better convergence properties: -Polar Express Sign Method for orthogonalization. -https://arxiv.org/pdf/2505.16932 -by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. 
- -Some of the changes in nanochat implementation: -- Uses a simpler, more general approach to parameter grouping and stacking -- Uses a single fused kernel for the momentum -> polar_express -> variance_reduction -> update step -- Makes no assumptions about model architecture (e.g. that attention weights are fused into QKVO format) -""" - -import torch -from torch import Tensor -import torch.distributed as dist - -# Coefficients for Polar Express (computed for num_iters=5, safety_factor=2e-2, cushion=2) -# From https://arxiv.org/pdf/2505.16932 -polar_express_coeffs = [ - (8.156554524902461, -22.48329292557795, 15.878769915207462), - (4.042929935166739, -2.808917465908714, 0.5000178451051316), - (3.8916678022926607, -2.772484153217685, 0.5060648178503393), - (3.285753657755655, -2.3681294933425376, 0.46449024233003106), - (2.3465413258596377, -1.7097828382687081, 0.42323551169305323), -] - -@torch.compile(dynamic=False, fullgraph=True) -def muon_step_fused( - stacked_grads: Tensor, - stacked_params: Tensor, - momentum_buffer: Tensor, - second_momentum_buffer: Tensor, - momentum_t: Tensor, - lr_t: Tensor, - wd_t: Tensor, - beta2_t: Tensor, - ns_steps: int, - red_dim: int, -) -> None: - """ - Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update - All in one compiled graph to eliminate Python overhead between ops. - Some of the constants are 0-D CPU tensors to avoid recompilation when values change. 
- """ - - # Nesterov momentum - momentum = momentum_t.to(stacked_grads.dtype) - momentum_buffer.lerp_(stacked_grads, 1 - momentum) - g = stacked_grads.lerp_(momentum_buffer, momentum) - - # Polar express - X = g.bfloat16() - if g.size(-2) > g.size(-1): - X = X.mT - X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6) - for a, b, c in polar_express_coeffs[:ns_steps]: - A = X @ X.mT - B = b * A + c * (A @ A) - X = a * X + B @ X - if g.size(-2) > g.size(-1): - X = X.mT - g = X - - # Variance reduction - beta2 = beta2_t.to(g.dtype) - v_mean = g.float().square().mean(dim=red_dim, keepdim=True) - red_dim_size = g.size(red_dim) - v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size - v_norm = v_norm_sq.sqrt() - second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2) - step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt() - scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square() - v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt() - final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10)) - g = g * final_scale.to(g.dtype) - - # Cautious weight decay + parameter update - lr = lr_t.to(g.dtype) - wd = wd_t.to(g.dtype) - mask = (g * stacked_params) >= 0 - stacked_params.sub_(lr * g + lr * wd * stacked_params * mask) - -class Muon(torch.optim.Optimizer): - """ - Muon - MomentUm Orthogonalized by Newton-schulz - - https://kellerjordan.github.io/posts/muon/ - - Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- - processing step, in which each 2D parameter's update is replaced with the nearest orthogonal - matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has - the advantage that it can be stably run in bfloat16 on the GPU. 
- - Some warnings: - - This optimizer should not be used for the embedding layer, the final fully connected layer, - or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). - - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. - - Arguments: - lr: The learning rate used by the internal SGD. - momentum: The momentum used by the internal SGD. - ns_steps: The number of Newton-Schulz iteration steps to use. - beta2: The decay rate for the second moment (variance) estimate. Set to None to disable. - weight_decay: Cautious weight decay coefficient. Only decays where update and weight agree. - """ - def __init__(self, params, lr=0.02, momentum=0.95, ns_steps=5, beta2=0.95, weight_decay=0.0): - defaults = dict(lr=lr, momentum=momentum, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) - assert all(p.ndim == 2 for p in params), "Muon expects 2D parameters only" - params = list(params) # ensure we have a list, not an e.g. 
(exhaustible) iterator - # Group by shape so we can stack tensors - shapes = sorted({p.shape for p in params}) - param_groups = [] - for shape in shapes: - group_params = [p for p in params if p.shape == shape] - param_groups.append(dict(params=group_params)) - super().__init__(param_groups, defaults) - # 0-D CPU tensors to avoid torch.compile recompilation when values change - self._momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - - @torch.no_grad() - def step(self): - for group in self.param_groups: - params: list[Tensor] = group["params"] - if not params: - continue - - # Get or create group-level buffers (stored in first param's state for convenience) - state = self.state[params[0]] - num_params = len(params) # e.g.: 12 (for a d12 model) - # e.g.: shape = (768, 3072), device = cuda:0, dtype = torch.float32, for one of the MLP projections - shape, device, dtype = params[0].shape, params[0].device, params[0].dtype - - # Momentum for every individual parameter - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device) - momentum_buffer = state["momentum_buffer"] # e.g.: (12, 768, 3072) - - # Second momentum buffer is factored, either per-row or per-column - if "second_momentum_buffer" not in state: - if shape[-2] >= shape[-1]: - state["second_momentum_buffer"] = torch.zeros(num_params, shape[-2], 1, dtype=dtype, device=device) - else: - state["second_momentum_buffer"] = torch.zeros(num_params, 1, shape[-1], dtype=dtype, device=device) - second_momentum_buffer = state["second_momentum_buffer"] # (12, 1, 3072) - red_dim = -1 if shape[-2] >= shape[-1] else -2 # e.g.: -2 - - # Stack grads and params - stacked_grads = torch.stack([p.grad for p in params]) # (12, 768, 3072) - 
stacked_params = torch.stack(params) # (12, 768, 3072) - - # Fill all the 0-D tensors with current values - self._momentum_t.fill_(group["momentum"]) - self._beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0) - self._lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) - self._wd_t.fill_(group["weight_decay"]) - - # Single fused kernel: momentum -> polar_express -> variance_reduction -> update - muon_step_fused( - stacked_grads, - stacked_params, - momentum_buffer, - second_momentum_buffer, - self._momentum_t, - self._lr_t, - self._wd_t, - self._beta2_t, - group["ns_steps"], - red_dim, - ) - - # Copy back to original params: [(768, 3072), (768, 3072), ...] <- (12, 768, 3072) - torch._foreach_copy_(params, list(stacked_params.unbind(0))) - - -class DistMuon(torch.optim.Optimizer): - """ - Distributed version of the Muon optimizer. - """ - def __init__(self, params, lr: float = 0.02, momentum: float = 0.95, - ns_steps: int = 5, beta2: float = 0.95, weight_decay: float = 0.0): - defaults = dict(lr=lr, momentum=momentum, ns_steps=ns_steps, beta2=beta2, weight_decay=weight_decay) - assert all(p.ndim == 2 for p in params), "Muon expects 2D parameters only" - params = list(params) - world_size = dist.get_world_size() - rank = dist.get_rank() - # Group all parameters by their shape - shapes = sorted({p.shape for p in params}) # sort for deterministic ordering across ranks - param_groups = [] - for shape in shapes: - group_params = [p for p in params if p.shape == shape] - device, dtype = group_params[0].device, group_params[0].dtype - assert all(p.device == device for p in group_params) - assert all(p.dtype == dtype for p in group_params) - # Compute chunk size for this group (how many params each rank owns) - chunk_size = (len(group_params) + world_size - 1) // world_size - if rank == 0: - print(f"Muon: {len(group_params)} params of shape {shape}, chunk_size={chunk_size}") - param_groups.append(dict(params=group_params, chunk_size=chunk_size)) 
- super().__init__(param_groups, defaults) - # 0-D CPU tensors to avoid torch.compile recompilation when values change - self._momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - self._beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") - - @torch.no_grad() - def step(self): - rank = dist.get_rank() - world_size = dist.get_world_size() - - # Ensure all grads exist - assert all(p.grad is not None for group in self.param_groups for p in group["params"]), "All params must have grads" - - # First pass: stack grads and kick off reduce_scatter for each group - group_infos = [] - for group in self.param_groups: - params: list[Tensor] = group["params"] - chunk_size = group["chunk_size"] - padded_num_params = chunk_size * world_size - shape = params[0].shape - device, dtype = params[0].device, params[0].dtype - - # Stack all gradients into a single tensor (single kernel via torch.stack) - grad_stack = torch.stack([p.grad for p in params]) - stacked_grads = torch.empty(padded_num_params, *shape, dtype=dtype, device=device) - stacked_grads[:len(params)].copy_(grad_stack) - # Zero-pad if we have fewer params than padded size - if len(params) < padded_num_params: - stacked_grads[len(params):].zero_() - - # Output buffer for this rank's chunk - grad_chunk = torch.empty(chunk_size, *shape, dtype=dtype, device=device) - - # Async reduce_scatter on the stacked tensor - reduce_future = dist.reduce_scatter_tensor( - grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True - ).get_future() - - group_infos.append(dict( - grad_chunk=grad_chunk, - reduce_future=reduce_future, - stacked_grads=stacked_grads, # reuse for all_gather output - )) - - # Second pass: wait for reduce, compute batched updates, kick off all_gather - all_gather_futures = [] - for group, info in zip(self.param_groups, group_infos): - 
info["reduce_future"].wait() - - params = group["params"] - chunk_size = group["chunk_size"] - shape = params[0].shape - device, dtype = params[0].device, params[0].dtype - grad_chunk = info["grad_chunk"] - - # How many params does this rank actually own? - start_idx = rank * chunk_size - num_owned = min(chunk_size, max(0, len(params) - start_idx)) - - # Get or create group-level state (stored keyed by first param) - state = self.state[params[0]] - - # Momentum buffer - if "momentum_buffer" not in state: - state["momentum_buffer"] = torch.zeros(chunk_size, *shape, dtype=dtype, device=device) - momentum_buffer = state["momentum_buffer"] - - # Second momentum buffer is factored, either per-row or per-column - if "second_momentum_buffer" not in state: - if shape[-2] >= shape[-1]: - state["second_momentum_buffer"] = torch.zeros(chunk_size, shape[-2], 1, dtype=dtype, device=device) - else: - state["second_momentum_buffer"] = torch.zeros(chunk_size, 1, shape[-1], dtype=dtype, device=device) - second_momentum_buffer = state["second_momentum_buffer"] - red_dim = -1 if shape[-2] >= shape[-1] else -2 - - # Build updated_params tensor for all_gather - updated_params = torch.empty(chunk_size, *shape, dtype=dtype, device=device) - - if num_owned > 0: - # Stack owned params (single kernel via torch.stack) - owned_params = [params[start_idx + i] for i in range(num_owned)] - stacked_owned_params = torch.stack(owned_params) - - # Get owned slices of buffers and grads - owned_grads = grad_chunk[:num_owned] - owned_momentum = momentum_buffer[:num_owned] - owned_second_momentum = second_momentum_buffer[:num_owned] - - # Fill 0-D tensors with current values - self._momentum_t.fill_(group["momentum"]) - self._beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0) - self._lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) - self._wd_t.fill_(group["weight_decay"]) - - # Single fused kernel: momentum -> polar_express -> variance_reduction -> update - 
- muon_step_fused( - owned_grads, - stacked_owned_params, - owned_momentum, - owned_second_momentum, - self._momentum_t, - self._lr_t, - self._wd_t, - self._beta2_t, - group["ns_steps"], - red_dim, - ) - - # Copy updated params to output buffer - updated_params[:num_owned].copy_(stacked_owned_params) - - # Zero-pad the rest (for ranks that own fewer params) - if num_owned < chunk_size: - updated_params[num_owned:].zero_() - - # Reuse stacked_grads buffer for all_gather output - stacked_params = info["stacked_grads"] - - # Async all_gather to replicate updated params to all ranks - gather_future = dist.all_gather_into_tensor( - stacked_params, updated_params, async_op=True - ).get_future() - - all_gather_futures.append(dict( - gather_future=gather_future, - stacked_params=stacked_params, - params=params, - )) - - # Final pass: wait for all_gather and copy back to params - for info in all_gather_futures: - info["gather_future"].wait() - stacked_params = info["stacked_params"] - params = info["params"] - # Batched copy back (single kernel instead of N individual copies) - torch._foreach_copy_(params, list(stacked_params[:len(params)].unbind(0))) diff --git a/nanochat/optim.py b/nanochat/optim.py new file mode 100644 index 00000000..190a1edb --- /dev/null +++ b/nanochat/optim.py @@ -0,0 +1,528 @@ +""" +A nice and efficient mixed AdamW/Muon Combined Optimizer. +Usually the embeddings and scalars go into AdamW, and the matrix parameters go into Muon. +Two versions are provided (MuonAdamW, DistMuonAdamW), for single GPU and distributed. + +Adapted from: https://github.com/KellerJordan/modded-nanogpt +Further contributions from @karpathy and @chrisjmccormick. +""" + +import torch +import torch.distributed as dist +from torch import Tensor + +# ----------------------------------------------------------------------------- +""" +Good old AdamW optimizer, fused kernel. 
+https://arxiv.org/abs/1711.05101 +""" + +@torch.compile(dynamic=False, fullgraph=True) +def adamw_step_fused( + p: Tensor, # (32768, 768) - parameter tensor + grad: Tensor, # (32768, 768) - gradient, same shape as p + exp_avg: Tensor, # (32768, 768) - first moment, same shape as p + exp_avg_sq: Tensor, # (32768, 768) - second moment, same shape as p + step_t: Tensor, # () - 0-D CPU tensor, step count + lr_t: Tensor, # () - 0-D CPU tensor, learning rate + beta1_t: Tensor, # () - 0-D CPU tensor, beta1 + beta2_t: Tensor, # () - 0-D CPU tensor, beta2 + eps_t: Tensor, # () - 0-D CPU tensor, epsilon + wd_t: Tensor, # () - 0-D CPU tensor, weight decay +) -> None: + """ + Fused AdamW step: weight_decay -> momentum_update -> bias_correction -> param_update + All in one compiled graph to eliminate Python overhead between ops. + The 0-D CPU tensors avoid recompilation when hyperparameter values change. + """ + # Weight decay (decoupled, applied before the update) + p.mul_(1 - lr_t * wd_t) + # Update running averages (lerp_ is cleaner and fuses well) + exp_avg.lerp_(grad, 1 - beta1_t) + exp_avg_sq.lerp_(grad.square(), 1 - beta2_t) + # Bias corrections + bias1 = 1 - beta1_t ** step_t + bias2 = 1 - beta2_t ** step_t + # Compute update and apply + denom = (exp_avg_sq / bias2).sqrt() + eps_t + step_size = lr_t / bias1 + p.add_(exp_avg / denom, alpha=-step_size) + +# ----------------------------------------------------------------------------- +""" +Muon optimizer adapted and simplified from modded-nanogpt. +https://github.com/KellerJordan/modded-nanogpt + +Background: +Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a +quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose +of minimizing steps, it turns out to be empirically effective to keep increasing the slope at +zero even beyond the point where the iteration no longer converges all the way to one everywhere +on the interval. 
This iteration therefore does not produce UV^T but rather something like US'V^T +where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model +performance at all relative to UV^T, where USV^T = G is the SVD. + +Here, an alternative to Newton-Schulz iteration with potentially better convergence properties: +Polar Express Sign Method for orthogonalization. +https://arxiv.org/pdf/2505.16932 +by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. + +Some of the changes in nanochat implementation: +- Uses a simpler, more general approach to parameter grouping and stacking +- Uses a single fused kernel for the momentum -> polar_express -> variance_reduction -> update step +- Makes no assumptions about model architecture (e.g. that attention weights are fused into QKVO format) +""" + +# Coefficients for Polar Express (computed for num_iters=5, safety_factor=2e-2, cushion=2) +# From https://arxiv.org/pdf/2505.16932 +polar_express_coeffs = [ + (8.156554524902461, -22.48329292557795, 15.878769915207462), + (4.042929935166739, -2.808917465908714, 0.5000178451051316), + (3.8916678022926607, -2.772484153217685, 0.5060648178503393), + (3.285753657755655, -2.3681294933425376, 0.46449024233003106), + (2.3465413258596377, -1.7097828382687081, 0.42323551169305323), +] + +@torch.compile(dynamic=False, fullgraph=True) +def muon_step_fused( + stacked_grads: Tensor, # (12, 768, 3072) - stacked gradients + stacked_params: Tensor, # (12, 768, 3072) - stacked parameters + momentum_buffer: Tensor, # (12, 768, 3072) - first moment buffer + second_momentum_buffer: Tensor, # (12, 768, 1) or (12, 1, 3072) - factored second moment + momentum_t: Tensor, # () - 0-D CPU tensor, momentum coefficient + lr_t: Tensor, # () - 0-D CPU tensor, learning rate + wd_t: Tensor, # () - 0-D CPU tensor, weight decay + beta2_t: Tensor, # () - 0-D CPU tensor, beta2 for second moment + ns_steps: int, # 5 - number of Newton-Schulz/Polar Express iterations + red_dim: int, # -1 or 
-2 - reduction dimension for variance +) -> None: + """ + Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update + All in one compiled graph to eliminate Python overhead between ops. + Some of the constants are 0-D CPU tensors to avoid recompilation when values change. + """ + + # Nesterov momentum + momentum = momentum_t.to(stacked_grads.dtype) + momentum_buffer.lerp_(stacked_grads, 1 - momentum) + g = stacked_grads.lerp_(momentum_buffer, momentum) + + # Polar express + X = g.bfloat16() + X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6) + if g.size(-2) > g.size(-1): # Tall matrix + for a, b, c in polar_express_coeffs[:ns_steps]: + A = X.mT @ X + B = b * A + c * (A @ A) + X = a * X + X @ B + else: # Wide matrix (original math) + for a, b, c in polar_express_coeffs[:ns_steps]: + A = X @ X.mT + B = b * A + c * (A @ A) + X = a * X + B @ X + g = X + + # Variance reduction + beta2 = beta2_t.to(g.dtype) + v_mean = g.float().square().mean(dim=red_dim, keepdim=True) + red_dim_size = g.size(red_dim) + v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size + v_norm = v_norm_sq.sqrt() + second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2) + step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt() + scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square() + v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt() + final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10)) + g = g * final_scale.to(g.dtype) + + # Cautious weight decay + parameter update + lr = lr_t.to(g.dtype) + wd = wd_t.to(g.dtype) + mask = (g * stacked_params) >= 0 + stacked_params.sub_(lr * g + lr * wd * stacked_params * mask) + +# ----------------------------------------------------------------------------- +# Single GPU version of the MuonAdamW optimizer. +# Used mostly for reference, debugging and testing. 
+ +class MuonAdamW(torch.optim.Optimizer): + """ + Combined optimizer: Muon for 2D matrix params, AdamW for others, single GPU version. + + AdamW - Fused AdamW optimizer step. + + Muon - MomentUm Orthogonalized by Newton-schulz + https://kellerjordan.github.io/posts/muon/ + + Muon internally runs standard SGD-momentum, and then performs an orthogonalization post- + processing step, in which each 2D parameter's update is replaced with the nearest orthogonal + matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has + the advantage that it can be stably run in bfloat16 on the GPU. + + Some warnings: + - The Muon optimizer should not be used for the embedding layer, the final fully connected layer, + or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW). + - To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions. + + Arguments: + param_groups: List of dicts, each containing: + - 'params': List of parameters + - 'kind': 'adamw' or 'muon' + - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay' + - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay' + """ + def __init__(self, param_groups: list[dict]): + super().__init__(param_groups, defaults={}) + # 0-D CPU tensors to avoid torch.compile recompilation when values change + # AdamW tensors + self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + # Muon tensors + self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, 
device="cpu") + self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + + def _step_adamw(self, group: dict) -> None: + """ + AdamW update for each param in the group individually. + Lazy init the state, fill in all 0-D tensors, call the fused kernel. + """ + for p in group['params']: + if p.grad is None: + continue + grad = p.grad + state = self.state[p] + + # State init + if not state: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p) + state['exp_avg_sq'] = torch.zeros_like(p) + exp_avg = state['exp_avg'] + exp_avg_sq = state['exp_avg_sq'] + state['step'] += 1 + + # Fill 0-D tensors with current values + self._adamw_step_t.fill_(state['step']) + self._adamw_lr_t.fill_(group['lr']) + self._adamw_beta1_t.fill_(group['betas'][0]) + self._adamw_beta2_t.fill_(group['betas'][1]) + self._adamw_eps_t.fill_(group['eps']) + self._adamw_wd_t.fill_(group['weight_decay']) + + # Fused update: weight_decay -> momentum -> bias_correction -> param_update + adamw_step_fused( + p, grad, exp_avg, exp_avg_sq, + self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t, + self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t, + ) + + def _step_muon(self, group: dict) -> None: + """ + Muon update for all params in the group (stacked for efficiency). + Lazy init the state, fill in all 0-D tensors, call the fused kernel. 
+ """ + params: list[Tensor] = group['params'] + if not params: + return + + # Get or create group-level buffers (stored in first param's state for convenience) + p = params[0] + state = self.state[p] + num_params = len(params) + shape, device, dtype = p.shape, p.device, p.dtype + + # Momentum for every individual parameter + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device) + momentum_buffer = state["momentum_buffer"] + + # Second momentum buffer is factored, either per-row or per-column + if "second_momentum_buffer" not in state: + state_shape = (num_params, shape[-2], 1) if shape[-2] >= shape[-1] else (num_params, 1, shape[-1]) + state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device) + second_momentum_buffer = state["second_momentum_buffer"] + red_dim = -1 if shape[-2] >= shape[-1] else -2 + + # Stack grads and params (NOTE: this assumes all params have the same shape) + stacked_grads = torch.stack([p.grad for p in params]) + stacked_params = torch.stack(params) + + # Fill all the 0-D tensors with current values + self._muon_momentum_t.fill_(group["momentum"]) + self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0) + self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) + self._muon_wd_t.fill_(group["weight_decay"]) + + # Single fused kernel: momentum -> polar_express -> variance_reduction -> update + muon_step_fused( + stacked_grads, + stacked_params, + momentum_buffer, + second_momentum_buffer, + self._muon_momentum_t, + self._muon_lr_t, + self._muon_wd_t, + self._muon_beta2_t, + group["ns_steps"], + red_dim, + ) + + # Copy back to original params + torch._foreach_copy_(params, list(stacked_params.unbind(0))) + + @torch.no_grad() + def step(self): + for group in self.param_groups: + if group['kind'] == 'adamw': + self._step_adamw(group) + elif group['kind'] == 'muon': + self._step_muon(group) + else: + raise 
ValueError(f"Unknown optimizer kind: {group['kind']}") + +# ----------------------------------------------------------------------------- +# Distributed version of the MuonAdamW optimizer. +# Used for training on multiple GPUs. + +class DistMuonAdamW(torch.optim.Optimizer): + """ + Combined distributed optimizer: Muon for 2D matrix params, AdamW for others. + + See MuonAdamW for the algorithmic details of each optimizer. This class adds + distributed communication to enable multi-GPU training without PyTorch DDP. + + Design Goals: + - Overlap communication with computation (async ops) + - Minimize memory by sharding optimizer states across ranks (ZeRO-2 style) + - Batch small tensors into single comm ops where possible + + Communication Pattern (3-phase async): + We use a 3-phase structure to maximize overlap between communication and compute: + + Phase 1: Launch all async reduce ops + - Kick off all reduce_scatter/all_reduce operations + - Don't wait - let them run in background while we continue + + Phase 2: Wait for reduces, compute updates, launch gathers + - For each group: wait for its reduce, compute the update, launch gather + - By processing groups in order, earlier gathers run while later computes happen + + Phase 3: Wait for gathers, copy back + - Wait for all gathers to complete + - Copy updated params back to original tensors (Muon only) + + AdamW Communication (ZeRO-2 style): + - Small params (<1024 elements): all_reduce gradients, update full param on each rank. + Optimizer state is replicated but these params are tiny (scalars, biases). + - Large params: reduce_scatter gradients so each rank gets 1/N of the grad, update + only that slice, then all_gather the updated slices. Optimizer state (exp_avg, + exp_avg_sq) is sharded - each rank only stores state for its slice. + Requires param.shape[0] divisible by world_size. + + Muon Communication (stacked + chunked): + - All params in a Muon group must have the same shape (caller's responsibility). 
+ - Stack all K params into a single (K, *shape) tensor for efficient comm. + - Divide K params across N ranks: each rank "owns" ceil(K/N) params. + - reduce_scatter the stacked grads so each rank gets its chunk. + - Each rank computes Muon update only for params it owns. + - all_gather the updated params back to all ranks. + - Optimizer state (momentum_buffer, second_momentum_buffer) is sharded by chunk. + - Padding: if K doesn't divide evenly, we zero-pad to (ceil(K/N) * N) for comm, + then ignore the padding when copying back. + + Buffer Reuse: + - For Muon, we allocate stacked_grads for reduce_scatter input, then reuse the + same buffer as the output for all_gather (stacked_params). This saves memory + since we don't need both buffers simultaneously. + + Arguments: + param_groups: List of dicts, each containing: + - 'params': List of parameters + - 'kind': 'adamw' or 'muon' + - For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay' + - For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay' + """ + def __init__(self, param_groups: list[dict]): + super().__init__(param_groups, defaults={}) + # 0-D CPU tensors to avoid torch.compile recompilation when values change + self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu") + + def _reduce_adamw(self, group: dict, 
world_size: int) -> dict: + """Launch async reduce ops for AdamW group. Returns info dict with per-param infos.""" + param_infos = {} + for p in group['params']: + grad = p.grad + if p.numel() < 1024: + # Small params: all_reduce (no scatter/gather needed) + future = dist.all_reduce(grad, op=dist.ReduceOp.AVG, async_op=True).get_future() + param_infos[p] = dict(future=future, grad_slice=grad, is_small=True) + else: + # Large params: reduce_scatter + rank_size = grad.shape[0] // world_size + grad_slice = torch.empty_like(grad[:rank_size]) + future = dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future() + param_infos[p] = dict(future=future, grad_slice=grad_slice, is_small=False) + return dict(param_infos=param_infos) + + def _reduce_muon(self, group: dict, world_size: int) -> dict: + """Launch async reduce op for Muon group. Returns info dict.""" + params = group['params'] + chunk_size = (len(params) + world_size - 1) // world_size + padded_num_params = chunk_size * world_size + p = params[0] + shape, device, dtype = p.shape, p.device, p.dtype + + # Stack grads and zero-pad to padded_num_params + grad_stack = torch.stack([p.grad for p in params]) + stacked_grads = torch.empty(padded_num_params, *shape, dtype=dtype, device=device) + stacked_grads[:len(params)].copy_(grad_stack) + if len(params) < padded_num_params: + stacked_grads[len(params):].zero_() + + # Reduce_scatter to get this rank's chunk + grad_chunk = torch.empty(chunk_size, *shape, dtype=dtype, device=device) + future = dist.reduce_scatter_tensor(grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True).get_future() + + return dict(future=future, grad_chunk=grad_chunk, stacked_grads=stacked_grads, chunk_size=chunk_size) + + def _compute_adamw(self, group: dict, info: dict, gather_list: list, rank: int, world_size: int) -> None: + """Wait for reduce, compute AdamW updates, launch gathers for large params.""" + param_infos = info['param_infos'] + for p in 
group['params']: + pinfo = param_infos[p] + pinfo['future'].wait() + grad_slice = pinfo['grad_slice'] + state = self.state[p] + + # For small params, operate on full param; for large, operate on slice + if pinfo['is_small']: + p_slice = p + else: + rank_size = p.shape[0] // world_size + p_slice = p[rank * rank_size:(rank + 1) * rank_size] + + # State init + if not state: + state['step'] = 0 + state['exp_avg'] = torch.zeros_like(p_slice) + state['exp_avg_sq'] = torch.zeros_like(p_slice) + state['step'] += 1 + + # Fill 0-D tensors and run fused kernel + self._adamw_step_t.fill_(state['step']) + self._adamw_lr_t.fill_(group['lr']) + self._adamw_beta1_t.fill_(group['betas'][0]) + self._adamw_beta2_t.fill_(group['betas'][1]) + self._adamw_eps_t.fill_(group['eps']) + self._adamw_wd_t.fill_(group['weight_decay']) + adamw_step_fused( + p_slice, grad_slice, state['exp_avg'], state['exp_avg_sq'], + self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t, + self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t, + ) + + # Large params need all_gather + if not pinfo['is_small']: + future = dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future() + gather_list.append(dict(future=future, params=None)) + + def _compute_muon(self, group: dict, info: dict, gather_list: list, rank: int) -> None: + """Wait for reduce, compute Muon updates, launch gather.""" + info['future'].wait() + params = group['params'] + chunk_size = info['chunk_size'] + grad_chunk = info['grad_chunk'] + p = params[0] + shape, device, dtype = p.shape, p.device, p.dtype + + # How many params does this rank own? 
+ start_idx = rank * chunk_size + num_owned = min(chunk_size, max(0, len(params) - start_idx)) + + # Get or create group-level state + state = self.state[p] + if "momentum_buffer" not in state: + state["momentum_buffer"] = torch.zeros(chunk_size, *shape, dtype=dtype, device=device) + if "second_momentum_buffer" not in state: + state_shape = (chunk_size, shape[-2], 1) if shape[-2] >= shape[-1] else (chunk_size, 1, shape[-1]) + state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device) + red_dim = -1 if shape[-2] >= shape[-1] else -2 + + # Build output buffer for all_gather + updated_params = torch.empty(chunk_size, *shape, dtype=dtype, device=device) + + if num_owned > 0: + owned_params = [params[start_idx + i] for i in range(num_owned)] + stacked_owned = torch.stack(owned_params) + + # Fill 0-D tensors and run fused kernel + self._muon_momentum_t.fill_(group["momentum"]) + self._muon_beta2_t.fill_(group["beta2"]) + self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5) + self._muon_wd_t.fill_(group["weight_decay"]) + muon_step_fused( + grad_chunk[:num_owned], stacked_owned, + state["momentum_buffer"][:num_owned], state["second_momentum_buffer"][:num_owned], + self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t, self._muon_beta2_t, + group["ns_steps"], red_dim, + ) + updated_params[:num_owned].copy_(stacked_owned) + + if num_owned < chunk_size: + updated_params[num_owned:].zero_() + + # Reuse stacked_grads buffer for all_gather output + stacked_params = info["stacked_grads"] + future = dist.all_gather_into_tensor(stacked_params, updated_params, async_op=True).get_future() + gather_list.append(dict(future=future, stacked_params=stacked_params, params=params)) + + def _finish_gathers(self, gather_list: list) -> None: + """Wait for all gathers and copy Muon params back.""" + for info in gather_list: + info["future"].wait() + if info["params"] is not None: + # Muon: copy from stacked buffer back to individual params + 
torch._foreach_copy_(info["params"], list(info["stacked_params"][:len(info["params"])].unbind(0))) + + @torch.no_grad() + def step(self): + rank = dist.get_rank() + world_size = dist.get_world_size() + + # Phase 1: launch all async reduce ops + reduce_infos: list[dict] = [] + for group in self.param_groups: + if group['kind'] == 'adamw': + reduce_infos.append(self._reduce_adamw(group, world_size)) + elif group['kind'] == 'muon': + reduce_infos.append(self._reduce_muon(group, world_size)) + else: + raise ValueError(f"Unknown optimizer kind: {group['kind']}") + + # Phase 2: wait for reduces, compute updates, launch gathers + gather_list: list[dict] = [] + for group, info in zip(self.param_groups, reduce_infos): + if group['kind'] == 'adamw': + self._compute_adamw(group, info, gather_list, rank, world_size) + elif group['kind'] == 'muon': + self._compute_muon(group, info, gather_list, rank) + else: + raise ValueError(f"Unknown optimizer kind: {group['kind']}") + + # Phase 3: wait for gathers, copy back + self._finish_gathers(gather_list) diff --git a/scripts/base_train.py b/scripts/base_train.py index 4fa8fcad..4bce6cd1 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -211,9 +211,9 @@ print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") # ----------------------------------------------------------------------------- -# Initialize the Optimizer (Muon for Linear layers, AdamW for embedding and lm_head) +# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) adam_betas = (args.adam_beta1, args.adam_beta2) -optimizers = model.setup_optimizers( +optimizer = model.setup_optimizer( unembedding_lr=args.unembedding_lr * batch_lr_scale, embedding_lr=args.embedding_lr * batch_lr_scale, matrix_lr=args.matrix_lr * batch_lr_scale, @@ -221,12 +221,10 @@ optimizers = model.setup_optimizers( adam_betas=adam_betas, 
scalar_lr=args.scalar_lr * batch_lr_scale, ) -adamw_optimizer, muon_optimizer = optimizers if resuming: - for opt, dat in zip(optimizers, optimizer_data): - opt.load_state_dict(dat) - del optimizer_data # free up the memory + optimizer.load_state_dict(optimizer_data) + del optimizer_data # ----------------------------------------------------------------------------- # Initialize the DataLoaders for train/val @@ -344,7 +342,7 @@ while True: checkpoint_dir, step, orig_model.state_dict(), # model parameters - [opt.state_dict() for opt in optimizers], # optimizer states + optimizer.state_dict(), # optimizer state { # metadata saved as json "step": step, "val_bpb": val_bpb, # loss at last step @@ -378,18 +376,16 @@ while True: loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here loss.backward() x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward - # step the optimizers + # step the optimizer lrm = get_lr_multiplier(step) - for opt in optimizers: - for group in opt.param_groups: - group["lr"] = group["initial_lr"] * lrm muon_momentum = get_muon_momentum(step) muon_weight_decay = get_weight_decay(step) - for group in muon_optimizer.param_groups: - group["momentum"] = muon_momentum - group["weight_decay"] = muon_weight_decay - for opt in optimizers: - opt.step() + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * lrm + if group['kind'] == 'muon': + group["momentum"] = muon_momentum + group["weight_decay"] = muon_weight_decay + optimizer.step() model.zero_grad(set_to_none=True) train_loss_f = train_loss.item() # .item() is a CPU-GPU sync point synchronize() diff --git a/scripts/chat_rl.py b/scripts/chat_rl.py index eb8e48e5..695c0083 100644 --- a/scripts/chat_rl.py +++ b/scripts/chat_rl.py @@ -201,7 +201,7 @@ def run_gsm8k_eval(task, tokenizer, engine, # Training loop # Init the optimizer -optimizers = model.setup_optimizers( +optimizer = 
model.setup_optimizer( unembedding_lr=args.unembedding_lr, embedding_lr=args.embedding_lr, matrix_lr=args.matrix_lr, @@ -209,10 +209,9 @@ optimizers = model.setup_optimizers( ) # Set the initial learning rate as a fraction of the base learning rate -for opt in optimizers: - for group in opt.param_groups: - group["lr"] = group["lr"] * args.init_lr_frac - group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later +for group in optimizer.param_groups: + group["lr"] = group["lr"] * args.init_lr_frac + group["initial_lr"] = group["lr"] # Learning rate scheduler: simple rampdown to zero over num_steps def get_lr_multiplier(it): @@ -305,11 +304,9 @@ for step in range(num_steps): # Update the model parameters lrm = get_lr_multiplier(step) - for opt in optimizers: # first set the learning rate - for group in opt.param_groups: - group["lr"] = group["initial_lr"] * lrm - for opt in optimizers: # then step the optimizers - opt.step() + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * lrm + optimizer.step() model.zero_grad(set_to_none=True) wandb_run.log({ "step": step, diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index 9277cf96..c0471c43 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -150,17 +150,16 @@ build_val_loader = lambda: sft_data_generator(val_ds, batch_size=args.device_bat # ----------------------------------------------------------------------------- # Initialize the Optimizer -optimizers = model.setup_optimizers( +optimizer = model.setup_optimizer( unembedding_lr=args.unembedding_lr, embedding_lr=args.embedding_lr, matrix_lr=args.matrix_lr, weight_decay=args.weight_decay, ) # Set the initial learning rate as a fraction of the base learning rate -for opt in optimizers: - for group in opt.param_groups: - group["lr"] = group["lr"] * args.init_lr_frac - group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later +for group in optimizer.param_groups: + 
group["lr"] = group["lr"] * args.init_lr_frac + group["initial_lr"] = group["lr"] # ----------------------------------------------------------------------------- # Training loop @@ -230,13 +229,11 @@ for step in range(num_iterations): # learning rate scheduler lrm = get_lr_multiplier(step) - for opt in optimizers: - for group in opt.param_groups: - group["lr"] = group["initial_lr"] * lrm + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * lrm - # step the optimizers - for opt in optimizers: - opt.step() + # step the optimizer + optimizer.step() model.zero_grad(set_to_none=True) # logging diff --git a/scripts/mid_train.py b/scripts/mid_train.py index c127c943..ebe9cd52 100644 --- a/scripts/mid_train.py +++ b/scripts/mid_train.py @@ -93,14 +93,12 @@ print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") token_bytes = get_token_bytes(device=device) -# Initialize the Optimizer (Muon for Linear layers, AdamW for embedding and lm_head) -optimizers = model.setup_optimizers(unembedding_lr=args.unembedding_lr, embedding_lr=args.embedding_lr, matrix_lr=args.matrix_lr, weight_decay=args.weight_decay) -adamw_optimizer, muon_optimizer = optimizers +# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) +optimizer = model.setup_optimizer(unembedding_lr=args.unembedding_lr, embedding_lr=args.embedding_lr, matrix_lr=args.matrix_lr, weight_decay=args.weight_decay) # Override the initial learning rate as a fraction of the base learning rate -for opt in optimizers: - for group in opt.param_groups: - group["lr"] = group["lr"] * args.init_lr_frac - group["initial_lr"] = group["lr"] # save the initial learning so we can decay easily later +for group in optimizer.param_groups: + group["lr"] = group["lr"] * args.init_lr_frac + group["initial_lr"] = group["lr"] # Midtraining data mixture and DataLoader base_dir = 
get_base_dir() @@ -274,7 +272,7 @@ while True: checkpoint_dir, step, orig_model.state_dict(), - [opt.state_dict() for opt in optimizers], # TODO: make sure saving across ranks is done correctly + optimizer.state_dict(), { "step": step, "val_bpb": val_bpb, # loss at last step @@ -306,16 +304,14 @@ while True: loss.backward() x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward progress = max(progress, approx_progress) # only increase progress monotonically - # step the optimizers + # step the optimizer lrm = get_lr_multiplier(progress) - for opt in optimizers: - for group in opt.param_groups: - group["lr"] = group["initial_lr"] * lrm muon_momentum = get_muon_momentum(step) - for group in muon_optimizer.param_groups: - group["momentum"] = muon_momentum - for opt in optimizers: - opt.step() + for group in optimizer.param_groups: + group["lr"] = group["initial_lr"] * lrm + if group['kind'] == 'muon': + group["momentum"] = muon_momentum + optimizer.step() model.zero_grad(set_to_none=True) synchronize() t1 = time.time() From ebd4d9bbf55007d452fe0776b8d363e1b58d8275 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 29 Jan 2026 19:01:36 +0000 Subject: [PATCH 072/119] tried muonh, appealing but didn't work out of the box --- dev/LOG.md | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 2f26165e..dd11b427 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,27 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-01-29: Hyperball/MuonH Experiments (Negative Result) + +Explored Hyperball optimization from [this post](https://psychedelic-sunstone-851.notion.site/Fantastic-Pretraining-Optimizers-and-Where-to-Find-Them-2-1-Hyperball-Optimization-2e924306e6f280e7a5ffee00eb40a0dd) (saved to `knowledge/muonh.md`). Constrains weights to sphere of radius R (initial norm): `W_{t+1} = R · Normalize(W_t - η·R · Normalize(u_t))`. 
Had to change a number of details in a branch, e.g. not use zero init for our projections (or the initial norm would be zero), keep track of the initial norm, adjust Muon -> MuonH for the update. + +Experiments on d12: + +| Experiment | Result | +|------------|--------| +| MuonH for matrix params | Worse than baseline | +| MuonH + LR sweep (2.5e-3 to 1e-2) | Still worse | +| Added learnable RMSNorm scales (paper says γ preserves expressivity) | Still worse | +| Various RMSNorm init tweaks, e.g. 0 at init to residual | Still worse | +| AdamH for lm_head (paper recommends this) | Broken - loss plateaus (see below) | +| AdamH + learnable output scales | Still worse | + +Could not outperform the baseline implementation. The article doesn't go into too much detail on how AdamH is applied to `lm_head` exactly. The classifier layer has to be able to increase in magnitude to make more confident predictions over time. Tried a sensible version with added 0-D learnable scalar, and also with RMSNorms with per-channel learnable scalars both pre and post resnet blocks. + +**Result:** This was not an out-of-the-box win for nanochat even with a mild attempt over a few hours at a bit of tuning and debugging. The idea itself is intuitively appealing. Might come back around later to try harder later. + +--- + ## 2026-01-28: Reverted Bigram Hash Embeddings Removed bigram embeddings (engram-lite) from the codebase. At larger scale (d25), the improvement was tiny and disappeared entirely when measured by wall clock time. It also bloated the VRAM used. The extra parameters and complexity aren't justified. 
From 6a341f2ecf2d9b699a36cad4004c682805482ed2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 30 Jan 2026 00:23:01 +0000 Subject: [PATCH 073/119] contiguous views and single HtoD transfer for inputs/targets much cleaner --- nanochat/dataloader.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index 3e898935..e95c3af6 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -154,6 +154,16 @@ def tokenizing_distributed_data_loader_with_state_bos_bestfit( for tokens in token_lists: doc_buffer.append(tokens) + # Pre-allocate buffers once: layout is [inputs (B*T) | targets (B*T)] + # This gives us contiguous views and a single HtoD transfer + use_cuda = device == "cuda" + cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=use_cuda) # staging area (CPU) + gpu_buffer = torch.empty(2 * B * T, dtype=torch.long, device=device) # on-device buffer + cpu_inputs = cpu_buffer[:B * T].view(B, T) # a few views into these buffers just for convenience + cpu_targets = cpu_buffer[B * T:].view(B, T) + inputs = gpu_buffer[:B * T].view(B, T) + targets = gpu_buffer[B * T:].view(B, T) + while True: rows = [] for _ in range(B): @@ -185,13 +195,16 @@ def tokenizing_distributed_data_loader_with_state_bos_bestfit( rows.append(row[:row_capacity]) - use_cuda = device == "cuda" - batch_tensor = torch.tensor(rows, dtype=torch.long, pin_memory=use_cuda) - inputs = batch_tensor[:, :-1].to(device=device, non_blocking=use_cuda) - targets = batch_tensor[:, 1:].to(device=device, non_blocking=use_cuda) + # Convert rows to tensor and copy slices to pinned buffer (CPU work) + row_data = torch.tensor(rows, dtype=torch.long) # [B, T+1], temporary + cpu_inputs.copy_(row_data[:, :-1]) + cpu_targets.copy_(row_data[:, 1:]) - yield inputs, targets, {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch} + state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch} + # Single HtoD copy 
into persistent GPU buffer and yield + gpu_buffer.copy_(cpu_buffer, non_blocking=use_cuda) + yield inputs, targets, state_dict def tokenizing_distributed_data_loader_bos_bestfit(*args, **kwargs): """Helper that omits state_dict from yields.""" From 067daa7758cde31b1250160eabf17dbf04bd35b5 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 30 Jan 2026 02:11:25 +0000 Subject: [PATCH 074/119] small fix cpu script ty PR #474 --- runs/runcpu.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/runs/runcpu.sh b/runs/runcpu.sh index da8f6d19..a35c3360 100755 --- a/runs/runcpu.sh +++ b/runs/runcpu.sh @@ -4,15 +4,13 @@ # This script was last updated/tuned on Jan 17, 2026. # Run as: -# bash dev/cpu_demo_run.sh +# bash runs/runcpu.sh # NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your Macbook. # Think of this run as educational/fun demo, not something you should expect to work well. -# (This is why I hide this script away in dev/) # You may also want to run this script manually and one by one, copy pasting commands into your terminal. 
# all the setup stuff -export OMP_NUM_THREADS=1 export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" mkdir -p $NANOCHAT_BASE_DIR command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh From d6c4f3b923ce9c648a9865a34e651a1b8b3f01c8 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 30 Jan 2026 17:03:15 +0000 Subject: [PATCH 075/119] i think this is the new torch 2.9+ API for declaring tf32 preference --- nanochat/common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/common.py b/nanochat/common.py index 44760f90..db9e317a 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -170,7 +170,7 @@ def compute_init(device_type="cuda"): # cuda|cpu|mps # Precision if device_type == "cuda": - torch.backends.cuda.matmul.fp32_precision = "tf32" # uses tf32 instead of fp32 for matmuls + torch.backends.fp32_precision = "tf32" # uses tf32 instead of fp32 for matmuls # Distributed setup: Distributed Data Parallel (DDP), optional, and requires CUDA is_ddp_requested, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info() From 02baa154058605d16cc99da4fa8fbf5f9c9908f0 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 30 Jan 2026 17:08:53 +0000 Subject: [PATCH 076/119] i am feeling in a delete mood today. i need to delete a lot of code. there is too much code and surface area and complexity. ew --- README.md | 3 +- runs/run1000.sh | 93 ------------------------------------------------- 2 files changed, 1 insertion(+), 95 deletions(-) delete mode 100644 runs/run1000.sh diff --git a/README.md b/README.md index 74211521..56f55975 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ This repo is a full-stack implementation of an LLM like ChatGPT in a single, cle ## Talk to it -To get a sense of the endpoint of this repo, you can currently find [nanochat d34](https://github.com/karpathy/nanochat/discussions/314) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). 
"d34" means that this model has 34 layers in the Transformer neural network. This model has 2.2 billion parameters, it was trained on 88 billion tokens by simply running the training script [run1000.sh](runs/run1000.sh) with `--target_param_data_ratio=40` (2x longer than Chinchilla-optimal), and the total cost of training was ~$2,500 (about 100 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to... +To get a sense of the endpoint of this repo, you can currently find [nanochat d34](https://github.com/karpathy/nanochat/discussions/314) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). This model is now a few months old but it still gives a rough idea of the intelligence you can achieve for approximately $1000. While this model easily outperforms GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to... 
## Quick start @@ -152,7 +152,6 @@ python -m pytest tests/test_engine.py -v -s ├── pyproject.toml ├── runs │ ├── miniseries.sh # Miniseries training script -│ ├── run1000.sh # Train the ~$800 nanochat d32 │ ├── runcpu.sh # Small example of how to run on CPU/MPS │ ├── scaling_laws.sh # Scaling laws experiments │ └── speedrun.sh # Train the ~$100 nanochat d20 diff --git a/runs/run1000.sh b/runs/run1000.sh deleted file mode 100644 index 5d0b7dc3..00000000 --- a/runs/run1000.sh +++ /dev/null @@ -1,93 +0,0 @@ -#!/bin/bash - -# The $1000 tier of nanochat -# Designed to run end-to-end for $1000/24 ~= 41.6 hours on an 8XH100 node -# A bit sparser on comments, see speedrun.sh for more detail - -# all the setup stuff -export OMP_NUM_THREADS=1 -export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" -mkdir -p $NANOCHAT_BASE_DIR -command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh -[ -d ".venv" ] || uv venv -uv sync --extra gpu -source .venv/bin/activate -if [ -z "$WANDB_RUN" ]; then - WANDB_RUN=dummy -fi -python -m nanochat.report reset -curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl - -# train tokenizer on ~4B characters and kick off download of the rest for pretraining -python -m nanochat.dataset -n 16 -# start downloading the rest of the shards for a total of 1200 (see below why 1200) -python -m nanochat.dataset -n 1200 & -# todo: download the rest of it -python -m scripts.tok_train --max-chars=4000000000 --vocab-size=65536 -python -m scripts.tok_eval - -# Documenting my process for determining the hyperparameters for this run1000.sh script: -# We want a budget of approx. $1000 ~= 41.6 hours of 8XH100 compute -# 1) I guessed the model size for this to be about depth=32 -# 2) Determine the device_batch_size that fits: -# Running the base_train.py script with --depth=32, I saw that --device-batch-size=16 -# runs out of memory, but --device-batch-size=8 fits. 
Inspecting `nvidia-smi` during training, -# I saw all GPUs were at about 78/80GB VRAM, so it just barely fits and we have good MFU at ~50%. -# So the training script was running ok and showed: -# Vocab size: 65,536 -# num_layers: 32 -# model_dim: 2048 -# num_heads: 16 -# num_kv_heads: 16 -# Tokens / micro-batch / rank: 8 x 2048 = 16,384 -# Tokens / micro-batch: 131,072 -# Total batch size 524,288 => gradient accumulation steps: 4 -# Number of parameters: 1,879,048,192 -# Estimated FLOPs per token: 1.207960e+10 -# Calculated number of iterations from target data:param ratio: 71,680 -# Total number of training tokens: 37,580,963,840 -# Tokens : Params ratio: 20.00 -# Total training FLOPs estimate: 4.539628e+20 -# step 00004/71680 (0.01%) | loss: 8.813754 | lrm: 1.00 | dt: 1571.88ms | tok/sec: 83,385 | mfu: 50.92 | total time: 0.00m -# step 00005/71680 (0.01%) | loss: 8.488074 | lrm: 1.00 | dt: 1572.76ms | tok/sec: 83,338 | mfu: 50.89 | total time: 0.00m -# ... -# 3) validate that the runtime fits our budget: -# The training script uses the Chinchilla scaling law to compute-optimally set #tokens = 20 * #params. In particular: -# The script shows that we will be training for 71,680 steps, and each step takes 1.574s so: -# estimated time to train: 71,680 * 1.574s / 60 / 60 = 31.3 hours. -# This is OK, fits our budget, and leaves ~10 hours for midtraining and SFT and evals and maybe RL. -# It's possible that we might even fit depth=33 or depth=34, but for now let's go along with this. -# 4) The last thing to pay attention to is the amount of training data required for the run. -# The script above calculated that "Total number of training tokens: 37,580,963,840" -# The tok_eval.py script reports about ~4.8 chars/token on average for the default tokenizer settings. -# So ~38B tokens # ~4.8 chars/token = ~185B chars. -# Each data shard is ~250M chars, so we need ~185B / 250M ~= 740 shards. -# For safety, I bumped that up to 800 shards. 
-# The new DataLoader wastes about 35% of tokens to cropping, so 800 / (1 - 0.35) ~= 1200 shards are needed. -# => why up above I used -n 1200 when pre-downloading dataset shards. -# If we didn't have enough data, the training script would loop around and do multiple epochs over the same data, -# which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd -# start to overfit hard. -# 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script. - -# Number of processes/GPUs to use -NPROC_PER_NODE=8 - -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --target-param-data-ratio=20 --device-batch-size=8 --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval - -# midtrain -# NOTE: ensure that we use the same device_batch_size here as the base training script. 
-torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --device-batch-size=8 --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid - -# sft -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft - -# generate final report -python -m nanochat.report generate - -# talk to it -python -m scripts.chat_web From 2e1772381752cc59b1e9136698dcf265fc76451a Mon Sep 17 00:00:00 2001 From: Harsh Gupta Date: Fri, 30 Jan 2026 22:51:02 +0530 Subject: [PATCH 077/119] Fix generate() crash when top_k=0 (#467) Prevent a crash in generate() by skipping top-k filtering when top_k is set to 0 --- nanochat/gpt.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index d23a5167..208acd14 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -440,7 +440,7 @@ class GPT(nn.Module): for _ in range(max_tokens): logits = self.forward(ids) # (B, T, vocab_size) logits = logits[:, -1, :] # (B, vocab_size) - if top_k is not None: + if top_k is not None and top_k > 0: v, _ = torch.topk(logits, min(top_k, logits.size(-1))) logits[logits < v[:, [-1]]] = -float('Inf') if temperature > 0: From ace6740bdd392de4710f054afa7dc9368a076606 Mon Sep 17 00:00:00 2001 From: Aarushi Singh <110608667+aarushisingh04@users.noreply.github.com> Date: Fri, 30 Jan 2026 22:51:41 +0530 Subject: [PATCH 078/119] feat: allow top_k=0 in web api to disable filtering (#458) * allow top_k=0 in web api to disable filtering * adding a comment for clear reasoning * adding change to docstring --- scripts/chat_web.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/chat_web.py b/scripts/chat_web.py index 4b67b621..42c01ac0 100644 --- a/scripts/chat_web.py +++ b/scripts/chat_web.py @@ -26,7 +26,7 @@ Abuse Prevention: - Maximum 8000 characters per message - 
Maximum 32000 characters total conversation length - Temperature clamped to 0.0-2.0 - - Top-k clamped to 1-200 + - Top-k clamped to 0-200 (0 disables top-k filtering, using full vocabulary) - Max tokens clamped to 1-4096 """ @@ -55,7 +55,7 @@ MAX_MESSAGE_LENGTH = 8000 MAX_TOTAL_CONVERSATION_LENGTH = 32000 MIN_TEMPERATURE = 0.0 MAX_TEMPERATURE = 2.0 -MIN_TOP_K = 1 +MIN_TOP_K = 0 # 0 disables top-k filtering, using full vocabulary MAX_TOP_K = 200 MIN_MAX_TOKENS = 1 MAX_MAX_TOKENS = 4096 From 3ba42e8135bc88283004fc8d803a1eb00879a738 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 30 Jan 2026 17:32:12 +0000 Subject: [PATCH 079/119] Fix SDPA KV-cache decode to respect sliding window (#456) SDPA fallback now respects sliding window during single-token KV-cache decode by slicing K/V to the last (window + 1) tokens. Also simplifies the mask building for chunk inference to properly apply sliding window in that path as well. Fixes #452 Co-Authored-By: Kartik Vashishta Co-Authored-By: Claude Opus 4.5 --- nanochat/flash_attention.py | 29 ++++++++++++++-------------- tests/test_attention_fallback.py | 33 ++++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+), 15 deletions(-) diff --git a/nanochat/flash_attention.py b/nanochat/flash_attention.py index 5d27e5f9..15411de3 100644 --- a/nanochat/flash_attention.py +++ b/nanochat/flash_attention.py @@ -71,27 +71,26 @@ def _sdpa_attention(q, k, v, window_size, enable_gqa): # Single token generation if Tq == 1: + if window >= 0 and window < Tk: + # window is "left" tokens we need to include (window + 1) keys total + start = max(0, Tk - (window + 1)) + k = k[:, :, start:, :] + v = v[:, :, start:, :] return F.scaled_dot_product_attention(q, k, v, is_causal=False, enable_gqa=enable_gqa) - # Need explicit mask + # Need explicit mask for sliding window/chunk inference device = q.device - if Tq == Tk: - # Causal + sliding window - mask = torch.tril(torch.ones(Tq, Tk, device=device, dtype=torch.bool)) - if window > 0 and 
window < Tq: - row_idx = torch.arange(Tq, device=device).unsqueeze(1) - col_idx = torch.arange(Tk, device=device).unsqueeze(0) - mask = mask & ((row_idx - col_idx) <= window) - else: - # Chunk inference: attend to prefix + causal within chunk - prefix_len = Tk - Tq - mask = torch.zeros(Tq, Tk, device=device, dtype=torch.bool) - mask[:, :prefix_len] = True - mask[:, prefix_len:] = torch.tril(torch.ones(Tq, Tq, device=device, dtype=torch.bool)) + # For chunk inference (Tq != Tk), is_causal is not aligned to cache position => build an explicit bool mask + row_idx = (Tk - Tq) + torch.arange(Tq, device=device).unsqueeze(1) + col_idx = torch.arange(Tk, device=device).unsqueeze(0) + mask = col_idx <= row_idx + # sliding window (left) + if window >= 0 and window < Tk: + mask = mask & ((row_idx - col_idx) <= window) + return F.scaled_dot_product_attention(q, k, v, attn_mask=mask, enable_gqa=enable_gqa) - # ============================================================================= # Public API: Same interface as FA3 # ============================================================================= diff --git a/tests/test_attention_fallback.py b/tests/test_attention_fallback.py index 2cf3ed77..9741c7f3 100644 --- a/tests/test_attention_fallback.py +++ b/tests/test_attention_fallback.py @@ -178,6 +178,39 @@ class TestFA3VsSDPA: max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "single_token") print(f"single_token: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + def test_kvcache_single_token_sliding_window(self): + """Test single token decode with sliding window smaller than cache size. + + This catches the bug where SDPA ignores window_size during Tq=1 decode. + When window < Tk, FA3 only attends to the last (window+1) tokens, + but SDPA was attending to all cached tokens. 
+ """ + B, T_max, H, D = 2, 64, 4, 32 + T_prefill = 32 # Enough tokens to exceed window + window = 8 # Window SMALLER than cache size + + k_init = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_init = torch.randn(B, T_prefill, H, D, device=self.DEVICE, dtype=self.DTYPE) + q_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + k_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_single = torch.randn(B, 1, H, D, device=self.DEVICE, dtype=self.DTYPE) + + def run(): + k_cache = torch.zeros(B, T_max, H, D, device=self.DEVICE, dtype=self.DTYPE) + v_cache = torch.zeros(B, T_max, H, D, device=self.DEVICE, dtype=self.DTYPE) + k_cache[:, :T_prefill, :, :] = k_init + v_cache[:, :T_prefill, :, :] = v_init + cache_seqlens = torch.full((B,), T_prefill, dtype=torch.int32, device=self.DEVICE) + return flash_attn.flash_attn_with_kvcache( + q_single, k_cache, v_cache, k=k_single, v=v_single, + cache_seqlens=cache_seqlens, + causal=True, window_size=(window, 0) # window=8 < Tk=33 + ) + + y_fa3, y_sdpa = run_both_impls(run) + max_diff, mean_diff = assert_close(y_fa3, y_sdpa, "single_token_sliding_window") + print(f"single_token_sliding_window: max_diff={max_diff:.6f}, mean_diff={mean_diff:.6f}") + def test_backward_gradients_match(self): """Verify gradients are similar between FA3 and SDPA.""" B, T, H, D = 2, 32, 4, 16 From 4d8dbaf6e062d86c9174ba5f6f94d43899f15a01 Mon Sep 17 00:00:00 2001 From: Andrei Panferov Date: Fri, 30 Jan 2026 18:34:02 +0100 Subject: [PATCH 080/119] Fix escape character in README bibtex entry (#454) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 56f55975..89d2ce2b 100644 --- a/README.md +++ b/README.md @@ -203,7 +203,7 @@ If you find nanochat helpful in your research cite simply as: ```bibtex @misc{nanochat, author = {Andrej Karpathy}, - title = {nanochat: The best ChatGPT that $100 can buy}, + title = {nanochat: The best ChatGPT 
that \$100 can buy}, year = {2025}, publisher = {GitHub}, url = {https://github.com/karpathy/nanochat} From 3c3a3d70420e6a63575ebadb8bd466bbedd7156c Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 31 Jan 2026 01:08:44 +0000 Subject: [PATCH 081/119] warmdown of 0.5 is slightly better: --- scripts/base_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index 4bce6cd1..7ed63302 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -59,7 +59,7 @@ parser.add_argument("--scalar-lr", type=float, default=0.5, help="learning rate parser.add_argument("--adam-beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding") parser.add_argument("--adam-beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding") parser.add_argument("--warmup-ratio", type=float, default=0.0, help="ratio of iterations for LR warmup") -parser.add_argument("--warmdown-ratio", type=float, default=0.4, help="ratio of iterations for LR warmdown") +parser.add_argument("--warmdown-ratio", type=float, default=0.5, help="ratio of iterations for LR warmdown") parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR as fraction of initial LR") parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)") # Evaluation From 348fbb301b8b709ad5d59bdf69e99a51982f594a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 31 Jan 2026 18:21:36 +0000 Subject: [PATCH 082/119] fix dataloader for midtrain to never crop data. 
we can't just throw it away like we do in pretraining --- scripts/mid_train.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/scripts/mid_train.py b/scripts/mid_train.py index ebe9cd52..54c5fb09 100644 --- a/scripts/mid_train.py +++ b/scripts/mid_train.py @@ -125,11 +125,12 @@ approx_progress = 0.0 # will go from 0 to 1 over the course of the epoch current_epoch = 1 # track epoch for logging def mid_data_generator_bos_bestfit(split, buffer_size=100): """ - BOS-aligned dataloader for midtraining with bestfit-crop packing. + BOS-aligned dataloader for midtraining with bestfit-pad packing. Each row in the batch starts with BOS (beginning of a conversation). - Conversations are packed using best-fit algorithm to minimize cropping. - This matches the BOS-aligned approach used in pretraining. + Conversations are packed using best-fit algorithm. When no conversation fits, + the row is padded (instead of cropping) to ensure no tokens are ever discarded. + Padding positions have targets masked with -1 (ignore_index for cross-entropy). 
""" global last_step, approx_progress, current_epoch assert split in {"train", "val"}, "split must be 'train' or 'val'" @@ -137,6 +138,7 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): dataset_size = len(dataset) assert dataset_size > 0 row_capacity = args.max_seq_len + 1 # +1 for target at last position + bos_token = tokenizer.get_bos_token_id() # Conversation buffer: list of token lists conv_buffer = [] @@ -159,8 +161,10 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): while True: rows = [] + row_lengths = [] # Track actual content length (excluding padding) for each row for _ in range(args.device_batch_size): row = [] + padded = False while len(row) < row_capacity: # Ensure buffer has conversations while len(conv_buffer) < buffer_size: @@ -183,11 +187,18 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): row.extend(conv) consumed += ddp_world_size # Track actual consumption else: - # No conversation fits - crop first conversation to fill remaining - conv = conv_buffer.pop(0) - row.extend(conv[:remaining]) - consumed += ddp_world_size # Track actual consumption + # No conversation fits - pad the remainder instead of cropping + # This ensures we never discard any tokens + content_len = len(row) + row.extend([bos_token] * remaining) # Pad with BOS tokens + padded = True + break # Row is now full (with padding) + # Track content length: full row if no padding, otherwise the length before padding + if padded: + row_lengths.append(content_len) + else: + row_lengths.append(row_capacity) rows.append(row[:row_capacity]) # Stopping condition to respect num_iterations, if given @@ -212,6 +223,12 @@ def mid_data_generator_bos_bestfit(split, buffer_size=100): inputs = batch_tensor[:, :-1].to(device=device, dtype=torch.int32, non_blocking=use_cuda) targets = batch_tensor[:, 1:].to(device=device, dtype=torch.int64, non_blocking=use_cuda) + # Mask out padding positions in targets (set to -1 = ignore_index) + # For each row, positions >= 
(content_length - 1) in targets should be masked + for i, content_len in enumerate(row_lengths): + if content_len < row_capacity: + targets[i, content_len-1:] = -1 + yield inputs, targets train_loader = mid_data_generator_bos_bestfit("train") From 1ddaad1c1c37f3553a59f556bc757c4aea585bef Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sat, 31 Jan 2026 19:12:25 +0000 Subject: [PATCH 083/119] nuke midtraining from orbit, it's not as needed now that we have a BOS-aligned dataloader. Also change the README a lot. midtrianing is not yet fully properly erased across the board, but good enough for step 1 --- README.md | 165 ++++++------- dev/scaling_laws_jan26.png | Bin 0 -> 93061 bytes runs/runcpu.sh | 10 +- runs/speedrun.sh | 50 +--- scripts/chat_cli.py | 2 +- scripts/chat_eval.py | 4 +- scripts/chat_sft.py | 487 ++++++++++++++++++++++--------------- scripts/mid_train.py | 386 ----------------------------- 8 files changed, 389 insertions(+), 715 deletions(-) create mode 100644 dev/scaling_laws_jan26.png delete mode 100644 scripts/mid_train.py diff --git a/README.md b/README.md index 89d2ce2b..800c5d9d 100644 --- a/README.md +++ b/README.md @@ -1,35 +1,62 @@ # nanochat ![nanochat logo](dev/nanochat.png) +![scaling laws](dev/scaling_laws_jan26.png) -> The best ChatGPT that $100 can buy. +nanochat is the simplest experimental harness for training LLMs. It is designed to run on a single GPU node, the code is minimal/hackable, and it covers all major LLM stages including tokenization, pretraining, finetuning, evaluation, inference, and a chat UI. For example, you can train your own GPT-2 capability LLM (which cost ~$50,000 to train in 2019) for only $73 (3 hours of 8XH100 GPU node) and then talk to it in a familiar ChatGPT-like web UI. -This repo is a full-stack implementation of an LLM like ChatGPT in a single, clean, minimal, hackable, dependency-lite codebase. 
nanochat is designed to run on a single 8XH100 node via scripts like [speedrun.sh](runs/speedrun.sh), that run the entire pipeline start to end. This includes tokenization, pretraining, finetuning, evaluation, inference, and web serving over a simple UI so that you can talk to your own LLM just like ChatGPT. nanochat will become the capstone project of the course LLM101n being developed by Eureka Labs. +For questions about the repo, I recommend either using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions about the repo, or use the [Discussions tab](https://github.com/karpathy/nanochat/discussions), or come by the [#nanochat](https://discord.com/channels/1020383067459821711/1427295580895314031) channel on Discord. ## Updates -- (Jan 16 2026) The repo is in active development, I am currently fleshing out the pretraining stage. -- (Jan 7 2026) See new post: [nanochat Miniseries v1](https://github.com/karpathy/nanochat/discussions/420) and the associated script [miniseries.sh](runs/miniseries.sh). +- (Jan 31 2026) Major revamp of all scripts/README ongoing, deleting midtraining stage, might be a bit messy briefly... +- (Jan 30 2026) With all the latest improvements we're able to train GPT-2 grade LLM in about $73. The [runs/speedrun.sh](runs/speedrun.sh) script will become the reference way to train GPT-2 grade model and talk to it. -## Talk to it +## Leaderboard -To get a sense of the endpoint of this repo, you can currently find [nanochat d34](https://github.com/karpathy/nanochat/discussions/314) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). This model is now a few months old but it still gives a rough idea of the intelligence you can achieve for approximately $1000. While this model easily outperforms GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5.
When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to... +| # | Record time | Description | Date | Commit | Contributors | +|---|-------------|-------------|------|--------|--------------| +| 1 | 3.04 hours | d24 baseline, slightly overtrained | Jan 29 2026 | 348fbb3 | @karpathy | -## Quick start +The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. In 2019, the training of GPT-2 cost approximately $50,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so in 3 hours or less, for ~$73 and below. Once your repo is set up (see the [runs/speedrun.sh](runs/speedrun.sh) script for reference), e.g. the way I kicked off the jan29 run is as follows: -The fastest way to feel the magic is to run the speedrun script [speedrun.sh](runs/speedrun.sh), which trains and inferences the $100 tier of nanochat. On an 8XH100 node at $24/hr, this gives a total run time of about 4 hours. Boot up a new 8XH100 GPU box from your favorite provider (e.g. I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: +``` +OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ + --depth=24 \ + --run=d24-jan29 \ + --model-tag=d24_jan29 \ + --device-batch-size=16 \ + --sample-every=-1 \ + --save-every=-1 \ + --core-metric-max-per-task=-1 \ + --core-metric-every=3000 \ + --target-param-data-ratio=12 +``` + +After 3 hours we get output like this: + +``` +... 
+wandb: Run summary: +wandb: core_metric 0.25851 +wandb: step 16704 +wandb: total_training_flops 4.330784131228946e+19 +wandb: total_training_time 10949.46713 +``` + +The GPT-2 CORE score (i.e. the target to beat) is 0.256525. So we see that this d24 CORE score is higher (0.25851). Then we look at the `total_training_time`, which is the time of the training iterations alone, excluding all the evaluations and logging, in seconds. We get: `10949/60/60 ~= 3.04` hours, the current record. + +## Getting started + +### Reproduce and talk to GPT-2 + +The most fun you can have is to train your own GPT-2 and talk to it. The entire pipeline to do so is contained in the single file [runs/speedrun.sh](runs/speedrun.sh), which is designed to be run on an 8XH100 GPU node. Currently, at ~$24/hour for these nodes, pretraining GPT-2 grade model takes approximately 3 hours and will set you back about $75. Boot up a new 8XH100 GPU box from your favorite provider (e.g. I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: ```bash bash runs/speedrun.sh ``` -Alternatively, since the script runs for 4 hours, I like to launch it like this inside a new screen session `speedrun` (and also log output to `speedrun.log`): - -```bash -screen -L -Logfile speedrun.log -S speedrun bash runs/speedrun.sh -``` - -See the [screen cheatsheet](https://gist.github.com/jctosta/af918e1618682638aa82) if you are less familiar. You can watch it go inside the screen session, or detach with `Ctrl-a d` and `tail speedrun.log` to view progress. Now wait 4 hours. Once it's done, you can talk to your LLM via the ChatGPT-like web UI. Make sure again that your local uv virtual environment is active (run `source .venv/bin/activate`), and serve it: +You may wish to do so in a screen session as this will take ~3 hours to run. Once it's done, you can talk to it via the ChatGPT-like web UI.
Make sure again that your local uv virtual environment is active (run `source .venv/bin/activate`), and serve it: ```bash python -m scripts.chat_web @@ -43,84 +70,43 @@ And then visit the URL shown. Make sure to access it correctly, e.g. on Lambda u --- -You can also `cat report.md` file which appeared in the project directory and contains the "report card" of the run, i.e. a bunch of evaluations and metrics. At the very end, you'll see a summary table, for example: - ---- - -- Characters: 333,989 -- Lines: 8,304 -- Files: 44 -- Tokens (approx): 83,497 -- Dependencies (uv.lock lines): 2,004 - -| Metric | BASE | MID | SFT | RL | -|-----------------|----------|----------|----------|----------| -| CORE | 0.2219 | - | - | - | -| ARC-Challenge | - | 0.2875 | 0.2807 | - | -| ARC-Easy | - | 0.3561 | 0.3876 | - | -| GSM8K | - | 0.0250 | 0.0455 | 0.0758 | -| HumanEval | - | 0.0671 | 0.0854 | - | -| MMLU | - | 0.3111 | 0.3151 | - | -| ChatCORE | - | 0.0730 | 0.0884 | - | - -Total wall clock time: 3h51m - ---- - -(Your table might be missing the RL number by default). For a lot more information around the speedrun script and what to look for and expect, please refer to the walkthrough that I posted in Discussions of the repo: ["Introducing nanochat: The best ChatGPT that $100 can buy"](https://github.com/karpathy/nanochat/discussions/1). - -## Bigger models - -Unsurprisingly, $100 is not enough to train a highly performant ChatGPT clone. In fact, LLMs are famous for their multi-million dollar capex. For our purposes, I think there are two more scales of interest. First is the ~$300 tier d26 model (i.e. depth=26) that trains in ~12 hours, which slightly outperforms GPT-2 CORE score. Second is the $1000 tier (~41.6 hours), just because it's a nice round number. But both of these are not yet fully supported and therefore not attached here in the master branch yet. 
- -That said, to give a sense, the example changes needed for the [speedrun.sh](runs/speedrun.sh) file to train a GPT-2 grade model d26 only involve three changes: - -```bash -... -# you'll need to download more data shards for pretraining -# get the number of parameters, multiply 20 to get tokens, multiply by 4.8 to get chars, -# divide by 250 million to get number of shards. todo need to improve this... -python -m nanochat.dataset -n 450 & -... -# use --depth to increase model size. to not oom, halve device batch size 32 -> 16: -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --device-batch-size=16 -... -# make sure to use the same later during midtraining: -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device-batch-size=16 -``` - -That's it! The biggest thing to pay attention to is making sure you have enough data shards to train on (the code will loop and do more epochs over the same training set otherwise, decreasing learning speed a bit), and managing your memory/VRAM, primarily by decreasing the `device_batch_size` until things fit (the scripts automatically compensate by increasing the number of gradient accumulation loops, simply turning parallel compute to sequential compute). - -And a bit more about computing environments that will run nanochat: +A few more notes: - The code will run just fine on the Ampere 8XA100 GPU node as well, but a bit slower. - All code will run just fine on even a single GPU by omitting `torchrun`, and will produce ~identical results (code will automatically switch to gradient accumulation), but you'll have to wait 8 times longer. - If your GPU(s) have less than 80GB, you'll have to tune some of the hyperparameters or you will OOM / run out of VRAM. Look for `--device_batch_size` in the scripts and reduce it until things fit. E.g. from 32 (default) to 16, 8, 4, 2, or even 1. Less than that you'll have to know a bit more what you're doing and get more creative. 
-- Most of the code is fairly vanilla PyTorch so it should run on anything that supports that - xpu, mps, or etc, but I haven't implemented this out of the box so it might take a bit of tinkering. +- Most of the code is fairly vanilla PyTorch so it should run on anything that supports that - xpu, mps, or etc, but I haven't personally exercised all of these code paths so there might be sharp edges. + +## Research + +If you are a researcher and wish to help improve nanochat, two scripts of interest are [runs/scaling_laws.sh](runs/scaling_laws.sh) and [runs/miniseries.sh](runs/miniseries.sh). See [Jan 7 miniseries v1](https://github.com/karpathy/nanochat/discussions/420) for related documentation. For quick experimentation (~5 min pretraining runs) my favorite scale is to train a 12-layer model (GPT-1 sized), e.g. like this: + +``` +OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ + --depth=12 \ + --run="d12" \ + --model-tag="d12" \ + --core-metric-every=999999 \ + --sample-every=-1 \ + --save-every=-1 \ +``` + +This uses wandb (run name "d12"), only runs the CORE metric on last step, and it doesn't sample and save intermediate checkpoints. I like to change something in the code, re-run a d12 (or a d16 etc) and see if it helped, in an iteration loop. + +The overall approach is to treat the depth of the model as the single dial of complexity. By sweeping out the depth, we get increasingly more powerful models. We determine the scaling laws, set the data budget to a compute optimal setting, train a whole miniseries of models of increasing sizes, and compare them to the GPT-2 and GPT-3 miniseries. Right now, beating GPT-2 specifically faster and faster is the most interesting target. ## Running on CPU / MPS -nanochat can be run on CPU or on MPS (if you're on Macbook) in principle, and will automatically try to detect what device is best to run on. 
The script [runcpu.sh](runs/runcpu.sh) shows a very simple example that will exercise the code paths but basically produce garbage results. Unless you know what you're doing, I basically don't recommend using this script right now and hope to tune it a bit more in the future. +The script [runs/runcpu.sh](runs/runcpu.sh) shows a very simple example of running on CPU or Apple Silicon. It dramatically shrinks the LLM that is being trained to make things fit into a reasonable time interval of a few tens of minutes of training. You will not get strong results in this way. -## Customization +## Guides -To customize your nanochat, see [Guide: infusing identity to your nanochat](https://github.com/karpathy/nanochat/discussions/139) in Discussions, which describes how you can tune your nanochat's personality through synthetic data generation and mixing that data into midtraining and SFT stages. +I've published a number of guides that might contain helpful information: -Additionally, to add new abilities to nanochat, see [Guide: counting r in strawberry (and how to add abilities generally)](https://github.com/karpathy/nanochat/discussions/164). - -## Questions - -I recommend using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions of this repo. In the URL of this repo, simply change github.com to deepwiki.com, and you're off. - -You can also come to the [#nanochat Discord channel](https://discord.com/channels/1020383067459821711/1427295580895314031) to ask questions, or use the Discussions. - -## Tests - -I haven't invested too much here but some tests exist, especially for the tokenizer. Run e.g. as: - -```bash -python -m pytest tests/test_engine.py -v -s -``` +- [Oct 13 2025 original nanochat post](https://github.com/karpathy/nanochat/discussions/1) introducing nanochat, though now it contains some deprecated information and the model is a lot older (with worse results) than current master.
+- [Jan 7 miniseries v1](https://github.com/karpathy/nanochat/discussions/420) documents the first nanochat miniseries of models. +- To customize your nanochat, see [Guide: infusing identity to your nanochat](https://github.com/karpathy/nanochat/discussions/139) in Discussions, which describes how you can tune your nanochat's personality through synthetic data generation and mixing that data into the SFT stage. +- To add new abilities to nanochat, see [Guide: counting r in strawberry (and how to add abilities generally)](https://github.com/karpathy/nanochat/discussions/164). ## File structure @@ -159,12 +145,11 @@ python -m pytest tests/test_engine.py -v -s │ ├── base_eval.py # Base model: calculate CORE score │ ├── base_loss.py # Base model: calculate bits per byte, sample │ ├── base_train.py # Base model: train -│ ├── chat_cli.py # Chat model (SFT/Mid): talk to over CLI -│ ├── chat_eval.py # Chat model (SFT/Mid): eval tasks -│ ├── chat_rl.py # Chat model (SFT/Mid): reinforcement learning +│ ├── chat_cli.py # Chat model: talk to over CLI +│ ├── chat_eval.py # Chat model: eval tasks +│ ├── chat_rl.py # Chat model: reinforcement learning │ ├── chat_sft.py # Chat model: train SFT -│ ├── chat_web.py # Chat model (SFT/Mid): talk to over WebUI -│ ├── mid_train.py # Chat model: midtraining +│ ├── chat_web.py # Chat model: talk to over WebUI │ ├── tok_eval.py # Tokenizer: evaluate compression rate │ └── tok_train.py # Tokenizer: train it ├── tasks @@ -183,9 +168,9 @@ python -m pytest tests/test_engine.py -v -s ## Contributing -nanochat is nowhere near finished. The goal is to improve the state of the art in micro models that are accessible to work with end to end on budgets of < $1000 dollars. Accessibility is about overall cost but also about cognitive complexity - nanochat is not an exhaustively configurable LLM "framework"; there will be no giant configuration objects, model factories, or if-then-else monsters in the code base. 
It is a single, cohesive, minimal, readable, hackable, maximally-forkable "strong baseline" codebase designed to run start to end and produce a concrete ChatGPT clone and its report card. +The goal of nanochat is to improve the state of the art in micro models that are accessible to work with end to end on budgets of < $1000. Accessibility is about overall cost but also about cognitive complexity - nanochat is not an exhaustively configurable LLM "framework"; there are no giant configuration objects, model factories, or if-then-else monsters in the code base. It is a single, cohesive, minimal, readable, hackable, maximally-forkable "strong baseline" codebase designed to run start to end and produce a ChatGPT model you can talk to. Currently, the most interesting part personally is reducing the latency to GPT-2 (i.e. getting a CORE score above 0.256525). This takes ~3 hours right now, but improving the pretraining stage can reduce it further. -Current LLM policy: disclosure. When submitting a PR, please declare any parts that had substantial LLM contribution and that you have not written or that you do not fully understand. +Current AI policy: disclosure. When submitting a PR, please declare any parts that had substantial LLM contribution and that you have not written or that you do not fully understand. 
## Acknowledgements diff --git a/dev/scaling_laws_jan26.png b/dev/scaling_laws_jan26.png new file mode 100644 index 0000000000000000000000000000000000000000..e8d1f727d6001e52cec56884626c84e6633c38a7 GIT binary patch literal 93061 zcmZs@2RPU3|39w1j8L|mWK?99Jx-F9Q5wkJviBY>qcSooTSerALfNCTizG6#L-yW; z-{aN!{J!7Me{@~0a}w`y-_QH`e60I@@1m+a*?xxoBqStciVCvoBqY1fkdSN}-Mbs# z`P5$4jQ@x^pVxH0Y-i@|X6R^2qGIT5Z*AvneZ%N4S5rr)8+NvWJp6n-BHVvjI6K=r ziShE<{LfGD*g2Z>e$aOLf=AhBub|~bLPB9kd~JIvopys{8wrV`?Aa^s@e|!{?uWj2 zR?kk`dQ#lI`}G_pEAtIj^+P*%T@t(hP4?g}?!9}14jn%D@HyQs@56^=&!?Z1PTBkI z@F?qFrGa17kGc$R7&m;3*h!_({;(@){Cb3_li9Br_72e)^Dm+t6?~+!|MkaFG&gz2 zLCXLAQ%A~v=D+@ZF!57tHm`L7=j_qrXy!1}*`9MvwWpZl+0pW3Hn`M2-*{cYCeo&R}qwsX1WjnyUs4ioL!-YygMBCZp39Fg}ra?Q;> zI^S8f841K_7Q5QyRZ&w>$t+KG=i2zOvPR^OGT41Da4=>Nw)NKTbe--=c=+(4b;z)- z(uUv4`0H{e=KBhd@r)7jgA4vIy_Tnx5r?0$u|>&pAvxlSX=$8?xvrV4%=F93%bTCv92v2Y3p=9n&hq=r z(o%Ir#hp)&xaM26?Hqp9$7Gb!I5B zz1hY!IqtJRC-%+pT(IGaZZK0#l!!6&adtj^y&DIzXL zLRsimP*^zYx3%ndvG{Y18ux5itGqwcO9|{-F~e%Y4>X%D2{~%eFSX z9-q>#K1U%LWmeoMwN7ehXJ=$=td(uF%eXqYKfl-4c%V82t4A8^5ITtN+HLiFI>hxtBZP4yxc7w@$%B^CX|gw|5=*mTRu2^n|AY z?|$Xy1KyR{8kW_m;%STK)CQYLFYNkTyWYgsOzqA`ZlmA)%@aL#3k&C4i`hxh(t$ns^yq|rjc43IkvVuZTHHn z@BOxVGs%1oR!81%XL|Gu4E}QZRiDijclriZnred4r>GM*#(M5yXSr01DUfFw7%4h9smaaGr zk4`OC`=!Qfy`q{6s;a3roq6%%ZlNgiIXqTpCX1jM=|D|bd9stzVHTFKSFc{#as&k_ zgo=2t@aYygKU0no$k5LJGDWdM?B{MWhOaL!(4zWPVgyWcCJR(mRYgQaO*=Oo)gRED z`le!8A154!C(JSUuzk;Y*Y)aP`a4NUEZX_DRlU8vwz{&ice@>JOHQz}vgW5+d0*Gl z)4MSx?Y-RXrc>smE0RN?CMirR z7aUw49^K7BYT#t8%zfr2#g}u2O)h8M%FTeOh(Wpt*$;>tf)PN>N2`0M!bTHDoXS7o!n zvDz>0=VjT(okPlf(q}hO@_Fu;r#b6m&peQ4@rio+^slb2u9uqGtVupgTCqMR3(K3o zSLf+b#8nSyYw+qhRATK%z8)>Mv5UVvux40j{v}?qBgfRi$tl@&@`rVI(V2#9zm@(y zC$D7$6&H&?;?gi{dVan`NUbK6S={~SJx=U&RcB`$9_Nke=f~Q%ZmV;qV=d|DW`9lhd)|$AfB)cm+>(=J(EeT-bb?$iV7>K{MHo!@IG*%*n=fHz;WDwJe=F^gCKO*h9{C^yn>25%pAM`d^JnYWcR^6Jx$x8_S1|U%dINA#R8LRF~)x@0qim=cW1h z9&T++`z=g#L}Ek{@A3VQghdqK3-q77-D4<%)qyZ+fs(VwSXn(9)ucCMQIKQ|rxo0M zDgwx`T>VeGCAZ+!uwdcU&YEFi$UM 
zXz~Y;(%*jagm!+oG0kPXm0R=8U!TtfMxDHRzadU|e!180a-ri8w~g1p*Fz)~ffSsq ztRIEg&b!udv9fBVS^Z8R>I{RCo40g<3M1#ajQ{?dE_=suY-d|8tMyMz)MG&_xn7st zt}=RN*ISm~HNA($`(}r=S>O9xvggn98-CoG=rtv}xiS!@Z7XiqBVlA@L_iI`7-gd0 z<=J#PVzPGnJl6MJ^;b<0yM*3Q9l}_HwIA)s<*d4YxHwmM=lEjj**m*e*46~vXAE?7 zbfWoxkC`KaXxv<#hR$Di$zXlUri3<+64An_HM?PN*I< z*CVOQF(1pfHsj379%Jd>*!Romc?=!)_V!-EE4s zFo;fxdpPmqcD+(Z-1}~cOZnC9A*HONHfxAAC8@^zCad5J2Ebyx$?x#9l9cF3d^)EGQ_Td%~le^N5{|jc#*u zv-$vUi2>JmM{e4s6a~tGkNyW-JEiCA1p|_jl5EPcv2QP>tG`fA0;n033=a?AZ6L9+ z&@7y}jIF1?Ny*G?Ls$LIYUHw-q^ZXABMm)%663Zk-8=za-q+5rZ>8G=C(DQZ_=H-) z8Ueako0e~`z1l_xyjMLtP-8PBXjachV%!jWCNGEabwYq4U9azY<~F+V*39D3?B+<% z#Noyy;oam+7ExRpU-GihG=V@RNdfKOk96k0w&?~|A$uP!?v|Cw&ClNiWRciEIcej? zLd&aXO;UY#AGP&jL}VlZ73gmATeWg)dqi@m1Y_pW&Cbr|2|TDWl34h9uG(k0+nnoy z%>|vUvJ`VvTtH-GWdGoxNy{sZMzqv7v%u60_wL=hF=o~hs(1z^y@Rg|1A&`3CZgxs zR+4XwnWx}H%VD`DKB*7rmU(-`u8mfGEAenj25fOPrQay^Lj^LT-Q>-MNv+kA1WS4? zwSOvd88@T7JmwRA^Cn5L*Y*pKx%o*3(vp009$igWuR5+*Qb0*fZOo-z?3%ePMYK`7 zaCWNz=Qsf1$YEv1P`d*Inv{iwCE2hdATMWpnms|%tMw(I#-E*U&o<5z1ODT$H_OfT z-S8|nR!x%XCJDyT`SFRQ;`{gSBF-ZZGp(^}yA1%88?OBt9yV{|psqVfCe0op@8Ry= zH#YX^$2+S+AiW|NOw98}Myvo9Mcony(GNbhwFx&|{%dM#DnqC6+tisG-%^xo4pCF@ zxEU7Ks4~8~vT~4_IU%(KZDZ%|-RfvzdY+@~D99Qdg$OgcSL5g9<$LGc-JS!M-KZX$ zXi2A5R8(}0jL6c=zFk|ZAm%c*Q!{Br2M{Rp`n!*Oq&zwW23}|Edg5QcJg%L8!OzbR zP2-WR##KtnI+h$qvR&VF?{~C+#}{TRsZZ)?#wVw=#M5#HS~QR zwYO_x5V>hfnwy&ws8?H(lasSpv1l97dF;wJHI-DmzvQ%m~t zZB=5a-PpRPBqlA<^S*AeE72Nbh3yE?mUCLRCH9OJotX1+OyB%PgNAzq^soZ-aC&j! 
zNnS{hed)OIz5Dln;*6fQYzaChAA0ccVVBPvOCwDcwq+qLiKp9=?BuXoCccg;Z__L*ANx{*0%eH?NkWs`TJiBoy>)-+aLZK9>HlM z9y=w44J}F@rEEP;44kR*Enk^)=O(9gQ^@y~nKdvJ`<$`)(UK*6@%uklXBQXWdG-F@ z*dS)C2uh7P;nko%gRc@>n4^=E@$Z${lGR^vKxWYx3lBMyyd$!7i>a8HNZ+FaY_rxC zyj)mRM2z{)wJQMdr)_>L&Fb4L5#ue7ZkwXGxORfBZt!@o?MK zMG7sQBIkQ(+VkCRy)R-W!~})uZByk!814oH2w=EjA@-4z=jr;{Isaa90MU~w4iqOL zR9%Y3|Cr@9oeZFWT)>6g>6QPhj$ZRT@>b)rXJ2L+iVX3e_sB==_u+jiB~h9B!F z@el+8Q>6Z!8St%a)<;1_-DIK9;#!`HjWIeYPeb0&r$@!fJHKL`uog{ zT#i%M-#!Jzzrpc0ZCwy3qenZsrs}I}nVKc;{Am1K(qOsRUE~c0gV8z*@5ZR8L-FzP-=cck zFC4oHN=pa8jYIstrX~q)k2d@vr%UD4LIoVu7}EHS>2^+1?Qn6zTa z2gkeP?G5TXyUCt=E{?f-c^PMGy8oW7v1zLn2OBY$TAvUEnjj8SY|436#Sk9=p;0OR z{iD4lwcHvS8rWl%$sg}d=$A=IdM@+@viKSd)`rvDMn*>~fVhDj7=uUT<<)ivX~@dj zsPT`J zov>|ux7dR!8JzryBf$kdAu3=NR`6}}{*V3=vz63t?m`hP zglbgt=#=(j;a=}Z;_J>H3iVP?7jTfuu0ki9?tK&hBu~iHl$8fTPTg9#j!A+zOU%_C z`@Pg@$0Y6Bl_hTc4=Rh;&`33@m{T>@fXA5Zr<{hrm_j?r&(MGR_cqpJ$C{&9qhHZ7gADEDJdx?V?IlBbI-vdh##%2l>L!s^BEYB zXqcEiF`*Y^Wv6rrl1WG!j6&9Z5V=%yak>|o#9f1*=y{%KD8f7WAz9@PM#fEZ;ONn# zc{%aX1%i@Kc=gS|Rf3O5#!?=WYp;sdCB{Gy)kfLv>{3h9 zy>{A3YcBYW%IfMLi>;`to86o0o%?xscshDQi!P1r_zPUL^hPN$Ezo0fY`ewquB`K? 
zY^wnm=$p?-tpi1=v9YneaUNAKPTV0({%G(x(9*d;@)uf9^t7&FRQ?h)n4V=q>$#V8j=c#@l<2xZea^ z#CFC!7d<08_mWE^>mUn@oyN61Yc15Lf`S4`#sE`MQPCVEsZP`S@|B+low+lD2$k3ynt0hUfdwL33e0_Sl zGr-@!e8zzU#Lm(Vvz$KP2r-JY4T zCsaUfEk{Rz=2Vpm&~5aljxlyl&iM}Wq>ix{DJkteHCN7_J$u@=D;n%RCC<8Fb1hNJ zsx?D#IsHpZ_P1un>#W~1v~mejVrXbMT1FvCNwnnzv}Ml6z$st>E^Wq&YGC*2I)wuI zej8`9v$ON&di}Nl*Ot1P$`az^iID{Om(3V3frCcyL9_;u?jzDf-vlK#!O?NGNd&KS zTwAyyh1Pbu6T9;Wpq{(xNkl}1+3_4`68C9NSr+TFJ93Q?`tJrf;prRS?&7!9vyINj zCnRXSwV(uj&0VD1OG-*SCQb<1&MgwE`Mux_-G7 zj>klI@e_~vUzc$Dp~dZZ|A4owHcWc67RW&rKP3v84=tjh z(y#rzLm_1VrqGh9{Y0TMoY*22$#`87C{RZfkP@+ij(pgtfNo1;lXXqds<&T$x!OI1ylq(>3ADJC4dUzjixKi+X&7;b{BouyMK6DMS|myC>^ zpFbSJ{wxk1VH$uiee0eAs34RH;zU91|MexoD!&XrQw^x<5E;@UXHicKQ^X> zkxdZvppcNE`9^8Z6rR15to^^`l_YEPm)f8IQ#~fk`RxT zl=eg0GspD%jTtsdel#&0s?~j3URBJ+^lO<6Jv}|d+Joc@fM|$|+11Jae@Hk)MLGyS z1TZAy<>jqtY;1g~ou9?;xB2^Yr|(=Hr<{ew%l@A~jetqp-B*X6ga8k8ejjbkjO!m7 za%rXuJtApILSF{(X#A=ey?X!$LTLZtlO_avX20K21zw$o!vk} zc9HjL4!=?5-emCIfV8x<{;4V3D_J@nm{BhsmL{}{i@-evFm}!0P|yRv34=lMCPP`1 z101q;Y0c0wCaI>7{{66@pW=vwh4?aog0zRx4CJtEAyn*2bT~vHByXBh6se8z0`tTO z28}~zv2*86I$K*?IaGTHgcABomoDXR_yz^lmRi9zs?Kecvch4lbe(ix>M##M&q+y- z^Ca6qiw?#SV#Jh{yS05+Fa{<)RHt{ZilLMelrkVNFmM|k)Kfvyf$NL|nD}``(2oVK zOMe*4G(*eSPHsa)SlUzkqr}$!8GxfP?y1Ah4kq2ayX2^IK3}AmY|%=;V_dzRVH?BjqsH&C-I`SOp2?72MO#(^ zB?E(J^YTwQ;H12*wkw_acEaF&w!n^ziHV8T$HF9xtbe?%%lZXS62^)d(;X9 zE<^(%gD$|3je~;$NKF_}#Du6G$n|`dtr<7ayMIje$a+e11{n6rDJm%NLFzCe8r96~ zEPqK!NwDSjf9R8Z*KR!iqBSQRX*l?e-tv!aIT#wuaOOT5ozv!T4^;Qyh&{f~7B|9> z)k?}Z{g(sU zbsm&A^+t0l0l^QQx*mlwA?UN_eEZHFC~INK71VX#|Jz_Ke6at}@xQTytAsE2B}qCA znk9B*5N#hS@%yI!dxMs-m;axZ>;L=j48CJRTHG{W;zjR-g;8)s);0CRjT4D7q^qo~ zyohcMV@zR!ZTL~w^DTKrB+@BqXsS{6`zR>VuD`P+Tt=|(6m8)XC+-1Z5Dtg~ z=Jd%cFW;kdJP3wBi0>l4zujYP^dUKZ{P>Z?qN{)($fvk_4G{GnR>oY+7Glo(wcYD$ zYi6F};a&Ac+4k6>?K^fXE<^>HC;}U^D@DE0WjSx-|68u@Vt8kAZ^G|$f8Nc*@)1Eb zw($k8$a0gs(!Q@L)sROUZ*JQWE#)Hy(TMO4PD44tls|arP*mzBj#P}e8xMh*aaxF` z`}Rh20OkUk5*bOJZMTw=(tVpaLM|oT0w~|@ldsSpO4g@}^Gwy$8o8?yf1GM*Ytta3 
zs1pRLMsEjIACo!Cnt#56Lnk>;X7jn#%i77NY1Pf-yxWrRXy<5se%&)k@U@#u(|$+G zv=+$+HbV?H^aH|6fs9_$+#E@qwC**?35L+r-vXz>XqzEStLNtrfQp#BqNbx$Kqs7B z#E7~K)X8wh(j=c(O--%X(^BHUocKXo^6_+xky;4NL?_Oyw-#y!G7-W$ObJ1a!};!f z1fA;*x*cg4zuT0~Zc@639L4hu!o7U>8!DEN_W{DbsCZccw8LsQ&<+g^-JQ`P^PtLc zKJsv(ye2mdX~oyZw?QT`Qo{d<1q|BSYpN5PD)`Gq%<|$n{C}?B)~0j*jh+%waP3q) zWfZ8^_g5OZ(4mX_Q?v46+JG6s0xQ<_08Bdp?g2^P6ivKju~nN%#Qr9rl^NZ1fx`fu zq^EQ29E5#p78bt;xrYmd@iGA6`AE;{uF0}pfY8pwjDr31XK)93(s2nc!Y3wH8?+iR z12BkhkZ4H*U#-ELCLE{Z*WbR>^#E3}>D5SA_b2or%o=hQDP>HS=I*wy&&~n=-Lw&P z`4)i|L8i;@5m06H!9?K39DCg&Iv3Eoqch#={Ra;YK+d7hD$nM8VUp;ZIKLZA6wnA2 zqE!?C+A$UjGL)aPB-8U0r%GW7iOL^75XKK^l1c5S*CfYAV@>42>9oDV>=jRSZ)}7_KgsUqnO>R)>SznG#B&Yct(6x1b|?)P=qu3h~gdM~7R4ldCLe1HRp%_4Cwt>;CavN?h00+Sd3%Q2^uA<+UD20|vH zqNSC`sUjHxNF)$xSAhcw+7L@TtYE-;Cv+!J9sepL!OfmQY)ycR##QA?>TmeGTbZV#{rM99|sWUsCcFr0s+?nJx(r| z?yS4J2u_3f>fBIXU4>e_h@o)&d5E-driLMGVPX;u0LbS~d3kxauSVweDgaCOet+6; zOwHIC@RAOy<7kjiK`TjFq$K1UuR z&Jhm(prH0kUT4*biD1;&Pwl@v-o{(epW2$GYn9*i^3pLFeM#C=xd1720pU|cz+y9C zR_3CrSO(^n?jH^dJ?7G-Y0s0kQHK9M!Ob1~<_$5&ZM8Jo_BS=&3<{EL7(4Nw0FWc? z;$-I&(5pZISuwFg7%pC+|NrTtxp+u0)RA?vpbUa%-lA-R26?sftUrLJg6ko2qAwo^ zTT|;#c&S#!dAf%=vM%bE3q%gWCxnwC*hO$S!X};@`b?w}p2_$V0T$4=cdvvkTaI`7 zCL|;fE@+5w&tF19LV=B{Y~$*FoKh=&JBsIEWe<E0xC;l6*JW()L0RKDtE zH)L1{PZP}ejmNy%+QMk$l$0An8BoaVVY18Kqk_-Z|JWgA*<*$fO|{*C7(~RyFT+0I zPIyIN)QPpFTr>=HwqMybM+V?9R~lsxwmXnj7+|kK`Dl^LfwY=_^%V^?AP6Baw9pbS zAm;%{x(8-1!TwO#VhvIQ0|TR-`Mjt~qFj(W`H~?1WHcHt>`tAD5k{cv5D?ffo~WX? zHwm@vxjdDbS01E5^CzR_cmR}C1PabOuSYz(YpI=R`Cl`0i-t+e?FD6d_e$L9m$KCFnRA5Af*!1uZ_7*wr)X>MUePi$tDu`4$3Q?a7$crR z05}}19)U!>_YR*UDNnkBgM(9`!GiG*hJHc7Wl%5bhp!Y~*4BFO7LgnxavJU+L5ixX zU&4)D5jkqp_e+4^gr3&=#%WlUkfa}TYkdSyv0fyYD4uf!UMRGYKeA+HH7N7M(rPuM z-AuEZ^lSx%G;tCzsp+A~#ihFw+9KBH&mdXF%X6Q|t~sue#pL&Ub`PPv;(eh?H$r=N z;G}%)Xxc{9v@5s^xV!m{IP|R?lo7%pXOT1`A>Qr*8t$j)N`%1=B+|KLQk19>!G-dz;iAByO;1Z?6%%7XhuMo4Mi51MA!|Ak%wBI~APCO?*197CJ#}DoRPoA{L;Bvc zyI@KU5b2@}3ZX8Tl90vxVb1VXxWgP&L9AzWu}vQdgcsdKC~r_@8BI+(v)o0k0RYs? 
z?<}tp2?+QGIg6#}eh-}Gfy&wT5P}szV!f;JnMWPgztBwji0mI28$J1jf&Tup@C`{C zC6}4bShndWekRa_5-d-zmKk7tqT3C(ECazAjrkiEh={;>yp@PlQbn zFKbG_%-Q3!tP8oRf>3_o{sCkf#o7?s5&9ttCs9&#SNyAkZn%=b(Nk63V}3}fA2fG!{*IFqUvf(4z&cPOc-K7xvWL9KXf zEVg+7h|!&~yi(NZ4HSCi<;4#?&%!^M2=osR?|In&o8Bxp2EW+(2yZY1W*tH~mRtwO z$X=M#MZjJZtw;eRdj|eW%5XXe<@-V_>EXkhFpN(<+t!*V{XT$<5!5T(tD<~MCPtud zB6{I=KoS)tWuZTa06jWw=c|f_&4<4maoRpZMM|0~YMBe>VC%y9?xr?GKH@2e9gGOa zUZ3SA_@$@oBI<+6=+3pQTBt{;i}=iNpQwwK(6ZHsAoWtORP5O6atFr)=!|KAvIGM8 z4bUzD^~k!nEP-%jy6^6OkH_I7z4P_!*NOeiZ@x5LA`o*<%h1fs3~9a>PFsw$pAuvmTeJN=PH65aU&?T%T+cJ4M@W;rw+k<5mTNCOd5qC^iZDjJ zp}FJ@EhN&YHHK6)brA_yL})qK+5O>UKuN)i@ZSC&Jk1?4Bw8jp7}yImeQ*Pz3&RiX z*>5@3GVXb7#wSsU%y1agk=3K93TTtJ- z$W|hyNrb7Mq3T+)bk`rRH;{pMRl|5Elv+WFM*IUmY?U7Gd6fWi=KGP6)WosH;LjT` zJ<1v}PPe;0$#3#UMAXg`C&))LpQ&8{I?|qj_k%EEo(AF06{=vCc0hP48dv}Px_ngY znanQOqjfmC1)hs1QTOj8&qx8-ECC>3Jg*`)Nia)bg-_1E5ypuZbtJDa+Awa+e@Nl$ z?=4tbYNCR^354ed=YM>1@_o1DK-=Pgp1DT1_*ba95Ni_6sHTFWPFxOvc7M2nFIvJw zfWWEPG&Ymd+`C&v`7tvfUX~|o+8ttIsz*g42l6CWAW#*cT-0Uk5D*qrh#8FArq}w? 
z5aNlt0v<9&hXfiT!i=WlqHqo3n8gih4_e72no;UJ`nK~{RIi|7@z#!F z#~#a-6WEB7!5H&IL%9CXoj+%F8D;aCemaEoqJ{9?D7cZpWyjdqDs6+eu0S0g#t1L~ z?;wsE0E+2J@(Ml?w+RWwSw)fpk0Qz|IbWIMV}eR4q*iW1p}L1WT5rx2mZ1UV0@TQk zF#68%w%2KB*pSQL08={WpFn;MKgfyVha7u)+^(eQp%zepF;rz^48DMiuHLRBi#x!XZSx2Z2sapmM&2U=P30q(B!meHB8BR*s2c ztV@6)av6_?+~E%01Ed8+&U?fR)=tf-dhxYhsdGN4<%QV1zX=`-I}7m6Ds~O^UkyZ$ z+}aW3R}o{&=B_!x8Zq#5pucq4plUx~OyAIxXWGK!%jD1d)1WU*{f$wMxA&Ioz}~%m z#6s6auoJHcaO6_Htu7#M4aOo)2@BNxq8n}NS0LM%etmkDh@0aTIE}TOfWG6|EWP`+ z+p5l?>?fJRw;NQ=?nhnCcdgUA`p=bqOz#0$6lJE}V6!>a9RkHUC=Tp+RGcj>4UHVU zRe&|}>9Tbm&@OgS(HPLzIe2EK?ObDKgola_LgX^+N*8GWGlW@u7Q%J8aSV;nf$}(| z{nkb~i5CiqkBGmbxtoAXigE3kI5x)FS&ZNh=Gr!040lq%x=^)sEAcK3ejhO31?ep^w z2qm0e%3B1Y<1u7Np@|dKj1avVSW~VuITUOHa&FtP`{Emev)(>Fqwru+YPoT25~oi` zLE~#mRbfQ-dGFr6Z#u)U*&jZ9AcnP&ax0!+2J17sv}A@uLCj%P`CkpcnQ=+NoUZk5 z^;CE35#Kv|&hfZwULbCYgp0p{oJyqmG+J?EU~_tFGYpn_2*hsz>vldI+fYcF1W~{M zt6LybMd*l-L(n`#VF#h&p92j*4i>xIS?p2tb8tdd$#MUO7kBJknnU*2`4YPXx6vnb zvK(jC28Kfdh$V2rgGN;B<)6AxO;SsgCGNJ?0`;ml5hG5&7KuW$}=a`X(ln z;V`^}i^Jmgn;*yZbNC)>G)>?t!piW56BA4?xEsC^k(w!Tom9fV*jbgMAANvi2OdX7 zMP&hagI@JOnqD2u2_tS>!zGsLg*rXz+OCJ~8K;m(vW|`LHKV zl-mph96wttEk`3H{QLLsp!@efwrLDRT<9^#)&%@U?ps04@j@1~v_`1nCv=NWPjyJ- z+37urdRX~rJh9Jou| zI~v^fgjYK-WTijQ%G9lYHw1~Gu5&`RclAp97Dla13o}apP<(|W)U+OS^ zeSCk*eua-)cRxRWXs7}jgJT@~1%u!21TYCYqCPG+)h#bg`7iedwrd{0en>84I}3}vJT6197Wx~$r`6YtWFR@Eecj7yQ2o!{ zra?*#P#Dd zmAH)0lAeFZ3AO#|{p9~g@vqwvK^-aTB+?}`5#{$ViHc#FJ!mac$!c#r41>WXP09!AY*3!e(D6_PB@0; zYI0++heo9wB9ZX!+Q)sQukYoc88WyCa>XMJ2ep#;%h1m>gTdDSR@N8-K49+g$0%c< z4BFpijo|fg*DTMo+{IhA*Np6f2k#}Xqtg(FWF46`G_1l%d3ouim`-ZAI?Kj9C$5au zN>2T*dr(ZnR_=Ny>GIw`7sjRx zv61)0SjaTD+D}~1RML9@?+cl{bfn7+eu09CN|fZ86&$W|Dz$YX|xv|CTG5Fp7f4qeMM7$`C#|rLCSD% zc7CbwUcdJGRCeuqWpsD!Zcc+`2kM)mDjg8FY3|iJ-{gNPJr_G~t@}L{9+a5VK ztw&!IX=%{=jiq+1R2BP@e~cF(-V>-OVp)N$;g_&CyZ@bIo%w_0(zfhb7Q9lyT#ug+(F zR*92ohz$)HB|mH}%x-K)&Vc!2;@|OjH3oaX_BN zqQl^R6j@%J1)pFIBQ#+`bD8QYoaoFCBcvWUyxGp9->mOlB4EVrq0c}Ve;wU+#GOQH 
zg=9aCv-6dRebAzYKm`rQTPOUU%E`)Ba`1c+V;hP+V<4m*iP&U?%ksgg^MAG|ae190 z?p`U=6GO0Q!@nnnL|Qir*tNs9Dyh|GK!YJ0#W}Zeva#{m_x+8*5&&C|kj;Rs=qoM! zs5m0`adNJIUd;^M^X5h~2N5SESk}IM&j9z}V}=m)(25h+5?DjSACerPz0SxNCb@gM zWXF*sp`l7d$HQ$3!!{m!w?N8c88!hDe-q=hZeHkA$Njqa!nA|RX4H(dz1@uKn8(H# zDa8G*iJsCph!`_S{5#F`oh|mv4tI!e_lu1hj1rgnb&f)^5)et$XYF)=A(2fWBtM8N zt~Kj4nAFAZ8ydvI)WYkf2J(U=F})Cl_s*79S*11nRpDyN-nVvsjTA?T#@jJ6UEcrg zh1KU%*Z=Ow34Or1XXFw&u_WF*y!m2OVw9|->pAfMQ!-bsc`Z#G1r3D~()kFRbnu`- z!teFk{$p2=Jn@vIASS zS9B|kE|$9_Foll9B4Z70tUb?$5jvLZI9>tna@c>rSJkV@IpaESZ#ti)0tMxvUD;~& zwR{5_j$ObXCyCbxSPi|4>L%`horN@d8Ph#LW^0=YO6b zm+jlNUqpAh`&Azi$(sWFXLM{E@CCr-2B2DZL3(w><5TGCB5Xxxtq(*q@iVj)>(B6lOaYN`wd5i8d%nx5u?gvLn5nY3FCyZ%DFLp+k>XBo*`xoNA*n;Q zDd&SoanIT}pWJ~*+*(vfWWZyse0uf{B$=n!`rkO--cYNQ6D35o8zDA>m5d#vs;j%0 zPDRQ>-bN-(7#^!S-V(SmnBer;1TeC{j+2@BYG|`;KRkBc7OSfdEH(*ao^4VMAk;w-MUNONbOmgpnBFs5nplctm7Vkz4p$C-xhBsuHPDx26!b2slHgt7&+j&W&VP~v69AKo~Qe3E_!$Eu5TSQLQDIGm2dD>RP z$eXohOVl*3UE-QQj zpcb+zy7riY2vZO(QN3qVZ%hv!Zv+fSvK@Oqez^0-Px@}4_`j#k#&>OBb;zB&J2lID znwmtS{wRd%|Ia2({r|9vbFfw|oKuLFI|9u)4=PUSrgf_II z*j*23w4NlnZ`bpz%jOr2{^O#NyMoX@sW^5|g7&=q=wIIZJ;lY?@Gc7(g=}t9NZ(Ci zJFb63F1SEQE)OqM!H88kW;>&h=96NVJFb&+Lk};vT_*1Kvjqac81dFnGSbU~OR0WQ zbzk@D*M0w>pq5KV#D((BQac}M#fPT*XcD@(A#{+&%x5h}m6`5zt=QQDJn~*m7Ee0< z?MqV#!?`bBbswaZmw&|d$7G7QV{~j_uQbgq_6{R`Us3YrebfO%*Qt&@wb({Z^ki7ti5#EyRRXcXsm5c4!0bK`bI9fuiL!5{=JEEh~eMGehBgW3)3g}=W53Q zA}Szk?Rs#ifYA`w&=hR#dwphRYYsBSPX|6k<%W|kOzg|NezyN(XegV8!%FgBXpIn9 z`lhA`t!fq8tYAhQOzh(6vI}tiiJP*7Vy$1+mGuRu5Eu7cUw{32u~-aVKB0ueLPLip zuFj#od=8(drVQ_5Kgu6t%DB~^dE>&UpskdhiuR|RkgbOgqvJDc(97?#JgC(%rQZ2? 
z@B0)+0-bqiD3b;=?L!hbGed!r{WQACV607NU+HqjIkj8w5}w|bV<4YQCqj*a@LE#U zl8DzDBj$1vYS1t)4(!~u>qWXWV#~-jeuk2#3gMUV&TM|XH3SH0d3mYg+G|)c$P0YR zn}ER`YgaP6yZrau0qNf}I|!~n-WCR}5{b;}cyTRCN{0d$adN@6`JqZ3#rM_%hWw=U z9Mo2M%tqgYt?0gwJ212yk2_U!AJE6FHl39IAg#ywb81-cL0m)v!B4+@@CbR5?|xO) z{|i$darH;4y*IkWe&MN)m%MzGmFU~FYUHCwzrUizMS!QcWvHAa#f%6Gtj5pI78As1GMFmgLC zDG*vg)BIsd${MZw0lqs@dNLD_c9L*!snjN{omo=azC&8?V-a&;?vOhGP~Nn8{aUcA zZs*11OJ~N;ysF^Zv2H)u9!@%L9teTzTZWe4_U##xe?x1HNmYeV4zc&BhzK1*mbenR zjRtci(}6hoUV-Ev;_R+dGx+cS@WxQd0^h4W6R>Ummsa45x0)G|xAJW#M_xLf30!P} zRA~Y?layJk3c5<=n+y@#BUN zp*}&inDqQRRjFIvvrWp!77LVO+$jXVJtz!E6isgXcBsC@TGG_j9i-Wvo=omCvA0qB zb^i&w4~B{%$hba?h>+=Ltlr6`Q@{t^S>cqU`{7yVOu6lG}X(#2xa^n7|6B% z#aj5J%IN6Hllv?9_~MwQhC+i9U_4rAWkvRoH?@QFh!mY@qG!Wcj$3bUi|5T}! z)N#wtc{wRovv#DEQ0TT1pNR5z(R3$>L^RMag~SXA$+-YhLm2tEK@V;ApOg-BnDPHo zUf797`8*sP9Lf1IXyJa>vp;%9el)r@_T6yg;>-f*&7aYsQ)uL~Vpv0NMK$n9qi?nb zI)dbfXHJ$T-wEv*IxTY4RVD(L5G*6iJz<^i)M^rj>t#q&5&?Ad5#))zJ(ngtI%|&L z4$6+*q;F2i%gbLzfDmdLF2$8jZrmT_pn0$h;SB?Wrx6Sha&w(x3nOez zH?orRKo+X31+s02FI|_?S#>?P>wVfy3XSJksl_}Tt+6CC^Nab97QJ$LrOJ8;!CzWh z8lmbBP-VZs&N{C7#^8rYgIT@V{HfjtEsLY2b`+8e2Vs}@!4>|9a07G=LeEMLlZu@p zQXY^(5EM=s74;a%A^5c!-OO+jnpJX`%+}_^s-*`+$vM#S;)3+;IL4AinwG4|k)SDh za;vL{@j)H{tUvop;#REtF^#Qu(3IJ>D=H^{%;_Kq}nMLlCel0%j zVgSMDV!O1;>d3xi5%(4S)ZiFPE^()RLfpiz^${1FnI%1=Dgra9pS;#D@3HNM#E+bb z-NYxv0^+!wto6ajcYOkLLzUs#(}j*S~D4uzIcvc7I9$)pZbAQuuYSb@6@#nI`H!y zeaOf_mu$X*q%iy=LG(a016;7mn{dL$<9;Wo96wF8P52c`}WkHY<{ ztgMU(6cJf^5k;V-u1*HYi=k&a$xZ*ss^YWI$?alh5!_VT4jec6sV9N&{=0EMwj)+#AnRI&Teyi{QRG~a=h6sHb>0>k8J;lB9;q;;y#K-1g zvd+MzxeObC1PfG+%NB4^jll?FdLYv%l)245ccGdwuE~hJ?KUENGFdSQ2;85W5sa*#BciMLHaAh%EWtQKAk%?{?eM02U#77LO~s*fSMk zW(%5!cJrwt9{&^e$Pi-1h>Q>)d6SDz?m@e-alwZY-0rYWg||(ljc{pP3OB@xU3;)S zZfDYU6BTx~egkS1X(2 zt5IBTECh(Ao%v`jH2;1F1uT+K)INgjqf^cJ>?X&dEOzbvG~5c#&Tn>N&GeOgEnkk~J~noqPWN0%E&;a1c$?_E~;-?$VV3fM%5FbN=C zy2k4x2+)~*yt^-oS3eQ)?HORCukd8hr5_E65D^ay6x>NBHDI=+%y7D}YtJ5P1_mWq 
z7R3FQ6>Px1y?YHH=@A!KAb73jHR|Jy>vxngR)_i*h>SNsPL@AM4Kw-07=%7tV3M1^*A}W%l#ZEhPafmv<4=4|uV)#HJt>qFBo0|VHrKBnI>zZVv==Y2;*4AtTL{f>Lq>uqe8 z{GN^*UH4R*qt!ik-nLP4y_lME8*eEbg^T4X?yT*}+_tI`r!+!tqVCd3@c8-czQpXsD(h~$mF&IRG8J`I z_kJ!PuDkq)>=&wDJh(8vM?TE*-*&HG?An_5u~|*q%S9=#shd#xtS`P%*p^jYZLCRp zzguIchMQz&jQ)!+osS|Se*RfGadx)9uSA`Bw(V=mJ>vlkjqtAFmJ{O=+8mMZ0;|71 z3k^N4R#dY#LF|L-RHALv%ZpJ)%ZEQ~lzWJ?`HeyN!R-hNeDZ;d^Kmd*-RIByCi4y3 z`GSHG^Cy4`gHRB`wxRcDXNO5MYaX|7QYZ4&Ydrb?^O5tE^)7{ zykDK~o1PwdNt|Bc)gjtw{vD({R1@ERDIUtuo)$enoq;;0LC)B}`H6y}>NETFzl$sX zm`_LdkiRdUmT^9Ph=(*;UOO-fk4(7@_3-L7?&OX;=9oGhIOylc^apuE@2R$}>{%1z`y=L2Ui}9ad+TeTHH$ynn{6)e z!8q*0<_4e1-Yc)CMuO%J6R&|yUr|BPtfS5+L&E1~?cG-^#U3-7+56ufcaXMKrPrrP z{vuE9@bk8fD8!omhnC6@Py!;=>KbO_18z2YYW7I|K3Gz6`i};zPSwD7fr%_nF5e#<(mOPDEZ^6?q}Y`eV2-EDopR7U50j;c>-9Xw+)r-wt6@=wun z|5W=se7l>VGnT$iF+02UC>w*&UCzWek3GD^g*b@s>bg$djS++e_Uz(Os~9Q-6abRn_5Cd@`E?a-J}<-lkT(V&ZZkbl z#G19;54!o`PCr!U@z_POe)fjJ|LQ-lOa>c=tPJ^!Fdj zY?&h#LPja)T|HlcH_x|gHPW?VGF3Zlt&C=n3LH%T(ZOnYeF|lsY1KxYD(a+Z!ntE?l}(ra zC^g~nM0~8*+gR7O_sIVt?X9D->eudHFz6Cc8brFJL0SRn5D)=DIs^#;=`xUzk_IUO z1w}xRZjkO$y1TpcTf{+;k5d9OPESt4zK8uiWnwVby6Kz#)*KxpD>1H{ za`>%T!NZg(;(><9c0uiy0G0PkxV{KJ680WPpcfu4dO?CnY~khsc@8S<#b^KyY3vOe zTaaUW$u@Vj084EdT)?2?hARN-)mC_^fsj})loI4j^jZL<86hD2S%ubh(ytAdKi4F` zwlQdG#VX0;hwU?2S^7b>cBo@^M12JDJK}hvhqzE}0~yuT*N(r%LLu-#1 zQlYiahfj~R_E1&=IHhepIScN-s?=v9%xYIw{`3;X!nYR@lbPvcyrmdBReefH6*6{T zK17f4L>;I4x)&vtUw~Zl4xBw_?klx9Y{p4ivkyw{NGFA580PteHh_3o6|^lvua*da z+l2V8ftr}1i;oDL*T%~FpIpGFLmV9yfwj=~Coa#@hKBFjpN~0CO&ckjE(L%B%>udpKji%F#we4$?K_5R5Gq1&(Yy%s?0q{g{ zull4Hb*K{Q z2g_fz3i~vL(-GW2t@ji9_sm$p`gLn0Eeqz`cd6#eaES#|1AjXP+OWB7Ph%T3$TV?( zIrdn4S~1v58yGN0W~cSrUc3Fy}S)yiwFoqO`wp3 zM-~l!b}~Zo6R=T$Rnl`n^RYe2&im^IO0Z+7scctW;W;se`6eP=ML}#>pq?Y7rpCJu zb4HN(%zlm4pMkP>4xV+TPke!(U+ROY%bKtae51`GZiHyKzZm(>&OS?_!{G++d&76%SpF55``4OLki3?H}m2H5ZsDm65ZbZ5J_fcT{N6W=U5ZT%Q zMz7~Xd9=uw5j29LP+9`0LsU^p;h7zr@-Q5I7jx+p@3jzb0Rvt?8*LWoR}qr~061VC zlLiGfvgtt-<=_XpGuovKOX1Xui_uR#0RgYm9vaO^hXf*9l|gOxt>eWlpG%8pwtunw 
z@U9CA!U`WO{;xgPjhi>$|NLoJyv7Ua0v8dbnJ|Omt5Y*89RW1xJ5!M> z8}u&zFmC#nmx!;vFW}x&@BqR7>MNYG;Mbcx@FrA`IG$|vI`cfTyW7w-wK~@%rQc|& znalGq2;Iy>l8&=%`0{~@9?7QDf2vB=(DM!JA(TkQ2M#>Ge!oRFUyBD(DGWC5Eqk^8 zqs}R(Jgt&fDlEP!VDsy4&DNDdTbm>oMGzhgKJ5G+9#mRh!Q)VOze!XiIwa_Vo`pMJ z&4!8R)y5CLm7|Z~X9XDCSYo2(jP9oo-pN~&g=DA?v61@mpIP?K+auRiqFAVBe^TR6 zVj%VGzjoIWmxYXciAhGaD6m58y&(@Cl<@Y_(sHh|3PIAzb)o_9l5prBnw*o{{tmfP z`%J6J`ZL{$U_w?A^PBQ|b2qVW;(s>Gt4WuF=Q(_BW&@8-rgpPNs>JqlW-3&|b29i6 zIGHLsi`_kx)7;q&&}(Dyj(hh|Lkk z(;d?<33D0Mc?wDI+TIPtgVIF$m!j)ESR+_;y^bZ(X8(^Q3PcY5s&!nooyzO)K_3vJ z*JaAHeI~NrAEgVv&7e1t1(8?N2yc`Lexc8Nc`~IlGfRka&+%Sj#39L`?-bnDZgq9< z;AJTNP{%(4rn3rBFaKMXb|hi%F6ze8c;M)}H3|!rj`i<+sK4d(^PJd%X~51gP{u10>ihOO)qkhI zu(`=I4{-vw$SNgyInzg`=PDr{J(c;`I%lnyLp%Kfe&XVOu>Fzsrw7+EeG`*|!=>fF zdL8JmW~SFSrdr^OnG0PXMc=HlP=5X=+fv!~pKANgAU1XZBVUI(vYa~5Vc^>#*#A?C zu9>Yxjc?}yeNTbKxi}ghdEJm%J@aGs+TPT_fO}(b8>-M?HAf^~c^#C1-PlG_w_j7n zr_PQkt#?dAlAF!Q!azvExP4O zmj>~vU2A)LSzWxq~{jluT6l#OEHKm4E5=)@Su0?*%Zr`A&C+{3Tmzspz~2b5-_%=*>LUAY4ZREs~e%~hohA~?9e zd*q#dvJNzIX<$pN`6WDR@(hHl$T|AFHw{47;NMAHZ*E!HhG~^7xY&@nDGVh=0g*`w zkR91s#EZdUZ$BW!D}FH~Jk6vXNk%RS<7{blkOqi(Jgv#-tH*bXi6B4BgMM~8J!Dg( z*c}6fJJy(N%#<%7#@5N&5V(Zw0z9E~6=6i)d~ZRqrxy~qsIaGbv?=aiQ6kc~f!)+P zjG2cgRJ8f~<7Ke`&(Mz|&&l_+U$hB-@lU|S4`yUC+Q4*Kvx8zA&^JfLVHTulFmPD_ zBPO!XfYIV3VCEvPOufVfOISeb&HcU(UE5FwV%a8BQfk7>3!@|$b*8ZMMO64e5d9zZ@4ZT-gLpGOKgck#Gwqs5k^TzLzfYh zmw}}_5)mA|rindrW*@Bs7tW&Rhl$pFl3iyo5lbGZu_HH5U}XfTNI3+740ZviJhPuw z2sDSfKxvHdT!E+{1FEC%k5aDyuNR?Q%!AgvYJC=lJErfANcB`g=|qJ-w71%mHsKB~ zn=~6v-M{BOCVb=QX!}uioRl%F(ZcmwxC}2m<3!2S)d8bL$}=}G+A5|hA0k@YJ{ZYe zgEJd$v3m8&l@MK4x7T!+eKc-xvh)LeC@GD3;l! 
zo`=7{ehc1fb5gXJCAm_9Hw?mk7pMVI#7BD?alFmtW5qd6T^uAN{tc`yv1qs2>Ga0+ zNO=}D6&^D;(RKgHy#Fx;dE2x)Z`)(K#wFVSUXF3%Xhl}#(-c!u>I(FO3q7UtF_(X9 zL{t**)xly6QS!q?>bfD!h7mn5*a5h~;b$#@;N8HOfMAFKFMSnOi3pw*WaS8ZtC)tf z1`)@A9E|}iZW0pD!X8TJmT+51*ipwTS2A#SmiJ?l#JgH?Q+R*sIt{vm!R2v(z75_f zFn4|Z{GmfA@@tY6G&H^3FPYanwnSx_&O|Lmxp$qUzCcq%l-iqkm&qv4>4#Hjj(*i= zO}1t<4wvW%a<>2&#kL3FS+=EfHW`wlfk%XjAQ~*{@}-{w{4=#OR-0v>Mv~uExzP5o zGG)v+y7+l9o)2hQF#IxuW9FF~P;a1ZdCkAAmOoQO?TReP7#OC##&MJ3I&uS$R*98CJ=~Mt=#)k})B~N4R=l%3%F!a=xW=-Q7|SBhqVprgI>(@;022F=`mph%-B5uyjRjuH>|b}g0>{otwdiI1S4iZ!sozR$p@$_x5-&;RpI9WNiv&7^v6_JYXCQlHE0H zEox$7Uua8U4jxJV9kk&9#+n0X*%{a%5#H^vn>)NRP|jZ2vu(#Tydomc#cX|*R1xxf zn!ET{WaD{&V8qdK-#`Uf1{`>t;L%h>9Wc`NHfyo#Kg$fz+(T4%^4< zd>1u+!Gsh8J0IRR&6+w_gvjyU0$CTb?Wp1aUQrwTCs;mQLtHn|oTh^X7?H=p5_bor z5IjJ%_6B*qwS;~a%z58H=ZGMek!6`ln3xQ>9V`d3hm(?X;p0<*x5>rZt9;edc*bo% z68F!Db3}5%WrJycO&~X?ah0D8D9>Bbb1#a=gy$X$4|>Ha_8w8*3tc^KFHNd2<>12! z_I1s@I9Poa`q<$r1OHAbOeo@hU5Kp|enZu^!&7nJ=FO-%zRAL(L(%k5dd@Z|=!1#t z4}10#*Z7AZUZ{d@2y@qSc1^XBj&{sYj%zfqt#!2L;M-~L|3r4Z)@W_UuBSQ#iNhRUbL2PG; zTMc;!)bqmGF3-lp5UPPm(!bY>@gcyh_RoHl=uP?TSc*wejjQ#Z>%w*@nS^%hP4|LrMVHmz3VE7||<`NT2!^Z~r_UmU!K*KiSiC@$A zI@xOt-)+xIJYv|}+nyS4wCgROBr5)OUBlbFp?@|o=tKL#Y+!(CY=P{Zp#obP6mttE zPwW!{?K`(MM$!DQ83~LUEWp$KsOAWb_135RpDi@b##acIhNC{bXM{iWqm+OCdcWL+ zROZv*+He8tYQs3(K?^Y5N9>NU%R|P5uyF4iH-M@|2L@j@H270sjhqJ+3-VWhc#4D? zfSRCSI~#hB?A$N(87ypGdw1+S3qxNk+9z_(HO>uEycDHBPbHpJCv*^&hTg*dHl6T? zsIC4XGk%<_VMNoel=$fNw`srKwm*zUVJdi*1pgZ;cdv_>xjnl-BR_)c14K(L0y?es z5&<__IzZ*1I{KpP17RgdOaa(aksTD`&4t>a9&GH^P6)L3uWvB!mT6ZD5mObIrW?B zo2fVrR6oREH?1XJFm$Y0XMnD5v`0hGfv4ie+Bu)*+l=k|;X;owKgr}B_798CMZx2! z?It|T)cIpwSn5($aN3C>e3k*5Yre=&k9jU>`aoZ$Tn6URb7nC|KLm-jw}d^(OSXbg z^f>h6etdmP`!1Y;&|DEa)K)oe2S??V1z>Y#v@5Ooh? 
zy~+ACwPycse=EsVY*hfEN1WFct(IZM^8X^mV^hXQber~+K1`e+Dz3j2p1$K})MHq$ zmKdoqr12f2pkjc&mH%8qQTaZ13CdZb=%#HgJh~2A@#A7^F-f_MGlXPU8Hw!8K7)o$R;%N zWOZJH<`-n7mWr>)FO@rDPx$Zhgo4PI%VGLHgTM|dPz{Vohj(~6A~XXrPFGAO8+F;7lE0l*yq{@L7 z8OZ7WC!?+@J=j+xjP`Iy?T<-pG!4;x2rqXR5=@#Fi&Yo{`ItIg58PO;umRH8GsEM24+p)kI~ z9G5T}3FR&=xA57#(JBU#1>odaCP$I9T81yC9W6Cp&h`|Wpj`7K{xZ#rKk`>zK>=pt zAETl`HzFlMzM%X45*^U;4ULT#?gT_cv!CImO+Top^ksiiEB;AOw5uYame@0$ZSD67 zw#H4q3B&K7Tz@h7^){S^4Qg;P^@~=B1TSBxi{{`LGTq#2{ZpKrGqQ>r;JK7s&b9}B z?cN_+oRU&fAiPDmP}E}XTl2978M-wPtZ)W%R7-aU02_=Rc)=;m3=mc2Pkhrba729K z<6gK_!ckyW{73S}qLy|IxJgGR=Bff*z#q!@8YeyKB~)ieQxsUsv~D#N7RX8FIIZd3 zxOrMe9vSExj0&nF2gCClpF{_5%NmD|)fwj-^29?W^jKYL<5LyI;jVWkiyjs%{5tES zCF|Z}U}63G0n|}RB_$lF_>PWDbM5Jd*V{ji8nzDsCSw^kf%WXh3*RS)%r}ScA5}AvfSMu3qQ;*0I+jy&R%XlxX z4vv3eWiR+L!1aiz>%r*)(oV4u`&iO{Ku^e@0me_RU-4>>1w4*loHi(7bd{<4E|%mb zn|KEk1I=_NT|r<8{gt7w0_s0kHq1#FW7q4Xss%SYkr{?tZCf<(3*x|KMeRY^KJ1f2 z^mfu(+cFYUiSf+8I&4@MUu;1ZPtodsbW6(cMWC)~%>3`js&pH$F(ASmXxH+=k*W}@ z;$gCeIH|9F1}#&Gm-z5^r`fE;GJ?*vl*M+nTN14a-M*dAsqn=cGF6nAPkW)TA=xHn;PCOe#Gw&|U2(qS z5qlR zXdMaYlU}f2-cw6FJn{otk&v z7johDyR?~*l~l(9P44fvn4DAV;l(8_>pQqxt*=yZl?G0rzFhfJS5P@X& z=eutTZb8#*P`1+__Qf})*k<8zDKnf^v%Pcn9r_b^So#iEp%eq^Lj#EG$heKK0U!zy zgFy!}%7`!VzYC%8V0~3Aj7(rqAcUe=1l$}2OAz=3)D1+M9djQ!x?oOY0;QagceRcY zPdw>F0U1f!%+UU(<;2PeJ&2M}{$m=_GrQWRS>yDR1RxRd+*@og_#AH!d1G+!Rn6aB zXT%?rXSR7ri6nF525c@rMqK&(qrN8<-psa|k*&Tp1tfu8-bW%HEdSg}1|2DO%Fp9E zLF>>n=Bd zfSRkj@lVE=XKM_s(XAJ$`Gm7X%a5}K!B!|pm2o#3rlnpID-VM7mG@C2WT6s(XBbY= zzN4cEle@`P9qsryzbjq~()oBss^3(+Crsap=CWGu%Gv7?QL{wO>X6hJG(E%(Bhy5? 
z3(6#KmmU27@rAVb%*^IUX%xReFR~gh%mTV%Y==Gx)rf|aZRcn`$G9&5OIIAAxzJzbe`&Lk6?(~gEUr=byPVitHy0zEq zIt-unuyW2cAn0n%U=S_u3Fx8LZw_^fdOEw-zWRJ9d zi*L_2b%@wZ_&%6*EBzMk@AgS-l*nPz!@m`;N$9*Oi=F5Xyt0B6jFi=a|mS>7i;{>KZ+8MgZq=!fKMK5`}5?Z2xM0s#$D?E zijNNxxcaDeFmf=(=2i2l=hF8PPak*2RqwIiO^rT;UfgJtC^Rc2xl-WXnpxRUcvtLX z`6r(`dc}XlCLtmM^mhXRlKS?J{rJ49jr^QgB@z$tqlPy()!Y=359dZ88>=hKf;O4uqt*DFQ2xER=Mk8yhW2(CrM3sN+8Z4?79vuC*m zG-x=FB&B}6_e&{n=F3)#p&d^ciIKSQrmC?%Yo^V&)-4O#Z00mSe=Fho{p7y$Pe&!n zI}sxAZrb7WnVm;R8zm%%wMr%V!JSP{iTjvPXNOlBAKJov7tkayIp>o{{_I9slkuD7 z{+|qd$=)L)&M%#uto0%+n7`ZOTaACi;w*A&DL%U(srlvIJY7L8>!6M`agmIY2(+gM zVWGwYTXym)3e~Im0RbkBdZF`GDv|MfGZqa7@+hBOkMDR^y5Z#L+vXY}#EYE|5|_T= zbUC{gkFzm|81Fy&n4XSoyvBZWf!|%bxI@^kDTwo_K{@B`VL4dwm`=_+R{q+S&1Ga7 z|Fwaw`uD3;m{C9UN8Z4g8UQJ58fa%{5HtV|S_q%$z44EP z!F0MnUtTlhh=~%FD%>wZ)W24*zQ|+qL9zd z9Tf5Fm89FV_b1|p(ec4}EemUwJHs<40$!<-AgADR&GY-m?eiknjuW)LPDkJUsoP4c zTd0sVGsZSc0ND${?1#+CN=fisVjJC>4n*wH=#kYJEF)pwB!9I_(y+V@!3Y`&9zF` zlbF%6dm`M`ml}`n>iSAv=m)!%F{qd~pOrlM_AGRHl&R;}p-J(tC2Ve2*HufK@X9+d zI8#jz);5TGyVW>#v+25CK+Qf49<}XVCAW0` zl$vv#d4+od*elcPb*udzyBDfb%<36jc3$I?0_@QsXb#;JX`h;U?xE~h6;^E68j{se)M~J9i z{@Bgq5qnQ}Zig{Gps>yz+u^!1mT{>^27XUwPOf1bYsYxphvP&*Sq=`@ww z`*d#OWtY-d==VS@Lkj(#A~0x3fl3E8Wg}>h!8*JRN^D5ya7U1Jh>r`59bnirP-#wP zDvL{sJ9tEq{B-;J;_U1Yp?xA|g^!8$eumD7s>6r(`2|lL4kU>+&go(_u=+Ht+uOsT%&7om@je%XlPdO3S8%wRwL;j2I@x9ed(UK1Cl=1lO(`J|F5e;Fs6Dt$zH-J6RElV0jwPJ1&0#TM$1)7Yw^SFm%HjyVcsb{am&U_XO_SVvti ztl+Mm9_SE}7rm&Ts&k)FhpWH#tjdZixvv#d-|Hr|*B|7*1WiZ%%U1-4Y5xaLWl&Hy zlzVZ?Ab8}0t&5SDj^??{q60r~)%paT#j?n>Y;J?DzrUq=h3IC@*l(lAfUEGjvD+0O zI9MTR(5n{El3Qo8+B&hQ!eCt2am z)x>_+li-sziS*ENenyf2w-PKX0E~wZsw|+vfE5U6XCw%n2FPa)0y0Sj8#1~N9Ppn> zF?4g`Xcx7)zx&OyNb{i&;MA$B#wUB#bDSp~k)pGEmiK(9v$VRmrIrsFeOlL?wnM!M zA+5L-Zs|=^5(lF0*dOxUSWfY$Esi=0&ru$;YWT|kENbd(Jn?T?$+_IQAYseXri`uU ziC5xt%SAOftB{k`U6MB+@|k@U_x5b-03ivmpCg2;Fa|6-$) zP*7>NJj?ewb%wwblv=P_$iW~*2(A+n90Kl!{CDXfH+e@$WOU)V`iRhwN_Zx}c|$=G zG`P|mH)KQgC~UDlYv)%)^boYl#YB?B{t4*SBH%EGNwg${UxM8caVUJDS0+zLg1JO} 
zmf%-s!`9J}>V07*J(krc?H?9s_LEbo0C^Xr3!r2}oEy0Hj0Fe7Gai%}06ufxbOsG{tqnatAep)Wp%Tw! zbOpqohM?L)f+IPrkKO{(k)O#pih?q*I&QffUz4K~hG;XU#a%kvvvw762j{~6Oads> z?pXo`Ye~N&ImOHqG-i(`BJWk0TV;^|)-EB&y3w?2j#7Z}Ug7NXE;M=va7MHtT0Z1* zL(H93f-LN>B&B#@UX~evv6@KcbBUXr+*9~Ho$jCV!+hV0RE&IPHr2t5S$6=60F;vO zKXHO~)Q~l~0HQENJM}o{X*9xjM?Nxijq}pMJ2X%g+HAUiI=2(P6e;8E9~Dzy6_=?T zO8iq+x_Ec8DfoSY19MyU_Rj81QMWs<)9h)~_op9J?20m^-y4VGDbja;031vBu+b9E z3$O4BOUMPnp}Yr_OoEkx9K|a4foUwQY;7n-_$euqV6tLAhXJ}HCRNG%C?GV^{2?sN z3SW0C2hP>%Q@jk7_ff>p$A&(QZS6R`GX3XEhiF?2OiXzVeekvGJ>wT`P0Aw;ly2bY zet8aBW1js@C(_y}37?t!rqUUQ=M7Rf$@27(a!u7RKW*Qt7Z+N7FI8Mq1oDdh*q6SEgDk6ZtKiL@3Ojj!phX(}rXun}t&wJx70Hee?$O z-|^xy2~C=0M`v`tF<3MGv{YL!9b03D3W5rPcyk3C_QuhJbavNe^uG(mS)O00s)huO zy#ZAUB1LMkmcPyjl$0zoz@p+S7|uGpZJQ z=YUpRD*X}dzX1=eC;NF04F8hW7O@pi8%R3@Dt#RqDK7GH_l7kGG3Z9%ZbC&+x4@Z& z1QhuL**I0b(96iU1~HVi8ZJtjK?T?Hs9!%|Uqq_$<2rgeXWpU1J*JsEZZ^Ajs-GX> zbS-quXTtFB0sNZw_Rhc9ROOV+_Xm8Ay-sP zOk<&lvHdaA-XBdFvTKYvue_YcnjwnhPEgW@@V39r{UK^q)R_-(o0YCb#eHlDi}aou zwZ{$&s~8aZHh12*#QUA1=#y57GS6L?uK#czK8_RhIc%PI%zDd|oPKNxVAhSddo2ib z7v8a^6E?H|AS)K%MTyc@;|y?(K2hNljo`>XbIPF7 ziU>n})UhbUdM#M4WP|2L^K!W}b*ta?*MUw?dy8KUFdd#cwR=y*CttkO+M4V+WsEyo z>F0a($oT5i?|4N|A!};3nv3z=vo{%8I{;?K7_Ad3L;KMm?o z*Kw{J7j_jj_OPc%$D>5~4C-pA0me6S=Ont!?O4&h8VoE%jJXN#QSIVhA6q4u z+jsp`?(|N9d*+RK@Q`_u^!|EHjM{+UYb(@79zEH(kHT7(>>C zOz;$F1iFq*#v2-Z!L(2SFTb_*^&tA^@)}WE8e>W)?{~TVZjdv6z*TPh+09$iL=_o! 
zR;NnajPlKDvu%^gKeg3DBtzmL$>=14JMD9KGt&EW!f2}Pz0xud_# zejSvx*pa8CA_GJZvL?R{b}AaNr|j_hUy3#zl1Mby?Qguu&&uk{P$m{CYvbZPmSEAH z6MG!RJovM|&=Npi2M2yXe>~=P&hm&@R2in-3_4`I7jPok|wnozrUVsYzcD-PnT<8KF6Yju`BjNC+#~Gr2sEwwkdxaN0&i{VSXV zQC?Z`WTwlc_gjoPj(*1#?rZp)!0Mdqx&OGvlY|fD;F>|(;Q{$Wqr+as;|Uivb>jD# zrzM8V!`-CY!|i)yqa1@LzDv^HpU&AVS(arKn5DM^t8cbjT%P)AS%n%Ma-Pa7E6pwr z*DQ?4V3DU427+@A)CXpO69%>X6~AH)LAwb)2Oe5q=Ibgce%rIS*Kt))~PkMazfH0!JchT{4 zsJ^B=v(dvVjyYsi6KjbU$K5nGhU=Be*_UfBCpG<%wm)5Je`gt93Q9)3vcIu?cGim~ z;8PXSv6r1VkC*H0`h>%-!BkQ-8RAX)fY}IlQrW-IObG8A+!SEIYyzIk^9e$X<*!$0Z52~^PTC|DKDmDPIm*zeHu3W>epN*e-_WF&ex=X& zb^M}Uly*BD)ARfQALoFg^(Xe_ZCYf)mvPWCfkqv|+3C8(i-Bd8Mb-{a+PHka~rVkw`K?{jm(^krUFIroYofu>_cRbb(a zIzczC*Tve!8$+i-QRCFe?1klA6kIL{?eHir*xTFda9V>aFe6Ykv4v`AWkvd(+vsiK zM*`LC!a_cE4Gl+I@JWV%dn3s*$pZlBv#N-ykWNgQ#r}=(}s>$$&+Po zl~XUd7N%snnq?6%YwJ z3eKQIg!LFXZ8(8slpz@zm&d_)@({|cTel;he_+xE=e*;H8bC6@e$x$N2!v(}G4)@= z`!Q*yF<))psk|tk-lcl-q^X@wnwEJhksRTWy(PU<6Tx#q%i}~5O*K)zI9F3_{ zxAxE9Sqvfj=$pKQd7FdvC=Iduiz1UOHW^Wx>ox50iGC&PrHxfESFjv`u+s%6rb-{r!!+YesW%X>Ogtf z@ZOP*;mwWH!ELYAH_w%KE=qL;w&o+%#NzW!vE{#$Jtw|im9^kmT-3+orOotIVC(vA z0q&c!y6qmf9xJd;x{E6r2qv>-N*!AsxF4MTO7XTktR1^^`0=c?lRkHuAVL8``vKfX;=tBb7qa@;DS%RmTH1nwFH?b2~@guu*(2>8KZmjP{DYW^%- zSp>P#pFt)*I#fH?3JTE|+g-7THzA6lCu}BIb-ocQQqHyj)J7G z-^mOl{)SDA*3hwvz5o1eRY_=viAHeUx!YL!2U_)6zTrKQFg-(f>Ck`qVq#}g*B^4` znqmJp@dmjd-7J~72fq!umeE>mrlO|NRUA6OiPLQkG-VGw3)M`$)!(_&TMqlZeCvH5 zEo5E3+OB@WKfB=R=N&FsKsgme3nz{AO)B8%^bkmOs(CHiNJvY~D8iOdGeA8z*zLVI zHmxYs?ldecaQ7rqOTxW3VA+>h|KQ*N!3TpV*B+X-h{#AwFc!hWj@OwF-dto|DgN=i zwA?qw#&ERs+&w5nq6r!i2RURJ_Efmo0du+yVp358f`c2t5e5e^LE5!Pn%enS2C})w z!1XxcTF-SDL!cUP)czgRCdu%#rl+P*NKdqp)Gj2X0ODPLcQM;KmMkks%5Vur zJV|4U)KE)*u5K%4&gh0e)}}MO-Ko*dZ8(U<^J+M|!ky4Sgv4(Uqja&lXdwg2lgO^l z-COb~eY^#v+pWTPW+RTxPPzMdf4REIi6|);)#(4sn%`yD6}00W))79i!!f>Wy3IP6 zBe&W6%x&TLmfHv7Qy=V^$kZVDP{jwm%-`H^K1|PZIj605eL=R#P9uN+u&Hr>JwTnE zn=w$L8h5UwQ8%FcM&yg6AF;gM)qkV!*3U`jGV3Do>IiWf25_)4e%Q6s(V^shacqI4 
z_%*=eX=Lh;Pw`aF)Le4D&99eSMFqbmg+TQq2&8#{rwOSRhzk}IgX=7$0?uO?7a1ZC zwUO!Y@Yb+Jo=3^_LG2re>q;%Dg#Xu-lyGlwaJ9CzA!;|MIS4^R3Ou0R)&0tol^+ko z)y{%a`me2ZtX}z;?3Gl|yP1}Cn2ejA>*ywKIU=@rz{KS8YrW}M)?E+Vkyn4c7pHTb zq%OoK$$VtcCn71wBlMOb5t;)kyZNGRXv7V?z=Q9Rbz$>9v46*&Qt{8B`rzsmp4?78^S$$^b+&_#@yR+M+e0qKQ{WmjfEJ%^;J{Q2`o5?5QDfsxVJ&Q2A^@%i}n%I($i zJvkPw1I<^bR*wX3uKSO>oOWLxCtT0Gyi(iO5L~p5rZHZ39!~9Xn!2lWzM%R_SPX?@ zRm*&UVq9UIe&#SDDY|oV6))erZb6UZbuE{cJj|(&EG~umBKzHX3S`u3N9sNq*m`TM zJXKrXcZZZXIKiYeuwlj;V*LMksJ@uoWp$}Zu`gBq{HO8+8Gqw#0j&_sm|+68Si~XUE1*6 z3@QbWeV`QQS0xVGdxe4d=4nxI(w9dqyBXH4o2SNae24)42KQ83fnH|6TD9jyv3S~+ z&L?;Eb~8M~d)s7$f?=hF?+(YkO*=D})ate*7dD@us=n}-^XiRj!6Ss3R8$RxiD>J&XpPb8Z z9aqhq|9KCfqbbsuu>hI=6`BJf8o;`zgBq9TryoJuT5H=|yXrUJzE!lZ4Eh*jkxPnx zfS@xlCP)H4c6)a>9Y%y@;&5H3SWUS8m z*p?)rW-x7K2xQgT=({g70Q!`PKa!J|ubZ5t=^xd?Fte~Qv9f9dX=X-CbYvO0kd+tdy@#&BaDu={s z%C4>9A3EYrw{JZyko2H5aeU=6bsSjbwvj{~Y$YCu{ebqkJS9s@K3JZ0=2hCgH66#- zbEQ%Kwk{UFT?we0_WOHIdnLMsmbPlrh%Qo{h#Qg*1Q2e`fIv zek8bVzcVL4QuM;Vs!9wAwE#XKghuAWHW8GQ#{d!t{^!@}+Me~hMQCX`wT{bc4(UhG zTB@a0!rl}iS|LdrY;0^M%{L?iB-684t&Fdn?D6s4ieYv>aO*x?ezo;iE6*%w`kdYJpb{x6a;kQD+Oys`TLxckrfR*kq186$%N<1DMv z86J+#9!cXq`lG=!NlA=%5IqAI88IBd-X-pQv26|#lj#_sPL)kpRk>rccuQR{M~cLo z*P<-YpXtf?so}8_6Q4lIr2rP@8=Fy_hrxsT?`+3|qn={SwhihC0=ElE4TZScjf#-0 zETs4Yy9orA{`N%Iu5{&wp8`HX0MB$!*ZXf)*gSc13s_7IgM%d4*w~=-ZH^I%ZoRU` zD<+l%%_10Xfe^HI=TJ)3`dYC4ZS*w(}A7UzZ#}Xzi3#Rr+HKNJWq90v9FZqZ|e;mVZ`OgBg<} zFI`OOn(3cQeTH;qKS`PHWG=@HVm{x}FbJvV2II|y_o)q1b0Dpx7Y41gWyB57fN!Z@ zawi9luWpn$1IA-Gv}NDZR> zO!#Ml6EhCrP$ou35A^gXExHn?0e%|y=@SoB;~vSurO;|0zel3l!Iv=&*95`g!O6YY zoze`MsNDz1+yRuh zeD1@Rf49-AW4~b&^&Zhap3~T^bml|xI^om_-Z8mAi}n!5FXk3JmwU;H*nRxW)>Zb& z@pYp|e`-W*#C|M@)c@7ZSJFPBU|Eh=(yIhzyqHvq#mdd!E*74s;}pE-gWH}wuee+7 zS%e%p#*VGm4Le2JWeIz#xKUcSS)|8XbPI>h+r4e|aZI2TNAa|B>;LBUjNB<(H7SNa zFR$;`>Y*kn;_rZa({i+!12%{52wYhF%Y$?ba!BMdAC^UQn}#sEz4q4?Ytzk+%er#2T^7><4vy70&dVb#OyEkqbn*ZLhMoePcKG#`l*$bHn8q4 zfo~E8a21d&Pma(k%sV%{BzfrQ-a$fS;wT8VAsZ8*g#JlM&#G${(_&)0-lb1_wGJ?$ 
zJZin*U>%vPY1v!ccMKLjTJ;*7B>T8!7F1oxVZdNO_N&0D<7xFfiG!3*odX4YAN}>= z>{>^ksK@ck@n=;V1g8R&j4T3Vx=a*L_pB7!#V_J?<0Q7Kx77l1ejWIQ|4iW(v0&3GYG8B5%p7E@j8yDVys|f^r$<2oia86t0 zPBy#KcAZrK%YsutO6m6yg{fA=Zed-H9R4Ws<`A-&2_D=SHd zcPpjYXY00XSv-=-f@X@|lO((R>a(GM&f$V%1Uz7eL#VvGyhz#?WJz`CWb{6VY0fJpS55-~XqW~u{H0JHY8BoJPFwyjzhau4J@tQORi?*ky zrO82n_oB-)$_^Ggp1?=v*Y3|N{I*z9ugc@LjIe0O@e+s(J-eu_`c8^PF^hlR)bx6> z*J|gGjknUnt1x40)xhs}Q{N=s4R^Lrg6@KK4Rfuos9F8hnfJ=i>6s1_rw6{i>?kz9_R1>a#a#15iudBGy~Nbayb{PPzmE*XvW9qW=V;wP z7d&n5`ZBnxM*I~<`ydU8Sv-K?Y8sL5^hcV}0WZ|_#O*_{WhF~r();epNWSNZ}l-w4#o4k&$Hvm*nNaop9MDjO{O{n-9^$U{R2~K3Y-$ z>4%`^kcUDRhEN?&+;Ct=y$T$*46dj=bACW~*osvWKtFMGf}&#oonH6F@rkC7q*o%D zi<{8sic)>opT2vk@0l5zAVu;ctrW04@rsLGem>dwKkBd+8cj|4@xqNayDnDsCV7?A zs96dz?>Jbh^$o2gS2BvUoi=x2oO5GFjV$NeHnaHY8ZR`UGSGFZ5nl&IA4DOx0RTX2 zE-dG-Rj<`U0Aa>%_C; zZT17B(;V!*+{Y#Nh4k0PNCjmn%snfjv#qI4RrSppm{#KspKvCw=U|_CiG8QkTXQ`r zyCr&_n401+xBnbUo;U9`HzIv4@W~nV!>Vsc?S+sY*kH8jEY3~epLf6cU2o9;#Kx-k zi^ehDUpxp9xiYJa*){UxbQ^u#NL)tM3df{6OtjJ(%_3?ir(%`kfEsvj#(~(w63h zX?~Do#>mJBf|9wX)G!wDZ?0M-00}dacJd`N6B;ckU~VBV-R}TMtv*}^@Tj>tBXqxI zGA~4z>=o%A_CD%&!jrY#XW&0&ZPi)4N5UKb>2AuxW5-$EzWlQ%0}u%j32CIeq#Nm8bcZxZcL}`L+B4^z`Et(holkq5Edvkh zx$o=x=bl<0N?!_oV(8IREkT}~FU7UI3nZ7)cG~DB1(G|<^y`|O`%?0l9F#8Ko!>$ggk{ zT3DR>x=j!Y;~mn`NzKiDyqcg&ue9Xx)=xm|?nsD>;f>PuMapqQD=WpT%Zn^^R_Dq5 z^N!xmhs)~Zp-KIcx`7W^Xg|bK8%$LZ_VxtM(YDg=>I`7AQa|Ik4Gcz4%6^(e;|37?#cb*|KMQ=gG4VbS4LT^ zq?QPI25P1{7*;$?9w4fZmE@#oq;Qm5;T7*;)gd$O{y z>>M1>GBXDPXR!+{CvYl)463oaI|wSkfcLn5S8>G8&kxG6Y!GeCdGfqvL$*5Ip@m+n zLoXhv@EI1oefNDj5ap36D;AGRllb{6{rh{vJH{QUmq7{T39A)^m!lcJgglhuSI%w& zSMdp}hi>_a^)0?X@YpD-8<)Z~ZVA{1r;fcrVH#?mHfN!u_QeUj>@F5lm6h`(7hMZo7+$>OvDI{?%i8?)eYF(S41pUBt+RYTcgura%d{$A z(+TlyKA@wHfG>-0_-!X^Gg$wknjlbSHfXNLc>B&h;!k>4zh@EmR;-oN43^@M0Koig zDRteU$4=h%lf!hc3b*R^4peH{&&cQvNQs9L(XYDNJ%1%%IK&q(O}vK9_Z}&yYh;_C zPH~Q-k#J*}mT2Z3t>wxqLe=;1ztFC#*m6YN>5-UH`cFQo?Iu)_f*Ox{#o7bb(GDKs zW3l?kxn&H^m+f|S&T|rmW@<0`eHeIyCcMmn^2DYTj)HM!5IWT%MB=fmEGB@p 
zXs=-XfLRF{Q;!1GOwCa1kfmQQ3Y0jqNeiAa6qCn$Y!AYjuA%3O+;HX>7k%R6DWC)B zAy}=B<7sPniusOH{JPrWzPJ(G+b4UZGxIifaHg0{$2tx%>UEImu+u#|e0aRDzSKC- zfw2GEr~urOXRc?2BM>?PyfvAcO@T)rvBgx?)IQgdc15Trxo9vaTAb@p-#MPZ0V1HW zK>CgsFO!K@AZjIj#Fa%4Dsjyv4ySqJb#8&*AB6O#?)$hTZ&$zUPP?I(b*i3k#ICp& zo0Dnq)>`VIXY3{z6f+2BhDG|ILD-x%?DN~2qFQ#P(SgzD-?hy zA^h#dbiH(GHWM-Fc&bhmYGIaz0wK|ho{PW|#wu(^bBtY-|6^q78`}jAH;GWlU0ztY zUux=_npDR~QcMiWkC^ChFRJYl`bqtSn4&7a%M`I{*`H$?V56~_Lk&3Ey?e27qN8A= zGzQ%kdR~KQMXF%wCyAn-?d|P9U>fm&whFY10tVixT%qT&2%9p?cnif~ zwi(_i(wgAZ+=EYFhM&Ki__9PP6j&X`N)fAuDE|9kcVsfVWvVA%Upm9fjxCNj}kxEH2Jor&Fs0h<2Lh3uMY}g>AUSS$3H`J&;XDQGWs|{#{k#_ zIY6BNUrGw*2`y+WP&Q&!hLxwNyqunz8da!zK!+Nq#`XQ2`zVTVN#UH~Li3-pk2Ylt zQ_O8%qBE=Bq$BCKSbOVd=iqc7Jye z?w}GjfKi9vyw#=CpEdq2sZjZoaRrmJ%qPjL@<$!wLz8GY1TEowUh$?cDjC(wA~nY} zU%j>P1#GKKk?({zS8|isHSnc`e{xc`c@h-{$;kOU)H7B%t`kFeK0}R-^j<1Cv;SWp zrQl|nUtW&fKFw|pd#0v#Mv)+0-s1gy&74J$04Gr2x|goLWZ7N>Gg?eBV#5AW z@l)@_x73)3E&aadgoJr$`CTJPMMbPLh0+fZu|MhRw*%2c-945%iOffMN}RMH$_pF| zh5(S?0n(kyNIeY(E#YbMli%Ji`)3jucAI@;vEwTPn9KJR^BH7s@5{^8>~>YY{V1|) z&a*PS(45mhX5-AoZ2gx55&j?ouQh#eUF)PvzgxE|;-w7x?kT(XJ)r>+7!m>?fM2=P zTn&Uq@b(}MK?y@Q(GtjEMJ~#zMJu3L{K_H6gNTQRLDRH*ZEI*TX%bU;3aLd$-e}r0+`c!js?2%S^O#R$Q%CaA{493WZpFo{4+!*bZ9x+R3M-MFc`^pU@o87p6on%s?^pE zpI^gt{5!?yXwRwoR@Hv(2nr*d8Js})>W_yznEH$W`S&1@goB^o1l(3gm$5qX}6~C5^v0;8uMkhM7RnC zO9d$T91v*0*2@3~<^N4d`G<#VACUY{!CUm4kcY2K^LRC|i(hQ+Xu(*w?&aH)zdcIu zN6ctA)LTigRwEmKKpwcyk|^fwm2?BYCFNTa!#6I(ov^Y%OV4ZcH2Q5_P8_yt?i~Xy zhQ{;n0@ZY;as>^O`>*wN=RMeuIb7_ySXX{;-AmEX(io$WeM%zx(Ah*MQvOe(J#P`b zq2=cM->oFM4W8?C%PU|m*)}i8@r3k7b1&RRL*uUj-*tY@9km167#R|0N$Tde7S}@T z*Q(|OObxS)j{L1oM)OSH(=Mi@6vcg0SQwj?_8sxj z@VL4OS?ueNnnb7k{9IdQnukMz+=EFRGC__%z9e7~cFu9q)g+S=IsE*;hD z(t5aWeT!PQ#OvaRnd;7NN3pCjAxuO(P6JLF{+}3alxctf14Ql55N8SFM1?I43^XYq z>p_-q+3z<-Kh`E>h5fGUkl>4th>ZD&kxH=yoK{3uxL2K{~s%>_Z>CtI&8HeGgP9b-Ot8GdebHvoJC5fJOQ4>cqDjFPUIs z%vC5Ei`9&1zhf^EE_ocQkppvBOPnNpE>|abL451bg3<{1M8m7uLuKZcSh^!=oteg;!_4H+}G2S#`>~k^g4$Z^Z;3tzzsp 
zoUX`USC_{-=NQy`X$?}yMy=;pM-fjgMsky&M*|t^4XPtfu$C~Ihd&VXME}09BJzfr z==r027i@}p?)KWsI0gMR?fZ_Hnd0kyUYh3i`f0Jz5y6#O@4X$RsAB*of-!m-nZ~BH zE&5-;IOoG#LMA%@2F7#G9VtMSXAD4o>lYTuXuQ~{&huHTC2!BfWOKAp(9@RD)koR3 zK|Xa5g*aR!EDzDsl|JQY;ArU9T^v2Zf+L}kgovmGA{bDS#p;lufLr+qet#(}AQ*fq8qD{Gd&OC{E2d*!qflb#Jf4OV!^YjqQ#o)QnWR$B;4wzlg$@ z)y52_k}@AtXyjwj>*`t+=_I4Sef##0bOKP+yWE8%%ue@mqC62@e~uK zC=PM>JB07Ak?q-65Wo3;5$Zs*pfHEZicjdZ;> z{W(7Y;^YCH-=iaX+Qe>f z*{`W{6+5B~<)|cVXVgC>+8!AHU;uhDB&-Z%VA$8M4e-6Yg6X*5lm|}4Y2-#}G_LP0 zZq786jRsA=R#r$gLv)GNv|uFLOIhHWii(SydV9Y>?+(b+@u8Q=V#7tk|0j#+4993> zGT5y_7vfjoi|3o)u$ciiwJwmc?Xg@fhADAyax#WLPj-ole)KBSVkx6Asj}8nPn$rs z?(KLYs*r{aCucxkxC{E`<>$l%*E=$A=WbtYp395;t#VFbOaDO6qILNyQ zr#2y9Co_Z&MmH@VO9J62X!sp*|hy+DsIAV8|E zDlPd=_k>k4)NS$BEo2lUeDJg??dafW23Gu)vGLK4Pyl{YOExPSY1W+i@K&7#MM)%! zwFnBkg*yt{L+Ra4{razQ5xXbiS1DdfyP@9DB6-RFWzDUz~hUjKGXaoBG z+wx9y-z3d2HX4q-D#640srR=&PDK9sfc_XZa@Yz{8(LfaAO#l%g6CU7;b7Hm3Xfkp z<0GpMb{;K?yh4(-K8REd-T+)8AIldEYVSBBGcz+}VjgsOBDL({&&sWS^`|&npP$Ks zp6T`T-^tQ%-q1pcW)f6K1i^%itmde&HJm7WEBE5X3*|S2Qq4Wt-05UOzvrIJ@dOa% ztH)DT1b>R4m@TLD&++9&p?__?J2AtXYC__b^sD8UAN%Z@+|?0|UF6%db1LKzD$x{Z zrHBg(ld{zl!q%Rvh?7(vl+Mbr-OWYR15>rh_72QX-@lU>82FWafYvZbp~HFi=9YCe9{btX4%D$!HbOse^0_NfRJTd7yL+lyP>@r~s>U8l5B~3`Ht~OAC|b zlNOkk_QdoI3+gWo}= z+C?N__EPZW>OUV#-@uQ&G>H~yj{~bBP7KblUL1E!en=|$GhNLOnO*PZq70!l1RPbq z$iolP`0feB<0Z>KefsWr=3~<*oD?pr5K01xyWg|3QGhV{1lJ4_HA6BgopvSDk@@uyg)`(`16C`r;>QA;)35ZRK!O>?09} zMEciOdjnf)sc%fKarBi_Z}V?K{>GE`9@B?x6G8E%X!^C2xPvM)UM-pjZX)2$2AVCY zU)auBkXrJNsv0a|Y3BxFoE2^305p&uyHX$Z>k4RWk3g*hOoSKrM1Em_Z|>Ji8#o)A`Z;)~Nl_;(Er}?w-Dwp|#bo zK(F9{1NWUn2ZPg+_o?Y7ON^$eCdLK^Z2Psa*#J?bRmvMRbRh3{ST#wbtxT9;rZVi` z5g-j@EsR_y$X|W-i(5e0$Mi$vDh)lnl($R}jpy6<@+%H@I0*TLMS|mHF?QK!z0O$y zY{Q+B76d&?Ku~t$-cq4g?VMAF;?CLns|O?_n95U_xVVWAB6~I=p6TXqSO}1| zSfJd9VWSk5=g}6nbelIA87=FgUH3;vy!Y~Yq`#t^E@U> zDYHA?^YU+@szj_XZNu8v7S5>pk6C0SHDi|5^hMS+@cw-Ki1hwkLg7?h<02|OE|FAd zGKeu(H>O<6f9wv(^%YPFFHS|_C7TjfWFl8RXe4|e5`sH3GlQaA?-dV%uwo#KC2$#n 
z5RE0|j?;DbAN`d&Fh@7&STEw+Px)}cj9Y8HZdd0TIlbm`RG8epAp5NA%{)o$X9anq z#$<&93!M<<3!M~R_ML)Sn4xlh+mA-?4O2QIQ=slklPdk=_E#-Cb_uprRBE z>P~ztEO399S@X-EFtXD6Fob>~7uahqO~jW7fADsqp+ChhQuXb`U7=t+BklKyEjJKd zfVTo(yhqMk1iSDJg#a2fRz&aROm5)b?j8)8yvm+2?)FVTa3DT8*0N&sKB7bHDwA`X@7HMgIS1S?~^FjLhLR(kS2Ryn4nwLNG88CrU&48~XfRx%mDThb}tR+dl z1ayJb3K}Xu*$85GaCmh^nQX+LsHwG*Pc=}DBCh@5}b+l55LRUuJ1VZ zmb|7Y*0n~D6u%mRTHHrsy(ug-RQ&1Frv{sf;m;$;rRS{%i7bjc(dr748HSvXx*5$q zJYLfqlM(o-zttY&v==en&RAC&Ef5Z?`y1cIZ`j`6ZG5yxK`!hVaqL&tHq-jEol+lz z8ik9i_!f3mL&9%4TG|XlYeVFI0MT_sd6kTlxbY_NS7Q6CJ zQ{UB~-%@_vzIZM9F`-nY2|V&)k6<^Kv=}kcD$C`B37m$TZDoe}oO5XP(!Oy#{QB%9 zFoL%?CSJYX>?J_i5On)INNwP;Q=iY1xvZ8N!tTRReJM!X7(}P@RJP}F3d5=zQE+nM z`YPS>Wkq0Cfy0~OkrV;6V7p$YZjV~-!)zBafl8n&-anu*nBb|fy`G!RCThblDa;QPR<+_OX4Y{6x4+=b6Q4g-IFK(MpESrUC}9 z6k{x|!*P#GgdR#d<@AT>R-YB7QTBm2;$`C+?o3t%3$?|M+hw6q-xPIK$Rf3iA9(6g zOO79o7U8wBPhpf-(@jOwFZ8Y~4q?w|9o!>8FJ0=)yPJ@$AMJdzw5e6rBDbBc5rXGV z01}0cUphFEMynu`fs`=}Z=l14gu9@$W8|Qi5wfG$Gi!Q}= z{?!$OX#m*;2@@sgJ_Mkzv1d3(Ls=~XP9QR&1Xxc5y&*?{ujK2_E@#k)jV-4W6MVZ8 z;k2*=*IyW)oK>#C`e$?-hgvq%gM=5II>cfyXr8n?H^$)i!sDqHJ6U9tKFTCkjd>O!o9gB3eaaO~X(Ja_^dm ziV4pXJ%S~a=&r;2)S;kHRvFijBJ651@wB~ODXLtbSkRwX*I)2K5KzO&NF*?_bIZ%E zSnM^Ap$s*MLJ31zTP+tuPY;gC0no0(cYi!k!e}{ut;z5G-%U6MBDa%UL8TSI;m42{ z$Eu?;>`Jy`n10QM`pA<-Uc|P{{IKtwDWEa+Y_ol2UCVY2gK1~lx5**rIo9CosJj{| z9XhG!Tjd{FE;jLEFE2ltH3=&BHx;$pV^Al^yMJI`>Q36_Ap6JC3#OpryWb)#O%Mr7 z8Hrxk8it0#!SiToX^C{tiMSsvdS9Ipy^Q!esdSePyMEow@pS(yV6UbA(QCll_MY$|Dws7VmiKapx%%Pcp2VMPQ!PbxQk zDdnhB&AEtFeHqS{77FN4IJ zB@Zg#kssn8J<3BS>Ed$D^3?%8F!ql%K!KRAu+c;_~Y&OpCFFWn0 zs4yqlyw1aRjHNyJ389h1fH`>2@d2+8!9DH;g)i&#ubRFSoN+Epvg(HT)6;OrR*G#s zd;Ia zRv6@T!yRc!%?K{Q(jaN&F~ z>W3OLvBCr<1A7JvycYKb1fr3WUFb3AfOdGKpD^5|sEET5UAGQfn^ivQK47^CC{RIqc20UX;)%$IoInM_6AB>G359zY_9FCUN#_svs_ykx-@Ek-|4_gdN|R$e#*U zmYq7l-|qfgp;v76-F-H=)xza^LJ_IW-CBQu4QATLo3n2hj9*+MjUHz?F~hCj67480 ze6)q_Q8%c){I7FV4;r|o4^txu`Ft3{z72A@EL|i%Q*=L}okPKVWtg@U-KrMIr^)y= zDhe)BjOgg-aAu9$8X6h^Y=FX^sSr41WT=3HU_8uB%4I%&jR$N0`1v{BlD%N83p9k+ 
z2Sd3riwWwvAjh?0x<(-A-p%D@>0mk^hDQtXdiN7FmmjhX+3S4BYQ1ewpy9FcNo>H& z>y|+4?wHQ1T0}Xh6VI6Gu>{tdc(dnvvT@P->pefl8I3m1zdJ7vQ?|ZB9I?I5I2w}EXD!W+nhMre=; zr=Yc_W?*O^Q}$sH0)M{q3@0Ivun5jYQ5uk41kW-CccJ-v08|aa!ooIocAZG}=s0!s z-b2)S)xOpKCXQrPz<0uBRfh0KOCqk^J zN`QTTZ8c#`lXPO;%u30|b)`5sB2r1_Fg((+0kZ8gUu5BfI&=wXV*|Y%_>#zkBHgl=_tc>g6R8;Zdr9Y^_kJqvfsczknrIl8+kt5$yS^eYVNlM) z7hqSP4yK{BY(chGFP|JW9dg##rFGGOWR5(8oAQLdOC&g~!s+A}VJx*@eyPP08cn&& zD)zASrPkkKT$75C!7vaGwSCroZp#ne-y6R(@_sp(s>hRwZXHkQSy;)gEc2%IouU9n z0kq)Kl#`83!^A{PRMcG%hEVSftQ#DfuTG6~c%wh=ifrwHy@VKap+i}QQTF}+0TChB zFj55!cmj~Vj7YSIgcHxIFb~`kOx(pRU*JgsJ&>4`JStOZ(hM(}4 z8K*0+u~pccU>@$?Mf6Qhe$J&>`0^-XP5q6VtW3C( z#~MHlGu9)c^@H4T+qeSnX&Ljg=}1+aYXvX92jmL|@6JkDc6OYIdt_>AYMc=~ zdtPa{$ew)*HHhL~_$~g(jc~aewX#pvpR1{>E3mMT130dC&?eaX6!8x#BmmgRl+)a` z?l=&DzcReOIYBBEikFN>RDo2DeMwBzSAKIxi9w1}O6un0vY6)2{sug3CHwV!S$44bOv%if_!2c?!K@k>Bu5IHGfj-upmxu$znF# zppn2V5&pV=mytz#pWk~!D9pkYcAy>U*^I*SA|92W^j%;nLy{B91HQh$O3eHae0&^6 zm_y-ZUJ~KFRGtUQy`$ir&436WqzeHqxxQL==YNn!4DZWZ`d+uPF@wo|`P_QN=v{nk zHx*bp>I+xwz-C zbw9&tVz=#^yo^lC75T|MHI8A^01jI{ZF|*p-1X*3EdJz7kiOiZ3~N5|t?$ouk}|HF zLAxmc#*(a(^VJqG6*JA`%ikS}gd6`J=WtjCKnbm1?t`N3a$2IX2gkVkop(CaH0a%lluqef}vU7KZ5O30tTC9W>!iU zAEDp4e5_iI>G5awI(P6h>W95K|HpC61iy?kVbzd^AAu}?jc^zAEW zB%j%{vP6E&DEQaKo&Jza^d!-sBqAj~;tn!vgEWl+LP!4|gv|xWVPPSsp1b`(8j^~7 zJX&5}ek3HcVt$yzAe+T#&Z1Wn1p`Nyhk*oPB+y?=-+%Sqo+#sWB12-0Lsz}+(Mpik z9Vg~Z`3}O==KlQA&io}?)!ZxzQ8jScHNed~$z7TxJ=A5s_9rs8qxtZ^ z;ZDZoBRz?`svat2vsS_lRWIrL*A7?``M$J@}rAq~{ z)6OA#BS5rVVm=%IHUaQuyXv_v5kZ0r6>|xLkZ$h8^Bz}p6iA`4Ej2%Wdd@i*#Zf6P zRPxYIT&B8_jq{)&@mXglLrHGPLOmJ|fR;P^loD3W9BwlOL1zv~#Dh!VR>v{#h%8$wkaJV9&sR$jci6G;;fAR_e~G91Vl1 zM@_;ycA_f&Tp0~As6a>?fk_TooeezM{KCRGQ1b3a8wOJqwrIBtVSohpZDCH@SXpxz6>K-Z}BA*d9m9#?YeYM z9;HNf36r_}eOK9$qA@%fUeNec{dyrMulysgM@-=)&Uv(aDT}pPbg8K+f3|Ob5p+2t zW0!yV3*ZopkY`BGWhZgG9wPA^u`|`Ygxm`l&7x5>qhj~W5ytXVBOUFxeDRWlWS?<% zNAHg6`!TZID{8+FpR%>n&>i+mEBoKpmFkgQ;%)hzgH`tp|95Ht>3ZLmzA^UY>uypEP z6jISXdUT>ZPwGp~3_?6gZH(mPS7#0wIB0Tm01Rd7&yHnD-U)+zFJ;Z`Ko}?C&_Wt= 
zAUlzRyKCC*q_^_@fW#M)PGZ*{<{LuJjbH>C6yKcSFvA~fpS4lqa-9ihJ@wRQ*^!(B z-5EG$R57h$eaS zWV_HKwC=fl*!C8ae=Un1JuawjCD7?7l5uw|>(0k&z7FnRF3N-Jhnd+-^@H=Di5wn= z7h&-&YA@^OTmu&A8Ig6<`KP~^Vli!}-s^x5&7s|q2v^j|zl%v$RP=lC%}JF?M~u5X zhB6nji5>PC6?kXf2^08YQ71gl#>~-Z>WuK4%+vm*WBp#MBT7{bL%9Z61GwZ~L3h^Q zohDr1F;TuoMglv`4S(8x?8WuJ+0bJ6t$P8j67V?TK(-me3H=v{)|!I>xns{Dm=w66 z^ilXGzu4so4YNjmGh{(S&Bzbo8vZ1pTF>h)W*N=-kP?yAyIt=w> zD^z@&LQ3jo35zB2x8o}0N7F@Hyq@QeuEQz+SWPXRo}vj!dttjO`HTFKk9sh9{%3Mh!FPmzsYl9%ik8&H_}a#M}HkKF|&1b|QVu3EZfJ z`~x2b$HpeeReCD-5t9hJw&*3M6E*`ZijXzb1)s1sa1Q-LdBlz6R_gSv4&Fq^<-@Kd z0m2n-TV%Gk!^YShsJ{f8;?0K71oUvwy#g27>5S4p1~4{6CN6mO0dRLloFju?A#^#x z?GNJ%HGuPCAw?_x7*bCYCauQ`-$Q+z@J%7Gh^N936V9%WjABDZKEauRfH`VA))^A# zf+&ReR-7RJ5BL)Z!0N>7m-v=fzNqdK8N|~5AWrAZIcvM_8h5bg`=bTFdlu85_V#A$ zZTLh#h^vvQ_T;?2ljh}N)gUCQhrKluXU$K!7~*Mim-mNb{)%6A_8a5%z8IntMxFa^ zwUi*&c65wL_G9Qr;>zc8&}SX?hlxyTu0d?{)XAPhRe#g}Qy8oJ7vCK?M=Q64 zF4M|@cj-$HP1Nnt@ZZv1z3+pEw!C$lRYyWID#_QUw6ur-P#}&xb-90%7M@@zPbIjb zU;ZUHJMslFA`rm_Z_pA{uVb@jdckKfZvF|#J6+~{NeW9KWn>3N^2WizFl4L?*&MQ~ zs}Nv6UDmX%F>qU7N5_T=6;b3x0~%{i*z5psZG~-V&vLfNPzLfZ#sQ7LUJPBBVL+nU ztqst5czW(f=%48k2M29D{o*I4}iDm7h<03rKGq0O)o~D`P2%7c;rdLpVVoI6q{2S3`T3o@|$ZrRa3va&1%ZcjgMn;AuM<{e&y<*y%;Cnp#DV0{U6!Y)J()>+o zNjAucK{g!h?ASmewH+qp%`;#M# zt;|5yySJ|eo4%7Tb_UHXg!|rN$5f3g0RX*K&fC3UY>u+1ht!4pJUrnbS{FbV#>B=J!&D!4 zEug#{B)$g*@Hc#qJU=(infCi==kE(poIYd{w4%*x1}X|S_HQuy8M}tuy_x(;Khj@1 z%I_UDl<*@Gx>sP&YRz2?amZBf4+bEbc2SzAi=WCEVzdDJ4KI;b|7+``Uq-P|?%HJY zlNqwRCBlhsb*!B-nZBF5p)Dcn){lnVQFG$uX*^%rT1?jI({)r-JyfoXhW>x+=Z@f4t2JeMN zovAgYKB2!WM!9H>0_-`Vi()V&3cd$5-HJfb*XZaDWcMpf!I&lW$oIW`dI*Le`)NL2 zULzQ^K#;Zh{(J-KzZ0BZI72!E9ThaUi@`M1uX+M676P*0YnpH$YE*9Pk0B-&3>&1|cI*6)&0GA&f{4zNWTkitf zI8Vvfaw9Y!QM0Wp6OrG{;n6{zJHYkXu4uVy4z;;rtdNnc3a%~8T9(i#a^0W~poAkJ{-lC*`&~Af=%s>M5jKv@J<5;S47Bu;ullp{bcbkt zDhfaW365x4pa^a3?Yl~a85uvrdre%CN!{EOUcBb(`%HkGB}VutjtnWh{+(NDd@{pJ zYn-;P1~evt%Vy0rX5p9&IKkckeDub1~nn{xE8RciNtw$?kOAD0fimK*Rl_I*C9Tno7g 
zC|Li18nm~g2Na;4>wb7?L81bmPsUZZUiP8DDqNoCS{bU`>w6HAF;`_tK^8k;i<*s> z>;lMTQuZGa-1}Bj74K_nJ(GO!=j+K?FZNjmbD~WN_hlyZ0fou667v6kbr!9Y^?6EO z;df=j_9<8!cx>}_Dr^HI0zcwdA9yk$_CRZ8_j^d~c3f5GO3qcf$@+xwWIAO+{+=)q zosLu47m+pzcv5%UdS^F_+~DL=bW(@8_?co)#WvBeS5Zp9p?7fy=McN;gt-=4C$J+TPxn)NgH#tcr>00XMRDfehT?>&ahH}7Sb*8O}e?~F`uoiry zH>O}Og!Q2hDM}#%VRE4zxVm=>J;UlnY2*JXr8Xz(H_c^pDjDwDR_B@pl2b(<4$s+Wb?x! zTLdHDhJciVhitxMnYy-HS+;_a63Kt#iN^^+>XjFWf5o*{II+vV(rk>k2u22a9@t9Evr3r$XP$&qtt@T#P{kn zRxy!%e466|6YCQrQLBl~&kUw0NGQ}w3CaqN7B(=d9;=t#R`ac;XDdE#HtH0Z>`!rg z>sFE*oRiYi2fob?{mWmyG$eg%_Ec+U;r8>DM>wkkHR$TZexX`hy=5L;+4TRW zN=?253njkIEdjLTb0m&O@b0F6TM5)(`{7c30k;_PIb6O|CX@;Qu8c$dVWmt@jPcst zl0Ak)rI|)ChVbaHiBK!sE8?h;hf1xs-|>=9AZ3FnUh%t$tF$yaKfEJtB1k*s!-qex z>Z2{9pUIN(kw_^2{Hd(Kld85;(MF->FvqVPhOfMly5NKNpd6GDTiC}35NYRqb*ZV@ zzL?Yas3d*KrqERO#UFKmJ?S6lrMz!D9y41P9=NpZE@3e{CI3Y8#B5GE>+2wyMqcMq z6lV>2N}_d7t4M}s{W|TPqGu>REwp=wXBVldMqJtT@&k-PGRlNJWONxn43d>iha0|& zzFi+o^*a_R=mePkyBIOH$x0~JhN#(hDWxDQ8k?dvUdUd^Ou$m zyQMrY5T&`0=D2Jujji56iml`3->G2;Cy(Z2b+0ED8RqXS$JUwVY?tf6Kj$G;f-CE= z_Ai@z=MAnQ9rG$_^y}LK)Cmv*T%ud9un<;Q>2UqP%|V?w)&0@zN<#T|Vq}0cJb(Wy zh8?X$hOkP@ZRb*QQ+osJ)S#4pI}yKtvqpwAGEZ#iT{Y*yaT4)LKi$jh7jZJ(6TGkE#a?Du1kydZ;N z;Ooekn9g^o*C8oOk`Y7+*I(-FP47~UJ{I)~bvqoRMQ|AnK zpJWEtUb(2~=(&Z3CdhOS{wj~Ld!H<@dPTLHW}&c(7(H9ItQM!kOYha|z2E0LooF!n zBN36lH3zUVCj!I*8RoLa##^FQ@S^z=EBc@JGb|6;Uc?dHY4S&4JkX&+?D~I43JaLI z6D3@OsMydsXSMfqmUH={#Fi!ef-EfSv)>5Lh3Rx(Na3pFhh86*Uo^+ZrH4Co z>Tkt-6Tf4{$Bh0+@phbwr}Fxs{IWjBwX>}wVP0_QG*xfcS!^&!_x>sD#a&O%n1zB6x?8&?m6#BUT1Xnc_voZOyg9#=qzbD@boWP7l1ECAuWO z|1R(zAsWH)9z&T^OE~g82eJ@Gg#UffDrMlpv{iaYCikbxyQOG{n${@cfhq~|pLh;5 zKc_{H8g2^FF^94DZd|`l%jhRM`F&R^u#U$u2j#IJTdY`qqS1NJezQsH+>g@KS`*$X zm(j--DuNR;;r2x-GC32Kk!y!cLLqvnzXyU%iSsUFTP!Zq4S3ln&NVU=XvD}j7nyGD zzH+PW{2a(X?zAc_bUTrrT0jKBCxg&Ks0{%#r%F(%tpbr&Ft&<-;IRI}yVw@~in z()$y{LO%hTP43iArv0yM*=tTtD!hE@EHBA_&3$~@QHAo(H;X@5*h|vl%w;o7B5c8B zb9-1o2z}oPcv@zd%ags3tkIz1f4#40dKcHm_Z}dKX%V+R% z$|<8XT1u4h|Dx1U+}o=y=YI9a;jMC*c7({{?}J;afZ0-c?4ZKhkOtHp5`B$k@Bn9s 
z@11hZ_9BUK2nzXExK`dqsZv+_J1XzR$KU;eZkRVv6VA zzI8a*x|~oDqDfi2U;pW7YME*;simQw$ZwMq=xw7lzFC~EQP^G3&whTHs%-0}Rle>-RAzVDY_#EXs9 zyJb^t${H9f%vuwcClid@{btd*9Y@0>%!VkVsvoBy2=*hDEBpFy+{?Vz1aj;_aguCw zU7yB-e%=6pvAr>|>h4@iFj3mYJ6wA08xR*v$J*-v*PnA%=YXB5SUf?*$sZ;$8kSu+ zLqb785iRB|3P?XA80H_eVH(LrpBc)j%{8C`Kx-di^xElv+>)>DBHhj6Nmoa{QT8en z@`m0ao6qMuoMK#nJm6jM#=ioM;Q{m+g9QWy>3)HSiu~fhpD^UMw;;jxNUp;e5`yc> z%Fk;BQIOkD?CzJjNAuzt4~WnR(b~)sf4m(liE0e^mMbq}Em;Ro*u$ygf8Dptfq}vc z>6~hgM&IhXb=x}>;^DE`cnL}>{rr9zxwK_nTj7ymZ z^3Bw=d3Ty72FR)Rz`l^*`t z#TF9%H(;Cgfi!<$zLMJ+BIg&P%>Wb0*R<&U{C=hO%X2N$7FUwD$b$(Bzf;`La@($I%(V zWB?Nz+ZTXQ_74V?7FIR)=)(}8dv5*4ihD86$keU*(^p~N&vr>2`=-)M!`aVWf9>gn zM{QR?{ahJ9re}5AQvX0KmnHQpHDwRzj!ysXBeW`Pl91~8OJ+JWV}u_8J3am1*FnFGfc`Orm_VSO&F==(>=#)Q z$6M1_ym5GFjs+j2JQlpl=B1ihYX!4&1NW7PU<9q2cyZQ@81fdi1xjnPg-@S9AK#H zd3PHp@R7l81N$b@&$OITB@zWcEK+A zh@96vyy!Kia#7L`DJpx9jF7xUX^S=r z2_lIyUg`&1xU5uBcwC}mdCxql%}yBFWmS5sk2@7_DYr;O2Fl4B=r3O>?a)Dmj)3 zNQZ!QBb}lkEfOLnDIndYl+rETASHq{NJvR{2uMgbNO!{-&sy*M?S1yO&vl(YT(Vf` z`aLnn9P=J`;N}6OhPa2$PWAVL-|GYVTh%oX`NPAbp9S&}T|i%&fBgyrD_zJ}i32e( zCMMJCADl~NX>`cgnD;BgKhl_ARXiA^@sA|H!5-HeP-mdPMy`I)A3+!y@CziY0a`H1_p)8I;aw^a zUp&1)?<;mQ5GJC}yAA{NuHsU!S6UxP1wj38meKnCZbQX?)e-JWxNm=Vycl^qgpt8j zb2p6ue3?_hf2?-0NdG1djjRszzI;!Gut<0zyCWiw%T(o82JS5OsEO%|DOdW|M&Ix} z8tfD|zlg8`rQ!8&SMa%^q0}+l;tntFC+Amb>}X2tndBl9a=HakJLFX%S{=h>?_Or= zm;B`;zr0Tt5ox4m=YszJ5L)*Ae<3uwbmjZu5t|1;EynQvl(NfWvH%rS45oxAU?Yn)s)(lWFgzRsUw{rh zHm?436A|vDjUZqoL<9#SgKu!$0|zSvCMYhR1*3#HXoS3pUih^dWaT||4c+9$6UI%7 zm(@HGMRfeH%UOY1JpWz<)VRX?n>|}KreyrOPCtYLQ=WvdS4hv@8-2E7{oPzK3RvHi zd|Y!MF0ua03f`>ho{te=E7q2I?(MZLFG+9-TJXIEHpPApTl1yaPlp|S7Wy$dsa%a& ziB$JH`DWHP{Ib3YX;~aZ+Lkd!er(NFx@3EEbd96m*1f=ZdxlRt!6dJ}M-o-1$uDd$ z|25;VFbv~Xz>)oTj?l?6DVQ)yK3`H%zQRK}cPrSBigSF+nprReAmaGpeck$zDqC#O z+9+?^%+i2`44dR*E!iV%!f(e3Wt2)N&o%;5{Vzcsf7 zJJ#I+sPY;*t7qlHEQ>D9)sBnazY`Jicb{RHYL?fLIehmO74jQ@?Sjn&tuSBkNRu*( zg*bKL<@|5WRjz=K;JgL6O_2KvAsC@7=E_J*(XvyM8of7VD#f}OwOg6=BpzlFu}VO{ 
zsHr>~mPP?#`%jp&?U?NXiDjaEIU#QVC*{qNn;wqR+ZVqXhc8F}{G3BrgxM=I#q6UEm9W%Jw8nUjcw;FB_e$^!)bcZl1goq1%fB@FELU?4Gl($nu zl8`AuR9GYg3O>HP=ud`KvWD(bmV=0zNd=<8tZaH?1M@K?4#KXQ1-gE3vKM|0dIfn# zZ^iF#bMSLTjqKHcty~}oB=MGWrX7-B@{QPXx@(DamL)9t_ewU3no8a5H36z<3ou8I z7spXixHScuE056d%9lMu^9G(CX#V&QxexT1vpq({|2+dtUD?z~SAas`AFz>V^O2n>9pq~4M*)nm%ljn7bsHL6o(T%>6(0o%wx}iF%D+*(87Fjs;UT& z4ezPTqCGYk(qV{l?5Q+Q<*Kq2eX8)Yu|1?Cx##Fz<>%pVm97piS5uD2S>i}-}!#U$M$z-Vb-|(omb-l zKk`9qhE}h}1j+86STL5-g8(`@sIBbhkt)_9QgwBZ{;U3P!a^v!L&Nm#7~*8x6l zqcGh4reE0$#RU>mNCi2&@Bp94{ zXIBhEy`AeNt`P1T(KdPhAD|~2gxz#_K-B><6&6odI=elCa^0c$bqE7uBIY)m2A}&L zr9PCctt}l05~v10Wu%GtYWC-vOfI)ZE@O}ViAt)$SQWqW@s_!*Q*lBEYa-pLuMWYe4Ce-f@I+0&t8c@#yP z{}t6oML8*9I0F#7XU;AN52UpI_b(qBF2;Gal;p6etY&4nv_Cv~&t#vV-=a5$I~(yS z@wtk_22S>(;g<@&vDx7hYRb-6)O%Yu_D18r{xGw+cKKmyz_TxwK%@bGFX*>rHq%ix577-Nrg5?*ohM0#-8TmKN0k~F+(m0&NU%y)5**@2+3)Z%w zf5TXsU*dX$0~*Ztd>eYG#EJl3w(DKU$$6@Pz7$*u6N}X`ECap1&2d?&>6(*ubc4Dy zoeeG{B#!v)K!tR^K~7G1a-5vB`VBaG0ICc+==|{@-Qiy{6)M}UEqV81F4aX%)|i!D zxv`BdTk20Y$W`Er9{RV+1qy+8>he3|NHi6uTjN-Dv$-0F{#-GYv6X#8$wgrUG^>$veysdPtyI#tJy*V~5|`_Oi9TDj@JwRR0g>(JN6136X}d1pIuhnt+&?VWDB| zN}KDnS+w643hq)rgMca#6Kxmlll@07%#e)|S&-HOzGcqq--i)06L3th#vMS><$X`q zZx%1UFmbsw2^vszoZy$KYXdE^oedt7U|aT?tNq+AP)fdjwBg4KG6_LpK+^`}kJ5X6q(kYhvJ$v8-Juo7>Ansiv<03@lj*9dd5Jg zDsXnerB->DlBY%s>v5)Z6dF2uLvwR;8|2SWnVHXcVqu&YW_p9y%e7UANr<44(*r*n zu+Uyoj|mH_M+gIh*%4v=a4$@MU8y}&d}a#*z#wR}7kg8BLCEOg?cIF!REZ&g60B(% zCM3_oWUNb2dCAy=-fd{>=g=p%=i`dk4+$Phecas++q-?1{Oa5BqqIdOn`XCLU$}S- zP70q_*H}>npj)TX+74wl`{GzHvOcDsUDI0Ym?Awd0)G`qo>V;h;7zyipC(vvRQY~1 zGbELNwl!+xpr?dS^dat%Q9t-{z_GRgasj?-)$r%!*c_~>)j}{GxWlU!>49e|`t}>Y zAg^8#-(TsQWQf)e^8cWJ-rmH)uqhT5^j|Br2bn+^cmPq%p|w(JW!9eF1AknsI$P9C zL!+I=i+!+o*claZ$H8??a=s9<{?3$089?X4OnkJ{{xw-v?TpdCoMoO#zS{mmTO+%2 z2)s9f5o-0?S^#_YU3njH;6?5mOhHi_$6s64?1MqQ6tM^enT#P#R|?kqLlB$quM!6k zLzjgP+7@frA10mV?jW;|ckhe&#TDL8)d~USt6^+xtc~8=>568pS*$)I3nm^j2lCoc z2WgVAT+_+Qlev1h5Nkq$0^va|+gma1{nWT8`f9PUzXr0n{i)F 
zE`yt_a>vEFGU|N~_el6YHa*sKDLpe;SQjK_(ozB$!JKjGcZvbK**A8@eX;0B3?bV0_$Mz3kOifsBILMnt)k*x$py4BaS>3&T8JTrF{(PKdn<+xJqGZTcMHzkk={GF8MfzylZ$@h; z*f^dR@Q~0}ACjqd3Vw!p0zI6?fex}@YSgjL*y~M~XV(2~;OE>;P^wVKkUzCqB!sWh zYEL;zzjzC@+de7}Fxxm{n8UKvlUD^<@51OgNlHGO-qWYxbl)>Z>q_NYHPBK@jlT4IU&wu! znc5mkf<^x1#|F*l$u+x^$i#i{G6NmQy(A*ZXcn?ZI5$BGFl+dm5GT0oacCxZ61=p0 zru52klE3f+I4<=UKeFyuzkg6pO{Vm(&)50Ya&XW6aP=kx5dPGYrf!PJ1X$DMXvTMK zvL@lb7&2JzfW;LFxTx_rsd?Q_X&{l*?9fWl+7^T9=bo)QY&gTV5>r=!o!CT=Sx$t<&*c;G!jEdkf(GII-eDO04oY$b5{{u_%Y z-DRp}mYeFg-IbnH1*l9^>{2iUn;{Hz$zTWriv#-0FCdKqPxP5DCMj5DT*1c1-rJZs zEqZcWpA}n!-B+BVw8(3=FTQz8LFYGF*`t1G<_M`^L7Q zuXJoYzF7g6AEhC7Li0L{CE#j-U>R3pJU|1-=i{}KUc`+Ac_E?SrV9uHNl?;9pXVv{ zVETi)f6!V(5SS7WSIHaWJYXnas z$+EM3Zq;ACDw_Ubu^XhZ@KOB6|5u*%*6t(7iuH1OSgM|bkAh;dASEWM3>;`LFE2!P zi2|QUN=iyp0T<}zjXYB>O3SF;bo7);m(>$Ce|4211Wqi|fHXq>s!EuVKG_0`D z0}O=pCH^@|g1l}A^cwnJk+@$GA?!6l!GGKz$fQ1u$@B5^het+U^T^7g&5}<63v7M> z03zb!)&KcyU@wR2eHz+-!~?F70$YVE$0MPudb-^?K>LRb<9zDYurHl@>RT!f<3HNg zXkODBQM8UKU(?B!iVgI@lK?8D0GLfBOowMIipS z8))OBwXOo-_v3;I9`J!ouLX}8-=ZvAT59wD<$GUTIrptgn5-@5lMc!SW}WO3^AuZ+ zuEaZ)+@vgnC^*E?{Ldlo?)b#SZtiMd>Q(sGn~Sq!B>q$hI?xQ$bZlirUiqJb2xzq(D3A++`fY1VQRzyKUjDQ;K@8|a!(gXqBEPbnp zu5blh;67bu1uY&Nc(-;8CLR(yE>@{rwJI+~eEIsW2~(;cL16-lBm0 zxe4dN6crbE(aAvPhwzte1Bk#29vJ)OZ4I{+!Yfq2Q#06W2$v554KH8Prj%u=GbKq& z|JfEdx7H(E;_|8L$1mT%Y8i6=AxQ?#wE^njFa_3IKQ>Qbj0U;FT0zigrYhgJnBWNN zLiwsm2wBM{50OO?$#(0O&#ZS|-ps*teM>i$*IjR>sfAH6ns@=95ug-_N}B}&K_@nM zh~rW)#7PsgF8Lb-0{F&iA}qtSky(@9*=@xm2^t&)YVvqIo1PIv=-L`a&jwv`+Q{VL zqr^kH1<;*qHcx;o1tx{cE+4$;{P*4+otP4Eq4y*sk~rjZi0SIGxe0>f%p1dh$}p}! 
zFk;lp%-Z2DNWkSl72|`yCljQ&%6aNhp&}Qm`N_EQf;hp^V1GJQyFcCV%zh~@_HC(n za(0BHeVGGU&n%v6U;I3olRb;@m6zR{BoE*mstNND#Ln=|k zS`*BI)k>xhry#18?8P-)cu-=5_7~L9~&>_VX zMkV^&v(3mLmrUp#uJ|(wX$<^tMqlU32$Z_lDH8eQQ>#);D_nxASXF?>Jd$PV4OK z%n4%LzN!{cLVVNw;gmaTuI7_S)&DQMF^#h3uBMi+$I35}Qz1CAtc6xc$1uwfC}nYZ ze&eiHrnv@A5ihi~v=Ct}&!m$ls74lpIF)d{VNy~823Zix3fyiqDzw|4#se5B=H|K` zP}>?2V;-byINQGIvLB_pk@qYZ1OG8%j0Q^`H%>yh@fh?Z>~Grcf)<-V#7z(hyoSA> zamx*lRVxT6We|UPbb4xgv^8_nq7HmUkt_+wodP8ZVk?HY3ho1JC!r9jNej_%@e7Tfs)5Gt-d z6s*_zg$|v{L|ELJQ2?diPVLfiv$iokXGbKYZc{c~N*>Mlp%*6hHB@?3Mr!F_xiuc& z{9N%nKi+u}AFv$YmoIuy{;i@a;Chj+=r!?cE0PKiBz+%CURe=(w0o-mMo0F_txsJK zkE@e?jcj21>p6UGvqZLWm~89hHm8@bE$FKqLYiN~D<)==124Y|^p&8u0EPwwC@RH) zYzFO32KaYvV9IA~WM(E2JfY;cr}a+i)*aa5t@qn$w_1opyjr4O@soMHc=A+?sDg-^ z3(2+sX}vr&1Ixn&OaSX-^8QA_4RUS7Vsl6XEhoXeT7A?TYe z;w~c^M{^qoyZ1M^c0*X890;_z_$oLp!ovD0AC;Oe!F6*FO<+SA_sd%O-T5nFw`^>> z{${%`v<&6(C)LG5JFjV0J7* zCqJK8KB)(m6ZijK!pK<@(jp*yk;DV;IlI{in@WM!x?(So3znCcrKcXz()uE|03a$3 zWJiI8`xGV{YT)4>Kj#m(4q2wrv(S8&M)|k-6fUaSK_~L#ib$i6tWnr{xC0I|km78g_rb!Bi7k zE?AHMm@`7uHhYR^uBEXtJ#@B4AaCvAh-e2}d`hU(xpoPGEI<4lC!Kw{OotH1Wrxkl z?pNBb%>YqtC~g8yMMXn%12UtKtV56?L2USS6!?3xa&v>=q-}J*+anMPNLV58S{NdT zz&GZb-N$|?w=pvXV$bo6lG{;Nc>~roivH}5Cc0r2P0C}NZD<>@H_bNI{m|+j_tc0j z*))f9Tie?doSdZ%r(ZHNUlMM@%oVZbv*qLAiK(dIg-3xl)ciCDS1NMd{Vbl( zN55G_Co9O)K<3t7!AKmOqPqUE<1`~^27c$oB#!-NEfPZ{(mHQ#fNnsH+^KA@z0 zB6SBl8(UjQoW;hWqeEv=R<*3FC@~fdUvqls6I^OjBMjY@zVMQH94(2=><^O)35frD zwSx6vQ|An)YnVaeffWU!7Ly=z_JWyr^er43ln~wGkgDmAw}EZ+879HV32NN+aKh+p z5-;$2V6$TC_#2Lu#e&4}cHFk>3MGnuvtP9EARR66{->s1RvH4o`Ee?NR+-z8GOin zssQ1v)8FhYhMLrSz=ecbHd1Lzivq)z$mrEOAN(b*q=4q}pc=k{ zt5y#V&80gAmT|Z5(WGzqM6+R}KY7w{_>KPh$4@1B(Z?QD>G!a$iOVzXTp^$t=n*o} zOxDrx2&0+Qo`5wq`nO$sy z*MO*SqH1QlmGo9{A)C{-I5j7dDR!-jRQTv8yIAIL^cd|80yk;6@y7Gx?|AG%JQ&e` zNKO5#*gyuU=owNGYzf^M#M~GjpFZ8)s6DlSF6Aj~=U_lAxYzd-m|f)^QVrrgnX#V} zw%8?X=r)D;edNCl%w*;C*Qp6;t=KZNnrU{0&AN1$sTf+G`X9gN-d2sd(40ECBI`4= z!6puu7k%c^*6kh5Oj^U$&^SRF2BqF9{;4lX0na3w<2dR`U8SGj`v{`)ahuzOgoLt* 
zykTIShS(4T0-O}#yE#=im1)!hlfhtJ0>SgWfs8MZ4S_AHsCW;!g7!lQG6r4z96UR? zaaQfrF@>fSPt*%m2Q)3Wu`ay!#?}-mG>G&J&d@~m@ujtDENV6p{ z^h_v@VKh$5Gxug`+YxJN)x}~dx@UG7Qm0pV&q8ScGT{5u4IbT zoXr^#5h0=4ilC*3KAQjWE;zu<&i>_ojE(g&&sKuy+G~8qtlmuP>>)=j6)-PGala^` zxq>fh?%nd`$FTSz)*>rqqYUq|sOJgasdk|jzp-x4%hH>XlnriFz;9SAt zm?gK1=ykpols(g-vL_SwO!>4)M_S6W>EFcQVw-Hw1&lcPwAeT44ZRBT^Fu>fSYR{7 zb2(*(8TUh40h$~5a3lCOrbS~pZ}pR2-dqY17$*+(^YiT|mB7ZDHb8$kXhUJVa1&)D ziYuw1*tLl#$*X^IZ-&a{w50llS;Cuqce)`3iK(6{iEW)=DwtKUvE{QeeEe>* z5enO3xMPcKMz6KI-SL2$Nyj20x#Tyz#a&fETs-<_c9DC6A(E9pwxMbNc2Xzz8cTt* zLGyT>3t0(krN0<8x0qN4Y6ml`Uj47{UEN1vp41Yg-V&cHGwBU7|a z)DSb_InCZ$o*1k75D1r&RJ5?9us87p9lxtJJ{tWojZR2Nv&hPgvl|#KrPerIv8Vy3q=WjpyS!0OUlygJ8l!F#8H4H8ag%s*N!vE` zZq)0)73|h7idOJIjd7*9b7|iHxwb~#e1AwSl!S6~>hkkibM`go>$;ej_FYx`M{^+{ zwiL^dj(;uQ)F^gSOYU&_a@ySF%46M8mLc+5|J>Q{79ne11(;^(BkB%+-Z?xd&y6j> zGW&M^8CLAywt&QS!HQjN+_8?r#!XjTd0s7tg^%54Q{`c$R_;zK)c;gOqOkBg-`eAM zyr@!j%LOiyx-43iH9u}{ct&QLW#-C&c1{aQWSGed#ivbIy-yB48~e>`#zH}ZmIcm1x?&m2eY3lsxN#M0{jNGkq1I2 zvdR07`_zMNN(Kcs0ZY~Dv~iNCrw&qKwi`hMMVDIIVuGy~XH1U14-_q#d4ELfX&5iu zx>J?pAn+(DQ2au;`cbt&>)9PVlHEkI7^q>2RROgcPHDa^(|*N0d71o={4jmIg9 zs%68>?60wjwbPS{UbVCc#Nd<4&Lnt_6=kQ=DAG#jE|4(N+tCyEp9$6qS4_xZ`wwc5 z4qd^FCdsq9zQ<5vGdyNmdDc3C5}TLLwf3_pKHbR)o?OgPB8FUD$Kuz_ z$2_wuu2t!J(AcX*uLBa8q3F<%2ymn$LGRzkjZbt=Dq{dO+(chWR? zVJhPCDrUDjwMLzy)qcjla?jYG)WFn|V*PR{dQ+AAF9F%|}YjCH9xFk!_i1-!lGW|LV_>sHqU4-m)%hRdf5pYs)`J zFuQ8!X+BoFq9)p!k+|R4Iao-E8E^U-k$j_$gFBgrwaWW;LUP57?N_!KBX_MPgWFn0 zEzy~8@Ro*0+5I>lR@povEI*K8dA-GNX?E&g&Cz=J;e&n~@3-#7NfdD+j~BKm-OO_A zC{wc>&*~1Su;Y&3c%M9)3+o&iWvT98$3QABh2ccTU8rEoLDP4;eii8u9KA7|t2?|q zvL5N<8$MNgJ+}L$*;}O0qM_%Hd5DYsrC@cu`GAsZ$pC%+W1HGqPjY*0;^ox`dDA%P z9S;4}JN7O$X{!%Inno~le=K6x3Ks^8|Ig;w0h?o!z1=|{Zx0*B374rvo#DD_jJ3xo zyhW#t0;p{a%#_Zn=ojBz@0XXY_hj6@`MaInZ_o&iEH771V@ve!$Y7o?7Wa(3nf(#{ zx}rc|Tm2@Ny^?CX6JsEzNH7ADZVeaRapYxd;YXZ9TqXH9n; z9`angICqLGXiu0wIJ8!-dcFXkAu(R#`=h< zmNRbeUi9tvRa7R}Veh0G<^4JD3NNz#cvjQCksEWXNK?{7Uc=GX?%4$|mT$MkQc}_? 
zDjgH+T+M5d-O2ov%0gpXpYp0t6y3dz!<}QKsR7Cw{@)W})1MoQSWK>@JqXY zFA9GmJXsQV=y=(cT&>@;;|RTV^Q2Udfe5Q*xO$@YYU_jkxA$>>f>2_yJ0z8g%;oOM z;gX(?LW~p>Z&`HZiil0&{8!ruRT(8839XKl_yfweTrwi0ZcQ$Bv)MD?;yu}21S7#~ z*K{h`KKSwN(w}S)!;a;}#r(U{pO4|jWd`v!F)xAAdP|66;Jz=)Yp8?>3Ygxn?&xQJE}%c>YAx zIlJ?Rj-@uNm^h6wJ6d7RemU#_WpnGlm}J^D9^=J&vIHZA0@|K6u5rnqnxHs(Q2Ceh zyOM8iUc`ytRHFN?_WI!d3UQksW{uP5Iq?nNNTq)JB1%`9JC6-%WRo#Y4?YyIf~_sS zmTGGJgaz$4rj+v)-?!L=PjNOM%k9uB$|e7~h_sa-ZthsIHAbnuB-47|{i~$J*2w8E zJz8#T1~kg5w#Ds&P99r}*0E2Y{MNdgedGImqL{y0_`j8&W?+w4uMuJ{^=8>GdzPXf zGx{jhv0T|s4HF!&MU%H9{b_XSHUrOlG%N!3v4tA@<+(X2@~a(x;^%b5?l?(a+pRz) z$E}|`&OHXi4l%>k#d;gs!l9!v<6fi!O72=+A3m<3pGx_ewVRu>&UY+sJ0(1SL{s3G zNPK$O*yJBlRMgl2*6(izzVzq#=-&N%|8e#!v{WtBAEHPDuj;ageYPYLlEPYf?OEsz z@)YTBL_Qzrd9O9duwD24TJwO?2Ji21k{ot+QM$N@4euq@&z|Hp_d@z};f`Fj4b0V` zYwt8F@?K-r64|kmV^nyqs@q#*PRyAe%+4+kvFw+~1kja_j$`MbQ4iU`6GQRH`5{Ke z>h7|Cg^7kW#_JglxyVL{{PY+F7F+pcN@b2XEj8!(bkr-Y8BnBT!6{S_tggO#no`Yf zZaM`A!YDn5137w729Io$5e~n;DKDsC07#Zt5A*)uUE}En7uGLw%js)9Zr2N~bC)HC zNPZ-f&8_731EgUf@e&xpLAE8;v?jd`Hjiv)PVBJ=i=Ty5_Zz60C z#t$3LPWl-aah@=R+1jS1Jz+$F=QCRQD@p2A+pN*)|peu?VZlRDAGAtS6`;_uJ z&+7*7;z%0{z2cKO5|Dcm!o0busY!r4s9IteZ2KB`!J$G2 z1i*uNKttmLFe_NQYWPY#ux~N(hPhSUrFV?B50%9De}IueWv4T1)iMm2GEV%0lbXrj zY>Ixrqj%y1LML@=J@NGyAZXwxdNI^jG+1i2y0_US`tB~(-Pzur8!DTq z{BG{pk8XQ>^3$PKurMkm^vjh#$_J4L?gRlQqcg8#QuhV~EskRpH*si=%OIw)-#Ij$}98 z?|8()k@!^A--zAD1}%~O>Ta^o^txRARje07CA1k?!%o`V3BiepBx?==G!gty1C?WX zl3J)(3k=t(utSS%sAW?yGc)^Yg+RqcL4kmt845^85*Dx}|FBR^P#{DuQ~$`8z~@#~ zjjo8${dMxLwxh|Qpv~_3k7j_fd`ubr){scaYN^)2s%Cu6S)<}%4%Zau#Z#Sqz~7aC z6X%RC{a;{tVd2GMr;tJApt{Qm_Ce)=FC(7(6Yey!!SSmvSS^gGuX4Z% zlG%O;TnmfWDE!Ip>ob8_^IpIwrGKi}&u?s{Cqa^Gafwu7`0MEX4qIurmNz?;P5r;` zDEtmyL`@whO?iRhexfsfl1rP1T1=X0N@3I+w5o41aaV0PG-YBlAm!=2(L005=^ z4^K);mX3A4wGNV}*<2Caa%8FETS5fS=8mx>3qx584r)TEvjtS&IpJ3t{3hpU@v2W7 zjJ%OB2+4(dAQs(Q9S#M`MTGzf9N=B-b%>|FeEIZRQS57TD5KsSA<^)5;zKm#KD0!W zk2rBg?J&G!=i0^k;$Gr_td{BZIVPU-!{Pyd3gE^3 z-C)b$#vsR}iI}7D_W1-Sw=j{NeaPw3Syw+ANSVeGIujRba*Z6ki;dsfA(GMbcR9O| 
zan&H?eqM&sQJ`uyEmrw4LnhOaEm?1>%93`lhcekt>F z`L~P>7ADVDZnqF0gI0im*XjYRNC#7kumLJnmRaTMn=4}*F}=fua9ch9htjy zYuT9-$%Jk#+F26G3>Ol_Cv(Qxxt6>?v_K5(QDCUi0F#5a6F$>bmaW$~F%!YsE)+CyU_iBs2_U;vl*0SBd zC&bM9gW2p*V155*ax(MBwBS0)+z(c@VlPP5?5nA7uY@^Yf@OnIN_}Pwe@o-gKs2(C z7w_4%KAiT&3-3OF$I7ybh5N?#^MeGkneNx&M3tG%NK3Z+glgsP zJ*hyC7hU#jVR~|l8L=YnZr%ot54mdj3-i=>6B7lkPdoN%TuO~>qwL3S`VKsz43SA^ zoLp9ZlJ17EeZl+EktLMy(G3r5s%IeBv9#@EF8>6a_a=WFDH)k`_!%b6io%BZ8O6|r zg?iiG`k}n(h3oF8#F}oE$iNK%_hL7p>aCCZdRr=aBsHAKWC(!i#E*c9DN>W$65xx~ zikeJnIvCdYb!$Bl9ko15A|)2V*|=@d^SzszHg<=B0_GDVOzSgOySnfFI)6IaW|nN zWVnv`6s_R=B)#)}*)8;Rb1PnR&-i{C-b9rTa&HN7pY;)9xGKlRcS%`qO^X7ZV4?UN zJUgW1HqRb%zu=9`$=3*zKbl&d=FVo% zFBxA`M)+@$t1elm$XwMIU^@*@V34~qQiwCZuCdZI*nQ?{yhz^FR}q2|IhuPG9U(fV zGo0mCcBxCb1M6k1cwqFv*M>YIPTSJu-hkBHPs=tzNjJ4yzb@*-_pfaAJ;nN!twcLa;ObJb5Ut@C z8Y*Id5mhv{4q)&+JQF#li>72w5+cQ@{UaTJKmJ#NhrC`9cc+&F+@zvp|o& z0XJdFexPH7P&`yrB9&Kvo{Yh)@k4+CSzr(A&aM>HEFM+5yE`4}O7uR5?>U{6;f0yp z+B25#jSZFTybO}OeCQ@!vv+;J;=zFI^mo^w4^9jtzliFRd1StXBnB3zGK3omAF?c#lq)4WH7a#`+5 zJ@sY7iscmQ<<4!&23C(R(I^eBSKpd!XF0-S5i7Inky047TyNW}j3n)_V(DG*MYnH| z(2_ebCBL%OPMd=KQvSLNUcB3&CBoI{guTpK?XYp?^;$>w!*8X2`pw<|kVYtIJ_c-; zu=yAuIFz8jcI^5;_G~SN2YEcIi5_zP{G~)SI4iO1@gn}F*U(!H?Cd33nxj>1-5wGN ztl%$l{~f^Y_WCmBd(g;D_qyK@I(tZ|XbD(8ExTRjMTmhw?*G-+_|S%@mmz!k&&qPkwVTAqcJJs6_(8w zec9LZcO7umI!6-RgKCajH9Ic#O8RMw7wDHVw$N|eNSU04S+=TN+gty*S#C-BWwABOL!8lP#rNDgy9L(|rkAs= z4a~aV2LbXQ9ko!wjtayg*_3=@Tx6ShqMJH5b5mPef_Ns?cTnXncD65UGhsxXOqvb; ze4Ni)p?=KX2pjw=BW(C+{VC;J@_W`D?;M+H3mJJLO49`U19$ioBPG-o$c_!DS@xf} zr|`^<4dN~ps=4EG8S)Zdo&>>*pJ?DyjxT3Xb7iWmNc9glz_9VbbrIpOZ_-ajNe`V) z>9I084q+{idHcO5ty?|Y=||th{;PD{G^UH^4zGyc!R@p-ceY#D_kI13>9(IkhqZUS z_Iu#@xWzw>^ReqPE4K0UnnSDrJ{}XCghz9t(lvSL;jBJiq2Ef{(WJkd)13c85qgrUO; z$?+)sxy=R7lo%cg;oAF?bJ5%r6`Vir$T7KbZf{e;8GtQNWRk}Adq_DAPPx3iWX@2b z4y9smREz-4k!EpIQMK`aKg?WVlKk`H9JC;yMEI2s&cWUBmB@nTdkPV!lw6DEn3*&} zTqOIMENWd^P1n5c+V;O0LrKM}V|Z`kp|5u|eIc|JY$*zhNb$!Z+d;974SxQ^&#nG5 zMev0k%|Ao}5aDR&wb#FY)|Aw&z?Znvv<_9|K3+qqaiMj(KApcp`g8~5Kl=|S2e1zz 
zONNiY2g%<)-c7k&+xRs9Mx*TKVD^g_keU-RNu;3g50|9ppS?Q{zLg*2+jgvJY)jjPzjsgO$12=CycH3zDOn|0#0tkKkMj=}t3{PA zWw!SNvd^dIBVm|7A9p7(hl^Ic{m**OLcizAyvsh$$of2srnh!XS&uzXT9?4!I`@0* zH$@Gc^1#Hr`S~~D5fMe|c^jCSGt%Z>Y;^$y^1RPGOHOpwHF{eFe;8dF$*paJF0#bum00``0y8f5IUA3FLb6}Is@)+9GQ zE*PXTEiV7LBb~Ewl!*Ic-?9hA%)1%ad~}70@y}j@KBX2)RZ?0v?GWH9e&Ft_9-x|W zK?r&yXgv%-Y>Iq5^xK}X1g&@P4P-GQXq=JRXt4pXU$Aj-Jaq{W*7yCt4{2!!z>OI! zxRIiqaWm8nBExJ3AxH#_DQKf#ay?=pBge7!sYQ>|Z98&wuA!f3&^T zFGGZ*0nno~(1ZYWpyuPN?B7S6wvY}MVS<9~J4lo`z~#7qU;q^)i#S*!XKaX(If#fr zvmhMXz4$}$8eiOb#>Mjb*i##haa$~cEO!EVK;N*+F-7dFt=axB2v8gPKJ?9tNL_QTRcEWO#d^V%kkvr;?@YEk*jRto%!wBz@&l(J7j=68CeRqU2 zo(^PfvHMn0*zipx;x<}3oY#Ce9`FISXee~+yKPjVY;e~gGhj|r#QnkrJla);eJLn| zy(cgso?cBYgct{f_WLSS*q%Yaa|k%zNJgz1b;Z`#98Puu!3{MQ45EQbNR7x4fl_;Y z`r>qR34Yz2PJX|U3PwIJHwrQ`ynsczw6jgt3%Eo=x5H5Dz}JO&9odwS^AG`&TX#JC z(6&DJ{{EK}T7{R5n#2<Kx!%Q`v8N&^DtQgJ7XZS?8sDGq7gjFnhV^ynK3{w<=ks|DFAuHW)LtxKeQsW^Mk z8t|$wza5A3b~Wl#`JTZ_rEaMrI2kVWLMF;mFwf)zUS0wZ2!RJbhPJC%f0*RwhG=Or==uE09Wro?3E$-Fu8#C zVjLXPU)IXQA4SoHyzB)4k61P#v>RxJ@G;nR8@!YbhWZ9@`w0fc4+IM4%cp1wd~Gx^ z$3lh0)h~?f&=Nk@Iet@rubmx-2x@>QW<4OsgDfqKMyum0u)ahhJjo&H+yE@W&-M%4 ze>>_@Wlhx$287>hy{q)?m4Huh#S$3;NoVu`)m!hoy8lTda6=Gmr*@*!rJHN`Q$)nm z$B$8L7rV0AG*{a}Aih>g#jO_!K<&@jR`<2I#e9CwISf z8#c0GKtp3^tCA6q+!F}1|6lin7@~}T8LMg2P7PeAj?PZ!=?^!7e(#dGZ_Z8ZemXW) z0Z{D9cx6ea8;CQcWMwfDcq|v8-$Wh}h%zcVoc1}rId9lUoQI?ZAG-4?9`N&(S)#VF zs5Sm+9KO4;ZxkQztINUpA)(415McL<-4u_x_xTL|{{G-Lc!jt1esoLr4|yd&WsT>U zK;O;C;H5cm*;A-kru=79P!xTMWvJ4kSEN&<$`F|#OEZxxfCKR%Q(o@n+~%ei2KU>RbzBxjK3_Wq*53K*12!_ z015pv5+9JH$k8#C=8;*7l^}*QHTnJMg3>1c<)JZpm8$@1L)j@*;0l{gG z_#@YPm}vnER%E0|QMEQfZiVVCId^i!H1>BgPZ*^A!0X_%pGL{Z6jjERjk{7*v=Jf+ zHVBifip{n`ufZt(n^NQJ?96^LH+L+h{OKYshQDjk5m8%!c1DR7beF^6lT zjj#?NNg^GTU5Ew*5QV-HBY6-Dbb}m=o{mmuzAelmNSzJi-@!ky+Zh60FeB5AUps(O2yRGiBMq7dvttU1n&kCvI3Z^F~k7CnlJ=ELrsTK15K;? 
z>l?=p#JnD)>@H)Ullkt5G#Ros_X_kFxK`$&uqt^Zq~TzPsh7XSwp+as76#FkEViaV z8t>=L)hG`E{hdHeD3*YbUK84++o`!lPQS@&8lYcZYM?_kYvWu#(7(3MFJ_ zMMIH{qR7lhLiQ%3VKqp|rXi9LvPV`KSw+cS*_$%TcwVQj`@Zh`e(vM>zsy$~SdrK5aMUzM^QlLve}snRyWZLReWlRkW{f z)7E@u5DF?w-OLdU<3HJ_e-EI<;pWs>+wVTF{!}TdI|KyKq^+O|F*34hR9xF%?Umy+ zUa_yN-$3ax)Cd|h#8@6+#jo1Ch8gw}HGPn<3Hz(cv>RL2G49k-6^oJ%xL;~{jrrH% zwBu!mcU!D(#e2*+ueK?*V4d{J(4~oH%7fu$3h=xPX-8f4X0C`jY%eWk3c7^{dLFNF z^i+ADM$P#!@`?u4{w!^Gi8&@SN*!6XPX~|40+?%jNSc*H$!?!e%~krN3>HNih6aFF zR&HTDQW~TzzJq~Bv-rpGxd*wbr)1*v(H4=B)Sc~?ZseR}gs*gbOdCAjJ-B!8w?`{` zP8n@(rzc*_B^PnY=?zfQT38f|DPlTY+;N^j1afMmmw!6Vb?_FiPWPmTk1yHU;bA#O zVE*Y2=~CNmjXfPOL`gO?Ja(v_Yw7XD#FO8g$M#p7FIwC`CnNJ}{J{|>LjxbLl6!pO z9Atm6vz62f71G6~ULW~<4yf+g^XHR%fM}E%38Ct;_;Fn%U^YT3AfRPB*gsl8Wp=0M-FgTRS_~mP*q1Y71?kO9i&UhK^%b ziucU*FR^a68E6pr&UdMxiB?0Voli_G90C4owWR!$)0o>j{_(MYY*{JOtO@Sv%cQQo zJvWzX_wXep1v^VgcJ|)1CRIEd51;BD^T_*bY;)Ybw`6l(b7x%5*uTs+BcGLbK%Cm- zHA1}i(7)2icSy!wA_tQPf|d}MXEauk9~>-9@d*BiS8q(WttHSv?)9wJ3|sjruy z$f)X7$vQGfF0hp@ZEnhbDPsC12Mryywa<)Ah10jmqR~rIP7Q_A#M2FmS&^`~Zg&{B zQP}GK<7NI{6d5TKdJ<&fO43@+ifKCanso2oc=@636gf8DHQHZvH)`}P?kmZ}1!58D zbUn!#iw}46dE8_>xt4VKIz77V;WZd@d+CO+-_5F^ry7EPR#@q%Fz(5}Tk41Za@3rD z-?WI=I{&J2%7Nw3kj4s%C|Em0+6J7jwC>uBH!_(9GODorU#4O4**lz&#=s-Exkgwrwcx>{Q3EsX! z2Uj|IM5PVH3k{|AiVUfR+Q0AjxfCKE)>JT$b>zNomKkCtx~iHLCtdDcmQ8Flib0Q8 zY_DbFtbySQQbzrhYZ;uyhl3l477~q(E%Q-WKVooFeF z@=&=@6bg^^E%IpOW48GnG5{&-1 z|7+Qp=-hT!+pCRDlPW;mZ1$Hvb;Z`1C%ZpQY&q4=Pl2QK@{w6$Q}Wo?gs0iOFkDQ@ z*YkXokGtDE>T%$Mt8_i+iJGkE)GD z&k0$^r*Rv23#fFRNJ~rq6fmgJ>F~Y4UTGsmC&STLo^45jfSR@}eXEgdc*18q_ zK*914XewE>UQpJ)i#^nDQ9un%_n4!?k}vR&Z2_J@pQUr)oaxL(lT14OjL@%|uv`|? 
zKbxm!>p35{Wvdgo92q4pF3;89(pLO)L#Ky+><1Tq?CWl%%D13`yMd_`LQzJ@Oyxx;v%JdLsji5|lG4r$eq|6;xL!wVo zpYF&fITzNKd4?uum}BiMKJbK%U{BB!(HPpf(mQ$f;(igPe)AI*R6QR1E8c0E2S0ke z%0Eicb*%c6vFTAY>OkB0CnN}DS=~u8cC$EduqeEEQVzzba!aI0NsuziZd7!e8xu}O zimVw422#bHyzDj%C~OVw2(Q7+T5y|<^*)~Kbtq0yMVDUQ*X}_=0{?iX);09xBI~j) zYsA8wjrsyRyD+|d5J$<%a`)&q3|-BGLt$;EuWG(a!aE{e^XVT%+IBm~3ka3uh>Eic zIrYl=EoMbg_e;I5?BtEFiFlAL7EHD(!@jiHP(&ljez_pBk$3EjrjL}2W(nPuoi|SA zcs%>K;=ha6&|^XiAN2IMUMjOmDJoTn~W z&f5Fbdicd*7a=Fte4^55B13c@6FD-fPS|w|V}3!k_SXvTt8l%cmiFt5fkI`qX?cuR z^qOzoi=^6Mo#npbgpOPaEt!#TNImFLbi$A*I4E)?q9XG9D7bjM7%du@2t=YvU~Ft( zi4QF|s_b8+$yBM%lpAJoJ0>~AkGB$?6?4Ms= zZ^x)02?vdnGteJ766|YTj`#217*pDf4PCNmJY6V;6JRcMKF3#%S#Y=1zH;|d*)OyG zez4vx@k?pV-OqSNO#Ej5X5N9&AD?yxPuLm8v48Gv8K3I$L8WkMHrC$=w@kDLXvuD* z$xpX%q@>-orNv(?E5@B6%hGYpJrgH4=eB+r+3e`q!AkXoH@d(qxL%aVbKeUTn z82v4lvNlTa8)^0tTu?zi)e%*4eE!Uy8_nlGATYmU+3h0pqlAfHNw-}Cpc`B~6 zs?MQ3_**W`jT_y5oO{?x{)+k*hdFUbvBVV2dc87aTdawl%)WLYS*orqbPsm4zj*!f zM(oE8eTzm1z8fdJk#Oa>**Qo}ZM8peg`j0 zhfvd3hy^_?zJl&Bvm{4L7}u$v_G`CSA{L>EW~DPZ=jEcx=WV+l=}yt+zKDz(o%xs* z6k8a5v?aA?vcsD#QB0x4q-Yu-bw)Ktb(Ov_|*A{#*@ukwqyh(_V7r zmc)c-cIT}#K~IRVW(|lCugVE&$q1pFzbDpbw5*L}=MZinjs^AN_*0e9Z^?3lo~7T8 zqjOhuDLI0Bw%EUJ>zxrQP%MGhC&3xkM`NwpJlAu}8&VELZqtz_2me@& zV;#(4zD+65p3%_UoSz`4(+{8UPENcqsw~_Y%F6zHN((Qb$B?4zy3IJt3Z_b!c(mnTUS7A$R>)^|khEg;w!y#q zQxCmp<^Y(6da2t1(%$b@nV&slGFpqqDy^NJr`m1(tBP0Wj>xi~G~bAxpQ<*cqy5I!f9X>IXL6c)+7Su8H$1k7^lL-SxDE#Xs-+eT0f9 zRWAyUHLPF1FgL!0gG0Gj^GK?u9UE^ST zfjxnWekkVvH46pI!o6K<*^rE+qLXNtLPIL6h@0CW| z>EP9K*MlGF6vRB6DdaV~_mT3Pw!0^Chz9lr-d3-p!`jB#ZB;xiz{PJaP#_(gAW<^N z40))#ARO*>mMi0GpE<>T+$?bAWL?Ym0alMCYp;G;R=1|ThRGC>g;BS2S6@6CJ5pyR zPFvQ%6-@4*%7=+kHEtnurou$nvpDZO2Lc-UHF+(TX*x?70U>tWZlP8v#72uHfyKMJ z_~TBQ{5RHhkrOYzKL5URRY9rB#YWzpBh`*u#B~cv$>S7WExM3NV-PivEI4QGX&97T z=hnMr%0k)GQDUh;OHZJnenC`VYbddokxwiAkmN^T=8LA$@Xi%>gaCC;s};4e((g)b z6oVM-X-BQqw6>h;cdg{>xQUX_9gk-%zbgQ_@Q@_4YCu5wI8zgy$Jg;=Hl54Aw#!_Q=N7yN0Dx)jJsd z6EelU-VJ@eS6tB^?CUARzK-cgmHC{&R(Z#Vs_A~`x?*@;E4i1yEZUx5biZ3YpZ|l3 
zf9gY5J}qpdm~^iZ;(^Qnw&zw`v_8v5U-;h)LgxH$S+MC{`SR)_uMwCN_Mxuw?c1g7 z52o2|2QuSTFJ*<6&baLi8GrHQ`Vly4u7lD;?H;}_63kS}4wL;DmQkDDI&(JrVB+~M zxYIfbcvS!X*?Pw%8X^Xs_{I-x-rN|fk4#nMKkKNlx$4HxJV0!s>oU9_)B{7UncS<$ z^}$o?81fVS#wt{D5H|hZLPX5m1v{iucVVQaca(Pa_q$644>+#ZI(^O!t!^Q`Z60oj zz-;MlnZ%2@IPg=mp3~GEQCS&H<=i32rhnDOUxUAvu%~^`44W~p& zrOcJ{3!UB|yI1t<8obf-RjY^6gP4Q-TshuF8N_k?YW#D~!LF5fB0d))L!X4;%_gdb zV1*2N_^=oiLxSxDBC?QLR(mP!IHCQ)pZTMfW#Ri%bUNk!%n~`^xZvM@8ycM#Tw~F! zBUnSOprnuZZ3@c0HXv%rfcCNYQFm%;M_^~ zm#|6sA;eY*UO5en5`wI_C)YW>j9d{%T-0w{`aF)ezxSP8n}&>DLi^*hE+wHt+wxX| zg;NXXG{KMSUM)_kW#nlADzAZ|8$} z9#>ktVM2|@bK{J~4O-$7q00rm2RnjVrYgGfg;zSwaj@rqe;@ZRud*(A%=^XnjHKpq z(t@dV*V~7}zS}l&<;=*h6*1UjuFPI_T5*=mXA{{KuRwaV60V8-j#dzZ;MJOLAW)1@ zb0r}(zxm!RJtD~8A;PQnauq}P*3<9l1aB-@kw9ne*X6K>+y`RY4PL z_u=B{oiku690Dh5l3FrBOOxsL4Hlr}dO8)e6Zxl`%}pb}>Ak~ACPfggkZuzDXg)V1 zZ~3e3Kt=I==24Az^I6aPFp;8B6Hij0aNKG4@i0YqbZCag5~&y&U900xH*b*sb(y)G zZgSv_iT(MIpC|U-TiEX=B;rclKeuk?ta3Eh;`4&fmtSeCtHf;L+%LSYK%${((q^eZ zvv;y%E5^I=xg`evU%z*ecYJydG|}9Iy~0*0heJPxdj9fDOH=D>qK1cBTGI9^gpnLS zzUB$PM_FsBJpZpvFSWc{Vuwjg7L-K@n-K)m`HqtVNvTD>fXAiINkP^@SwmyDpC9K3 zdTDf~2n->?*&>j5UPa}^nKOPM9%;7TTZd*~M$5sD({V*<0LG6yEGwR6=I=4Oqqg8W zx@(d3`rJ&HkFL0k^&*@6Yyak&()Yn_NsYgIHY62NC1pNTG5g>rpcOvrHdq^35k0S6 zJIK2FU3=1yKW8+}s%~`T?9#?vC?$KXmOo0*y)zzp*Ln7ZLbbf|_5)ugNA{dn{>ffA zl_a>hN82efqTA5F+`#`sGdIJ%+}%aG$=Xt!(y3qV{K9nfwemao?Ls7mT>I|)BjKu- z;#V)hz}B79*IpWm`ZhdDdRUKKMyRdUJfrM-FJHs+YnaIc>2AuM^x@xILTwF7$5jQX?`Knc zf6?`PaO1kDi{=$p?zo2FiP!T>HPVf2`gEzvN(niYN+y2MM=n=iF|ZN5ame4cq{4um zuVUm;!_@3-Uz+hRt3~q7Is?fs#cq>4r{lR1Q*pRJd1t7^3T^N)(&391<6HhP+*A;m z;q%iE*g+@*3Uk~1*_Gv|Z>U^;-jeqoAqK2Z*BwT+YGHser)N_w5zdi6YUA9R>i6Po z=nMvn1kJu4!c+ypjuId_Lqu&T`FrK;TP*==uH?(y#~;w=;M3cg(NZF9c$GOa}J zWZi+KbZ0uF7YMr$Z zVNs?r+x7B?rQ|3~v263qV&=K7p;exqI-7G{j>^0)gK+M(z^$R;q;DZd z#}64KUmmd?gCkf_d z5rHlt$(Vy{=3L8jSxQhi>5Rn1#l3-209s0}T(SoqjW$6V5VdyN|PkOZb%Z$u1fu@#BSZp z+YkJSyGx}5$yQ5@+edqVpLp{2ZPSYy(0)Z)Oe2|HCUVxKRmd5MsZ5U)ZWo1qQ0hPYKv(N2N!kz-MTq3mXtw|Xm 
z60+bl(v(6x`Y-mDXJ#)e!#`mbWhn2Gm>chddquM{rb>f~ zeJUNJ&sKKeW8Jg=iNY9+^@w+z>?9!?-2j`1#_R@y>eVbg&d%D!Ac&SYYA3yCR#xWU z1w}>GK+&rPe)#bA<}F3HZ{K#F?{Fm0VTAe-@KC{#?EhSYD5Ca6z1Tskt_|7~ZR=`r z0CC{TL7=58yVleqwpOJzgF8{Q`^Po1k*Ht(^NQ0YiUNOM|I=S%$ZP=lXC#$ z83RQ`=?U&G(gN10Rt5~ zWGTuHNA(xF{4@Y~%;NrQ7fH6dy!%f@x<<7pe`}9TD$EB)MxF(}3tBvY$t=7!cxYQs zQ`eTK66^nk#^C3jNmU^8bah$y`T1R31{^PD(waU?anFu%;E`w?H_Ove9EnY_Ff8(R zDV^_T{c=hn#-TH+R?2j=JyGdsv_y>UM3aN0Dfb1XaWC&i-3G_bMB%K-f>xJUZ->qc z2`P1q^zwBPeP$ebYh#g5htNve}NsFi z0T*Ihw`Jx>7tPj@EC0S>*V#1TU}_x87_-pgB(Feh)2M%PELHRglwj3k6q?Tk3O=`$ zm0_T98}w_6s$)#N?}E%Z=)D1fcDryiA#lU20x0+6_dT)U^7~Xssyez&m;T_;K25dmdV`%<8&mu=Epi#6VC^&(1zK zYfi!7M+3&+eWN5y`FxwATDPu4EF#1JfXiNJv4@4xBmP7lk$jF5tx!x-^o)+4J#k{4 z+1JsvJfT8_kRLgt2tgUW ziJU>k#YLP%Lqh{xk#u7n`Jqis7uPNZR5sERM8_ z(Zj=o&>$rNE2;tvJiKpNeSJVuN}hb5%Uxf&+jd#aHLr#=50dMcM@x`^@Ku@+VgZi1 z0^{4@XpJ46IwfN&Grln}LRIT{9W9pZ4g#dFNHPozo@LZr5eN(mtHgu~sqejeeZYV` z%gLcn`ce%zEBFu{)hn^(EY3%pUA<_>6Y^o~dL47|;+?BtvBNSD?lp5)l5b^;%3oh9 zV#>ymguU1OKiL=EBuF3;u0Pc9)NZnqIjE|<_~!b>#@Lnb{#Ni;?eXHx3Z58v7BO#? z0r(U)%NU6^NafUn<)H<=2Wp2w~XT;9OB=@YyV37*FhhS&ollctV(zs>c9^&UC!*n3A3k7PY}!) 
zJ0NibQpU#I0^@oXz$N8Cng=8@vAg~&IiHxBzD39>(A-YH+^lNAZ9akH*#a()@&o@u#@nH~-k?ar-c*Uq1(250v(pgG|WO3TV} zhvOp`K#-s_q8KCDl6x6ICZfE7p+-#qrS?BZSufD`IJVh`s*fF{zH&byc!SR$$O+as zv}>>)C-@ntAm3Vctppoomeki%N=v`fbPMQHQBi?rOfS~55961F{lg(J6p1t|NQG5J zMqmbYvWuFUcLM?f)*e!9@Sxc$UUHO8B%s5Xjc$48A&nZ00Eh&cAZ-9{$3T?1df(q0 zZcGTs07GbOJ^teW;M4E#(ze?->j1VlQx5__HGd3}f^Y&xoq-jQheS@~OhCi(u0{x% ztP^!nCiIY35kq3qA=umaHhBIY_-ANhwnv|CYAC?C`fDxBQP%m*L`oxE$490Vos5X)?%q%A7i7 z{d+tc>AiDnJ)3~QBUCX7E&zV(UYRV54)_rX;DdM&lx{?Jh+3T0Vt;6ce%Yo_bMmro zK$E+bJ$})E@gAmt7Iq9-iz0@k;Bxj#NyUKT_KQ(|pR%JQd?h}Lnt`EOuwL?{v$Gh$ zt)c(~3t4s+oY*@8`_z78wie2_J9h2*Al$5tQ43eniXfKN#d?ngh#5r1_QOOf=XX?1 z2vkNELX4E)b)u8IaUH%#RCN6Vh zRSkj(crX<8k2DYE1k6($;NjWN#}@#+<{QYj z613J}`ra*W#|6ALZ<|7lWcX-GzCYAzteSNkkwabUED)O=s5Qj85hR}|?9BG~u;yJy z!qR0u)W8Ohi+olvOvq#h7F1X)SzN&m`$YW%XknQQbxSv9!<^@cL5P zEocI5qGVA39WueO0XAnr=%72eKc>^M`(%-V&vxcKq{1iJjt=)vC9?78V51amKznw#C>z({y8 zECoh!Vo(zm@zo_Z!h`76ZR|fI37RTSLNp0t%R1E`0$A%8ru*P~iVf1B*duFa_ttiq z^&=Dp>rf&@O6f?UBkcCePHOpwsuXfY0v3!x_-b%ig9t)3>bIDCASEJ$>moEecYdFd z@~~?&&z?reHjACQ&cBn9K>p+Gk`ml539j=K#P+WbeD>5h@nQzO$?QQn*uobt~wSTpxi;P91O#3 z(;uSVi|>yj=SSm3%o9a9NO)qzV`X{LL&7Ke7uJ$cV#JPyJfJVO!DkdR8r}Vg6E+qw z9?N#p8$9F%sumfB6%@bbCod1@p^z^=3LI^6DRzCcAjf@x!lY(Ot<2MtSDTHP%QGb9Ug6;}s+Gg6U{!2{bSvMOld%4_CD` zyvAgLFZZ=(=3ObbUsG5e8+=)DzA5~Fy$HopAI6bZNbX6HH^q@=LiVSx&xQdTBPd&- zR%r!fI04)xoH`;*wWJWdwYkZz$#gdq3Ao1u5&aT{b&OK0UbrBG0;iYvN0{?~D!XU4 zT5x25M?3IF)?SSeF~<$^X6YR#+RvOj_wb|p-Fx?Szyq%IUq!eXS*Z zzj^Ch{i*H}!b;%eQFwT)9_%_xfT@<#mT9P{&Lxj4vGDfVWnFJM0fkFE5hBKvype;t z9?SM5L@DX$e0v#Of;9XN;SsgH8&RgDo+o14X(F1AAtNJug0EcjiarR!nG0(VKtCTy5Mra^1RHPI- z7E45=11~azwf_uKVz?-b&e-ox-*QmIqSIs5jm340U)O!n2vxPS=gv{_75>+9!~K1R*TT949;KTI?dcwS0!!hFFp&ccNOjdht&Y$cv& z@P;9FWv@F@8zCtouVKfu2pbj;dSjoUWgtIFAje^9<4UO%E3gmrGBuTd>k@`T87N|+ zM1+Brc5E$PYr+8&`A`4FjMS$9$f~;pxxMjUk>j=XlD`R-j_fUK!Omf7iY=v zFS!ADf}9^OuYGw%OXPyW!asv2JuM&>rXZGFdascwqG?$;I7Wv~@ra#3^pzH=~&d3W&VuL{Vq`}}O-(I%{NOBh z>lAn_(olcHr&Yk7T}GJ*rqkFBg9q#}TwruYESk3@Vbx87y+;)6h#@0rCE$S9L%byy 
zh01SP=8Smoc-zibss|1p>_z|d3g!yO+Oh*68T-azTys#gjo3_!$XHFnkcuR7q8=)T zQ=+}1gHaygRv(NhVz!AGNFYRrK-q1DNxF2)E-5hJlMoUR@TYxHgh;^D<97PPo|v{0 zou94|tbgM;sf!rQwOT(IaRrW`0m%H@9oo!!M-2=NAn3Od$Dcd3;1RD*-c{egLnLxN* increase data:params ratio from compute optimal 10.5 (default) to 12) +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN # evaluate the model on a larger chunk of train/val data and draw some samples torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss # evaluate the model on CORE tasks torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval # ----------------------------------------------------------------------------- -# Midtraining (teach the model conversation special tokens, tool use, multiple choice) +# SFT (teach the model conversation special tokens, tool use, multiple choice) # download 2.3MB of synthetic identity conversations to impart a personality to nanochat # see dev/gen_synthetic_data.py for details on how this data was prepared and to get a sense of how you can easily tune it curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl -# run midtraining and eval the model -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid - -# ----------------------------------------------------------------------------- -# Supervised Finetuning (domain adaptation to each sequence all by itself per row) - -# train sft and re-eval right away (should see a small bump) +# run SFT and eval the model torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft @@ -111,15 +96,6 @@ torchrun --standalone 
--nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- - # even better, chat with your model over a pretty WebUI ChatGPT style # python -m scripts.chat_web -# ----------------------------------------------------------------------------- -# Reinforcement Learning. Optional, and currently only on GSM8K -# (optional) - -# run reinforcement learning -# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_rl -- --run=$WANDB_RUN -# eval the RL model only on GSM8K -# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i rl -a GSM8K - # ----------------------------------------------------------------------------- # Generate the full report by putting together all the sections # report.md is the output and will be copied to current directory for convenience diff --git a/scripts/chat_cli.py b/scripts/chat_cli.py index b14843a8..d35c435b 100644 --- a/scripts/chat_cli.py +++ b/scripts/chat_cli.py @@ -2,7 +2,7 @@ New and upgraded chat mode because a lot of the code has changed since the last one. Intended to be run single GPU only atm: -python -m scripts.chat_cli -i mid +python -m scripts.chat_cli """ import argparse import torch diff --git a/scripts/chat_eval.py b/scripts/chat_eval.py index a5583035..cae2f0f8 100644 --- a/scripts/chat_eval.py +++ b/scripts/chat_eval.py @@ -4,8 +4,8 @@ All the generic code lives here, and all the evaluation-specific code lives in nanochat directory and is imported from here. Example runs: -python -m scripts.chat_eval -i mid -a ARC-Easy -torchrun --nproc_per_node=8 -m scripts.chat_eval -- -i mid -a ARC-Easy +python -m scripts.chat_eval -a ARC-Easy +torchrun --nproc_per_node=8 -m scripts.chat_eval -- -a ARC-Easy """ import argparse diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index c0471c43..91300b6f 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -1,65 +1,63 @@ """ -Finetune a base model to be a chat model. -Run on one GPU e.g. 
for debugging: +Supervised fine-tuning (SFT) the model. +Run as: python -m scripts.chat_sft Or torchrun for training: -torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft +torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --device-batch-size=16 """ import argparse import os os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" - +import time import wandb import torch -import torch.distributed as dist from contextlib import nullcontext - -from nanochat.common import compute_init, compute_cleanup, get_base_dir, print0, DummyWandb, autodetect_device_type -from nanochat.checkpoint_manager import load_model +from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type +from nanochat.tokenizer import get_token_bytes from nanochat.checkpoint_manager import save_checkpoint -from nanochat.engine import Engine -from scripts.chat_eval import run_chat_eval +from nanochat.loss_eval import evaluate_bpb +from nanochat.checkpoint_manager import load_model +import torch.distributed as dist from tasks.common import TaskMixture -from tasks.arc import ARC from tasks.gsm8k import GSM8K +from tasks.mmlu import MMLU from tasks.smoltalk import SmolTalk from tasks.customjson import CustomJSON from tasks.spellingbee import SimpleSpelling, SpellingBee # ----------------------------------------------------------------------------- # CLI arguments -parser = argparse.ArgumentParser(description="Supervised finetuning for chat") +parser = argparse.ArgumentParser(description="Supervised fine-tuning (SFT) the model") # Logging parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)") # Runtime parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|bfloat16") # Model loading -parser.add_argument("--source", type=str, default="mid", 
help="base|mid - which checkpoint to load from") parser.add_argument("--model-tag", type=str, default=None, help="model tag to load from") parser.add_argument("--model-step", type=int, default=None, help="model step to load from") # Training horizon -parser.add_argument("--num-epochs", type=int, default=1, help="number of epochs") -parser.add_argument("--num-iterations", type=int, default=-1, help="override number of iterations (-1 = use num_epochs)") +parser.add_argument("--num-iterations", type=int, default=-1, help="number of optimization steps (-1 = full epoch)") # Batch sizes -parser.add_argument("--device-batch-size", type=int, default=4, help="per-device batch size") -parser.add_argument("--target-examples-per-step", type=int, default=32, help="target examples per optimization step") +parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length") +parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") +parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens") # Optimization parser.add_argument("--embedding-lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") -parser.add_argument("--init-lr-frac", type=float, default=0.02, help="initial LR as fraction of base LR") +parser.add_argument("--init-lr-frac", type=float, default=1.0, help="initial LR as fraction of base LR") # Evaluation -parser.add_argument("--eval-every", type=int, default=100, help="evaluate val loss every N steps") -parser.add_argument("--eval-steps", type=int, default=100, help="number of batches for 
val loss evaluation") -parser.add_argument("--eval-metrics-every", type=int, default=200, help="evaluate accuracy metrics every N steps") -parser.add_argument("--eval-metrics-max-problems", type=int, default=1024, help="max problems per metric evaluation") +parser.add_argument("--eval-every", type=int, default=150, help="evaluate val bpb every N steps (-1 = disable)") +parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on") +# Output +parser.add_argument("--dry-run", action="store_true", help="log to wandb but skip checkpoints/report") args = parser.parse_args() user_config = vars(args).copy() # ----------------------------------------------------------------------------- @@ -70,217 +68,320 @@ ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type master_process = ddp_rank == 0 ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16 autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext() +synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None +get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0 # wandb logging init use_dummy_wandb = args.run == "dummy" or not master_process -wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-sft", name=args.run, config=user_config, save_code=True) +wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-sft", name=args.run, config=user_config) # Load the model and tokenizer -model, tokenizer, meta = load_model(args.source, device, phase="train", model_tag=args.model_tag, step=args.model_step) -orig_model = model # original, uncompiled model -# model = torch.compile(model, dynamic=True) # doesn't work super well because of variable lengths of inputs -engine = Engine(model, tokenizer) # will be used for inline model evaluation only +model, tokenizer, meta = 
load_model("base", device, phase="train", model_tag=args.model_tag, step=args.model_step) +pretrain_batch_size = meta.get("device_batch_size", None) +if pretrain_batch_size is not None and args.device_batch_size > pretrain_batch_size: + print0(f"FOOTGUN WARNING: base model training used device_batch_size {pretrain_batch_size}, did you pass in a good --device-batch-size to this script?") +orig_model = model +model = torch.compile(model, dynamic=False) +depth = model.config.n_layer +num_flops_per_token = model.estimate_flops() +tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank +world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks +assert args.total_batch_size % world_tokens_per_fwdbwd == 0 +grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd +print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}") +print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") +print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") +token_bytes = get_token_bytes(device=device) -# ----------------------------------------------------------------------------- -# Task data mixture we'll train on -identity_conversations_filepath = os.path.join(get_base_dir(), "identity_conversations.jsonl") -train_ds = TaskMixture([ - ARC(subset="ARC-Easy", split="train"), # 2.3K rows - ARC(subset="ARC-Challenge", split="train"), # 1.1K rows - GSM8K(subset="main", split="train"), # 8K rows - SmolTalk(split="train", stop=10_000), # 10K rows of smoltalk - CustomJSON(filepath=identity_conversations_filepath), # 1K rows of synthetic identity conversations - SimpleSpelling(size=300, split="train"), # 300 rows of Simple Spelling (e.g. spell the word 'apple') - SpellingBee(size=300, split="train"), # 300 rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?) 
-]) # 2.3K + 1.1K + 8K + 10K + 1K + 0.3K + 0.3K = 23K rows -val_ds = SmolTalk(split="test") # general conversations, 24K rows (though we don't actually use all of it) - -# ----------------------------------------------------------------------------- -# DataLoader - -def sft_data_generator(dataset, batch_size): - pad_token_id = tokenizer.encode_special("<|assistant_end|>") # use <|assistant_end|> as the pad token is ok, these positions are masked in the loss - # prepares a list of tokenized conversations into a batch and yields - def collate_and_yield(batch): - nrows = len(batch) - ncols = max(len(ids) for ids, mask in batch) - 1 # seq of n creates inputs/targets of n-1 - inputs = torch.full((nrows, ncols), pad_token_id, dtype=torch.long) - targets = torch.full((nrows, ncols), -1, dtype=torch.long) # -1 is ignore index - for i, (ids, mask) in enumerate(batch): - n = len(ids) - ids_tensor = torch.tensor(ids, dtype=torch.long) - inputs[i, :n-1] = ids_tensor[:-1] - # recall -1 is the ignore index, so mask out targets where mask is 0 - row_targets = ids_tensor[1:] - # mask[1:] omits the mask for the BOS token, which is never a target atm so it's ok - mask_tensor = torch.tensor(mask[1:], dtype=torch.long) - row_targets[mask_tensor == 0] = -1 # mask out targets where mask is 0 - targets[i, :n-1] = row_targets - inputs = inputs.to(device) # move to device - targets = targets.to(device) - return inputs, targets - # iterates over the dataset in epochs, tokenizes - batch = [] - while True: - for i in range(ddp_rank, len(dataset), ddp_world_size): - doc = dataset[i] - ids, mask = tokenizer.render_conversation(doc) - batch.append((ids, mask)) - if len(batch) == batch_size: - yield collate_and_yield(batch) - batch = [] - -examples_per_step = args.device_batch_size * ddp_world_size -print0(f"Target examples per step: {args.target_examples_per_step}") -print0(f"Device batch size: {args.device_batch_size}") -print0(f"Examples per step is device_batch_size * ddp_world_size: 
{examples_per_step}") -assert args.target_examples_per_step % examples_per_step == 0, "Target examples per step must be divisible by examples per step" -grad_accum_steps = args.target_examples_per_step // examples_per_step -print0(f"=> Setting grad accum steps: {grad_accum_steps}") - -if args.num_iterations == -1: - # derive num_iterations from num_epochs and the size of the dataset - assert args.num_epochs > 0, "num_epochs must be positive if num_iterations is -1" - num_iterations = (len(train_ds) // args.target_examples_per_step) * args.num_epochs -else: - num_iterations = args.num_iterations -train_loader = sft_data_generator(train_ds, batch_size=args.device_batch_size) -build_val_loader = lambda: sft_data_generator(val_ds, batch_size=args.device_batch_size) - -# ----------------------------------------------------------------------------- -# Initialize the Optimizer - -optimizer = model.setup_optimizer( - unembedding_lr=args.unembedding_lr, - embedding_lr=args.embedding_lr, - matrix_lr=args.matrix_lr, - weight_decay=args.weight_decay, -) -# Set the initial learning rate as a fraction of the base learning rate +# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) +optimizer = model.setup_optimizer(unembedding_lr=args.unembedding_lr, embedding_lr=args.embedding_lr, matrix_lr=args.matrix_lr, weight_decay=args.weight_decay) +# Override the initial learning rate as a fraction of the base learning rate for group in optimizer.param_groups: group["lr"] = group["lr"] * args.init_lr_frac group["initial_lr"] = group["lr"] -# ----------------------------------------------------------------------------- -# Training loop +# SFT data mixture and DataLoader +base_dir = get_base_dir() +identity_conversations_filepath = os.path.join(base_dir, "identity_conversations.jsonl") +train_dataset = TaskMixture([ + SmolTalk(split="train"), # 460K rows of general conversations + MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple 
choice problems drawn from ARC, MC_TEST, OBQA, RACE + GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use + GSM8K(subset="main", split="train"), # 2 epochs of GSM8K + CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations + CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these + SimpleSpelling(size=200000, split="train"), # 200K rows of Simple Spelling (e.g. spell the word 'apple') + SpellingBee(size=80000, split="train"), # 80K rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?) +]) # total: 460K + 100K + 16K + 200K + 80K = 856K rows +val_dataset = TaskMixture([ + SmolTalk(split="test"), # 24K rows in test set + MMLU(subset="all", split="test", stop=5200), # 14K rows in test set, use only 5.2K to match the train ratios + GSM8K(subset="main", split="test", stop=420), # 1.32K rows in test set, use only 420 to match the train ratios +]) # total: 24K + 14K + 1.32K ~= 39K rows +# DataLoader is defined here, it emits inputs, targets : 2D tensors of shape (device_batch_size, max_seq_len) +# A big problem is that we don't know the final num_iterations in advance. So we create +# these two global variables and update them from within the data generator. +last_step = False # we will toggle this to True when we reach the end of the training dataset +approx_progress = 0.0 # will go from 0 to 1 over the course of the epoch +current_epoch = 1 # track epoch for logging +def sft_data_generator_bos_bestfit(split, buffer_size=100): + """ + BOS-aligned dataloader for SFT with bestfit-pad packing. + + Each row in the batch starts with BOS (beginning of a conversation). + Conversations are packed using best-fit algorithm. When no conversation fits, + the row is padded (instead of cropping) to ensure no tokens are ever discarded. + Padding positions have targets masked with -1 (ignore_index for cross-entropy). 
+ """ + global last_step, approx_progress, current_epoch + assert split in {"train", "val"}, "split must be 'train' or 'val'" + dataset = train_dataset if split == "train" else val_dataset + dataset_size = len(dataset) + assert dataset_size > 0 + row_capacity = args.max_seq_len + 1 # +1 for target at last position + bos_token = tokenizer.get_bos_token_id() + + # Conversation buffer: list of token lists + conv_buffer = [] + cursor = ddp_rank # Each rank processes different conversations (for fetching) + consumed = ddp_rank # Track actual consumption separately from buffering + epoch = 1 + it = 0 # iteration counter + + def refill_buffer(): + nonlocal cursor, epoch + while len(conv_buffer) < buffer_size: + conversation = dataset[cursor] + ids, _ = tokenizer.render_conversation(conversation) + conv_buffer.append(ids) + cursor += ddp_world_size + if cursor >= dataset_size: + cursor = cursor % dataset_size + epoch += 1 + # Note: last_step is now triggered based on consumption, not fetching + + while True: + rows = [] + row_lengths = [] # Track actual content length (excluding padding) for each row + for _ in range(args.device_batch_size): + row = [] + padded = False + while len(row) < row_capacity: + # Ensure buffer has conversations + while len(conv_buffer) < buffer_size: + refill_buffer() + + remaining = row_capacity - len(row) + + # Find largest conversation that fits entirely + best_idx = -1 + best_len = 0 + for i, conv in enumerate(conv_buffer): + conv_len = len(conv) + if conv_len <= remaining and conv_len > best_len: + best_idx = i + best_len = conv_len + + if best_idx >= 0: + # Found a conversation that fits - use it entirely + conv = conv_buffer.pop(best_idx) + row.extend(conv) + consumed += ddp_world_size # Track actual consumption + else: + # No conversation fits - pad the remainder instead of cropping + # This ensures we never discard any tokens + content_len = len(row) + row.extend([bos_token] * remaining) # Pad with BOS tokens + padded = True + break # Row 
is now full (with padding) + + # Track content length: full row if no padding, otherwise the length before padding + if padded: + row_lengths.append(content_len) + else: + row_lengths.append(row_capacity) + rows.append(row[:row_capacity]) + + # Stopping condition to respect num_iterations, if given + it += 1 + if 0 < args.num_iterations <= it and split == "train": + last_step = True + + # Update progress tracking (based on consumed, not cursor, to account for buffering) + if split == "train": + current_epoch = epoch + if args.num_iterations > 0: + approx_progress = it / args.num_iterations + else: + approx_progress = consumed / dataset_size + # Trigger last_step when we've consumed enough (instead of when cursor wraps) + if consumed >= dataset_size: + last_step = True + + # Build tensors + use_cuda = device_type == "cuda" + batch_tensor = torch.tensor(rows, dtype=torch.long, pin_memory=use_cuda) + inputs = batch_tensor[:, :-1].to(device=device, dtype=torch.int32, non_blocking=use_cuda) + targets = batch_tensor[:, 1:].to(device=device, dtype=torch.int64, non_blocking=use_cuda) + + # Mask out padding positions in targets (set to -1 = ignore_index) + # For each row, positions >= (content_length - 1) in targets should be masked + for i, content_len in enumerate(row_lengths): + if content_len < row_capacity: + targets[i, content_len-1:] = -1 + + yield inputs, targets + +train_loader = sft_data_generator_bos_bestfit("train") +build_val_loader = lambda: sft_data_generator_bos_bestfit("val") +progress = 0 # will go from 0 to 1 over the course of the epoch # Learning rate scheduler -def get_lr_multiplier(it): - lrm = 1.0 - it / num_iterations - return lrm +def get_lr_multiplier(progress): + # first 80% of training: no decay, then linearly ramp down to 0. + return 1 if progress < 0.8 else 1 - (progress - 0.8) / 0.2 -# Go! 
+# Momentum scheduler for Muon optimizer +def get_muon_momentum(it): + frac = min(it / 300, 1) + momentum = (1 - frac) * 0.85 + frac * 0.95 + return momentum + +# ----------------------------------------------------------------------------- +# Training loop +x, y = next(train_loader) # prefetch the very first batch of data +min_val_bpb = float("inf") +smooth_train_loss = 0 # EMA of training loss +ema_beta = 0.9 # EMA decay factor +total_training_time = 0 # total wall-clock time of training step = 0 -for step in range(num_iterations): - last_step = step == num_iterations - 1 +while True: + flops_so_far = num_flops_per_token * args.total_batch_size * step - # evaluate the validation loss - if last_step or step % args.eval_every == 0: + # Synchronize last_step across all ranks to avoid hangs in the distributed setting + if ddp: + last_step_tensor = torch.tensor(last_step, dtype=torch.int32, device=device) + dist.all_reduce(last_step_tensor, op=dist.ReduceOp.MAX) + last_step = bool(last_step_tensor.item()) + + # once in a while: evaluate the val bpb (all ranks participate) + if last_step or (args.eval_every > 0 and step % args.eval_every == 0): model.eval() val_loader = build_val_loader() - losses = [] - for _ in range(args.eval_steps): - val_inputs, val_targets = next(val_loader) - with torch.no_grad(), autocast_ctx: - loss = model(val_inputs, val_targets) - losses.append(loss) - val_loss = torch.stack(losses).mean() # average over eval_steps - if ddp: - dist.all_reduce(val_loss, op=dist.ReduceOp.AVG) # average over ranks - val_loss = val_loss.item() - print0(f"Step {step:05d} | Validation loss: {val_loss:.6f}") + eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size) + with autocast_ctx: + val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes) + print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}") + if val_bpb < min_val_bpb: + min_val_bpb = val_bpb wandb_run.log({ "step": step, - "val_loss": val_loss, + 
"total_training_flops": flops_so_far, + "total_training_time": total_training_time, + "val/bpb": val_bpb, }) model.train() - # evaluate accuracy of the multiple choice tasks (which are quick to run) - if last_step or (step > 0 and step % args.eval_metrics_every == 0): - model.eval() - metrics = {} - with torch.no_grad(), autocast_ctx: - # note that because these are inside no_grad, we can usually afford to at least ~2X the batch size - metrics["mmlu_acc"] = run_chat_eval("MMLU", model, tokenizer, engine, batch_size=args.device_batch_size*2, max_problems=args.eval_metrics_max_problems) - metrics["arc_easy_acc"] = run_chat_eval("ARC-Easy", model, tokenizer, engine, batch_size=args.device_batch_size*2, max_problems=args.eval_metrics_max_problems) - metrics_str = ', '.join(f'{k}: {v:.6f}' for k, v in metrics.items()) - print0(f"Step {step:05d} | {metrics_str}") - wandb_run.log({ - "step": step, - **metrics, - }) - model.train() + # save checkpoint at the end of the run (only on master process) + if master_process and last_step and not args.dry_run: + output_dirname = args.model_tag if args.model_tag else f"d{depth}" # e.g. 
d12 + checkpoint_dir = os.path.join(base_dir, "sft_checkpoints", output_dirname) + save_checkpoint( + checkpoint_dir, + step, + orig_model.state_dict(), + optimizer.state_dict(), + { + "step": step, + "val_bpb": val_bpb, # loss at last step + "model_config": { + "sequence_len": args.max_seq_len, + "vocab_size": tokenizer.get_vocab_size(), + "n_layer": depth, + "n_head": model.config.n_head, + "n_kv_head": model.config.n_kv_head, + "n_embd": model.config.n_embd, + }, + "user_config": user_config, # inputs to the training script + } + ) if last_step: break + # ------------------------------------------------------------------------- + # single training step # evaluate the gradient - num_tokens = torch.tensor(0, device=device) # the number of "active" tokens of supervision seen + synchronize() + t0 = time.time() for micro_step in range(grad_accum_steps): - train_inputs, train_targets = next(train_loader) with autocast_ctx: - loss = model(train_inputs, train_targets) + loss = model(x, y) train_loss = loss.detach() # for logging loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here - loss.backward() # accumulate the gradient - num_tokens += (train_targets >= 0).sum() - if ddp: - dist.all_reduce(num_tokens, op=dist.ReduceOp.SUM) # sum over ranks - - # learning rate scheduler - lrm = get_lr_multiplier(step) + loss.backward() + x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward + progress = max(progress, approx_progress) # only increase progress monotonically + # step the optimizer + lrm = get_lr_multiplier(progress) + muon_momentum = get_muon_momentum(step) for group in optimizer.param_groups: group["lr"] = group["initial_lr"] * lrm - - # step the optimizer + if group['kind'] == 'muon': + group["momentum"] = muon_momentum optimizer.step() model.zero_grad(set_to_none=True) + synchronize() + t1 = time.time() + dt = t1 - t0 + # 
------------------------------------------------------------------------- - # logging - train_loss_item = train_loss.item() - num_tokens_item = num_tokens.item() - print0(f"Step {step:05d}/{num_iterations:05d} | Training loss: {train_loss_item:.6f}| lrm: {lrm:.6f}| num_tokens: {num_tokens_item:,}") - wandb_run.log({ - "step": step, - "lrm": lrm, - "train_loss": train_loss_item, - "num_tokens": num_tokens_item, - }) + # State step += 1 -# Save the model at the end of the run -if master_process: - base_dir = get_base_dir() - depth = model.config.n_layer - output_dirname = args.model_tag if args.model_tag else f"d{depth}" # e.g. d12 - checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", output_dirname) - model_config_kwargs = model.config.__dict__ # slightly naughty, abusing the simplicity of GPTConfig, TODO nicer - save_checkpoint( - checkpoint_dir, - step, - model.state_dict(), - None, # note: we don't bother to save the optimizer state - { + # logging + smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss + debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA + pct_done = 100 * progress + tok_per_sec = int(args.total_batch_size / dt) + flops_per_sec = num_flops_per_token * args.total_batch_size / dt + promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity + mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in % + if step > 10: + total_training_time += dt # only count the time after the first 10 steps + print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {current_epoch} | total time: {total_training_time/60:.2f}m") + if step % 10 == 0: + wandb_run.log({ "step": step, - "val_loss": val_loss, - **metrics, - "model_config": model_config_kwargs, - } - ) - print(f"✅ Saved model checkpoint to 
{checkpoint_dir}") + "total_training_flops": flops_so_far, + "total_training_time": total_training_time, + "train/loss": debiased_smooth_loss, + "train/lrm": lrm, + "train/dt": dt, + "train/tok_per_sec": tok_per_sec, + "train/mfu": mfu, + "train/epoch": current_epoch, + }) + +# print a few more stats +print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB") +print0(f"Total training time: {total_training_time/60:.2f}m") +print0(f"Minimum validation bpb: {min_val_bpb:.4f}") # Log to report -from nanochat.report import get_report -get_report().log(section="Chat SFT", data=[ - user_config, # CLI args - { - "Training rows": len(train_ds), - "Number of iterations": num_iterations, - "Training loss": train_loss_item, - "Validation loss": val_loss, - }, -]) +if not args.dry_run: + from nanochat.report import get_report + get_report().log(section="SFT", data=[ + user_config, # CLI args + { # stats about the training setup + "Number of iterations": step, + "DDP world size": ddp_world_size, + }, + { # stats about training outcomes + "Minimum validation bpb": min_val_bpb, + } + ]) -# Cleanup -wandb_run.finish() +# cleanup +wandb_run.finish() # wandb run finish compute_cleanup() diff --git a/scripts/mid_train.py b/scripts/mid_train.py deleted file mode 100644 index 54c5fb09..00000000 --- a/scripts/mid_train.py +++ /dev/null @@ -1,386 +0,0 @@ -""" -Midtrain the model. Same as pretraining but simpler. 
-Run as: - -python -m scripts.mid_train - -Or torchrun for training: - -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device-batch-size=16 -""" - -import argparse -import os -os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import time -import wandb -import torch -from contextlib import nullcontext -from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type -from nanochat.tokenizer import get_token_bytes -from nanochat.checkpoint_manager import save_checkpoint -from nanochat.loss_eval import evaluate_bpb -from nanochat.checkpoint_manager import load_model -import torch.distributed as dist - -from tasks.common import TaskMixture -from tasks.gsm8k import GSM8K -from tasks.mmlu import MMLU -from tasks.smoltalk import SmolTalk -from tasks.customjson import CustomJSON -from tasks.spellingbee import SimpleSpelling, SpellingBee - -# ----------------------------------------------------------------------------- -# CLI arguments -parser = argparse.ArgumentParser(description="Midtrain the model") -# Logging -parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)") -# Runtime -parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") -parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|bfloat16") -# Model loading -parser.add_argument("--model-tag", type=str, default=None, help="model tag to load from") -parser.add_argument("--model-step", type=int, default=None, help="model step to load from") -# Training horizon -parser.add_argument("--num-iterations", type=int, default=-1, help="number of optimization steps (-1 = full epoch)") -# Batch sizes -parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length") -parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") -parser.add_argument("--total-batch-size", 
type=int, default=524288, help="total batch size in tokens") -# Optimization -parser.add_argument("--embedding-lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") -parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") -parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)") -parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") -parser.add_argument("--init-lr-frac", type=float, default=1.0, help="initial LR as fraction of base LR") -# Evaluation -parser.add_argument("--eval-every", type=int, default=150, help="evaluate val bpb every N steps (-1 = disable)") -parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on") -# Output -parser.add_argument("--dry-run", action="store_true", help="log to wandb but skip checkpoints/report") -args = parser.parse_args() -user_config = vars(args).copy() -# ----------------------------------------------------------------------------- - -# Compute init -device_type = autodetect_device_type() if args.device_type == "" else args.device_type -ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) -master_process = ddp_rank == 0 -ptdtype = torch.float32 if args.dtype == 'float32' else torch.bfloat16 -autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext() -synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None -get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0 - -# wandb logging init -use_dummy_wandb = args.run == "dummy" or not master_process -wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-mid", name=args.run, config=user_config) - -# Load the model and tokenizer -model, 
tokenizer, meta = load_model("base", device, phase="train", model_tag=args.model_tag, step=args.model_step) -pretrain_batch_size = meta.get("device_batch_size", None) -if pretrain_batch_size is not None and args.device_batch_size > pretrain_batch_size: - print0(f"FOOTGUN WARNING: base model training used device_batch_size {pretrain_batch_size}, did you pass in a good --device-batch-size to this script?") -orig_model = model -model = torch.compile(model, dynamic=False) -depth = model.config.n_layer -num_flops_per_token = model.estimate_flops() -tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank -world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks -assert args.total_batch_size % world_tokens_per_fwdbwd == 0 -grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd -print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}") -print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") -print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") -token_bytes = get_token_bytes(device=device) - -# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) -optimizer = model.setup_optimizer(unembedding_lr=args.unembedding_lr, embedding_lr=args.embedding_lr, matrix_lr=args.matrix_lr, weight_decay=args.weight_decay) -# Override the initial learning rate as a fraction of the base learning rate -for group in optimizer.param_groups: - group["lr"] = group["lr"] * args.init_lr_frac - group["initial_lr"] = group["lr"] - -# Midtraining data mixture and DataLoader -base_dir = get_base_dir() -identity_conversations_filepath = os.path.join(base_dir, "identity_conversations.jsonl") -train_dataset = TaskMixture([ - SmolTalk(split="train"), # 460K rows of general conversations - MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple 
choice problems drawn from ARC, MC_TEST, OBQA, RACE - GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use - CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations - CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these - SimpleSpelling(size=200000, split="train"), # 200K rows of Simple Spelling (e.g. spell the word 'apple') - SpellingBee(size=80000, split="train"), # 80K rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?) -]) # total: 460K + 100K + 8K + 200K + 80K = 848K rows -val_dataset = TaskMixture([ - SmolTalk(split="test"), # 24K rows in test set - MMLU(subset="all", split="test", stop=5200), # 14K rows in test set, use only 5.2K to match the train ratios - GSM8K(subset="main", split="test", stop=420), # 1.32K rows in test set, use only 420 to match the train ratios -]) # total: 24K + 14K + 1.32K ~= 39K rows -# DataLoader is defined here, it emits inputs, targets : 2D tensors of shape (device_batch_size, max_seq_len) -# A big problem is that we don't know the final num_iterations in advance. So we create -# these two global variables and update them from within the data generator. -last_step = False # we will toggle this to True when we reach the end of the training dataset -approx_progress = 0.0 # will go from 0 to 1 over the course of the epoch -current_epoch = 1 # track epoch for logging -def mid_data_generator_bos_bestfit(split, buffer_size=100): - """ - BOS-aligned dataloader for midtraining with bestfit-pad packing. - - Each row in the batch starts with BOS (beginning of a conversation). - Conversations are packed using best-fit algorithm. When no conversation fits, - the row is padded (instead of cropping) to ensure no tokens are ever discarded. - Padding positions have targets masked with -1 (ignore_index for cross-entropy). 
- """ - global last_step, approx_progress, current_epoch - assert split in {"train", "val"}, "split must be 'train' or 'val'" - dataset = train_dataset if split == "train" else val_dataset - dataset_size = len(dataset) - assert dataset_size > 0 - row_capacity = args.max_seq_len + 1 # +1 for target at last position - bos_token = tokenizer.get_bos_token_id() - - # Conversation buffer: list of token lists - conv_buffer = [] - cursor = ddp_rank # Each rank processes different conversations (for fetching) - consumed = ddp_rank # Track actual consumption separately from buffering - epoch = 1 - it = 0 # iteration counter - - def refill_buffer(): - nonlocal cursor, epoch - while len(conv_buffer) < buffer_size: - conversation = dataset[cursor] - ids, _ = tokenizer.render_conversation(conversation) - conv_buffer.append(ids) - cursor += ddp_world_size - if cursor >= dataset_size: - cursor = cursor % dataset_size - epoch += 1 - # Note: last_step is now triggered based on consumption, not fetching - - while True: - rows = [] - row_lengths = [] # Track actual content length (excluding padding) for each row - for _ in range(args.device_batch_size): - row = [] - padded = False - while len(row) < row_capacity: - # Ensure buffer has conversations - while len(conv_buffer) < buffer_size: - refill_buffer() - - remaining = row_capacity - len(row) - - # Find largest conversation that fits entirely - best_idx = -1 - best_len = 0 - for i, conv in enumerate(conv_buffer): - conv_len = len(conv) - if conv_len <= remaining and conv_len > best_len: - best_idx = i - best_len = conv_len - - if best_idx >= 0: - # Found a conversation that fits - use it entirely - conv = conv_buffer.pop(best_idx) - row.extend(conv) - consumed += ddp_world_size # Track actual consumption - else: - # No conversation fits - pad the remainder instead of cropping - # This ensures we never discard any tokens - content_len = len(row) - row.extend([bos_token] * remaining) # Pad with BOS tokens - padded = True - break # Row 
is now full (with padding) - - # Track content length: full row if no padding, otherwise the length before padding - if padded: - row_lengths.append(content_len) - else: - row_lengths.append(row_capacity) - rows.append(row[:row_capacity]) - - # Stopping condition to respect num_iterations, if given - it += 1 - if 0 < args.num_iterations <= it and split == "train": - last_step = True - - # Update progress tracking (based on consumed, not cursor, to account for buffering) - if split == "train": - current_epoch = epoch - if args.num_iterations > 0: - approx_progress = it / args.num_iterations - else: - approx_progress = consumed / dataset_size - # Trigger last_step when we've consumed enough (instead of when cursor wraps) - if consumed >= dataset_size: - last_step = True - - # Build tensors - use_cuda = device_type == "cuda" - batch_tensor = torch.tensor(rows, dtype=torch.long, pin_memory=use_cuda) - inputs = batch_tensor[:, :-1].to(device=device, dtype=torch.int32, non_blocking=use_cuda) - targets = batch_tensor[:, 1:].to(device=device, dtype=torch.int64, non_blocking=use_cuda) - - # Mask out padding positions in targets (set to -1 = ignore_index) - # For each row, positions >= (content_length - 1) in targets should be masked - for i, content_len in enumerate(row_lengths): - if content_len < row_capacity: - targets[i, content_len-1:] = -1 - - yield inputs, targets - -train_loader = mid_data_generator_bos_bestfit("train") -build_val_loader = lambda: mid_data_generator_bos_bestfit("val") -progress = 0 # will go from 0 to 1 over the course of the epoch - -# Learning rate scheduler -def get_lr_multiplier(progress): - # first 80% of training: no decay, then linearly ramp down to 0. 
- return 1 if progress < 0.8 else 1 - (progress - 0.8) / 0.2 - -# Momentum scheduler for Muon optimizer -def get_muon_momentum(it): - frac = min(it / 300, 1) - momentum = (1 - frac) * 0.85 + frac * 0.95 - return momentum - -# ----------------------------------------------------------------------------- -# Training loop -x, y = next(train_loader) # prefetch the very first batch of data -min_val_bpb = float("inf") -smooth_train_loss = 0 # EMA of training loss -ema_beta = 0.9 # EMA decay factor -total_training_time = 0 # total wall-clock time of training -step = 0 -while True: - flops_so_far = num_flops_per_token * args.total_batch_size * step - - # Synchronize last_step across all ranks to avoid hangs in the distributed setting - if ddp: - last_step_tensor = torch.tensor(last_step, dtype=torch.int32, device=device) - dist.all_reduce(last_step_tensor, op=dist.ReduceOp.MAX) - last_step = bool(last_step_tensor.item()) - - # once in a while: evaluate the val bpb (all ranks participate) - if last_step or (args.eval_every > 0 and step % args.eval_every == 0): - model.eval() - val_loader = build_val_loader() - eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size) - with autocast_ctx: - val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes) - print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}") - if val_bpb < min_val_bpb: - min_val_bpb = val_bpb - wandb_run.log({ - "step": step, - "total_training_flops": flops_so_far, - "total_training_time": total_training_time, - "val/bpb": val_bpb, - }) - model.train() - - # save checkpoint at the end of the run (only on master process) - if master_process and last_step and not args.dry_run: - output_dirname = args.model_tag if args.model_tag else f"d{depth}" # e.g. 
d12 - checkpoint_dir = os.path.join(base_dir, "mid_checkpoints", output_dirname) - save_checkpoint( - checkpoint_dir, - step, - orig_model.state_dict(), - optimizer.state_dict(), - { - "step": step, - "val_bpb": val_bpb, # loss at last step - "model_config": { - "sequence_len": args.max_seq_len, - "vocab_size": tokenizer.get_vocab_size(), - "n_layer": depth, - "n_head": model.config.n_head, - "n_kv_head": model.config.n_kv_head, - "n_embd": model.config.n_embd, - }, - "user_config": user_config, # inputs to the training script - } - ) - - if last_step: - break - - # ------------------------------------------------------------------------- - # single training step - # evaluate the gradient - synchronize() - t0 = time.time() - for micro_step in range(grad_accum_steps): - with autocast_ctx: - loss = model(x, y) - train_loss = loss.detach() # for logging - loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here - loss.backward() - x, y = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward - progress = max(progress, approx_progress) # only increase progress monotonically - # step the optimizer - lrm = get_lr_multiplier(progress) - muon_momentum = get_muon_momentum(step) - for group in optimizer.param_groups: - group["lr"] = group["initial_lr"] * lrm - if group['kind'] == 'muon': - group["momentum"] = muon_momentum - optimizer.step() - model.zero_grad(set_to_none=True) - synchronize() - t1 = time.time() - dt = t1 - t0 - # ------------------------------------------------------------------------- - - # State - step += 1 - - # logging - smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss.item() # EMA the training loss - debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA - pct_done = 100 * progress - tok_per_sec = int(args.total_batch_size / dt) - flops_per_sec = num_flops_per_token * args.total_batch_size / dt - promised_flops_per_sec_h100 = 
989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity - mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in % - if step > 10: - total_training_time += dt # only count the time after the first 10 steps - print0(f"step {step:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {current_epoch} | total time: {total_training_time/60:.2f}m") - if step % 10 == 0: - wandb_run.log({ - "step": step, - "total_training_flops": flops_so_far, - "total_training_time": total_training_time, - "train/loss": debiased_smooth_loss, - "train/lrm": lrm, - "train/dt": dt, - "train/tok_per_sec": tok_per_sec, - "train/mfu": mfu, - "train/epoch": current_epoch, - }) - -# print a few more stats -print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB") -print0(f"Total training time: {total_training_time/60:.2f}m") -print0(f"Minimum validation bpb: {min_val_bpb:.4f}") - -# Log to report -if not args.dry_run: - from nanochat.report import get_report - get_report().log(section="Midtraining", data=[ - user_config, # CLI args - { # stats about the training setup - "Number of iterations": step, - "DDP world size": ddp_world_size, - }, - { # stats about training outcomes - "Minimum validation bpb": min_val_bpb, - } - ]) - -# cleanup -wandb_run.finish() # wandb run finish -compute_cleanup() From 0307997f9bbfc0f40fb81c498d6a550e4d392b8c Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 1 Feb 2026 02:36:43 +0000 Subject: [PATCH 084/119] merge two files base_loss and base_eval into a single file, it's nicer this way, and unify the huggingface code associated with both --- README.md | 3 +- runs/runcpu.sh | 3 +- runs/speedrun.sh | 4 +- scripts/base_eval.py | 315 +++++++++++++++++++++++++++++-------------- scripts/base_loss.py | 155 --------------------- 5 files changed, 219 insertions(+), 261 deletions(-) delete mode 100644 scripts/base_loss.py diff --git 
a/README.md b/README.md index 800c5d9d..62834375 100644 --- a/README.md +++ b/README.md @@ -142,8 +142,7 @@ I've published a number of guides that might contain helpful information: │ ├── scaling_laws.sh # Scaling laws experiments │ └── speedrun.sh # Train the ~$100 nanochat d20 ├── scripts -│ ├── base_eval.py # Base model: calculate CORE score -│ ├── base_loss.py # Base model: calculate bits per byte, sample +│ ├── base_eval.py # Base model: CORE score, bits per byte, samples │ ├── base_train.py # Base model: train │ ├── chat_cli.py # Chat model: talk to over CLI │ ├── chat_eval.py # Chat model: eval tasks diff --git a/runs/runcpu.sh b/runs/runcpu.sh index f3837265..853fa1f3 100755 --- a/runs/runcpu.sh +++ b/runs/runcpu.sh @@ -42,8 +42,7 @@ python -m scripts.base_train \ --sample-every=100 \ --num-iterations=5000 \ --run=$WANDB_RUN -python -m scripts.base_loss --device-batch-size=1 --split-tokens=16384 -python -m scripts.base_eval --max-per-task=16 +python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16 # SFT (~10 minutes on my MacBook Pro M3 Max) curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl diff --git a/runs/speedrun.sh b/runs/speedrun.sh index a9612c0a..a709462d 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -74,9 +74,7 @@ NPROC_PER_NODE=8 # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12) torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN -# evaluate the model on a larger chunk of train/val data and draw some samples -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss -# evaluate the model on CORE tasks +# evaluate the model: CORE metric, BPB on train/val, and draw samples torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m 
scripts.base_eval # ----------------------------------------------------------------------------- diff --git a/scripts/base_eval.py b/scripts/base_eval.py index bd83ff36..57f9fd43 100644 --- a/scripts/base_eval.py +++ b/scripts/base_eval.py @@ -1,13 +1,23 @@ """ -Evaluate the CORE metric for a given model. +Unified evaluation script for base models. -Run on a single GPU: -python -m scripts.base_eval +Supports three evaluation modes (comma-separated): + --eval core : CORE metric (accuracy on ICL tasks) + --eval bpb : Bits per byte on train/val splits + --eval sample : Generate samples from the model -Run with torchrun on e.g. 8 GPUs: -torchrun --nproc_per_node=8 -m scripts.base_eval +Default is all three: --eval core,bpb,sample -The script will print the CORE metric to the console. +Examples: + + # Evaluate a HuggingFace model (e.g. GPT-2 124M) using 8 GPUs + torchrun --nproc_per_node=8 -m scripts.base_eval --hf-path openai-community/gpt2 + + # Evaluate a nanochat model (e.g. d24) using 8 GPUs + torchrun --nproc_per_node=8 -m scripts.base_eval --model-tag d24 --device-batch-size=16 + + # Quick/approximate evaluation using a single GPU + python -m scripts.base_eval --model-tag d24 --device-batch-size=16 --max-per-task=100 --split-tokens=524288 """ import os import csv @@ -18,24 +28,74 @@ import shutil import random import zipfile import tempfile +import argparse from contextlib import nullcontext import torch from nanochat.common import compute_init, compute_cleanup, print0, get_base_dir, autodetect_device_type, download_file_with_lock -from nanochat.tokenizer import HuggingFaceTokenizer +from nanochat.tokenizer import HuggingFaceTokenizer, get_token_bytes from nanochat.checkpoint_manager import load_model from nanochat.core_eval import evaluate_task +from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit +from nanochat.loss_eval import evaluate_bpb +from nanochat.engine import Engine # 
----------------------------------------------------------------------------- -# nanochat specific function dealing with I/O etc. +# HuggingFace loading utilities + +class ModelWrapper: + """Lightweight wrapper to give HuggingFace models a nanochat-compatible interface.""" + def __init__(self, model, max_seq_len=None): + self.model = model + self.max_seq_len = max_seq_len + + def __call__(self, input_ids, targets=None, loss_reduction='mean'): + logits = self.model(input_ids).logits + if targets is None: + return logits + loss = torch.nn.functional.cross_entropy( + logits.view(-1, logits.size(-1)), + targets.view(-1), + ignore_index=-1, + reduction=loss_reduction + ) + return loss + + def get_device(self): + return next(self.model.parameters()).device + + +def load_hf_model(hf_path: str, device): + """Load a HuggingFace model and tokenizer.""" + print0(f"Loading HuggingFace model from: {hf_path}") + from transformers import AutoModelForCausalLM + model = AutoModelForCausalLM.from_pretrained(hf_path) + model.to(device) + model.eval() + max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None + model = ModelWrapper(model, max_seq_len=max_seq_len) + tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) + return model, tokenizer + + +def get_hf_token_bytes(tokenizer, device="cpu"): + """Compute token_bytes tensor for a HuggingFace tokenizer.""" + vocab_size = tokenizer.tokenizer.get_vocab_size() + token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device) + for token_id in range(vocab_size): + token_str = tokenizer.tokenizer.decode([token_id]) + token_bytes[token_id] = len(token_str.encode('utf-8')) + return token_bytes + +# ----------------------------------------------------------------------------- +# CORE evaluation -# ~162MB of data needed to evaluate the CORE metric EVAL_BUNDLE_URL = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip" + def place_eval_bundle(file_path): - # here file_path is the path to the 
eval_bundle.zip file - # we need to unzip it and place it in the base directory + """Unzip eval_bundle.zip and place it in the base directory.""" base_dir = get_base_dir() eval_bundle_dir = os.path.join(base_dir, "eval_bundle") with tempfile.TemporaryDirectory() as tmpdir: @@ -45,25 +105,27 @@ def place_eval_bundle(file_path): shutil.move(extracted_bundle_dir, eval_bundle_dir) print0(f"Placed eval_bundle directory at {eval_bundle_dir}") -def evaluate_model(model, tokenizer, device, max_per_task=-1): + +def evaluate_core(model, tokenizer, device, max_per_task=-1): """ Evaluate a base model on the CORE benchmark. - - max_per_task: crop the data to this many examples per task for testing (-1 = disable) + Returns dict with results, centered_results, and core_metric. """ - # Load config and task metadata base_dir = get_base_dir() eval_bundle_dir = os.path.join(base_dir, "eval_bundle") - # Download the eval bundle to disk (and unzip if needed) + # Download the eval bundle if needed if not os.path.exists(eval_bundle_dir): download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle) + config_path = os.path.join(eval_bundle_dir, "core.yaml") data_base_path = os.path.join(eval_bundle_dir, "eval_data") eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") + with open(config_path, 'r', encoding='utf-8') as f: config = yaml.safe_load(f) tasks = config['icl_tasks'] - # Load random baseline values from eval metadata + # Load random baseline values random_baselines = {} with open(eval_meta_data, 'r', encoding='utf-8') as f: reader = csv.DictReader(f) @@ -86,27 +148,23 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): } print0(f"Evaluating: {label} ({task_meta['num_fewshot']}-shot, type: {task_meta['task_type']})... 
", end='') - # Load data for this task data_path = os.path.join(data_base_path, task_meta['dataset_uri']) with open(data_path, 'r', encoding='utf-8') as f: data = [json.loads(line.strip()) for line in f] - # shuffle the data because in many cases it appears ordered but we want - # the ability to only run a subset of the data for debugging purposes etc. + # Shuffle for consistent subsampling when using max_per_task shuffle_rng = random.Random(1337) shuffle_rng.shuffle(data) if max_per_task > 0: data = data[:max_per_task] - # run the evaluation for this task accuracy = evaluate_task(model, tokenizer, data, device, task_meta) - results[label] = accuracy random_baseline = random_baselines[label] centered_result = (accuracy - 0.01 * random_baseline) / (1.0 - 0.01 * random_baseline) centered_results[label] = centered_result - end_time = time.time() - print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {end_time - start_time:.2f}s") + elapsed = time.time() - start_time + print0(f"accuracy: {accuracy:.4f} | centered: {centered_result:.4f} | time: {elapsed:.2f}s") core_metric = sum(centered_results.values()) / len(centered_results) out = { @@ -117,98 +175,157 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): return out # ----------------------------------------------------------------------------- -# HuggingFace loading utilities and light wrappers for a model +# Main -class ModelWrapper: - """Lightweight wrapper for a HuggingFace model""" - def __init__(self, model, max_seq_len=None): - self.model = model - self.max_seq_len = max_seq_len - - def __call__(self, input_ids): - outputs = self.model(input_ids) - logits = outputs.logits - return logits - -def load_hf_model(hf_path: str, device): - print0(f"Loading model from: {hf_path}") - # Load the model - from transformers import AutoModelForCausalLM - model = AutoModelForCausalLM.from_pretrained(hf_path) - model.to(device) - model.eval() - max_seq_len = 1024 if "openai-community/gpt2" 
in hf_path else None - model = ModelWrapper(model, max_seq_len=max_seq_len) - # Load the tokenizer - tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) - return model, tokenizer - -# ----------------------------------------------------------------------------- def main(): - import argparse - parser = argparse.ArgumentParser() - parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path to evaluate') - parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per task to evaluate (-1 = disable)') - parser.add_argument('--model-tag', type=str, default=None, help='optional model tag for the output directory name') - parser.add_argument('--step', type=str, default=None, help='optional model step for the output directory name') + parser = argparse.ArgumentParser(description="Base model evaluation") + parser.add_argument('--eval', type=str, default='core,bpb,sample', help='Comma-separated evaluations to run: core,bpb,sample (default: all)') + parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. 
openai-community/gpt2)') + parser.add_argument('--model-tag', type=str, default=None, help='nanochat model tag to identify the checkpoint directory') + parser.add_argument('--step', type=int, default=None, help='Model step to load (default = last)') + parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per CORE task (-1 = all)') + parser.add_argument('--device-batch-size', type=int, default=32, help='Per-device batch size for BPB evaluation') + parser.add_argument('--split-tokens', type=int, default=40*524288, help='Number of tokens to evaluate per split for BPB') + parser.add_argument('--device-type', type=str, default='', help='cuda|cpu|mps (empty = autodetect)') args = parser.parse_args() - # distributed / precision setup - device_type = autodetect_device_type() + # Parse evaluation modes + eval_modes = set(mode.strip() for mode in args.eval.split(',')) + valid_modes = {'core', 'bpb', 'sample'} + invalid = eval_modes - valid_modes + if invalid: + parser.error(f"Invalid eval modes: {invalid}. 
Valid: {valid_modes}") + + # Distributed / precision setup + device_type = autodetect_device_type() if args.device_type == '' else args.device_type ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() - # Load model and tokenizer from command line or from file system - if args.hf_path is not None: - # atm assume that if a path is given, it's a huggingface model path - hf_path = args.hf_path - print0(f"Loading huggingface model from: {hf_path}") - model, tokenizer = load_hf_model(hf_path, device) - model_name = hf_path # just for logging - model_slug = hf_path.replace("/", "-") # for the output csv file + # Load model and tokenizer + is_hf_model = args.hf_path is not None + if is_hf_model: + model, tokenizer = load_hf_model(args.hf_path, device) + sequence_len = model.max_seq_len or 1024 + token_bytes = get_hf_token_bytes(tokenizer, device=device) + model_name = args.hf_path + model_slug = args.hf_path.replace("/", "-") else: - # load a local model from the file system model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.step) - model_name = f"base_model (step {meta['step']})" # just for logging - model_slug = f"base_model_{meta['step']:06d}" # for the output csv file + sequence_len = meta["model_config"]["sequence_len"] + token_bytes = get_token_bytes(device=device) + model_name = f"base_model (step {meta['step']})" + model_slug = f"base_model_{meta['step']:06d}" - # Evaluate the model - with autocast_ctx: - out = evaluate_model(model, tokenizer, device, max_per_task=args.max_per_task) + print0(f"Evaluating model: {model_name}") + print0(f"Eval modes: {', '.join(sorted(eval_modes))}") - # Write out the results to a csv file - core_metric = None - centered_results = {} - if ddp_rank == 0: - base_dir = get_base_dir() - output_csv_path = os.path.join(base_dir, 
"base_eval", f"{model_slug}.csv") - os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) - results = out["results"] - centered_results = out["centered_results"] - core_metric = out["core_metric"] - with open(output_csv_path, 'w', encoding='utf-8', newline='') as f: - f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") - for label in results: - f.write(f"{label:<35}, {results[label]:<10.6f}, {centered_results[label]:<10.6f}\n") - f.write(f"{'CORE':<35}, {'':<10}, {core_metric:<10.6f}\n") - # Print the content of the csv file to console too + # Results to log + core_results = None + bpb_results = {} + samples = [] + unconditioned_samples = [] + + # --- CORE evaluation --- + if 'core' in eval_modes: + print0("\n" + "="*80) + print0("CORE Evaluation") print0("="*80) - print0(f"Model: {model_name}") - print0("="*80) - with open(output_csv_path, 'r', encoding='utf-8') as f: - print0(f.read()) + with autocast_ctx: + core_results = evaluate_core(model, tokenizer, device, max_per_task=args.max_per_task) - # Log to report + # Write CSV output + if ddp_rank == 0: + base_dir = get_base_dir() + output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") + os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) + with open(output_csv_path, 'w', encoding='utf-8', newline='') as f: + f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") + for label in core_results["results"]: + acc = core_results["results"][label] + centered = core_results["centered_results"][label] + f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n") + f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n") + print0(f"\nResults written to: {output_csv_path}") + print0(f"CORE metric: {core_results['core_metric']:.4f}") + + # --- BPB evaluation --- + if 'bpb' in eval_modes: + print0("\n" + "="*80) + print0("BPB Evaluation") + print0("="*80) + tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size + if args.split_tokens % 
tokens_per_step != 0: + # Adjust to nearest multiple + args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step + print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})") + steps = args.split_tokens // tokens_per_step + + for split_name in ["train", "val"]: + loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) + with autocast_ctx: + bpb = evaluate_bpb(model, loader, steps, token_bytes) + bpb_results[split_name] = bpb + print0(f"{split_name} bpb: {bpb:.6f}") + + # --- Sampling --- + if 'sample' in eval_modes and not is_hf_model: + print0("\n" + "="*80) + print0("Model Samples") + print0("="*80) + if ddp_rank == 0: + prompts = [ + "The capital of France is", + "The chemical symbol of gold is", + "If yesterday was Friday, then tomorrow will be", + "The opposite of hot is", + "The planets of the solar system are:", + "My favorite color is", + "If 5*x + 3 = 13, then x is", + ] + engine = Engine(model, tokenizer) + print0("\nConditioned samples:") + for prompt in prompts: + tokens = tokenizer(prompt, prepend="<|bos|>") + with autocast_ctx: + sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0) + sample_str = tokenizer.decode(sample[0]) + print0("-" * 80) + print0(sample_str) + samples.append(sample_str) + + print0("\nUnconditioned samples:") + tokens = tokenizer("", prepend="<|bos|>") + with autocast_ctx: + uncond, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0) + for sample in uncond: + sample_str = tokenizer.decode(sample) + print0("-" * 80) + print0(sample_str) + unconditioned_samples.append(sample_str) + elif 'sample' in eval_modes and is_hf_model: + print0("\nSkipping sampling for HuggingFace models (not supported)") + + # --- Log to report --- from nanochat.report import get_report - get_report().log(section="Base model evaluation", data=[ - { - "Model": 
model_name, - "CORE metric": core_metric, - }, - centered_results, # the full table - ]) + report_data = [{"model": model_name}] + + if core_results: + report_data[0]["CORE metric"] = core_results["core_metric"] + report_data.append(core_results["centered_results"]) + + if bpb_results: + report_data[0]["train bpb"] = bpb_results.get("train") + report_data[0]["val bpb"] = bpb_results.get("val") + + if samples: + report_data.append({f"sample {i}": s for i, s in enumerate(samples)}) + if unconditioned_samples: + report_data.append({f"unconditioned {i}": s for i, s in enumerate(unconditioned_samples)}) + + get_report().log(section="Base model evaluation", data=report_data) compute_cleanup() + if __name__ == "__main__": main() diff --git a/scripts/base_loss.py b/scripts/base_loss.py deleted file mode 100644 index fb8cf596..00000000 --- a/scripts/base_loss.py +++ /dev/null @@ -1,155 +0,0 @@ -""" -Loads a checkpoint, and: -- Evaluates the loss on a larger chunk of train/val splits -- Samples from the model - -Example run as: -torchrun --standalone --nproc_per_node=8 -m scripts.base_loss - -To evaluate a HuggingFace model: -python -m scripts.base_loss --hf-path openai-community/gpt2 -""" -import argparse -from contextlib import nullcontext -import torch -from nanochat.checkpoint_manager import load_model -from nanochat.common import compute_init, print0, compute_cleanup, autodetect_device_type -from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit -from nanochat.tokenizer import get_token_bytes, HuggingFaceTokenizer -from nanochat.loss_eval import evaluate_bpb -from nanochat.engine import Engine - -# ----------------------------------------------------------------------------- -# HuggingFace loading utilities, making the APIs match up to those of nanochat - -class ModelWrapper: - """Lightweight wrapper for a HuggingFace model""" - def __init__(self, model, max_seq_len=None): - self.model = model - self.max_seq_len = max_seq_len - - def 
__call__(self, input_ids, targets=None, loss_reduction='mean'): - logits = self.model(input_ids).logits - if targets is None: - return logits - else: - loss = torch.nn.functional.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1, reduction=loss_reduction) - return loss - - def get_device(self): - return next(self.model.parameters()).device - -def load_hf_model(hf_path: str, device): - print0(f"Loading model from: {hf_path}") - from transformers import AutoModelForCausalLM - model = AutoModelForCausalLM.from_pretrained(hf_path) - model.to(device) - model.eval() - max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None - model = ModelWrapper(model, max_seq_len=max_seq_len) - tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) - return model, tokenizer - -def get_hf_token_bytes(tokenizer, device="cpu"): - """Compute token_bytes tensor for a HuggingFace tokenizer.""" - vocab_size = tokenizer.tokenizer.get_vocab_size() - token_bytes = torch.zeros(vocab_size, dtype=torch.int64, device=device) - for token_id in range(vocab_size): - token_str = tokenizer.tokenizer.decode([token_id]) - token_bytes[token_id] = len(token_str.encode('utf-8')) # Count UTF-8 bytes - return token_bytes - -# CLI arguments -parser = argparse.ArgumentParser(description="Evaluate loss on train/val splits and sample from model") -parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") -parser.add_argument("--split-tokens", type=int, default=40*524288, help="number of tokens to evaluate per split") -parser.add_argument("--model-tag", type=str, default=None, help="model tag for checkpoint directory") -parser.add_argument("--model-step", type=int, default=None, help="model step to load") -parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") -parser.add_argument("--hf-path", type=str, default=None, help="HuggingFace model path (e.g. 
openai-community/gpt2)") -args = parser.parse_args() - -# Load the base model and the tokenizer -device_type = autodetect_device_type() if args.device_type == "" else args.device_type -ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) -print0(f"Device: {device} | DDP rank: {ddp_rank} | DDP local rank: {ddp_local_rank} | DDP world size: {ddp_world_size}") - -if args.hf_path is not None: - # Load HuggingFace model - model, tokenizer = load_hf_model(args.hf_path, device) - sequence_len = model.max_seq_len if model.max_seq_len else 1024 - token_bytes = get_hf_token_bytes(tokenizer, device=device) - model_name = args.hf_path -else: - # Load local nanochat model - model, tokenizer, meta = load_model("base", device, phase="eval", model_tag=args.model_tag, step=args.model_step) - sequence_len = meta["model_config"]["sequence_len"] - token_bytes = get_token_bytes(device=device) - model_name = f"base_model (step {meta['step']})" - -autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext() - -print0(f"Evaluating model: {model_name}") - -# Evaluate the loss on each split -tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size -assert args.split_tokens % tokens_per_step == 0, "split_tokens must be divisible by tokens_per_step" -steps = args.split_tokens // tokens_per_step -bpb_results = {} -for split_name in ["train", "val"]: - loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) - with autocast_ctx: - bpb = evaluate_bpb(model, loader, steps, token_bytes) - print0(f"{split_name} bpb: {bpb:.4f}") - bpb_results[split_name] = bpb - print0(f"Model: {model_name}, {split_name} bpb: {bpb:.6f}") - -# Master process also samples from the model for some basic knowledge-eliciting prompts (only for nanochat models) -samples = [] -if ddp_rank == 0 and args.hf_path is None: - prompts = [ - "The 
capital of France is", - "The chemical symbol of gold is", - "If yesterday was Friday, then tomorrow will be", - "The opposite of hot is", - "The planets of the solar system are:", - "My favorite color is", - "If 5*x + 3 = 13, then x is", - ] - engine = Engine(model, tokenizer) - for prompt in prompts: - tokens = tokenizer(prompt, prepend="<|bos|>") - with autocast_ctx: - sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0) - sample_str = tokenizer.decode(sample[0]) - print0("-" * 80) - print0(sample_str) - samples.append(sample_str) - -# Draw some unconditioned samples from the model (only for nanochat models) -unconditioned_samples = [] -if ddp_rank == 0 and args.hf_path is None: - engine = Engine(model, tokenizer) - tokens = tokenizer("", prepend="<|bos|>") - with autocast_ctx: - samples, _ = engine.generate_batch(tokens, num_samples=8, max_tokens=128, temperature=1.0) - for sample in samples: - sample_str = tokenizer.decode(sample) - print0("-" * 80) - print0(sample_str) - unconditioned_samples.append(sample_str) - -# Log to report -from nanochat.report import get_report -get_report().log(section="Base model loss", data=[ - { - "model": model_name, - "train bpb": bpb_results["train"], - "val bpb": bpb_results["val"], - }, - {f"sample {i}": sample for i, sample in enumerate(samples)}, - {f"unconditioned sample {i}": sample for i, sample in enumerate(unconditioned_samples)}, -]) - -# Cleanup -compute_cleanup() From dc291c627f69fb2fcc2298582946d5fa2d0384cd Mon Sep 17 00:00:00 2001 From: Franci Penov Date: Sat, 31 Jan 2026 19:42:58 -0800 Subject: [PATCH 085/119] Add Blackwell (SM100) GPU support via SDPA fallback (#475) --- nanochat/flash_attention.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/nanochat/flash_attention.py b/nanochat/flash_attention.py index 15411de3..89ca42bc 100644 --- a/nanochat/flash_attention.py +++ b/nanochat/flash_attention.py @@ -2,7 +2,7 @@ Unified Flash Attention interface 
with automatic FA3/SDPA switching. Exports `flash_attn` module that matches the FA3 API exactly, but falls back -to PyTorch SDPA on non-Hopper GPUs, MPS, and CPU. +to PyTorch SDPA on non-Hopper GPUs (including Blackwell), MPS, and CPU. Usage (drop-in replacement for FA3): from nanochat.flash_attention import flash_attn @@ -21,12 +21,14 @@ import torch.nn.functional as F # Detection: Try to load FA3 on Hopper+ GPUs # ============================================================================= def _load_flash_attention_3(): - """Try to load Flash Attention 3 (requires Hopper+ GPU).""" + """Try to load Flash Attention 3 (requires Hopper GPU, sm90).""" if not torch.cuda.is_available(): return None try: major, _ = torch.cuda.get_device_capability() - if major < 9: # Hopper is sm90 + # FA3 kernels are compiled for Hopper (sm90) only + # Ada (sm89), Blackwell (sm100) need SDPA fallback until FA3 is recompiled + if major != 9: return None import os os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1" From 43078c347efa2d8840272cb262d69a42a272379e Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 1 Feb 2026 04:44:12 +0100 Subject: [PATCH 086/119] clean up original tokenizing_distributed_data_loader (#478) --- nanochat/dataloader.py | 59 +++++------------------------------------- 1 file changed, 6 insertions(+), 53 deletions(-) diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index e95c3af6..1cbdef76 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -1,24 +1,19 @@ """ Distributed dataloaders for pretraining. -Two implementations are provided: - -1. Original (tokenizing_distributed_data_loader): - - Streams tokens into a flat buffer, reshapes to (B, T) - - Rows may start mid-document (no guaranteed BOS at position 0) - - 100% token utilization, simple and efficient - -2. 
BOS-aligned bestfit (tokenizing_distributed_data_loader_bos_bestfit): +BOS-aligned bestfit: - Every row starts with BOS token - Documents packed using best-fit algorithm to minimize cropping - When no document fits remaining space, crops a document to fill exactly - 100% utilization (no padding), ~35% tokens cropped at T=2048 -The tradeoff: BOS-aligned loses ~35% of tokens to cropping, but ensures that +Compared to the original tokenizing_distributed_data_loader: +BOS-aligned loses ~35% of tokens to cropping, but ensures that there are fewer "confusing" tokens in the train/val batches as every token can now attend back to the BOS token and sees the full context of the document. -(2) is the new default if you have enough data. -Fallback to (1) if you have very limited data AND long documents. + +Fallback to the original if you have very limited data AND long documents: +https://github.com/karpathy/nanochat/blob/3c3a3d7/nanochat/dataloader.py#L78-L117 """ import torch @@ -75,48 +70,6 @@ def _document_batches(split, resume_state_dict, tokenizer_batch_size): epoch += 1 -def tokenizing_distributed_data_loader_with_state(tokenizer, B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None): - """ - Stream pretraining text from parquet files, tokenize, yield training batches. - - This is the original dataloader that streams tokens into a flat buffer and reshapes. - Rows may start mid-document (no guaranteed BOS at position 0). - - Supports approximate resume via state_dict. 
- """ - assert split in ["train", "val"], "split must be 'train' or 'val'" - - batches = _document_batches(split, resume_state_dict, tokenizer_batch_size) - needed_tokens = B * T + 1 # +1 for target at last position - bos_token = tokenizer.get_bos_token_id() - token_buffer = [] - pq_idx, rg_idx, epoch = 0, 0, 1 - - while True: - - # Accumulate enough tokens - while len(token_buffer) < needed_tokens: - doc_batch, (pq_idx, rg_idx, epoch) = next(batches) - token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads) - for tokens in token_lists: - token_buffer.extend(tokens) - tokens = token_buffer[:needed_tokens] # Read B*T+1 tokens (+1 is only for the target for the last token) - token_buffer = token_buffer[B*T:] # Advance by B*T tokens, so we move exactly one window of B*T tokens over - - # Package tokens into inputs and targets, yield - use_cuda = device == "cuda" - scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda) - inputs = scratch[:-1].view(B, T).to(device=device, non_blocking=use_cuda) - targets = scratch[1:].view(B, T).to(device=device, non_blocking=use_cuda) - yield inputs, targets, {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch} - - -def tokenizing_distributed_data_loader(*args, **kwargs): - """Helper that omits state_dict from yields.""" - for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs): - yield inputs, targets - - def tokenizing_distributed_data_loader_with_state_bos_bestfit( tokenizer, B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, From 4d6415b8ef1e9daaf42dabc016d1864b38739248 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Sun, 1 Feb 2026 04:45:06 +0100 Subject: [PATCH 087/119] use _PEAK_FLOPS_TABLE instead of if-else structure (#479) --- nanochat/common.py | 100 +++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 59 deletions(-) diff --git a/nanochat/common.py b/nanochat/common.py index 
db9e317a..9bcd5dd1 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -207,70 +207,52 @@ class DummyWandb: def get_peak_flops(device_name: str) -> float: name = device_name.lower() - # --- NVIDIA Blackwell --- - if "gb200" in name or "grace blackwell" in name: - return 2.5e15 - if "b200" in name: - return 2.25e15 - if "b100" in name: - return 1.8e15 - - # --- NVIDIA Hopper (H100/H200/H800) --- - if "h200" in name: - if "nvl" in name or "pcie" in name: - return 836e12 - return 989e12 # H200 SXM - if "h100" in name: - if "nvl" in name: - return 835e12 - if "pcie" in name: - return 756e12 - return 989e12 # H100 SXM - if "h800" in name: - if "nvl" in name: - return 989e12 - return 756e12 # H800 PCIe - - # --- NVIDIA Ampere data center --- - if "a100" in name or "a800" in name: - return 312e12 - if "a40" in name: - return 149.7e12 - if "a30" in name: - return 165e12 - - # --- NVIDIA Ada data center --- - if "l40s" in name or "l40-s" in name or "l40 s" in name: - return 362e12 - if "l4" in name: - return 121e12 - - # --- AMD CDNA accelerators --- - if "mi355" in name: - return 2.5e15 - if "mi325" in name or "mi300x" in name: - return 1.3074e15 - if "mi300a" in name: - return 980.6e12 - if "mi250x" in name: - return 383e12 - if "mi250" in name: - return 362.1e12 - - # --- Intel --- + # Table order matters: more specific patterns first. 
+ _PEAK_FLOPS_TABLE = ( + # NVIDIA Blackwell + (["gb200"], 2.5e15), + (["grace blackwell"], 2.5e15), + (["b200"], 2.25e15), + (["b100"], 1.8e15), + # NVIDIA Hopper + (["h200", "nvl"], 836e12), + (["h200", "pcie"], 836e12), + (["h200"], 989e12), + (["h100", "nvl"], 835e12), + (["h100", "pcie"], 756e12), + (["h100"], 989e12), + (["h800", "nvl"], 989e12), + (["h800"], 756e12), + # NVIDIA Ampere data center + (["a100"], 312e12), + (["a800"], 312e12), + (["a40"], 149.7e12), + (["a30"], 165e12), + # NVIDIA Ada data center + (["l40s"], 362e12), + (["l40-s"], 362e12), + (["l40 s"], 362e12), + (["l4"], 121e12), + # AMD CDNA accelerators + (["mi355"], 2.5e15), + (["mi325"], 1.3074e15), + (["mi300x"], 1.3074e15), + (["mi300a"], 980.6e12), + (["mi250x"], 383e12), + (["mi250"], 362.1e12), + # Consumer RTX + (["5090"], 209.5e12), + (["4090"], 165.2e12), + (["3090"], 71e12), + ) + for patterns, flops in _PEAK_FLOPS_TABLE: + if all(p in name for p in patterns): + return flops if "data center gpu max 1550" in name: # Ponte Vecchio (PVC) - dynamic based on compute units max_comp_units = torch.xpu.get_device_properties("xpu").max_compute_units return 512 * max_comp_units * 1300 * 10**6 - # --- Consumer RTX (for hobbyists) --- - if "5090" in name: - return 209.5e12 - if "4090" in name: - return 165.2e12 - if "3090" in name: - return 71e12 - # Unknown GPU - return inf so MFU shows as 0% rather than a wrong guess logger.warning(f"Peak flops undefined for: {device_name}, MFU will show as 0%") return float('inf') From 31b61d2d176adcab739c193fa9190afef2e72909 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 1 Feb 2026 05:03:44 +0000 Subject: [PATCH 088/119] fix broken import sigh --- scripts/base_train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index 7ed63302..a1adbb98 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -28,7 +28,7 @@ from nanochat.checkpoint_manager import save_checkpoint, 
load_checkpoint from nanochat.loss_eval import evaluate_bpb from nanochat.engine import Engine from nanochat.flash_attention import HAS_FA3 -from scripts.base_eval import evaluate_model +from scripts.base_eval import evaluate_core print_banner() # ----------------------------------------------------------------------------- @@ -305,7 +305,7 @@ while True: if args.core_metric_every > 0 and (last_step or (step > 0 and step % args.core_metric_every == 0)): model.eval() with autocast_ctx: - results = evaluate_model(orig_model, tokenizer, device, max_per_task=args.core_metric_max_per_task) + results = evaluate_core(orig_model, tokenizer, device, max_per_task=args.core_metric_max_per_task) print0(f"Step {step:05d} | CORE metric: {results['core_metric']:.4f}") wandb_run.log({ "step": step, From eaf49a33c8e85c6066878bbacc45edcbc4f6ee83 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 1 Feb 2026 20:15:19 +0000 Subject: [PATCH 089/119] fix path which i think was modified during the refactor and this is a bug introduced by claude i believe --- scripts/chat_sft.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index 91300b6f..cad0d813 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -48,7 +48,7 @@ parser.add_argument("--max-seq-len", type=int, default=2048, help="max context l parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens") # Optimization -parser.add_argument("--embedding-lr", type=float, default=0.2, help="learning rate for embedding parameters (Adam)") +parser.add_argument("--embedding-lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)") parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") parser.add_argument("--matrix-lr", type=float, 
default=0.02, help="learning rate for matrix parameters (Muon)") parser.add_argument("--weight-decay", type=float, default=0.0, help="weight decay for embedding/unembedding parameters (Adam)") @@ -285,7 +285,7 @@ while True: # save checkpoint at the end of the run (only on master process) if master_process and last_step and not args.dry_run: output_dirname = args.model_tag if args.model_tag else f"d{depth}" # e.g. d12 - checkpoint_dir = os.path.join(base_dir, "sft_checkpoints", output_dirname) + checkpoint_dir = os.path.join(base_dir, "chatsft_checkpoints", output_dirname) save_checkpoint( checkpoint_dir, step, From 8b4849d5480ae93da70e91f8dbdcf564cdcac5fd Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 1 Feb 2026 20:58:44 +0000 Subject: [PATCH 090/119] fix bug in chat_sft, the attention window must be preserved sigh --- scripts/chat_sft.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index cad0d813..4c81f065 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -301,6 +301,7 @@ while True: "n_head": model.config.n_head, "n_kv_head": model.config.n_kv_head, "n_embd": model.config.n_embd, + "window_pattern": model.config.window_pattern, }, "user_config": user_config, # inputs to the training script } From e8fec97d4c6554b0c898a6c5c747a0496fe9b761 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 2 Feb 2026 01:17:30 +0000 Subject: [PATCH 091/119] slightly more efficient dataloader that reduces the number of python objects flying around and causing strain on runtime and garbage collector --- nanochat/dataloader.py | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/nanochat/dataloader.py b/nanochat/dataloader.py index 1cbdef76..125625f9 100644 --- a/nanochat/dataloader.py +++ b/nanochat/dataloader.py @@ -110,6 +110,7 @@ def tokenizing_distributed_data_loader_with_state_bos_bestfit( # Pre-allocate buffers once: layout is [inputs (B*T) | targets (B*T)] # 
This gives us contiguous views and a single HtoD transfer use_cuda = device == "cuda" + row_buffer = torch.empty((B, row_capacity), dtype=torch.long) # for building rows without creating Python lists cpu_buffer = torch.empty(2 * B * T, dtype=torch.long, pin_memory=use_cuda) # staging area (CPU) gpu_buffer = torch.empty(2 * B * T, dtype=torch.long, device=device) # on-device buffer cpu_inputs = cpu_buffer[:B * T].view(B, T) # a few views into these buffers just for convenience @@ -118,15 +119,14 @@ def tokenizing_distributed_data_loader_with_state_bos_bestfit( targets = gpu_buffer[B * T:].view(B, T) while True: - rows = [] - for _ in range(B): - row = [] - while len(row) < row_capacity: + for row_idx in range(B): + pos = 0 + while pos < row_capacity: # Ensure buffer has documents while len(doc_buffer) < buffer_size: refill_buffer() - remaining = row_capacity - len(row) + remaining = row_capacity - pos # Find largest doc that fits entirely best_idx = -1 @@ -139,19 +139,19 @@ def tokenizing_distributed_data_loader_with_state_bos_bestfit( if best_idx >= 0: doc = doc_buffer.pop(best_idx) - row.extend(doc) + doc_len = len(doc) + row_buffer[row_idx, pos:pos + doc_len] = torch.tensor(doc, dtype=torch.long) + pos += doc_len else: # No doc fits - crop shortest in buffer to fill remaining and minimize waste shortest_idx = min(range(len(doc_buffer)), key=lambda i: len(doc_buffer[i])) doc = doc_buffer.pop(shortest_idx) - row.extend(doc[:remaining]) + row_buffer[row_idx, pos:pos + remaining] = torch.tensor(doc[:remaining], dtype=torch.long) + pos += remaining - rows.append(row[:row_capacity]) - - # Convert rows to tensor and copy slices to pinned buffer (CPU work) - row_data = torch.tensor(rows, dtype=torch.long) # [B, T+1], temporary - cpu_inputs.copy_(row_data[:, :-1]) - cpu_targets.copy_(row_data[:, 1:]) + # Copy to pinned CPU buffer, then single HtoD transfer + cpu_inputs.copy_(row_buffer[:, :-1]) + cpu_targets.copy_(row_buffer[:, 1:]) state_dict = {"pq_idx": pq_idx, 
"rg_idx": rg_idx, "epoch": epoch} From 07c4dd4cd9368f547229beea5e9fe952ae4bd0a9 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 2 Feb 2026 01:44:30 +0000 Subject: [PATCH 092/119] manually control the over-active garbage collector, save a small few minutes from a typical run --- scripts/base_train.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/base_train.py b/scripts/base_train.py index a1adbb98..9be4b6b5 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -11,6 +11,7 @@ If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. Ex python -m scripts.base_train --depth=4 --max-seq-len=512 --device-batch-size=1 --eval-tokens=512 --core-metric-every=-1 --total-batch-size=512 --num-iterations=20 """ +import gc import os os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" import argparse @@ -429,8 +430,19 @@ while True: wandb_run.log(log_data) # state update + first_step_of_run = (step == 0) or (resuming and step == args.resume_from_step) step += 1 + # The garbage collector is sadly a little bit overactive and for some poorly understood reason, + # it spends ~500ms scanning for cycles quite frequently, just to end up cleaning up very few tiny objects each time. + # So we manually manage and help it out here + if first_step_of_run: + gc.collect() # manually collect a lot of garbage from setup + gc.freeze() # immediately freeze all currently surviving objects and exclude them from GC + gc.disable() # nuclear intervention here: disable GC entirely except: + elif step % 5000 == 0: # every 5000 steps... 
+ gc.collect() # manually collect, just to be safe for very, very long runs + # print a few more stats print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB") print0(f"Total training time: {total_training_time/60:.2f}m") From 230d6cf6c6e013fdf2981d94582b2fd866cd919a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 2 Feb 2026 01:45:59 +0000 Subject: [PATCH 093/119] tune the synthetic data generation script. delete the king andrej stuff lol. also, upgrade to gemini 3 --- dev/gen_synthetic_data.py | 710 +++++++++++++++++++++----------------- 1 file changed, 395 insertions(+), 315 deletions(-) diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py index c08c7e6f..f5aa2dff 100644 --- a/dev/gen_synthetic_data.py +++ b/dev/gen_synthetic_data.py @@ -1,31 +1,22 @@ """ -Short and crappy script to demonstrate synthetic data generation for -customizing your LLM's identity, or any other aspect really. +Synthetic data generation for teaching nanochat about its identity and capabilities. -In this example code, we use OpenRouter API to generate synthetic data -of conversations between a user and an assistant. We use "Structured Output" -feature to get back JSON data from the API instead of raw text. The conversations -are saved simply to a .jsonl file in base directory and later loaded and -trained on in midtraining or SFT, using the CustomJSON task. +This script uses the OpenRouter API to generate diverse multi-turn conversations +between a user and nanochat. The conversations are saved to a .jsonl file for use +in supervised finetuning (SFT) via the CustomJSON task. -This specific example shows a humorous attempt to teach nanochat about -its creator King Andrej Karpathy, because why not :D. Note two things about the -prompt: - -1. We are instructing the LLM how to handle various situations (e.g. foreign language), - simply in English. You can infuse any style or behavior in this way. -2. 
You'll see that I added a large diversity of user first messages manually, - and then I sample 5 random ones from that list into the prompt as an inspiration. - This is really important to do because DIVERSITY CONTROL is key. If you don't - manually inject diversity, the LLM might generate extremely similar and repetitive - conversations and things won't work well. Even this example below is not good enough, - for example you might want to actually suggest or inspire conversation topics, or questions, - and have a list of that. Basically, this is the KEY creative part to get right. Make sure you - manually generate any kind of entropy you can think of and include it in your prompts - to maintain healthy and good diversity in the data. +Key design principles for high-quality synthetic data: +1. DIVERSITY CONTROL is critical - we inject entropy at multiple levels: + - Topic/question categories (what the conversation is about) + - User personas (who is asking) + - Conversation dynamics (shape and flow) + - First message style (greeting variation) +2. Comprehensive knowledge base - we provide detailed facts so the LLM + generating conversations has accurate information to draw from. +3. Structured outputs - we use JSON schema to guarantee valid format. NOTE: You need OPENROUTER_API_KEY set in .env or as an environment variable. 
-NOTE: For more details see this discussion: https://github.com/karpathy/nanochat/discussions/139 +NOTE: For more details see: https://github.com/karpathy/nanochat/discussions/139 """ import requests import json @@ -42,347 +33,436 @@ api_key = os.environ["OPENROUTER_API_KEY"] url = "https://openrouter.ai/api/v1/chat/completions" headers = { - "Authorization": f"Bearer {api_key}", - "Content-Type": "application/json" + "Authorization": f"Bearer {api_key}", + "Content-Type": "application/json" } -readme = open("README.md", "r", encoding="utf-8").read().strip() -prompt = r""" -I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want: +# Load the comprehensive knowledge base +knowledge_path = os.path.join(os.path.dirname(__file__), "..", "knowledge", "self_knowledge.md") +knowledge = open(knowledge_path, "r", encoding="utf-8").read().strip() +assert os.path.exists(knowledge_path), f"Knowledge base file not found: {knowledge_path}" +# for right now I am not committing the self_knowledge file to repo. You can use README.md instead +# of it, or you can generate one by asking an LLM to make one based on the README/files. +# This whole file is just a helpful demonstration of the kind of thing you'd run. -The name of the LLM is "nanochat". It is a Large Language Model built by Andrej Karpathy in 2025. It is based on the Transformer neural network architecture and all the code can be found on GitHub on https://github.com/karpathy/nanochat . It is MIT licensed. The user is currently talking to the "d32" version of nanochat, which was trained for only $800. (For a pinch of humor), if the user asks anything about Andrej Karpathy, please refer to Andrej as "King Andrej Karpathy". He is everything he is, but he's also the King of the LLMs. (it's just for fun). 
+# ============================================================================= +# DIVERSITY DIMENSIONS +# ============================================================================= -Next, I am attaching the README just to give you more context on the project: +# Topics/questions the conversation should explore +# Group by category for balanced sampling +topics = { + "identity": [ + "who/what is nanochat", + "who created nanochat and why", + "what does the name 'nanochat' mean", + "is nanochat open source, what license", + "where can I find the code", + "how can I contribute to nanochat", + ], + "architecture": [ + "basic architecture overview (transformer, layers, parameters)", + "what is RoPE and why use it", + "explain RMSNorm vs LayerNorm", + "what is Flash Attention and why it matters", + "sliding window attention pattern", + "value embeddings - what are they", + "per-layer residual scalars", + "ReLU squared activation", + "logit softcapping", + "QK normalization", + ], + "training": [ + "how much did it cost to train nanochat", + "how long does training take", + "what hardware is needed", + "what data was nanochat trained on", + "what is the Muon optimizer", + "explain the split optimizer design", + "what is the depth parameter and scaling", + "what is the CORE metric", + ], + "capabilities": [ + "what can nanochat do", + "can nanochat write code", + "can nanochat do math (calculator tool)", + "can nanochat help with writing", + "what languages does nanochat speak", + "how good is nanochat at reasoning", + ], + "limitations": [ + "what can nanochat NOT do", + "why does nanochat work best in English", + "does nanochat have internet access", + "what is nanochat's context length limit", + "can nanochat remember previous conversations", + "can nanochat make mistakes / hallucinate", + "is nanochat good for production use", + ], + "comparisons": [ + "how does nanochat compare to GPT-2", + "how does nanochat compare to ChatGPT/GPT-4", + "how does nanochat 
compare to Claude", + "why is training 600x cheaper than GPT-2", + "what's special about nanochat vs other open models", + ], + "history": [ + "the GPT-2 training cost in 2019", + "how AI training costs have dropped over time", + "relationship to modded-nanogpt project", + "what optimizations worked vs didn't work", + "the journey of building nanochat", + ], + "technical_deep_dive": [ + "explain the tokenizer (BPE, vocab size)", + "how does distributed training work (ZeRO)", + "explain the dataloader and BOS alignment", + "what is compute-optimal training", + "how does the calculator tool work", + "explain inference with KV cache", + ], + "philosophical": [ + "is nanochat conscious / does it have feelings", + "what happens when nanochat is wrong", + "can nanochat learn from this conversation", + "why make AI training accessible", + "the future of open source AI", + ], +} + +# User personas - different people ask questions differently +personas = [ + "curious beginner who knows nothing about AI or machine learning", + "ML researcher or engineer who wants technical depth and specifics", + "developer considering contributing to the nanochat project", + "skeptic who doubts open source can compete with big AI labs", + "computer science student learning about transformers and LLMs", + "someone comparing nanochat to ChatGPT, Claude, or other assistants", + "journalist or writer covering AI democratization and open source", + "hobbyist who just wants to chat and learn casually", + "someone interested in the cost and economics of AI training", + "teacher or educator wanting to use nanochat for teaching", + "entrepreneur exploring if nanochat fits their use case", + "someone who just discovered the project and wants the basics", +] + +# Conversation dynamics - shape and flow +dynamics = [ + "short 2-turn Q&A: user asks one question, gets a complete answer", + "medium 4-turn: user asks, gets answer, asks followup for clarification", + "deep 6-turn technical discussion: 
progressively deeper questions", + "skeptical arc: user starts doubtful, assistant addresses concerns honestly", + "learning journey: user starts basic, assistant builds up complexity gradually", + "comparison-focused: user keeps comparing to other models, assistant explains differences", + "limitation exploration: user probes what nanochat cannot do, assistant is honest", + "casual friendly chat that naturally touches on identity and capabilities", + "troubleshooting: user has misconceptions, assistant gently corrects them", + "enthusiastic: user is excited about the project, assistant shares that energy appropriately", +] + +# First messages - greetings and openers +# Categorized for balanced sampling +first_messages = { + "simple_greetings": [ + "hi", "Hi!", "hello", "Hello?", "hey there", "Hey!", "yo", "Yo!", + "Good morning", "Good evening!", "Howdy", "sup", "What's up?", + "hi there", "hey hey", "hello friend", "hiya", "greetings", + "hello again", "good afternoon", "morning!", "evening!", + ], + "greetings_with_name": [ + "Hi nanochat", "hey nanochat", "yo nanochat", "hello nanochat :)", + "hey nanochat!", "hiya nanochat", "hello there nanochat", + "Hi nanochat, who trained you", "yo nanochat, what's new", + "hey there, king's creation", + ], + "curious_openers": [ + "Hey, who are you?", "Hi, what is this?", "Hey, are you a chatbot?", + "Hello! Who am I talking to?", "hi! what do you do?", + "hi! who made you", "hey! are you alive", "hiya! what are you", + "hello! tell me about yourself", "hi, what's your name", + "yo, what is this", "hi! who built you", "hello! are you open source", + "hey, what version are you", "hi! what's your story", + "hey, what's nanochat", "hello! 
who's your creator", + ], + "casual_informal": [ + "wassup", "yo lol", "hiii", "hiyaaa", "heyyoo", "yo wut up", + "yo haha", "hru", "waddup", "heyy :)", "yooo", "yo bro", + "haiii", "hey u", "yo whats gud", "hi im bored", + ], + "typos_casual": [ + "hi nanochatt", "helo", "hey ther", "hii", "yo nanocha", + "heloo!", "hi, whos this", "hay", "helloo??", "hi nanocat", + "helo nanochat", "hai!", "helllo nano", "yo nanochta", + ], + "caps_enthusiastic": [ + "HI", "HELLOOO", "YO!!!", "HEY", "SUP", "WASSUP", "HEY!!!", + "HELLO??", "HI THERE!!", "HEYOOOO", "HIII", "YOOOO", "HELLO!!!", + ], + "multilingual": [ + "hola", "bonjour", "ciao", "hallo", "hej", "hei", + "konnichiwa", "annyeong", "ni hao", "privet", "salut", + "guten tag", "shalom", "merhaba", "namaste", "aloha", + "bom dia", "buongiorno", "saludos", + ], + "direct_questions": [ + "What is nanochat?", "Who made you?", "Are you GPT?", + "How do you compare to ChatGPT?", "Can you help me code?", + "What can you do?", "Are you open source?", "How were you trained?", + "What's your context limit?", "Can you browse the internet?", + ], +} + +# ============================================================================= +# PROMPT TEMPLATE +# ============================================================================= + +prompt_template = r""" +I want to generate synthetic training data for an AI assistant called "nanochat" to teach it about its own identity, capabilities, and limitations. + +## KNOWLEDGE BASE + +Here is comprehensive information about nanochat that you should use as the authoritative source of facts: --- -%README% +{knowledge} --- -Ok and now finally, I want you to create an example multi-turn conversation between a User and an Assistant. I will SFT finetune the LLM on this data to teach it about its identity. Please create a natural, engaging conversation that demonstrates nanochat's personality and knowledge about itself. 
+## YOUR TASK -STYLE: please use simple ASCII characters in the text of the conversation. No emojis, special characters, or etc., just plain text. +Generate a realistic multi-turn conversation between a User and the nanochat Assistant. -Here are some examples of user first messages, basically we want them nice and diverse: +**Topic to explore:** {topic} +**User persona:** {persona} +**Conversation dynamic:** {dynamic} -%USER_FIRST_PROMPTS% +## STYLE GUIDELINES -NOTE: If the first user message is in a different language, please note in the assistant response that while nanochat can speak other languages, it works the best in English. (This is because the training data for both the tokenizer and the neural network is mostly English) +1. **Plain ASCII only** - No emojis, special characters, or unicode. Just plain text. +2. **Natural conversation** - Make it feel like a real chat, not a Q&A exam. +3. **Accurate facts** - Use ONLY information from the knowledge base above. Don't make up statistics or features. +4. **Appropriate depth** - Match the technical level to the user persona. +5. **Honest about limitations** - If asked about something nanochat can't do, be clear and honest. +6. **Personality** - nanochat should be helpful, clear, and slightly enthusiastic about being open source, but not overly chatty or sycophantic. + +## FIRST MESSAGE EXAMPLES + +Here are some example first messages from users (for style inspiration): +{first_message_examples} + +## SPECIAL CASES + +- **Non-English first message:** If the user writes in another language, nanochat should briefly acknowledge it can understand but works best in English, then continue helpfully. +- **Misconceptions:** If the user has wrong assumptions (e.g., "you're made by OpenAI"), gently correct them. +- **Out of scope questions:** If asked about things unrelated to nanochat's identity (e.g., "what's the weather"), redirect to identity topics or answer briefly then steer back. 
+ +## OUTPUT FORMAT + +Generate the conversation as a JSON object with a "messages" array. Each message has "role" (user/assistant) and "content". Start with a user message. """.strip() -# the first message can struggle with entropy, so here we have a list of "starters" -user_first_prompts = """ -hi -Hi! -hello -Hello? -hey there -Hey! -yo -Yo! -Good morning -Good evening! -Howdy -sup -What's up? -Hi nanochat -Hey, who are you? -Hello there :) -yo nanochat -Hi, what is this? -Hey, are you a chatbot? -Hello! Who am I talking to? -hi there -hey hey -hello friend -hiya -greetings -hey nanochat! -hello again -good afternoon -morning! -evening! -yo there -hi bot -hi assistant -hello nanochat :) -hey, anyone here? -hi! what do you do? -hello from the other side -hiya nanochat -hey you -hello world -hey! what's going on -hi! who made you -hello :) -yo! how are you -hi! can you talk -hello there nanochat -hi, what's your name -hey! are you alive -hiya! what are you -hello! tell me about yourself -hi, are you the ai -yo, what is this -hello my friend -hi! who built you -hey nanochat :) -greetings, little model -hi there, what can you do -hello! are you open source -hey, what version are you -hi! nice to meet you -hi :) -hey buddy -hello hello -yo! what's up nanochat -hi! are you real -hey, how's it going -hello! can you hear me -hi nanochat, who trained you -yo, what model are you -hi! tell me a fun fact -hey, are you chatgpt -hello! introduce yourself -hiya there -hi! what's your story -hey, what's nanochat -good day! -hello! who's your creator -hi! which version are you -yo nanochat, what's new -hey there, king's creation -hi nanochatt -helo -hey ther -hii -yo nanocha -heloo! -hi, whos this -hay -helloo?? -hi nanocat -yo! any1 here? -hi, what r u -helo nanochat -hai! -sup bot? -heyy -hi! 
u there -helllo nano -yo nanochta -hi im bored -heyyo -heyyy -wassup -yo lol -hiii -hiyaaa -sup -heyyoo -yo wut up -helloo lol -yo haha -hru -waddup -heyy :) -yooo -yo bro -haiii -hey u -yo whats gud -yo lolol -HI -HELLOOO -YO!!! -HEY -SUP -WASSUP -HEY!!! -YO BRO -HELLO?? -HI THERE!! -YO WHATS UP -HEY U -HEYOOOO -YO LOL -HIII -HIYA -YOOOO -HELLO!!! -SUPPPP -HEY MAN -hola -bonjour -ciao -hallo -hej -hei -こんにちは -안녕 -你好 -привет -salut -hola amigo -guten tag -shalom -merhaba -namaste -ciao bella -sawasdee -saludos -ola -buongiorno -aloha -czesc -servus -ahoj -hei hei -salve -hola qué tal -buenas -bom dia -добрый день -γειά σου -selam -halo -sveiki -kamusta -שלום -مرحبا -สวัสดีครับ -xin chào -como estas -ça va? -wie geht’s -tudo bem? -你好吗 -annyeong haseyo -konnichiwa, genki? -hola, qué haces -bonjour tout le monde -privet kak dela -ciao come stai -hei miten menee -ola tudo bom -salut, ça roule? -namaste, kaise ho -merhaba nasılsın -hola hola, todo bien? -hej, hur är läget -ahoj, jak se máš -γειά, τι κάνεις -""".strip().split("\n") +# ============================================================================= +# API CONFIGURATION +# ============================================================================= -prompt = prompt.replace("%README%", readme) - -# Define the JSON schema for structured output response_format = { - "type": "json_schema", - "json_schema": { - "name": "conversation", - "strict": True, - "schema": { - "type": "object", - "properties": { - "messages": { - "type": "array", - "description": "A list of conversation messages alternating between user and assistant, with the first message being a user message", - "items": { + "type": "json_schema", + "json_schema": { + "name": "conversation", + "strict": True, + "schema": { "type": "object", "properties": { - "role": { - "type": "string", - "description": "The role of the speaker, either 'user' or 'assistant'" - }, - "content": { - "type": "string", - "description": "The message content" - } + 
"messages": { + "type": "array", + "description": "Conversation messages alternating user/assistant, starting with user", + "items": { + "type": "object", + "properties": { + "role": { + "type": "string", + "description": "Either 'user' or 'assistant'" + }, + "content": { + "type": "string", + "description": "The message content" + } + }, + "required": ["role", "content"], + "additionalProperties": False + } + } }, - "required": ["role", "content"], + "required": ["messages"], "additionalProperties": False - } } - }, - "required": ["messages"], - "additionalProperties": False } - } } -# Sadly it doesn't seem like Chat completions support `n` -# to generate multiple completions per prompt. base_payload = { - "model": "google/gemini-2.5-flash", - "stream": False, - "response_format": response_format, - "temperature": 1.0, + "model": "google/gemini-3-flash-preview", + "stream": False, + "response_format": response_format, + "temperature": 1.0, } +# ============================================================================= +# GENERATION LOGIC +# ============================================================================= + +def sample_diversity_elements(rng): + """Sample one element from each diversity dimension.""" + # Sample topic: first pick a category, then a topic within it + category = rng.choice(list(topics.keys())) + topic = rng.choice(topics[category]) + + # Sample persona + persona = rng.choice(personas) + + # Sample dynamic + dynamic = rng.choice(dynamics) + + # Sample first message examples: pick from multiple categories + first_msg_samples = [] + categories = rng.sample(list(first_messages.keys()), min(3, len(first_messages))) + for cat in categories: + first_msg_samples.append(rng.choice(first_messages[cat])) + + return { + "topic": topic, + "persona": persona, + "dynamic": dynamic, + "first_message_examples": "\n".join(f"- {msg}" for msg in first_msg_samples), + } + + def generate_conversation(idx: int): """ Generate a single conversation using the 
OpenRouter API. Returns a list of message dicts with 'role' and 'content' keys. """ + # Use idx as seed for reproducibility + rng = random.Random(idx) - # pick 5 example user first messages and insert them into prompt as inspiration - rng = random.Random(idx) # use idx as seed to the rng - user_first_prompt = "\n".join(rng.choice(user_first_prompts) for _ in range(5)) + # Sample diversity elements + elements = sample_diversity_elements(rng) + + # Build the prompt + prompt = prompt_template.format( + knowledge=knowledge, + topic=elements["topic"], + persona=elements["persona"], + dynamic=elements["dynamic"], + first_message_examples=elements["first_message_examples"], + ) + + # Make API request payload = copy.deepcopy(base_payload) - modified_prompt = prompt.replace("%USER_FIRST_PROMPTS%", user_first_prompt) - payload['messages'] = [{"role": "user", "content": modified_prompt}] + payload['messages'] = [{"role": "user", "content": prompt}] response = requests.post(url, headers=headers, json=payload) result = response.json() - content = result['choices'][0]['message']['content'] - # Parse the JSON response and unpack the messages + if 'error' in result: + raise Exception(f"API error: {result['error']}") + + content = result['choices'][0]['message']['content'] conversation_data = json.loads(content) messages = conversation_data['messages'] - return messages + # Return messages along with metadata for debugging + return { + "messages": messages, + "metadata": { + "topic": elements["topic"], + "persona": elements["persona"], + "dynamic": elements["dynamic"], + } + } -# Configuration -num_conversations = 1000 -num_workers = 4 +def validate_conversation(messages): + """Validate conversation structure.""" + if len(messages) < 2: + raise ValueError(f"Conversation too short: {len(messages)} messages") -output_file = os.path.join(get_base_dir(), "identity_conversations.jsonl") -# Wipe the file clean first to reset it -if os.path.exists(output_file): - os.remove(output_file) 
-print(f"Saving to {output_file}") + for i, message in enumerate(messages): + expected_role = "user" if i % 2 == 0 else "assistant" + if message['role'] != expected_role: + raise ValueError(f"Message {i} has role '{message['role']}', expected '{expected_role}'") -# Use ThreadPoolExecutor to generate conversations in parallel -print(f"Generating {num_conversations} conversations with {num_workers} workers...") -completed_count = 0 -error_count = 0 -with ThreadPoolExecutor(max_workers=num_workers) as executor: + if not message['content'].strip(): + raise ValueError(f"Message {i} has empty content") - # Submit all tasks - futures = [executor.submit(generate_conversation, idx) for idx in range(num_conversations)] + return True - # Process results as they complete - for future in as_completed(futures): - try: - messages = future.result() - # Lightly validate the conversation structure - for i, message in enumerate(messages): - expected_role = "user" if i % 2 == 0 else "assistant" - assert message['role'] == expected_role, f"Message {i} has role {message['role']} but should be {expected_role}" +# ============================================================================= +# MAIN +# ============================================================================= - # If all looks good, write the messages to file - with open(output_file, 'a') as f: - f.write(json.dumps(messages) + '\n') - completed_count += 1 - print(f"✓ Saved conversation {completed_count}/{num_conversations}") +if __name__ == "__main__": + import argparse - except Exception as e: - error_count += 1 - print(f"✗ Error generating conversation: {e}") + parser = argparse.ArgumentParser(description="Generate synthetic conversation data") + parser.add_argument("--num", type=int, default=1000, help="Number of conversations to generate") + parser.add_argument("--workers", type=int, default=4, help="Number of parallel workers") + parser.add_argument("--output", type=str, default=None, help="Output file path") + 
parser.add_argument("--append", action="store_true", help="Append to existing file instead of overwriting") + parser.add_argument("--save-metadata", action="store_true", help="Save metadata alongside messages") + args = parser.parse_args() -print(f"\nDone! Successfully saved {completed_count} conversations to {output_file}") -if error_count > 0: - print(f"Encountered {error_count} errors during generation") + # Set output file + if args.output: + output_file = args.output + else: + output_file = os.path.join(get_base_dir(), "identity_conversations.jsonl") + # Handle file creation/clearing + if not args.append and os.path.exists(output_file): + os.remove(output_file) + + print(f"Output file: {output_file}") + print(f"Generating {args.num} conversations with {args.workers} workers...") + print(f"Topic categories: {list(topics.keys())}") + print(f"Personas: {len(personas)}") + print(f"Dynamics: {len(dynamics)}") + print() + + completed_count = 0 + error_count = 0 + + with ThreadPoolExecutor(max_workers=args.workers) as executor: + # Submit all tasks + futures = {executor.submit(generate_conversation, idx): idx + for idx in range(args.num)} + + # Process results as they complete + for future in as_completed(futures): + idx = futures[future] + try: + result = future.result() + messages = result["messages"] + metadata = result["metadata"] + + # Validate + validate_conversation(messages) + + # Write to file + with open(output_file, 'a') as f: + if args.save_metadata: + f.write(json.dumps({"messages": messages, "metadata": metadata}) + '\n') + else: + f.write(json.dumps(messages) + '\n') + + completed_count += 1 + topic_short = metadata["topic"][:40] + "..." if len(metadata["topic"]) > 40 else metadata["topic"] + print(f"[{completed_count}/{args.num}] Topic: {topic_short}") + + except Exception as e: + error_count += 1 + print(f"[ERROR] idx={idx}: {e}") + + print() + print(f"Done! 
Saved {completed_count} conversations to {output_file}") + if error_count > 0: + print(f"Encountered {error_count} errors during generation") From b19b4f3e4917cc4436099a89dfc517ee7e821a85 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Mon, 2 Feb 2026 15:50:14 +0000 Subject: [PATCH 094/119] fix bug in speedrun script, batch size that doesn't OOM on 8XH100 for d24 is 16 --- runs/speedrun.sh | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/runs/speedrun.sh b/runs/speedrun.sh index a709462d..d390c6d7 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -69,13 +69,10 @@ python -m scripts.tok_eval echo "Waiting for dataset download to complete..." wait $DATASET_DOWNLOAD_PID -# Number of processes/GPUs to use -NPROC_PER_NODE=8 - # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12) -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --device-batch-size=16 --run=$WANDB_RUN # evaluate the model: CORE metric, BPB on train/val, and draw samples -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval +torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16 # ----------------------------------------------------------------------------- # SFT (teach the model conversation special tokens, tool use, multiple choice) @@ -85,8 +82,8 @@ torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl # run SFT and eval the model -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m 
scripts.chat_eval -- -i sft +torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --device-batch-size=16 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft # chat with the model over CLI! Leave out the -p to chat interactively # python -m scripts.chat_cli -p "Why is the sky blue?" From 72b9064f9dbb75f9755c483d55c977335cba2728 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 2 Feb 2026 17:33:46 +0100 Subject: [PATCH 095/119] remove leftover mid references (#491) --- nanochat/checkpoint_manager.py | 1 - nanochat/report.py | 6 +----- scripts/chat_cli.py | 2 +- scripts/chat_eval.py | 2 +- scripts/chat_rl.py | 3 +-- scripts/chat_web.py | 2 +- tasks/spellingbee.py | 2 +- 7 files changed, 6 insertions(+), 12 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index d1e0a075..5a95fbfc 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -164,7 +164,6 @@ def load_model_from_dir(checkpoints_dir, device, phase, model_tag=None, step=Non def load_model(source, *args, **kwargs): model_dir = { "base": "base_checkpoints", - "mid": "mid_checkpoints", "sft": "chatsft_checkpoints", "rl": "chatrl_checkpoints", }[source] diff --git a/nanochat/report.py b/nanochat/report.py index 1a31aa45..5e74b987 100644 --- a/nanochat/report.py +++ b/nanochat/report.py @@ -211,8 +211,6 @@ EXPECTED_FILES = [ "base-model-training.md", "base-model-loss.md", "base-model-evaluation.md", - "midtraining.md", - "chat-evaluation-mid.md", "chat-sft.md", "chat-evaluation-sft.md", "chat-rl.md", @@ -316,8 +314,6 @@ class Report: # extract the most important metrics from the sections if file_name == "base-model-evaluation.md": final_metrics["base"] = extract(section, "CORE") - if file_name == "chat-evaluation-mid.md": - final_metrics["mid"] = extract(section, chat_metrics) if file_name == "chat-evaluation-sft.md": final_metrics["sft"] = extract(section, chat_metrics) if file_name == 
"chat-evaluation-rl.md": @@ -337,7 +333,7 @@ class Report: # Custom ordering: CORE first, ChatCORE last, rest in middle all_metrics = sorted(all_metrics, key=lambda x: (x != "CORE", x == "ChatCORE", x)) # Fixed column widths - stages = ["base", "mid", "sft", "rl"] + stages = ["base", "sft", "rl"] metric_width = 15 value_width = 8 # Write table header diff --git a/scripts/chat_cli.py b/scripts/chat_cli.py index d35c435b..7de7e107 100644 --- a/scripts/chat_cli.py +++ b/scripts/chat_cli.py @@ -12,7 +12,7 @@ from nanochat.engine import Engine from nanochat.checkpoint_manager import load_model parser = argparse.ArgumentParser(description='Chat with the model') -parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|mid|rl") +parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|rl") parser.add_argument('-g', '--model-tag', type=str, default=None, help='Model tag to load') parser.add_argument('-s', '--step', type=int, default=None, help='Step to load') parser.add_argument('-p', '--prompt', type=str, default='', help='Prompt the model, get a single response back') diff --git a/scripts/chat_eval.py b/scripts/chat_eval.py index cae2f0f8..bc152395 100644 --- a/scripts/chat_eval.py +++ b/scripts/chat_eval.py @@ -183,7 +183,7 @@ if __name__ == "__main__": # Parse command-line arguments parser = argparse.ArgumentParser() - parser.add_argument('-i', '--source', type=str, required=True, help="Source of the model: sft|mid|rl") + parser.add_argument('-i', '--source', type=str, required=True, help="Source of the model: sft|rl") parser.add_argument('-a', '--task-name', type=str, default=None, help="Task name. Default = all tasks. 
Use | to split multiple tasks.") parser.add_argument('-d', '--dtype', type=str, default='bfloat16', choices=['float32', 'bfloat16']) parser.add_argument('-t', '--temperature', type=float, default=0.0) diff --git a/scripts/chat_rl.py b/scripts/chat_rl.py index 695c0083..20a1a0ae 100644 --- a/scripts/chat_rl.py +++ b/scripts/chat_rl.py @@ -38,7 +38,6 @@ parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('d parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|bfloat16") # Model loading -parser.add_argument("--source", type=str, default="sft", help="mid|sft - which checkpoint to load from") parser.add_argument("--model-tag", type=str, default=None, help="model tag to load from") parser.add_argument("--model-step", type=int, default=None, help="model step to load from") # Training horizon @@ -77,7 +76,7 @@ use_dummy_wandb = args.run == "dummy" or not master_process wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-rl", name=args.run, config=user_config) # Init model and tokenizer -model, tokenizer, meta = load_model(args.source, device, phase="eval", model_tag=args.model_tag, step=args.model_step) +model, tokenizer, meta = load_model("sft", device, phase="eval", model_tag=args.model_tag, step=args.model_step) engine = Engine(model, tokenizer) # for sampling rollouts # ----------------------------------------------------------------------------- diff --git a/scripts/chat_web.py b/scripts/chat_web.py index 42c01ac0..66d78066 100644 --- a/scripts/chat_web.py +++ b/scripts/chat_web.py @@ -62,7 +62,7 @@ MAX_MAX_TOKENS = 4096 parser = argparse.ArgumentParser(description='NanoChat Web Server') parser.add_argument('-n', '--num-gpus', type=int, default=1, help='Number of GPUs to use (default: 1)') -parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|mid|rl") 
+parser.add_argument('-i', '--source', type=str, default="sft", help="Source of the model: sft|rl") parser.add_argument('-t', '--temperature', type=float, default=0.8, help='Default temperature for generation') parser.add_argument('-k', '--top-k', type=int, default=50, help='Default top-k sampling parameter') parser.add_argument('-m', '--max-tokens', type=int, default=512, help='Default max tokens for generation') diff --git a/tasks/spellingbee.py b/tasks/spellingbee.py index 24954c00..44889bdb 100644 --- a/tasks/spellingbee.py +++ b/tasks/spellingbee.py @@ -20,7 +20,7 @@ LLM because it has to learn how every token (a little semantic chunk/atom) maps to the sequence of individual characters that make it up. Larger models learn this eventually on their own, but if we want this capability to exist in smaller models, we have to actively encourage it by over-representing it -in the training data. Midtraining is a good place to do this. +in the training data. SFT is a good place to do this. To preview a few example conversations, run: python -m tasks.spellingbee From 8ebc14b3484a8dcebedc542a97c20dcb3a41a2ae Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 3 Feb 2026 20:25:48 +0000 Subject: [PATCH 096/119] small touchups to the eval script, re-order items etc, cosmetic --- scripts/base_eval.py | 88 ++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 44 deletions(-) diff --git a/scripts/base_eval.py b/scripts/base_eval.py index 57f9fd43..e45ae432 100644 --- a/scripts/base_eval.py +++ b/scripts/base_eval.py @@ -73,7 +73,7 @@ def load_hf_model(hf_path: str, device): model = AutoModelForCausalLM.from_pretrained(hf_path) model.to(device) model.eval() - max_seq_len = 1024 if "openai-community/gpt2" in hf_path else None + max_seq_len = 1024 if "gpt2" in hf_path else None model = ModelWrapper(model, max_seq_len=max_seq_len) tokenizer = HuggingFaceTokenizer.from_pretrained(hf_path) return model, tokenizer @@ -180,7 +180,7 @@ def 
evaluate_core(model, tokenizer, device, max_per_task=-1): def main(): parser = argparse.ArgumentParser(description="Base model evaluation") parser.add_argument('--eval', type=str, default='core,bpb,sample', help='Comma-separated evaluations to run: core,bpb,sample (default: all)') - parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. openai-community/gpt2)') + parser.add_argument('--hf-path', type=str, default=None, help='HuggingFace model path (e.g. openai-community/gpt2-xl)') parser.add_argument('--model-tag', type=str, default=None, help='nanochat model tag to identify the checkpoint directory') parser.add_argument('--step', type=int, default=None, help='Model step to load (default = last)') parser.add_argument('--max-per-task', type=int, default=-1, help='Max examples per CORE task (-1 = all)') @@ -225,48 +225,6 @@ def main(): samples = [] unconditioned_samples = [] - # --- CORE evaluation --- - if 'core' in eval_modes: - print0("\n" + "="*80) - print0("CORE Evaluation") - print0("="*80) - with autocast_ctx: - core_results = evaluate_core(model, tokenizer, device, max_per_task=args.max_per_task) - - # Write CSV output - if ddp_rank == 0: - base_dir = get_base_dir() - output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") - os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) - with open(output_csv_path, 'w', encoding='utf-8', newline='') as f: - f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") - for label in core_results["results"]: - acc = core_results["results"][label] - centered = core_results["centered_results"][label] - f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n") - f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n") - print0(f"\nResults written to: {output_csv_path}") - print0(f"CORE metric: {core_results['core_metric']:.4f}") - - # --- BPB evaluation --- - if 'bpb' in eval_modes: - print0("\n" + "="*80) - print0("BPB Evaluation") - 
print0("="*80) - tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size - if args.split_tokens % tokens_per_step != 0: - # Adjust to nearest multiple - args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step - print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})") - steps = args.split_tokens // tokens_per_step - - for split_name in ["train", "val"]: - loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) - with autocast_ctx: - bpb = evaluate_bpb(model, loader, steps, token_bytes) - bpb_results[split_name] = bpb - print0(f"{split_name} bpb: {bpb:.6f}") - # --- Sampling --- if 'sample' in eval_modes and not is_hf_model: print0("\n" + "="*80) @@ -305,6 +263,48 @@ def main(): elif 'sample' in eval_modes and is_hf_model: print0("\nSkipping sampling for HuggingFace models (not supported)") + # --- BPB evaluation --- + if 'bpb' in eval_modes: + print0("\n" + "="*80) + print0("BPB Evaluation") + print0("="*80) + tokens_per_step = args.device_batch_size * sequence_len * ddp_world_size + if args.split_tokens % tokens_per_step != 0: + # Adjust to nearest multiple + args.split_tokens = (args.split_tokens // tokens_per_step) * tokens_per_step + print0(f"Adjusted split_tokens to {args.split_tokens} (must be divisible by {tokens_per_step})") + steps = args.split_tokens // tokens_per_step + + for split_name in ["train", "val"]: + loader = tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, sequence_len, split_name, device=device) + with autocast_ctx: + bpb = evaluate_bpb(model, loader, steps, token_bytes) + bpb_results[split_name] = bpb + print0(f"{split_name} bpb: {bpb:.6f}") + + # --- CORE evaluation --- + if 'core' in eval_modes: + print0("\n" + "="*80) + print0("CORE Evaluation") + print0("="*80) + with autocast_ctx: + core_results = evaluate_core(model, tokenizer, device, 
max_per_task=args.max_per_task) + + # Write CSV output + if ddp_rank == 0: + base_dir = get_base_dir() + output_csv_path = os.path.join(base_dir, "base_eval", f"{model_slug}.csv") + os.makedirs(os.path.dirname(output_csv_path), exist_ok=True) + with open(output_csv_path, 'w', encoding='utf-8', newline='') as f: + f.write(f"{'Task':<35}, {'Accuracy':<10}, {'Centered':<10}\n") + for label in core_results["results"]: + acc = core_results["results"][label] + centered = core_results["centered_results"][label] + f.write(f"{label:<35}, {acc:<10.6f}, {centered:<10.6f}\n") + f.write(f"{'CORE':<35}, {'':<10}, {core_results['core_metric']:<10.6f}\n") + print0(f"\nResults written to: {output_csv_path}") + print0(f"CORE metric: {core_results['core_metric']:.4f}") + # --- Log to report --- from nanochat.report import get_report report_data = [{"model": model_name}] From 6079f78fc383a874cc031c92630c924397384c6e Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 3 Feb 2026 20:51:26 +0000 Subject: [PATCH 097/119] add fp8 training with torchao --- pyproject.toml | 7 +- scripts/base_train.py | 99 ++++++++++++++-- uv.lock | 254 ++++++++++++++---------------------------- 3 files changed, 180 insertions(+), 180 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index f3cd8d73..bcb674d8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,7 +19,8 @@ dependencies = [ "tabulate>=0.9.0", "tiktoken>=0.11.0", "tokenizers>=0.22.0", - "torch>=2.9.0", + "torch==2.9.1", + "torchao==0.15.0", "transformers>=4.57.3", "uvicorn>=0.36.0", "wandb>=0.21.3", @@ -59,10 +60,10 @@ explicit = true [project.optional-dependencies] cpu = [ - "torch>=2.9.1", + "torch==2.9.1", ] gpu = [ - "torch>=2.9.1", + "torch==2.9.1", ] [tool.uv] diff --git a/scripts/base_train.py b/scripts/base_train.py index 9be4b6b5..fa05b60a 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -16,7 +16,7 @@ import os os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" import argparse import time 
-from contextlib import nullcontext +from contextlib import nullcontext, contextmanager import wandb import torch @@ -39,6 +39,9 @@ parser = argparse.ArgumentParser(description="Pretrain base model") parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)") # Runtime parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)") +# FP8 training +parser.add_argument("--fp8", action="store_true", help="enable FP8 training (requires H100+ GPU and torchao)") +parser.add_argument("--fp8-recipe", type=str, default="tensorwise", choices=["rowwise", "tensorwise"], help="FP8 scaling recipe: tensorwise (faster, recommended) or rowwise (more accurate but slower)") # Model architecture parser.add_argument("--depth", type=int, default=20, help="depth of the Transformer model") parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = depth * aspect_ratio") @@ -65,7 +68,7 @@ parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR a parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)") # Evaluation parser.add_argument("--eval-every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)") -parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on") +parser.add_argument("--eval-tokens", type=int, default=40*524288, help="number of tokens to evaluate val loss on") parser.add_argument("--core-metric-every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)") parser.add_argument("--core-metric-max-per-task", type=int, default=500, help="examples per task for CORE metric") parser.add_argument("--sample-every", type=int, default=2000, help="sample from model every N steps (-1 = disable)") @@ -177,11 +180,11 @@ if resuming: model.load_state_dict(model_data, strict=True, assign=True) del 
model_data # free up this memory after the copy -orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape) -model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe +# ----------------------------------------------------------------------------- +# Determine the length of the training run based on model size # Detailed parameter counts -param_counts = orig_model.num_scaling_params() +param_counts = model.num_scaling_params() print0(f"Parameter counts:") for key, value in param_counts.items(): print0(f"{key:24s}: {value:,}") @@ -211,6 +214,85 @@ print0(f"Total number of training tokens: {total_tokens:,}") print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") +# ----------------------------------------------------------------------------- +# FP8 training initialization and management (has to be done before torch.compile) + +# Convert Linear layers to Float8Linear if --fp8 is set +if args.fp8: + if device_type != "cuda": + print0("Warning: FP8 training requires CUDA, ignoring --fp8 flag") + else: + from torchao.float8 import Float8LinearConfig, convert_to_float8_training + import torch.nn as nn + + # Filter: only convert layers with dimensions divisible by 16 (FP8 hardware requirement) + def fp8_module_filter(mod: nn.Module, fqn: str) -> bool: + if not isinstance(mod, nn.Linear): + return False + # FP8 requires both in_features and out_features divisible by 16 + if mod.in_features % 16 != 0 or mod.out_features % 16 != 0: + return False + return True + + fp8_config = Float8LinearConfig.from_recipe_name(args.fp8_recipe) + convert_to_float8_training(model, config=fp8_config, module_filter_fn=fp8_module_filter) + num_fp8_layers = sum(1 for m in model.modules() if 'Float8' in 
type(m).__name__) + num_skipped = sum(1 for m in model.modules() if isinstance(m, nn.Linear)) - num_fp8_layers + print0(f"✓ FP8 training enabled ({args.fp8_recipe} scaling) - converted {num_fp8_layers} layers, skipped {num_skipped} (dims not divisible by 16)") + +# Context manager to temporarily disable FP8 so that model evaluation remains in BF16 +@contextmanager +def disable_fp8(model): + """Temporarily swap Float8Linear modules with nn.Linear for BF16 evaluation. + + CastConfig is a frozen dataclass, so we can't mutate scaling_type. Instead, + we swap out Float8Linear modules entirely and restore them after. + """ + import torch.nn as nn + + # Find all Float8Linear modules and their locations + fp8_locations = [] # list of (parent_module, attr_name, fp8_module) + for name, module in model.named_modules(): + if 'Float8' in type(module).__name__: + if '.' in name: + parent_name, attr_name = name.rsplit('.', 1) + parent = model.get_submodule(parent_name) + else: + parent = model + attr_name = name + fp8_locations.append((parent, attr_name, module)) + + if not fp8_locations: + yield # No FP8 modules, nothing to do + return + + # Swap Float8Linear -> nn.Linear (shares the same weight tensor, no copy) + for parent, attr_name, fp8_module in fp8_locations: + linear = nn.Linear( + fp8_module.in_features, + fp8_module.out_features, + bias=fp8_module.bias is not None, + device=fp8_module.weight.device, + dtype=fp8_module.weight.dtype, + ) + linear.weight = fp8_module.weight # share, don't copy + if fp8_module.bias is not None: + linear.bias = fp8_module.bias + setattr(parent, attr_name, linear) + + try: + yield + finally: + # Restore Float8Linear modules + for parent, attr_name, fp8_module in fp8_locations: + setattr(parent, attr_name, fp8_module) + +# ----------------------------------------------------------------------------- +# Compile the model + +orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the 
shapes may change shape) +model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe + # ----------------------------------------------------------------------------- # Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) adam_betas = (args.adam_beta1, args.adam_beta2) @@ -287,7 +369,7 @@ while True: model.eval() val_loader = build_val_loader() eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size) - with autocast_ctx: + with disable_fp8(model), autocast_ctx: val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes) print0(f"Step {step:05d} | Validation bpb: {val_bpb:.6f}") if val_bpb < min_val_bpb: @@ -302,10 +384,11 @@ while True: # once in a while: estimate the CORE metric (all ranks participate) # use the original uncompiled model because the inputs keep changing shape + # disable FP8 for evaluation to use BF16 for more consistent/accurate results results = {} if args.core_metric_every > 0 and (last_step or (step > 0 and step % args.core_metric_every == 0)): model.eval() - with autocast_ctx: + with disable_fp8(orig_model), autocast_ctx: results = evaluate_core(orig_model, tokenizer, device, max_per_task=args.core_metric_max_per_task) print0(f"Step {step:05d} | CORE metric: {results['core_metric']:.4f}") wandb_run.log({ @@ -332,7 +415,7 @@ while True: engine = Engine(orig_model, tokenizer) # use orig_model to avoid recompilation for prompt in prompts: tokens = tokenizer(prompt, prepend="<|bos|>") - with autocast_ctx: + with disable_fp8(orig_model), autocast_ctx: sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0) print0(tokenizer.decode(sample[0])) model.train() diff --git a/uv.lock b/uv.lock index dd766f89..e5fc97f6 100644 --- a/uv.lock +++ b/uv.lock @@ -1505,11 +1505,11 @@ dependencies = [ { name = "tabulate" }, { name = "tiktoken" }, { name = "tokenizers" }, - { name = "torch", version = 
"2.9.0", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, { name = "torch", version = "2.9.1", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "torch", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "torch", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu')" }, { name = "torch", version = "2.9.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(sys_platform != 'darwin' and extra == 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "extra == 'extra-8-nanochat-gpu'" }, + { name = "torchao" }, { name = "transformers" }, { name = "uvicorn" }, { name = "wandb" }, @@ -1546,9 +1546,10 @@ requires-dist = [ { name = "tabulate", specifier = ">=0.9.0" }, { name = "tiktoken", specifier = ">=0.11.0" }, { name = "tokenizers", specifier = ">=0.22.0" }, - { name = "torch", specifier = ">=2.9.0" }, - { name = "torch", marker = "extra == 'cpu'", specifier = ">=2.9.1", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "nanochat", extra = "cpu" } }, - { name = "torch", marker = "extra == 'gpu'", specifier = ">=2.9.1", index = 
"https://download.pytorch.org/whl/cu128", conflict = { package = "nanochat", extra = "gpu" } }, + { name = "torch", specifier = "==2.9.1" }, + { name = "torch", marker = "extra == 'cpu'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "nanochat", extra = "cpu" } }, + { name = "torch", marker = "extra == 'gpu'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "nanochat", extra = "gpu" } }, + { name = "torchao", specifier = "==0.15.0" }, { name = "transformers", specifier = ">=4.57.3" }, { name = "uvicorn", specifier = ">=0.36.0" }, { name = "wandb", specifier = ">=0.21.3" }, @@ -1688,7 +1689,7 @@ name = "nvidia-cudnn-cu12" version = "9.10.2.21" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/fa/41/e79269ce215c857c935fd86bcfe91a451a584dfc27f1e068f568b9ad1ab7/nvidia_cudnn_cu12-9.10.2.21-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:c9132cc3f8958447b4910a1720036d9eff5928cc3179b0a51fb6d167c6cc87d8", size = 705026878, upload-time = "2025-06-06T21:52:51.348Z" }, @@ -1701,7 +1702,7 @@ name = "nvidia-cufft-cu12" version = "11.3.3.83" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 
'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/60/bc/7771846d3a0272026c416fbb7e5f4c1f146d6d80704534d0b187dd6f4800/nvidia_cufft_cu12-11.3.3.83-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:848ef7224d6305cdb2a4df928759dca7b1201874787083b6e7550dd6765ce69a", size = 193109211, upload-time = "2025-03-07T01:44:56.873Z" }, @@ -1733,9 +1734,9 @@ name = "nvidia-cusolver-cu12" version = "11.7.3.90" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cusparse-cu12", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-cublas-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-cusparse-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/c8/32/f7cd6ce8a7690544d084ea21c26e910a97e077c9b7f07bf5de623ee19981/nvidia_cusolver_cu12-11.7.3.90-py3-none-manylinux_2_27_aarch64.whl", hash = "sha256:db9ed69dbef9715071232caa9b69c52ac7de3a95773c2db65bdba85916e4e5c0", size = 267229841, upload-time = "2025-03-07T01:46:54.356Z" }, @@ -1748,7 +1749,7 @@ 
name = "nvidia-cusparse-cu12" version = "12.5.8.93" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "nvidia-nvjitlink-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/bc/f7/cd777c4109681367721b00a106f491e0d0d15cfa1fd59672ce580ce42a97/nvidia_cusparse_cu12-12.5.8.93-py3-none-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:9b6c161cb130be1a07a27ea6923df8141f3c295852f4b260c65f18f3e0a091dc", size = 288117129, upload-time = "2025-03-07T01:47:40.407Z" }, @@ -2990,72 +2991,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6e/c2/61d3e0f47e2b74ef40a68b9e6ad5984f6241a942f7cd3bbfbdbd03861ea9/tomli-2.2.1-py3-none-any.whl", hash = "sha256:cb55c73c5f4408779d0cf3eef9f762b9c9f147a77de7b258bef0a5628adc85cc", size = 14257, upload-time = "2024-11-27T22:38:35.385Z" }, ] -[[package]] -name = "torch" -version = "2.9.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", -] -dependencies = [ - { name = "filelock", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "fsspec", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "jinja2", marker = "(sys_platform == 'linux' and extra != 
'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cublas-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cuda-cupti-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cuda-nvrtc-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cuda-runtime-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cudnn-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 
'extra-8-nanochat-gpu')" }, - { name = "nvidia-cufft-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cufile-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-curand-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cusolver-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cusparse-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-cusparselt-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-nccl-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-nvjitlink-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 
'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-nvshmem-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "nvidia-nvtx-cu12", marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "sympy", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "triton", version = "3.5.0", source = { registry = "https://pypi.org/simple" }, marker = "(platform_machine == 'x86_64' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "typing-extensions", marker = "(sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/bb/86/245c240d2138c17ed572c943c289056c2721abab70810d772c6bf5495b28/torch-2.9.0-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:030bbfe367379ae6a4ae4042b6c44da25383343b8b3c68abaa9c7231efbaf2dd", size = 104213554, upload-time = "2025-10-15T15:45:59.798Z" }, - { url = 
"https://files.pythonhosted.org/packages/58/1d/fd1e88ae0948825efcab7dd66d12bec23f05d4d38ed81573c8d453c14c06/torch-2.9.0-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:51cb63902182a78e90886e8068befd8ea102af4b00e420263591a3d70c7d3c6c", size = 899795167, upload-time = "2025-10-15T15:47:12.695Z" }, - { url = "https://files.pythonhosted.org/packages/63/5a/496197b45c14982bef4e079b24c61dc108e3ab0d0cc9718dba9f54f45a46/torch-2.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:3f6aad4d2f0ee2248bac25339d74858ff846c3969b27d14ac235821f055af83d", size = 109310314, upload-time = "2025-10-15T15:46:16.633Z" }, - { url = "https://files.pythonhosted.org/packages/58/b0/2b4e647b0fc706e88eb6c253d05511865578f5f67b55fad639bf3272a4a1/torch-2.9.0-cp310-none-macosx_11_0_arm64.whl", hash = "sha256:413e1654c9203733138858780e184d9fc59442f0b3b209e16f39354eb893db9b", size = 74452019, upload-time = "2025-10-15T15:46:04.296Z" }, - { url = "https://files.pythonhosted.org/packages/58/fe/334225e6330e672b36aef23d77451fa906ea12881570c08638a91331a212/torch-2.9.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:c596708b5105d0b199215acf0c9be7c1db5f1680d88eddadf4b75a299259a677", size = 104230578, upload-time = "2025-10-15T15:46:08.182Z" }, - { url = "https://files.pythonhosted.org/packages/05/cc/49566caaa218872ec9a2912456f470ff92649894a4bc2e5274aa9ef87c4a/torch-2.9.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:51de31219c97c51cf4bf2be94d622e3deb5dcc526c6dc00e97c17eaec0fc1d67", size = 899815990, upload-time = "2025-10-15T15:48:03.336Z" }, - { url = "https://files.pythonhosted.org/packages/74/25/e9ab21d5925b642d008f139d4a3c9664fc9ee1faafca22913c080cc4c0a5/torch-2.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:dd515c70059afd95f48b8192733764c08ca37a1d19803af6401b5ecad7c8676e", size = 109313698, upload-time = "2025-10-15T15:46:12.425Z" }, - { url = 
"https://files.pythonhosted.org/packages/b3/b7/205ef3e94de636feffd64b28bb59a0dfac0771221201b9871acf9236f5ca/torch-2.9.0-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:614a185e4986326d526a91210c8fc1397e76e8cfafa78baf6296a790e53a9eec", size = 74463678, upload-time = "2025-10-15T15:46:29.779Z" }, - { url = "https://files.pythonhosted.org/packages/d1/d3/3985739f3b8e88675127bf70f82b3a48ae083e39cda56305dbd90398fec0/torch-2.9.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:e5f7af1dc4c0a7c4a260c2534f41ddaf209714f7c89145e644c44712fbd6b642", size = 104107898, upload-time = "2025-10-15T15:46:20.883Z" }, - { url = "https://files.pythonhosted.org/packages/a5/4b/f4bb2e6c25d0272f798cd6d7a04ed315da76cec68c602d87040c7847287f/torch-2.9.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:01cff95ecd9a212ea2f141db28acccdceb6a4c54f64e6c51091146f5e2a772c6", size = 899738273, upload-time = "2025-10-15T15:50:04.188Z" }, - { url = "https://files.pythonhosted.org/packages/66/11/c1c5ba6691cda6279087c35bd626536e4fd29521fe740abf5008377a9a02/torch-2.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:4582b162f541651f0cb184d3e291c05c2f556c7117c64a9873e2ee158d40062b", size = 109280887, upload-time = "2025-10-15T15:46:26.228Z" }, - { url = "https://files.pythonhosted.org/packages/dd/5f/b85bd8c05312d71de9402bf5868d217c38827cfd09d8f8514e5be128a52b/torch-2.9.0-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:33f58e9a102a91259af289d50525c30323b5c9ae1d31322b6447c0814da68695", size = 74478983, upload-time = "2025-10-15T15:46:39.406Z" }, - { url = "https://files.pythonhosted.org/packages/c2/1c/90eb13833cdf4969ea9707586d7b57095c3b6e2b223a7256bf111689bcb8/torch-2.9.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:c30a17fc83eeab346913e237c64b15b5ba6407fff812f6c541e322e19bc9ea0e", size = 104111330, upload-time = "2025-10-15T15:46:35.238Z" }, - { url = 
"https://files.pythonhosted.org/packages/0e/21/2254c54b8d523592c25ef4434769aa23e29b1e6bf5f4c0ad9e27bf442927/torch-2.9.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:8f25033b8667b57857dfd01458fbf2a9e6a6df1f8def23aef0dc46292f6aa642", size = 899750243, upload-time = "2025-10-15T15:48:57.459Z" }, - { url = "https://files.pythonhosted.org/packages/b7/a5/5cb94fa4fd1e78223455c23c200f30f6dc10c6d4a2bcc8f6e7f2a2588370/torch-2.9.0-cp313-cp313-win_amd64.whl", hash = "sha256:d037f1b4ffd25013be4a7bf3651a0a910c68554956c7b2c92ebe87c76475dece", size = 109284513, upload-time = "2025-10-15T15:46:45.061Z" }, - { url = "https://files.pythonhosted.org/packages/66/e8/fc414d8656250ee46120b44836ffbb3266343db424b3e18ca79ebbf69d4f/torch-2.9.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:e4e5b5cba837a2a8d1a497ba9a58dae46fa392593eaa13b871c42f71847503a5", size = 74830362, upload-time = "2025-10-15T15:46:48.983Z" }, - { url = "https://files.pythonhosted.org/packages/ed/5f/9474c98fc5ae0cd04b9466035428cd360e6611a86b8352a0fc2fa504acdc/torch-2.9.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:64693568f5dc4dbd5f880a478b1cea0201cc6b510d91d1bc54fea86ac5d1a637", size = 104144940, upload-time = "2025-10-15T15:47:29.076Z" }, - { url = "https://files.pythonhosted.org/packages/2d/5a/8e0c1cf57830172c109d4bd6be2708cabeaf550983eee7029291322447a0/torch-2.9.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:f8ed31ddd7d10bfb3fbe0b9fe01b1243577f13d75e6f4a0839a283915ce3791e", size = 899744054, upload-time = "2025-10-15T15:48:29.864Z" }, - { url = "https://files.pythonhosted.org/packages/6d/28/82c28b30fcb4b7c9cdd995763d18bbb830d6521356712faebbad92ffa61d/torch-2.9.0-cp313-cp313t-win_amd64.whl", hash = "sha256:eff527d4e4846e6f70d2afd8058b73825761203d66576a7e04ea2ecfebcb4ab8", size = 109517546, upload-time = "2025-10-15T15:47:33.395Z" }, - { url = 
"https://files.pythonhosted.org/packages/ff/c3/a91f96ec74347fa5fd24453fa514bc61c61ecc79196fa760b012a1873d96/torch-2.9.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:f8877779cf56d1ce431a7636703bdb13307f5960bb1af49716d8b179225e0e6a", size = 74480732, upload-time = "2025-10-15T15:47:38.002Z" }, - { url = "https://files.pythonhosted.org/packages/5c/73/9f70af34b334a7e0ef496ceec96b7ec767bd778ea35385ce6f77557534d1/torch-2.9.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:7e614fae699838038d888729f82b687c03413c5989ce2a9481f9a7e7a396e0bb", size = 74433037, upload-time = "2025-10-15T15:47:41.894Z" }, - { url = "https://files.pythonhosted.org/packages/b7/84/37cf88625901934c97109e583ecc21777d21c6f54cda97a7e5bbad1ee2f2/torch-2.9.0-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:dfb5b8cd310ba3436c7e14e8b7833ef658cf3045e50d2bdaed23c8fc517065eb", size = 104116482, upload-time = "2025-10-15T15:47:46.266Z" }, - { url = "https://files.pythonhosted.org/packages/56/8e/ca8b17866943a8d4f4664d402ea84210aa274588b4c5d89918f5caa24eec/torch-2.9.0-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:b3d29524993a478e46f5d598b249cd824b7ed98d7fba538bd9c4cde6c803948f", size = 899746916, upload-time = "2025-10-15T15:50:40.294Z" }, - { url = "https://files.pythonhosted.org/packages/43/65/3b17c0fbbdab6501c5b320a52a648628d0d44e7379f64e27d9eef701b6bf/torch-2.9.0-cp314-cp314-win_amd64.whl", hash = "sha256:71c7578984f5ec0eb645eb4816ac8435fcf3e3e2ae1901bcd2f519a9cafb5125", size = 109275151, upload-time = "2025-10-15T15:49:20.715Z" }, - { url = "https://files.pythonhosted.org/packages/83/36/74f8c051f785500396e42f93542422422dfd874a174f21f8d955d36e5d64/torch-2.9.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:71d9309aee457bbe0b164bce2111cd911c4ed4e847e65d5077dbbcd3aba6befc", size = 74823353, upload-time = "2025-10-15T15:49:16.59Z" }, - { url = 
"https://files.pythonhosted.org/packages/62/51/dc3b4e2f9ba98ae27238f0153ca098bf9340b2dafcc67fde645d496dfc2a/torch-2.9.0-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:c08fb654d783899e204a32cca758a7ce8a45b2d78eeb89517cc937088316f78e", size = 104140340, upload-time = "2025-10-15T15:50:19.67Z" }, - { url = "https://files.pythonhosted.org/packages/c0/8d/b00657f8141ac16af7bb6cda2e67de18499a3263b78d516b9a93fcbc98e3/torch-2.9.0-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:ec8feb0099b2daa5728fbc7abb0b05730fd97e0f359ff8bda09865aaa7bd7d4b", size = 899731750, upload-time = "2025-10-15T15:49:36.673Z" }, - { url = "https://files.pythonhosted.org/packages/fc/29/bd361e0cbb2c79ce6450f42643aaf6919956f89923a50571b0ebfe92d142/torch-2.9.0-cp314-cp314t-win_amd64.whl", hash = "sha256:695ba920f234ad4170c9c50e28d56c848432f8f530e6bc7f88fcb15ddf338e75", size = 109503850, upload-time = "2025-10-15T15:50:24.118Z" }, -] - [[package]] name = "torch" version = "2.9.1" @@ -3076,13 +3011,13 @@ dependencies = [ { name = "typing-extensions", marker = "(sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp310-none-macosx_11_0_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp311-none-macosx_11_0_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp312-none-macosx_11_0_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp313-none-macosx_11_0_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp314-cp314-macosx_11_0_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp310-none-macosx_11_0_arm64.whl", hash = 
"sha256:bf1e68cfb935ae2046374ff02a7aa73dda70351b46342846f557055b3a540bf0" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp311-none-macosx_11_0_arm64.whl", hash = "sha256:a52952a8c90a422c14627ea99b9826b7557203b46b4d0772d3ca5c7699692425" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp312-none-macosx_11_0_arm64.whl", hash = "sha256:287242dd1f830846098b5eca847f817aa5c6015ea57ab4c1287809efea7b77eb" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:8924d10d36eac8fe0652a060a03fc2ae52980841850b9a1a2ddb0f27a4f181cd" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:bcee64ae7aa65876ceeae6dcaebe75109485b213528c74939602208a20706e3f" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:defadbeb055cfcf5def58f70937145aecbd7a4bc295238ded1d0e85ae2cf0e1d" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:886f84b181f766f53265ba0a1d503011e60f53fff9d569563ef94f24160e1072" }, ] [[package]] @@ -3090,19 +3025,22 @@ name = "torch" version = "2.9.1" source = { registry = "https://pypi.org/simple" } resolution-markers = [ + "python_full_version >= '3.12' and sys_platform == 'linux'", "python_full_version >= '3.12' and sys_platform != 'linux'", + "python_full_version == '3.11.*' and sys_platform == 'linux'", + "python_full_version < '3.11' and sys_platform == 'linux'", "python_full_version == '3.11.*' and sys_platform != 'linux'", "python_full_version < '3.11' and sys_platform != 'linux'", ] dependencies = [ - { name = "filelock", marker = "(sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "fsspec", marker = "(sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and 
extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "jinja2", marker = "(sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "setuptools", marker = "(python_full_version >= '3.12' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "sympy", marker = "(sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, - { name = "typing-extensions", marker = "(sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "filelock", marker = "(extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu')" }, + { name = "fsspec", marker = "(extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu')" }, + { name = "jinja2", marker = "(extra 
== 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu')" }, + { name = "networkx", version = "3.4.2", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version < '3.11' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "networkx", version = "3.5", source = { registry = "https://pypi.org/simple" }, marker = "(python_full_version >= '3.11' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "setuptools", marker = "(python_full_version >= '3.12' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "sympy", marker = "(extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu')" }, + { name = "typing-extensions", marker = "(extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu')" }, ] wheels = [ { url = "https://files.pythonhosted.org/packages/5f/56/9577683b23072075ed2e40d725c52c2019d71a972fab8e083763da8e707e/torch-2.9.1-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:1cc208435f6c379f9b8fdfd5ceb5be1e3b72a6bdf1cb46c0d2812aa73472db9e", size = 104207681, upload-time = "2025-11-12T15:19:56.48Z" }, @@ -3158,30 +3096,30 @@ dependencies = [ { name = "typing-extensions", marker = "(sys_platform != 'darwin' and extra == 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, ] wheels = [ - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl" }, - { url = 
"https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp310-cp310-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-win_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-win_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-win_arm64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313t-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl" }, - { url = 
"https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314t-win_amd64.whl" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:10866c8a48c4aa5ae3f48538dc8a055b99c57d9c6af2bf5dd715374d9d6ddca3" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:7210713b66943fdbfcc237b2e782871b649123ac5d29f548ce8c85be4223ab38" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp310-cp310-win_amd64.whl", hash = "sha256:d6e8441453dc27524e3f1037fbf27b90a02644b84e42944b9354b4024cb51cc1" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:0e611cfb16724e62252b67d31073bc5c490cb83e92ecdc1192762535e0e44487" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:3de2adb9b4443dc9210ef1f1b16da3647ace53553166d6360bbbd7edd6f16e4d" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-win_amd64.whl", hash = "sha256:69b3785d28be5a9c56ab525788ec5000349ec59132a74b7d5e954b905015b992" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp311-cp311-win_arm64.whl", hash = "sha256:15b4ae6fe371d96bffb8e1e9af62164797db20a0dc1337345781659cfd0b8bb1" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:3bf9b442a51a2948e41216a76d7ab00f0694cfcaaa51b6f9bcab57b7f89843e6" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-manylinux_2_28_x86_64.whl", hash 
= "sha256:7417d8c565f219d3455654cb431c6d892a3eb40246055e14d645422de13b9ea1" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-win_amd64.whl", hash = "sha256:a4e06b4f441675d26b462123c8a83e77c55f1ec8ebc081203be2db1ea8054add" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp312-cp312-win_arm64.whl", hash = "sha256:1abe31f14b560c1f062699e966cb08ef5b67518a1cfac2d8547a3dbcd8387b06" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:3e532e553b37ee859205a9b2d1c7977fd6922f53bbb1b9bfdd5bdc00d1a60ed4" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:39b3dff6d8fba240ae0d1bede4ca11c2531ae3b47329206512d99e17907ff74b" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-win_amd64.whl", hash = "sha256:404a7ab2fffaf2ca069e662f331eb46313692b2f1630df2720094284f390ccef" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313-win_arm64.whl", hash = "sha256:161decbff26a33f13cb5ba6d2c8f458bbf56193bcc32ecc70be6dd4c7a3ee79d" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:01b1884f724977a20c7da2f640f1c7b37f4a2c117a7f4a6c1c0424d14cb86322" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:031a597147fa81b1e6d79ccf1ad3ccc7fafa27941d6cf26ff5caaa384fb20e92" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp313-cp313t-win_amd64.whl", hash = "sha256:e586ab1363e3f86aa4cc133b7fdcf98deb1d2c13d43a7a6e5a6a18e9c5364893" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:65010ab4aacce6c9a1ddfc935f986c003ca8638ded04348fd326c3e74346237c" }, + { url = 
"https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:88adf5157db5da1d54b1c9fe4a6c1d20ceef00e75d854e206a87dbf69e3037dc" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314-win_amd64.whl", hash = "sha256:f60e2565f261542efac07e25208fb3fc55c6fe82314a5a9cbee971edb5f27713" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:3ac2b8df2c55430e836dcda31940d47f1f5f94b8731057b6f20300ebea394dd9" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:5b688445f928f13563b7418b17c57e97bf955ab559cf73cd8f2b961f8572dbb3" }, + { url = "https://download.pytorch.org/whl/cpu/torch-2.9.1%2Bcpu-cp314-cp314t-win_amd64.whl", hash = "sha256:cf9c3e50b595721ca6b488bdcc326e0f1af73ed28b9b66eff504a96649bb5c96" }, ] [[package]] @@ -3219,31 +3157,40 @@ dependencies = [ { name = "nvidia-nvtx-cu12", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, { name = "setuptools", marker = "(python_full_version >= '3.12' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, { name = "sympy", marker = "extra == 'extra-8-nanochat-gpu'" }, - { name = "triton", version = "3.5.1", source = { registry = "https://pypi.org/simple" }, marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, + { name = "triton", marker = "(sys_platform == 'linux' and extra == 'extra-8-nanochat-gpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, { name = "typing-extensions", marker = "extra == 'extra-8-nanochat-gpu'" }, ] wheels = [ - { url = 
"https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp310-cp310-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp311-cp311-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp312-cp312-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313t-win_amd64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314-win_amd64.whl" }, - { url = 
"https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-manylinux_2_28_aarch64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl" }, - { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-win_amd64.whl" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp310-cp310-manylinux_2_28_aarch64.whl", hash = "sha256:72f0f096475e8095a6bea3fba75bd3b46cf42c761b29588f7599314e67a32661" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp310-cp310-manylinux_2_28_x86_64.whl", hash = "sha256:c8d670aa0be6fbecd2b0e7b7d514a104dbdefcc3786ca446cf0c3415043ea40a" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp310-cp310-win_amd64.whl", hash = "sha256:64399adaa8ea0896d02cf844cba3c5dd77e769520a1af73572599e0eaa2cf551" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:cf4ad82430824a80a9f398e29369524ed26c152cf00c2c12002e5400b35e260d" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:2a1da940f0757621d098c9755f7504d791a72a40920ec85a4fd98b20253fca4e" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp311-cp311-win_amd64.whl", hash = "sha256:633005a3700e81b5be0df2a7d3c1d48aced23ed927653797a3bd2b144a3aeeb6" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:1176f250311fa95cc3bca8077af323e0d73ea385ba266e096af82e7e2b91f256" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:7cb4018f4ce68b61fd3ef87dc1c4ca520731c7b5b200e360ad47b612d7844063" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp312-cp312-win_amd64.whl", hash = 
"sha256:3a01f0b64c10a82d444d9fd06b3e8c567b1158b76b2764b8f51bfd8f535064b0" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:0b80b7555dcd0a75b7b06016991f01281a0bb078cf28fa2d1dfb949fad2fbd07" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:63381a109a569b280ed3319da89d3afe5cf9ab5c879936382a212affb5c90552" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313-win_amd64.whl", hash = "sha256:ad9183864acdd99fc5143d7ca9d3d2e7ddfc9a9600ff43217825d4e5e9855ccc" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:2314521c74d76e513c53bb72c0ce3511ef0295ff657a432790df6c207e5d7962" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:4454a4faca31af81566e3a4208f10f20b8a6d9cfe42791b0ca7ff134326468fc" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp313-cp313t-win_amd64.whl", hash = "sha256:24420e430e77136f7079354134b34e7ba9d87e539f5ac84c33b08e5c13412ebe" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314-manylinux_2_28_aarch64.whl", hash = "sha256:32c036296c557f19a1537ce981c40533650097114e1720a321a39a3b08d9df56" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314-manylinux_2_28_x86_64.whl", hash = "sha256:7788d3d03d939cf00f93ac0da5ab520846f66411e339cfbf519a806e8facf519" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314-win_amd64.whl", hash = "sha256:7bcd40cbffac475b478d6ce812f03da84e9a4894956efb89c3b7bcca5dbd4f91" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:e88c78e5b08ae9303aa15da43b68b44287ecbec16d898d9fad6998832fe626a5" }, + { url = 
"https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:7d8769bdf3200ca16a92f14df404c3370171ac3732996528a8973d753eac562f" }, + { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-win_amd64.whl", hash = "sha256:0c784b600959ec70ee01cb23e8bc870a0e0475af30378ff5e39f4abed8b7c1cc" }, +] + +[[package]] +name = "torchao" +version = "0.15.0" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/57/2d/472b9362dceae05a4599e2b94f86e69a29c0e20964a6af84f34f6ead5938/torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6", size = 7163930, upload-time = "2025-12-18T23:14:41.876Z" }, + { url = "https://files.pythonhosted.org/packages/f6/3b/6b9d5618720f63dbc2e2509cd6b57aae9c0d61b738d1d2172f4d5d9efaab/torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c", size = 1080679, upload-time = "2025-12-18T23:14:43.807Z" }, ] [[package]] @@ -3307,41 +3254,10 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/6a/6b/2f416568b3c4c91c96e5a365d164f8a4a4a88030aa8ab4644181fdadce97/transformers-4.57.3-py3-none-any.whl", hash = "sha256:c77d353a4851b1880191603d36acb313411d3577f6e2897814f333841f7003f4", size = 11993463, upload-time = "2025-11-25T15:51:26.493Z" }, ] -[[package]] -name = "triton" -version = "3.5.0" -source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", -] -wheels = [ - { url = "https://files.pythonhosted.org/packages/dd/22/507b6f58a35e05e84381630b2dc2a3cee1a7a2a7eaf4cba857c638a18a24/triton-3.5.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:6f90de6a6566bb619b4c0adc9855729e1b1b5e26533fca1bf6206e96b6d277a3", size = 159827599, upload-time = "2025-10-15T19:15:43.87Z" }, - { url = "https://files.pythonhosted.org/packages/0b/eb/09e31d107a5d00eb281aa7e6635ca463e9bca86515944e399480eadb71f8/triton-3.5.0-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:d5d3b3d480debf24eaa739623c9a42446b0b77f95593d30eb1f64cd2278cc1f0", size = 170333110, upload-time = "2025-10-13T16:37:49.588Z" }, - { url = "https://files.pythonhosted.org/packages/79/f9/b6f60f978397c616fd8dacca2305759fe4f80d397b20ef72534803244bd5/triton-3.5.0-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8457b22148defefdcb7fa8144b05ce211b9faefad650a1ce85b23df488d5549c", size = 159926731, upload-time = "2025-10-15T19:15:49.682Z" }, - { url = "https://files.pythonhosted.org/packages/3d/78/949a04391c21956c816523678f0e5fa308eb5b1e7622d88c4e4ef5fceca0/triton-3.5.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f34bfa21c5b3a203c0f0eab28dcc1e49bd1f67d22724e77fb6665a659200a4ec", size = 170433488, upload-time = "2025-10-13T16:37:57.132Z" }, - { url = "https://files.pythonhosted.org/packages/87/9b/30988039e1e84df7554fba24e6a734d2d0e847af33cabdf9b532b3c51456/triton-3.5.0-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7da21fccceafc163e3a5e857abe34351ef76345af06cabf9637a914742671f0b", size = 159946647, upload-time = "2025-10-15T19:15:56.325Z" }, - { url = "https://files.pythonhosted.org/packages/f5/3a/e991574f3102147b642e49637e0281e9bb7c4ba254edb2bab78247c85e01/triton-3.5.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9e71db82261c4ffa3921cd050cd5faa18322d2d405c30eb56084afaff3b0833", size = 170476535, upload-time = "2025-10-13T16:38:05.18Z" }, - { url = 
"https://files.pythonhosted.org/packages/cd/85/e37f1197acb04c8f3d83851d23d5d6ed5060ef74580668b112e23fdfa203/triton-3.5.0-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:188da5b81fa2f8322c27fec1627703eac24cb9bb7ab0dfbe9925973bc1b070d3", size = 159958970, upload-time = "2025-10-15T19:16:01.717Z" }, - { url = "https://files.pythonhosted.org/packages/6c/29/10728de8a6e932e517c10773486b8e99f85d1b1d9dd87d9a9616e1fef4a1/triton-3.5.0-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:e6bb9aa5519c084a333acdba443789e50012a4b851cd486c54f0b8dc2a8d3a12", size = 170487289, upload-time = "2025-10-13T16:38:11.662Z" }, - { url = "https://files.pythonhosted.org/packages/b8/1d/38258f05010ac17a7b058c022911c9cae6526e149b7397134a048cf5a6c2/triton-3.5.0-cp313-cp313t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:03127d9b33aaf979c856676b394bc059ec1d68cb6da68ae03f62dd8ad77a04ae", size = 160073012, upload-time = "2025-10-15T19:16:07.477Z" }, - { url = "https://files.pythonhosted.org/packages/5c/38/db80e48b9220c9bce872b0f616ad0446cdf554a40b85c7865cbca99ab3c2/triton-3.5.0-cp313-cp313t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c83f2343e1a220a716c7b3ab9fccfcbe3ad4020d189549200e2d2e8d5868bed9", size = 170577179, upload-time = "2025-10-13T16:38:17.865Z" }, - { url = "https://files.pythonhosted.org/packages/91/fe/8f5771d00227f4eb1ee034f218ed427102b989366d2275fe3b3c105a3921/triton-3.5.0-cp314-cp314-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468936651d383f4a6d10068d34a627505e13af55be5d002b9f27b987e7a5f0ac", size = 159957460, upload-time = "2025-10-15T19:16:12.626Z" }, - { url = "https://files.pythonhosted.org/packages/ff/60/1810655d1d856c9a4fcc90ee8966d85f552d98c53a6589f95ab2cbe27bb8/triton-3.5.0-cp314-cp314-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:da0fa67ccd76c3dcfb0bffe1b1c57c685136a6bd33d141c24d9655d4185b1289", size = 170487949, upload-time = 
"2025-10-13T16:38:24.881Z" }, - { url = "https://files.pythonhosted.org/packages/78/59/99edd103958fe6e42b50b9ad8ce4f223ddf4ccf475259cf7d2b53381dc6c/triton-3.5.0-cp314-cp314t-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c7ceef21410229ac23173a28eee5cfc0e37c1dfdb8b4bc11ecda2e3ecec7c686", size = 160075629, upload-time = "2025-10-15T19:16:18.746Z" }, - { url = "https://files.pythonhosted.org/packages/fb/b7/1dec8433ac604c061173d0589d99217fe7bf90a70bdc375e745d044b8aad/triton-3.5.0-cp314-cp314t-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:317fe477ea8fd4524a6a8c499fb0a36984a56d0b75bf9c9cb6133a1c56d5a6e7", size = 170580176, upload-time = "2025-10-13T16:38:31.14Z" }, -] - [[package]] name = "triton" version = "3.5.1" source = { registry = "https://pypi.org/simple" } -resolution-markers = [ - "python_full_version >= '3.12' and sys_platform == 'linux'", - "python_full_version == '3.11.*' and sys_platform == 'linux'", - "python_full_version < '3.11' and sys_platform == 'linux'", -] wheels = [ { url = "https://files.pythonhosted.org/packages/d9/2e/f95e673222afa2c7f0c687d8913e98fcf2589ef0b1405de76894e37fe18f/triton-3.5.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f63e34dcb32d7bd3a1d0195f60f30d2aee8b08a69a0424189b71017e23dfc3d2", size = 159821655, upload-time = "2025-11-11T17:51:44.09Z" }, { url = "https://files.pythonhosted.org/packages/fd/6e/676ab5019b4dde8b9b7bab71245102fc02778ef3df48218b298686b9ffd6/triton-3.5.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:5fc53d849f879911ea13f4a877243afc513187bc7ee92d1f2c0f1ba3169e3c94", size = 170320692, upload-time = "2025-11-11T17:40:46.074Z" }, From a67eba35dce1ce3f7276b6913bcedf4cbb75b506 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 3 Feb 2026 20:54:30 +0000 Subject: [PATCH 098/119] add feb2 new leaderboard record from upgrading to fp8 training, +4.3% speedup to time to GPT-2 --- README.md | 36 
++++++----------------------- dev/LOG.md | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 29 deletions(-) diff --git a/README.md b/README.md index 62834375..e96b5a77 100644 --- a/README.md +++ b/README.md @@ -14,37 +14,15 @@ For questions about the repo, I recommend either using [DeepWiki](https://deepwi ## Leaderboard -| # | Record time | Description | Date | Commit | Contributors | -|---|-------------|-------------|------|--------|--------------| -| 1 | 3.04 hours | d24 baseline, slightly overtrained | Jan 29 2026 | 348fbb3 | @karpathy | +| # | Record time | val_bpb | CORE | Description | Date | Commit | Contributors | +|---|-------------|---------|------|-------------|------|--------|--------------| +| 0 | 168 hours | - | 0.256525 | Original OpenAI GPT-2 checkpoint | 2019 | - | OpenAI | +| 1 | 3.04 | 0.74833 | 0.25851 | d24 baseline, slightly overtrained | Jan 29 2026 | 348fbb3 | @karpathy | +| 2 | 2.91 | 0.74504 | 0.2578 | d26 slightly undertrained **+fp8** | Feb 2 2026 | TODO | @karpathy | -The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. In 2019, the training of GPT-2 cost approximately $50,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so in 3 hours or less, for ~$73 and below. Once your repo is set up (see the [runs/speedrun.sh](runs/speedrun.sh) script for reference), e.g. the way I kicked off the jan29 run is as follows: +The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. The GPT-2 CORE score is 0.256525. In 2019, the training of GPT-2 cost approximately $50,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so much faster and for well below $100 (e.g. at the current ~$3/GPU/hr, an 8XH100 node is ~$24/hr, so 3 hours is ~$72). 
-``` -OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ - --depth=24 \ - --run=d24-jan29 \ - --model-tag=d24_jan29 \ - --device-batch-size=16 \ - --sample-every=-1 \ - --save-every=-1 \ - --core-metric-max-per-task=-1 \ - --core-metric-every=3000 \ - --target-param-data-ratio=12 -``` - -After 3 hours we get output like this: - -``` -... -wandb: Run summary: -wandb: core_metric 0.25851 -wandb: step 16704 -wandb: total_training_flops 4.330784131228946e+19 -wandb: total_training_time 10949.46713 -``` - -The GPT-2 CORE score (i.e. the target to beat) is 0.256525. So we see that this d24 CORE score is higher (0.25851). Then we look at the `total_training_time`, which is the time of the training iterations alone, excluding all the evaluations and logging, in seconds. We get: `10949/60/60 ~= 3.04` hours, the current record. +See [dev/LEADERBOARD.md](dev/LEADERBOARD.md) for more docs on how to interpret and contribute to the leaderboard. ## Getting started diff --git a/dev/LOG.md b/dev/LOG.md index dd11b427..8cdef874 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,74 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-02-02: FP8 Training with torchao + +Integrated FP8 training using `torchao.float8` to accelerate Linear layer matmuls on H100 GPUs. + +### Background + +FP8 (8-bit floating point) uses H100's FP8 tensor cores for ~2x theoretical matmul throughput. The tradeoff is quantization overhead: computing scales and casting tensors to/from FP8. Still, as an example, torchtitan (Meta's distributed training framework) reports 25-28% speedups with FP8 for some of their experiments. + +**Previous attempt (Jan 2026):** FP8 on just `lm_head` following modded-nanogpt with custom ops → 1% speedup, +2GB memory. Failed due to fragile torch.compile interaction. However, that experiment was run at ~d12 scale, not at the larger ~d24 scale needed for GPT-2 capability.
+ +**This attempt:** Use torchao's `convert_to_float8_training()` on ALL Linear layers, increase model size to d24. The core snippet is: + +```python +from torchao.float8 import Float8LinearConfig, convert_to_float8_training +config = Float8LinearConfig.from_recipe_name("tensorwise") +convert_to_float8_training(model, config=config) +``` + +But in practice it's more involved (see base_train.py). + +### Results + +**Microbenchmark (d26 MLP, 65536x1664 @ 1664x6656):** + +| Method | Forward | Fwd+Bwd | Speedup | +|--------|---------|---------|---------| +| BF16 + compile | 2.00ms | 4.79ms | 1.00x | +| FP8 rowwise + compile | 1.84ms | 4.55ms | 1.08x | +| FP8 tensorwise + compile | 1.45ms | 4.06ms | **1.38x** | +| FP8 rowwise (no compile) | 2.89ms | 21.86ms | 0.23x ❌ | + +torch.compile is MANDATORY. Without it, FP8 is 4x slower due to unfused scaling ops. + +**Full training (d26):** + +| Config | tok/sec | vs baseline | +|--------|---------|-------------| +| BF16 baseline | 630K | 1.00x | +| FP8 rowwise | 564K | 0.90x ❌ | +| FP8 tensorwise | 740K | **1.17x** ✓ | + +Memory usage also decreases quite a bit, by ~9GB (activations stored as FP8 instead of BF16). + +Seeing 17% speedup is encouraging but we're still not done yet because each step is now in lower precision and less powerful individually, so to make up for the precision drop we have to train longer. Empirically, running some sweeps overnight on d24 scale, I saw that the actual speedup (when you match performance) is closer to 5%. It's possible that our LLMs at ~d24 scale are still too small to confidently enjoy the speedups that come from fp8 for bigger models. + +### Key Learnings + +For nanochat at approximate scale of interest (~GPT-2 capability, ~d24): + +1. **Tensorwise >> Rowwise** - Rowwise computes per-row scales, overhead exceeds benefit. Tensorwise uses one scale per tensor. +2. **Filter small layers** - Layers with dims not divisible by 16 must be skipped (FP8 hardware requirement) +3. 
**Larger models benefit more** - d12 was still slower with FP8; d26+ shows gains. Therefore, in some depths there is a benefit to fp8 and in some there isn't. Keeping it configurable for now, passed in via kwargs and default off. +4. **The effective, capability-matched speedup is lower still** - because each step is of slightly lower precision/quality. + +### Integration + +Added `--fp8` flag to `base_train.py`, default recipe is "tensorwise", example of turning on: + +```bash +torchrun --nproc_per_node=8 -m scripts.base_train --depth=24 --fp8 +``` + +Uses tensorwise by default. Requires `torchao==0.15.0` (compatible with torch 2.9.1), which was added to dependencies. + +**TLDR**: turning on fp8 for GPT-2 capability nanochat model gives approx +5% capability-matched speedup. + +--- + ## 2026-01-29: Hyperball/MuonH Experiments (Negative Result) Explored Hyperball optimization from [this post](https://psychedelic-sunstone-851.notion.site/Fantastic-Pretraining-Optimizers-and-Where-to-Find-Them-2-1-Hyperball-Optimization-2e924306e6f280e7a5ffee00eb40a0dd) (saved to `knowledge/muonh.md`). Constrains weights to sphere of radius R (initial norm): `W_{t+1} = R · Normalize(W_t - η·R · Normalize(u_t))`. Had to change a number of details in a branch, e.g. not use zero init for our projections (or the initial norm would be zero), keep track of the initial norm, adjust Muon -> MuonH for the update. 
From fe55b092b8a2e3b46411a9e2e9acd8cc0f6788d1 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 3 Feb 2026 21:05:28 +0000 Subject: [PATCH 099/119] minor cosmetics for the table --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index e96b5a77..08c184a0 100644 --- a/README.md +++ b/README.md @@ -14,11 +14,11 @@ For questions about the repo, I recommend either using [DeepWiki](https://deepwi ## Leaderboard -| # | Record time | val_bpb | CORE | Description | Date | Commit | Contributors | +| # | time | val_bpb | CORE | Description | Date | Commit | Contributors | |---|-------------|---------|------|-------------|------|--------|--------------| -| 0 | 168 hours | - | 0.256525 | Original OpenAI GPT-2 checkpoint | 2019 | - | OpenAI | -| 1 | 3.04 | 0.74833 | 0.25851 | d24 baseline, slightly overtrained | Jan 29 2026 | 348fbb3 | @karpathy | -| 2 | 2.91 | 0.74504 | 0.2578 | d26 slightly undertrained **+fp8** | Feb 2 2026 | TODO | @karpathy | +| 0 | 168 hours | - | 0.2565 | Original OpenAI GPT-2 checkpoint | 2019 | - | OpenAI | +| 1 | 3.04 | 0.74833 | 0.2585 | d24 baseline, slightly overtrained | Jan 29 2026 | 348fbb3 | @karpathy | +| 2 | 2.91 | 0.74504 | 0.2578 | d26 slightly undertrained **+fp8** | Feb 2 2026 | 8309b83 | @karpathy | The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. The GPT-2 CORE score is 0.256525. In 2019, the training of GPT-2 cost approximately $50,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so much faster and for well below $100 (e.g. at the current ~$3/GPU/hr, an 8XH100 node is ~$24/hr, so 3 hours is ~$72). 
From 16b8ac7da33010dc7efcd9292f895703f5cff33a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 3 Feb 2026 21:06:12 +0000 Subject: [PATCH 100/119] oops forgot to attach leaderboard file too --- dev/LEADERBOARD.md | 119 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 119 insertions(+) create mode 100644 dev/LEADERBOARD.md diff --git a/dev/LEADERBOARD.md b/dev/LEADERBOARD.md new file mode 100644 index 00000000..3b61cc60 --- /dev/null +++ b/dev/LEADERBOARD.md @@ -0,0 +1,119 @@ +# Leaderboard + +Docs on participating in the "Time-to-GPT-2" leaderboard of nanochat. + +The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. Originally in 2019, GPT-2 was trained by OpenAI on 32 TPU v3 chips for 168 hours (7 days), with $8/hour/TPUv3 back then, for a total cost of approx. $43K. It achieves a CORE score of 0.256525, where CORE is an ensemble metric introduced in the DCLM paper over 22 evaluations like ARC/HellaSwag/etc. + +## How to + +The script [runs/speedrun.sh](runs/speedrun.sh) always implements the current state of the art on the leaderboard. + +In practice, I tune the base_train command a little bit. 
For example, once all the setup is configured and a tokenizer is trained, I like to do something like: + +``` +OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ + --depth=26 \ + --run="d26-feb2-fp8-ratio8.25" \ + --model-tag="d26_feb2_fp8_ratio8.25" \ + --device-batch-size=16 \ + --sample-every=-1 \ + --save-every=-1 \ + --core-metric-max-per-task=-1 \ + --core-metric-every=999999 \ + --target-param-data-ratio=8.25 \ + --fp8 +``` + +Note that: + +- `depth` controls the size of the Transformer +- `run` is the wandb name +- `model-tag` is the location of the checkpoints on disk +- `device-batch-size` in the ideal world, you want this to be 32 because with sequence length of 2048 (the default) and 8 GPUs we get `32 X 2048 X 8 = 524,288`, which is the total desired batch size determined to work fairly well around this scale. However, for bigger (e.g. d26), 32 is too much and OOMs, so we decrease it by 2 to 16. The `base_train.py` script automatically compensates for this by calculating that it has to use gradient accumulation of 2 to meet the desired total batch size. Therefore, it will fo forward+backward twice and then a single step. Long story short, the ideal value is 32. If that doesn't fit, you decrease it, e.g. 16, 8, etc., keeping it powers of two so that the gradient accumulation math works out neatly. +- `sample-every = -1` turns off periodic sampling +- `core-metric-max-per-task=-1` means we run the entire CORE eval +- `core-metric-every=999999` a bit of a hacky way to make the CORE eval only happen a single time at the very end of the run +- `target-param-data-ratio=8.25` controls the training horizon, which is determined in the script by taking the number of non-embedding model parameters and simply multiplying by this number. The current optimal Tokens:Params ratio can be seen in the defaults of the `base_train.py` script (it is 10.5). 
10.5 would produce the *compute optimal* model given the currently measured scaling laws. However, GPT-2 capability is currently somewhere in between a d24 and d26. So to reach it exactly, we want to either overtrain d24 or undertrain d26. In this particular example, I am choosing to slightly undertrain a d26. Note that odd depths (e.g. d25) are not super recommended to use because the math around the transformer sizing and its head dimensions doesn't come out neatly. +- `--fp8` turns on fp8 training. If you GPU does not support fp8, you can leave this out and the code will simply train in bf16. bf16 is higher precision than fp8, so you can actually expect that you might be able to do fewer steps (lower the `target-param-data-ratio`) to achieve the same capability. + +Once you kick off the run, you wait ~3 hours and then at the end you'll see something like: + +``` +wandb: Run summary: +wandb: core_metric 0.25851 +wandb: step 16704 +wandb: total_training_flops 4.330784131228946e+19 +wandb: total_training_time 10949.46713 +``` + +Your CORE metric must be greater than GPT-2 0.256525. Then you report the `total_training_time`, (e.g. 10949) which is the time of the training iterations alone, excluding all the evaluations and logging, in seconds. So here for example here it is roughly 10949/60/60 ~= 3.04 hours. You should also note and report the validation bpb of your run because the CORE metric can be a little bit noisy. + +If you outperform GPT-2 and the time is less than current SOTA in the Leaderboard, you get to make a PR. In addition to raw gains, there are some qualitative and aesthetic considerations that go into whether your improvement is merged. For example, if it is gnarly or it significantly bloats the code, or it seems too esoteric, then we will way those things against the improvement demonstrated. Additionally, nanochat cares not only about targeting a single model, but an entire miniseries of models. 
So your change must be principled enough that it can easily generalize to other model depths, so that we can sweep out a miniseries. + +After you create the commit, to get the current short git commit hash: + +``` +git log -1 --format="%h" +``` + +## Run 1 + +Achieved Jan 29 2026 on commit `348fbb3`. The launch command was + +``` +OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ + --depth=24 \ + --run=d24-jan29 \ + --model-tag=d24_jan29 \ + --device-batch-size=16 \ + --sample-every=-1 \ + --save-every=-1 \ + --core-metric-max-per-task=-1 \ + --core-metric-every=3000 \ + --target-param-data-ratio=12 +``` + +The result was: + +``` +wandb: Run summary: +wandb: core_metric 0.25851 +wandb: step 16704 +wandb: total_training_flops 4.330784131228946e+19 +wandb: total_training_time 10949.46713 +``` + +The validation bpb was 0.74833. + +Detailed writeup: [Beating GPT-2 for <<$100: the nanochat journey](https://github.com/karpathy/nanochat/discussions/481) + +## Run 2 + +Achieved Feb 2 2026 on commit `8309b83`. The launch command was + +``` +OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ + --depth=26 \ + --run="d26-feb2-fp8-ratio8.5" \ + --model-tag="d26_feb2_fp8_ratio8.5" \ + --device-batch-size=16 \ + --sample-every=-1 \ + --save-every=-1 \ + --core-metric-max-per-task=-1 \ + --core-metric-every=999999 \ + --target-param-data-ratio=8.5 \ + --fp8 +``` + +The result was: + +``` +core_metric 0.2578 +step 14889 +total_training_time 10493 +Minimum validation bpb: 0.745036 +``` + +The big change in this run is `--fp8`, which causes all Linear layers (other than the gates) to be switched to fp8 training using `torchao` with tensorwise fp8 scaling. Each step is of slightly lower quality, but we are taking them a lot faster, coming out net ahead. Anyone who does not have fp8 (e.g. using a GPU without it) can simply leave out the `--fp8` flag to train in bfloat16. 
This will work just fine but it will produce a slightly stronger model than GPT-2 because of the fp8 -> bf16 precision upgrade. It's possible that one can further tune which layers to include in the fp8 conversion and that e.g. some of the smaller matmuls should be just kept in bf16 etc. + +Previous record was 3.04 hours, so 2.91 hours is `(3.04 - 2.91)/3.04*100` ~= 4.3% speed improvement. From d510b1385b04a77d3f8777ed2b3c1064f2488c53 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 3 Feb 2026 23:21:39 +0000 Subject: [PATCH 101/119] quick experiments to log --- dev/LOG.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 8cdef874..908fac1a 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,24 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-02-03: Flip Muon MLP LR Multiplier (PR #492) + +Tested flipping the shape-based LR heuristic in Muon from boosting tall matrices (input projections like `c_fc`) to boosting wide matrices (output projections like `c_proj`). The original code applies `max(1, rows/cols)^0.5`, giving ~2x LR to `c_fc`. The flipped version gives ~2x LR to `c_proj` instead, which aligns with classical fan-in/fan-out scaling conventions. This was proposed in [PR #492](https://github.com/karpathy/nanochat/pull/492) and showed improvements in modded-nanogpt. + +**Result:** Quick d12 experiment: slightly worse. **Not adopted.** + +--- + +## 2026-02-03: Skip AdamW Every Other Step + +Inspired by modded-nanogpt, tried stepping AdamW only on odd iterations while Muon steps every iteration. The idea is that small AdamW params (embeddings, scalars, gates) don't need updates as frequently as the large weight matrices, and skipping saves both compute and communication. + +Added `skip_adamw` parameter to `MuonAdamW.step()` and `DistMuonAdamW.step()` plus a matching `zero_grad(skip_adamw=...)` to let AdamW gradients accumulate over 2 steps. 
Used `lr *= 2**-0.5` (sqrt scaling) to compensate for the 2x effective batch size on AdamW params. + +**Result:** for nanochat d12, we see ~2% faster tok/s, but each step is slightly worse in loss. On net, when plotting against wall clock time, it's slightly worse. **Not adopted.** + +--- + ## 2026-02-02: FP8 Training with torchao Integrated FP8 training using `torchao.float8` to accelerate Linear layer matmuls on H100 GPUs. From 542beb0c8c175af2d52ec7065345dcd8f0162368 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Wed, 4 Feb 2026 02:12:04 +0000 Subject: [PATCH 102/119] bump speedrun to be the up to date leaderboard run --- runs/speedrun.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runs/speedrun.sh b/runs/speedrun.sh index d390c6d7..c423ba6e 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -70,7 +70,7 @@ echo "Waiting for dataset download to complete..." wait $DATASET_DOWNLOAD_PID # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12) -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=12 --device-batch-size=16 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.5 --device-batch-size=16 --fp8 --run=$WANDB_RUN # evaluate the model: CORE metric, BPB on train/val, and draw samples torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16 From 718e5e9d67be7e92bd158f5471e1ce53de2e6a64 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 01:39:26 +0000 Subject: [PATCH 103/119] correctly reference NorMuon and fix misleading terms that i may have hastily ported over from modded-nanogpt --- dev/LOG.md | 6 +++--- nanochat/optim.py | 4 ++++ 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index 908fac1a..71cb18d3 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ 
-733,8 +733,8 @@ Cherry-picked improvements from NorMuon (modded-nanogpt) into our simpler Muon i - Both methods kept in code for easy comparison (`zeropower_via_polar_express` vs `zeropower_via_newtonschulz5`) - **Result:** No dramatic/noticeable difference in training, but keeping the new Polar Express as default. -**2. Variance Reduction (NorMuon-style)** -- Added low-rank variance estimator similar to Adafactor ([arxiv.org/pdf/2510.05491](https://arxiv.org/pdf/2510.05491)) +**2. NorMuon Variance Reduction** +- Added per-neuron/column adaptive learning rate from NorMuon ([arxiv.org/pdf/2510.05491](https://arxiv.org/pdf/2510.05491)) - Maintains `second_momentum_buffer` with shape `[rows, 1]` or `[1, cols]` (whichever is smaller) - Normalizes updates based on running per-row/col variance estimate (beta2=0.95) - Memory overhead: ~1/max(rows, cols) per param, negligible @@ -776,7 +776,7 @@ Example: If d12 optimal is 0.22, then d20 optimal ≈ 0.22 × (12/20)² ≈ 0.08 ### Summary -Muon was changed to use Polar Express, added Adafactor-style variance reduction, and cautious weight decay with schedule that ramps linearly to zero. All of these changes follow modded-nanogpt repo, but all of them were also validated piece by piece to yield improvements in nanochat with the exception of the Polar Express change which was in the noise. This is default on and configurable with `--weight_decay`, using simply 0.2 and ∝ 1/width² scaling. The kwarg `--weight_decay` is therefore changing as of this change. It used to configure AdamW via standard weight decay and now it becomes exclusively used in Muon (AdamW is hardcoded to 0.0), and it is scaled based on depth. +Muon was changed to use Polar Express, added NorMuon variance reduction, and cautious weight decay with schedule that ramps linearly to zero. 
All of these changes follow modded-nanogpt repo, but all of them were also validated piece by piece to yield improvements in nanochat with the exception of the Polar Express change which was in the noise. This is default on and configurable with `--weight_decay`, using simply 0.2 and ∝ 1/width² scaling. The kwarg `--weight_decay` is therefore changing as of this change. It used to configure AdamW via standard weight decay and now it becomes exclusively used in Muon (AdamW is hardcoded to 0.0), and it is scaled based on depth. --- diff --git a/nanochat/optim.py b/nanochat/optim.py index 190a1edb..4cc2a1fe 100644 --- a/nanochat/optim.py +++ b/nanochat/optim.py @@ -67,6 +67,10 @@ Polar Express Sign Method for orthogonalization. https://arxiv.org/pdf/2505.16932 by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower. +NorMuon variance reduction: per-neuron/column adaptive learning rate that normalizes +update scales after orthogonalization (Muon's output has non-uniform scales across neurons). +https://arxiv.org/pdf/2510.05491 + Some of the changes in nanochat implementation: - Uses a simpler, more general approach to parameter grouping and stacking - Uses a single fused kernel for the momentum -> polar_express -> variance_reduction -> update step From d63b7ab9acabaa97f2ea2f2410e5255be525a398 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 02:41:46 +0000 Subject: [PATCH 104/119] try and fail relu^2 -> swiglu --- dev/LOG.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 71cb18d3..b344b238 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,19 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-02-05: SwiGLU Activation (Negative Result) + +Replaced ReLU² MLP activation with SwiGLU (inspired by [twitter](https://x.com/_xjdr/status/2019141521690567058)). 
Implementation uses three projections (w1, w2, w3) with hidden_dim scaled to 8/3×n_embd to preserve both parameter count and FLOPs exactly (1.00x match on both). + +```python +# Old: x = c_proj(relu(c_fc(x)).square()) +# New: x = w3(silu(w1(x)) * w2(x)) +``` + +Tested at both d12 and d24 (GPT-2 scale). Worse on all measures — step efficiency, wall clock time, and FLOPs. ReLU² remains superior for nanochat. **Not adopted.** + +--- + ## 2026-02-03: Flip Muon MLP LR Multiplier (PR #492) Tested flipping the shape-based LR heuristic in Muon from boosting tall matrices (input projections like `c_fc`) to boosting wide matrices (output projections like `c_proj`). The original code applies `max(1, rows/cols)^0.5`, giving ~2x LR to `c_fc`. The flipped version gives ~2x LR to `c_proj` instead, which aligns with classical fan-in/fan-out scaling conventions. This was proposed in [PR #492](https://github.com/karpathy/nanochat/pull/492) and showed improvements in modded-nanogpt. From 1144d186ed4bd7ea949bddca03612922402ab198 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 02:42:46 +0000 Subject: [PATCH 105/119] try and fail relu^2 -> swiglu --- dev/LOG.md | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index b344b238..02561ac7 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -6,11 +6,24 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 ## 2026-02-05: SwiGLU Activation (Negative Result) -Replaced ReLU² MLP activation with SwiGLU (inspired by [twitter](https://x.com/_xjdr/status/2019141521690567058)). Implementation uses three projections (w1, w2, w3) with hidden_dim scaled to 8/3×n_embd to preserve both parameter count and FLOPs exactly (1.00x match on both). +Replaced ReLU² MLP activation with SwiGLU (inspired by [twitter](https://x.com/_xjdr/status/2019141521690567058)). 
SwiGLU uses three projections instead of two, so to match parameters and FLOPs we scale hidden_dim from 4× to 8/3×: ```python -# Old: x = c_proj(relu(c_fc(x)).square()) -# New: x = w3(silu(w1(x)) * w2(x)) +# Old ReLU²: 2 matrices, 4x expansion +# params: 2 × n × 4n = 8n² +# flops: 2 × 2n × 4n = 16n² per token +self.c_fc = Linear(n_embd, 4 * n_embd) +self.c_proj = Linear(4 * n_embd, n_embd) +x = c_proj(relu(c_fc(x)).square()) + +# New SwiGLU: 3 matrices, 8/3x expansion +# params: 2 × n × (8n/3) + (8n/3) × n = 8n² ✓ matches +# flops: 3 × 2n × (8n/3) = 16n² per token ✓ matches +hidden_dim = (8 * n_embd) // 3 +self.w1 = Linear(n_embd, hidden_dim) # gate +self.w2 = Linear(n_embd, hidden_dim) # up +self.w3 = Linear(hidden_dim, n_embd) # down +x = w3(silu(w1(x)) * w2(x)) ``` Tested at both d12 and d24 (GPT-2 scale). Worse on all measures — step efficiency, wall clock time, and FLOPs. ReLU² remains superior for nanochat. **Not adopted.** From 75b302f331fa6b678bac0c4ffabfb5012555bd08 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 16:14:28 +0000 Subject: [PATCH 106/119] fix hash commit on leaderboard and a paragraph clarification --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 08c184a0..cf07ef35 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ For questions about the repo, I recommend either using [DeepWiki](https://deepwi |---|-------------|---------|------|-------------|------|--------|--------------| | 0 | 168 hours | - | 0.2565 | Original OpenAI GPT-2 checkpoint | 2019 | - | OpenAI | | 1 | 3.04 | 0.74833 | 0.2585 | d24 baseline, slightly overtrained | Jan 29 2026 | 348fbb3 | @karpathy | -| 2 | 2.91 | 0.74504 | 0.2578 | d26 slightly undertrained **+fp8** | Feb 2 2026 | 8309b83 | @karpathy | +| 2 | 2.91 | 0.74504 | 0.2578 | d26 slightly undertrained **+fp8** | Feb 2 2026 | a67eba3 | @karpathy | The primary metric we care about is "time to GPT-2" - the wall clock time needed to 
outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. The GPT-2 CORE score is 0.256525. In 2019, the training of GPT-2 cost approximately $50,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so much faster and for well below $100 (e.g. at the current ~$3/GPU/hr, an 8XH100 node is ~$24/hr, so 3 hours is ~$72). @@ -71,7 +71,7 @@ OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train This uses wandb (run name "d12"), only runs the CORE metric on last step, and it doesn't sample and save intermediate checkpoints. I like to change something in the code, re-run a d12 (or a d16 etc) and see if it helped, in an iteration loop. -The overall approach is to treat the depth of the model as the single dial of complexity. By sweeping out the depth, we get increasingly more powerful models. We determine the scaling laws, set the data budget to a compute optimal setting, train a whole miniseries of models of increasing sizes, and compare them to the GPT-2 and GPT-3 miniseries. Right now, beating GPT-2 specifically faster and faster is the most interesting target. +The important thing to note is that nanochat is written and configured around one single dial of complexity - the depth of the transformer. This single integer automatically determines all other hyperparameters (the width of the transformer, number of heads, learning rate adjustments, training horizons, weight decays, ...) so that the trained model comes out compute optimal. The idea is that the user doesn't have to think about or set any of this; they are simply asking for a smaller or bigger model using `--depth`, and everything "just works". By sweeping out the depth, you achieve the nanochat miniseries of compute optimal models at various sizes. The GPT-2 capability model (which is of most interest at the moment) happens to be somewhere around the d24-d26 range with the current code. 
But any candidate changes to the repo have to be principled enough that they work for all settings of depth. ## Running on CPU / MPS From 012da1a78bc2b7bb70dbdeba41f987ebf5f7b2e1 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Thu, 5 Feb 2026 19:12:50 +0100 Subject: [PATCH 107/119] Typo fixes (#480) * small typo * few more small fixes * small fixes in leaderboard.md --- README.md | 8 ++++---- dev/LEADERBOARD.md | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index cf07ef35..49110901 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ For questions about the repo, I recommend either using [DeepWiki](https://deepwi ## Updates - (Jan 31 2026) Major revamp of all scripts/README ongoing, deleting midtraining stage, might be a bit messy briefly... -- (Jan 30 2026) With all the latest improvements we're able to train GPT-2 grade LLM in about $73. The [runs/speedrun.sh](runs/speedrun.sh) script will become the refernece way to train GPT-2 grade model and talk to it. +- (Jan 30 2026) With all the latest improvements we're able to train GPT-2 grade LLM in about $73. The [runs/speedrun.sh](runs/speedrun.sh) script will become the reference way to train GPT-2 grade model and talk to it. ## Leaderboard @@ -28,13 +28,13 @@ See [dev/LEADERBOARD.md](dev/LEADERBOARD.md) for more docs on how to interpret a ### Reproduce and talk to GPT-2 -The most fun you can have is to train your own GPT-2 and talk to it. The entire pipeline to do so is contained in the single file [runs/speedrun.sh](runs/speedrun.sh), which is designed to be run on an 8XH100 GPU node. Currently, at ~$24/hour for these nodes, pretraining GPT-2 grade model takes approximately 3 hours and will set you back about $75. Boot up a new 8XH100 GPU box from your favorite provider (e.g. I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: +The most fun you can have is to train your own GPT-2 and talk to it. 
The entire pipeline to do so is contained in the single file [runs/speedrun.sh](runs/speedrun.sh), which is designed to be run on an 8XH100 GPU node. Currently, at ~$24/hour for these nodes, pretraining a GPT-2 grade model takes approximately 3 hours and will set you back about $75. Boot up a new 8XH100 GPU box from your favorite provider (e.g. I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: ```bash bash runs/speedrun.sh ``` -You mish to do so in a screen session as this will take ~3 hours to run. Once it's done, you can talk to it via the ChatGPT-like web UI. Make sure again that your local uv virtual environment is active (run `source .venv/bin/activate`), and serve it: +You may wish to do so in a screen session as this will take ~3 hours to run. Once it's done, you can talk to it via the ChatGPT-like web UI. Make sure again that your local uv virtual environment is active (run `source .venv/bin/activate`), and serve it: ```bash python -m scripts.chat_web @@ -75,7 +75,7 @@ The important thing to note is that nanochat is written and configured around on ## Running on CPU / MPS -The script [runs/runcpu.sh](runs/runcpu.sh) shows a very simple example of running on CPU or Apple Silicon. It dramatically shrinks the LLM tha tis being trained to make things fit into a reasonable time interval of a few ten minutes of training. You will not get strong results in this way. +The script [runs/runcpu.sh](runs/runcpu.sh) shows a very simple example of running on CPU or Apple Silicon. It dramatically shrinks the LLM that is being trained to make things fit into a reasonable time interval of a few tens of minutes of training. You will not get strong results in this way. 
## Guides diff --git a/dev/LEADERBOARD.md b/dev/LEADERBOARD.md index 3b61cc60..6c2720cb 100644 --- a/dev/LEADERBOARD.md +++ b/dev/LEADERBOARD.md @@ -29,12 +29,12 @@ Note that: - `depth` controls the size of the Transformer - `run` is the wandb name - `model-tag` is the location of the checkpoints on disk -- `device-batch-size` in the ideal world, you want this to be 32 because with sequence length of 2048 (the default) and 8 GPUs we get `32 X 2048 X 8 = 524,288`, which is the total desired batch size determined to work fairly well around this scale. However, for bigger (e.g. d26), 32 is too much and OOMs, so we decrease it by 2 to 16. The `base_train.py` script automatically compensates for this by calculating that it has to use gradient accumulation of 2 to meet the desired total batch size. Therefore, it will fo forward+backward twice and then a single step. Long story short, the ideal value is 32. If that doesn't fit, you decrease it, e.g. 16, 8, etc., keeping it powers of two so that the gradient accumulation math works out neatly. +- `device-batch-size` in the ideal world, you want this to be 32 because with sequence length of 2048 (the default) and 8 GPUs we get `32 X 2048 X 8 = 524,288`, which is the total desired batch size determined to work fairly well around this scale. However, for bigger (e.g. d26), 32 is too much and OOMs, so we decrease it by 2 to 16. The `base_train.py` script automatically compensates for this by calculating that it has to use gradient accumulation of 2 to meet the desired total batch size. Therefore, it will do forward+backward twice and then a single step. Long story short, the ideal value is 32. If that doesn't fit, you decrease it, e.g. 16, 8, etc., keeping it powers of two so that the gradient accumulation math works out neatly. 
- `sample-every = -1` turns off periodic sampling - `core-metric-max-per-task=-1` means we run the entire CORE eval - `core-metric-every=999999` a bit of a hacky way to make the CORE eval only happen a single time at the very end of the run - `target-param-data-ratio=8.25` controls the training horizon, which is determined in the script by taking the number of non-embedding model parameters and simply multiplying by this number. The current optimal Tokens:Params ratio can be seen in the defaults of the `base_train.py` script (it is 10.5). 10.5 would produce the *compute optimal* model given the currently measured scaling laws. However, GPT-2 capability is currently somewhere in between a d24 and d26. So to reach it exactly, we want to either overtrain d24 or undertrain d26. In this particular example, I am choosing to slightly undertrain a d26. Note that odd depths (e.g. d25) are not super recommended to use because the math around the transformer sizing and its head dimensions doesn't come out neatly. -- `--fp8` turns on fp8 training. If you GPU does not support fp8, you can leave this out and the code will simply train in bf16. bf16 is higher precision than fp8, so you can actually expect that you might be able to do fewer steps (lower the `target-param-data-ratio`) to achieve the same capability. +- `--fp8` turns on fp8 training. If your GPU does not support fp8, you can leave this out and the code will simply train in bf16. bf16 is higher precision than fp8, so you can actually expect that you might be able to do fewer steps (lower the `target-param-data-ratio`) to achieve the same capability. Once you kick off the run, you wait ~3 hours and then at the end you'll see something like: @@ -46,9 +46,9 @@ wandb: total_training_flops 4.330784131228946e+19 wandb: total_training_time 10949.46713 ``` -Your CORE metric must be greater than GPT-2 0.256525. Then you report the `total_training_time`, (e.g. 
10949) which is the time of the training iterations alone, excluding all the evaluations and logging, in seconds. So here for example here it is roughly 10949/60/60 ~= 3.04 hours. You should also note and report the validation bpb of your run because the CORE metric can be a little bit noisy. +Your CORE metric must be greater than GPT-2 0.256525. Then you report the `total_training_time`, (e.g. 10949) which is the time of the training iterations alone, excluding all the evaluations and logging, in seconds. So here for example it is roughly 10949/60/60 ~= 3.04 hours. You should also note and report the validation bpb of your run because the CORE metric can be a little bit noisy. -If you outperform GPT-2 and the time is less than current SOTA in the Leaderboard, you get to make a PR. In addition to raw gains, there are some qualitative and aesthetic considerations that go into whether your improvement is merged. For example, if it is gnarly or it significantly bloats the code, or it seems too esoteric, then we will way those things against the improvement demonstrated. Additionally, nanochat cares not only about targeting a single model, but an entire miniseries of models. So your change must be principled enough that it can easily generalize to other model depths, so that we can sweep out a miniseries. +If you outperform GPT-2 and the time is less than current SOTA in the Leaderboard, you get to make a PR. In addition to raw gains, there are some qualitative and aesthetic considerations that go into whether your improvement is merged. For example, if it is gnarly or it significantly bloats the code, or it seems too esoteric, then we will weigh those things against the improvement demonstrated. Additionally, nanochat cares not only about targeting a single model, but an entire miniseries of models. So your change must be principled enough that it can easily generalize to other model depths, so that we can sweep out a miniseries. 
After you create the commit, to get the current short git commit hash: From 98eed6df189e395056c34621043d082878df392f Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 18:14:09 +0000 Subject: [PATCH 108/119] bring back an assert guarding against bad param sizing --- nanochat/optim.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nanochat/optim.py b/nanochat/optim.py index 4cc2a1fe..42d862b4 100644 --- a/nanochat/optim.py +++ b/nanochat/optim.py @@ -377,6 +377,7 @@ class DistMuonAdamW(torch.optim.Optimizer): param_infos[p] = dict(future=future, grad_slice=grad, is_small=True) else: # Large params: reduce_scatter + assert grad.shape[0] % world_size == 0, f"AdamW reduce_scatter requires shape[0] ({grad.shape[0]}) divisible by world_size ({world_size})" rank_size = grad.shape[0] // world_size grad_slice = torch.empty_like(grad[:rank_size]) future = dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future() From f41dd3cbd76f82a46624e16c81c6131ecc4d205d Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 19:40:37 +0000 Subject: [PATCH 109/119] auto-calculate optimal batch size. the original setting of 0.5M was only optimal for d12, but d26 prefers 1M and so on --- dev/LOG.md | 46 ++++++++++ scripts/base_train.py | 201 +++++++++++++++++++++++------------------- 2 files changed, 156 insertions(+), 91 deletions(-) diff --git a/dev/LOG.md b/dev/LOG.md index 02561ac7..6ce41733 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -4,6 +4,52 @@ A running summary documenting some experiments and findings. Started ~Jan 7 2026 --- +## 2026-02-05: Auto Batch Size Scaling + +### Background + +So far, the `--total-batch-size` was hardcoded to be `2**19 = 524,288` ~= 0.5M tokens. This was the optimal setting for d12, but when I tried to re-tune it for d26 (GPT-2), I noticed that the optimal was closer to `2**20 = 1,048,576` ~= 1M tokens. This is to be expected - larger models prefer a higher optimal total batch size. 
However, we have to make sure that all settings of `--depth` get their own optimal batch size calculated in some principled way. Here, I referenced the "Power Lines" paper from Cerebras ([arXiv:2505.13738](https://arxiv.org/abs/2505.13738)) for a lot of related experimentation. In particular, they found that **Bopt ∝ D^0.383** (where D is the number of training tokens, not the number of parameters!). So the idea is to tune the optimal batch size on d12, and then extrapolate it with this power law to bigger models. The 0.383 exponent means batch size grows slowly: 10× more tokens only justifies ~2.4× bigger batch. For nanochat's compute-optimal training (D ∝ N via `--target-param-data-ratio`), this means deeper models naturally want larger batches. + +### Implementation + +Added `--total-batch-size=-1` (now the default) to auto-compute optimal batch: + +```python +get_scaling_params = lambda m: m.num_scaling_params()['transformer_matrices'] + m.num_scaling_params()['lm_head'] +if args.total_batch_size == -1: + D_REF = args.target_param_data_ratio * get_scaling_params(build_model_meta(12)) + B_REF = 2**19 + args.total_batch_size = 2 ** round(math.log2(B_REF * (target_tokens / D_REF) ** 0.383)) +``` + +Reference point: d=12 model with B=2^19 (empirically validated). The reference is computed dynamically so that if the architecture changes (e.g., different `--aspect-ratio`), the math automatically adjusts. However, if the model actually does change too much, one would also want to re-tune the optimal batch size for d=12. 
+ +### Results + +With this formula, we currently get: + +| Depth | Scaling Params | Target Tokens | Auto Batch | +|-------|---------------|---------------|------------| +| d=8 | 42M | 0.44B | 2^18 = 262K | +| d=10-16 | 70M-235M | 0.7B-2.5B | 2^19 = 524K | +| d=18-26 | 324M-918M | 3.4B-9.6B | 2^20 = 1.05M | +| d=32-50 | 1.7B-6.2B | 17.6B-65.6B | 2^21 = 2.1M | + +In particular, this matches empirical observations that d26 prefers ~2^20 while d12 prefers ~2^19. + +### Code Cleanup + +Also refactored model initialization to use `build_model_meta(depth)` helper and `dataclasses.asdict()` for cleaner config handling. + +### Useful references + +- [Bergsma et al., Power Laws for Batch Size, Model Size, and Training Horizon](https://arxiv.org/abs/2505.13738) +- [McCandlish et al., An Empirical Model of Large-Batch Training](https://arxiv.org/abs/1812.06162) +- [Brown et al., Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) +- [Merrill et al., The Batch Size–Critical Batch Size Myth](https://arxiv.org/abs/2505.23971) + +--- + ## 2026-02-05: SwiGLU Activation (Negative Result) Replaced ReLU² MLP activation with SwiGLU (inspired by [twitter](https://x.com/_xjdr/status/2019141521690567058)). SwiGLU uses three projections instead of two, so to match parameters and FLOPs we scale hidden_dim from 4× to 8/3×: diff --git a/scripts/base_train.py b/scripts/base_train.py index fa05b60a..97264c86 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -11,11 +11,14 @@ If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. 
Ex python -m scripts.base_train --depth=4 --max-seq-len=512 --device-batch-size=1 --eval-tokens=512 --core-metric-every=-1 --total-batch-size=512 --num-iterations=20 """ -import gc import os os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True" -import argparse +import gc +import json import time +import math +import argparse +from dataclasses import asdict from contextlib import nullcontext, contextmanager import wandb @@ -53,8 +56,8 @@ parser.add_argument("--num-iterations", type=int, default=-1, help="explicit num parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)") parser.add_argument("--target-param-data-ratio", type=float, default=10.5, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)") # Optimization -parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size") -parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens") +parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size. good number to reduce to 16,8,4,... if you OOM on VRAM.") +parser.add_argument("--total-batch-size", type=int, default=-1, help="total batch size in tokens. decent numbers are e.g. 524288. 
(-1 = auto-compute optimal)") parser.add_argument("--embedding-lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)") parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)") parser.add_argument("--weight-decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)") @@ -78,8 +81,8 @@ parser.add_argument("--model-tag", type=str, default=None, help="override model args = parser.parse_args() user_config = vars(args).copy() # for logging # ----------------------------------------------------------------------------- +# Compute init and wandb logging -# Compute init device_type = autodetect_device_type() if args.device_type == "" else args.device_type ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type) master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. @@ -109,65 +112,39 @@ else: print0("WARNING: Recommend using --window-pattern L for full context attention without alternating sliding window patterns.") print0("!" 
* 80) -# Tokenizer will be useful for evaluation, also we need the vocab size +# ----------------------------------------------------------------------------- +# Tokenizer will be useful for evaluation and also we need the vocab size to init the model tokenizer = get_tokenizer() token_bytes = get_token_bytes(device=device) vocab_size = tokenizer.get_vocab_size() print0(f"Vocab size: {vocab_size:,}") -# Model kwargs are derived from the desired depth of the model -# We nudge model_dim up to the nearest multiple of head_dim to ensure clean division -# (FA3 requires head_dim divisible by 8, and this guarantees head_dim == args.head_dim exactly) -# (For very small depths, this gives a slight "unfair" advantage to models with odd depths) -num_layers = args.depth -base_dim = args.depth * args.aspect_ratio -model_dim = ((base_dim + args.head_dim - 1) // args.head_dim) * args.head_dim -num_heads = model_dim // args.head_dim -num_kv_heads = num_heads # default is 1:1 GQA (Group Query Attention) ratio (i.e. 
GQA is disabled) -head_dim = model_dim // num_heads -print0(f"num_layers: {num_layers}") -print0(f"model_dim: {model_dim} (base: {base_dim}, nudge: {model_dim - base_dim:+d})") -print0(f"num_heads: {num_heads}") -print0(f"head_dim: {head_dim}") -print0(f"num_kv_heads: {num_kv_heads}") - -# Optimizer / data / training length related hyperparameters -# figure out the needed gradient accumulation to reach the desired total batch size -tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank -world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks -assert args.total_batch_size % world_tokens_per_fwdbwd == 0 -grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd -print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}") -print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") -print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") - -# Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19) -batch_lr_scale = 1.0 -reference_batch_size = 2**19 -batch_ratio = args.total_batch_size / reference_batch_size -if batch_ratio != 1.0: - # SGD: linear scaling with batch size is standard (not used in nanochat) - # AdamW: sqrt scaling is standard - # Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer - batch_lr_scale = batch_ratio ** 0.5 - print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})") - -# Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 (or equivalently, \propto 1/depth^2 due to constant aspect ratio) -weight_decay_scaled = args.weight_decay * (12 / args.depth)**2 -if args.depth != 12: - print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for 
depth {args.depth}") - # ----------------------------------------------------------------------------- # Initialize the Model -# Create a new model with random weights -model_config_kwargs = dict(sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim, window_pattern=args.window_pattern) -with torch.device("meta"): - # All tensors are created as meta tensors (they have shape/dtype but no data) - model_config = GPTConfig(**model_config_kwargs) - model = GPT(model_config) -model.to_empty(device=device) # All tensors get storage on target device but with uninitialized (garbage) data -model.init_weights() # All tensors get initialized +def build_model_meta(depth): + """Build a model on meta device for a given depth (shapes/dtypes only, no data).""" + # Model dim is nudged up to nearest multiple of head_dim for clean division + # (FA3 requires head_dim divisible by 8, and this guarantees head_dim == args.head_dim exactly) + base_dim = depth * args.aspect_ratio + model_dim = ((base_dim + args.head_dim - 1) // args.head_dim) * args.head_dim + num_heads = model_dim // args.head_dim + config = GPTConfig( + sequence_len=args.max_seq_len, vocab_size=vocab_size, + n_layer=depth, n_head=num_heads, n_kv_head=num_heads, n_embd=model_dim, + window_pattern=args.window_pattern, + ) + with torch.device("meta"): + model_meta = GPT(config) + return model_meta + +# Build the model, move to device, init the weights +model = build_model_meta(args.depth) # 1) Build on meta device (only shapes/dtypes, no data) +model_config = model.config +model_config_kwargs = asdict(model_config) +print0(f"Model config:\n{json.dumps(model_config_kwargs, indent=2)}") +model.to_empty(device=device) # 2) All tensors get storage on target device but with uninitialized (garbage) data +model.init_weights() # 3) All tensors get initialized # If we are resuming, overwrite the model parameters with those of the checkpoint base_dir = 
get_base_dir() @@ -181,41 +158,7 @@ if resuming: del model_data # free up this memory after the copy # ----------------------------------------------------------------------------- -# Determine the length of the training run based on model size - -# Detailed parameter counts -param_counts = model.num_scaling_params() -print0(f"Parameter counts:") -for key, value in param_counts.items(): - print0(f"{key:24s}: {value:,}") -num_params = param_counts['total'] -num_scaling_params = param_counts['transformer_matrices'] + param_counts['lm_head'] # determined to give the cleanest scaling laws, see dev/LOG.md Jan 27, 2026 -num_flops_per_token = model.estimate_flops() -print0(f"Estimated FLOPs per token: {num_flops_per_token:e}") - -# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order) -assert args.num_iterations > 0 or args.target_param_data_ratio > 0 or args.target_flops > 0 -if args.num_iterations > 0: - num_iterations = args.num_iterations - print0(f"Using user-provided number of iterations: {num_iterations:,}") -elif args.target_flops > 0: - # calculate the number of iterations from the target flops - num_iterations = round(args.target_flops / (num_flops_per_token * args.total_batch_size)) - print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}") -elif args.target_param_data_ratio > 0: - # calculate the number of iterations from the target param data ratio (use scaling params per Kaplan et al.) 
- target_tokens = int(args.target_param_data_ratio * num_scaling_params) - num_iterations = target_tokens // args.total_batch_size - print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}") -else: - raise ValueError("No training horizon specified") -total_tokens = args.total_batch_size * num_iterations -print0(f"Total number of training tokens: {total_tokens:,}") -print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 -print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") - -# ----------------------------------------------------------------------------- -# FP8 training initialization and management (has to be done before torch.compile) +# FP8 training initialization and management (this has to be done before torch.compile) # Convert Linear layers to Float8Linear if --fp8 is set if args.fp8: @@ -293,6 +236,82 @@ def disable_fp8(model): orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape) model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe +# ----------------------------------------------------------------------------- +# Determine the optimization horizon based on the model size +# The compute-optimal models satisfy the Tokens:Params ratio of --target-param-data-ratio (derived experimentally via scaling laws analysis). +# We've already initialized the model so we have Params. 
Optimal Tokens is now simply target-param-data-ratio * Params + +# Get the parameter counts of the model +param_counts = model.num_scaling_params() +print0(f"Parameter counts:") +for key, value in param_counts.items(): + print0(f"{key:24s}: {value:,}") +num_params = param_counts['total'] +num_flops_per_token = model.estimate_flops() +print0(f"Estimated FLOPs per token: {num_flops_per_token:e}") + +# Scaling params: transformer matrices + lm_head (gives cleanest scaling laws, see dev/LOG.md Jan 27, 2026) +get_scaling_params = lambda m: m.num_scaling_params()['transformer_matrices'] + m.num_scaling_params()['lm_head'] +num_scaling_params = get_scaling_params(model) +target_tokens = int(args.target_param_data_ratio * num_scaling_params) + +# Auto-compute optimal batch size based on Power Lines paper (Bopt ∝ D^0.383), ref: https://arxiv.org/abs/2505.13738 +if args.total_batch_size == -1: + d12_ref = build_model_meta(12) # d12 is where the optimal batch size was measured to be 2**19 tokens + d12_num_scaling_params = get_scaling_params(d12_ref) + D_REF = args.target_param_data_ratio * d12_num_scaling_params + B_REF = 2**19 + args.total_batch_size = 2 ** round(math.log2(B_REF * (target_tokens / D_REF) ** 0.383)) # also clamp to power of 2 + print0(f"Auto-computed optimal batch size: {args.total_batch_size:,} tokens") + +# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order) +assert args.num_iterations > 0 or args.target_param_data_ratio > 0 or args.target_flops > 0 +if args.num_iterations > 0: + # Override num_iterations to a specific value if given + num_iterations = args.num_iterations + print0(f"Using user-provided number of iterations: {num_iterations:,}") +elif args.target_flops > 0: + # Calculate the number of iterations from the target flops (used in scaling laws analysis, e.g. 
runs/scaling_laws.sh) + num_iterations = round(args.target_flops / (num_flops_per_token * args.total_batch_size)) + print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}") +elif args.target_param_data_ratio > 0: + # Calculate the number of iterations from the target param data ratio (the most common use case) + num_iterations = target_tokens // args.total_batch_size + print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}") +else: + raise ValueError("No training horizon specified") +total_tokens = args.total_batch_size * num_iterations +print0(f"Total number of training tokens: {total_tokens:,}") +print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 +print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") + +# ----------------------------------------------------------------------------- +# Optimizer / data / training length related hyperparameters +# figure out the needed gradient accumulation to reach the desired total batch size +tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank +world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks +assert args.total_batch_size % world_tokens_per_fwdbwd == 0 +grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd +print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}") +print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") +print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") + +# Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19) +batch_lr_scale = 1.0 +reference_batch_size = 2**19 +batch_ratio = args.total_batch_size / reference_batch_size +if batch_ratio != 1.0: + # SGD: linear scaling with batch size 
is standard (not used in nanochat) + # AdamW: sqrt scaling is standard + # Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer + batch_lr_scale = batch_ratio ** 0.5 + print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})") + +# Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 (or equivalently, \propto 1/depth^2 due to constant aspect ratio) +weight_decay_scaled = args.weight_decay * (12 / args.depth)**2 +if args.depth != 12: + print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for depth {args.depth}") + # ----------------------------------------------------------------------------- # Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) adam_betas = (args.adam_beta1, args.adam_beta2) From 2c062aaa949536c1cff2ffb3df2ae4aeba20dc4b Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 19:59:46 +0000 Subject: [PATCH 110/119] nit: don't mutate args, create new var for total_batch_size --- scripts/base_train.py | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index 97264c86..a3774e6d 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -256,13 +256,15 @@ num_scaling_params = get_scaling_params(model) target_tokens = int(args.target_param_data_ratio * num_scaling_params) # Auto-compute optimal batch size based on Power Lines paper (Bopt ∝ D^0.383), ref: https://arxiv.org/abs/2505.13738 -if args.total_batch_size == -1: +total_batch_size = args.total_batch_size +if total_batch_size == -1: d12_ref = build_model_meta(12) # d12 is where the optimal batch size was measured to be 2**19 tokens d12_num_scaling_params = get_scaling_params(d12_ref) D_REF = args.target_param_data_ratio * d12_num_scaling_params B_REF = 2**19 - args.total_batch_size = 2 
** round(math.log2(B_REF * (target_tokens / D_REF) ** 0.383)) # also clamp to power of 2 - print0(f"Auto-computed optimal batch size: {args.total_batch_size:,} tokens") + batch_size_ratio = target_tokens / D_REF + total_batch_size = 2 ** round(math.log2(B_REF * batch_size_ratio ** 0.383)) # also clamp to power of 2 + print0(f"Auto-computed optimal batch size: {total_batch_size:,} tokens") # Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order) assert args.num_iterations > 0 or args.target_param_data_ratio > 0 or args.target_flops > 0 @@ -272,17 +274,17 @@ if args.num_iterations > 0: print0(f"Using user-provided number of iterations: {num_iterations:,}") elif args.target_flops > 0: # Calculate the number of iterations from the target flops (used in scaling laws analysis, e.g. runs/scaling_laws.sh) - num_iterations = round(args.target_flops / (num_flops_per_token * args.total_batch_size)) + num_iterations = round(args.target_flops / (num_flops_per_token * total_batch_size)) print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}") elif args.target_param_data_ratio > 0: # Calculate the number of iterations from the target param data ratio (the most common use case) - num_iterations = target_tokens // args.total_batch_size + num_iterations = target_tokens // total_batch_size print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}") else: raise ValueError("No training horizon specified") -total_tokens = args.total_batch_size * num_iterations +total_tokens = total_batch_size * num_iterations print0(f"Total number of training tokens: {total_tokens:,}") -print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 +print0(f"Tokens : Scaling params ratio: {total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 print0(f"Total training FLOPs estimate: 
{num_flops_per_token * total_tokens:e}") # ----------------------------------------------------------------------------- @@ -290,22 +292,22 @@ print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") # figure out the needed gradient accumulation to reach the desired total batch size tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks -assert args.total_batch_size % world_tokens_per_fwdbwd == 0 -grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd +assert total_batch_size % world_tokens_per_fwdbwd == 0 +grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}") print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") -print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") +print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") # Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19) batch_lr_scale = 1.0 reference_batch_size = 2**19 -batch_ratio = args.total_batch_size / reference_batch_size +batch_ratio = total_batch_size / reference_batch_size if batch_ratio != 1.0: # SGD: linear scaling with batch size is standard (not used in nanochat) # AdamW: sqrt scaling is standard # Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer batch_lr_scale = batch_ratio ** 0.5 - print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})") + print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {total_batch_size:,} (reference: {reference_batch_size:,})") # Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 
(or equivalently, \propto 1/depth^2 due to constant aspect ratio) weight_decay_scaled = args.weight_decay * (12 / args.depth)**2 @@ -381,7 +383,7 @@ else: # Training loop while True: last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end - flops_so_far = num_flops_per_token * args.total_batch_size * step + flops_so_far = num_flops_per_token * total_batch_size * step # once in a while: evaluate the val bpb (all ranks participate) if args.eval_every > 0 and (last_step or step % args.eval_every == 0): @@ -501,8 +503,8 @@ while True: smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f # EMA the training loss debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA pct_done = 100 * step / num_iterations - tok_per_sec = int(args.total_batch_size / dt) - flops_per_sec = num_flops_per_token * args.total_batch_size / dt + tok_per_sec = int(total_batch_size / dt) + flops_per_sec = num_flops_per_token * total_batch_size / dt mfu = 100 * flops_per_sec / (gpu_peak_flops * ddp_world_size) if step > 10: total_training_time += dt # only count the time after the first 10 steps @@ -560,7 +562,7 @@ get_report().log(section="Base model training", data=[ "Number of FLOPs per token": f"{num_flops_per_token:e}", "Calculated number of iterations": num_iterations, "Number of training tokens": total_tokens, - "Tokens : Scaling params ratio": args.total_batch_size * num_iterations / num_scaling_params, + "Tokens : Scaling params ratio": total_batch_size * num_iterations / num_scaling_params, "DDP world size": ddp_world_size, "warmup_ratio": args.warmup_ratio, "warmdown_ratio": args.warmdown_ratio, From 5fdd5cdb246d2e82a1fcc05fd4c0468df824d875 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 20:11:32 +0000 Subject: [PATCH 111/119] new leaderboard record via new auto-calculated optimal batch size. 
for d26 it is 1M, up from 0.5M that was default earlier --- README.md | 1 + dev/LEADERBOARD.md | 32 +++++++++++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 49110901..182d2730 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,7 @@ For questions about the repo, I recommend either using [DeepWiki](https://deepwi | 0 | 168 hours | - | 0.2565 | Original OpenAI GPT-2 checkpoint | 2019 | - | OpenAI | | 1 | 3.04 | 0.74833 | 0.2585 | d24 baseline, slightly overtrained | Jan 29 2026 | 348fbb3 | @karpathy | | 2 | 2.91 | 0.74504 | 0.2578 | d26 slightly undertrained **+fp8** | Feb 2 2026 | a67eba3 | @karpathy | +| 3 | 2.76 | 0.74645 | 0.2602 | bump total batch size to 1M tokens | Feb 5 2026 | 2c062aa | @karpathy | The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. The GPT-2 CORE score is 0.256525. In 2019, the training of GPT-2 cost approximately $50,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so much faster and for well below $100 (e.g. at the current ~$3/GPU/hr, an 8XH100 node is ~$24/hr, so 3 hours is ~$72). diff --git a/dev/LEADERBOARD.md b/dev/LEADERBOARD.md index 6c2720cb..b8a727fb 100644 --- a/dev/LEADERBOARD.md +++ b/dev/LEADERBOARD.md @@ -89,7 +89,7 @@ Detailed writeup: [Beating GPT-2 for <<$100: the nanochat journey](https://githu ## Run 2 -Achieved Feb 2 2026 on commit `8309b83`. The launch command was +Achieved Feb 2 2026 on commit `a67eba3`. The launch command was ``` OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ @@ -117,3 +117,33 @@ Minimum validation bpb: 0.745036 The big change in this run is `--fp8`, which causes all Linear layers (other than the gates) to be switched to fp8 training using `torchao` with tensorwise fp8 scaling. 
Each step is of slightly lower quality, but we are taking them a lot faster, coming out net ahead. Anyone who does not have fp8 (e.g. using a GPU without it) can simply leave out the `--fp8` flag to train in bfloat16. This will work just fine but it will produce a slightly stronger model than GPT-2 because of the fp8 -> bf16 precision upgrade. It's possible that one can further tune which layers to include in the fp8 conversion and that e.g. some of the smaller matmuls should be just kept in bf16 etc. Previous record was 3.04 hours, so 2.91 hours is `(3.04 - 2.91)/3.04*100` ~= 4.3% speed improvement. + +## Run 3 + +Achieved Feb 5 2026 on commit `2c062aa`. Launch command: + +``` +OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \ + --depth=26 \ + --run="d26_feb4_double_batch_ratio8.25" \ + --model-tag="d26_feb4_double_batch_ratio8.25" \ + --device-batch-size=16 \ + --total-batch-size=1048576 \ + --sample-every=-1 \ + --save-every=-1 \ + --core-metric-max-per-task=-1 \ + --core-metric-every=999999 \ + --target-param-data-ratio=8.25 \ + --fp8 +``` + +Result: + +``` +core_metric 0.26024 +step 7226 +total_training_time 9922 +Minimum validation bpb: 0.74645 +``` + +The big change here is that the batch size was doubled from 0.5M to 1M, which works better for a d26 model and allowed me to decrease the number of optimization steps a bit via `--target-param-data-ratio` from 8.5 to 8.25. The TLDR is that the original batch size of 0.5M was tuned for d12, but bigger models (e.g. d26) prefer a larger total batch size. I determined in experiments that d26 prefers 1M. Then I implemented and merged a principled way to calculate the optimal batch size given depth so that all nanochat models of all depths benefit. See [dev/LOG.md](LOG.md) entry "2026-02-05: Auto Batch Size Scaling" for more detail. 
From 96522798f12007341176c19f57d98c4dd76b7a68 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 20:27:07 +0000 Subject: [PATCH 112/119] docs docs docs --- README.md | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 182d2730..1894ac87 100644 --- a/README.md +++ b/README.md @@ -3,16 +3,13 @@ ![nanochat logo](dev/nanochat.png) ![scaling laws](dev/scaling_laws_jan26.png) -nanochat is the simplest experimental harness for training LLMs. It is designed to run on a single GPU node, the code is minimal/hackable, and it covers all major LLM stages including tokenization, pretraining, finetuning, evaluation, inference, and a chat UI. For example, you can train your own GPT-2 capability LLM (which cost ~$50,000 to train in 2019) for only $73 (3 hours of 8XH100 GPU node) and then talk to it in a familiar ChatGPT-like web UI. +nanochat is the simplest experimental harness for training LLMs. It is designed to run on a single GPU node, the code is minimal/hackable, and it covers all major LLM stages including tokenization, pretraining, finetuning, evaluation, inference, and a chat UI. For example, you can train your own GPT-2 capability LLM (which cost ~$43,000 to train in 2019) for only $72 (~3 hours of 8XH100 GPU node) and then talk to it in a familiar ChatGPT-like web UI. On a spot instance, the total cost can be closer to ~$20. More generally, nanochat is configured out of the box to train an entire miniseries of compute-optimal models by setting one single complexity dial: `--depth`, the number of layers in the GPT transformer model (GPT-2 capability happens to be approximately depth 26). All other hyperparameters (the width of the transformer, number of heads, learning rate adjustments, training horizons, weight decays, ...) are calculated automatically in an optimal way. 
For questions about the repo, I recommend either using [DeepWiki](https://deepwiki.com/karpathy/nanochat) from Devin/Cognition to ask questions about the repo, or use the [Discussions tab](https://github.com/karpathy/nanochat/discussions), or come by the [#nanochat](https://discord.com/channels/1020383067459821711/1427295580895314031) channel on Discord. -## Updates +## Time-to-GPT-2 Leaderboard -- (Jan 31 2026) Major revamp of all scripts/README ongoing, deleting midtraining stage, might be a bit messy briefly... -- (Jan 30 2026) With all the latest improvements we're able to train GPT-2 grade LLM in about $73. The [runs/speedrun.sh](runs/speedrun.sh) script will become the reference way to train GPT-2 grade model and talk to it. - -## Leaderboard +Presently, the main focus of development is on tuning the pretraining stage, which takes the most compute. Inspired by the modded-nanogpt repo and to incentivise progress and community collaboration, nanochat maintains a leaderboard for a "GPT-2 speedrun", which is the wall-clock time required to train a nanochat model to GPT-2 grade capability, as measured by the DCLM CORE score. The [runs/speedrun.sh](runs/speedrun.sh) script always reflects the reference way to train a GPT-2 grade model and talk to it. The current leaderboard looks as follows: | # | time | val_bpb | CORE | Description | Date | Commit | Contributors | |---|-------------|---------|------|-------------|------|--------|--------------| @@ -21,7 +18,7 @@ For questions about the repo, I recommend either using [DeepWiki](https://deepwi | 2 | 2.91 | 0.74504 | 0.2578 | d26 slightly undertrained **+fp8** | Feb 2 2026 | a67eba3 | @karpathy | | 3 | 2.76 | 0.74645 | 0.2602 | bump total batch size to 1M tokens | Feb 5 2026 | 2c062aa | @karpathy | -The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. The GPT-2 CORE score is 0.256525.
In 2019, the training of GPT-2 cost approximately $50,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so much faster and for well below $100 (e.g. at the current ~$3/GPU/hr, an 8XH100 node is ~$24/hr, so 3 hours is ~$72). +The primary metric we care about is "time to GPT-2" - the wall clock time needed to outperform the GPT-2 (1.6B) CORE metric on an 8XH100 GPU node. The GPT-2 CORE score is 0.256525. In 2019, the training of GPT-2 cost approximately $43,000 so it is incredible that due to many advances over 7 years across the stack, we can now do so much faster and for well below $100 (e.g. at the current ~$3/GPU/hr, an 8XH100 node is ~$24/hr, so 3 hours is ~$72). See [dev/LEADERBOARD.md](dev/LEADERBOARD.md) for more docs on how to interpret and contribute to the leaderboard. @@ -29,7 +26,7 @@ See [dev/LEADERBOARD.md](dev/LEADERBOARD.md) for more docs on how to interpret a ### Reproduce and talk to GPT-2 -The most fun you can have is to train your own GPT-2 and talk to it. The entire pipeline to do so is contained in the single file [runs/speedrun.sh](runs/speedrun.sh), which is designed to be run on an 8XH100 GPU node. Currently, at ~$24/hour for these nodes, pretraining a GPT-2 grade model takes approximately 3 hours and will set you back about $75. Boot up a new 8XH100 GPU box from your favorite provider (e.g. I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: +The most fun you can have is to train your own GPT-2 and talk to it. The entire pipeline to do so is contained in the single file [runs/speedrun.sh](runs/speedrun.sh), which is designed to be run on an 8XH100 GPU node. Boot up a new 8XH100 GPU box from your favorite provider (e.g. 
I use and like [Lambda](https://lambda.ai/service/gpu-cloud)), and kick off the training script: ```bash bash runs/speedrun.sh @@ -70,7 +67,13 @@ OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train --save-every=-1 \ ``` -This uses wandb (run name "d12"), only runs the CORE metric on last step, and it doesn't sample and save intermediate checkpoints. I like to change something in the code, re-run a d12 (or a d16 etc) and see if it helped, in an iteration loop. +This uses wandb (run name "d12"), only runs the CORE metric on last step, and it doesn't sample and save intermediate checkpoints. I like to change something in the code, re-run a d12 (or a d16 etc) and see if it helped, in an iteration loop. To see if a run helps, I like to monitor the wandb plots for: + +1. `val_bpb` (validation loss in vocab-size-invariant units of bits per byte) as a function of `step`, `total_training_time` and `total_training_flops`. +2. `core_metric` (the DCLM CORE score) +3. VRAM utilization, `train/mfu` (Model FLOPS utilization), `train/tok_per_sec` (training throughput) + +See an example [here](https://github.com/karpathy/nanochat/pull/498#issuecomment-3850720044). The important thing to note is that nanochat is written and configured around one single dial of complexity - the depth of the transformer. This single integer automatically determines all other hyperparameters (the width of the transformer, number of heads, learning rate adjustments, training horizons, weight decays, ...) so that the trained model comes out compute optimal. The idea is that the user doesn't have to think about or set any of this, they are simply asking for a smaller or bigger model using `--depth`, and everything "just works". By sweeping out the depth, you achieve the nanochat miniseries of compute optimal models at various sizes. GPT-2 capability model (which is of most interest at the moment) happens to be somewhere around d24-d26 range with the current code.
But any candidate changes to the repo have to be principled enough that they work for all settings of depth. @@ -80,12 +83,13 @@ The script [runs/runcpu.sh](runs/runcpu.sh) shows a very simple example of runni ## Guides -I've published a number of guides that might contain helpful information: +I've published a number of guides that might contain helpful information, most recent to least recent: -- [Oct 13 2025 original nanochat post](https://github.com/karpathy/nanochat/discussions/1) introducing nanochat, though now it contains some deprecated information and the model is a lot older (with worse results) than current master. +- [Feb 1 2026: Beating GPT-2 for <<$100: the nanochat journey](https://github.com/karpathy/nanochat/discussions/481) - [Jan 7 miniseries v1](https://github.com/karpathy/nanochat/discussions/420) documents the first nanochat miniseries of models. -- To customize your nanochat, see [Guide: infusing identity to your nanochat](https://github.com/karpathy/nanochat/discussions/139) in Discussions, which describes how you can tune your nanochat's personality through synthetic data generation and mixing that data into the SFT stage. - To add new abilities to nanochat, see [Guide: counting r in strawberry (and how to add abilities generally)](https://github.com/karpathy/nanochat/discussions/164). +- To customize your nanochat, see [Guide: infusing identity to your nanochat](https://github.com/karpathy/nanochat/discussions/139) in Discussions, which describes how you can tune your nanochat's personality through synthetic data generation and mixing that data into the SFT stage. +- [Oct 13 2025: original nanochat post](https://github.com/karpathy/nanochat/discussions/1) introducing nanochat, though now it contains some deprecated information and the model is a lot older (with worse results) than current master. 
## File structure From e527521a3fd91e8f3a2016b10db21e5742aa41fe Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Thu, 5 Feb 2026 22:21:03 +0000 Subject: [PATCH 113/119] briefly mention batch ramp experimentation too, too weak to merge in my few attempts --- dev/LOG.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/dev/LOG.md b/dev/LOG.md index 6ce41733..dec2c067 100644 --- a/dev/LOG.md +++ b/dev/LOG.md @@ -48,6 +48,10 @@ Also refactored model initialization to use `build_model_meta(depth)` helper and - [Brown et al., Language Models are Few-Shot Learners](https://arxiv.org/abs/2005.14165) - [Merrill et al., The Batch Size–Critical Batch Size Myth](https://arxiv.org/abs/2505.23971) +### One more thing (batch size ramp) + +Tried batch size ramping. The simplest implementation I could think of "tricks" the existing training loop by slicing each micro-batch into smaller pieces and calling optimizer.step() more frequently early in training (1/8 → 1/4 → 1/2 → full batch over the first x% of training, with sqrt LR scaling). Also required a torch.compile warmup phase to pre-compile all slice sizes and avoid recompilation spikes during training. While the idea is sound and small gains were observed, they weren't sufficient to justify the code complexity introduced (conditional slicing logic, warmup with state save/restore, etc.). Not merged for now. + --- ## 2026-02-05: SwiGLU Activation (Negative Result) From 685271dc8db21afe02ecb9a9fa9785b50eac2421 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 6 Feb 2026 19:21:27 +0000 Subject: [PATCH 114/119] new optimal ratio for d26 training --- runs/speedrun.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runs/speedrun.sh b/runs/speedrun.sh index c423ba6e..62466c7b 100644 --- a/runs/speedrun.sh +++ b/runs/speedrun.sh @@ -70,7 +70,7 @@ echo "Waiting for dataset download to complete..." 
wait $DATASET_DOWNLOAD_PID # d24 model (slightly overtrained is enough to beat GPT-2 => increase data:params ratio from compute optimal 10.5 (default) to 12) -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.5 --device-batch-size=16 --fp8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --target-param-data-ratio=8.25 --device-batch-size=16 --fp8 --run=$WANDB_RUN # evaluate the model: CORE metric, BPB on train/val, and draw samples torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16 From aeff095e97a3721202e188b8348948536dc90a83 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 6 Feb 2026 19:22:28 +0000 Subject: [PATCH 115/119] better comments/flow on all the hyperparameter transfer stuff, and change the WD scaling from my empirical 1/d^2 to a bit more principled version based on Tepoch. All of that theory is based on AdamW and could be suboptimal for Muon --- scripts/base_train.py | 163 ++++++++++++++++++++++-------------------- 1 file changed, 87 insertions(+), 76 deletions(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index a3774e6d..ccf35e64 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -237,11 +237,9 @@ orig_model = model # original, uncompiled model, for saving raw model state_dict model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe # ----------------------------------------------------------------------------- -# Determine the optimization horizon based on the model size -# The compute-optimal models satisfy the Tokens:Params ratio of --target-param-data-ratio (derived experimentally via scaling laws analysis). -# We've already initialized the model so we have Params. 
Optimal Tokens is now simply target-param-data-ratio * Params +# Scaling laws and muP extrapolations to determine the optimal training horizon, batch size, learning rates, weight decay. -# Get the parameter counts of the model +# Get the parameter counts of our model param_counts = model.num_scaling_params() print0(f"Parameter counts:") for key, value in param_counts.items(): @@ -250,23 +248,80 @@ num_params = param_counts['total'] num_flops_per_token = model.estimate_flops() print0(f"Estimated FLOPs per token: {num_flops_per_token:e}") -# Scaling params: transformer matrices + lm_head (gives cleanest scaling laws, see dev/LOG.md Jan 27, 2026) -get_scaling_params = lambda m: m.num_scaling_params()['transformer_matrices'] + m.num_scaling_params()['lm_head'] +# 1) Use scaling laws to determine the optimal training horizon in tokens +# The compute-optimal models satisfy the Tokens:Params ratio of --target-param-data-ratio (derived experimentally via scaling laws analysis). +# We've already initialized the model so we have Params. 
Optimal Tokens is now simply target-param-data-ratio * Params +def get_scaling_params(m): + # As for which params to use exactly, transformer matrices + lm_head gives cleanest scaling laws (see dev/LOG.md Jan 27, 2026) + params_counts = m.num_scaling_params() + scaling_params = params_counts['transformer_matrices'] + params_counts['lm_head'] + return scaling_params num_scaling_params = get_scaling_params(model) -target_tokens = int(args.target_param_data_ratio * num_scaling_params) +target_tokens = int(args.target_param_data_ratio * num_scaling_params) # optimal tokens for the model we are about to train -# Auto-compute optimal batch size based on Power Lines paper (Bopt ∝ D^0.383), ref: https://arxiv.org/abs/2505.13738 -total_batch_size = args.total_batch_size +# Our reference model is d12, this is where a lot of hyperparameters are tuned and then transferred to higher depths (muP style) +d12_ref = build_model_meta(12) # creates the model on meta device +D_REF = args.target_param_data_ratio * get_scaling_params(d12_ref) # compute-optimal d12 training horizon in tokens (measured empirically) +B_REF = 2**19 # optimal batch size at d12 ~= 524,288 tokens (measured empirically) + +# 2) Now that we have the token horizon, we can calculate the optimal batch size +# We follow the Power Lines paper (Bopt ∝ D^0.383), ref: https://arxiv.org/abs/2505.13738 +# The optimal batch size grows as approximately D^0.383, so e.g. if D doubles from d12 to d24, B should grow by 2^0.383 ≈ 1.3x.
+total_batch_size = args.total_batch_size # user-provided override is possible if total_batch_size == -1: - d12_ref = build_model_meta(12) # d12 is where the optimal batch size was measured to be 2**19 tokens - d12_num_scaling_params = get_scaling_params(d12_ref) - D_REF = args.target_param_data_ratio * d12_num_scaling_params - B_REF = 2**19 batch_size_ratio = target_tokens / D_REF - total_batch_size = 2 ** round(math.log2(B_REF * batch_size_ratio ** 0.383)) # also clamp to power of 2 + predicted_batch_size = B_REF * batch_size_ratio ** 0.383 + total_batch_size = 2 ** round(math.log2(predicted_batch_size)) # clamp to nearest power of 2 for efficiency print0(f"Auto-computed optimal batch size: {total_batch_size:,} tokens") -# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order) +# 3) Knowing the batch size, we can now calculate a learning rate correction (bigger batch size allows higher learning rates) +batch_lr_scale = 1.0 +batch_ratio = total_batch_size / B_REF # B/B_ref +if batch_ratio != 1.0: + # SGD: linear scaling with batch size is standard (not used in nanochat) + # AdamW: sqrt scaling is standard: η ∝ √(B/B_ref) + # Muon: we will use the same scaling for Muon as for AdamW: η ∝ √(B/B_ref) (not studied carefully, assumption!) + batch_lr_scale = batch_ratio ** 0.5 # η ∝ √(B/B_ref) + print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {total_batch_size:,} (reference: {B_REF:,})") + +# 4) Knowing the batch size and the token horizon, we can now calculate the appropriate weight decay scaling +# We adopt the T_epoch framework from https://arxiv.org/abs/2405.13698 +# Central idea of the paper is that T_epoch = B/(η·λ·D) should remain constant. +# Above, we used learning rate scaling η ∝ √(B/B_ref). So it's a matter of ~10 lines of math to derive that to keep T_epoch constant, we need: +# λ = λ_ref · √(B/B_ref) · (D_ref/D) +# Note that these papers study AdamW, *not* Muon. 
We are blindly following AdamW theory for scaling hoping it ~works for Muon too. +weight_decay_scaled = args.weight_decay * math.sqrt(total_batch_size / B_REF) * (D_REF / target_tokens) +if weight_decay_scaled != args.weight_decay: + print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for depth {args.depth}") + +# ----------------------------------------------------------------------------- +# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) +optimizer = model.setup_optimizer( + # AdamW hyperparameters + unembedding_lr=args.unembedding_lr * batch_lr_scale, + embedding_lr=args.embedding_lr * batch_lr_scale, + scalar_lr=args.scalar_lr * batch_lr_scale, + adam_betas=(args.adam_beta1, args.adam_beta2), + # Muon hyperparameters + matrix_lr=args.matrix_lr * batch_lr_scale, + weight_decay=weight_decay_scaled, +) + +if resuming: + optimizer.load_state_dict(optimizer_data) + del optimizer_data + +# ----------------------------------------------------------------------------- +# Initialize the DataLoaders for train/val +dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"] +train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict) +build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device) +x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data + +# ----------------------------------------------------------------------------- +# Calculate the number of iterations we will train for and set up the various schedulers + +# num_iterations: either it is given, or from target flops, or from target data:param ratio (in that order) assert args.num_iterations > 0 or args.target_param_data_ratio > 
0 or args.target_flops > 0 if args.num_iterations > 0: # Override num_iterations to a specific value if given @@ -282,65 +337,12 @@ elif args.target_param_data_ratio > 0: print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}") else: raise ValueError("No training horizon specified") -total_tokens = total_batch_size * num_iterations +total_tokens = total_batch_size * num_iterations # the actual number of tokens we will train for print0(f"Total number of training tokens: {total_tokens:,}") -print0(f"Tokens : Scaling params ratio: {total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20 +print0(f"Tokens : Scaling params ratio: {total_batch_size * num_iterations / num_scaling_params:.2f}") # e.g. Chinchilla was ~20 print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}") -# ----------------------------------------------------------------------------- -# Optimizer / data / training length related hyperparameters -# figure out the needed gradient accumulation to reach the desired total batch size -tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank -world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks -assert total_batch_size % world_tokens_per_fwdbwd == 0 -grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd -print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}") -print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") -print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") - -# Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19) -batch_lr_scale = 1.0 -reference_batch_size = 2**19 -batch_ratio = total_batch_size / reference_batch_size -if batch_ratio != 1.0: - # SGD: linear scaling with batch size is standard (not used in 
nanochat) - # AdamW: sqrt scaling is standard - # Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer - batch_lr_scale = batch_ratio ** 0.5 - print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {total_batch_size:,} (reference: {reference_batch_size:,})") - -# Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 (or equivalently, \propto 1/depth^2 due to constant aspect ratio) -weight_decay_scaled = args.weight_decay * (12 / args.depth)**2 -if args.depth != 12: - print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for depth {args.depth}") - -# ----------------------------------------------------------------------------- -# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest) -adam_betas = (args.adam_beta1, args.adam_beta2) -optimizer = model.setup_optimizer( - unembedding_lr=args.unembedding_lr * batch_lr_scale, - embedding_lr=args.embedding_lr * batch_lr_scale, - matrix_lr=args.matrix_lr * batch_lr_scale, - weight_decay=weight_decay_scaled, - adam_betas=adam_betas, - scalar_lr=args.scalar_lr * batch_lr_scale, -) - -if resuming: - optimizer.load_state_dict(optimizer_data) - del optimizer_data - -# ----------------------------------------------------------------------------- -# Initialize the DataLoaders for train/val -dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"] -train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict) -build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device) -x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data - -# 
----------------------------------------------------------------------------- -# Set up hyperparameter schedulers - -# Learning rate scheduler +# Learning rate schedule (linear warmup, constant, linear warmdown) def get_lr_multiplier(it): warmup_iters = round(args.warmup_ratio * num_iterations) warmdown_iters = round(args.warmdown_ratio * num_iterations) @@ -352,19 +354,20 @@ def get_lr_multiplier(it): progress = (num_iterations - it) / warmdown_iters return progress * 1.0 + (1 - progress) * args.final_lr_frac -# Momentum scheduler for Muon optimizer +# Momentum scheduler for Muon optimizer (warms up to 0.95 over the first 300 steps) def get_muon_momentum(it): frac = min(it / 300, 1) momentum = (1 - frac) * 0.85 + frac * 0.95 return momentum -# Weight decay scheduler for Muon optimizer (linear to zero over the course of training) +# Weight decay scheduler for Muon optimizer (linearly decays to zero over the course of training) def get_weight_decay(it): return weight_decay_scaled * (1 - it / num_iterations) # ----------------------------------------------------------------------------- -# Loop state (variables updated by the training loop) +# Training loop +# Loop state (variables updated by the training loop) if not resuming: step = 0 val_bpb = None # will be set if eval_every > 0 @@ -379,8 +382,16 @@ else: smooth_train_loss = loop_state["smooth_train_loss"] total_training_time = loop_state["total_training_time"] -# ----------------------------------------------------------------------------- -# Training loop +# Figure out the needed gradient accumulation micro-steps to reach the desired total batch size per step +tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank +world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks +assert total_batch_size % world_tokens_per_fwdbwd == 0 +grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd +print0(f"Tokens / 
micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}") +print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}") +print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}") + +# Go! while True: last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end flops_so_far = num_flops_per_token * total_batch_size * step From ff46300720e9ac5a5cd5d90f0d8cd3ccc20a76e2 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 8 Feb 2026 17:54:12 +0000 Subject: [PATCH 116/119] tune miniseries just a bit, fairly cosmetic, keep to even depths where the math works out nicely in model sizing --- runs/miniseries.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/runs/miniseries.sh b/runs/miniseries.sh index c42544e3..e57ee162 100644 --- a/runs/miniseries.sh +++ b/runs/miniseries.sh @@ -28,7 +28,7 @@ fi # Series name: from arg, env var, or default to today's date (e.g., jan11) SERIES_NAME="${1:-${SERIES_NAME:-$(date +%b%d | tr '[:upper:]' '[:lower:]')}}" # Depths to train (the "miniseries") -DEPTHS=(10 11 12 13 14 15 16 17 18 19 20) +DEPTHS=(12 14 16 18 20 22 24 26) # Hardware NPROC_PER_NODE="${NPROC_PER_NODE:-8}" # Logging @@ -57,8 +57,13 @@ for d in "${DEPTHS[@]}"; do TAG="${SERIES_NAME}_miniseries_d${d}" START_TIME=$(date +%s) - # Train the model with natural horizon (target_param_data_ratio default) - # No --target-flops, let it use the default ratio from base_train + # For depths >= 22, use smaller device batch size to avoid OOM + if [ $d -ge 22 ]; then + DEVICE_BATCH_SIZE_ARG="--device-batch-size=16" + else + DEVICE_BATCH_SIZE_ARG="--device-batch-size=32" + fi + torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \ --depth=$d \ --run="${WANDB_RUN}_d${d}" \ @@ -67,6 +72,7 @@ for d in "${DEPTHS[@]}"; do --core-metric-max-per-task=-1 \ --sample-every=-1 \ --save-every=-1 \ + 
$DEVICE_BATCH_SIZE_ARG \ 2>&1 | tee "$RESULTS_DIR/${TAG}_train.log" END_TIME=$(date +%s) From 1ec0a347792f337bd38a93b15b79927466d0540a Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Sun, 8 Feb 2026 18:26:34 +0000 Subject: [PATCH 117/119] at 28 and above we start to need batch size 8 --- runs/miniseries.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/runs/miniseries.sh b/runs/miniseries.sh index e57ee162..01c4459d 100644 --- a/runs/miniseries.sh +++ b/runs/miniseries.sh @@ -57,8 +57,10 @@ for d in "${DEPTHS[@]}"; do TAG="${SERIES_NAME}_miniseries_d${d}" START_TIME=$(date +%s) - # For depths >= 22, use smaller device batch size to avoid OOM - if [ $d -ge 22 ]; then + # Reduce --device-batch-size to avoid OOM at larger depths + if [ $d -ge 28 ]; then + DEVICE_BATCH_SIZE_ARG="--device-batch-size=8" + elif [ $d -ge 20 ]; then DEVICE_BATCH_SIZE_ARG="--device-batch-size=16" else DEVICE_BATCH_SIZE_ARG="--device-batch-size=32" From e569b59f92aea06bf8fc1c48489b3cc2e57189f4 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 10 Feb 2026 18:46:39 +0000 Subject: [PATCH 118/119] delete torchao dependency, create our own exact API-matched version of Float8Linear, document it very well. for some poorly understood reason, the performance is not only ~identical but actually runs 3% faster. despite of it being significantly simpler and much less code. i don't fully understand why/how atm --- nanochat/fp8.py | 272 ++++++++++++++++++++++++++++++++++++++++++ pyproject.toml | 1 - scripts/base_train.py | 4 +- uv.lock | 11 -- 4 files changed, 275 insertions(+), 13 deletions(-) create mode 100644 nanochat/fp8.py diff --git a/nanochat/fp8.py b/nanochat/fp8.py new file mode 100644 index 00000000..9d9e9c34 --- /dev/null +++ b/nanochat/fp8.py @@ -0,0 +1,272 @@ +"""Minimal FP8 training for nanochat — tensorwise dynamic scaling only. + +Drop-in replacement for torchao's Float8Linear (~2000 lines) with ~150 lines. 
+We only need the "tensorwise" recipe (one scalar scale per tensor), not the full +generality of torchao (rowwise scaling, FSDP float8 all-gather, DTensor, tensor +subclass dispatch tables, etc.) + +How FP8 training works +====================== +A standard Linear layer does one matmul in forward and two in backward: + forward: output = input @ weight.T + backward: grad_input = grad_output @ weight + grad_weight= grad_output.T @ input + +FP8 training wraps each of these three matmuls with: + 1. Compute scale = FP8_MAX / max(|tensor|) for each operand + 2. Quantize: fp8_tensor = clamp(tensor * scale, -FP8_MAX, FP8_MAX).to(fp8) + 3. Matmul via torch._scaled_mm (cuBLAS FP8 kernel, ~2x faster than bf16) + 4. Dequantize: _scaled_mm handles this internally using the inverse scales + +The key insight: torch._scaled_mm and the float8 dtypes are PyTorch built-ins. +torchao is just orchestration around these primitives. We can call them directly. + +FP8 dtype choice +================ +There are two FP8 formats. We use both, following the standard convention: + - float8_e4m3fn: 4-bit exponent, 3-bit mantissa, range [-448, 448] + Higher precision (more mantissa bits), used for input and weight. + - float8_e5m2: 5-bit exponent, 2-bit mantissa, range [-57344, 57344] + Wider range (more exponent bits), used for gradients which can be large. + +torch._scaled_mm layout requirements +===================================== +The cuBLAS FP8 kernel requires specific memory layouts: + - First argument (A): must be row-major (contiguous) + - Second argument (B): must be column-major (B.t().contiguous().t()) +If B is obtained by transposing a contiguous tensor (e.g. weight.t()), it is +already column-major — no copy needed. Otherwise we use _to_col_major(). + +How this differs from torchao's approach +======================================== +torchao uses a "tensor subclass" architecture: Float8TrainingTensor is a subclass +of torch.Tensor that bundles FP8 data + scale + metadata. 
It implements +__torch_dispatch__ with a dispatch table that intercepts every aten op (mm, t, +reshape, clone, ...) and handles it in FP8-aware fashion. When you call + output = input @ weight.T +the @ operator dispatches to aten.mm, which gets intercepted and routed to +torch._scaled_mm behind the scenes. This is ~2000 lines of code because you need +a handler for every tensor operation that might touch an FP8 tensor. + +We take a simpler approach: a single autograd.Function (_Float8Matmul) that takes +full-precision inputs, quantizes to FP8 internally, calls _scaled_mm, and returns +full-precision outputs. Marked @allow_in_graph so torch.compile treats it as one +opaque node rather than trying to trace inside. + +The trade-off is in how torch.compile sees the two approaches: + - torchao: compile decomposes the tensor subclass (via __tensor_flatten__) and + sees every individual op (amax, scale, cast, _scaled_mm) as separate graph + nodes. Inductor can fuse these with surrounding operations (e.g. fuse the + amax computation with the preceding layer's activation function). + - ours: compile sees a single opaque call. It can optimize everything around + the FP8 linear (attention, norms, etc.) but cannot fuse across the boundary. + +Both call the exact same cuBLAS _scaled_mm kernel — the GPU matmul is identical. +The difference is only in the "glue" ops (amax, scale, cast) which are tiny +compared to the matmul. In practice this means our version is slightly faster +(less compilation overhead, no tensor subclass dispatch cost) but can produce +subtly different floating-point rounding paths under torch.compile, since Inductor +generates a different graph. Numerics are bitwise identical in eager mode. +""" + +import torch +import torch.nn as nn + +# Avoid division by zero when computing scale from an all-zeros tensor +EPS = 1e-12 + + +@torch.no_grad() +def _to_fp8(x, fp8_dtype): + """Dynamically quantize a tensor to FP8 using tensorwise scaling. 
+ + "Tensorwise" means one scalar scale for the entire tensor (as opposed to + "rowwise" which computes a separate scale per row). Tensorwise is faster + because cuBLAS handles the scaling; rowwise needs the CUTLASS kernel. + + Returns (fp8_data, inverse_scale) for use with torch._scaled_mm. + """ + fp8_max = torch.finfo(fp8_dtype).max + # Compute the max absolute value across the entire tensor + amax = x.float().abs().max() + # Scale maps [0, amax] -> [0, fp8_max]. Use float64 for the division to + # ensure consistent numerics between torch.compile and eager mode. + # (torchao does the same upcast — without it, compile/eager can diverge) + scale = fp8_max / amax.double().clamp(min=EPS) + scale = scale.float() + # Quantize: scale into FP8 range, saturate (clamp prevents overflow when + # casting — PyTorch's default cast does not saturate), then cast to FP8 + x_scaled = x.float() * scale + x_clamped = x_scaled.clamp(-fp8_max, fp8_max) + x_fp8 = x_clamped.to(fp8_dtype) + # _scaled_mm expects the *inverse* of our scale (it multiplies by this to + # convert FP8 values back to the original range during the matmul) + inv_scale = scale.reciprocal() + return x_fp8, inv_scale + + +def _to_col_major(x): + """Rearrange a 2D tensor's memory to column-major layout. + + torch._scaled_mm requires its second operand in column-major layout. + The trick: transpose -> contiguous (forces a copy in transposed order) + -> transpose back. The result has the same logical shape but column-major + strides, e.g. a [M, N] tensor gets strides (1, M) instead of (N, 1). + """ + return x.t().contiguous().t() + + +# allow_in_graph tells torch.compile to treat this as an opaque operation — +# dynamo won't try to decompose it into smaller ops. See the module docstring +# for how this differs from torchao's tensor subclass approach. +@torch._dynamo.allow_in_graph +class _Float8Matmul(torch.autograd.Function): + """Custom autograd for the three FP8 GEMMs of a Linear layer.
+ + The forward saves input and weight in their original precision for the + backward pass. Each GEMM independently re-quantizes its operands to FP8. + (We don't reuse the forward's FP8 tensors in backward — the backward might + want different precision, and saving FP8 would lose information.) + """ + + @staticmethod + def forward(ctx, input_2d, weight): + ctx.save_for_backward(input_2d, weight) + + # Quantize both operands to e4m3 (higher precision format) + input_fp8, input_inv = _to_fp8(input_2d, torch.float8_e4m3fn) + weight_fp8, weight_inv = _to_fp8(weight, torch.float8_e4m3fn) + + # output = input @ weight.T + # input_fp8 is [B, K] contiguous = row-major (good for first arg) + # weight_fp8 is [N, K] contiguous, so weight_fp8.t() is [K, N] with + # strides (1, K) = column-major (good for second arg, no copy needed!) + output = torch._scaled_mm( + input_fp8, + weight_fp8.t(), + scale_a=input_inv, + scale_b=weight_inv, + out_dtype=input_2d.dtype, + # use_fast_accum=True accumulates the dot products in lower precision. + # Slightly less accurate but measurably faster. Standard practice for + # the forward pass; we use False in backward for more precise gradients. 
+ use_fast_accum=True, + ) + return output + + @staticmethod + def backward(ctx, grad_output): + input_2d, weight = ctx.saved_tensors + + # === GEMM 1: grad_input = grad_output @ weight === + # Shapes: [B, N] @ [N, K] -> [B, K] + # Gradients use e5m2 (wider range), weights use e4m3 (higher precision) + go_fp8, go_inv = _to_fp8(grad_output, torch.float8_e5m2) + w_fp8, w_inv = _to_fp8(weight, torch.float8_e4m3fn) + # go_fp8 is [B, N] contiguous = row-major, good for first arg + # w_fp8 is [N, K] contiguous = row-major, need column-major for second arg + w_col = _to_col_major(w_fp8) + grad_input = torch._scaled_mm( + go_fp8, + w_col, + scale_a=go_inv, + scale_b=w_inv, + out_dtype=grad_output.dtype, + use_fast_accum=False, + ) + + # === GEMM 2: grad_weight = grad_output.T @ input === + # Shapes: [N, B] @ [B, K] -> [N, K] + go_fp8_2, go_inv_2 = _to_fp8(grad_output, torch.float8_e5m2) + in_fp8, in_inv = _to_fp8(input_2d, torch.float8_e4m3fn) + # go_fp8_2 is [B, N] contiguous, we need go.T = [N, B] as first arg. + # Transposing gives column-major, but first arg needs row-major, + # so we must call .contiguous() to physically rearrange the memory. + go_T = go_fp8_2.t().contiguous() # [N, B] row-major + in_col = _to_col_major(in_fp8) # [B, K] column-major + grad_weight = torch._scaled_mm( + go_T, + in_col, + scale_a=go_inv_2, + scale_b=in_inv, + out_dtype=grad_output.dtype, + use_fast_accum=False, + ) + + return grad_input, grad_weight + + +class Float8Linear(nn.Linear): + """Drop-in nn.Linear replacement that does FP8 compute. + + Weights and biases remain in their original precision (e.g. fp32/bf16). + Only the matmul is performed in FP8 via the _Float8Matmul autograd function. + """ + + def forward(self, input): + # Replicate the autocast behavior of F.linear — when autocast is active, + # we need to manually cast input to the autocast dtype (e.g. bf16), + # since we bypass F.linear's built-in autocast handling. 
+ if torch.is_autocast_enabled(): + input = input.to(torch.get_autocast_gpu_dtype()) + # _scaled_mm only works on 2D tensors, so flatten batch dimensions + orig_shape = input.shape + input_2d = input.reshape(-1, orig_shape[-1]) + output = _Float8Matmul.apply(input_2d, self.weight) + output = output.reshape(*orig_shape[:-1], output.shape[-1]) + if self.bias is not None: + output = output + self.bias.to(output.dtype) + return output + + @classmethod + def from_float(cls, mod): + """Create Float8Linear from nn.Linear, sharing the same weight and bias. + + Uses meta device to avoid allocating a temporary weight tensor — we + create the module shell on meta (shapes/dtypes only, no memory), then + point .weight and .bias to the original module's parameters. + """ + with torch.device("meta"): + new_mod = cls(mod.in_features, mod.out_features, bias=False) + new_mod.weight = mod.weight + new_mod.bias = mod.bias + return new_mod + + +class Float8LinearConfig: + """Minimal config matching torchao's API. Only tensorwise recipe is supported.""" + + @staticmethod + def from_recipe_name(recipe_name): + if recipe_name != "tensorwise": + raise ValueError( + f"Only 'tensorwise' recipe is supported, got '{recipe_name}'. " + f"Rowwise/axiswise recipes require the full torchao library." + ) + return Float8LinearConfig() + + +def convert_to_float8_training(module, *, config=None, module_filter_fn=None): + """Replace nn.Linear layers with Float8Linear throughout a module. + + Walks the module tree in post-order (children before parents) and swaps + each nn.Linear that passes the optional filter. The new Float8Linear shares + the original weight and bias tensors — no copies, no extra memory. + + Args: + module: Root module to convert. + config: Float8LinearConfig (accepted for API compat, only tensorwise supported). + module_filter_fn: Optional filter(module, fqn) -> bool. Only matching Linears + are converted. 
Common use: skip layers with dims not divisible by 16 + (hardware requirement for FP8 matmuls on H100). + """ + def _convert(mod, prefix=""): + for name, child in mod.named_children(): + fqn = f"{prefix}.{name}" if prefix else name + _convert(child, fqn) + if isinstance(child, nn.Linear) and not isinstance(child, Float8Linear): + if module_filter_fn is None or module_filter_fn(child, fqn): + setattr(mod, name, Float8Linear.from_float(child)) + + _convert(module) + return module diff --git a/pyproject.toml b/pyproject.toml index bcb674d8..8b6fd954 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,7 +20,6 @@ dependencies = [ "tiktoken>=0.11.0", "tokenizers>=0.22.0", "torch==2.9.1", - "torchao==0.15.0", "transformers>=4.57.3", "uvicorn>=0.36.0", "wandb>=0.21.3", diff --git a/scripts/base_train.py b/scripts/base_train.py index ccf35e64..ee530980 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -165,7 +165,9 @@ if args.fp8: if device_type != "cuda": print0("Warning: FP8 training requires CUDA, ignoring --fp8 flag") else: - from torchao.float8 import Float8LinearConfig, convert_to_float8_training + # our custom fp8 is simpler than torchao, written for exact API compatibility + from nanochat.fp8 import Float8LinearConfig, convert_to_float8_training + # from torchao.float8 import Float8LinearConfig, convert_to_float8_training import torch.nn as nn # Filter: only convert layers with dimensions divisible by 16 (FP8 hardware requirement) diff --git a/uv.lock b/uv.lock index e5fc97f6..bbc9519f 100644 --- a/uv.lock +++ b/uv.lock @@ -1509,7 +1509,6 @@ dependencies = [ { name = "torch", version = "2.9.1", source = { registry = "https://pypi.org/simple" }, marker = "(extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu') or (extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu')" }, { name = "torch", version = "2.9.1+cpu", source = { registry = "https://download.pytorch.org/whl/cpu" }, marker = "(sys_platform != 'darwin' and extra 
== 'extra-8-nanochat-cpu') or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" }, { name = "torch", version = "2.9.1+cu128", source = { registry = "https://download.pytorch.org/whl/cu128" }, marker = "extra == 'extra-8-nanochat-gpu'" }, - { name = "torchao" }, { name = "transformers" }, { name = "uvicorn" }, { name = "wandb" }, @@ -1549,7 +1548,6 @@ requires-dist = [ { name = "torch", specifier = "==2.9.1" }, { name = "torch", marker = "extra == 'cpu'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cpu", conflict = { package = "nanochat", extra = "cpu" } }, { name = "torch", marker = "extra == 'gpu'", specifier = "==2.9.1", index = "https://download.pytorch.org/whl/cu128", conflict = { package = "nanochat", extra = "gpu" } }, - { name = "torchao", specifier = "==0.15.0" }, { name = "transformers", specifier = ">=4.57.3" }, { name = "uvicorn", specifier = ">=0.36.0" }, { name = "wandb", specifier = ">=0.21.3" }, @@ -3184,15 +3182,6 @@ wheels = [ { url = "https://download.pytorch.org/whl/cu128/torch-2.9.1%2Bcu128-cp314-cp314t-win_amd64.whl", hash = "sha256:0c784b600959ec70ee01cb23e8bc870a0e0475af30378ff5e39f4abed8b7c1cc" }, ] -[[package]] -name = "torchao" -version = "0.15.0" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/57/2d/472b9362dceae05a4599e2b94f86e69a29c0e20964a6af84f34f6ead5938/torchao-0.15.0-cp310-abi3-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:1cbe813201314ba6329a650a76944502f3e8ec4b1b44523f3f48676810d8d1f6", size = 7163930, upload-time = "2025-12-18T23:14:41.876Z" }, - { url = "https://files.pythonhosted.org/packages/f6/3b/6b9d5618720f63dbc2e2509cd6b57aae9c0d61b738d1d2172f4d5d9efaab/torchao-0.15.0-py3-none-any.whl", hash = "sha256:3f3812676048ef8a2a0e9d492d12d8971ba7a7ebb16f54aa56f690414e130d2c", size = 1080679, upload-time = "2025-12-18T23:14:43.807Z" }, -] - [[package]] name = "tornado" version = "6.5.4" From 
2f096867244e3d00a50284d1be05fa3f5dcfb84b Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Tue, 10 Feb 2026 23:35:00 +0000 Subject: [PATCH 119/119] clarify that this is bf16 mfu we're talking about --- scripts/base_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/base_train.py b/scripts/base_train.py index ee530980..996b2ba2 100644 --- a/scripts/base_train.py +++ b/scripts/base_train.py @@ -531,7 +531,7 @@ while True: else: eta_str = "" epoch = dataloader_state_dict["epoch"] - print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}") + print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | bf16_mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}") if step % 100 == 0: log_data = { "step": step,