From 52382d58c593abf959716b69857d87042f210aaf Mon Sep 17 00:00:00 2001
From: Hossein-Lakzaei
Date: Thu, 16 Oct 2025 23:33:01 +0330
Subject: [PATCH 1/3] Update logging to use formatted strings for better
 readability and add num-format dependency

---
 rustbpe/Cargo.toml  | 1 +
 rustbpe/src/lib.rs  | 9 +++++----
 scripts/tok_eval.py | 6 +++---
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/rustbpe/Cargo.toml b/rustbpe/Cargo.toml
index 392a828..1d30210 100644
--- a/rustbpe/Cargo.toml
+++ b/rustbpe/Cargo.toml
@@ -13,3 +13,4 @@ pyo3-log = "0.12.4"
 ahash = "0.8.12"
 rayon = "1.11.0"
 compact_str = "0.9.0"
+num-format = "0.4"
diff --git a/rustbpe/src/lib.rs b/rustbpe/src/lib.rs
index b43fb6c..b4d8a73 100644
--- a/rustbpe/src/lib.rs
+++ b/rustbpe/src/lib.rs
@@ -6,6 +6,7 @@ use fancy_regex::Regex;
 use pyo3::prelude::*;
 
 use ahash::{AHashMap, AHashSet};
+use num_format::{Locale, ToFormattedString};
 use compact_str::CompactString;
 use rayon::prelude::*;
 
@@ -164,15 +165,15 @@ impl Tokenizer {
     fn train_core_incremental(&mut self, mut words: Vec, counts: Vec, vocab_size: u32) {
         assert!(vocab_size >= 256, "vocab_size must be at least 256");
         let num_merges = vocab_size - 256;
-        log::info!("Starting BPE training: {} merges to compute", num_merges);
+        log::info!("Starting BPE training: {} merges to compute", num_merges.to_formatted_string(&Locale::en));
         self.merges.clear();
 
         // ---- Initial pair_counts and where_to_update (parallel) ----
-        log::info!("Computing initial pair counts from {} unique sequences", words.len());
+        log::info!("Computing initial pair counts from {} unique sequences", words.len().to_formatted_string(&Locale::en));
         let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
 
         // ---- Build heap ----
-        log::info!("Building heap with {} unique pairs", pair_counts.len());
+        log::info!("Building heap with {} unique pairs", pair_counts.len().to_formatted_string(&Locale::en));
         let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
         for (pair, pos) in where_to_update.drain() {
             let c = *pair_counts.get(&pair).unwrap_or(&0);
@@ -376,7 +377,7 @@ impl Tokenizer {
                 break;
             }
         }
-        log::info!("Processed {} sequences total, {} unique", total_sequences, counts.len());
+        log::info!("Processed {} sequences total, {} unique", total_sequences.to_formatted_string(&Locale::en), counts.len().to_formatted_string(&Locale::en));
 
         // Materialize words & counts
         let mut words = Vec::with_capacity(counts.len());
diff --git a/scripts/tok_eval.py b/scripts/tok_eval.py
index 9233d71..240b749 100644
--- a/scripts/tok_eval.py
+++ b/scripts/tok_eval.py
@@ -196,9 +196,9 @@ RESET = '\033[0m'
 
 # Print vocab sizes
 print(f"\nVocab sizes:")
-print(f"GPT-2: {vocab_sizes['gpt2']}")
-print(f"GPT-4: {vocab_sizes['gpt4']}")
-print(f"Ours: {vocab_sizes['ours']}")
+print(f"GPT-2: {vocab_sizes['gpt2']:,}")
+print(f"GPT-4: {vocab_sizes['gpt4']:,}")
+print(f"Ours: {vocab_sizes['ours']:,}")
 
 def print_comparison(baseline_name, baseline_results, ours_results, all_text):
     """Print comparison table between baseline tokenizer and ours."""

From 355cc600891445d48feb043ed212777d47a3bae3 Mon Sep 17 00:00:00 2001
From: Hossein-Lakzaei
Date: Thu, 16 Oct 2025 23:33:01 +0330
Subject: [PATCH 2/3] Update logging to use formatted strings for better
 readability, add num-format dependency, and fix a typo in a comment

---
 rustbpe/Cargo.toml   | 1 +
 rustbpe/src/lib.rs   | 9 +++++----
 scripts/base_eval.py | 2 +-
 scripts/tok_eval.py  | 6 +++---
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/rustbpe/Cargo.toml b/rustbpe/Cargo.toml
index 392a828..1d30210 100644
--- a/rustbpe/Cargo.toml
+++ b/rustbpe/Cargo.toml
@@ -13,3 +13,4 @@ pyo3-log = "0.12.4"
 ahash = "0.8.12"
 rayon = "1.11.0"
 compact_str = "0.9.0"
+num-format = "0.4"
diff --git a/rustbpe/src/lib.rs b/rustbpe/src/lib.rs
index b43fb6c..b4d8a73 100644
--- a/rustbpe/src/lib.rs
+++ b/rustbpe/src/lib.rs
@@ -6,6 +6,7 @@ use fancy_regex::Regex;
 use pyo3::prelude::*;
 
 use ahash::{AHashMap, AHashSet};
+use num_format::{Locale, ToFormattedString};
 use compact_str::CompactString;
 use rayon::prelude::*;
 
@@ -164,15 +165,15 @@ impl Tokenizer {
     fn train_core_incremental(&mut self, mut words: Vec, counts: Vec, vocab_size: u32) {
         assert!(vocab_size >= 256, "vocab_size must be at least 256");
         let num_merges = vocab_size - 256;
-        log::info!("Starting BPE training: {} merges to compute", num_merges);
+        log::info!("Starting BPE training: {} merges to compute", num_merges.to_formatted_string(&Locale::en));
         self.merges.clear();
 
         // ---- Initial pair_counts and where_to_update (parallel) ----
-        log::info!("Computing initial pair counts from {} unique sequences", words.len());
+        log::info!("Computing initial pair counts from {} unique sequences", words.len().to_formatted_string(&Locale::en));
         let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
 
         // ---- Build heap ----
-        log::info!("Building heap with {} unique pairs", pair_counts.len());
+        log::info!("Building heap with {} unique pairs", pair_counts.len().to_formatted_string(&Locale::en));
         let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
         for (pair, pos) in where_to_update.drain() {
             let c = *pair_counts.get(&pair).unwrap_or(&0);
@@ -376,7 +377,7 @@ impl Tokenizer {
                 break;
             }
         }
-        log::info!("Processed {} sequences total, {} unique", total_sequences, counts.len());
+        log::info!("Processed {} sequences total, {} unique", total_sequences.to_formatted_string(&Locale::en), counts.len().to_formatted_string(&Locale::en));
 
         // Materialize words & counts
         let mut words = Vec::with_capacity(counts.len());
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index a566d49..d6d0bbf 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -64,7 +64,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
         data = [json.loads(line.strip()) for line in f]
 
     # shuffle the data because in many cases it appears ordered but we want
-    # the abillity to only run a subset of the data for debugging purposes etc.
+    # the ability to only run a subset of the data for debugging purposes etc.
     shuffle_rng = random.Random(1337)
     shuffle_rng.shuffle(data)
     if max_per_task > 0:
diff --git a/scripts/tok_eval.py b/scripts/tok_eval.py
index 9233d71..240b749 100644
--- a/scripts/tok_eval.py
+++ b/scripts/tok_eval.py
@@ -196,9 +196,9 @@ RESET = '\033[0m'
 
 # Print vocab sizes
 print(f"\nVocab sizes:")
-print(f"GPT-2: {vocab_sizes['gpt2']}")
-print(f"GPT-4: {vocab_sizes['gpt4']}")
-print(f"Ours: {vocab_sizes['ours']}")
+print(f"GPT-2: {vocab_sizes['gpt2']:,}")
+print(f"GPT-4: {vocab_sizes['gpt4']:,}")
+print(f"Ours: {vocab_sizes['ours']:,}")
 
 def print_comparison(baseline_name, baseline_results, ours_results, all_text):
     """Print comparison table between baseline tokenizer and ours."""

From 1a428f2b0b1a5d5c6a70ff9072ae38e6acbb2715 Mon Sep 17 00:00:00 2001
From: Hossein-Lakzaei
Date: Sun, 19 Oct 2025 09:40:54 +0330
Subject: [PATCH 3/3] Fix typo in README.md for "modern" and improve clarity
 in the description of the nanochat d32 model.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 19b6f02..f3f4369 100644
--- a/README.md
+++ b/README.md
@@ -8,7 +8,7 @@ This repo is a full-stack implementation of an LLM like ChatGPT in a single, cle
 
 ## Talk to it
 
-To get a sense of the endpoint of this repo, you can currently find [nanochat d32](https://github.com/karpathy/nanochat/discussions/8) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d32" means that this model has 32 layers in the Transformer neural network. This model has 1.9 billion parameters, it was trained on 38 billion tokens by simply running the single script [run1000.sh](run1000.sh), and the total cost of training was ~$800 (about 33 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of moden Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to...
+To get a sense of the endpoint of this repo, you can currently find [nanochat d32](https://github.com/karpathy/nanochat/discussions/8) hosted on [nanochat.karpathy.ai](https://nanochat.karpathy.ai/). "d32" means that this model has 32 layers in the Transformer neural network. This model has 1.9 billion parameters, it was trained on 38 billion tokens by simply running the single script [run1000.sh](run1000.sh), and the total cost of training was ~$800 (about 33 hours training time on 8XH100 GPU node). While today this is enough to outperform GPT-2 of 2019, it falls dramatically short of modern Large Language Models like GPT-5. When talking to these micro models, you'll see that they make a lot of mistakes, they are a little bit naive and silly and they hallucinate a ton, a bit like children. It's kind of amusing. But what makes nanochat unique is that it is fully yours - fully configurable, tweakable, hackable, and trained by you from start to end. To train and talk to your own, we turn to...
 
 ## Quick start
 
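For reference, a minimal, self-contained sketch of the number formatting the logging changes above rely on (assuming num-format 0.4 as added in Cargo.toml; the sample values are purely illustrative, not taken from a real run):

    // num-format's ToFormattedString trait renders integers with locale-aware
    // thousands separators, which is what the updated log::info! calls use.
    use num_format::{Locale, ToFormattedString};

    fn main() {
        let num_merges: u32 = 65_536 - 256; // e.g. a hypothetical vocab_size of 65,536
        // Prints "65,280 merges to compute" rather than "65280 merges to compute"
        println!("{} merges to compute", num_merges.to_formatted_string(&Locale::en));
    }

The Python side needs no extra dependency: the ':,' format spec used in tok_eval.py, e.g. f"{50257:,}", already renders as "50,257".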