diff --git a/rustbpe/Cargo.toml b/rustbpe/Cargo.toml
index 392a828..1d30210 100644
--- a/rustbpe/Cargo.toml
+++ b/rustbpe/Cargo.toml
@@ -13,3 +13,4 @@ pyo3-log = "0.12.4"
 ahash = "0.8.12"
 rayon = "1.11.0"
 compact_str = "0.9.0"
+num-format = "0.4"
diff --git a/rustbpe/src/lib.rs b/rustbpe/src/lib.rs
index b43fb6c..b4d8a73 100644
--- a/rustbpe/src/lib.rs
+++ b/rustbpe/src/lib.rs
@@ -6,6 +6,7 @@ use fancy_regex::Regex;
 use pyo3::prelude::*;
 
 use ahash::{AHashMap, AHashSet};
 use compact_str::CompactString;
+use num_format::{Locale, ToFormattedString};
 use rayon::prelude::*;
 
@@ -164,15 +165,15 @@ impl Tokenizer {
     fn train_core_incremental(&mut self, mut words: Vec<Word>, counts: Vec<i32>, vocab_size: u32) {
         assert!(vocab_size >= 256, "vocab_size must be at least 256");
         let num_merges = vocab_size - 256;
-        log::info!("Starting BPE training: {} merges to compute", num_merges);
+        log::info!("Starting BPE training: {} merges to compute", num_merges.to_formatted_string(&Locale::en));
         self.merges.clear();
 
         // ---- Initial pair_counts and where_to_update (parallel) ----
-        log::info!("Computing initial pair counts from {} unique sequences", words.len());
+        log::info!("Computing initial pair counts from {} unique sequences", words.len().to_formatted_string(&Locale::en));
         let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
 
         // ---- Build heap ----
-        log::info!("Building heap with {} unique pairs", pair_counts.len());
+        log::info!("Building heap with {} unique pairs", pair_counts.len().to_formatted_string(&Locale::en));
         let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
         for (pair, pos) in where_to_update.drain() {
             let c = *pair_counts.get(&pair).unwrap_or(&0);
@@ -376,7 +377,7 @@ impl Tokenizer {
                 break;
             }
         }
-        log::info!("Processed {} sequences total, {} unique", total_sequences, counts.len());
+        log::info!("Processed {} sequences total, {} unique", total_sequences.to_formatted_string(&Locale::en), counts.len().to_formatted_string(&Locale::en));
 
         // Materialize words & counts
         let mut words = Vec::with_capacity(counts.len());
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index a566d49..d6d0bbf 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -64,7 +64,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
             data = [json.loads(line.strip()) for line in f]
 
         # shuffle the data because in many cases it appears ordered but we want
-        # the abillity to only run a subset of the data for debugging purposes etc.
+        # the ability to only run a subset of the data for debugging purposes etc.
         shuffle_rng = random.Random(1337)
         shuffle_rng.shuffle(data)
         if max_per_task > 0:
diff --git a/scripts/tok_eval.py b/scripts/tok_eval.py
index 9233d71..240b749 100644
--- a/scripts/tok_eval.py
+++ b/scripts/tok_eval.py
@@ -196,9 +196,9 @@ RESET = '\033[0m'
 
 # Print vocab sizes
 print(f"\nVocab sizes:")
-print(f"GPT-2: {vocab_sizes['gpt2']}")
-print(f"GPT-4: {vocab_sizes['gpt4']}")
-print(f"Ours: {vocab_sizes['ours']}")
+print(f"GPT-2: {vocab_sizes['gpt2']:,}")
+print(f"GPT-4: {vocab_sizes['gpt4']:,}")
+print(f"Ours: {vocab_sizes['ours']:,}")
 
 def print_comparison(baseline_name, baseline_results, ours_results, all_text):
     """Print comparison table between baseline tokenizer and ours."""