diff --git a/rustbpe/Cargo.toml b/rustbpe/Cargo.toml index 392a828..1d30210 100644 --- a/rustbpe/Cargo.toml +++ b/rustbpe/Cargo.toml @@ -13,3 +13,4 @@ pyo3-log = "0.12.4" ahash = "0.8.12" rayon = "1.11.0" compact_str = "0.9.0" +num-format = "0.4" diff --git a/rustbpe/src/lib.rs b/rustbpe/src/lib.rs index b43fb6c..b4d8a73 100644 --- a/rustbpe/src/lib.rs +++ b/rustbpe/src/lib.rs @@ -6,6 +6,7 @@ use fancy_regex::Regex; use pyo3::prelude::*; use ahash::{AHashMap, AHashSet}; +use num_format::{Locale, ToFormattedString}; use compact_str::CompactString; use rayon::prelude::*; @@ -164,15 +165,15 @@ impl Tokenizer { fn train_core_incremental(&mut self, mut words: Vec, counts: Vec, vocab_size: u32) { assert!(vocab_size >= 256, "vocab_size must be at least 256"); let num_merges = vocab_size - 256; - log::info!("Starting BPE training: {} merges to compute", num_merges); + log::info!("Starting BPE training: {} merges to compute", num_merges.to_formatted_string(&Locale::en)); self.merges.clear(); // ---- Initial pair_counts and where_to_update (parallel) ---- - log::info!("Computing initial pair counts from {} unique sequences", words.len()); + log::info!("Computing initial pair counts from {} unique sequences", words.len().to_formatted_string(&Locale::en)); let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts); // ---- Build heap ---- - log::info!("Building heap with {} unique pairs", pair_counts.len()); + log::info!("Building heap with {} unique pairs", pair_counts.len().to_formatted_string(&Locale::en)); let mut heap = OctonaryHeap::with_capacity(pair_counts.len()); for (pair, pos) in where_to_update.drain() { let c = *pair_counts.get(&pair).unwrap_or(&0); @@ -376,7 +377,7 @@ impl Tokenizer { break; } } - log::info!("Processed {} sequences total, {} unique", total_sequences, counts.len()); + log::info!("Processed {} sequences total, {} unique", total_sequences.to_formatted_string(&Locale::en), counts.len().to_formatted_string(&Locale::en)); // Materialize words & counts let mut words = Vec::with_capacity(counts.len()); diff --git a/scripts/base_eval.py b/scripts/base_eval.py index a566d49..d6d0bbf 100644 --- a/scripts/base_eval.py +++ b/scripts/base_eval.py @@ -64,7 +64,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): data = [json.loads(line.strip()) for line in f] # shuffle the data because in many cases it appears ordered but we want - # the abillity to only run a subset of the data for debugging purposes etc. + # the ability to only run a subset of the data for debugging purposes etc. shuffle_rng = random.Random(1337) shuffle_rng.shuffle(data) if max_per_task > 0: diff --git a/scripts/tok_eval.py b/scripts/tok_eval.py index 9233d71..240b749 100644 --- a/scripts/tok_eval.py +++ b/scripts/tok_eval.py @@ -196,9 +196,9 @@ RESET = '\033[0m' # Print vocab sizes print(f"\nVocab sizes:") -print(f"GPT-2: {vocab_sizes['gpt2']}") -print(f"GPT-4: {vocab_sizes['gpt4']}") -print(f"Ours: {vocab_sizes['ours']}") +print(f"GPT-2: {vocab_sizes['gpt2']:,}") +print(f"GPT-4: {vocab_sizes['gpt4']:,}") +print(f"Ours: {vocab_sizes['ours']:,}") def print_comparison(baseline_name, baseline_results, ours_results, all_text): """Print comparison table between baseline tokenizer and ours."""