Update logging to use formatted strings for better readability and add num-format dependency

2025-12-06 12:22:18 +00:00 · 2025-10-16 23:33:01 +03:30 · 2025-10-16 23:33:01 +03:30 · 52382d58c5
commit 52382d58c5
parent 4346536ab2
3 changed files with 9 additions and 7 deletions
--- a/rustbpe/Cargo.toml
+++ b/rustbpe/Cargo.toml
@@ -13,3 +13,4 @@ pyo3-log = "0.12.4"
 ahash = "0.8.12"
 rayon = "1.11.0"
 compact_str = "0.9.0"
+num-format = "0.4"
--- a/rustbpe/src/lib.rs
+++ b/rustbpe/src/lib.rs
@@ -6,6 +6,7 @@ use fancy_regex::Regex;
 use pyo3::prelude::*;
 use ahash::{AHashMap, AHashSet};
+use num_format::{Locale, ToFormattedString};
 use compact_str::CompactString;
 use rayon::prelude::*;
@@ -164,15 +165,15 @@ impl Tokenizer {
    fn train_core_incremental(&mut self, mut words: Vec<Word>, counts: Vec<i32>, vocab_size: u32) {
        assert!(vocab_size >= 256, "vocab_size must be at least 256");
        let num_merges = vocab_size - 256;
-        log::info!("Starting BPE training: {} merges to compute", num_merges);
+        log::info!("Starting BPE training: {} merges to compute", num_merges.to_formatted_string(&Locale::en));
        self.merges.clear();
        // ---- Initial pair_counts and where_to_update (parallel) ----
-        log::info!("Computing initial pair counts from {} unique sequences", words.len());
+        log::info!("Computing initial pair counts from {} unique sequences", words.len().to_formatted_string(&Locale::en));
        let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
        // ---- Build heap ----
-        log::info!("Building heap with {} unique pairs", pair_counts.len());
+        log::info!("Building heap with {} unique pairs", pair_counts.len().to_formatted_string(&Locale::en));
        let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
        for (pair, pos) in where_to_update.drain() {
            let c = *pair_counts.get(&pair).unwrap_or(&0);
@@ -376,7 +377,7 @@ impl Tokenizer {
                break;
            }
        }
-        log::info!("Processed {} sequences total, {} unique", total_sequences, counts.len());
+        log::info!("Processed {} sequences total, {} unique", total_sequences.to_formatted_string(&Locale::en), counts.len().to_formatted_string(&Locale::en));
        // Materialize words & counts
        let mut words = Vec::with_capacity(counts.len());
--- a/scripts/tok_eval.py
+++ b/scripts/tok_eval.py
@@ -196,9 +196,9 @@ RESET = '\033[0m'
 # Print vocab sizes
 print(f"\nVocab sizes:")
-print(f"GPT-2: {vocab_sizes['gpt2']}")
+print(f"GPT-2: {vocab_sizes['gpt2']:,}")
-print(f"GPT-4: {vocab_sizes['gpt4']}")
+print(f"GPT-4: {vocab_sizes['gpt4']:,}")
-print(f"Ours: {vocab_sizes['ours']}")
+print(f"Ours: {vocab_sizes['ours']:,}")
 def print_comparison(baseline_name, baseline_results, ours_results, all_text):
    """Print comparison table between baseline tokenizer and ours."""