Compare commits

...

15 Commits

Author SHA1 Message Date

Hossein-Lakzaei  f77590ab9a  Merge 4715fdcf52 into f66a780f68  2025-11-14 15:02:27 -08:00
Andrej  f66a780f68  Fix torch.dtype mismatching when running engine inline test.  2025-11-14 07:28:29 -08:00
Andrej  4763ce612a  Small fixes to typos  2025-11-14 07:25:59 -08:00
Sofie Van Landeghem  c6f5bd67db  revert change of base to sft for quick inline test  2025-11-14 12:20:03 +01:00
svlandeg  a2fb3c83a6  fix typos  2025-11-14 11:20:25 +01:00
svlandeg  e5efb4b471  add test_engine.py to file structure  2025-11-14 11:13:42 +01:00
howardgao@outlook.com  b399e43168  fix engine test bug  2025-11-06 08:56:45 +08:00
svlandeg  52e85aaf80  Merge branch 'master' into fix/typo  2025-11-02 13:41:13 +01:00
svlandeg  70319851fc  fix typo  2025-10-29 19:48:34 +01:00
svlandeg  4715fdcf52  Merge branch 'master' into master_hoslak  2025-10-29 09:40:24 +01:00
Hossein-Lakzaei  1a428f2b0b  Fix typo in README.md for "modern" and improve clarity in the description of the nanochat d32 model.  2025-10-19 09:40:54 +03:30
Hossein-Lakzaei  520bdfe081  Merge branch 'master' of https://github.com/HosLak/nanochat  2025-10-17 00:26:24 +03:30
Hossein-Lakzaei  355cc60089  Update logging to use formatted strings for better readability and add num-format dependency and Fix typo in comment  2025-10-17 00:25:16 +03:30
Hossein-Lakzaei  671e8d9fc9  Merge branch 'karpathy:master' into master  2025-10-16 23:47:32 +03:30
Hossein-Lakzaei  52382d58c5  Update logging to use formatted strings for better readability and add num-format dependency  2025-10-16 23:33:01 +03:30
8 changed files with 30 additions and 21 deletions

View File

@@ -184,6 +184,7 @@ python -m pytest tests/test_rustbpe.py -v -s
 │ ├── smoltalk.py # Conglomerate dataset of SmolTalk from HF
 │ └── spellingbee.py # Task teaching model to spell/count letters
 ├── tests
+│ └── test_engine.py
 │ └── test_rustbpe.py
 └── uv.lock
 ```

View File

@@ -17,8 +17,9 @@ import signal
 import warnings
 from contextlib import contextmanager
 from collections import deque
-from nanochat.common import compute_init
+from nanochat.common import compute_init, autodetect_device_type
 from nanochat.checkpoint_manager import load_model
+from contextlib import nullcontext
 # -----------------------------------------------------------------------------
 # Calculator tool helpers
@@ -328,6 +329,9 @@ if __name__ == "__main__":
     import time
     # init compute
     ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
+    device_type = autodetect_device_type()
+    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
     # load the model and tokenizer
     model, tokenizer, meta = load_model("base", device, phase="eval")
     bos_token_id = tokenizer.get_bos_token_id()
@@ -340,10 +344,11 @@
     torch.cuda.synchronize()
     t0 = time.time()
     stream = model.generate(prompt_tokens, **kwargs)
-    for token in stream:
-        generated_tokens.append(token)
-        chunk = tokenizer.decode([token])
-        print(chunk, end="", flush=True)
+    with autocast_ctx:
+        for token in stream:
+            generated_tokens.append(token)
+            chunk = tokenizer.decode([token])
+            print(chunk, end="", flush=True)
     print()
     torch.cuda.synchronize()
     t1 = time.time()
@@ -355,11 +360,12 @@
     stream = engine.generate(prompt_tokens, num_samples=1, **kwargs) # note: runs in fp32
     torch.cuda.synchronize()
     t0 = time.time()
-    for token_column, token_masks in stream:
-        token = token_column[0] # only print out the first row
-        generated_tokens.append(token)
-        chunk = tokenizer.decode([token])
-        print(chunk, end="", flush=True)
+    with autocast_ctx:
+        for token_column, token_masks in stream:
+            token = token_column[0] # only print out the first row
+            generated_tokens.append(token)
+            chunk = tokenizer.decode([token])
+            print(chunk, end="", flush=True)
     print()
     torch.cuda.synchronize()
     t1 = time.time()
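
The fix above follows a standard PyTorch pattern: build an autocast context once (bfloat16 on CUDA, a no-op nullcontext elsewhere) and wrap the generation loop in it, so the inline test no longer mixes dtypes on GPU. A minimal self-contained sketch of that pattern, where the device probe is only a stand-in for nanochat's autodetect_device_type:

from contextlib import nullcontext
import torch

# Assumed stand-in for autodetect_device_type(): reduced here to a CUDA check.
device_type = "cuda" if torch.cuda.is_available() else "cpu"
autocast_ctx = (
    torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)
    if device_type == "cuda"
    else nullcontext()
)

x = torch.randn(4, 8, device=device_type)
w = torch.randn(8, 8, device=device_type)
with autocast_ctx:
    y = x @ w  # matmul runs in bfloat16 under autocast on CUDA, plain fp32 on CPU
print(y.dtype)  # torch.bfloat16 on CUDA, torch.float32 on CPU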

View File

@@ -9,9 +9,9 @@ import torch.distributed as dist
 def evaluate_bpb(model, batches, steps, token_bytes):
     """
     Instead of the naive 'mean loss', this function returns the bits per byte (bpb),
-    which is a tokenization vocab size-indepedent metric, meaning you are still comparing
+    which is a tokenization vocab size-independent metric, meaning you are still comparing
     apples:apples if you change the vocab size. The way this works is that instead of just
-    calculating the average loss as usual, you calculate the sum loss, and indepependently
+    calculating the average loss as usual, you calculate the sum loss, and independently
     also the sum bytes (of all the target tokens), and divide. This normalizes the loss by
     the number of bytes that the target tokens represent.
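
The docstring spells out the metric: sum the cross-entropy loss over all target tokens, sum the byte lengths of those same tokens, and divide, converting nats to bits. A toy sketch of that arithmetic (my own illustration, not nanochat's implementation; it assumes the summed loss is in nats):

import math

def bits_per_byte(sum_loss_nats, sum_bytes):
    # Convert total loss from nats to bits (divide by ln 2),
    # then normalize by the number of bytes the target tokens represent.
    return sum_loss_nats / (math.log(2) * sum_bytes)

# e.g. targets decoding to 1000 bytes with a summed loss of 900 nats:
print(bits_per_byte(900.0, 1000))  # ~1.30 bpb, comparable across vocab sizes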

View File

@@ -13,3 +13,4 @@ pyo3-log = "0.12.4"
 ahash = "0.8.12"
 rayon = "1.11.0"
 compact_str = "0.9.0"
+num-format = "0.4"

View File

@@ -6,6 +6,7 @@ use fancy_regex::Regex;
 use pyo3::prelude::*;
 use ahash::{AHashMap, AHashSet};
+use num_format::{Locale, ToFormattedString};
 use compact_str::CompactString;
 use rayon::prelude::*;
@@ -164,15 +165,15 @@ impl Tokenizer {
     fn train_core_incremental(&mut self, mut words: Vec<Word>, counts: Vec<i32>, vocab_size: u32) {
         assert!(vocab_size >= 256, "vocab_size must be at least 256");
         let num_merges = vocab_size - 256;
-        log::info!("Starting BPE training: {} merges to compute", num_merges);
+        log::info!("Starting BPE training: {} merges to compute", num_merges.to_formatted_string(&Locale::en));
         self.merges.clear();
         // ---- Initial pair_counts and where_to_update (parallel) ----
-        log::info!("Computing initial pair counts from {} unique sequences", words.len());
+        log::info!("Computing initial pair counts from {} unique sequences", words.len().to_formatted_string(&Locale::en));
         let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
         // ---- Build heap ----
-        log::info!("Building heap with {} unique pairs", pair_counts.len());
+        log::info!("Building heap with {} unique pairs", pair_counts.len().to_formatted_string(&Locale::en));
         let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
         for (pair, pos) in where_to_update.drain() {
             let c = *pair_counts.get(&pair).unwrap_or(&0);
@@ -375,7 +376,7 @@ impl Tokenizer {
                 break;
             }
         }
-        log::info!("Processed {} sequences total, {} unique", total_sequences, counts.len());
+        log::info!("Processed {} sequences total, {} unique", total_sequences.to_formatted_string(&Locale::en), counts.len().to_formatted_string(&Locale::en));
         // Materialize words & counts
         let mut words = Vec::with_capacity(counts.len());

View File

@@ -1,6 +1,6 @@
 """
 Evaluate the Chat model.
-All the generic code lives here, and all the evlauation-specific
+All the generic code lives here, and all the evaluation-specific
 code lives in nanochat directory and is imported from here.
 Example runs:

View File

@@ -192,7 +192,7 @@ for step in range(num_iterations):
         })
         model.train()
-    # evlauate accuracy of the multiple choice tasks (which are quick to run)
+    # evaluate accuracy of the multiple choice tasks (which are quick to run)
     if last_step or (step > 0 and step % eval_metrics_every == 0):
         model.eval()
         metrics = {}

View File

@@ -196,9 +196,9 @@ RESET = '\033[0m'
 # Print vocab sizes
 print(f"\nVocab sizes:")
-print(f"GPT-2: {vocab_sizes['gpt2']}")
-print(f"GPT-4: {vocab_sizes['gpt4']}")
-print(f"Ours: {vocab_sizes['ours']}")
+print(f"GPT-2: {vocab_sizes['gpt2']:,}")
+print(f"GPT-4: {vocab_sizes['gpt4']:,}")
+print(f"Ours: {vocab_sizes['ours']:,}")

 def print_comparison(baseline_name, baseline_results, ours_results, all_text):
     """Print comparison table between baseline tokenizer and ours."""