Compare commits

...

15 Commits

Author SHA1 Message Date

Hossein-Lakzaei  f77590ab9a  Merge 4715fdcf52 into f66a780f68  2025-11-14 15:02:27 -08:00
Andrej  f66a780f68  Fix torch.dtype mismatching when running engine inline test.  2025-11-14 07:28:29 -08:00
Andrej  4763ce612a  Small fixes to typos  2025-11-14 07:25:59 -08:00
Sofie Van Landeghem  c6f5bd67db  revert change of base to sft for quick inline test  2025-11-14 12:20:03 +01:00
svlandeg  a2fb3c83a6  fix typos  2025-11-14 11:20:25 +01:00
svlandeg  e5efb4b471  add test_engine.py to file structure  2025-11-14 11:13:42 +01:00
howardgao@outlook.com  b399e43168  fix engine test bug  2025-11-06 08:56:45 +08:00
svlandeg  52e85aaf80  Merge branch 'master' into fix/typo  2025-11-02 13:41:13 +01:00
svlandeg  70319851fc  fix typo  2025-10-29 19:48:34 +01:00
svlandeg  4715fdcf52  Merge branch 'master' into master_hoslak  2025-10-29 09:40:24 +01:00
Hossein-Lakzaei  1a428f2b0b  Fix typo in README.md for "modern" and improve clarity in the description of the nanochat d32 model.  2025-10-19 09:40:54 +03:30
Hossein-Lakzaei  520bdfe081  Merge branch 'master' of https://github.com/HosLak/nanochat  2025-10-17 00:26:24 +03:30
Hossein-Lakzaei  355cc60089  Update logging to use formatted strings for better readability and add num-format dependency and Fix typo in comment  2025-10-17 00:25:16 +03:30
Hossein-Lakzaei  671e8d9fc9  Merge branch 'karpathy:master' into master  2025-10-16 23:47:32 +03:30
Hossein-Lakzaei  52382d58c5  Update logging to use formatted strings for better readability and add num-format dependency  2025-10-16 23:33:01 +03:30
8 changed files with 30 additions and 21 deletions

View File

@@ -184,6 +184,7 @@ python -m pytest tests/test_rustbpe.py -v -s
 │ ├── smoltalk.py # Conglomerate dataset of SmolTalk from HF
 │ └── spellingbee.py # Task teaching model to spell/count letters
 ├── tests
+│ └── test_engine.py
 │ └── test_rustbpe.py
 └── uv.lock
 ```

View File

@@ -17,8 +17,9 @@ import signal
 import warnings
 from contextlib import contextmanager
 from collections import deque
-from nanochat.common import compute_init
+from nanochat.common import compute_init, autodetect_device_type
 from nanochat.checkpoint_manager import load_model
+from contextlib import nullcontext
 # -----------------------------------------------------------------------------
 # Calculator tool helpers
@@ -328,6 +329,9 @@ if __name__ == "__main__":
     import time
     # init compute
     ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init()
+    device_type = autodetect_device_type()
+    autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
     # load the model and tokenizer
     model, tokenizer, meta = load_model("base", device, phase="eval")
     bos_token_id = tokenizer.get_bos_token_id()
@@ -340,10 +344,11 @@
     torch.cuda.synchronize()
     t0 = time.time()
     stream = model.generate(prompt_tokens, **kwargs)
-    for token in stream:
-        generated_tokens.append(token)
-        chunk = tokenizer.decode([token])
-        print(chunk, end="", flush=True)
+    with autocast_ctx:
+        for token in stream:
+            generated_tokens.append(token)
+            chunk = tokenizer.decode([token])
+            print(chunk, end="", flush=True)
     print()
     torch.cuda.synchronize()
     t1 = time.time()
@@ -355,11 +360,12 @@
     stream = engine.generate(prompt_tokens, num_samples=1, **kwargs) # note: runs in fp32
     torch.cuda.synchronize()
     t0 = time.time()
-    for token_column, token_masks in stream:
-        token = token_column[0] # only print out the first row
-        generated_tokens.append(token)
-        chunk = tokenizer.decode([token])
-        print(chunk, end="", flush=True)
+    with autocast_ctx:
+        for token_column, token_masks in stream:
+            token = token_column[0] # only print out the first row
+            generated_tokens.append(token)
+            chunk = tokenizer.decode([token])
+            print(chunk, end="", flush=True)
     print()
     torch.cuda.synchronize()
     t1 = time.time()
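
The fix above follows a standard PyTorch pattern: build an autocast context once (bfloat16 on CUDA, a no-op nullcontext elsewhere) and wrap the generation loop in it, so the inline test no longer mixes dtypes on GPU. A minimal self-contained sketch of that pattern, where the device probe is only a stand-in for nanochat's autodetect_device_type:

from contextlib import nullcontext
import torch

# Assumed stand-in for autodetect_device_type(): reduced here to a CUDA check.
device_type = "cuda" if torch.cuda.is_available() else "cpu"
autocast_ctx = (
    torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)
    if device_type == "cuda"
    else nullcontext()
)

x = torch.randn(4, 8, device=device_type)
w = torch.randn(8, 8, device=device_type)
with autocast_ctx:
    y = x @ w  # matmul runs in bfloat16 under autocast on CUDA, plain fp32 on CPU
print(y.dtype)  # torch.bfloat16 on CUDA, torch.float32 on CPU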

View File

@@ -9,9 +9,9 @@ import torch.distributed as dist
 def evaluate_bpb(model, batches, steps, token_bytes):
     """
     Instead of the naive 'mean loss', this function returns the bits per byte (bpb),
-    which is a tokenization vocab size-indepedent metric, meaning you are still comparing
+    which is a tokenization vocab size-independent metric, meaning you are still comparing
     apples:apples if you change the vocab size. The way this works is that instead of just
-    calculating the average loss as usual, you calculate the sum loss, and indepependently
+    calculating the average loss as usual, you calculate the sum loss, and independently
     also the sum bytes (of all the target tokens), and divide. This normalizes the loss by
     the number of bytes that the target tokens represent.
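
The docstring spells out the metric: sum the cross-entropy loss over all target tokens, sum the byte lengths of those same tokens, and divide, converting nats to bits. A toy sketch of that arithmetic (my own illustration, not nanochat's implementation; it assumes the summed loss is in nats):

import math

def bits_per_byte(sum_loss_nats, sum_bytes):
    # Convert total loss from nats to bits (divide by ln 2),
    # then normalize by the number of bytes the target tokens represent.
    return sum_loss_nats / (math.log(2) * sum_bytes)

# e.g. targets decoding to 1000 bytes with a summed loss of 900 nats:
print(bits_per_byte(900.0, 1000))  # ~1.30 bpb, comparable across vocab sizes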

View File

@@ -13,3 +13,4 @@ pyo3-log = "0.12.4"
 ahash = "0.8.12"
 rayon = "1.11.0"
 compact_str = "0.9.0"
+num-format = "0.4"

View File

@@ -6,6 +6,7 @@ use fancy_regex::Regex;
 use pyo3::prelude::*;
 use ahash::{AHashMap, AHashSet};
+use num_format::{Locale, ToFormattedString};
 use compact_str::CompactString;
 use rayon::prelude::*;
@@ -164,15 +165,15 @@ impl Tokenizer {
     fn train_core_incremental(&mut self, mut words: Vec<Word>, counts: Vec<i32>, vocab_size: u32) {
         assert!(vocab_size >= 256, "vocab_size must be at least 256");
         let num_merges = vocab_size - 256;
-        log::info!("Starting BPE training: {} merges to compute", num_merges);
+        log::info!("Starting BPE training: {} merges to compute", num_merges.to_formatted_string(&Locale::en));
         self.merges.clear();
         // ---- Initial pair_counts and where_to_update (parallel) ----
-        log::info!("Computing initial pair counts from {} unique sequences", words.len());
+        log::info!("Computing initial pair counts from {} unique sequences", words.len().to_formatted_string(&Locale::en));
         let (mut pair_counts, mut where_to_update) = count_pairs_parallel(&words, &counts);
         // ---- Build heap ----
-        log::info!("Building heap with {} unique pairs", pair_counts.len());
+        log::info!("Building heap with {} unique pairs", pair_counts.len().to_formatted_string(&Locale::en));
         let mut heap = OctonaryHeap::with_capacity(pair_counts.len());
         for (pair, pos) in where_to_update.drain() {
             let c = *pair_counts.get(&pair).unwrap_or(&0);
@@ -375,7 +376,7 @@ impl Tokenizer {
                 break;
             }
         }
-        log::info!("Processed {} sequences total, {} unique", total_sequences, counts.len());
+        log::info!("Processed {} sequences total, {} unique", total_sequences.to_formatted_string(&Locale::en), counts.len().to_formatted_string(&Locale::en));
         // Materialize words & counts
         let mut words = Vec::with_capacity(counts.len());

View File

@@ -1,6 +1,6 @@
 """
 Evaluate the Chat model.
-All the generic code lives here, and all the evlauation-specific
+All the generic code lives here, and all the evaluation-specific
 code lives in nanochat directory and is imported from here.
 Example runs:

View File

@@ -192,7 +192,7 @@ for step in range(num_iterations):
         })
         model.train()
-    # evlauate accuracy of the multiple choice tasks (which are quick to run)
+    # evaluate accuracy of the multiple choice tasks (which are quick to run)
     if last_step or (step > 0 and step % eval_metrics_every == 0):
         model.eval()
         metrics = {}

View File

@@ -196,9 +196,9 @@ RESET = '\033[0m'
 # Print vocab sizes
 print(f"\nVocab sizes:")
-print(f"GPT-2: {vocab_sizes['gpt2']}")
-print(f"GPT-4: {vocab_sizes['gpt4']}")
-print(f"Ours: {vocab_sizes['ours']}")
+print(f"GPT-2: {vocab_sizes['gpt2']:,}")
+print(f"GPT-4: {vocab_sizes['gpt4']:,}")
+print(f"Ours: {vocab_sizes['ours']:,}")

 def print_comparison(baseline_name, baseline_results, ours_results, all_text):
     """Print comparison table between baseline tokenizer and ours."""