improve tokenizer and report in midtrain and sft

This commit is contained in:
Shizhe Diao 2025-10-20 22:04:27 -07:00
parent 169022fec0
commit f3f069519d
4 changed files with 28 additions and 26 deletions

View File

@ -13,8 +13,8 @@
set -x # Enable debug output
export DATA_NAME=nemotron # nemotron # smoltalk
export BASE_NAME=smollm_d20_1node_matrixlr0.02_2298373 # fineweb_d20_1node # climbmix_d20_1node_matrixlr0.02_2298334 # nemotron-cc-hq_d20_1node_matrixlr0.02_2298371 # smollm_d20_1node_matrixlr0.02_2298373
export DATA_NAME=smoltalk # nemotron # smoltalk
export BASE_NAME=climbmix_d20_1node_matrixlr0.02_2298334 # fineweb_d20_1node # climbmix_d20_1node_matrixlr0.02_2298334 # nemotron-cc-hq_d20_1node_matrixlr0.02_2298371 # smollm_d20_1node_matrixlr0.02_2298373
# Default intermediate artifacts directory is in ~/.cache/nanochat
export OMP_NUM_THREADS=1

View File

@ -22,7 +22,7 @@
set -x # Enable debug output
DATA_NAME=climbmix
DATA_NAME=climbmix_1_9
export DATA_DIR=/lustre/fsw/portfolios/nvr/users/sdiao/nanochat/data/$DATA_NAME
export MATRIX_LR=0.02

View File

@ -79,7 +79,8 @@ model, tokenizer, meta = load_model(source, device, phase="train", model_tag=mod
orig_model = model # original, uncompiled model
# model = torch.compile(model, dynamic=True) # doesn't work super well because of variable lengths of inputs
engine = Engine(model, tokenizer) # will be used for inline model evaluation only
tokenizer_name = meta.get("tokenizer_name", "tokenizer")
print0(f"Using tokenizer: {tokenizer_name}")
# -----------------------------------------------------------------------------
# Task data mixture we'll train on
# Select dataset based on dataset_choice parameter
@ -103,19 +104,18 @@ elif dataset_choice == "nemotron":
ARC(subset="ARC-Easy", split="train"), # 2.3K rows
ARC(subset="ARC-Challenge", split="train"), # 1.1K rows
GSM8K(subset="main", split="train"), # 8K rows
Nemotron(categories=["stem"], split="train", stop=2540), # 25.4% of 10K = 2.54K
Nemotron(categories=["math"], split="train", stop=1710), # 17.1% of 10K = 1.71K
Nemotron(categories=["chat"], split="train", stop=4490), # 44.9% of 10K = 4.49K
Nemotron(categories=["code"], split="train", stop=1250), # 12.5% of 10K = 1.25K
CustomJSON(filepath=identity_conversations_filepath), # 1K rows of synthetic identity conversations
]) # total: 2.3K + 1.1K + 8K + (2.54K + 1.71K + 4.49K + 1.25K) = 21.4K rows (same as SmolTalk)
Nemotron(categories=["stem"], split="train", stop=3000),
Nemotron(categories=["math"], split="train", stop=3000),
Nemotron(categories=["chat"], split="train", stop=1000),
Nemotron(categories=["code"], split="train", stop=3000),
]) # total: 2.3K + 1.1K + 8K + (3.0K + 3.0K + 1.0K + 3.0K) = 21.4K rows (same as SmolTalk)
# For validation, use a small subset of Nemotron mixed categories
val_ds = TaskMixture([
Nemotron(categories=["stem"], split="train", start=2540, stop=2790), # 250 samples
Nemotron(categories=["math"], split="train", start=1710, stop=1960), # 250 samples
Nemotron(categories=["chat"], split="train", start=4490, stop=5240), # 750 samples
Nemotron(categories=["code"], split="train", start=1250, stop=1500), # 250 samples
]) # total: 1500 samples for validation
Nemotron(categories=["stem"], split="train", start=3000, stop=3300), # 300 samples
Nemotron(categories=["math"], split="train", start=3000, stop=3300), # 300 samples
Nemotron(categories=["chat"], split="train", start=1000, stop=1100), # 100 samples
Nemotron(categories=["code"], split="train", start=3000, stop=3300), # 300 samples
]) # total: 1000 samples for validation
else:
raise ValueError(f"Unknown dataset_choice: {dataset_choice}. Must be 'smoltalk' or 'nemotron'")
@ -292,13 +292,14 @@ if master_process:
"val_loss": val_loss,
**metrics,
"model_config": model_config_kwargs,
"tokenizer_name": tokenizer_name,
}
)
print(f"✅ Saved model checkpoint to {checkpoint_dir}")
# Log to report
from nanochat.report import get_report
get_report().log(section="Chat SFT", data=[
get_report(exp_name=run).log(section="Chat SFT", data=[
user_config, # CLI args
{
"Training rows": len(train_ds),

View File

@ -121,24 +121,25 @@ elif dataset_choice == "nemotron":
# Original Nemotron distribution: stem(355K/25.4%), math(239K/17.1%), chat(628K/44.9%), code(175K/12.5%)
# Proportionally sampled to 460K total, then add MMLU + GSM8K to match SmolTalk structure
train_dataset = TaskMixture([
Nemotron(categories=["stem"], split="train", stop=117000), # 25.4% of 460K = 117K
Nemotron(categories=["math"], split="train", stop=79000), # 17.1% of 460K = 79K
Nemotron(categories=["chat"], split="train", stop=207000), # 44.9% of 460K = 207K
Nemotron(categories=["code"], split="train", stop=57000), # 12.5% of 460K = 57K
Nemotron(categories=["stem"], split="train", stop=151800),
Nemotron(categories=["math"], split="train", stop=151800),
Nemotron(categories=["chat"], split="train", stop=4600),
Nemotron(categories=["code"], split="train", stop=151800),
MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple choice problems
GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use
CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations
CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these
]) # total: 117K + 79K + 207K + 57K + 100K + 8K = 568K rows (same as SmolTalk)
# For validation, match SmolTalk validation set structure
val_dataset = TaskMixture([
Nemotron(categories=["stem"], split="train", start=117000, stop=124500), # 7.5K
Nemotron(categories=["math"], split="train", start=79000, stop=84000), # 5K
Nemotron(categories=["chat"], split="train", start=207000, stop=220500), # 13.5K
Nemotron(categories=["code"], split="train", start=57000, stop=61000), # 4K
Nemotron(categories=["stem"], split="train", start=151800, stop=155000),
Nemotron(categories=["math"], split="train", start=151800, stop=155000),
Nemotron(categories=["chat"], split="train", start=4600, stop=10000),
Nemotron(categories=["code"], split="train", start=151800, stop=155000),
MMLU(subset="all", split="test", stop=5200), # 5.2K rows to match train ratios
GSM8K(subset="main", split="test", stop=420), # 420 rows to match train ratios
]) # total: 7.5K + 5K + 13.5K + 4K + 5.2K + 0.42K = 35.6K rows
]) # total: 3.2K + 3.2K + 5.4K + 3.2K + 5.2K + 0.42K = 20.6K rows (similar to SmolTalk)
else:
raise ValueError(f"Unknown dataset_choice: {dataset_choice}. Must be 'smoltalk' or 'nemotron'")
# DataLoader is defined here, it emits inputs, targets : 2D tensors of shape (device_batch_size, max_seq_len)
@ -329,7 +330,7 @@ print0(f"Minimum validation bpb: {min_val_bpb:.4f}")
# Log to report
if not dry_run:
from nanochat.report import get_report
get_report().log(section="Midtraining", data=[
get_report(exp_name=run).log(section="Midtraining", data=[
user_config, # CLI args
{ # stats about the training setup
"Number of iterations": step,