clean comments

2026-01-27 05:44:14 +00:00 · 2025-10-22 22:22:28 -07:00 · 2025-10-22 22:22:28 -07:00 · dd8310c3d4
commit dd8310c3d4
parent f384c16ba5
2 changed files with 2 additions and 6 deletions
--- a/scripts/chat_sft.py
+++ b/scripts/chat_sft.py
@@ -97,9 +97,7 @@ if dataset_choice == "smoltalk":
    ]) # 2.3K + 1.1K + 8K + 10K + 1K = 22.4K rows
    val_ds = SmolTalk(split="test") # general conversations, 24K rows
 elif dataset_choice == "nemotron":
-    # Ablation: Nemotron (sampled to match SmolTalk 10K) + ARC + GSM8K
-    # SmolTalk has 10K samples, we sample Nemotron proportionally to match
-    # Original Nemotron distribution: stem(25.4%), math(17.1%), chat(44.9%), code(12.5%)
+    # Nemotron + ARC + GSM8K + synthetic identity conversations
    train_ds = TaskMixture([
        ARC(subset="ARC-Easy", split="train"), # 2.3K rows
        ARC(subset="ARC-Challenge", split="train"), # 1.1K rows
--- a/scripts/mid_train.py
+++ b/scripts/mid_train.py
@@ -117,9 +117,7 @@ if dataset_choice == "smoltalk":
        GSM8K(subset="main", split="test", stop=420), # 420 rows to match train ratios
    ]) # total: ~29.6K rows
 elif dataset_choice == "nemotron":
-    # Ablation: Nemotron with stem, math, chat, code (sampled to match SmolTalk 460K) + MMLU + GSM8K
-    # Original Nemotron distribution: stem(355K/25.4%), math(239K/17.1%), chat(628K/44.9%), code(175K/12.5%)
-    # Proportionally sampled to 460K total, then add MMLU + GSM8K to match SmolTalk structure
+    # Nemotron with stem, math, chat, code + MMLU + GSM8K + synthetic identity conversations
    train_dataset = TaskMixture([
        Nemotron(categories=["stem"], split="train", stop=151800), # 151800 samples
        Nemotron(categories=["math"], split="train", stop=151800), # 151800 samples