mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 04:12:13 +00:00
clean comments
This commit is contained in:
parent
f384c16ba5
commit
dd8310c3d4
|
|
@ -97,9 +97,7 @@ if dataset_choice == "smoltalk":
|
|||
]) # 2.3K + 1.1K + 8K + 10K + 1K = 22.4K rows
|
||||
val_ds = SmolTalk(split="test") # general conversations, 24K rows
|
||||
elif dataset_choice == "nemotron":
|
||||
# Ablation: Nemotron (sampled to match SmolTalk 10K) + ARC + GSM8K
|
||||
# SmolTalk has 10K samples, we sample Nemotron proportionally to match
|
||||
# Original Nemotron distribution: stem(25.4%), math(17.1%), chat(44.9%), code(12.5%)
|
||||
# Nemotron + ARC + GSM8K + synthetic identity conversations
|
||||
train_ds = TaskMixture([
|
||||
ARC(subset="ARC-Easy", split="train"), # 2.3K rows
|
||||
ARC(subset="ARC-Challenge", split="train"), # 1.1K rows
|
||||
|
|
|
|||
|
|
@ -117,9 +117,7 @@ if dataset_choice == "smoltalk":
|
|||
GSM8K(subset="main", split="test", stop=420), # 420 rows to match train ratios
|
||||
]) # total: ~29.6K rows
|
||||
elif dataset_choice == "nemotron":
|
||||
# Ablation: Nemotron with stem, math, chat, code (sampled to match SmolTalk 460K) + MMLU + GSM8K
|
||||
# Original Nemotron distribution: stem(355K/25.4%), math(239K/17.1%), chat(628K/44.9%), code(175K/12.5%)
|
||||
# Proportionally sampled to 460K total, then add MMLU + GSM8K to match SmolTalk structure
|
||||
# Nemotron with stem, math, chat, code + MMLU + GSM8K + synthetic identity conversations
|
||||
train_dataset = TaskMixture([
|
||||
Nemotron(categories=["stem"], split="train", stop=151800), # 151800 samples
|
||||
Nemotron(categories=["math"], split="train", stop=151800), # 151800 samples
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user