diff --git a/midtrain_sft_submit.sh b/midtrain_sft_submit.sh index 2ac21d5..d6af1d5 100644 --- a/midtrain_sft_submit.sh +++ b/midtrain_sft_submit.sh @@ -13,8 +13,8 @@ set -x # Enable debug output -export DATA_NAME=nemotron # nemotron # smoltalk -export BASE_NAME=smollm_d20_1node_matrixlr0.02_2298373 # fineweb_d20_1node # climbmix_d20_1node_matrixlr0.02_2298334 # nemotron-cc-hq_d20_1node_matrixlr0.02_2298371 # smollm_d20_1node_matrixlr0.02_2298373 +export DATA_NAME=smoltalk # nemotron # smoltalk +export BASE_NAME=climbmix_d20_1node_matrixlr0.02_2298334 # fineweb_d20_1node # climbmix_d20_1node_matrixlr0.02_2298334 # nemotron-cc-hq_d20_1node_matrixlr0.02_2298371 # smollm_d20_1node_matrixlr0.02_2298373 # Default intermediate artifacts directory is in ~/.cache/nanochat export OMP_NUM_THREADS=1 diff --git a/pretrain_submit.sh b/pretrain_submit.sh index 1820f30..bb9a902 100644 --- a/pretrain_submit.sh +++ b/pretrain_submit.sh @@ -22,7 +22,7 @@ set -x # Enable debug output -DATA_NAME=climbmix +DATA_NAME=climbmix_1_9 export DATA_DIR=/lustre/fsw/portfolios/nvr/users/sdiao/nanochat/data/$DATA_NAME export MATRIX_LR=0.02 diff --git a/scripts/chat_sft.py b/scripts/chat_sft.py index 1122e78..895d386 100644 --- a/scripts/chat_sft.py +++ b/scripts/chat_sft.py @@ -79,7 +79,8 @@ model, tokenizer, meta = load_model(source, device, phase="train", model_tag=mod orig_model = model # original, uncompiled model # model = torch.compile(model, dynamic=True) # doesn't work super well because of variable lengths of inputs engine = Engine(model, tokenizer) # will be used for inline model evaluation only - +tokenizer_name = meta.get("tokenizer_name", "tokenizer") +print0(f"Using tokenizer: {tokenizer_name}") # ----------------------------------------------------------------------------- # Task data mixture we'll train on # Select dataset based on dataset_choice parameter @@ -103,19 +104,18 @@ elif dataset_choice == "nemotron": ARC(subset="ARC-Easy", split="train"), # 2.3K rows 
ARC(subset="ARC-Challenge", split="train"), # 1.1K rows GSM8K(subset="main", split="train"), # 8K rows - Nemotron(categories=["stem"], split="train", stop=2540), # 25.4% of 10K = 2.54K - Nemotron(categories=["math"], split="train", stop=1710), # 17.1% of 10K = 1.71K - Nemotron(categories=["chat"], split="train", stop=4490), # 44.9% of 10K = 4.49K - Nemotron(categories=["code"], split="train", stop=1250), # 12.5% of 10K = 1.25K - CustomJSON(filepath=identity_conversations_filepath), # 1K rows of synthetic identity conversations - ]) # total: 2.3K + 1.1K + 8K + (2.54K + 1.71K + 4.49K + 1.25K) = 21.4K rows (same as SmolTalk) + Nemotron(categories=["stem"], split="train", stop=3000), + Nemotron(categories=["math"], split="train", stop=3000), + Nemotron(categories=["chat"], split="train", stop=1000), + Nemotron(categories=["code"], split="train", stop=3000), + ]) # total: 2.3K + 1.1K + 8K + (3.0K + 3.0K + 1.0K + 3.0K) = 21.4K rows (similar to SmolTalk) # For validation, use a small subset of Nemotron mixed categories val_ds = TaskMixture([ - Nemotron(categories=["stem"], split="train", start=2540, stop=2790), # 250 samples - Nemotron(categories=["math"], split="train", start=1710, stop=1960), # 250 samples - Nemotron(categories=["chat"], split="train", start=4490, stop=5240), # 750 samples - Nemotron(categories=["code"], split="train", start=1250, stop=1500), # 250 samples - ]) # total: 1500 samples for validation + Nemotron(categories=["stem"], split="train", start=3000, stop=3300), # 300 samples + Nemotron(categories=["math"], split="train", start=3000, stop=3300), # 300 samples + Nemotron(categories=["chat"], split="train", start=1000, stop=1100), # 100 samples + Nemotron(categories=["code"], split="train", start=3000, stop=3300), # 300 samples + ]) # total: 1000 samples for validation else: raise ValueError(f"Unknown dataset_choice: {dataset_choice}. 
Must be 'smoltalk' or 'nemotron'") @@ -292,13 +292,14 @@ if master_process: "val_loss": val_loss, **metrics, "model_config": model_config_kwargs, + "tokenizer_name": tokenizer_name, } ) print(f"✅ Saved model checkpoint to {checkpoint_dir}") # Log to report from nanochat.report import get_report -get_report().log(section="Chat SFT", data=[ +get_report(exp_name=run).log(section="Chat SFT", data=[ user_config, # CLI args { "Training rows": len(train_ds), diff --git a/scripts/mid_train.py b/scripts/mid_train.py index f43e306..e72d422 100644 --- a/scripts/mid_train.py +++ b/scripts/mid_train.py @@ -121,24 +121,25 @@ elif dataset_choice == "nemotron": # Original Nemotron distribution: stem(355K/25.4%), math(239K/17.1%), chat(628K/44.9%), code(175K/12.5%) # Proportionally sampled to 460K total, then add MMLU + GSM8K to match SmolTalk structure train_dataset = TaskMixture([ - Nemotron(categories=["stem"], split="train", stop=117000), # 25.4% of 460K = 117K - Nemotron(categories=["math"], split="train", stop=79000), # 17.1% of 460K = 79K - Nemotron(categories=["chat"], split="train", stop=207000), # 44.9% of 460K = 207K - Nemotron(categories=["code"], split="train", stop=57000), # 12.5% of 460K = 57K + Nemotron(categories=["stem"], split="train", stop=151800), + Nemotron(categories=["math"], split="train", stop=151800), + Nemotron(categories=["chat"], split="train", stop=4600), + Nemotron(categories=["code"], split="train", stop=151800), MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple choice problems GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these ]) # total: 117K + 79K + 207K + 57K + 100K + 8K = 568K rows (same as SmolTalk) + # For validation, match SmolTalk validation set structure val_dataset = TaskMixture([ - 
Nemotron(categories=["stem"], split="train", start=117000, stop=124500), # 7.5K - Nemotron(categories=["math"], split="train", start=79000, stop=84000), # 5K - Nemotron(categories=["chat"], split="train", start=207000, stop=220500), # 13.5K - Nemotron(categories=["code"], split="train", start=57000, stop=61000), # 4K + Nemotron(categories=["stem"], split="train", start=151800, stop=155000), + Nemotron(categories=["math"], split="train", start=151800, stop=155000), + Nemotron(categories=["chat"], split="train", start=4600, stop=10000), + Nemotron(categories=["code"], split="train", start=151800, stop=155000), MMLU(subset="all", split="test", stop=5200), # 5.2K rows to match train ratios GSM8K(subset="main", split="test", stop=420), # 420 rows to match train ratios - ]) # total: 7.5K + 5K + 13.5K + 4K + 5.2K + 0.42K = 35.6K rows + ]) # total: 3.2K + 3.2K + 5.4K + 3.2K + 5.2K + 0.42K = 20.6K rows (similar to SmolTalk) else: raise ValueError(f"Unknown dataset_choice: {dataset_choice}. Must be 'smoltalk' or 'nemotron'") # DataLoader is defined here, it emits inputs, targets : 2D tensors of shape (device_batch_size, max_seq_len) @@ -329,7 +330,7 @@ print0(f"Minimum validation bpb: {min_val_bpb:.4f}") # Log to report if not dry_run: from nanochat.report import get_report - get_report().log(section="Midtraining", data=[ + get_report(exp_name=run).log(section="Midtraining", data=[ user_config, # CLI args { # stats about the training setup "Number of iterations": step,