""" Midtrain the model. Same as pretraining but simpler. Run as: python -m scripts.mid_train Or torchrun for training: torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=16 """ from collections import deque import os os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" import time import wandb import torch import torch.nn.functional as F from contextlib import nullcontext from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type from nanochat.tokenizer import get_token_bytes from nanochat.checkpoint_manager import save_checkpoint from nanochat.loss_eval import evaluate_bpb from nanochat.checkpoint_manager import load_model from nanochat.manager import MANAGER import torch.distributed as dist from tasks.common import TaskMixture from tasks.gsm8k import GSM8K from tasks.mmlu import MMLU from tasks.smoltalk import SmolTalk from tasks.customjson import CustomJSON from tasks.spellingbee import SimpleSpelling, SpellingBee # ----------------------------------------------------------------------------- run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb) device_type = "" # cuda|cpu|mps (empty => autodetect) model_tag = None # model tag to load the model from (base model or midtrained model) step = None # step to load the model from (base model or midtrained model) dtype = "bfloat16" num_iterations = -1 # explicit number of steps of the optimization (-1 = disable) num_epochs = 1 # number of full passes over the midtraining dataset (only used if num_iterations < 0) max_seq_len = 2048 device_batch_size = 32 unembedding_lr = 0.004 embedding_lr = 0.2 matrix_lr = 0.02 init_lr_frac = 1.0 # initial learning rate is this fraction of the base learning rate learning_rate = 3e-4 betas = (0.9, 0.95) weight_decay = 0.0 warmup_ratio = 0.0 # LR warmup (ratio of total training progress in [0, 1]). 0 disables warmup. 
# Debug knobs for MoE loss components (defaults preserve existing behavior)
disable_aux_loss = False
disable_router_z_loss = False
override_aux_loss_weight = -1.0 # <0 means do not override
override_router_z_loss_weight = -1.0 # <0 means do not override
eval_every = 150 # -1 = disable
eval_tokens = 20*524288
total_batch_size = 524288
dry_run = 0 # dry_run=1 is for experiments: we will log to wandb but we won't write checkpoints or report
config_keys = [k for k,v in globals().items() if not k.startswith('_') and isinstance(v, (int, float, bool, str))]
exec(open(os.path.join('nanochat', 'configurator.py')).read()) # overrides from command line or config file
user_config = {k: globals()[k] for k in config_keys} # possibly useful for logging
# -----------------------------------------------------------------------------

# Compute init
device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0

# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-mid", name=run, config=user_config)

# Load the model and tokenizer
model, tokenizer, meta = load_model("base", device, phase="train", model_tag=model_tag, step=step)

# Optional overrides for MoE auxiliary losses (useful when total loss plateaus)
if hasattr(model, "config"):
    if disable_aux_loss and getattr(model.config, "n_exp", 1) > 1:
        print0("Disabling MoE aux loss for this midtraining run")
        model.config.use_aux_loss = False
    if disable_router_z_loss and getattr(model.config, "n_exp", 1) > 1:
        print0("Disabling MoE router z loss for this midtraining run")
        model.config.use_router_z_loss = False
    if override_aux_loss_weight >= 0 and getattr(model.config, "n_exp", 1) > 1:
        print0(f"Overriding MoE aux_loss_weight to {override_aux_loss_weight}")
        model.config.aux_loss_weight = float(override_aux_loss_weight)
    if override_router_z_loss_weight >= 0 and getattr(model.config, "n_exp", 1) > 1:
        print0(f"Overriding MoE router_z_loss_weight to {override_router_z_loss_weight}")
        model.config.router_z_loss_weight = float(override_router_z_loss_weight)
print0(f"MoE training loss is configured to use aux_loss: {getattr(model.config, 'use_aux_loss', False)} with weight {getattr(model.config, 'aux_loss_weight', 0.0)}, router_z_loss: {getattr(model.config, 'use_router_z_loss', False)} with weight {getattr(model.config, 'router_z_loss_weight', 0.0)}")

pretrain_batch_size = meta.get("device_batch_size", None)
if pretrain_batch_size is not None and device_batch_size > pretrain_batch_size:
    print0(f"FOOTGUN WARNING: base model training used device_batch_size {pretrain_batch_size}, did you pass in a good --device_batch_size to this script?")
orig_model = model
model = torch.compile(model, dynamic=False)
depth = model.config.n_layer
# num_flops_per_token = model.estimate_flops(max_seq_len)
tokens_per_fwdbwd = device_batch_size * max_seq_len # tokens per iteration for a single rank
world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
assert total_batch_size % world_tokens_per_fwdbwd == 0
grad_accum_steps = total_batch_size // world_tokens_per_fwdbwd
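# Worked example of the accounting above (hypothetical numbers): with 8 ranks, device_batch_size=16
# and max_seq_len=2048, each rank contributes 16*2048 = 32,768 tokens per micro-batch, all ranks
# together 262,144, so total_batch_size=524,288 requires grad_accum_steps = 524288 // 262144 = 2
# micro-steps per optimizer step.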
print0(f"Tokens / micro-batch / rank: {device_batch_size} x {max_seq_len} = {tokens_per_fwdbwd:,}")
print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
token_bytes = get_token_bytes(device=device)

# Sanity print: tokenizer ids must fit inside model vocab (esp. when vocab_size=50304 padded GPT-2)
print0(f"Model vocab_size: {model.config.vocab_size}")
print0(f"Tokenizer vocab_size: {tokenizer.get_vocab_size()}")

# Initialize the Optimizer (AdamW for all parameters) - BEFORE DDP wrapping (matching nanoMoE)
adamw_optimizer = model.configure_optimizers(
    weight_decay=weight_decay,
    learning_rate=learning_rate,
    betas=betas,
    device_type=device_type,
)
optimizers = [adamw_optimizer]
# # Initialize the Optimizer (Muon for Linear layers, AdamW for embedding and lm_head)
# optimizers = model.setup_optimizers(unembedding_lr=unembedding_lr, embedding_lr=embedding_lr, matrix_lr=matrix_lr, weight_decay=weight_decay)
# adamw_optimizer, muon_optimizer = optimizers
# # Override the initial learning rate as a fraction of the base learning rate
# for opt in optimizers:
#     for group in opt.param_groups:
#         group["lr"] = group["lr"] * init_lr_frac
#         group["initial_lr"] = group["lr"] # save the initial learning rate so we can decay easily later

# Midtraining data mixture and DataLoader
base_dir = get_base_dir()
identity_conversations_filepath = os.path.join(base_dir, "identity_conversations.jsonl")
train_dataset = TaskMixture([
    SmolTalk(split="train"), # 460K rows of general conversations
    MMLU(subset="auxiliary_train", split="train"), # 100K rows of multiple choice problems drawn from ARC, MC_TEST, OBQA, RACE
    GSM8K(subset="main", split="train"), # 8K rows teaching simple math and (calculator) tool use
    CustomJSON(filepath=identity_conversations_filepath), # 1000 rows of synthetic identity conversations
    CustomJSON(filepath=identity_conversations_filepath), # let's do 2 epochs of these
    SimpleSpelling(size=200000, split="train"), # 200K rows of Simple Spelling (e.g. spell the word 'apple')
    SpellingBee(size=80000, split="train"), # 80K rows of Spelling Bee (e.g. how many 'r' are in 'strawberry'?)
]) # total: 460K + 100K + 8K + 2x1K + 200K + 80K = 850K rows
val_dataset = TaskMixture([
    SmolTalk(split="test"), # 24K rows in test set
    MMLU(subset="all", split="test", stop=5200), # 14K rows in test set, use only 5.2K to match the train ratios
    GSM8K(subset="main", split="test", stop=420), # 1.32K rows in test set, use only 420 to match the train ratios
]) # total: 24K + 14K + 1.32K ~= 39K rows

# DataLoader is defined here, it emits inputs, targets : 2D tensors of shape (device_batch_size, max_seq_len)
# A big problem is that we don't know the final num_iterations in advance. So we create
# these global variables and update them from within the data generator.
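# Sketch of how the generator below packs data (descriptive only): each conversation is rendered
# to token ids and appended to a deque; once the deque holds device_batch_size*max_seq_len + 1
# tokens, they are copied into `scratch`, and inputs = scratch[:-1], targets = scratch[1:],
# each reshaped to (device_batch_size, max_seq_len). Conversations therefore flow into the
# token stream back to back, with no padding between them.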
last_step = False # we will toggle this to True when we reach the end of the dataset
approx_progress = 0.0 # will go from 0 to 1 over the course of training
current_epoch = 1 # will go from 1 to num_epochs

def mid_data_generator(split):
    global last_step, approx_progress, current_epoch
    assert split in {"train", "val"}, "split must be 'train' or 'val'"
    dataset = train_dataset if split == "train" else val_dataset
    dataset_size = len(dataset)
    assert dataset_size > 0
    needed_tokens = device_batch_size * max_seq_len + 1 # to form one training batch of inputs,targets
    token_buffer = deque()
    # A lightweight resumable state dict (similar spirit to base_train.py)
    dataloader_state_dict = {"split": split}
    # CUDA supports memory pinning for faster transfers between CPU and GPU:
    scratch = torch.empty(needed_tokens, dtype=torch.int64, pin_memory=(device_type == "cuda"))
    cursor = ddp_rank # increments by ddp_world_size each time, so each rank processes unique documents
    it = 0 # iteration counter
    while True:
        # Accumulate enough tokens for one iteration before yielding
        while len(token_buffer) < needed_tokens:
            conversation = dataset[cursor]
            ids, _ = tokenizer.render_conversation(conversation)
            token_buffer.extend(ids)
            cursor += ddp_world_size
            if cursor >= dataset_size:
                cursor -= dataset_size # wrap around for another epoch
                if split == "train":
                    # Track epochs (unless num_iterations explicitly caps steps)
                    if num_iterations < 0:
                        current_epoch += 1
                        if current_epoch > num_epochs:
                            last_step = True # terminate after requested epochs
                    else:
                        last_step = True # legacy behavior when num_iterations is set elsewhere
        # Stopping condition to respect num_iterations, if given
        it += 1
        if num_iterations > 0 and it >= num_iterations:
            last_step = True # toggle last_step to True, which will terminate the training loop
        # Build up inputs/targets and yield
        for i in range(needed_tokens):
            scratch[i] = token_buffer.popleft()
        inputs_cpu = scratch[:-1].to(dtype=torch.int32)
        targets_cpu = scratch[1:]
        # Early token-id range check on CPU to avoid opaque torch.compile CUDA OOB asserts.
        # Only do this for a few batches to keep overhead minimal.
        if it <= 5:
            min_id = int(inputs_cpu.min().item())
            max_id = int(inputs_cpu.max().item())
            vocab_limit = int(model.config.vocab_size)
            if not (0 <= min_id and max_id < vocab_limit):
                raise ValueError(
                    f"Token id out of range: min={min_id}, max={max_id}, expected within [0, {vocab_limit}). "
                    f"Tokenizer vocab_size={int(tokenizer.get_vocab_size())}. "
                    "This usually means the tokenizer used for midtraining doesn't match the model vocab."
                )
        inputs = inputs_cpu.view(device_batch_size, max_seq_len).to(device=device, dtype=torch.int32, non_blocking=True)
        targets = targets_cpu.view(device_batch_size, max_seq_len).to(device=device, dtype=torch.int64, non_blocking=True)
        if split == "train":
            if num_iterations > 0:
                approx_progress = it / num_iterations # calculate progress from the max number of iterations
            else:
                # progress across epochs, in [0, 1]
                denom = max(float(num_epochs), 1.0)
                approx_progress = min(((current_epoch - 1) + (cursor / dataset_size)) / denom, 1.0)
        dataloader_state_dict.update({
            "cursor": int(cursor),
            "it": int(it),
            "current_epoch": int(current_epoch),
            "last_step": bool(last_step),
            "approx_progress": float(approx_progress),
            # Keep the remaining buffered tokens for exact resume semantics.
"token_buffer": list(token_buffer), }) yield inputs, targets, dataloader_state_dict train_loader = mid_data_generator("train") build_val_loader = lambda: mid_data_generator("val") progress = 0 # will go from 0 to 1 over the course of the epoch # Learning rate scheduler def get_lr_multiplier(progress): # Warmup: linearly ramp from 0 -> 1 over the first `warmup_ratio` portion of training. if warmup_ratio and warmup_ratio > 0: warmup_mult = min(max(progress / warmup_ratio, 0.0), 1.0) else: warmup_mult = 1.0 # Decay: first 80% of training no decay, then linearly ramp down to 0. decay_mult = 1 if progress < 0.8 else 1 - (progress - 0.8) / 0.2 return warmup_mult * decay_mult # Momentum scheduler for Muon optimizer def get_muon_momentum(it): frac = min(it / 300, 1) momentum = (1 - frac) * 0.85 + frac * 0.95 return momentum # ----------------------------------------------------------------------------- # Training loop x, y, dataloader_state_dict = next(train_loader) # prefetch the very first batch of data min_val_bpb = float("inf") smooth_train_loss = 0 # EMA of training loss smooth_train_ce_loss = 0 # EMA of CE loss ema_beta = 0.9 # EMA decay factor total_training_time = 0 # total wall-clock time of training val_bpb = None # populated during evaluation (keep defined for checkpoint metadata) step = 0 while True: # flops_so_far = num_flops_per_token * total_batch_size * step # Synchronize last_step across all ranks to avoid hangs in the distributed setting if ddp: last_step_tensor = torch.tensor(last_step, dtype=torch.int32, device=device) dist.all_reduce(last_step_tensor, op=dist.ReduceOp.MAX) last_step = bool(last_step_tensor.item()) # once in a while: evaluate the val bpb (all ranks participate) if eval_every > 0 and (last_step or step % eval_every == 0): model.eval() val_loader = build_val_loader() eval_steps = eval_tokens // (device_batch_size * max_seq_len * ddp_world_size) with autocast_ctx: val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes) print0(f"Step {step:05d} | Validation bpb: {val_bpb:.4f}") if val_bpb < min_val_bpb: min_val_bpb = val_bpb wandb_run.log({ "step": step, # "total_training_flops": flops_so_far, "total_training_time": total_training_time, "val/bpb": val_bpb, }) model.train() # save checkpoint at the end of the run (only on master process) if master_process and last_step and not dry_run: # output_dirname = f"d{depth}" # e.g. d12 if disable_aux_loss: aux_tag = "noaux" else: aux_tag = "aux" if disable_router_z_loss: z_tag = "noz" else: z_tag = "z" # output_dirname = f"d{depth}_{aux_tag}_{z_tag}_lr{learning_rate}_model{model_tag}" output_dirname = f"d{depth}_lr{learning_rate}_model{model_tag}" checkpoint_dir = os.path.join(base_dir, "mid_checkpoints", output_dirname) # Save metadata in the same shape as base_train.py for consistency. 
        model_config_for_save = {}
        for k in [
            # Core GPT config
            "block_size", "vocab_size", "n_layer", "n_head", "n_kv_head", "n_embd", "dropout", "bias",
            # MoE config (if present)
            "n_exp", "top_k", "use_aux_loss", "use_router_z_loss", "use_noisy_top_k",
            "aux_loss_weight", "router_z_loss_weight", "train_capacity", "eval_capacity",
            "min_capacity", "stride", "use_switch_tfm_init", "switch_tfm_init_scale", "router_use_full_prec",
        ]:
            if hasattr(orig_model.config, k):
                v = getattr(orig_model.config, k)
                if isinstance(v, (int, float, bool, str)):
                    model_config_for_save[k] = v
        save_checkpoint(
            checkpoint_dir,
            step,
            orig_model.state_dict(),
            adamw_optimizer.state_dict(), # TODO: make sure saving across ranks is done correctly
            {
                "step": step,
                "model_config": model_config_for_save,
                "user_config": user_config, # inputs to the training script
                "device_batch_size": device_batch_size,
                "max_seq_len": max_seq_len,
                "loop_state": {
                    "min_val_bpb": min_val_bpb,
                    "smooth_train_loss": smooth_train_loss,
                    "smooth_train_ce_loss": smooth_train_ce_loss,
                    "total_training_time": total_training_time,
                    "progress": progress,
                    "current_epoch": int(current_epoch),
                },
                "dataloader_state_dict": dataloader_state_dict,
            }
        )

    if last_step:
        break

    # -------------------------------------------------------------------------
    # single training step
    # evaluate the gradient
    synchronize()
    t0 = time.time()
    total_loss_accum = 0.0
    ce_loss_accum = 0.0
    aux_loss_contrib_accum = 0.0
    router_z_loss_contrib_accum = 0.0
    for micro_step in range(grad_accum_steps):
        with autocast_ctx:
            logits, total_loss = model(x, y) # returns (logits, loss)
            # Cross-entropy only (language modeling objective)
            ce_loss = F.cross_entropy(
                logits.view(-1, logits.size(-1)),
                y.view(-1),
                ignore_index=-1,
            )
        # Cache logging values (average across micro-steps)
        total_loss_accum += float(total_loss.detach().item())
        ce_loss_accum += float(ce_loss.detach().item())
        aux_sum = getattr(MANAGER, "last_aux_loss_sum", 0.0)
        z_sum = getattr(MANAGER, "last_router_z_loss_sum", 0.0)
        # Convert sums into the *weighted* contribution that is actually added to total_loss
        if getattr(model.config, "n_exp", 1) > 1 and getattr(model.config, "use_aux_loss", False):
            if torch.is_tensor(aux_sum):
                aux_loss_contrib_accum += float(getattr(model.config, "aux_loss_weight", 0.0)) * aux_sum.detach().item()
            else:
                aux_loss_contrib_accum += float(getattr(model.config, "aux_loss_weight", 0.0)) * float(aux_sum)
        if getattr(model.config, "n_exp", 1) > 1 and getattr(model.config, "use_router_z_loss", False):
            if torch.is_tensor(z_sum):
                router_z_loss_contrib_accum += float(getattr(model.config, "router_z_loss_weight", 0.0)) * z_sum.detach().item()
            else:
                router_z_loss_contrib_accum += float(getattr(model.config, "router_z_loss_weight", 0.0)) * float(z_sum)
        loss = total_loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
        loss.backward()
        x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
    progress = max(progress, approx_progress) # only increase progress monotonically
    # micro-step averages for logging
    train_total_loss = total_loss_accum / grad_accum_steps
    train_ce_loss = ce_loss_accum / grad_accum_steps
    train_aux_loss_contrib = aux_loss_contrib_accum / grad_accum_steps
    train_router_z_loss_contrib = router_z_loss_contrib_accum / grad_accum_steps
    # step the optimizer(s)
    lrm = get_lr_multiplier(progress)
    current_lr = learning_rate * init_lr_frac * lrm
    for group in adamw_optimizer.param_groups:
        group["lr"] = current_lr
    adamw_optimizer.step()
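    # Worked example of the LR schedule above (defaults, purely illustrative): at progress=0.9,
    # lrm = 1 - (0.9 - 0.8) / 0.2 = 0.5, so current_lr = 3e-4 * 1.0 * 0.5 = 1.5e-4; no warmup
    # is applied because warmup_ratio defaults to 0.0.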
    model.zero_grad(set_to_none=True)
    synchronize()
    t1 = time.time()
    dt = t1 - t0
    # -------------------------------------------------------------------------
    # State
    step += 1

    # logging
    smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_total_loss # EMA the total loss
    smooth_train_ce_loss = ema_beta * smooth_train_ce_loss + (1 - ema_beta) * train_ce_loss # EMA the CE loss
    debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
    debiased_smooth_ce_loss = smooth_train_ce_loss / (1 - ema_beta**(step + 1)) # debias the EMA
    pct_done = 100 * progress
    tok_per_sec = int(total_batch_size / dt)
    # flops_per_sec = num_flops_per_token * total_batch_size / dt
    promised_flops_per_sec_h100 = 989e12 * ddp_world_size # bfloat16 H100 SXM and without 2:4 sparsity
    # mfu = 100 * flops_per_sec / promised_flops_per_sec_h100 # in %
    if step > 10:
        total_training_time += dt # only count the time after the first 10 steps
    print0(
        f"step {step:05d} ({pct_done:.2f}%) | "
        f"loss: {debiased_smooth_loss:.6f} | ce: {debiased_smooth_ce_loss:.6f} | "
        f"aux: {train_aux_loss_contrib:.6f} | z: {train_router_z_loss_contrib:.6f} | "
        f"lr: {current_lr:.6g} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | total time: {total_training_time/60:.2f}m"
    )
    if step % 10 == 0:
        wandb_run.log({
            "step": step,
            # "total_training_flops": flops_so_far,
            "total_training_time": total_training_time,
            "train/loss": debiased_smooth_loss,
            "train/ce_loss": debiased_smooth_ce_loss,
            "train/aux_loss_contrib": train_aux_loss_contrib,
            "train/router_z_loss_contrib": train_router_z_loss_contrib,
            "train/lr": current_lr,
            "train/lrm": lrm,
            "train/dt": dt,
            "train/tok_per_sec": tok_per_sec,
            # "train/mfu": mfu,
        })

# print a few more stats
print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
print0(f"Total training time: {total_training_time/60:.2f}m")
print0(f"Minimum validation bpb: {min_val_bpb:.4f}")

# Log to report
if not dry_run:
    from nanochat.report import get_report
    get_report().log(section="Midtraining", data=[
        user_config, # CLI args
        { # stats about the training setup
            "Number of iterations": step,
            "DDP world size": ddp_world_size,
        },
        { # stats about training outcomes
            "Minimum validation bpb": min_val_bpb,
        }
    ])

# cleanup
wandb_run.finish() # wandb run finish
compute_cleanup()
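# Example invocations (illustrative only; the flags map to the config variables defined at the top
# of this file and are parsed by the configurator, values here are arbitrary):
#   python -m scripts.mid_train --run=mid_moe_debug --num_epochs=2
#   torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- \
#       --device_batch_size=16 --disable_aux_loss=True --override_router_z_loss_weight=0.001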