From 5422d3a132b0431eaa87009c69a12483209d296e Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Thu, 19 Feb 2026 02:46:36 +0000
Subject: [PATCH] make sure to use active params in scaling laws

---
 nanochat/gpt.py       | 15 +++++++++++++++
 scripts/base_train.py |  3 ++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index 6b91936..855f1cf 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -335,6 +335,10 @@ class GPT(nn.Module):
         Returns a dict with counts for each parameter group, so downstream
         analysis can experiment with which combination gives the cleanest
         scaling laws.
+
+        For MoE, 'active_*' fields count only the parameters active per token
+        (top_k out of num_experts routed experts, plus shared experts).
+        Following DeepSeek convention of reporting both total and active params.
         """
         # Count each group separately (mirrors the grouping in setup_optimizers)
         wte = sum(p.numel() for p in self.transformer.wte.parameters())
@@ -344,13 +348,24 @@ class GPT(nn.Module):
         scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel()
         total = wte + value_embeds + lm_head + transformer_matrices + scalars
         assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch"
+        # MoE: only top_k/num_experts fraction of routed expert params active per token
+        # Shared expert is always active so its params stay in the active count
+        expert_hidden = self.transformer.h[0].moe.expert_hidden_dim
+        routed_params_per_layer = self.config.num_experts * 2 * self.config.n_embd * expert_hidden
+        inactive_per_layer = routed_params_per_layer * (self.config.num_experts - self.config.top_k) // self.config.num_experts
+        moe_inactive = inactive_per_layer * self.config.n_layer
+        active_transformer_matrices = transformer_matrices - moe_inactive
+        active_total = total - moe_inactive
         return {
             'wte': wte,
             'value_embeds': value_embeds,
             'lm_head': lm_head,
             'transformer_matrices': transformer_matrices,
+            'active_transformer_matrices': active_transformer_matrices,
             'scalars': scalars,
+            'moe_inactive': moe_inactive,
             'total': total,
+            'active_total': active_total,
         }
 
     def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5):
diff --git a/scripts/base_train.py b/scripts/base_train.py
index 304c529..9dceadc 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -261,8 +261,9 @@ print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
 # We've already initialized the model so we have Params. Optimal Tokens is now simply target-param-data-ratio * Params
 def get_scaling_params(m):
     # As for which params to use exactly, transformer matrices + lm_head gives cleanest scaling laws (see dev/LOG.md Jan 27, 2026)
+    # For MoE, use active params (only top_k routed experts + shared, not all experts)
     params_counts = m.num_scaling_params()
-    scaling_params = params_counts['transformer_matrices'] + params_counts['lm_head']
+    scaling_params = params_counts['active_transformer_matrices'] + params_counts['lm_head']
     return scaling_params
 num_scaling_params = get_scaling_params(model)
 target_tokens = int(args.target_param_data_ratio * num_scaling_params) # optimal tokens for the model we are about to train