From 5422d3a132b0431eaa87009c69a12483209d296e Mon Sep 17 00:00:00 2001
From: Andrej Karpathy
Date: Thu, 19 Feb 2026 02:46:36 +0000
Subject: [PATCH] make sure to use active params in scaling laws

---
 nanochat/gpt.py       | 15 +++++++++++++++
 scripts/base_train.py |  3 ++-
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index 6b91936..855f1cf 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -335,6 +335,10 @@ class GPT(nn.Module):
         Returns a dict with counts for each parameter group, so downstream
         analysis can experiment with which combination gives the cleanest
         scaling laws.
+
+        For MoE, 'active_*' fields count only the parameters active per token
+        (top_k out of num_experts routed experts, plus shared experts).
+        Following DeepSeek convention of reporting both total and active params.
         """
         # Count each group separately (mirrors the grouping in setup_optimizers)
         wte = sum(p.numel() for p in self.transformer.wte.parameters())
@@ -344,13 +348,24 @@ class GPT(nn.Module):
         scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel()
         total = wte + value_embeds + lm_head + transformer_matrices + scalars
         assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch"
+        # MoE: only top_k/num_experts fraction of routed expert params active per token
+        # Shared expert is always active so its params stay in the active count
+        expert_hidden = self.transformer.h[0].moe.expert_hidden_dim
+        routed_params_per_layer = self.config.num_experts * 2 * self.config.n_embd * expert_hidden
+        inactive_per_layer = routed_params_per_layer * (self.config.num_experts - self.config.top_k) // self.config.num_experts
+        moe_inactive = inactive_per_layer * self.config.n_layer
+        active_transformer_matrices = transformer_matrices - moe_inactive
+        active_total = total - moe_inactive
         return {
             'wte': wte,
             'value_embeds': value_embeds,
             'lm_head': lm_head,
             'transformer_matrices': transformer_matrices,
+            'active_transformer_matrices': active_transformer_matrices,
             'scalars': scalars,
+            'moe_inactive': moe_inactive,
             'total': total,
+            'active_total': active_total,
         }
 
     def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5):
diff --git a/scripts/base_train.py b/scripts/base_train.py
index 304c529..9dceadc 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -261,8 +261,9 @@ print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
 # We've already initialized the model so we have Params. Optimal Tokens is now simply target-param-data-ratio * Params
 def get_scaling_params(m):
     # As for which params to use exactly, transformer matrices + lm_head gives cleanest scaling laws (see dev/LOG.md Jan 27, 2026)
+    # For MoE, use active params (only top_k routed experts + shared, not all experts)
     params_counts = m.num_scaling_params()
-    scaling_params = params_counts['transformer_matrices'] + params_counts['lm_head']
+    scaling_params = params_counts['active_transformer_matrices'] + params_counts['lm_head']
     return scaling_params
 num_scaling_params = get_scaling_params(model)
 target_tokens = int(args.target_param_data_ratio * num_scaling_params) # optimal tokens for the model we are about to train