make sure to use active params in scaling laws

This commit is contained in:
Andrej Karpathy 2026-02-19 02:46:36 +00:00
parent a5e51a93ae
commit 5422d3a132
2 changed files with 17 additions and 1 deletion

View File

@ -335,6 +335,10 @@ class GPT(nn.Module):
Returns a dict with counts for each parameter group, so downstream analysis
can experiment with which combination gives the cleanest scaling laws.
For MoE, 'active_*' fields count only the parameters active per token
(top_k out of num_experts routed experts, plus shared experts).
Following DeepSeek convention of reporting both total and active params.
"""
# Count each group separately (mirrors the grouping in setup_optimizers)
wte = sum(p.numel() for p in self.transformer.wte.parameters())
@ -344,13 +348,24 @@ class GPT(nn.Module):
scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel()
total = wte + value_embeds + lm_head + transformer_matrices + scalars
assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch"
# MoE: only top_k/num_experts fraction of routed expert params active per token
# Shared expert is always active so its params stay in the active count
expert_hidden = self.transformer.h[0].moe.expert_hidden_dim
routed_params_per_layer = self.config.num_experts * 2 * self.config.n_embd * expert_hidden
inactive_per_layer = routed_params_per_layer * (self.config.num_experts - self.config.top_k) // self.config.num_experts
moe_inactive = inactive_per_layer * self.config.n_layer
active_transformer_matrices = transformer_matrices - moe_inactive
active_total = total - moe_inactive
return {
'wte': wte,
'value_embeds': value_embeds,
'lm_head': lm_head,
'transformer_matrices': transformer_matrices,
'active_transformer_matrices': active_transformer_matrices,
'scalars': scalars,
'moe_inactive': moe_inactive,
'total': total,
'active_total': active_total,
}
def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5):

View File

@ -261,8 +261,9 @@ print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
# We've already initialized the model so we have Params. Optimal Tokens is now simply target-param-data-ratio * Params
def get_scaling_params(m):
    """Return the parameter count used to set the compute-optimal token budget.

    Transformer matrices + lm_head give the cleanest scaling laws
    (see dev/LOG.md Jan 27, 2026). For MoE models, only the active
    parameters are counted (top_k routed experts plus shared experts,
    not all experts).
    """
    counts = m.num_scaling_params()
    return counts['active_transformer_matrices'] + counts['lm_head']
num_scaling_params = get_scaling_params(model)
target_tokens = int(args.target_param_data_ratio * num_scaling_params) # optimal tokens for the model we are about to train