mirror of
https://github.com/karpathy/nanochat.git
synced 2026-03-19 19:33:15 +00:00
make sure to use active params in scaling laws
This commit is contained in:
parent
a5e51a93ae
commit
5422d3a132
|
|
@ -335,6 +335,10 @@ class GPT(nn.Module):
|
|||
|
||||
Returns a dict with counts for each parameter group, so downstream analysis
|
||||
can experiment with which combination gives the cleanest scaling laws.
|
||||
|
||||
For MoE, 'active_*' fields count only the parameters active per token
|
||||
(top_k out of num_experts routed experts, plus shared experts).
|
||||
Following DeepSeek convention of reporting both total and active params.
|
||||
"""
|
||||
# Count each group separately (mirrors the grouping in setup_optimizers)
|
||||
wte = sum(p.numel() for p in self.transformer.wte.parameters())
|
||||
|
|
@ -344,13 +348,24 @@ class GPT(nn.Module):
|
|||
scalars = self.resid_lambdas.numel() + self.x0_lambdas.numel()
|
||||
total = wte + value_embeds + lm_head + transformer_matrices + scalars
|
||||
assert total == sum(p.numel() for p in self.parameters()), "Parameter count mismatch"
|
||||
# MoE: only top_k/num_experts fraction of routed expert params active per token
|
||||
# Shared expert is always active so its params stay in the active count
|
||||
expert_hidden = self.transformer.h[0].moe.expert_hidden_dim
|
||||
routed_params_per_layer = self.config.num_experts * 2 * self.config.n_embd * expert_hidden
|
||||
inactive_per_layer = routed_params_per_layer * (self.config.num_experts - self.config.top_k) // self.config.num_experts
|
||||
moe_inactive = inactive_per_layer * self.config.n_layer
|
||||
active_transformer_matrices = transformer_matrices - moe_inactive
|
||||
active_total = total - moe_inactive
|
||||
return {
|
||||
'wte': wte,
|
||||
'value_embeds': value_embeds,
|
||||
'lm_head': lm_head,
|
||||
'transformer_matrices': transformer_matrices,
|
||||
'active_transformer_matrices': active_transformer_matrices,
|
||||
'scalars': scalars,
|
||||
'moe_inactive': moe_inactive,
|
||||
'total': total,
|
||||
'active_total': active_total,
|
||||
}
|
||||
|
||||
def setup_optimizer(self, unembedding_lr=0.004, embedding_lr=0.2, matrix_lr=0.02, weight_decay=0.0, adam_betas=(0.8, 0.95), scalar_lr=0.5):
|
||||
|
|
|
|||
|
|
@ -261,8 +261,9 @@ print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
|
|||
# We've already initialized the model so we have Params. Optimal Tokens is now simply target-param-data-ratio * Params
|
||||
def get_scaling_params(m):
    """Return the parameter count used for scaling-law calculations.

    As for which params to use exactly, transformer matrices + lm_head gives
    the cleanest scaling laws (see dev/LOG.md Jan 27, 2026). For MoE models we
    use *active* params (only the top_k routed experts plus the shared expert,
    not all experts), following the DeepSeek convention of reporting activated
    parameters.

    Args:
        m: the model; must expose num_scaling_params() returning a dict with
           at least 'active_transformer_matrices' and 'lm_head' counts.

    Returns:
        int: active transformer-matrix params + lm_head params.
    """
    params_counts = m.num_scaling_params()
    # Fix: the previous version first computed the dense count
    # ('transformer_matrices') and then immediately overwrote it — a dead
    # store. Only the active-params expression is kept.
    scaling_params = params_counts['active_transformer_matrices'] + params_counts['lm_head']
    return scaling_params
|
||||
# Count the params that the scaling law is fit against, then derive the
# compute-optimal token budget from the target params:data ratio.
num_scaling_params = get_scaling_params(model)
# Optimal tokens for the model we are about to train.
target_tokens = int(num_scaling_params * args.target_param_data_ratio)
|
|
|
|||
Loading…
Reference in New Issue
Block a user