mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-17 13:17:35 +00:00
add comment
This commit is contained in:
parent
9822cc7424
commit
a3ca42a678
|
|
@ -237,6 +237,8 @@ class GPT(nn.Module):
|
|||
# Decaying x0 init: earlier layers get more input embedding blending (x0_lambdas falls linearly from 0.20 at layer 0 to 0.05 at the last layer)
|
||||
for i in range(n_layer):
|
||||
self.x0_lambdas.data[i] = 0.20 - (0.15 * i / max(n_layer - 1, 1))
|
||||
|
||||
# Smear/backout scalars and smear gate must be explicitly initialized
|
||||
torch.nn.init.zeros_(self.smear_lambda)
|
||||
torch.nn.init.constant_(self.backout_lambda, 0.2)
|
||||
torch.nn.init.uniform_(self.smear_gate.weight, 0.0, 0.02)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user