From 94b73ad29aa21da6267e93db6035223f15f692fc Mon Sep 17 00:00:00 2001 From: Marcin Bogdanski Date: Fri, 3 Apr 2026 20:39:55 +0000 Subject: [PATCH 1/3] fix: initialize smear and backout lambdas in init_weights --- nanochat/gpt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 0b822e41..b2656508 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -237,6 +237,8 @@ class GPT(nn.Module): # Decaying x0 init: earlier layers get more input embedding blending for i in range(n_layer): self.x0_lambdas.data[i] = 0.20 - (0.15 * i / max(n_layer - 1, 1)) + self.smear_lambda.fill_(0.0) + self.backout_lambda.fill_(0.2) # Value embeddings (init like c_v: uniform with same std) for ve in self.value_embeds.values(): From 9822cc7424aabffd0601f4ddfb465dba269f9765 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 13 Apr 2026 14:03:18 +0200 Subject: [PATCH 2/3] use nn.init and initialize smear gate's weight as well --- nanochat/gpt.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index b2656508..96010419 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -237,8 +237,9 @@ class GPT(nn.Module): # Decaying x0 init: earlier layers get more input embedding blending for i in range(n_layer): self.x0_lambdas.data[i] = 0.20 - (0.15 * i / max(n_layer - 1, 1)) - self.smear_lambda.fill_(0.0) - self.backout_lambda.fill_(0.2) + torch.nn.init.zeros_(self.smear_lambda) + torch.nn.init.constant_(self.backout_lambda, 0.2) + torch.nn.init.uniform_(self.smear_gate.weight, 0.0, 0.02) # Value embeddings (init like c_v: uniform with same std) for ve in self.value_embeds.values(): From a3ca42a678c0090e5d4f6b6d5be5782efdd0a225 Mon Sep 17 00:00:00 2001 From: Sofie Van Landeghem Date: Mon, 13 Apr 2026 14:17:23 +0200 Subject: [PATCH 3/3] add comment --- nanochat/gpt.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/nanochat/gpt.py b/nanochat/gpt.py index 96010419..07a1eae8 100644 --- a/nanochat/gpt.py +++ b/nanochat/gpt.py @@ -237,6 +237,8 @@ class GPT(nn.Module): # Decaying x0 init: earlier layers get more input embedding blending for i in range(n_layer): self.x0_lambdas.data[i] = 0.20 - (0.15 * i / max(n_layer - 1, 1)) + + # Smear/backout scalars and smear gate must be explicitly initialized torch.nn.init.zeros_(self.smear_lambda) torch.nn.init.constant_(self.backout_lambda, 0.2) torch.nn.init.uniform_(self.smear_gate.weight, 0.0, 0.02)