From 9822cc7424aabffd0601f4ddfb465dba269f9765 Mon Sep 17 00:00:00 2001
From: Sofie Van Landeghem <svlandeg@users.noreply.github.com>
Date: Mon, 13 Apr 2026 14:03:18 +0200
Subject: [PATCH] use nn.init and initialize smear gate's weight as well

---
 nanochat/gpt.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index b2656508..96010419 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -237,8 +237,9 @@ class GPT(nn.Module):
         # Decaying x0 init: earlier layers get more input embedding blending
         for i in range(n_layer):
             self.x0_lambdas.data[i] = 0.20 - (0.15 * i / max(n_layer - 1, 1))
-        self.smear_lambda.fill_(0.0)
-        self.backout_lambda.fill_(0.2)
+        torch.nn.init.zeros_(self.smear_lambda)
+        torch.nn.init.constant_(self.backout_lambda, 0.2)
+        torch.nn.init.uniform_(self.smear_gate.weight, 0.0, 0.02)
 
         # Value embeddings (init like c_v: uniform with same std)
         for ve in self.value_embeds.values():