From 637ecf6f12f5effc0912d260980346ea0926043e Mon Sep 17 00:00:00 2001
From: Mithun Kannaa
Date: Wed, 15 Apr 2026 13:54:22 +0530
Subject: [PATCH] Implement Exclusive Self Attention in the forward pass.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds Exclusive Self-Attention (XSA) from “Exclusive Self Attention” by
Shuangfei Zhai (Apple), which removes the component of the attention output
that is aligned with the token's own value vector, eliminating the attention
similarity bias and improving context modeling. This is a small change
(four added lines) in the attention block.
---
 nanochat/gpt.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index 07a1eae8..536720a6 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -119,7 +119,10 @@ class CausalSelfAttention(nn.Module):
         # Advance position after last layer processes
         if self.layer_idx == kv_cache.n_layers - 1:
             kv_cache.advance(T)
-
+        # XSA (Exclusive Self Attention): project out each token's own value direction
+        Vn = F.normalize(v, dim=-1)
+        Vn = Vn.repeat_interleave(self.n_head // self.n_kv_head, dim=2)
+        y = y - (y * Vn).sum(dim=-1, keepdim=True) * Vn
         # Re-assemble the heads and project back to residual stream
         y = y.contiguous().view(B, T, -1)
         y = self.c_proj(y)
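
For readers unfamiliar with the operation, here is a minimal standalone sketch
(not part of the patch) of the projection the added lines perform. The tensor
layout (B, T, heads, head_dim) is assumed from the surrounding view() call, and
the shape values below are hypothetical example numbers, not nanochat defaults.

    import torch
    import torch.nn.functional as F

    # Hypothetical example shapes, only for illustration.
    B, T, n_head, n_kv_head, head_dim = 2, 4, 8, 2, 16

    y = torch.randn(B, T, n_head, head_dim)     # per-head attention output
    v = torch.randn(B, T, n_kv_head, head_dim)  # per-token value vectors (GQA KV heads)

    # Normalize each token's own value vector and broadcast KV heads to query heads.
    Vn = F.normalize(v, dim=-1)
    Vn = Vn.repeat_interleave(n_head // n_kv_head, dim=2)

    # XSA: subtract the component of y along Vn, i.e. y' = y - (y . Vn) Vn.
    y_xsa = y - (y * Vn).sum(dim=-1, keepdim=True) * Vn

    # The result is orthogonal to each token's own value direction.
    assert torch.allclose((y_xsa * Vn).sum(dim=-1),
                          torch.zeros(B, T, n_head), atol=1e-5)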