From 637ecf6f12f5effc0912d260980346ea0926043e Mon Sep 17 00:00:00 2001
From: Mithun Kannaa
Date: Wed, 15 Apr 2026 13:54:22 +0530
Subject: [PATCH] Implement Exclusive Self Attention in the forward pass.
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds Exclusive Self-Attention (XSA) from “Exclusive Self Attention” by
Shuangfei Zhai (Apple), which removes the component of the attention output
that is aligned with the token's own value vector, eliminating the attention
similarity bias and improving context modeling. This is a small change
(four added lines) in the attention block.
---
 nanochat/gpt.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index 07a1eae8..536720a6 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -119,7 +119,10 @@ class CausalSelfAttention(nn.Module):
         # Advance position after last layer processes
         if self.layer_idx == kv_cache.n_layers - 1:
             kv_cache.advance(T)
-
+        # XSA (Exclusive Self Attention): project out each token's own value direction
+        Vn = F.normalize(v, dim=-1)
+        Vn = Vn.repeat_interleave(self.n_head // self.n_kv_head, dim=2)
+        y = y - (y * Vn).sum(dim=-1, keepdim=True) * Vn
         # Re-assemble the heads and project back to residual stream
         y = y.contiguous().view(B, T, -1)
         y = self.c_proj(y)
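
For readers unfamiliar with the operation, here is a minimal standalone sketch
(not part of the patch) of the projection the added lines perform. The tensor
layout (B, T, heads, head_dim) is assumed from the surrounding view() call, and
the shape values below are hypothetical example numbers, not nanochat defaults.

    import torch
    import torch.nn.functional as F

    # Hypothetical example shapes, only for illustration.
    B, T, n_head, n_kv_head, head_dim = 2, 4, 8, 2, 16

    y = torch.randn(B, T, n_head, head_dim)     # per-head attention output
    v = torch.randn(B, T, n_kv_head, head_dim)  # per-token value vectors (GQA KV heads)

    # Normalize each token's own value vector and broadcast KV heads to query heads.
    Vn = F.normalize(v, dim=-1)
    Vn = Vn.repeat_interleave(n_head // n_kv_head, dim=2)

    # XSA: subtract the component of y along Vn, i.e. y' = y - (y . Vn) Vn.
    y_xsa = y - (y * Vn).sum(dim=-1, keepdim=True) * Vn

    # The result is orthogonal to each token's own value direction.
    assert torch.allclose((y_xsa * Vn).sum(dim=-1),
                          torch.zeros(B, T, n_head), atol=1e-5)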