diff --git a/nanochat/gpt.py b/nanochat/gpt.py
index 04ee5c5..1799159 100644
--- a/nanochat/gpt.py
+++ b/nanochat/gpt.py
@@ -58,8 +58,8 @@ def apply_rotary_emb(x, cos, sin):
     assert x.ndim == 4 # multihead attention
     d = x.shape[3] // 2
     x1, x2 = x[..., :d], x[..., d:] # split up last dim into two halves
-    y1 = x1 * cos + x2 * sin # rotate pairs of dims
-    y2 = x1 * (-sin) + x2 * cos
+    y1 = x1 * cos - x2 * sin # rotate pairs of dims
+    y2 = x1 * sin + x2 * cos
     return torch.cat([y1, y2], 3)

 class CausalSelfAttention(nn.Module):