mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-06 12:22:18 +00:00
mqa -> gqa to reduce confusion
This commit is contained in:
parent
f66a780f68
commit
bc1fca39f3
|
|
@ -8,7 +8,7 @@ Notable features:
|
||||||
- norm after token embedding
|
- norm after token embedding
|
||||||
- no learnable params in rmsnorm
|
- no learnable params in rmsnorm
|
||||||
- no bias in linear layers
|
- no bias in linear layers
|
||||||
- Multi-Query Attention (MQA) support for more efficient inference
|
- Grouped-Query Attention (GQA) support for more efficient inference
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import math
|
import math
|
||||||
|
|
@ -29,7 +29,7 @@ class GPTConfig:
|
||||||
vocab_size: int = 50304
|
vocab_size: int = 50304
|
||||||
n_layer: int = 12
|
n_layer: int = 12
|
||||||
n_head: int = 6 # number of query heads
|
n_head: int = 6 # number of query heads
|
||||||
n_kv_head: int = 6 # number of key/value heads (MQA)
|
n_kv_head: int = 6 # number of key/value heads (GQA)
|
||||||
n_embd: int = 768
|
n_embd: int = 768
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user