mirror of
https://github.com/karpathy/nanochat.git
synced 2026-03-13 16:33:41 +00:00
Change default sliding window pattern to L when FA3 is not available
This commit is contained in:
parent
e527521a3f
commit
fdaebf22cc
|
|
@@ -50,7 +50,7 @@ parser.add_argument("--depth", type=int, default=20, help="depth of the Transfor
|
|||
parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = depth * aspect_ratio")
|
||||
parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention")
|
||||
parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length")
|
||||
-parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')")
|
||||
+parser.add_argument("--window-pattern", type=str, default="SSSL" if HAS_FA3 else "L", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')")
|
||||
# Training horizon (only one used, in order of precedence)
|
||||
parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)")
|
||||
parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)")
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user