mirror of
https://github.com/karpathy/nanochat.git
synced 2026-04-05 07:05:28 +00:00
Merge 7ac837cff8 into 1ec0a34779
This commit is contained in:
commit
19ea3598b3
BIN
log/2026-02-01_113215 - profiler traces.zip
Normal file
BIN
log/2026-02-01_113215 - profiler traces.zip
Normal file
Binary file not shown.
507
log/base_train-gc-fixes.py
Normal file
507
log/base_train-gc-fixes.py
Normal file
|
|
@ -0,0 +1,507 @@
|
|||
"""
|
||||
Train model. From root directory of the project, run as:
|
||||
|
||||
python -m scripts.base_train
|
||||
|
||||
or distributed as:
|
||||
|
||||
torchrun --nproc_per_node=8 -m scripts.base_train
|
||||
|
||||
If you are only on CPU/Macbook, you'll want to train a much much smaller LLM. Example:
|
||||
python -m scripts.base_train --depth=4 --max-seq-len=512 --device-batch-size=1 --eval-tokens=512 --core-metric-every=-1 --total-batch-size=512 --num-iterations=20
|
||||
"""
|
||||
|
||||
import os
|
||||
os.environ["PYTORCH_ALLOC_CONF"] = "expandable_segments:True"
|
||||
import argparse
|
||||
import time
|
||||
import gc
|
||||
from contextlib import nullcontext
|
||||
|
||||
# GC timing callback to detect if garbage collection is causing training stalls
|
||||
def _gc_callback(phase, info):
|
||||
if phase == "start":
|
||||
_gc_callback.start_time = time.perf_counter()
|
||||
elif phase == "stop":
|
||||
duration_ms = (time.perf_counter() - _gc_callback.start_time) * 1000
|
||||
if duration_ms > 10: # Only log if GC took >10ms
|
||||
rank = getattr(_gc_callback, 'rank', '?')
|
||||
print(f"[GC rank{rank}] gen{info['generation']}: {duration_ms:.1f}ms collected {info.get('collected', '?')} objects")
|
||||
_gc_callback.start_time = 0
|
||||
_gc_callback.rank = '?' # Will be set after compute_init
|
||||
gc.callbacks.append(_gc_callback)
|
||||
|
||||
import wandb
|
||||
import torch
|
||||
|
||||
from nanochat.gpt import GPT, GPTConfig
|
||||
from nanochat.dataloader import tokenizing_distributed_data_loader_bos_bestfit, tokenizing_distributed_data_loader_with_state_bos_bestfit
|
||||
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type, get_peak_flops
|
||||
from nanochat.tokenizer import get_tokenizer, get_token_bytes
|
||||
from nanochat.checkpoint_manager import save_checkpoint, load_checkpoint
|
||||
from nanochat.loss_eval import evaluate_bpb
|
||||
from nanochat.engine import Engine
|
||||
from nanochat.flash_attention import HAS_FA3
|
||||
from scripts.base_eval import evaluate_core
|
||||
print_banner()
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# CLI arguments
|
||||
parser = argparse.ArgumentParser(description="Pretrain base model")
|
||||
# Logging
|
||||
parser.add_argument("--run", type=str, default="dummy", help="wandb run name ('dummy' disables wandb logging)")
|
||||
# Runtime
|
||||
parser.add_argument("--device-type", type=str, default="", help="cuda|cpu|mps (empty = autodetect)")
|
||||
# Model architecture
|
||||
parser.add_argument("--depth", type=int, default=20, help="depth of the Transformer model")
|
||||
parser.add_argument("--aspect-ratio", type=int, default=64, help="model_dim = depth * aspect_ratio")
|
||||
parser.add_argument("--head-dim", type=int, default=128, help="target head dimension for attention")
|
||||
parser.add_argument("--max-seq-len", type=int, default=2048, help="max context length")
|
||||
parser.add_argument("--window-pattern", type=str, default="SSSL", help="sliding window pattern tiled across layers: L=full, S=half context (e.g. 'SSL')")
|
||||
# Training horizon (only one used, in order of precedence)
|
||||
parser.add_argument("--num-iterations", type=int, default=-1, help="explicit number of optimization steps (-1 = disable)")
|
||||
parser.add_argument("--target-flops", type=float, default=-1.0, help="calculate num_iterations to reach target_flops (-1 = disable)")
|
||||
parser.add_argument("--target-param-data-ratio", type=float, default=10.5, help="calculate num_iterations to maintain data:param ratio (Chinchilla=20, -1 = disable)")
|
||||
# Optimization
|
||||
parser.add_argument("--device-batch-size", type=int, default=32, help="per-device batch size")
|
||||
parser.add_argument("--total-batch-size", type=int, default=524288, help="total batch size in tokens")
|
||||
parser.add_argument("--embedding-lr", type=float, default=0.3, help="learning rate for embedding parameters (Adam)")
|
||||
parser.add_argument("--unembedding-lr", type=float, default=0.004, help="learning rate for unembedding parameters (Adam)")
|
||||
parser.add_argument("--weight-decay", type=float, default=0.2, help="cautious weight decay for the Muon optimizer (for weights)")
|
||||
parser.add_argument("--matrix-lr", type=float, default=0.02, help="learning rate for matrix parameters (Muon)")
|
||||
parser.add_argument("--scalar-lr", type=float, default=0.5, help="learning rate for scalars (resid_lambdas, x0_lambdas)")
|
||||
parser.add_argument("--adam-beta1", type=float, default=0.8, help="Adam beta1 for embedding/unembedding")
|
||||
parser.add_argument("--adam-beta2", type=float, default=0.95, help="Adam beta2 for embedding/unembedding")
|
||||
parser.add_argument("--warmup-ratio", type=float, default=0.0, help="ratio of iterations for LR warmup")
|
||||
parser.add_argument("--warmdown-ratio", type=float, default=0.5, help="ratio of iterations for LR warmdown")
|
||||
parser.add_argument("--final-lr-frac", type=float, default=0.0, help="final LR as fraction of initial LR")
|
||||
parser.add_argument("--resume-from-step", type=int, default=-1, help="resume training from this step (-1 = disable)")
|
||||
# Evaluation
|
||||
parser.add_argument("--eval-every", type=int, default=250, help="evaluate val bpb every N steps (-1 = disable)")
|
||||
parser.add_argument("--eval-tokens", type=int, default=20*524288, help="number of tokens to evaluate val loss on")
|
||||
parser.add_argument("--core-metric-every", type=int, default=2000, help="evaluate CORE metric every N steps (-1 = disable)")
|
||||
parser.add_argument("--core-metric-max-per-task", type=int, default=500, help="examples per task for CORE metric")
|
||||
parser.add_argument("--sample-every", type=int, default=2000, help="sample from model every N steps (-1 = disable)")
|
||||
parser.add_argument("--save-every", type=int, default=-1, help="save checkpoints every N steps (-1 = only at end)")
|
||||
# Output
|
||||
parser.add_argument("--model-tag", type=str, default=None, help="override model tag for checkpoint directory name")
|
||||
args = parser.parse_args()
|
||||
user_config = vars(args).copy() # for logging
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
# Compute init
|
||||
device_type = autodetect_device_type() if args.device_type == "" else args.device_type
|
||||
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
|
||||
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
|
||||
_gc_callback.rank = ddp_rank # Store rank for GC log printouts
|
||||
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
|
||||
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
|
||||
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
|
||||
if device_type == "cuda":
|
||||
gpu_device_name = torch.cuda.get_device_name(0)
|
||||
gpu_peak_flops = get_peak_flops(gpu_device_name)
|
||||
print0(f"GPU: {gpu_device_name} | Peak FLOPS (BF16): {gpu_peak_flops:.2e}")
|
||||
else:
|
||||
gpu_peak_flops = float('inf') # MFU not meaningful for CPU/MPS
|
||||
|
||||
# wandb logging init
|
||||
use_dummy_wandb = args.run == "dummy" or not master_process
|
||||
if use_dummy_wandb:
|
||||
wandb_run = DummyWandb()
|
||||
else:
|
||||
try:
|
||||
wandb_run = wandb.init(project="nanochat", name=args.run, config=user_config)
|
||||
except wandb.errors.UsageError as e:
|
||||
print0(f"Warning: wandb initialization failed ({e}), logging disabled. Run 'wandb login' to enable.")
|
||||
wandb_run = DummyWandb()
|
||||
|
||||
# Flash Attention status
|
||||
if HAS_FA3:
|
||||
print0("✓ Using Flash Attention 3 (Hopper GPU detected), efficient, new and awesome.")
|
||||
else:
|
||||
print0("!" * 80)
|
||||
print0("WARNING: Flash Attention 3 not available, using PyTorch SDPA fallback")
|
||||
print0("WARNING: Training will be less efficient without FA3")
|
||||
if args.window_pattern != "L":
|
||||
print0(f"WARNING: SDPA has no support for sliding window attention (window_pattern='{args.window_pattern}'). Your GPU utilization will be terrible.")
|
||||
print0("WARNING: Recommend using --window-pattern L for full context attention without alternating sliding window patterns.")
|
||||
print0("!" * 80)
|
||||
|
||||
# Tokenizer will be useful for evaluation, also we need the vocab size
|
||||
tokenizer = get_tokenizer()
|
||||
token_bytes = get_token_bytes(device=device)
|
||||
vocab_size = tokenizer.get_vocab_size()
|
||||
print0(f"Vocab size: {vocab_size:,}")
|
||||
|
||||
# Model kwargs are derived from the desired depth of the model
|
||||
# We nudge model_dim up to the nearest multiple of head_dim to ensure clean division
|
||||
# (FA3 requires head_dim divisible by 8, and this guarantees head_dim == args.head_dim exactly)
|
||||
# (For very small depths, this gives a slight "unfair" advantage to models with odd depths)
|
||||
num_layers = args.depth
|
||||
base_dim = args.depth * args.aspect_ratio
|
||||
model_dim = ((base_dim + args.head_dim - 1) // args.head_dim) * args.head_dim
|
||||
num_heads = model_dim // args.head_dim
|
||||
num_kv_heads = num_heads # default is 1:1 GQA (Group Query Attention) ratio (i.e. GQA is disabled)
|
||||
head_dim = model_dim // num_heads
|
||||
print0(f"num_layers: {num_layers}")
|
||||
print0(f"model_dim: {model_dim} (base: {base_dim}, nudge: {model_dim - base_dim:+d})")
|
||||
print0(f"num_heads: {num_heads}")
|
||||
print0(f"head_dim: {head_dim}")
|
||||
print0(f"num_kv_heads: {num_kv_heads}")
|
||||
|
||||
# Optimizer / data / training length related hyperparameters
|
||||
# figure out the needed gradient accumulation to reach the desired total batch size
|
||||
tokens_per_fwdbwd = args.device_batch_size * args.max_seq_len # tokens per iteration for a single rank
|
||||
world_tokens_per_fwdbwd = tokens_per_fwdbwd * ddp_world_size # total tokens per iteration for all ranks
|
||||
assert args.total_batch_size % world_tokens_per_fwdbwd == 0
|
||||
grad_accum_steps = args.total_batch_size // world_tokens_per_fwdbwd
|
||||
print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_len} = {tokens_per_fwdbwd:,}")
|
||||
print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
|
||||
print0(f"Total batch size {args.total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
|
||||
|
||||
# Batch size scaling for learning rates (hyperparameters were tuned at reference batch size 2^19)
|
||||
batch_lr_scale = 1.0
|
||||
reference_batch_size = 2**19
|
||||
batch_ratio = args.total_batch_size / reference_batch_size
|
||||
if batch_ratio != 1.0:
|
||||
# SGD: linear scaling with batch size is standard (not used in nanochat)
|
||||
# AdamW: sqrt scaling is standard
|
||||
# Muon: sqrt scaling is an assumption - not fully studied, but it's a second-order-ish optimizer
|
||||
batch_lr_scale = batch_ratio ** 0.5
|
||||
print0(f"Scaling LRs by {batch_lr_scale:.4f} for batch size {args.total_batch_size:,} (reference: {reference_batch_size:,})")
|
||||
|
||||
# Weight decay is tuned at d12 and its scaling seems to be \propto 1/channels^2 (or equivalently, \propto 1/depth^2 due to constant aspect ratio)
|
||||
weight_decay_scaled = args.weight_decay * (12 / args.depth)**2
|
||||
if args.depth != 12:
|
||||
print0(f"Scaling weight decay from {args.weight_decay:.6f} to {weight_decay_scaled:.6f} for depth {args.depth}")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Initialize the Model
|
||||
|
||||
# Create a new model with random weights
|
||||
model_config_kwargs = dict(sequence_len=args.max_seq_len, vocab_size=vocab_size, n_layer=num_layers, n_head=num_heads, n_kv_head=num_kv_heads, n_embd=model_dim, window_pattern=args.window_pattern)
|
||||
with torch.device("meta"):
|
||||
# All tensors are created as meta tensors (they have shape/dtype but no data)
|
||||
model_config = GPTConfig(**model_config_kwargs)
|
||||
model = GPT(model_config)
|
||||
model.to_empty(device=device) # All tensors get storage on target device but with uninitialized (garbage) data
|
||||
model.init_weights() # All tensors get initialized
|
||||
|
||||
# If we are resuming, overwrite the model parameters with those of the checkpoint
|
||||
base_dir = get_base_dir()
|
||||
output_dirname = args.model_tag if args.model_tag else f"d{args.depth}" # e.g. d12
|
||||
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
|
||||
resuming = args.resume_from_step != -1
|
||||
if resuming:
|
||||
print0(f"Resuming optimization from step {args.resume_from_step}")
|
||||
model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, args.resume_from_step, device, load_optimizer=True, rank=ddp_rank)
|
||||
model.load_state_dict(model_data, strict=True, assign=True)
|
||||
del model_data # free up this memory after the copy
|
||||
|
||||
orig_model = model # original, uncompiled model, for saving raw model state_dict and for inference/evaluation (because the shapes may change shape)
|
||||
model = torch.compile(model, dynamic=False) # the inputs to model will never change shape so dynamic=False is safe
|
||||
|
||||
# Detailed parameter counts
|
||||
param_counts = orig_model.num_scaling_params()
|
||||
print0(f"Parameter counts:")
|
||||
for key, value in param_counts.items():
|
||||
print0(f"{key:24s}: {value:,}")
|
||||
num_params = param_counts['total']
|
||||
num_scaling_params = param_counts['transformer_matrices'] + param_counts['lm_head'] # determined to give the cleanest scaling laws, see dev/LOG.md Jan 27, 2026
|
||||
num_flops_per_token = model.estimate_flops()
|
||||
print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
|
||||
|
||||
# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order)
|
||||
assert args.num_iterations > 0 or args.target_param_data_ratio > 0 or args.target_flops > 0
|
||||
if args.num_iterations > 0:
|
||||
num_iterations = args.num_iterations
|
||||
print0(f"Using user-provided number of iterations: {num_iterations:,}")
|
||||
elif args.target_flops > 0:
|
||||
# calculate the number of iterations from the target flops
|
||||
num_iterations = round(args.target_flops / (num_flops_per_token * args.total_batch_size))
|
||||
print0(f"Calculated number of iterations from target FLOPs: {num_iterations:,}")
|
||||
elif args.target_param_data_ratio > 0:
|
||||
# calculate the number of iterations from the target param data ratio (use scaling params per Kaplan et al.)
|
||||
target_tokens = int(args.target_param_data_ratio * num_scaling_params)
|
||||
num_iterations = target_tokens // args.total_batch_size
|
||||
print0(f"Calculated number of iterations from target data:param ratio: {num_iterations:,}")
|
||||
else:
|
||||
raise ValueError("No training horizon specified")
|
||||
total_tokens = args.total_batch_size * num_iterations
|
||||
print0(f"Total number of training tokens: {total_tokens:,}")
|
||||
print0(f"Tokens : Scaling params ratio: {args.total_batch_size * num_iterations / num_scaling_params:.2f}") # Chinchilla is ~20
|
||||
print0(f"Total training FLOPs estimate: {num_flops_per_token * total_tokens:e}")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Initialize the Optimizer (combined MuonAdamW: Muon for matrix params, AdamW for rest)
|
||||
adam_betas = (args.adam_beta1, args.adam_beta2)
|
||||
optimizer = model.setup_optimizer(
|
||||
unembedding_lr=args.unembedding_lr * batch_lr_scale,
|
||||
embedding_lr=args.embedding_lr * batch_lr_scale,
|
||||
matrix_lr=args.matrix_lr * batch_lr_scale,
|
||||
weight_decay=weight_decay_scaled,
|
||||
adam_betas=adam_betas,
|
||||
scalar_lr=args.scalar_lr * batch_lr_scale,
|
||||
)
|
||||
|
||||
if resuming:
|
||||
optimizer.load_state_dict(optimizer_data)
|
||||
del optimizer_data
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Initialize the DataLoaders for train/val
|
||||
dataloader_resume_state_dict = None if not resuming else meta_data["dataloader_state_dict"]
|
||||
train_loader = tokenizing_distributed_data_loader_with_state_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="train", device=device, resume_state_dict=dataloader_resume_state_dict)
|
||||
build_val_loader = lambda: tokenizing_distributed_data_loader_bos_bestfit(tokenizer, args.device_batch_size, args.max_seq_len, split="val", device=device)
|
||||
x, y, dataloader_state_dict = next(train_loader) # kick off load of the very first batch of data
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Set up hyperparameter schedulers
|
||||
|
||||
# Learning rate scheduler
|
||||
def get_lr_multiplier(it):
|
||||
warmup_iters = round(args.warmup_ratio * num_iterations)
|
||||
warmdown_iters = round(args.warmdown_ratio * num_iterations)
|
||||
if it < warmup_iters:
|
||||
return (it + 1) / warmup_iters
|
||||
elif it <= num_iterations - warmdown_iters:
|
||||
return 1.0
|
||||
else:
|
||||
progress = (num_iterations - it) / warmdown_iters
|
||||
return progress * 1.0 + (1 - progress) * args.final_lr_frac
|
||||
|
||||
# Momentum scheduler for Muon optimizer
|
||||
def get_muon_momentum(it):
|
||||
frac = min(it / 300, 1)
|
||||
momentum = (1 - frac) * 0.85 + frac * 0.95
|
||||
return momentum
|
||||
|
||||
# Weight decay scheduler for Muon optimizer (linear to zero over the course of training)
|
||||
def get_weight_decay(it):
|
||||
return weight_decay_scaled * (1 - it / num_iterations)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Loop state (variables updated by the training loop)
|
||||
|
||||
if not resuming:
|
||||
step = 0
|
||||
val_bpb = None # will be set if eval_every > 0
|
||||
min_val_bpb = float("inf")
|
||||
smooth_train_loss = 0 # EMA of training loss
|
||||
total_training_time = 0 # total wall-clock time of training
|
||||
else:
|
||||
step = meta_data["step"]
|
||||
loop_state = meta_data["loop_state"]
|
||||
val_bpb = meta_data["val_bpb"]
|
||||
min_val_bpb = loop_state["min_val_bpb"]
|
||||
smooth_train_loss = loop_state["smooth_train_loss"]
|
||||
total_training_time = loop_state["total_training_time"]
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Training loop
|
||||
while True:
|
||||
last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
|
||||
flops_so_far = num_flops_per_token * args.total_batch_size * step
|
||||
|
||||
# once in a while: evaluate the val bpb (all ranks participate)
|
||||
if args.eval_every > 0 and (last_step or step % args.eval_every == 0):
|
||||
model.eval()
|
||||
val_loader = build_val_loader()
|
||||
eval_steps = args.eval_tokens // (args.device_batch_size * args.max_seq_len * ddp_world_size)
|
||||
with autocast_ctx:
|
||||
val_bpb = evaluate_bpb(model, val_loader, eval_steps, token_bytes)
|
||||
print0(f"Step {step:05d} | Validation bpb: {val_bpb:.6f}")
|
||||
if val_bpb < min_val_bpb:
|
||||
min_val_bpb = val_bpb
|
||||
wandb_run.log({
|
||||
"step": step,
|
||||
"total_training_flops": flops_so_far,
|
||||
"total_training_time": total_training_time,
|
||||
"val/bpb": val_bpb,
|
||||
})
|
||||
model.train()
|
||||
|
||||
# once in a while: estimate the CORE metric (all ranks participate)
|
||||
# use the original uncompiled model because the inputs keep changing shape
|
||||
results = {}
|
||||
if args.core_metric_every > 0 and (last_step or (step > 0 and step % args.core_metric_every == 0)):
|
||||
model.eval()
|
||||
with autocast_ctx:
|
||||
results = evaluate_core(orig_model, tokenizer, device, max_per_task=args.core_metric_max_per_task)
|
||||
print0(f"Step {step:05d} | CORE metric: {results['core_metric']:.4f}")
|
||||
wandb_run.log({
|
||||
"step": step,
|
||||
"total_training_flops": flops_so_far,
|
||||
"core_metric": results["core_metric"],
|
||||
"centered_results": results["centered_results"],
|
||||
})
|
||||
model.train()
|
||||
|
||||
# once in a while: sample from the model (only on master process)
|
||||
# use the original uncompiled model because the inputs keep changing shape
|
||||
if args.sample_every > 0 and master_process and (last_step or (step > 0 and step % args.sample_every == 0)):
|
||||
model.eval()
|
||||
prompts = [
|
||||
"The capital of France is",
|
||||
"The chemical symbol of gold is",
|
||||
"If yesterday was Friday, then tomorrow will be",
|
||||
"The opposite of hot is",
|
||||
"The planets of the solar system are:",
|
||||
"My favorite color is",
|
||||
"If 5*x + 3 = 13, then x is",
|
||||
]
|
||||
engine = Engine(orig_model, tokenizer) # use orig_model to avoid recompilation
|
||||
for prompt in prompts:
|
||||
tokens = tokenizer(prompt, prepend="<|bos|>")
|
||||
with autocast_ctx:
|
||||
sample, _ = engine.generate_batch(tokens, num_samples=1, max_tokens=16, temperature=0)
|
||||
print0(tokenizer.decode(sample[0]))
|
||||
model.train()
|
||||
|
||||
# save checkpoint: at the end of the run, or every save_every steps, except at the first step or the resume step
|
||||
if last_step or (step > 0 and step != args.resume_from_step and args.save_every > 0 and step % args.save_every == 0):
|
||||
save_checkpoint(
|
||||
checkpoint_dir,
|
||||
step,
|
||||
orig_model.state_dict(), # model parameters
|
||||
optimizer.state_dict(), # optimizer state
|
||||
{ # metadata saved as json
|
||||
"step": step,
|
||||
"val_bpb": val_bpb, # loss at last step
|
||||
"model_config": model_config_kwargs,
|
||||
"user_config": user_config, # inputs to the training script
|
||||
"device_batch_size": args.device_batch_size,
|
||||
"max_seq_len": args.max_seq_len,
|
||||
"dataloader_state_dict": dataloader_state_dict,
|
||||
"loop_state": { # all loop state (other than step) so that we can resume training
|
||||
"min_val_bpb": min_val_bpb,
|
||||
"smooth_train_loss": smooth_train_loss,
|
||||
"total_training_time": total_training_time,
|
||||
},
|
||||
},
|
||||
rank=ddp_rank,
|
||||
)
|
||||
|
||||
# termination conditions (TODO: possibly also add loss explosions etc.)
|
||||
if last_step:
|
||||
break
|
||||
|
||||
# -------------------------------------------------------------------------
|
||||
# single training step
|
||||
# evaluate the gradient
|
||||
synchronize()
|
||||
t0 = time.time()
|
||||
for micro_step in range(grad_accum_steps):
|
||||
with autocast_ctx:
|
||||
loss = model(x, y)
|
||||
train_loss = loss.detach() # for logging
|
||||
loss = loss / grad_accum_steps # each .backward() is a grad sum => normalize loss here
|
||||
loss.backward()
|
||||
x, y, dataloader_state_dict = next(train_loader) # prefetch the next batch while the GPU is busy with forward/backward
|
||||
# step the optimizer
|
||||
lrm = get_lr_multiplier(step)
|
||||
muon_momentum = get_muon_momentum(step)
|
||||
muon_weight_decay = get_weight_decay(step)
|
||||
for group in optimizer.param_groups:
|
||||
group["lr"] = group["initial_lr"] * lrm
|
||||
if group['kind'] == 'muon':
|
||||
group["momentum"] = muon_momentum
|
||||
group["weight_decay"] = muon_weight_decay
|
||||
optimizer.step()
|
||||
model.zero_grad(set_to_none=True)
|
||||
train_loss_f = train_loss.item() # .item() is a CPU-GPU sync point
|
||||
synchronize()
|
||||
t1 = time.time()
|
||||
dt = t1 - t0
|
||||
# -------------------------------------------------------------------------
|
||||
|
||||
# logging (CPU action only)
|
||||
ema_beta = 0.9 # EMA decay factor for some smoothing just for nicer logging
|
||||
smooth_train_loss = ema_beta * smooth_train_loss + (1 - ema_beta) * train_loss_f # EMA the training loss
|
||||
debiased_smooth_loss = smooth_train_loss / (1 - ema_beta**(step + 1)) # debias the EMA
|
||||
pct_done = 100 * step / num_iterations
|
||||
tok_per_sec = int(args.total_batch_size / dt)
|
||||
flops_per_sec = num_flops_per_token * args.total_batch_size / dt
|
||||
mfu = 100 * flops_per_sec / (gpu_peak_flops * ddp_world_size)
|
||||
if step > 10:
|
||||
total_training_time += dt # only count the time after the first 10 steps
|
||||
# Calculate ETA based on average time per step (excluding first 10 steps)
|
||||
steps_done = step - 10
|
||||
if steps_done > 0:
|
||||
avg_time_per_step = total_training_time / steps_done
|
||||
remaining_steps = num_iterations - step
|
||||
eta_seconds = remaining_steps * avg_time_per_step
|
||||
eta_str = f" | eta: {eta_seconds/60:.1f}m"
|
||||
else:
|
||||
eta_str = ""
|
||||
epoch = dataloader_state_dict["epoch"]
|
||||
print0(f"step {step:05d}/{num_iterations:05d} ({pct_done:.2f}%) | loss: {debiased_smooth_loss:.6f} | lrm: {lrm:.2f} | dt: {dt * 1000:.2f}ms | tok/sec: {tok_per_sec:,} | mfu: {mfu:.2f} | epoch: {epoch} | total time: {total_training_time/60:.2f}m{eta_str}")
|
||||
if step % 100 == 0:
|
||||
log_data = {
|
||||
"step": step,
|
||||
"total_training_flops": flops_so_far,
|
||||
"total_training_time": total_training_time,
|
||||
"train/loss": debiased_smooth_loss,
|
||||
"train/lrm": lrm,
|
||||
"train/dt": dt,
|
||||
"train/tok_per_sec": tok_per_sec,
|
||||
"train/mfu": mfu,
|
||||
"train/epoch": epoch,
|
||||
}
|
||||
wandb_run.log(log_data)
|
||||
|
||||
# Set 'first_step_of_run' flag
|
||||
first_step_of_run = (step == 0) or (resuming and step == args.resume_from_step)
|
||||
|
||||
# state update
|
||||
step += 1
|
||||
|
||||
# # TEMP - Bail at 1000 steps for benchmarking.
|
||||
# if step == 1001:
|
||||
# print0(f"Elapsed + ETA: {total_training_time + eta_seconds:.0f}s")
|
||||
# break
|
||||
|
||||
# Help out the garbage collector by flushing garbage and then freezing long-lived objects
|
||||
# This eliminates random ~500ms pauses during training steps as the GC scans ~millions of objects for cycles
|
||||
if first_step_of_run:
|
||||
gc.collect()
|
||||
gc.freeze()
|
||||
gc.disable() # nuclear option: disable GC for the run
|
||||
elif step % 2000 == 0:
|
||||
gc.collect() # manual GC to keep memory usage in check for very long runs
|
||||
|
||||
# print a few more stats
|
||||
print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
|
||||
print0(f"Total training time: {total_training_time/60:.2f}m")
|
||||
if val_bpb is not None:
|
||||
print0(f"Minimum validation bpb: {min_val_bpb:.6f}")
|
||||
|
||||
# Log to report
|
||||
from nanochat.report import get_report
|
||||
get_report().log(section="Base model training", data=[
|
||||
user_config, # CLI args
|
||||
{ # stats about the training setup
|
||||
"Number of parameters": num_params,
|
||||
"Number of FLOPs per token": f"{num_flops_per_token:e}",
|
||||
"Calculated number of iterations": num_iterations,
|
||||
"Number of training tokens": total_tokens,
|
||||
"Tokens : Scaling params ratio": args.total_batch_size * num_iterations / num_scaling_params,
|
||||
"DDP world size": ddp_world_size,
|
||||
"warmup_ratio": args.warmup_ratio,
|
||||
"warmdown_ratio": args.warmdown_ratio,
|
||||
"final_lr_frac": args.final_lr_frac,
|
||||
},
|
||||
{ # stats about training outcomes
|
||||
"Minimum validation bpb": min_val_bpb if val_bpb is not None else None,
|
||||
"Final validation bpb": val_bpb,
|
||||
"CORE metric estimate": results.get("core_metric", None),
|
||||
"MFU %": f"{mfu:.2f}%",
|
||||
"Total training flops": f"{flops_so_far:e}",
|
||||
"Total training time": f"{total_training_time/60:.2f}m",
|
||||
"Peak memory usage": f"{get_max_memory() / 1024 / 1024:.2f}MiB",
|
||||
}
|
||||
])
|
||||
|
||||
# cleanup
|
||||
wandb_run.finish() # wandb run finish
|
||||
compute_cleanup()
|
||||
239
log/dataloader-gc-fixes.py
Normal file
239
log/dataloader-gc-fixes.py
Normal file
|
|
@ -0,0 +1,239 @@
|
|||
"""
|
||||
Distributed dataloaders for pretraining.
|
||||
|
||||
Two implementations are provided:
|
||||
|
||||
1. Original (tokenizing_distributed_data_loader):
|
||||
- Streams tokens into a flat buffer, reshapes to (B, T)
|
||||
- Rows may start mid-document (no guaranteed BOS at position 0)
|
||||
- 100% token utilization, simple and efficient
|
||||
|
||||
2. BOS-aligned bestfit (tokenizing_distributed_data_loader_bos_bestfit):
|
||||
- Every row starts with BOS token
|
||||
- Documents packed using best-fit algorithm to minimize cropping
|
||||
- When no document fits remaining space, crops a document to fill exactly
|
||||
- 100% utilization (no padding), ~35% tokens cropped at T=2048
|
||||
|
||||
The tradeoff: BOS-aligned loses ~35% of tokens to cropping, but ensures that
|
||||
there are fewer "confusing" tokens in the train/val batches as every token can
|
||||
now attend back to the BOS token and sees the full context of the document.
|
||||
(2) is the new default if you have enough data.
|
||||
Fallback to (1) if you have very limited data AND long documents.
|
||||
"""
|
||||
|
||||
import torch
|
||||
import pyarrow.parquet as pq
|
||||
|
||||
from nanochat.common import get_dist_info
|
||||
from nanochat.dataset import list_parquet_files
|
||||
|
||||
def _document_batches(split, resume_state_dict, tokenizer_batch_size):
|
||||
"""
|
||||
Infinite iterator over document batches (list of text strings) from parquet files.
|
||||
|
||||
Handles DDP sharding and approximate resume. Each yield is (text_batch, (pq_idx, rg_idx, epoch))
|
||||
where text_batch is a list of document strings, indices track position for resumption,
|
||||
and epoch counts how many times we've cycled through the dataset (starts at 1).
|
||||
"""
|
||||
ddp, ddp_rank, ddp_local_rank, ddp_world_size = get_dist_info()
|
||||
|
||||
parquet_paths = list_parquet_files()
|
||||
assert len(parquet_paths) != 0, "No dataset parquet files found, did you run dataset.py?"
|
||||
parquet_paths = parquet_paths[:-1] if split == "train" else parquet_paths[-1:]
|
||||
|
||||
resume_pq_idx = resume_state_dict["pq_idx"] if resume_state_dict is not None else 0
|
||||
resume_rg_idx = resume_state_dict["rg_idx"] if resume_state_dict is not None else None
|
||||
resume_epoch = resume_state_dict.get("epoch", 1) if resume_state_dict is not None else 1
|
||||
first_pass = True
|
||||
pq_idx = resume_pq_idx
|
||||
epoch = resume_epoch
|
||||
|
||||
while True: # iterate infinitely (multi-epoch)
|
||||
pq_idx = resume_pq_idx if first_pass else 0
|
||||
while pq_idx < len(parquet_paths):
|
||||
filepath = parquet_paths[pq_idx]
|
||||
pf = pq.ParquetFile(filepath)
|
||||
# Start from resume point if resuming on same file, otherwise from DDP rank
|
||||
if first_pass and (resume_rg_idx is not None) and (pq_idx == resume_pq_idx):
|
||||
base_idx = resume_rg_idx // ddp_world_size
|
||||
base_idx += 1 # advance by 1 so we don't repeat data after resuming
|
||||
rg_idx = base_idx * ddp_world_size + ddp_rank
|
||||
if rg_idx >= pf.num_row_groups:
|
||||
pq_idx += 1
|
||||
continue
|
||||
resume_rg_idx = None # only do this once
|
||||
else:
|
||||
rg_idx = ddp_rank
|
||||
while rg_idx < pf.num_row_groups:
|
||||
rg = pf.read_row_group(rg_idx)
|
||||
batch = rg.column('text').to_pylist()
|
||||
for i in range(0, len(batch), tokenizer_batch_size):
|
||||
yield batch[i:i+tokenizer_batch_size], (pq_idx, rg_idx, epoch)
|
||||
rg_idx += ddp_world_size
|
||||
pq_idx += 1
|
||||
first_pass = False
|
||||
epoch += 1
|
||||
|
||||
|
||||
def tokenizing_distributed_data_loader_with_state(tokenizer, B, T, split, tokenizer_threads=4, tokenizer_batch_size=128, device="cuda", resume_state_dict=None):
|
||||
"""
|
||||
Stream pretraining text from parquet files, tokenize, yield training batches.
|
||||
|
||||
This is the original dataloader that streams tokens into a flat buffer and reshapes.
|
||||
Rows may start mid-document (no guaranteed BOS at position 0).
|
||||
|
||||
Supports approximate resume via state_dict.
|
||||
"""
|
||||
assert split in ["train", "val"], "split must be 'train' or 'val'"
|
||||
|
||||
batches = _document_batches(split, resume_state_dict, tokenizer_batch_size)
|
||||
needed_tokens = B * T + 1 # +1 for target at last position
|
||||
bos_token = tokenizer.get_bos_token_id()
|
||||
token_buffer = []
|
||||
pq_idx, rg_idx, epoch = 0, 0, 1
|
||||
|
||||
while True:
|
||||
|
||||
# Accumulate enough tokens
|
||||
while len(token_buffer) < needed_tokens:
|
||||
doc_batch, (pq_idx, rg_idx, epoch) = next(batches)
|
||||
token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads)
|
||||
for tokens in token_lists:
|
||||
token_buffer.extend(tokens)
|
||||
tokens = token_buffer[:needed_tokens] # Read B*T+1 tokens (+1 is only for the target for the last token)
|
||||
token_buffer = token_buffer[B*T:] # Advance by B*T tokens, so we move exactly one window of B*T tokens over
|
||||
|
||||
# Package tokens into inputs and targets, yield
|
||||
use_cuda = device == "cuda"
|
||||
scratch = torch.tensor(tokens, dtype=torch.long, pin_memory=use_cuda)
|
||||
inputs = scratch[:-1].view(B, T).to(device=device, non_blocking=use_cuda)
|
||||
targets = scratch[1:].view(B, T).to(device=device, non_blocking=use_cuda)
|
||||
yield inputs, targets, {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch}
|
||||
|
||||
|
||||
def tokenizing_distributed_data_loader(*args, **kwargs):
|
||||
"""Helper that omits state_dict from yields."""
|
||||
for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state(*args, **kwargs):
|
||||
yield inputs, targets
|
||||
|
||||
|
||||
def tokenizing_distributed_data_loader_with_state_bos_bestfit(
|
||||
tokenizer, B, T, split,
|
||||
tokenizer_threads=4, tokenizer_batch_size=128,
|
||||
device="cuda", resume_state_dict=None,
|
||||
buffer_size=1000
|
||||
):
|
||||
"""
|
||||
BOS-aligned dataloader with Best-Fit Cropping.
|
||||
|
||||
Reduces token waste compared to simple greedy cropping by searching a buffer
|
||||
for documents that fit well, while maintaining 100% utilization (no padding).
|
||||
|
||||
Algorithm for each row:
|
||||
1. From buffered docs, pick the LARGEST doc that fits entirely
|
||||
2. Repeat until no doc fits
|
||||
3. When nothing fits, crop a doc to fill remaining space exactly
|
||||
|
||||
Key properties:
|
||||
- Every row starts with BOS
|
||||
- 100% utilization (no padding, every token is trained on)
|
||||
- Approximately 35% of all tokens are discarded due to cropping
|
||||
"""
|
||||
assert split in ["train", "val"], "split must be 'train' or 'val'"
|
||||
|
||||
row_capacity = T + 1
|
||||
batches = _document_batches(split, resume_state_dict, tokenizer_batch_size)
|
||||
bos_token = tokenizer.get_bos_token_id()
|
||||
pq_idx, rg_idx, epoch = 0, 0, 1
|
||||
|
||||
# Token pool: single tensor holding all buffered tokens
|
||||
# Documents tracked as (start, length) tuples
|
||||
pool = torch.empty(buffer_size * 512, dtype=torch.long)
|
||||
pool_end = 0
|
||||
docs = [] # [(start, length), ...]
|
||||
|
||||
def compact_pool():
|
||||
"""Shift active documents to front of pool, reclaiming space."""
|
||||
nonlocal pool_end
|
||||
if not docs:
|
||||
pool_end = 0
|
||||
return
|
||||
write_pos = 0
|
||||
for i, (start, length) in enumerate(docs):
|
||||
if start != write_pos:
|
||||
pool[write_pos:write_pos + length] = pool[start:start + length].clone()
|
||||
docs[i] = (write_pos, length)
|
||||
write_pos += length
|
||||
pool_end = write_pos
|
||||
|
||||
def refill_buffer():
|
||||
"""Retrieve more docs and add them to the pool"""
|
||||
nonlocal pq_idx, rg_idx, epoch, pool, pool_end
|
||||
doc_batch, (pq_idx, rg_idx, epoch) = next(batches)
|
||||
token_lists = tokenizer.encode(doc_batch, prepend=bos_token, num_threads=tokenizer_threads)
|
||||
# Number of new tokens to store
|
||||
total_new = sum(len(t) for t in token_lists)
|
||||
# If there's not enough space at the end,
|
||||
if pool_end + total_new > pool.size(0):
|
||||
compact_pool() # Try compacting first.
|
||||
# If still not enough,
|
||||
if pool_end + total_new > pool.size(0):
|
||||
# Allocate a new, larger pool.
|
||||
new_size = max(pool.size(0) * 2, pool_end + total_new)
|
||||
new_pool = torch.empty(new_size, dtype=torch.long)
|
||||
new_pool[:pool_end] = pool[:pool_end]
|
||||
pool = new_pool
|
||||
# Write tokens to pool
|
||||
for tokens in token_lists:
|
||||
n = len(tokens)
|
||||
pool[pool_end:pool_end + n] = torch.tensor(tokens, dtype=torch.long)
|
||||
docs.append((pool_end, n))
|
||||
pool_end += n
|
||||
|
||||
# Pre-allocate buffers once
|
||||
use_cuda = device == "cuda"
|
||||
row_buffer = torch.empty((B, row_capacity), dtype=torch.long)
|
||||
inputs = torch.empty((B, T), dtype=torch.long, device=device)
|
||||
targets = torch.empty((B, T), dtype=torch.long, device=device)
|
||||
|
||||
while True:
|
||||
for row_idx in range(B):
|
||||
col = 0
|
||||
while col < row_capacity:
|
||||
# Ensure buffer has documents
|
||||
while len(docs) < buffer_size:
|
||||
refill_buffer()
|
||||
|
||||
remaining = row_capacity - col
|
||||
|
||||
# Find largest doc that fits entirely
|
||||
best_idx = -1
|
||||
best_len = 0
|
||||
for i, (start, length) in enumerate(docs):
|
||||
if length <= remaining and length > best_len:
|
||||
best_idx = i
|
||||
best_len = length
|
||||
|
||||
if best_idx >= 0:
|
||||
start, length = docs.pop(best_idx)
|
||||
row_buffer[row_idx, col:col + length] = pool[start:start + length]
|
||||
col += length
|
||||
else:
|
||||
# No doc fits - crop shortest to fill remaining
|
||||
shortest_idx = min(range(len(docs)), key=lambda i: docs[i][1])
|
||||
start, length = docs.pop(shortest_idx)
|
||||
row_buffer[row_idx, col:col + remaining] = pool[start:start + remaining]
|
||||
col += remaining
|
||||
|
||||
# Copy to GPU
|
||||
inputs.copy_(row_buffer[:, :-1], non_blocking=use_cuda)
|
||||
targets.copy_(row_buffer[:, 1:], non_blocking=use_cuda)
|
||||
|
||||
state_dict = {"pq_idx": pq_idx, "rg_idx": rg_idx, "epoch": epoch}
|
||||
yield inputs, targets, state_dict
|
||||
|
||||
def tokenizing_distributed_data_loader_bos_bestfit(*args, **kwargs):
|
||||
"""Helper that omits state_dict from yields."""
|
||||
for inputs, targets, state_dict in tokenizing_distributed_data_loader_with_state_bos_bestfit(*args, **kwargs):
|
||||
yield inputs, targets
|
||||
|
||||
538
log/optim-mlp-lr-1x2x.py
Normal file
538
log/optim-mlp-lr-1x2x.py
Normal file
|
|
@ -0,0 +1,538 @@
|
|||
"""
|
||||
A nice and efficient mixed AdamW/Muon Combined Optimizer.
|
||||
Usually the embeddings and scalars go into AdamW, and the matrix parameters go into Muon.
|
||||
Two versions are provided (MuonAdamW, DistMuonAdamW), for single GPU and distributed.
|
||||
|
||||
Addapted from: https://github.com/KellerJordan/modded-nanogpt
|
||||
Further contributions from @karpathy and @chrisjmccormick.
|
||||
"""
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
from torch import Tensor
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
"""
|
||||
Good old AdamW optimizer, fused kernel.
|
||||
https://arxiv.org/abs/1711.05101
|
||||
"""
|
||||
|
||||
@torch.compile(dynamic=False, fullgraph=True)
|
||||
def adamw_step_fused(
|
||||
p: Tensor, # (32768, 768) - parameter tensor
|
||||
grad: Tensor, # (32768, 768) - gradient, same shape as p
|
||||
exp_avg: Tensor, # (32768, 768) - first moment, same shape as p
|
||||
exp_avg_sq: Tensor, # (32768, 768) - second moment, same shape as p
|
||||
step_t: Tensor, # () - 0-D CPU tensor, step count
|
||||
lr_t: Tensor, # () - 0-D CPU tensor, learning rate
|
||||
beta1_t: Tensor, # () - 0-D CPU tensor, beta1
|
||||
beta2_t: Tensor, # () - 0-D CPU tensor, beta2
|
||||
eps_t: Tensor, # () - 0-D CPU tensor, epsilon
|
||||
wd_t: Tensor, # () - 0-D CPU tensor, weight decay
|
||||
) -> None:
|
||||
"""
|
||||
Fused AdamW step: weight_decay -> momentum_update -> bias_correction -> param_update
|
||||
All in one compiled graph to eliminate Python overhead between ops.
|
||||
The 0-D CPU tensors avoid recompilation when hyperparameter values change.
|
||||
"""
|
||||
# Weight decay (decoupled, applied before the update)
|
||||
p.mul_(1 - lr_t * wd_t)
|
||||
# Update running averages (lerp_ is cleaner and fuses well)
|
||||
exp_avg.lerp_(grad, 1 - beta1_t)
|
||||
exp_avg_sq.lerp_(grad.square(), 1 - beta2_t)
|
||||
# Bias corrections
|
||||
bias1 = 1 - beta1_t ** step_t
|
||||
bias2 = 1 - beta2_t ** step_t
|
||||
# Compute update and apply
|
||||
denom = (exp_avg_sq / bias2).sqrt() + eps_t
|
||||
step_size = lr_t / bias1
|
||||
p.add_(exp_avg / denom, alpha=-step_size)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
"""
|
||||
Muon optimizer adapted and simplified from modded-nanogpt.
|
||||
https://github.com/KellerJordan/modded-nanogpt
|
||||
|
||||
Background:
|
||||
Newton-Schulz iteration to compute the zeroth power / orthogonalization of G. We opt to use a
|
||||
quintic iteration whose coefficients are selected to maximize the slope at zero. For the purpose
|
||||
of minimizing steps, it turns out to be empirically effective to keep increasing the slope at
|
||||
zero even beyond the point where the iteration no longer converges all the way to one everywhere
|
||||
on the interval. This iteration therefore does not produce UV^T but rather something like US'V^T
|
||||
where S' is diagonal with S_{ii}' ~ Uniform(0.5, 1.5), which turns out not to hurt model
|
||||
performance at all relative to UV^T, where USV^T = G is the SVD.
|
||||
|
||||
Here, an alternative to Newton-Schulz iteration with potentially better convergence properties:
|
||||
Polar Express Sign Method for orthogonalization.
|
||||
https://arxiv.org/pdf/2505.16932
|
||||
by Noah Amsel, David Persson, Christopher Musco, Robert M. Gower.
|
||||
|
||||
Some of the changes in nanochat implementation:
|
||||
- Uses a simpler, more general approach to parameter grouping and stacking
|
||||
- Uses a single fused kernel for the momentum -> polar_express -> variance_reduction -> update step
|
||||
- Makes no assumptions about model architecture (e.g. that attention weights are fused into QKVO format)
|
||||
"""
|
||||
|
||||
# Coefficients for Polar Express (computed for num_iters=5, safety_factor=2e-2, cushion=2)
|
||||
# From https://arxiv.org/pdf/2505.16932
|
||||
polar_express_coeffs = [
|
||||
(8.156554524902461, -22.48329292557795, 15.878769915207462),
|
||||
(4.042929935166739, -2.808917465908714, 0.5000178451051316),
|
||||
(3.8916678022926607, -2.772484153217685, 0.5060648178503393),
|
||||
(3.285753657755655, -2.3681294933425376, 0.46449024233003106),
|
||||
(2.3465413258596377, -1.7097828382687081, 0.42323551169305323),
|
||||
]
|
||||
|
||||
@torch.compile(dynamic=False, fullgraph=True)
|
||||
def muon_step_fused(
|
||||
stacked_grads: Tensor, # (12, 768, 3072) - stacked gradients
|
||||
stacked_params: Tensor, # (12, 768, 3072) - stacked parameters
|
||||
momentum_buffer: Tensor, # (12, 768, 3072) - first moment buffer
|
||||
second_momentum_buffer: Tensor, # (12, 768, 1) or (12, 1, 3072) - factored second moment
|
||||
momentum_t: Tensor, # () - 0-D CPU tensor, momentum coefficient
|
||||
lr_t: Tensor, # () - 0-D CPU tensor, learning rate
|
||||
wd_t: Tensor, # () - 0-D CPU tensor, weight decay
|
||||
beta2_t: Tensor, # () - 0-D CPU tensor, beta2 for second moment
|
||||
ns_steps: int, # 5 - number of Newton-Schulz/Polar Express iterations
|
||||
red_dim: int, # -1 or -2 - reduction dimension for variance
|
||||
) -> None:
|
||||
"""
|
||||
Fused Muon step: momentum -> polar_express -> variance_reduction -> cautious_update
|
||||
All in one compiled graph to eliminate Python overhead between ops.
|
||||
Some of the constants are 0-D CPU tensors to avoid recompilation when values change.
|
||||
"""
|
||||
|
||||
# Nesterov momentum
|
||||
momentum = momentum_t.to(stacked_grads.dtype)
|
||||
momentum_buffer.lerp_(stacked_grads, 1 - momentum)
|
||||
g = stacked_grads.lerp_(momentum_buffer, momentum)
|
||||
|
||||
# Polar express
|
||||
X = g.bfloat16()
|
||||
X = X / (X.norm(dim=(-2, -1), keepdim=True) * 1.02 + 1e-6)
|
||||
if g.size(-2) > g.size(-1): # Tall matrix
|
||||
for a, b, c in polar_express_coeffs[:ns_steps]:
|
||||
A = X.mT @ X
|
||||
B = b * A + c * (A @ A)
|
||||
X = a * X + X @ B
|
||||
else: # Wide matrix (original math)
|
||||
for a, b, c in polar_express_coeffs[:ns_steps]:
|
||||
A = X @ X.mT
|
||||
B = b * A + c * (A @ A)
|
||||
X = a * X + B @ X
|
||||
g = X
|
||||
|
||||
# Variance reduction
|
||||
beta2 = beta2_t.to(g.dtype)
|
||||
v_mean = g.float().square().mean(dim=red_dim, keepdim=True)
|
||||
red_dim_size = g.size(red_dim)
|
||||
v_norm_sq = v_mean.sum(dim=(-2, -1), keepdim=True) * red_dim_size
|
||||
v_norm = v_norm_sq.sqrt()
|
||||
second_momentum_buffer.lerp_(v_mean.to(dtype=second_momentum_buffer.dtype), 1 - beta2)
|
||||
step_size = second_momentum_buffer.clamp_min(1e-10).rsqrt()
|
||||
scaled_sq_sum = (v_mean * red_dim_size) * step_size.float().square()
|
||||
v_norm_new = scaled_sq_sum.sum(dim=(-2, -1), keepdim=True).sqrt()
|
||||
final_scale = step_size * (v_norm / v_norm_new.clamp_min(1e-10))
|
||||
g = g * final_scale.to(g.dtype)
|
||||
|
||||
# Cautious weight decay + parameter update
|
||||
lr = lr_t.to(g.dtype)
|
||||
wd = wd_t.to(g.dtype)
|
||||
mask = (g * stacked_params) >= 0
|
||||
stacked_params.sub_(lr * g + lr * wd * stacked_params * mask)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Single GPU version of the MuonAdamW optimizer.
|
||||
# Used mostly for reference, debugging and testing.
|
||||
|
||||
class MuonAdamW(torch.optim.Optimizer):
|
||||
"""
|
||||
Combined optimizer: Muon for 2D matrix params, AdamW for others, single GPU version.
|
||||
|
||||
AdamW - Fused AdamW optimizer step.
|
||||
|
||||
Muon - MomentUm Orthogonalized by Newton-schulz
|
||||
https://kellerjordan.github.io/posts/muon/
|
||||
|
||||
Muon internally runs standard SGD-momentum, and then performs an orthogonalization post-
|
||||
processing step, in which each 2D parameter's update is replaced with the nearest orthogonal
|
||||
matrix. To efficiently orthogonalize each update, we use a Newton-Schulz iteration, which has
|
||||
the advantage that it can be stably run in bfloat16 on the GPU.
|
||||
|
||||
Some warnings:
|
||||
- The Muon optimizer should not be used for the embedding layer, the final fully connected layer,
|
||||
or any {0,1}-D parameters; those should all be optimized by a standard method (e.g., AdamW).
|
||||
- To use it with 4D convolutional filters, it works well to just flatten their last 3 dimensions.
|
||||
|
||||
Arguments:
|
||||
param_groups: List of dicts, each containing:
|
||||
- 'params': List of parameters
|
||||
- 'kind': 'adamw' or 'muon'
|
||||
- For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay'
|
||||
- For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay'
|
||||
"""
|
||||
def __init__(self, param_groups: list[dict]):
|
||||
super().__init__(param_groups, defaults={})
|
||||
# 0-D CPU tensors to avoid torch.compile recompilation when values change
|
||||
# AdamW tensors
|
||||
self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
# Muon tensors
|
||||
self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
|
||||
def _step_adamw(self, group: dict) -> None:
|
||||
"""
|
||||
AdamW update for each param in the group individually.
|
||||
Lazy init the state, fill in all 0-D tensors, call the fused kernel.
|
||||
"""
|
||||
for p in group['params']:
|
||||
if p.grad is None:
|
||||
continue
|
||||
grad = p.grad
|
||||
state = self.state[p]
|
||||
|
||||
# State init
|
||||
if not state:
|
||||
state['step'] = 0
|
||||
state['exp_avg'] = torch.zeros_like(p)
|
||||
state['exp_avg_sq'] = torch.zeros_like(p)
|
||||
exp_avg = state['exp_avg']
|
||||
exp_avg_sq = state['exp_avg_sq']
|
||||
state['step'] += 1
|
||||
|
||||
# Fill 0-D tensors with current values
|
||||
self._adamw_step_t.fill_(state['step'])
|
||||
self._adamw_lr_t.fill_(group['lr'])
|
||||
self._adamw_beta1_t.fill_(group['betas'][0])
|
||||
self._adamw_beta2_t.fill_(group['betas'][1])
|
||||
self._adamw_eps_t.fill_(group['eps'])
|
||||
self._adamw_wd_t.fill_(group['weight_decay'])
|
||||
|
||||
# Fused update: weight_decay -> momentum -> bias_correction -> param_update
|
||||
adamw_step_fused(
|
||||
p, grad, exp_avg, exp_avg_sq,
|
||||
self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
|
||||
self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t,
|
||||
)
|
||||
|
||||
def _step_muon(self, group: dict) -> None:
|
||||
"""
|
||||
Muon update for all params in the group (stacked for efficiency).
|
||||
Lazy init the state, fill in all 0-D tensors, call the fused kernel.
|
||||
"""
|
||||
params: list[Tensor] = group['params']
|
||||
if not params:
|
||||
return
|
||||
|
||||
# Get or create group-level buffers (stored in first param's state for convenience)
|
||||
p = params[0]
|
||||
state = self.state[p]
|
||||
num_params = len(params)
|
||||
shape, device, dtype = p.shape, p.device, p.dtype
|
||||
|
||||
# Momentum for every individual parameter
|
||||
if "momentum_buffer" not in state:
|
||||
state["momentum_buffer"] = torch.zeros(num_params, *shape, dtype=dtype, device=device)
|
||||
momentum_buffer = state["momentum_buffer"]
|
||||
|
||||
# Second momentum buffer is factored, either per-row or per-column
|
||||
if "second_momentum_buffer" not in state:
|
||||
state_shape = (num_params, shape[-2], 1) if shape[-2] >= shape[-1] else (num_params, 1, shape[-1])
|
||||
state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
|
||||
second_momentum_buffer = state["second_momentum_buffer"]
|
||||
red_dim = -1 if shape[-2] >= shape[-1] else -2
|
||||
|
||||
# Stack grads and params (NOTE: this assumes all params have the same shape)
|
||||
stacked_grads = torch.stack([p.grad for p in params])
|
||||
stacked_params = torch.stack(params)
|
||||
|
||||
# Fill all the 0-D tensors with current values
|
||||
self._muon_momentum_t.fill_(group["momentum"])
|
||||
self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
|
||||
# Shape-based LR scaling (flipped from original):
|
||||
# - Tall matrices (input projections like c_fc): 1x
|
||||
# - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4
|
||||
ratio = shape[-2] / shape[-1]
|
||||
lr_mult = 1.0 if ratio >= 1 else ratio**-0.5
|
||||
self._muon_lr_t.fill_(group["lr"] * lr_mult)
|
||||
self._muon_wd_t.fill_(group["weight_decay"])
|
||||
|
||||
# Single fused kernel: momentum -> polar_express -> variance_reduction -> update
|
||||
muon_step_fused(
|
||||
stacked_grads,
|
||||
stacked_params,
|
||||
momentum_buffer,
|
||||
second_momentum_buffer,
|
||||
self._muon_momentum_t,
|
||||
self._muon_lr_t,
|
||||
self._muon_wd_t,
|
||||
self._muon_beta2_t,
|
||||
group["ns_steps"],
|
||||
red_dim,
|
||||
)
|
||||
|
||||
# Copy back to original params
|
||||
torch._foreach_copy_(params, list(stacked_params.unbind(0)))
|
||||
|
||||
@torch.no_grad()
|
||||
def step(self):
|
||||
for group in self.param_groups:
|
||||
if group['kind'] == 'adamw':
|
||||
self._step_adamw(group)
|
||||
elif group['kind'] == 'muon':
|
||||
self._step_muon(group)
|
||||
else:
|
||||
raise ValueError(f"Unknown optimizer kind: {group['kind']}")
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Distributed version of the MuonAdamW optimizer.
|
||||
# Used for training on multiple GPUs.
|
||||
|
||||
class DistMuonAdamW(torch.optim.Optimizer):
|
||||
"""
|
||||
Combined distributed optimizer: Muon for 2D matrix params, AdamW for others.
|
||||
|
||||
See MuonAdamW for the algorithmic details of each optimizer. This class adds
|
||||
distributed communication to enable multi-GPU training without PyTorch DDP.
|
||||
|
||||
Design Goals:
|
||||
- Overlap communication with computation (async ops)
|
||||
- Minimize memory by sharding optimizer states across ranks (ZeRO-2 style)
|
||||
- Batch small tensors into single comm ops where possible
|
||||
|
||||
Communication Pattern (3-phase async):
|
||||
We use a 3-phase structure to maximize overlap between communication and compute:
|
||||
|
||||
Phase 1: Launch all async reduce ops
|
||||
- Kick off all reduce_scatter/all_reduce operations
|
||||
- Don't wait - let them run in background while we continue
|
||||
|
||||
Phase 2: Wait for reduces, compute updates, launch gathers
|
||||
- For each group: wait for its reduce, compute the update, launch gather
|
||||
- By processing groups in order, earlier gathers run while later computes happen
|
||||
|
||||
Phase 3: Wait for gathers, copy back
|
||||
- Wait for all gathers to complete
|
||||
- Copy updated params back to original tensors (Muon only)
|
||||
|
||||
AdamW Communication (ZeRO-2 style):
|
||||
- Small params (<1024 elements): all_reduce gradients, update full param on each rank.
|
||||
Optimizer state is replicated but these params are tiny (scalars, biases).
|
||||
- Large params: reduce_scatter gradients so each rank gets 1/N of the grad, update
|
||||
only that slice, then all_gather the updated slices. Optimizer state (exp_avg,
|
||||
exp_avg_sq) is sharded - each rank only stores state for its slice.
|
||||
Requires param.shape[0] divisible by world_size.
|
||||
|
||||
Muon Communication (stacked + chunked):
|
||||
- All params in a Muon group must have the same shape (caller's responsibility).
|
||||
- Stack all K params into a single (K, *shape) tensor for efficient comm.
|
||||
- Divide K params across N ranks: each rank "owns" ceil(K/N) params.
|
||||
- reduce_scatter the stacked grads so each rank gets its chunk.
|
||||
- Each rank computes Muon update only for params it owns.
|
||||
- all_gather the updated params back to all ranks.
|
||||
- Optimizer state (momentum_buffer, second_momentum_buffer) is sharded by chunk.
|
||||
- Padding: if K doesn't divide evenly, we zero-pad to (ceil(K/N) * N) for comm,
|
||||
then ignore the padding when copying back.
|
||||
|
||||
Buffer Reuse:
|
||||
- For Muon, we allocate stacked_grads for reduce_scatter input, then reuse the
|
||||
same buffer as the output for all_gather (stacked_params). This saves memory
|
||||
since we don't need both buffers simultaneously.
|
||||
|
||||
Arguments:
|
||||
param_groups: List of dicts, each containing:
|
||||
- 'params': List of parameters
|
||||
- 'kind': 'adamw' or 'muon'
|
||||
- For AdamW groups: 'lr', 'betas', 'eps', 'weight_decay'
|
||||
- For Muon groups: 'lr', 'momentum', 'ns_steps', 'beta2', 'weight_decay'
|
||||
"""
|
||||
def __init__(self, param_groups: list[dict]):
|
||||
super().__init__(param_groups, defaults={})
|
||||
# 0-D CPU tensors to avoid torch.compile recompilation when values change
|
||||
self._adamw_step_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_beta1_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_eps_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._adamw_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._muon_momentum_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._muon_lr_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._muon_wd_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
self._muon_beta2_t = torch.tensor(0.0, dtype=torch.float32, device="cpu")
|
||||
|
||||
def _reduce_adamw(self, group: dict, world_size: int) -> dict:
|
||||
"""Launch async reduce ops for AdamW group. Returns info dict with per-param infos."""
|
||||
param_infos = {}
|
||||
for p in group['params']:
|
||||
grad = p.grad
|
||||
if p.numel() < 1024:
|
||||
# Small params: all_reduce (no scatter/gather needed)
|
||||
future = dist.all_reduce(grad, op=dist.ReduceOp.AVG, async_op=True).get_future()
|
||||
param_infos[p] = dict(future=future, grad_slice=grad, is_small=True)
|
||||
else:
|
||||
# Large params: reduce_scatter
|
||||
rank_size = grad.shape[0] // world_size
|
||||
grad_slice = torch.empty_like(grad[:rank_size])
|
||||
future = dist.reduce_scatter_tensor(grad_slice, grad, op=dist.ReduceOp.AVG, async_op=True).get_future()
|
||||
param_infos[p] = dict(future=future, grad_slice=grad_slice, is_small=False)
|
||||
return dict(param_infos=param_infos)
|
||||
|
||||
def _reduce_muon(self, group: dict, world_size: int) -> dict:
|
||||
"""Launch async reduce op for Muon group. Returns info dict."""
|
||||
params = group['params']
|
||||
chunk_size = (len(params) + world_size - 1) // world_size
|
||||
padded_num_params = chunk_size * world_size
|
||||
p = params[0]
|
||||
shape, device, dtype = p.shape, p.device, p.dtype
|
||||
|
||||
# Stack grads and zero-pad to padded_num_params
|
||||
grad_stack = torch.stack([p.grad for p in params])
|
||||
stacked_grads = torch.empty(padded_num_params, *shape, dtype=dtype, device=device)
|
||||
stacked_grads[:len(params)].copy_(grad_stack)
|
||||
if len(params) < padded_num_params:
|
||||
stacked_grads[len(params):].zero_()
|
||||
|
||||
# Reduce_scatter to get this rank's chunk
|
||||
grad_chunk = torch.empty(chunk_size, *shape, dtype=dtype, device=device)
|
||||
future = dist.reduce_scatter_tensor(grad_chunk, stacked_grads, op=dist.ReduceOp.AVG, async_op=True).get_future()
|
||||
|
||||
return dict(future=future, grad_chunk=grad_chunk, stacked_grads=stacked_grads, chunk_size=chunk_size)
|
||||
|
||||
def _compute_adamw(self, group: dict, info: dict, gather_list: list, rank: int, world_size: int) -> None:
|
||||
"""Wait for reduce, compute AdamW updates, launch gathers for large params."""
|
||||
param_infos = info['param_infos']
|
||||
for p in group['params']:
|
||||
pinfo = param_infos[p]
|
||||
pinfo['future'].wait()
|
||||
grad_slice = pinfo['grad_slice']
|
||||
state = self.state[p]
|
||||
|
||||
# For small params, operate on full param; for large, operate on slice
|
||||
if pinfo['is_small']:
|
||||
p_slice = p
|
||||
else:
|
||||
rank_size = p.shape[0] // world_size
|
||||
p_slice = p[rank * rank_size:(rank + 1) * rank_size]
|
||||
|
||||
# State init
|
||||
if not state:
|
||||
state['step'] = 0
|
||||
state['exp_avg'] = torch.zeros_like(p_slice)
|
||||
state['exp_avg_sq'] = torch.zeros_like(p_slice)
|
||||
state['step'] += 1
|
||||
|
||||
# Fill 0-D tensors and run fused kernel
|
||||
self._adamw_step_t.fill_(state['step'])
|
||||
self._adamw_lr_t.fill_(group['lr'])
|
||||
self._adamw_beta1_t.fill_(group['betas'][0])
|
||||
self._adamw_beta2_t.fill_(group['betas'][1])
|
||||
self._adamw_eps_t.fill_(group['eps'])
|
||||
self._adamw_wd_t.fill_(group['weight_decay'])
|
||||
adamw_step_fused(
|
||||
p_slice, grad_slice, state['exp_avg'], state['exp_avg_sq'],
|
||||
self._adamw_step_t, self._adamw_lr_t, self._adamw_beta1_t,
|
||||
self._adamw_beta2_t, self._adamw_eps_t, self._adamw_wd_t,
|
||||
)
|
||||
|
||||
# Large params need all_gather
|
||||
if not pinfo['is_small']:
|
||||
future = dist.all_gather_into_tensor(p, p_slice, async_op=True).get_future()
|
||||
gather_list.append(dict(future=future, params=None))
|
||||
|
||||
def _compute_muon(self, group: dict, info: dict, gather_list: list, rank: int) -> None:
|
||||
"""Wait for reduce, compute Muon updates, launch gather."""
|
||||
info['future'].wait()
|
||||
params = group['params']
|
||||
chunk_size = info['chunk_size']
|
||||
grad_chunk = info['grad_chunk']
|
||||
p = params[0]
|
||||
shape, device, dtype = p.shape, p.device, p.dtype
|
||||
|
||||
# How many params does this rank own?
|
||||
start_idx = rank * chunk_size
|
||||
num_owned = min(chunk_size, max(0, len(params) - start_idx))
|
||||
|
||||
# Get or create group-level state
|
||||
state = self.state[p]
|
||||
if "momentum_buffer" not in state:
|
||||
state["momentum_buffer"] = torch.zeros(chunk_size, *shape, dtype=dtype, device=device)
|
||||
if "second_momentum_buffer" not in state:
|
||||
state_shape = (chunk_size, shape[-2], 1) if shape[-2] >= shape[-1] else (chunk_size, 1, shape[-1])
|
||||
state["second_momentum_buffer"] = torch.zeros(state_shape, dtype=dtype, device=device)
|
||||
red_dim = -1 if shape[-2] >= shape[-1] else -2
|
||||
|
||||
# Build output buffer for all_gather
|
||||
updated_params = torch.empty(chunk_size, *shape, dtype=dtype, device=device)
|
||||
|
||||
if num_owned > 0:
|
||||
owned_params = [params[start_idx + i] for i in range(num_owned)]
|
||||
stacked_owned = torch.stack(owned_params)
|
||||
|
||||
# Fill 0-D tensors and run fused kernel
|
||||
self._muon_momentum_t.fill_(group["momentum"])
|
||||
self._muon_beta2_t.fill_(group["beta2"])
|
||||
# Shape-based LR scaling (flipped from original):
|
||||
# - Tall matrices (input projections like c_fc): 1x
|
||||
# - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4
|
||||
ratio = shape[-2] / shape[-1]
|
||||
lr_mult = 1.0 if ratio >= 1 else ratio**-0.5
|
||||
self._muon_lr_t.fill_(group["lr"] * lr_mult)
|
||||
self._muon_wd_t.fill_(group["weight_decay"])
|
||||
muon_step_fused(
|
||||
grad_chunk[:num_owned], stacked_owned,
|
||||
state["momentum_buffer"][:num_owned], state["second_momentum_buffer"][:num_owned],
|
||||
self._muon_momentum_t, self._muon_lr_t, self._muon_wd_t, self._muon_beta2_t,
|
||||
group["ns_steps"], red_dim,
|
||||
)
|
||||
updated_params[:num_owned].copy_(stacked_owned)
|
||||
|
||||
if num_owned < chunk_size:
|
||||
updated_params[num_owned:].zero_()
|
||||
|
||||
# Reuse stacked_grads buffer for all_gather output
|
||||
stacked_params = info["stacked_grads"]
|
||||
future = dist.all_gather_into_tensor(stacked_params, updated_params, async_op=True).get_future()
|
||||
gather_list.append(dict(future=future, stacked_params=stacked_params, params=params))
|
||||
|
||||
def _finish_gathers(self, gather_list: list) -> None:
|
||||
"""Wait for all gathers and copy Muon params back."""
|
||||
for info in gather_list:
|
||||
info["future"].wait()
|
||||
if info["params"] is not None:
|
||||
# Muon: copy from stacked buffer back to individual params
|
||||
torch._foreach_copy_(info["params"], list(info["stacked_params"][:len(info["params"])].unbind(0)))
|
||||
|
||||
@torch.no_grad()
|
||||
def step(self):
|
||||
rank = dist.get_rank()
|
||||
world_size = dist.get_world_size()
|
||||
|
||||
# Phase 1: launch all async reduce ops
|
||||
reduce_infos: list[dict] = []
|
||||
for group in self.param_groups:
|
||||
if group['kind'] == 'adamw':
|
||||
reduce_infos.append(self._reduce_adamw(group, world_size))
|
||||
elif group['kind'] == 'muon':
|
||||
reduce_infos.append(self._reduce_muon(group, world_size))
|
||||
else:
|
||||
raise ValueError(f"Unknown optimizer kind: {group['kind']}")
|
||||
|
||||
# Phase 2: wait for reduces, compute updates, launch gathers
|
||||
gather_list: list[dict] = []
|
||||
for group, info in zip(self.param_groups, reduce_infos):
|
||||
if group['kind'] == 'adamw':
|
||||
self._compute_adamw(group, info, gather_list, rank, world_size)
|
||||
elif group['kind'] == 'muon':
|
||||
self._compute_muon(group, info, gather_list, rank)
|
||||
else:
|
||||
raise ValueError(f"Unknown optimizer kind: {group['kind']}")
|
||||
|
||||
# Phase 3: wait for gathers, copy back
|
||||
self._finish_gathers(gather_list)
|
||||
50
log/report/base-model-training.md
Normal file
50
log/report/base-model-training.md
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
## Base model training
|
||||
timestamp: 2026-02-02 00:09:59
|
||||
|
||||
- run: d24-feb01
|
||||
- device_type:
|
||||
- depth: 24
|
||||
- aspect_ratio: 64
|
||||
- head_dim: 128
|
||||
- max_seq_len: 2048
|
||||
- window_pattern: SSSL
|
||||
- num_iterations: -1
|
||||
- target_flops: -1.0000
|
||||
- target_param_data_ratio: 12.0000
|
||||
- device_batch_size: 16
|
||||
- total_batch_size: 524,288
|
||||
- embedding_lr: 0.3000
|
||||
- unembedding_lr: 0.0040
|
||||
- weight_decay: 0.2000
|
||||
- matrix_lr: 0.0200
|
||||
- scalar_lr: 0.5000
|
||||
- adam_beta1: 0.8000
|
||||
- adam_beta2: 0.9500
|
||||
- warmup_ratio: 0.0000
|
||||
- warmdown_ratio: 0.5000
|
||||
- final_lr_frac: 0.0000
|
||||
- resume_from_step: -1
|
||||
- eval_every: 250
|
||||
- eval_tokens: 10,485,760
|
||||
- core_metric_every: 3000
|
||||
- core_metric_max_per_task: -1
|
||||
- sample_every: -1
|
||||
- save_every: -1
|
||||
- model_tag: d24_feb01
|
||||
- Number of parameters: 1,384,124,976
|
||||
- Number of FLOPs per token: 4.945112e+09
|
||||
- Calculated number of iterations: 16,704
|
||||
- Number of training tokens: 8,757,706,752
|
||||
- Tokens : Scaling params ratio: 11.9999
|
||||
- DDP world size: 8
|
||||
- warmup_ratio: 0.0000
|
||||
- warmdown_ratio: 0.5000
|
||||
- final_lr_frac: 0.0000
|
||||
- Minimum validation bpb: 0.7533
|
||||
- Final validation bpb: 0.7533
|
||||
- CORE metric estimate: 0.2633
|
||||
- MFU %: 50.77%
|
||||
- Total training flops: 4.330784e+19
|
||||
- Total training time: 179.44m
|
||||
- Peak memory usage: 62389.67MiB
|
||||
|
||||
23
log/report/chat-evaluation-sft.md
Normal file
23
log/report/chat-evaluation-sft.md
Normal file
|
|
@ -0,0 +1,23 @@
|
|||
## Chat evaluation sft
|
||||
timestamp: 2026-02-02 01:24:46
|
||||
|
||||
- source: sft
|
||||
- task_name: None
|
||||
- dtype: bfloat16
|
||||
- temperature: 0.0000
|
||||
- max_new_tokens: 512
|
||||
- num_samples: 1
|
||||
- top_k: 50
|
||||
- batch_size: 8
|
||||
- model_tag: None
|
||||
- step: None
|
||||
- max_problems: None
|
||||
- device_type:
|
||||
- ARC-Easy: 0.4903
|
||||
- ARC-Challenge: 0.3848
|
||||
- MMLU: 0.3480
|
||||
- GSM8K: 0.0470
|
||||
- HumanEval: 0.1463
|
||||
- SpellingBee: 0.9883
|
||||
- ChatCORE metric: 0.3021
|
||||
|
||||
24
log/report/sft.md
Normal file
24
log/report/sft.md
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
## SFT
|
||||
timestamp: 2026-02-02 01:13:02
|
||||
|
||||
- run: dummy
|
||||
- device_type:
|
||||
- dtype: bfloat16
|
||||
- model_tag: None
|
||||
- model_step: None
|
||||
- num_iterations: -1
|
||||
- max_seq_len: 2048
|
||||
- device_batch_size: 16
|
||||
- total_batch_size: 524,288
|
||||
- embedding_lr: 0.2000
|
||||
- unembedding_lr: 0.0040
|
||||
- matrix_lr: 0.0200
|
||||
- weight_decay: 0.0000
|
||||
- init_lr_frac: 1.0000
|
||||
- eval_every: 150
|
||||
- eval_tokens: 10,485,760
|
||||
- dry_run: False
|
||||
- Number of iterations: 849
|
||||
- DDP world size: 8
|
||||
- Minimum validation bpb: 0.3478
|
||||
|
||||
13
log/report/tokenizer-training.md
Normal file
13
log/report/tokenizer-training.md
Normal file
|
|
@ -0,0 +1,13 @@
|
|||
## Tokenizer training
|
||||
timestamp: 2026-02-01 14:40:20
|
||||
|
||||
- max_chars: 2,000,000,000
|
||||
- doc_cap: 10,000
|
||||
- vocab_size: 32,768
|
||||
- train_time: 87.9820
|
||||
- num_special_tokens: 9
|
||||
- token_bytes_min: 1
|
||||
- token_bytes_max: 19
|
||||
- token_bytes_mean: 6.6029
|
||||
- token_bytes_std: 2.8250
|
||||
|
||||
30
log/speedrun-feb01.sh
Normal file
30
log/speedrun-feb01.sh
Normal file
|
|
@ -0,0 +1,30 @@
|
|||
#!/bin/bash
|
||||
|
||||
# Default intermediate artifacts directory is in ~/.cache/nanochat
|
||||
export OMP_NUM_THREADS=1
|
||||
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
|
||||
mkdir -p $NANOCHAT_BASE_DIR
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Python venv setup with uv
|
||||
|
||||
# install uv (if not already installed)
|
||||
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
# create a .venv local virtual environment (if it doesn't exist)
|
||||
[ -d ".venv" ] || uv venv
|
||||
# install the repo dependencies
|
||||
uv sync --extra gpu
|
||||
# activate venv so that `python` uses the project's venv instead of system python
|
||||
source .venv/bin/activate
|
||||
|
||||
( cat ./nanochat/gpt.py; cat ./nanochat/optim.py; cat ./nanochat/dataloader.py; cat ./scripts/base_train.py; echo -e "\n\n===== TRAINING OUTPUT =====\n\n"; OMP_NUM_THREADS=1 torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- \
|
||||
--depth=24 \
|
||||
--run=d24-feb01 \
|
||||
--model-tag=d24_feb01 \
|
||||
--device-batch-size=16 \
|
||||
--sample-every=-1 \
|
||||
--save-every=-1 \
|
||||
--core-metric-max-per-task=-1 \
|
||||
--core-metric-every=3000 \
|
||||
--target-param-data-ratio=12 ) \
|
||||
2>&1 | tee ./logs/speedrun_d24_feb01-rope_chunk_mlp_lr_1x2x.log
|
||||
18889
log/speedrun_d24_feb01-rope_chunk_mlp_lr_1x2x.log
Normal file
18889
log/speedrun_d24_feb01-rope_chunk_mlp_lr_1x2x.log
Normal file
File diff suppressed because it is too large
Load Diff
|
|
@ -49,12 +49,11 @@ def has_ve(layer_idx, n_layer):
|
|||
return layer_idx % 2 == (n_layer - 1) % 2
|
||||
|
||||
def apply_rotary_emb(x, cos, sin):
|
||||
assert x.ndim == 4 # multihead attention
|
||||
d = x.shape[3] // 2
|
||||
x1, x2 = x[..., :d], x[..., d:] # split up last dim into two halves
|
||||
y1 = x1 * cos + x2 * sin # rotate pairs of dims
|
||||
assert x.ndim == 4 # (B, T, H, D) multihead attention layout
|
||||
x1, x2 = x.chunk(2, dim=-1) # split head_dim into two halves
|
||||
y1 = x1 * cos + x2 * sin
|
||||
y2 = x1 * (-sin) + x2 * cos
|
||||
return torch.cat([y1, y2], 3)
|
||||
return torch.cat([y1, y2], dim=-1)
|
||||
|
||||
class CausalSelfAttention(nn.Module):
|
||||
def __init__(self, config, layer_idx):
|
||||
|
|
|
|||
|
|
@ -260,7 +260,12 @@ class MuonAdamW(torch.optim.Optimizer):
|
|||
# Fill all the 0-D tensors with current values
|
||||
self._muon_momentum_t.fill_(group["momentum"])
|
||||
self._muon_beta2_t.fill_(group["beta2"] if group["beta2"] is not None else 0.0)
|
||||
self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5)
|
||||
# Shape-based LR scaling (flipped from original):
|
||||
# - Tall matrices (input projections like c_fc): 1x
|
||||
# - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4
|
||||
ratio = shape[-2] / shape[-1]
|
||||
lr_mult = 1.0 if ratio >= 1 else ratio**-0.5
|
||||
self._muon_lr_t.fill_(group["lr"] * lr_mult)
|
||||
self._muon_wd_t.fill_(group["weight_decay"])
|
||||
|
||||
# Single fused kernel: momentum -> polar_express -> variance_reduction -> update
|
||||
|
|
@ -478,7 +483,12 @@ class DistMuonAdamW(torch.optim.Optimizer):
|
|||
# Fill 0-D tensors and run fused kernel
|
||||
self._muon_momentum_t.fill_(group["momentum"])
|
||||
self._muon_beta2_t.fill_(group["beta2"])
|
||||
self._muon_lr_t.fill_(group["lr"] * max(1.0, shape[-2] / shape[-1])**0.5)
|
||||
# Shape-based LR scaling (flipped from original):
|
||||
# - Tall matrices (input projections like c_fc): 1x
|
||||
# - Wide matrices (output projections like c_proj): sqrt(cols/rows) → 2x for 1:4
|
||||
ratio = shape[-2] / shape[-1]
|
||||
lr_mult = 1.0 if ratio >= 1 else ratio**-0.5
|
||||
self._muon_lr_t.fill_(group["lr"] * lr_mult)
|
||||
self._muon_wd_t.fill_(group["weight_decay"])
|
||||
muon_step_fused(
|
||||
grad_chunk[:num_owned], stacked_owned,
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user