Add profiling infrastructure (env-var controlled, nsys/ncu/torch profiler)

- base_train.py: CUDA profiler + PyTorch profiler hooks gated by NANOCHAT_PROFILE_* env vars
- profile_step.py: standalone single-step profiler with NVTX ranges and phase selection
- LOCAL_STATE.md: documents local branch/file state before machine teardown

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
ademeure 2026-04-09 11:29:04 +00:00
parent 3d0dec5716
commit 322eb6b86b
3 changed files with 220 additions and 0 deletions

35
LOCAL_STATE.md Normal file
View File

@ -0,0 +1,35 @@
# Local State — nanochat (karpathy fork)
Documented 2026-04-09 before machine teardown.
## Branch: fa3-flex-sdpa (current)
- Tracking: `fork/fa3-flex-sdpa` (ademeure/nanochat) — pushed and up to date
- 1 commit ahead of upstream master: `3d0dec5 FA3/FlexAttention/SDPA attention + PyTorch 2.11/CUDA 13.0`
## Branch: pytorch-2.11-cu130
- Tracking: `fork/pytorch-2.11-cu130` — pushed and up to date
- 2 commits ahead of master
## Branch: pytorch-2.11-cu128-test
- **Local-only, no upstream** — but 0 commits ahead of master, just a branch pointer. No unique content.
## Uncommitted changes (being committed now)
### scripts/base_train.py
- Added env-var-controlled profiling hooks (`NANOCHAT_PROFILE_START`, `NANOCHAT_PROFILE_STOP`, `NANOCHAT_PROFILE_EXIT`, `NANOCHAT_TORCH_PROFILE_DIR`)
- CUDA profiler start/stop integration around training steps
- PyTorch profiler with TensorBoard trace output
- Early exit after profiling completes
- This is a work-in-progress profiling integration — functional but may need further tuning
### scripts/profile_step.py (new file)
- Standalone profiling script for a single training step (fwd/bwd/opt)
- Supports nsys and ncu profiling with NVTX ranges
- Usage: `nsys profile -o out python -m scripts.profile_step --depth 6`
- Supports `--phase {all,fwd,bwd,opt}` for targeted kernel analysis
### profiles/ (NOT committed — binary nsys artifacts)
- `nsys_d32_full.nsys-rep` (1.6M) — nsys trace, depth=32
- `nsys_d32_full.sqlite` (2.4M) — exported sqlite
- `nsys_d32_minimal.nsys-rep` (1.5M) — minimal nsys trace
- These are reproducible output artifacts, not committed to git

View File

@ -412,6 +412,34 @@ print0(f"Tokens / micro-batch / rank: {args.device_batch_size} x {args.max_seq_l
print0(f"Tokens / micro-batch: {world_tokens_per_fwdbwd:,}")
print0(f"Total batch size {total_batch_size:,} => gradient accumulation steps: {grad_accum_steps}")
# Profiling hooks (env-var controlled, no-op by default)
#   NANOCHAT_PROFILE_START      step at which the CUDA profiler is started (unset/-1: profiling disabled)
#   NANOCHAT_PROFILE_STOP       step after which the CUDA profiler is stopped (unset/-1: never stopped)
#   NANOCHAT_PROFILE_EXIT       training exits early once this step has completed (unset/-1: no early exit)
#   NANOCHAT_TORCH_PROFILE_DIR  if non-empty (and a start step is set), a PyTorch profiler trace is also written here
_profile_start = int(os.environ.get("NANOCHAT_PROFILE_START", -1))
_profile_stop = int(os.environ.get("NANOCHAT_PROFILE_STOP", -1))
_profile_exit = int(os.environ.get("NANOCHAT_PROFILE_EXIT", -1))
_torch_profile_dir = os.environ.get("NANOCHAT_TORCH_PROFILE_DIR", "")
if _profile_start >= 0:
    print0(f"Profiling: start at step {_profile_start}, stop at step {_profile_stop}, exit at step {_profile_exit}")
# PyTorch profiler (env-var controlled)
_torch_profiler = None
if _torch_profile_dir and _profile_start >= 0:
    from torch.profiler import profile, ProfilerActivity, schedule, tensorboard_trace_handler
    os.makedirs(_torch_profile_dir, exist_ok=True)
    # Number of steps to trace. Clamped to >= 1: an unset stop (-1), or a stop step that is
    # smaller than the start step, would otherwise produce active <= 0, which
    # torch.profiler.schedule rejects with a bare "Invalid profiler schedule arguments" assertion.
    _torch_profile_active = max(_profile_stop - _profile_start + 1, 1)
    _torch_profile_last = _profile_start + _torch_profile_active - 1
    # NOTE(review): the schedule counts calls to _torch_profiler.step() made after start(),
    # not the global `step` value; on a resumed run these presumably differ -- TODO confirm.
    _torch_profiler = profile(
        activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
        schedule=schedule(
            wait=_profile_start,
            warmup=0,
            active=_torch_profile_active,
            repeat=1,
        ),
        on_trace_ready=tensorboard_trace_handler(_torch_profile_dir),
        record_shapes=True,
        with_stack=True,
    )
    _torch_profiler.start()
    # Report the last step that is actually traced (not the raw env value, which may be -1).
    print0(f"PyTorch profiler: tracing steps {_profile_start}-{_torch_profile_last}, output to {_torch_profile_dir}")
# Go!
while True:
last_step = step == num_iterations # loop runs num_iterations+1 times so that we can eval/save at the end
@ -504,6 +532,10 @@ while True:
# -------------------------------------------------------------------------
# single training step
if step == _profile_start:
print0(f">>> CUDA profiler START at step {step}")
synchronize()
torch.cuda.cudart().cudaProfilerStart()
# evaluate the gradient
synchronize()
t0 = time.time()
@ -579,10 +611,28 @@ while True:
}
wandb_run.log(log_data)
# profiling stop
if step == _profile_stop:
synchronize()
torch.cuda.cudart().cudaProfilerStop()
print0(f">>> CUDA profiler STOP after step {step}")
# PyTorch profiler step
if _torch_profiler is not None:
_torch_profiler.step()
# state update
first_step_of_run = (step == 0) or (resuming and step == args.resume_from_step)
step += 1
# profiling early exit (checked after step increment)
if _profile_exit >= 0 and step > _profile_exit:
if _torch_profiler is not None:
_torch_profiler.stop()
print0(f">>> PyTorch profiler stopped, traces written to {_torch_profile_dir}")
print0(f">>> Early exit after step {_profile_exit} (profiling done)")
break
# The garbage collector is sadly a little bit overactive and for some poorly understood reason,
# it spends ~500ms scanning for cycles quite frequently, just to end up cleaning up very few tiny objects each time.
# So we manually manage and help it out here

135
scripts/profile_step.py Normal file
View File

@ -0,0 +1,135 @@
"""
Profile a single training step of nanochat (forward + backward + optimizer).

Run this script under Nsight Systems (nsys) or Nsight Compute (ncu) to obtain reports for
detailed GPU kernel analysis; the script itself only prints progress, the loss and peak VRAM.
The forward, backward and optimizer phases are wrapped in NVTX ranges named "forward",
"backward" and "optimizer", and cudaProfilerStart()/cudaProfilerStop() bracket the phase
selected with --phase (the warmup iterations run before cudaProfilerStart()).

Usage:
    # Nsight Systems (full timeline):
    nsys profile -o profile_nsys_d6 python -m scripts.profile_step --depth 6
    nsys profile -o profile_nsys_d24 python -m scripts.profile_step --depth 24
    # NCU (kernel-level, split by phase to keep reports manageable):
    ncu --set full -o profile_ncu_d6_fwd python -m scripts.profile_step --depth 6 --phase fwd
    ncu --set full -o profile_ncu_d6_bwd python -m scripts.profile_step --depth 6 --phase bwd
    ncu --set full -o profile_ncu_d6_opt python -m scripts.profile_step --depth 6 --phase opt

NOTE(review): the cudaProfilerStart/Stop bracket only restricts what is captured when the
profiler is told to honor it -- presumably `--profile-from-start off` for ncu and
`--capture-range=cudaProfilerApi` for nsys. Without such flags the warmup iterations are
likely profiled as well. Confirm against the installed tool versions.
"""
import os
os.environ["NANOCHAT_BASE_DIR"] = os.path.expanduser("~/.cache/nanochat")
import argparse
import torch
import torch.cuda.nvtx as nvtx
from nanochat.common import COMPUTE_DTYPE, print0
from nanochat.gpt import GPT, GPTConfig
# Command-line interface: model shape, batch shape and which phase to profile.
parser = argparse.ArgumentParser()
parser.add_argument("--depth", type=int, default=6)
parser.add_argument("--phase", type=str, default="all", choices=["all", "fwd", "bwd", "opt"])
# The remaining options are all plain integers, registered in the same order as before.
for _flag, _default in (
    ("--seq-len", 1024),
    ("--batch-size", 16),
    ("--head-dim", 64),
    ("--aspect-ratio", 48),
):
    parser.add_argument(_flag, type=int, default=_default)
args = parser.parse_args()
# ---------------------------------------------------------------------------
# Setup
device = torch.device("cuda")
torch.manual_seed(42)
torch.set_float32_matmul_precision("high")

# Build model (same logic as base_train.py): the width is depth * aspect_ratio,
# rounded up so that it is a whole number of heads of size head_dim.
base_dim = args.depth * args.aspect_ratio
num_heads = (base_dim + args.head_dim - 1) // args.head_dim
model_dim = num_heads * args.head_dim
config = GPTConfig(
    sequence_len=args.seq_len,
    vocab_size=32768,
    n_layer=args.depth,
    n_head=num_heads,
    n_kv_head=num_heads,
    n_embd=model_dim,
    window_pattern="SSSL",
)

# Construct the module on the meta device, then allocate it on the GPU and initialize it.
with torch.device("meta"):
    model = GPT(config)
model.to_empty(device=device)
model.init_weights()
model = torch.compile(model, dynamic=False)
model.train()

optimizer = model.setup_optimizer(
    unembedding_lr=0.01,
    embedding_lr=0.01,
    scalar_lr=0.01,
    matrix_lr=0.01,
    weight_decay=0.1,
)

n_params = sum(param.numel() for param in model.parameters())
print0(f"Model: depth={args.depth} dim={model_dim} heads={num_heads} params={n_params:,}")
print0(f"Batch: {args.batch_size} x {args.seq_len} = {args.batch_size * args.seq_len:,} tokens")

# Dummy data: random token ids for inputs (x) and targets (y), drawn in that order.
_token_shape = (args.batch_size, args.seq_len)
x = torch.randint(0, config.vocab_size, _token_shape, device=device)
y = torch.randint(0, config.vocab_size, _token_shape, device=device)
# ---------------------------------------------------------------------------
# Warmup (let torch.compile JIT)
def _warmup_iteration():
    """Run one unprofiled train step (forward, backward, optimizer, zero grads) and return the loss."""
    step_loss = model(x, y)
    step_loss.backward()
    optimizer.step()
    model.zero_grad(set_to_none=True)
    return step_loss


print0("Warming up (torch.compile)...")
for _ in range(3):
    loss = _warmup_iteration()
# Wait for the queued warmup work to finish before announcing the profiled step.
torch.cuda.synchronize()
print0("Warmup done. Profiling...")
# ---------------------------------------------------------------------------
# Profiled step — NVTX ranges for nsys, CUDA ranges for ncu
def do_forward():
    """Run one forward pass inside an NVTX range named "forward" and return the loss.

    The device is synchronized before the range is closed, so the queued GPU
    work of the forward pass has finished by the time the range is popped.
    """
    # The context manager pops the range even if the forward pass raises,
    # which a manual range_push/range_pop pair does not guarantee.
    with nvtx.range("forward"):
        loss = model(x, y)
        torch.cuda.synchronize()
    return loss
def do_backward(loss):
    """Backpropagate `loss` inside an NVTX range named "backward".

    The device is synchronized before the range is closed, so the queued GPU
    work of the backward pass has finished by the time the range is popped.
    """
    # The context manager pops the range even if backward() raises,
    # which a manual range_push/range_pop pair does not guarantee.
    with nvtx.range("backward"):
        loss.backward()
        torch.cuda.synchronize()
def do_optimizer():
    """Apply one optimizer step inside an NVTX range named "optimizer", then clear gradients.

    The device is synchronized before the range is closed, so the queued GPU
    work of the optimizer step has finished by the time the range is popped.
    Gradients are reset (to None) after the range, outside the annotated region.
    """
    # The context manager pops the range even if optimizer.step() raises,
    # which a manual range_push/range_pop pair does not guarantee.
    with nvtx.range("optimizer"):
        optimizer.step()
        torch.cuda.synchronize()
    model.zero_grad(set_to_none=True)
# Stage 1: work the selected phase depends on, kept outside the profiled region.
_cudart = torch.cuda.cudart()
_phase = args.phase
if _phase in ("bwd", "opt"):
    loss = model(x, y)  # unprofiled forward
    if _phase == "opt":
        loss.backward()  # unprofiled backward
    torch.cuda.synchronize()

# Stage 2: the profiled region, bracketed by cudaProfilerStart/Stop.
_cudart.cudaProfilerStart()
if _phase == "fwd":
    loss = do_forward()
elif _phase == "bwd":
    do_backward(loss)
elif _phase == "opt":
    do_optimizer()
else:  # "all"
    loss = do_forward()
    do_backward(loss)
    do_optimizer()
_cudart.cudaProfilerStop()

# Stage 3: report what was run.
if _phase == "fwd":
    print0(f"Forward done. loss={loss.item():.4f}")
elif _phase == "bwd":
    print0("Backward done.")
elif _phase == "opt":
    print0("Optimizer done.")
else:
    print0(f"Full step done. loss={loss.item():.4f}")

# Peak allocated device memory as reported by torch, converted from bytes to MiB.
_peak_bytes = torch.cuda.max_memory_allocated()
peak_mb = _peak_bytes / 1024 / 1024
print0(f"Peak VRAM: {peak_mb:.0f} MiB")