fix bug where every rank must be able to create checkpoint_dir when saving optimizer state

Andrej Karpathy 2025-12-08 20:45:11 +00:00
parent 2fd0440355
commit 90442de35f


@@ -34,6 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data,
     logger.info(f"Saved metadata to: {meta_path}")
     # Note that optimizer state is sharded across ranks, so each rank must save its own.
     if optimizer_data is not None:
+        os.makedirs(checkpoint_dir, exist_ok=True)
         optimizer_path = os.path.join(checkpoint_dir, f"optim_{step:06d}_rank{rank:d}.pt")
         torch.save(optimizer_data, optimizer_path)
         logger.info(f"Saved optimizer state to: {optimizer_path}")