Allow any rank to create the checkpoint_dir

This commit is contained in:
marked23 2025-11-20 11:21:24 -08:00
parent 4a87a0d19f
commit 9ebb031f64

View File

@ -21,8 +21,9 @@ def log0(message):
logger.info(message)
def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
os.makedirs(checkpoint_dir, exist_ok=True)
if rank == 0:
os.makedirs(checkpoint_dir, exist_ok=True)
# Save the model state parameters
model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
torch.save(model_data, model_path)