mirror of
https://github.com/karpathy/nanochat.git
synced 2025-12-08 05:12:16 +00:00
Fix race condition in save_checkpoint for non-zero ranks
This commit is contained in:
parent
4a87a0d19f
commit
2c6a007e3c
|
|
@@ -21,8 +21,8 @@ def log0(message):
|
||||||
logger.info(message)
|
logger.info(message)
|
||||||
|
|
||||||
def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
|
def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data, rank=0):
|
||||||
if rank == 0:
|
|
||||||
os.makedirs(checkpoint_dir, exist_ok=True)
|
os.makedirs(checkpoint_dir, exist_ok=True)
|
||||||
|
if rank == 0:
|
||||||
# Save the model state parameters
|
# Save the model state parameters
|
||||||
model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
|
model_path = os.path.join(checkpoint_dir, f"model_{step:06d}.pt")
|
||||||
torch.save(model_data, model_path)
|
torch.save(model_data, model_path)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue
Block a user