Garbage collect after step 1 and freeze

This commit is contained in:
Chris McCormick 2026-01-31 21:25:43 -08:00
parent 814475af42
commit 232d1341be

View File

@@ -429,8 +429,14 @@ while True:
wandb_run.log(log_data)
# state update
# The flag is evaluated BEFORE `step` is incremented, so it describes the iteration that
# just finished: True when step == 0, or when `resuming` is set and step equals
# args.resume_from_step.
first_step_of_run = (step == 0) or (resuming and step == args.resume_from_step)
step += 1
# After first step of this run, flush torch.compile garbage and freeze long-lived objects
# The collect runs first so that only objects surviving a full collection get frozen;
# gc.freeze() then moves everything still tracked into the permanent generation.
if first_step_of_run:
gc.collect()
gc.freeze() # move survivors to permanent generation, won't be scanned in future GCs
# print a few more stats
# NOTE(review): the divisors below assume get_max_memory() returns bytes and
# total_training_time is measured in seconds -- confirm against their definitions.
print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
print0(f"Total training time: {total_training_time/60:.2f}m")