From 232d1341be7fd51877118ac2a2d55bd0e5b4f788 Mon Sep 17 00:00:00 2001
From: Chris McCormick
Date: Sat, 31 Jan 2026 21:25:43 -0800
Subject: [PATCH] Garbage collect after step 1 and freeze

---
 scripts/base_train.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/scripts/base_train.py b/scripts/base_train.py
index 7ed6330..4dc6ffb 100644
--- a/scripts/base_train.py
+++ b/scripts/base_train.py
@@ -429,8 +429,14 @@ while True:
         wandb_run.log(log_data)
 
     # state update
+    first_step_of_run = (step == 0) or (resuming and step == args.resume_from_step)
     step += 1
 
+    # After first step of this run, flush torch.compile garbage and freeze long-lived objects
+    if first_step_of_run:
+        gc.collect()
+        gc.freeze() # move survivors to permanent generation, won't be scanned in future GCs
+
 # print a few more stats
 print0(f"Peak memory usage: {get_max_memory() / 1024 / 1024:.2f}MiB")
 print0(f"Total training time: {total_training_time/60:.2f}m")