From 01f5f10122fc092acd170953bbb60869138175ba Mon Sep 17 00:00:00 2001 From: Pyry Takala Date: Thu, 20 Nov 2025 02:24:46 +0000 Subject: [PATCH 1/2] Fix find_last_step crash on checkpoint files with extra underscores --- nanochat/checkpoint_manager.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index 63f257f..4046aff 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -118,7 +118,16 @@ def find_last_step(checkpoint_dir): checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "model_*.pt")) if not checkpoint_files: raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}") - last_step = int(max(os.path.basename(f).split("_")[-1].split(".")[0] for f in checkpoint_files)) + # Use regex to match only valid checkpoint files (model_.pt) and ignore malformed files + # This prevents crashes when files like model_000200_backup.pt exist in the directory + steps = [] + for f in checkpoint_files: + match = re.match(r"model_(\d+)\.pt$", os.path.basename(f)) + if match: + steps.append(int(match.group(1))) + if not steps: + raise ValueError(f"No valid checkpoint files found in {checkpoint_dir}") + last_step = max(steps) return last_step # ----------------------------------------------------------------------------- From 3e2a0668b211e5553e95cd0361776226387f9904 Mon Sep 17 00:00:00 2001 From: Pyry Takala Date: Fri, 21 Nov 2025 19:21:01 +0000 Subject: [PATCH 2/2] Refactor find_last_step to use os.listdir with regex filtering Replace glob.glob() with os.listdir() + regex filtering as suggested by reviewer. This filters invalid checkpoint files (like model_000200_backup.pt) at the source instead of globbing then filtering, making the code simpler and more efficient. 
---
 nanochat/checkpoint_manager.py | 16 ++++------------
 1 file changed, 4 insertions(+), 12 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index 4046aff..b9118e2 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -3,7 +3,6 @@ Utilities for saving and loading model/optim/state checkpoints.
 """
 import os
 import re
-import glob
 import json
 import logging
 import torch
@@ -115,19 +114,12 @@ def find_largest_model(checkpoint_dir):
 
 def find_last_step(checkpoint_dir):
     # Look into checkpoint_dir and find model_.pt with the highest step
-    checkpoint_files = glob.glob(os.path.join(checkpoint_dir, "model_*.pt"))
+    # fullmatch: the whole filename must be "model_" + digits + ".pt", so prefixed or suffixed names are ignored
+    checkpoint_files = [f for f in os.listdir(checkpoint_dir) if re.fullmatch(r'model_(\d+)\.pt', f)]
     if not checkpoint_files:
         raise FileNotFoundError(f"No checkpoints found in {checkpoint_dir}")
-    # Use regex to match only valid checkpoint files (model_.pt) and ignore malformed files
-    # This prevents crashes when files like model_000200_backup.pt exist in the directory
-    steps = []
-    for f in checkpoint_files:
-        match = re.match(r"model_(\d+)\.pt$", os.path.basename(f))
-        if match:
-            steps.append(int(match.group(1)))
-    if not steps:
-        raise ValueError(f"No valid checkpoint files found in {checkpoint_dir}")
-    last_step = max(steps)
+    # Convert to int before max(): comparing the digit strings is lexicographic ("999" > "1000")
+    last_step = max(int(re.fullmatch(r'model_(\d+)\.pt', f).group(1)) for f in checkpoint_files)
     return last_step
 
 # -----------------------------------------------------------------------------