diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py
index 2e9117c..5dece29 100644
--- a/dev/gen_synthetic_data.py
+++ b/dev/gen_synthetic_data.py
@@ -348,8 +348,7 @@ num_workers = 4
 
 output_file = get_base_dir() / "identity_conversations.jsonl"
 # Wipe the file clean first to reset it
-if output_file.exists():
-    output_file.unlink()
+output_file.unlink(missing_ok=True)
 print(f"Saving to {output_file}")
 
 # Use ThreadPoolExecutor to generate conversations in parallel
diff --git a/nanochat/dataset.py b/nanochat/dataset.py
index ef08d96..8bc5ce5 100644
--- a/nanochat/dataset.py
+++ b/nanochat/dataset.py
@@ -92,11 +92,10 @@ def download_single_file(index):
             print(f"Attempt {attempt}/{max_attempts} failed for {filename}: {e}")
             # Clean up any partial files
             for path in [Path(str(filepath) + ".tmp"), filepath]:
-                if path.exists():
-                    try:
-                        path.unlink()
-                    except:
-                        pass
+                try:
+                    path.unlink(missing_ok=True)
+                except:
+                    pass
             # Try a few times with exponential backoff: 2^attempt seconds
             if attempt < max_attempts:
                 wait_time = 2 ** attempt
diff --git a/nanochat/report.py b/nanochat/report.py
index 32dc028..ef19704 100644
--- a/nanochat/report.py
+++ b/nanochat/report.py
@@ -363,12 +363,10 @@ class Report:
         # Remove section files
         for file_name in EXPECTED_FILES:
             file_path = self.report_dir / file_name
-            if file_path.exists():
-                file_path.unlink()
+            file_path.unlink(missing_ok=True)
         # Remove report.md if it exists
         report_file = self.report_dir / "report.md"
-        if report_file.exists():
-            report_file.unlink()
+        report_file.unlink(missing_ok=True)
         # Generate and write the header section with start timestamp
         header_file = self.report_dir / "header.md"
         header = generate_header()