From c72b8b230966fa072f777c59e1b78eb83e39b3b0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Nov 2025 21:27:12 +0100 Subject: [PATCH 1/3] add explicit UTF-8 encoding --- nanochat/checkpoint_manager.py | 4 ++-- nanochat/common.py | 2 +- nanochat/report.py | 12 ++++++------ scripts/base_eval.py | 4 ++-- scripts/chat_web.py | 2 +- tasks/customjson.py | 2 +- tasks/spellingbee.py | 4 ++-- tests/test_rustbpe.py | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index a9327c4..e1a7d91 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data) log0(f"Saved optimizer file to: {optimizer_path}") # Save the metadata dict as json meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "w") as f: + with open(meta_path, "w", encoding='utf-8') as f: json.dump(meta_data, f, indent=2) log0(f"Saved metadata file to: {meta_path}") @@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False): optimizer_data = torch.load(optimizer_path, map_location=device) # Load the metadata meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "r") as f: + with open(meta_path, "r", encoding='utf-8') as f: meta_data = json.load(f) return model_data, optimizer_data, meta_data diff --git a/nanochat/common.py b/nanochat/common.py index 4e5fc06..ee02a6e 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -70,7 +70,7 @@ def download_file_with_lock(url, filename, postprocess_fn=None): if os.path.exists(file_path): return file_path - with open(lock_path, 'w') as lock_file: + with open(lock_path, 'w', encoding='utf-8') as lock_file: # Only a single rank can acquire this lock # All other ranks block until it is released diff --git a/nanochat/report.py b/nanochat/report.py index d0a65e0..2f65e9d 100644 --- a/nanochat/report.py +++ b/nanochat/report.py @@ -170,7 +170,7 @@ Generated: {timestamp} # count dependencies via uv.lock uv_lock_lines = 0 if os.path.exists('uv.lock'): - with open('uv.lock', 'r') as f: + with open('uv.lock', 'r', encoding='utf-8') as f: uv_lock_lines = len(f.readlines()) header += f""" @@ -241,7 +241,7 @@ class Report: slug = slugify(section) file_name = f"{slug}.md" file_path = os.path.join(self.report_dir, file_name) - with open(file_path, "w") as f: + with open(file_path, "w", encoding='utf-8') as f: f.write(f"## {section}\n") f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") for item in data: @@ -272,11 +272,11 @@ class Report: final_metrics = {} # the most important final metrics we'll add as table at the end start_time = None end_time = None - with open(report_file, "w") as out_file: + with open(report_file, "w", encoding='utf-8') as out_file: # write the header first header_file = os.path.join(report_dir, "header.md") if os.path.exists(header_file): - with open(header_file, "r") as f: + with open(header_file, "r", encoding='utf-8') as f: header_content = f.read() out_file.write(header_content) start_time = extract_timestamp(header_content, "Run started:") @@ -293,7 +293,7 @@ class Report: if not os.path.exists(section_file): print(f"Warning: {section_file} does not exist, skipping") continue - with open(section_file, "r") as in_file: + with open(section_file, "r", encoding='utf-8') as in_file: section = in_file.read() # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) if "rl" not in file_name: @@ -373,7 +373,7 @@ class Report: header_file = os.path.join(self.report_dir, "header.md") header = generate_header() start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(header_file, "w") as f: + with open(header_file, "w", encoding='utf-8') as f: f.write(header) f.write(f"Run started: {start_time}\n\n---\n\n") print(f"Reset report and wrote header to {header_file}") diff --git a/scripts/base_eval.py b/scripts/base_eval.py index a987049..3663538 100644 --- a/scripts/base_eval.py +++ b/scripts/base_eval.py @@ -59,7 +59,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): config_path = os.path.join(eval_bundle_dir, "core.yaml") data_base_path = os.path.join(eval_bundle_dir, "eval_data") eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") - with open(config_path, 'r') as f: + with open(config_path, 'r', encoding='utf-8') as f: config = yaml.safe_load(f) tasks = config['icl_tasks'] @@ -193,7 +193,7 @@ def main(): print0("="*80) print0(f"Model: {model_name}") print0("="*80) - with open(output_csv_path, 'r') as f: + with open(output_csv_path, 'r', encoding='utf-8') as f: print0(f.read()) # Log to report diff --git a/scripts/chat_web.py b/scripts/chat_web.py index d7479c7..5d0b44a 100644 --- a/scripts/chat_web.py +++ b/scripts/chat_web.py @@ -243,7 +243,7 @@ app.add_middleware( async def root(): """Serve the chat UI.""" ui_html_path = os.path.join("nanochat", "ui.html") - with open(ui_html_path, "r") as f: + with open(ui_html_path, "r", encoding='utf-8') as f: html_content = f.read() # Replace the API_URL to use the same origin html_content = html_content.replace( diff --git a/tasks/customjson.py b/tasks/customjson.py index f4683c8..e1b5f0b 100644 --- a/tasks/customjson.py +++ b/tasks/customjson.py @@ -32,7 +32,7 @@ class CustomJSON(Task): print("-" * 80) else: - with open(filepath, 'r') as f: + with open(filepath, 'r', encoding='utf-8') as f: for line in f: line = line.strip() if not line: # skip empty lines diff --git a/tasks/spellingbee.py b/tasks/spellingbee.py index c051fe7..3b45305 100644 --- a/tasks/spellingbee.py +++ b/tasks/spellingbee.py @@ -119,7 +119,7 @@ class SpellingBee(Task): self.split = split filename = WORD_LIST_URL.split("/")[-1] word_list_path = download_file_with_lock(WORD_LIST_URL, filename) - with open(word_list_path) as f: + with open(word_list_path, 'r', encoding='utf-8') as f: words = [line.strip() for line in f] self.words = words @@ -238,7 +238,7 @@ class SimpleSpelling(Task): self.split = split filename = WORD_LIST_URL.split("/")[-1] word_list_path = download_file_with_lock(WORD_LIST_URL, filename) - with open(word_list_path) as f: + with open(word_list_path, 'r', encoding='utf-8') as f: words = [line.strip() for line in f] rng = random.Random(42) rng.shuffle(words) # use a different word order than the SpellingBee task diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py index 5f95721..bad3c92 100644 --- a/tests/test_rustbpe.py +++ b/tests/test_rustbpe.py @@ -455,13 +455,13 @@ def enwik8_path(): @pytest.fixture(scope="module") def enwik8_small(enwik8_path): """Fixture providing 100KB of enwik8 for quick tests.""" - with open(enwik8_path, "r") as f: + with open(enwik8_path, "r", encoding='utf-8') as f: return f.read(100_000) @pytest.fixture(scope="module") def enwik8_large(enwik8_path): """Fixture providing 10MB of enwik8 for performance tests.""" - with open(enwik8_path, "r") as f: + with open(enwik8_path, "r", encoding='utf-8') as f: return f.read(10**7) def time_function(func, *args, **kwargs): From e22fc6f2fac0c3d5f3ecd3ba6b09f7d694014b64 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Nov 2025 21:46:39 +0100 Subject: [PATCH 2/3] few more explicit UTF-8 encodings --- dev/gen_synthetic_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py index 13e5f55..73f4ac9 100644 --- a/dev/gen_synthetic_data.py +++ b/dev/gen_synthetic_data.py @@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from nanochat.common import get_base_dir -api_key = open("openroutertoken.txt").read().strip() +api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip() url = "https://openrouter.ai/api/v1/chat/completions" headers = { @@ -45,7 +45,7 @@ headers = { "Content-Type": "application/json" } -readme = open("README.md").read().strip() +readme = open("README.md", 'r', encoding='utf-8').read().strip() prompt = r""" I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want: From 2ce62ec07693a30d25264514fbaae0b918bfb200 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Nov 2025 21:52:02 +0100 Subject: [PATCH 3/3] ensure consistency of quotes within each statement --- dev/gen_synthetic_data.py | 4 ++-- nanochat/checkpoint_manager.py | 4 ++-- nanochat/report.py | 10 +++++----- scripts/chat_web.py | 2 +- tests/test_rustbpe.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py index 73f4ac9..068824f 100644 --- a/dev/gen_synthetic_data.py +++ b/dev/gen_synthetic_data.py @@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from nanochat.common import get_base_dir -api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip() +api_key = open("openroutertoken.txt", "r", encoding="utf-8").read().strip() url = "https://openrouter.ai/api/v1/chat/completions" headers = { @@ -45,7 +45,7 @@ headers = { "Content-Type": "application/json" } -readme = open("README.md", 'r', encoding='utf-8').read().strip() +readme = open("README.md", "r", encoding="utf-8").read().strip() prompt = r""" I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want: diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index e1a7d91..378b0ed 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data) log0(f"Saved optimizer file to: {optimizer_path}") # Save the metadata dict as json meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "w", encoding='utf-8') as f: + with open(meta_path, "w", encoding="utf-8") as f: json.dump(meta_data, f, indent=2) log0(f"Saved metadata file to: {meta_path}") @@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False): optimizer_data = torch.load(optimizer_path, map_location=device) # Load the metadata meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "r", encoding='utf-8') as f: + with open(meta_path, "r", encoding="utf-8") as f: meta_data = json.load(f) return model_data, optimizer_data, meta_data diff --git a/nanochat/report.py b/nanochat/report.py index 2f65e9d..0b0ebd7 100644 --- a/nanochat/report.py +++ b/nanochat/report.py @@ -241,7 +241,7 @@ class Report: slug = slugify(section) file_name = f"{slug}.md" file_path = os.path.join(self.report_dir, file_name) - with open(file_path, "w", encoding='utf-8') as f: + with open(file_path, "w", encoding="utf-8") as f: f.write(f"## {section}\n") f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") for item in data: @@ -272,11 +272,11 @@ class Report: final_metrics = {} # the most important final metrics we'll add as table at the end start_time = None end_time = None - with open(report_file, "w", encoding='utf-8') as out_file: + with open(report_file, "w", encoding="utf-8") as out_file: # write the header first header_file = os.path.join(report_dir, "header.md") if os.path.exists(header_file): - with open(header_file, "r", encoding='utf-8') as f: + with open(header_file, "r", encoding="utf-8") as f: header_content = f.read() out_file.write(header_content) start_time = extract_timestamp(header_content, "Run started:") @@ -293,7 +293,7 @@ class Report: if not os.path.exists(section_file): print(f"Warning: {section_file} does not exist, skipping") continue - with open(section_file, "r", encoding='utf-8') as in_file: + with open(section_file, "r", encoding="utf-8") as in_file: section = in_file.read() # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) if "rl" not in file_name: @@ -373,7 +373,7 @@ class Report: header_file = os.path.join(self.report_dir, "header.md") header = generate_header() start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(header_file, "w", encoding='utf-8') as f: + with open(header_file, "w", encoding="utf-8") as f: f.write(header) f.write(f"Run started: {start_time}\n\n---\n\n") print(f"Reset report and wrote header to {header_file}") diff --git a/scripts/chat_web.py b/scripts/chat_web.py index 5d0b44a..4b67b62 100644 --- a/scripts/chat_web.py +++ b/scripts/chat_web.py @@ -243,7 +243,7 @@ app.add_middleware( async def root(): """Serve the chat UI.""" ui_html_path = os.path.join("nanochat", "ui.html") - with open(ui_html_path, "r", encoding='utf-8') as f: + with open(ui_html_path, "r", encoding="utf-8") as f: html_content = f.read() # Replace the API_URL to use the same origin html_content = html_content.replace( diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py index bad3c92..aca67fc 100644 --- a/tests/test_rustbpe.py +++ b/tests/test_rustbpe.py @@ -455,13 +455,13 @@ def enwik8_path(): @pytest.fixture(scope="module") def enwik8_small(enwik8_path): """Fixture providing 100KB of enwik8 for quick tests.""" - with open(enwik8_path, "r", encoding='utf-8') as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(100_000) @pytest.fixture(scope="module") def enwik8_large(enwik8_path): """Fixture providing 10MB of enwik8 for performance tests.""" - with open(enwik8_path, "r", encoding='utf-8') as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(10**7) def time_function(func, *args, **kwargs):