From c72b8b230966fa072f777c59e1b78eb83e39b3b0 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 3 Nov 2025 21:27:12 +0100
Subject: [PATCH 1/5] add explicit UTF-8 encoding

---
 nanochat/checkpoint_manager.py |  4 ++--
 nanochat/common.py             |  2 +-
 nanochat/report.py             | 12 ++++++------
 scripts/base_eval.py           |  4 ++--
 scripts/chat_web.py            |  2 +-
 tasks/customjson.py            |  2 +-
 tasks/spellingbee.py           |  4 ++--
 tests/test_rustbpe.py          |  4 ++--
 8 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index a9327c4..e1a7d91 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data)
         log0(f"Saved optimizer file to: {optimizer_path}")
     # Save the metadata dict as json
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "w") as f:
+    with open(meta_path, "w", encoding='utf-8') as f:
         json.dump(meta_data, f, indent=2)
     log0(f"Saved metadata file to: {meta_path}")
 
@@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False):
         optimizer_data = torch.load(optimizer_path, map_location=device)
     # Load the metadata
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "r") as f:
+    with open(meta_path, "r", encoding='utf-8') as f:
         meta_data = json.load(f)
     return model_data, optimizer_data, meta_data
 
diff --git a/nanochat/common.py b/nanochat/common.py
index 4e5fc06..ee02a6e 100644
--- a/nanochat/common.py
+++ b/nanochat/common.py
@@ -70,7 +70,7 @@ def download_file_with_lock(url, filename, postprocess_fn=None):
     if os.path.exists(file_path):
         return file_path
 
-    with open(lock_path, 'w') as lock_file:
+    with open(lock_path, 'w', encoding='utf-8') as lock_file:
 
         # Only a single rank can acquire this lock
         # All other ranks block until it is released
diff --git a/nanochat/report.py b/nanochat/report.py
index d0a65e0..2f65e9d 100644
--- a/nanochat/report.py
+++ b/nanochat/report.py
@@ -170,7 +170,7 @@ Generated: {timestamp}
     # count dependencies via uv.lock
     uv_lock_lines = 0
     if os.path.exists('uv.lock'):
-        with open('uv.lock', 'r') as f:
+        with open('uv.lock', 'r', encoding='utf-8') as f:
             uv_lock_lines = len(f.readlines())
 
     header += f"""
@@ -241,7 +241,7 @@ class Report:
         slug = slugify(section)
         file_name = f"{slug}.md"
         file_path = os.path.join(self.report_dir, file_name)
-        with open(file_path, "w") as f:
+        with open(file_path, "w", encoding='utf-8') as f:
             f.write(f"## {section}\n")
             f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
             for item in data:
@@ -272,11 +272,11 @@ class Report:
         final_metrics = {} # the most important final metrics we'll add as table at the end
         start_time = None
         end_time = None
-        with open(report_file, "w") as out_file:
+        with open(report_file, "w", encoding='utf-8') as out_file:
             # write the header first
             header_file = os.path.join(report_dir, "header.md")
             if os.path.exists(header_file):
-                with open(header_file, "r") as f:
+                with open(header_file, "r", encoding='utf-8') as f:
                     header_content = f.read()
                     out_file.write(header_content)
                     start_time = extract_timestamp(header_content, "Run started:")
@@ -293,7 +293,7 @@ class Report:
                 if not os.path.exists(section_file):
                     print(f"Warning: {section_file} does not exist, skipping")
                     continue
-                with open(section_file, "r") as in_file:
+                with open(section_file, "r", encoding='utf-8') as in_file:
                     section = in_file.read()
                 # Extract timestamp from this section (the last section's timestamp will "stick" as end_time)
                 if "rl" not in file_name:
@@ -373,7 +373,7 @@ class Report:
         header_file = os.path.join(self.report_dir, "header.md")
         header = generate_header()
         start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        with open(header_file, "w") as f:
+        with open(header_file, "w", encoding='utf-8') as f:
             f.write(header)
             f.write(f"Run started: {start_time}\n\n---\n\n")
         print(f"Reset report and wrote header to {header_file}")
diff --git a/scripts/base_eval.py b/scripts/base_eval.py
index a987049..3663538 100644
--- a/scripts/base_eval.py
+++ b/scripts/base_eval.py
@@ -59,7 +59,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
     config_path = os.path.join(eval_bundle_dir, "core.yaml")
     data_base_path = os.path.join(eval_bundle_dir, "eval_data")
     eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")
-    with open(config_path, 'r') as f:
+    with open(config_path, 'r', encoding='utf-8') as f:
         config = yaml.safe_load(f)
     tasks = config['icl_tasks']
 
@@ -193,7 +193,7 @@ def main():
         print0("="*80)
         print0(f"Model: {model_name}")
         print0("="*80)
-        with open(output_csv_path, 'r') as f:
+        with open(output_csv_path, 'r', encoding='utf-8') as f:
             print0(f.read())
 
     # Log to report
diff --git a/scripts/chat_web.py b/scripts/chat_web.py
index d7479c7..5d0b44a 100644
--- a/scripts/chat_web.py
+++ b/scripts/chat_web.py
@@ -243,7 +243,7 @@ app.add_middleware(
 async def root():
     """Serve the chat UI."""
     ui_html_path = os.path.join("nanochat", "ui.html")
-    with open(ui_html_path, "r") as f:
+    with open(ui_html_path, "r", encoding='utf-8') as f:
         html_content = f.read()
     # Replace the API_URL to use the same origin
     html_content = html_content.replace(
diff --git a/tasks/customjson.py b/tasks/customjson.py
index f4683c8..e1b5f0b 100644
--- a/tasks/customjson.py
+++ b/tasks/customjson.py
@@ -32,7 +32,7 @@ class CustomJSON(Task):
             print("-" * 80)
 
         else:
-            with open(filepath, 'r') as f:
+            with open(filepath, 'r', encoding='utf-8') as f:
                 for line in f:
                     line = line.strip()
                     if not line:  # skip empty lines
diff --git a/tasks/spellingbee.py b/tasks/spellingbee.py
index c051fe7..3b45305 100644
--- a/tasks/spellingbee.py
+++ b/tasks/spellingbee.py
@@ -119,7 +119,7 @@ class SpellingBee(Task):
         self.split = split
         filename = WORD_LIST_URL.split("/")[-1]
         word_list_path = download_file_with_lock(WORD_LIST_URL, filename)
-        with open(word_list_path) as f:
+        with open(word_list_path, 'r', encoding='utf-8') as f:
             words = [line.strip() for line in f]
         self.words = words
 
@@ -238,7 +238,7 @@ class SimpleSpelling(Task):
         self.split = split
         filename = WORD_LIST_URL.split("/")[-1]
         word_list_path = download_file_with_lock(WORD_LIST_URL, filename)
-        with open(word_list_path) as f:
+        with open(word_list_path, 'r', encoding='utf-8') as f:
             words = [line.strip() for line in f]
         rng = random.Random(42)
         rng.shuffle(words) # use a different word order than the SpellingBee task
diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py
index 5f95721..bad3c92 100644
--- a/tests/test_rustbpe.py
+++ b/tests/test_rustbpe.py
@@ -455,13 +455,13 @@ def enwik8_path():
 @pytest.fixture(scope="module")
 def enwik8_small(enwik8_path):
     """Fixture providing 100KB of enwik8 for quick tests."""
-    with open(enwik8_path, "r") as f:
+    with open(enwik8_path, "r", encoding='utf-8') as f:
         return f.read(100_000)
 
 @pytest.fixture(scope="module")
 def enwik8_large(enwik8_path):
     """Fixture providing 10MB of enwik8 for performance tests."""
-    with open(enwik8_path, "r") as f:
+    with open(enwik8_path, "r", encoding='utf-8') as f:
         return f.read(10**7)
 
 def time_function(func, *args, **kwargs):

From e22fc6f2fac0c3d5f3ecd3ba6b09f7d694014b64 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 3 Nov 2025 21:46:39 +0100
Subject: [PATCH 2/5] few more explicit UTF-8 encodings

---
 dev/gen_synthetic_data.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py
index 13e5f55..73f4ac9 100644
--- a/dev/gen_synthetic_data.py
+++ b/dev/gen_synthetic_data.py
@@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from nanochat.common import get_base_dir
 
-api_key = open("openroutertoken.txt").read().strip()
+api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip()
 
 url = "https://openrouter.ai/api/v1/chat/completions"
 headers = {
@@ -45,7 +45,7 @@ headers = {
   "Content-Type": "application/json"
 }
 
-readme = open("README.md").read().strip()
+readme = open("README.md", 'r', encoding='utf-8').read().strip()
 prompt = r"""
 I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want:
 

From 2ce62ec07693a30d25264514fbaae0b918bfb200 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Mon, 3 Nov 2025 21:52:02 +0100
Subject: [PATCH 3/5] ensure consistency of quotes within each statement

---
 dev/gen_synthetic_data.py      |  4 ++--
 nanochat/checkpoint_manager.py |  4 ++--
 nanochat/report.py             | 10 +++++-----
 scripts/chat_web.py            |  2 +-
 tests/test_rustbpe.py          |  4 ++--
 5 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py
index 73f4ac9..068824f 100644
--- a/dev/gen_synthetic_data.py
+++ b/dev/gen_synthetic_data.py
@@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed
 
 from nanochat.common import get_base_dir
 
-api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip()
+api_key = open("openroutertoken.txt", "r", encoding="utf-8").read().strip()
 
 url = "https://openrouter.ai/api/v1/chat/completions"
 headers = {
@@ -45,7 +45,7 @@ headers = {
   "Content-Type": "application/json"
 }
 
-readme = open("README.md", 'r', encoding='utf-8').read().strip()
+readme = open("README.md", "r", encoding="utf-8").read().strip()
 prompt = r"""
 I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want:
 
diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index e1a7d91..378b0ed 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data)
         log0(f"Saved optimizer file to: {optimizer_path}")
     # Save the metadata dict as json
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "w", encoding='utf-8') as f:
+    with open(meta_path, "w", encoding="utf-8") as f:
         json.dump(meta_data, f, indent=2)
     log0(f"Saved metadata file to: {meta_path}")
 
@@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False):
         optimizer_data = torch.load(optimizer_path, map_location=device)
     # Load the metadata
     meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json")
-    with open(meta_path, "r", encoding='utf-8') as f:
+    with open(meta_path, "r", encoding="utf-8") as f:
         meta_data = json.load(f)
     return model_data, optimizer_data, meta_data
 
diff --git a/nanochat/report.py b/nanochat/report.py
index 2f65e9d..0b0ebd7 100644
--- a/nanochat/report.py
+++ b/nanochat/report.py
@@ -241,7 +241,7 @@ class Report:
         slug = slugify(section)
         file_name = f"{slug}.md"
         file_path = os.path.join(self.report_dir, file_name)
-        with open(file_path, "w", encoding='utf-8') as f:
+        with open(file_path, "w", encoding="utf-8") as f:
             f.write(f"## {section}\n")
             f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
             for item in data:
@@ -272,11 +272,11 @@ class Report:
         final_metrics = {} # the most important final metrics we'll add as table at the end
         start_time = None
         end_time = None
-        with open(report_file, "w", encoding='utf-8') as out_file:
+        with open(report_file, "w", encoding="utf-8") as out_file:
             # write the header first
             header_file = os.path.join(report_dir, "header.md")
             if os.path.exists(header_file):
-                with open(header_file, "r", encoding='utf-8') as f:
+                with open(header_file, "r", encoding="utf-8") as f:
                     header_content = f.read()
                     out_file.write(header_content)
                     start_time = extract_timestamp(header_content, "Run started:")
@@ -293,7 +293,7 @@ class Report:
                 if not os.path.exists(section_file):
                     print(f"Warning: {section_file} does not exist, skipping")
                     continue
-                with open(section_file, "r", encoding='utf-8') as in_file:
+                with open(section_file, "r", encoding="utf-8") as in_file:
                     section = in_file.read()
                 # Extract timestamp from this section (the last section's timestamp will "stick" as end_time)
                 if "rl" not in file_name:
@@ -373,7 +373,7 @@ class Report:
         header_file = os.path.join(self.report_dir, "header.md")
         header = generate_header()
         start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-        with open(header_file, "w", encoding='utf-8') as f:
+        with open(header_file, "w", encoding="utf-8") as f:
             f.write(header)
             f.write(f"Run started: {start_time}\n\n---\n\n")
         print(f"Reset report and wrote header to {header_file}")
diff --git a/scripts/chat_web.py b/scripts/chat_web.py
index 5d0b44a..4b67b62 100644
--- a/scripts/chat_web.py
+++ b/scripts/chat_web.py
@@ -243,7 +243,7 @@ app.add_middleware(
 async def root():
     """Serve the chat UI."""
     ui_html_path = os.path.join("nanochat", "ui.html")
-    with open(ui_html_path, "r", encoding='utf-8') as f:
+    with open(ui_html_path, "r", encoding="utf-8") as f:
         html_content = f.read()
     # Replace the API_URL to use the same origin
     html_content = html_content.replace(
diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py
index bad3c92..aca67fc 100644
--- a/tests/test_rustbpe.py
+++ b/tests/test_rustbpe.py
@@ -455,13 +455,13 @@ def enwik8_path():
 @pytest.fixture(scope="module")
 def enwik8_small(enwik8_path):
     """Fixture providing 100KB of enwik8 for quick tests."""
-    with open(enwik8_path, "r", encoding='utf-8') as f:
+    with open(enwik8_path, "r", encoding="utf-8") as f:
         return f.read(100_000)
 
 @pytest.fixture(scope="module")
 def enwik8_large(enwik8_path):
     """Fixture providing 10MB of enwik8 for performance tests."""
-    with open(enwik8_path, "r", encoding='utf-8') as f:
+    with open(enwik8_path, "r", encoding="utf-8") as f:
         return f.read(10**7)
 
 def time_function(func, *args, **kwargs):

From 7a40ee77b4695ccb7350a679230eb6a7f8a6ae29 Mon Sep 17 00:00:00 2001
From: Dipesh Babu <dipeshmahato@outlook.com>
Date: Mon, 3 Nov 2025 16:00:56 -0500
Subject: [PATCH 4/5] fix: cast bf16 to fp32 on MPS (like CPU) to avoid dtype
 issues

---
 nanochat/checkpoint_manager.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py
index a9327c4..2fcb01b 100644
--- a/nanochat/checkpoint_manager.py
+++ b/nanochat/checkpoint_manager.py
@@ -65,7 +65,7 @@ def build_model(checkpoint_dir, step, device, phase):
     """
     assert phase in ["train", "eval"], f"Invalid phase: {phase}"
     model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False)
-    if device.type == "cpu":
-        # Convert bfloat16 tensors to float for CPU inference
+    if device.type in {"cpu", "mps"}:
+        # Convert bfloat16 tensors to float for CPU/MPS inference
         model_data = {
             k: v.float() if v.dtype == torch.bfloat16 else v

From f1683c5b1643c255d59903870eec91e17d5bf801 Mon Sep 17 00:00:00 2001
From: svlandeg <svlandeg@github.com>
Date: Tue, 4 Nov 2025 21:36:10 +0100
Subject: [PATCH 5/5] set nproc_per_node as var in speedrun and run1000 scripts

---
 run1000.sh  | 18 +++++++++++-------
 speedrun.sh | 21 ++++++++++++---------
 2 files changed, 23 insertions(+), 16 deletions(-)

diff --git a/run1000.sh b/run1000.sh
index 46325d9..58ee3bc 100644
--- a/run1000.sh
+++ b/run1000.sh
@@ -70,18 +70,22 @@ python -m scripts.tok_eval
 # which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd
 # start to overfit hard.
 # 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script.
-torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
-torchrun --standalone --nproc_per_node=8 -m scripts.base_eval
+
+# Number of worker processes torchrun launches on this node (typically one per GPU)
+NPROC_PER_NODE=8
+
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
 
 # midtrain
 # NOTE: ensure that we use the same device_batch_size here as the base training script.
-torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid
 
 # sft
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
 
 # generate final report
 python -m nanochat.report generate
diff --git a/speedrun.sh b/speedrun.sh
index 32c8870..7955ec5 100644
--- a/speedrun.sh
+++ b/speedrun.sh
@@ -82,12 +82,15 @@ python -m scripts.tok_eval
 echo "Waiting for dataset download to complete..."
 wait $DATASET_DOWNLOAD_PID
 
+# Number of worker processes torchrun launches on this node (typically one per GPU)
+NPROC_PER_NODE=8
+
 # pretrain the d20 model
-torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=20 --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --run=$WANDB_RUN
 # evaluate the model on a larger chunk of train/val data and draw some samples
-torchrun --standalone --nproc_per_node=8 -m scripts.base_loss
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss
 # evaluate the model on CORE tasks
-torchrun --standalone --nproc_per_node=8 -m scripts.base_eval
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval
 
 # -----------------------------------------------------------------------------
 # Midtraining (teach the model conversation special tokens, tool use, multiple choice)
@@ -97,15 +100,15 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_eval
 curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
 
 # run midtraining and eval the model
-torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid
 
 # -----------------------------------------------------------------------------
 # Supervised Finetuning (domain adaptation to each sequence all by itself per row)
 
 # train sft and re-eval right away (should see a small bump)
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN
-torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN
+torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft
 
 # chat with the model over CLI! Leave out the -p to chat interactively
 # python -m scripts.chat_cli -p "Why is the sky blue?"
@@ -118,9 +121,9 @@ torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
 # (optional)
 
 # run reinforcement learning
-# torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN
+# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_rl -- --run=$WANDB_RUN
 # eval the RL model only on GSM8K
-# torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K
+# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i rl -a GSM8K
 
 # -----------------------------------------------------------------------------
 # Generate the full report by putting together all the sections