From c72b8b230966fa072f777c59e1b78eb83e39b3b0 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Nov 2025 21:27:12 +0100 Subject: [PATCH 1/5] add explicit UTF-8 encoding --- nanochat/checkpoint_manager.py | 4 ++-- nanochat/common.py | 2 +- nanochat/report.py | 12 ++++++------ scripts/base_eval.py | 4 ++-- scripts/chat_web.py | 2 +- tasks/customjson.py | 2 +- tasks/spellingbee.py | 4 ++-- tests/test_rustbpe.py | 4 ++-- 8 files changed, 17 insertions(+), 17 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index a9327c4..e1a7d91 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data) log0(f"Saved optimizer file to: {optimizer_path}") # Save the metadata dict as json meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "w") as f: + with open(meta_path, "w", encoding='utf-8') as f: json.dump(meta_data, f, indent=2) log0(f"Saved metadata file to: {meta_path}") @@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False): optimizer_data = torch.load(optimizer_path, map_location=device) # Load the metadata meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "r") as f: + with open(meta_path, "r", encoding='utf-8') as f: meta_data = json.load(f) return model_data, optimizer_data, meta_data diff --git a/nanochat/common.py b/nanochat/common.py index 4e5fc06..ee02a6e 100644 --- a/nanochat/common.py +++ b/nanochat/common.py @@ -70,7 +70,7 @@ def download_file_with_lock(url, filename, postprocess_fn=None): if os.path.exists(file_path): return file_path - with open(lock_path, 'w') as lock_file: + with open(lock_path, 'w', encoding='utf-8') as lock_file: # Only a single rank can acquire this lock # All other ranks block until it is released diff --git a/nanochat/report.py b/nanochat/report.py index d0a65e0..2f65e9d 100644 --- a/nanochat/report.py +++ b/nanochat/report.py @@ -170,7 +170,7 @@ Generated: {timestamp} # count dependencies via uv.lock uv_lock_lines = 0 if os.path.exists('uv.lock'): - with open('uv.lock', 'r') as f: + with open('uv.lock', 'r', encoding='utf-8') as f: uv_lock_lines = len(f.readlines()) header += f""" @@ -241,7 +241,7 @@ class Report: slug = slugify(section) file_name = f"{slug}.md" file_path = os.path.join(self.report_dir, file_name) - with open(file_path, "w") as f: + with open(file_path, "w", encoding='utf-8') as f: f.write(f"## {section}\n") f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") for item in data: @@ -272,11 +272,11 @@ class Report: final_metrics = {} # the most important final metrics we'll add as table at the end start_time = None end_time = None - with open(report_file, "w") as out_file: + with open(report_file, "w", encoding='utf-8') as out_file: # write the header first header_file = os.path.join(report_dir, "header.md") if os.path.exists(header_file): - with open(header_file, "r") as f: + with open(header_file, "r", encoding='utf-8') as f: header_content = f.read() out_file.write(header_content) start_time = extract_timestamp(header_content, "Run started:") @@ -293,7 +293,7 @@ class Report: if not os.path.exists(section_file): print(f"Warning: {section_file} does not exist, skipping") continue - with open(section_file, "r") as in_file: + with open(section_file, "r", encoding='utf-8') as in_file: section = in_file.read() # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) if "rl" not in file_name: @@ -373,7 +373,7 @@ class Report: header_file = os.path.join(self.report_dir, "header.md") header = generate_header() start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(header_file, "w") as f: + with open(header_file, "w", encoding='utf-8') as f: f.write(header) f.write(f"Run started: {start_time}\n\n---\n\n") print(f"Reset report and wrote header to {header_file}") diff --git a/scripts/base_eval.py b/scripts/base_eval.py index a987049..3663538 100644 --- a/scripts/base_eval.py +++ b/scripts/base_eval.py @@ -59,7 +59,7 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1): config_path = os.path.join(eval_bundle_dir, "core.yaml") data_base_path = os.path.join(eval_bundle_dir, "eval_data") eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv") - with open(config_path, 'r') as f: + with open(config_path, 'r', encoding='utf-8') as f: config = yaml.safe_load(f) tasks = config['icl_tasks'] @@ -193,7 +193,7 @@ def main(): print0("="*80) print0(f"Model: {model_name}") print0("="*80) - with open(output_csv_path, 'r') as f: + with open(output_csv_path, 'r', encoding='utf-8') as f: print0(f.read()) # Log to report diff --git a/scripts/chat_web.py b/scripts/chat_web.py index d7479c7..5d0b44a 100644 --- a/scripts/chat_web.py +++ b/scripts/chat_web.py @@ -243,7 +243,7 @@ app.add_middleware( async def root(): """Serve the chat UI.""" ui_html_path = os.path.join("nanochat", "ui.html") - with open(ui_html_path, "r") as f: + with open(ui_html_path, "r", encoding='utf-8') as f: html_content = f.read() # Replace the API_URL to use the same origin html_content = html_content.replace( diff --git a/tasks/customjson.py b/tasks/customjson.py index f4683c8..e1b5f0b 100644 --- a/tasks/customjson.py +++ b/tasks/customjson.py @@ -32,7 +32,7 @@ class CustomJSON(Task): print("-" * 80) else: - with open(filepath, 'r') as f: + with open(filepath, 'r', encoding='utf-8') as f: for line in f: line = line.strip() if not line: # skip empty lines diff --git a/tasks/spellingbee.py b/tasks/spellingbee.py index c051fe7..3b45305 100644 --- a/tasks/spellingbee.py +++ b/tasks/spellingbee.py @@ -119,7 +119,7 @@ class SpellingBee(Task): self.split = split filename = WORD_LIST_URL.split("/")[-1] word_list_path = download_file_with_lock(WORD_LIST_URL, filename) - with open(word_list_path) as f: + with open(word_list_path, 'r', encoding='utf-8') as f: words = [line.strip() for line in f] self.words = words @@ -238,7 +238,7 @@ class SimpleSpelling(Task): self.split = split filename = WORD_LIST_URL.split("/")[-1] word_list_path = download_file_with_lock(WORD_LIST_URL, filename) - with open(word_list_path) as f: + with open(word_list_path, 'r', encoding='utf-8') as f: words = [line.strip() for line in f] rng = random.Random(42) rng.shuffle(words) # use a different word order than the SpellingBee task diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py index 5f95721..bad3c92 100644 --- a/tests/test_rustbpe.py +++ b/tests/test_rustbpe.py @@ -455,13 +455,13 @@ def enwik8_path(): @pytest.fixture(scope="module") def enwik8_small(enwik8_path): """Fixture providing 100KB of enwik8 for quick tests.""" - with open(enwik8_path, "r") as f: + with open(enwik8_path, "r", encoding='utf-8') as f: return f.read(100_000) @pytest.fixture(scope="module") def enwik8_large(enwik8_path): """Fixture providing 10MB of enwik8 for performance tests.""" - with open(enwik8_path, "r") as f: + with open(enwik8_path, "r", encoding='utf-8') as f: return f.read(10**7) def time_function(func, *args, **kwargs): From e22fc6f2fac0c3d5f3ecd3ba6b09f7d694014b64 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Nov 2025 21:46:39 +0100 Subject: [PATCH 2/5] few more explicit UTF-8 encodings --- dev/gen_synthetic_data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py index 13e5f55..73f4ac9 100644 --- a/dev/gen_synthetic_data.py +++ b/dev/gen_synthetic_data.py @@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from nanochat.common import get_base_dir -api_key = open("openroutertoken.txt").read().strip() +api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip() url = "https://openrouter.ai/api/v1/chat/completions" headers = { @@ -45,7 +45,7 @@ headers = { "Content-Type": "application/json" } -readme = open("README.md").read().strip() +readme = open("README.md", 'r', encoding='utf-8').read().strip() prompt = r""" I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want: From 2ce62ec07693a30d25264514fbaae0b918bfb200 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Mon, 3 Nov 2025 21:52:02 +0100 Subject: [PATCH 3/5] ensure consistency of quotes within each statement --- dev/gen_synthetic_data.py | 4 ++-- nanochat/checkpoint_manager.py | 4 ++-- nanochat/report.py | 10 +++++----- scripts/chat_web.py | 2 +- tests/test_rustbpe.py | 4 ++-- 5 files changed, 12 insertions(+), 12 deletions(-) diff --git a/dev/gen_synthetic_data.py b/dev/gen_synthetic_data.py index 73f4ac9..068824f 100644 --- a/dev/gen_synthetic_data.py +++ b/dev/gen_synthetic_data.py @@ -37,7 +37,7 @@ from concurrent.futures import ThreadPoolExecutor, as_completed from nanochat.common import get_base_dir -api_key = open("openroutertoken.txt", 'r', encoding='utf-8').read().strip() +api_key = open("openroutertoken.txt", "r", encoding="utf-8").read().strip() url = "https://openrouter.ai/api/v1/chat/completions" headers = { @@ -45,7 +45,7 @@ headers = { "Content-Type": "application/json" } -readme = open("README.md", 'r', encoding='utf-8').read().strip() +readme = open("README.md", "r", encoding="utf-8").read().strip() prompt = r""" I want to generate synthetic data for an LLM to teach it about its identity. Here is the identity I want: diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index e1a7d91..378b0ed 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -34,7 +34,7 @@ def save_checkpoint(checkpoint_dir, step, model_data, optimizer_data, meta_data) log0(f"Saved optimizer file to: {optimizer_path}") # Save the metadata dict as json meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "w", encoding='utf-8') as f: + with open(meta_path, "w", encoding="utf-8") as f: json.dump(meta_data, f, indent=2) log0(f"Saved metadata file to: {meta_path}") @@ -50,7 +50,7 @@ def load_checkpoint(checkpoint_dir, step, device, load_optimizer=False): optimizer_data = torch.load(optimizer_path, map_location=device) # Load the metadata meta_path = os.path.join(checkpoint_dir, f"meta_{step:06d}.json") - with open(meta_path, "r", encoding='utf-8') as f: + with open(meta_path, "r", encoding="utf-8") as f: meta_data = json.load(f) return model_data, optimizer_data, meta_data diff --git a/nanochat/report.py b/nanochat/report.py index 2f65e9d..0b0ebd7 100644 --- a/nanochat/report.py +++ b/nanochat/report.py @@ -241,7 +241,7 @@ class Report: slug = slugify(section) file_name = f"{slug}.md" file_path = os.path.join(self.report_dir, file_name) - with open(file_path, "w", encoding='utf-8') as f: + with open(file_path, "w", encoding="utf-8") as f: f.write(f"## {section}\n") f.write(f"timestamp: {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n") for item in data: @@ -272,11 +272,11 @@ class Report: final_metrics = {} # the most important final metrics we'll add as table at the end start_time = None end_time = None - with open(report_file, "w", encoding='utf-8') as out_file: + with open(report_file, "w", encoding="utf-8") as out_file: # write the header first header_file = os.path.join(report_dir, "header.md") if os.path.exists(header_file): - with open(header_file, "r", encoding='utf-8') as f: + with open(header_file, "r", encoding="utf-8") as f: header_content = f.read() out_file.write(header_content) start_time = extract_timestamp(header_content, "Run started:") @@ -293,7 +293,7 @@ class Report: if not os.path.exists(section_file): print(f"Warning: {section_file} does not exist, skipping") continue - with open(section_file, "r", encoding='utf-8') as in_file: + with open(section_file, "r", encoding="utf-8") as in_file: section = in_file.read() # Extract timestamp from this section (the last section's timestamp will "stick" as end_time) if "rl" not in file_name: @@ -373,7 +373,7 @@ class Report: header_file = os.path.join(self.report_dir, "header.md") header = generate_header() start_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") - with open(header_file, "w", encoding='utf-8') as f: + with open(header_file, "w", encoding="utf-8") as f: f.write(header) f.write(f"Run started: {start_time}\n\n---\n\n") print(f"Reset report and wrote header to {header_file}") diff --git a/scripts/chat_web.py b/scripts/chat_web.py index 5d0b44a..4b67b62 100644 --- a/scripts/chat_web.py +++ b/scripts/chat_web.py @@ -243,7 +243,7 @@ app.add_middleware( async def root(): """Serve the chat UI.""" ui_html_path = os.path.join("nanochat", "ui.html") - with open(ui_html_path, "r", encoding='utf-8') as f: + with open(ui_html_path, "r", encoding="utf-8") as f: html_content = f.read() # Replace the API_URL to use the same origin html_content = html_content.replace( diff --git a/tests/test_rustbpe.py b/tests/test_rustbpe.py index bad3c92..aca67fc 100644 --- a/tests/test_rustbpe.py +++ b/tests/test_rustbpe.py @@ -455,13 +455,13 @@ def enwik8_path(): @pytest.fixture(scope="module") def enwik8_small(enwik8_path): """Fixture providing 100KB of enwik8 for quick tests.""" - with open(enwik8_path, "r", encoding='utf-8') as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(100_000) @pytest.fixture(scope="module") def enwik8_large(enwik8_path): """Fixture providing 10MB of enwik8 for performance tests.""" - with open(enwik8_path, "r", encoding='utf-8') as f: + with open(enwik8_path, "r", encoding="utf-8") as f: return f.read(10**7) def time_function(func, *args, **kwargs): From 7a40ee77b4695ccb7350a679230eb6a7f8a6ae29 Mon Sep 17 00:00:00 2001 From: Dipesh Babu Date: Mon, 3 Nov 2025 16:00:56 -0500 Subject: [PATCH 4/5] fix: cast bf16 to fp32 on MPS (like CPU) to avoid dtype issues --- nanochat/checkpoint_manager.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index a9327c4..2fcb01b 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -65,7 +65,7 @@ def build_model(checkpoint_dir, step, device, phase): """ assert phase in ["train", "eval"], f"Invalid phase: {phase}" model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, step, device, load_optimizer=False) - if device.type == "cpu": + if device.type in {"cpu", "mps"}: # Convert bfloat16 tensors to float for CPU inference model_data = { k: v.float() if v.dtype == torch.bfloat16 else v From f1683c5b1643c255d59903870eec91e17d5bf801 Mon Sep 17 00:00:00 2001 From: svlandeg Date: Tue, 4 Nov 2025 21:36:10 +0100 Subject: [PATCH 5/5] set nproc_per_node as var in speedrun and run1000 scripts --- run1000.sh | 18 +++++++++++------- speedrun.sh | 21 ++++++++++++--------- 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/run1000.sh b/run1000.sh index 46325d9..58ee3bc 100644 --- a/run1000.sh +++ b/run1000.sh @@ -70,18 +70,22 @@ python -m scripts.tok_eval # which would decrease model performance. Possibly 2, 3 or so epochs is ~ok, but certainly not ideal and at 10+ epochs we'd # start to overfit hard. # 5) That's it, everything else (e.g. the learning rates) is adjusted automatically by the training script. -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.base_loss -torchrun --standalone --nproc_per_node=8 -m scripts.base_eval + +# Number of processes/GPUs to use +NPROC_PER_NODE=8 + +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=32 --device_batch_size=8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval # midtrain # NOTE: ensure that we use the same device_batch_size here as the base training script. -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --device_batch_size=8 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid # sft -torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft # generate final report python -m nanochat.report generate diff --git a/speedrun.sh b/speedrun.sh index 32c8870..7955ec5 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -82,12 +82,15 @@ python -m scripts.tok_eval echo "Waiting for dataset download to complete..." wait $DATASET_DOWNLOAD_PID +# Number of processes/GPUs to use +NPROC_PER_NODE=8 + # pretrain the d20 model -torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=20 --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- --depth=20 --run=$WANDB_RUN # evaluate the model on a larger chunk of train/val data and draw some samples -torchrun --standalone --nproc_per_node=8 -m scripts.base_loss +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_loss # evaluate the model on CORE tasks -torchrun --standalone --nproc_per_node=8 -m scripts.base_eval +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_eval # ----------------------------------------------------------------------------- # Midtraining (teach the model conversation special tokens, tool use, multiple choice) @@ -97,15 +100,15 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_eval curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl # run midtraining and eval the model -torchrun --standalone --nproc_per_node=8 -m scripts.mid_train -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i mid +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.mid_train -- --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i mid # ----------------------------------------------------------------------------- # Supervised Finetuning (domain adaptation to each sequence all by itself per row) # train sft and re-eval right away (should see a small bump) -torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_sft -- --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i sft # chat with the model over CLI! Leave out the -p to chat interactively # python -m scripts.chat_cli -p "Why is the sky blue?" @@ -118,9 +121,9 @@ torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft # (optional) # run reinforcement learning -# torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN +# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_rl -- --run=$WANDB_RUN # eval the RL model only on GSM8K -# torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K +# torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.chat_eval -- -i rl -a GSM8K # ----------------------------------------------------------------------------- # Generate the full report by putting together all the sections