From 6a795baf27b667ac2004ff1a4a0f86bb2f9eb090 Mon Sep 17 00:00:00 2001 From: Enes Poyraz Date: Mon, 13 Oct 2025 18:40:12 +0200 Subject: [PATCH 1/3] Update README.md fix typos --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 19988fd..bc01055 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ Alternatively, since the script runs for 4 hours, I like to launch it like this screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh ``` -See the [screen cheatsheet](https://gist.github.com/jctosta/af918e1618682638aa82) if you are less familiar. You can watch it go inside the screen session, or detach with `Ctrl-a d` and `tail speedrun.log` to view progress. Now wait 4 hours. Once it's done, you can talk to your LLM via the ChatGPT-like web UI. Make sure again that your local uv virtual environment is active (run `source .venv/bin/activative`), and serve it: +See the [screen cheatsheet](https://gist.github.com/jctosta/af918e1618682638aa82) if you are less familiar. You can watch it go inside the screen session, or detach with `Ctrl-a d` and `tail speedrun.log` to view progress. Now wait 4 hours. Once it's done, you can talk to your LLM via the ChatGPT-like web UI. Make sure again that your local uv virtual environment is active (run `source .venv/bin/activate`), and serve it: ```bash python -m scripts.chat_web @@ -34,7 +34,7 @@ And then visit the URL shown. Make sure to access it correctly, e.g. on Lambda u --- -You can also `cat report.md` file which appeared in the project directory and contains the "report card" of the run, i.e. a bunch of evaluations and metrics. At the vert end, you'll see a summary table, for example: +You can also `cat report.md` file which appeared in the project directory and contains the "report card" of the run, i.e. a bunch of evaluations and metrics. At the very end, you'll see a summary table, for example: --- @@ -73,7 +73,7 @@ That said, to give a sense, the example changes needed for the [speedrun.sh](spe # divide by 250 million to get number of shards. todo need to improve this... python -m nanochat.dataset -n 450 & ... -# use --depth to increase model size. to not oom, halve device bath size 32 -> 16: +# use --depth to increase model size. to not oom, halve device batch size 32 -> 16: torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=26 --device_batch_size=16 ... # make sure to use the same later during midtraining: From afaa5b4c905ffc8b42b861142ab51b51adb6edd0 Mon Sep 17 00:00:00 2001 From: Mirza-Samad-Ahmed-Baig Date: Tue, 14 Oct 2025 00:18:20 +0300 Subject: [PATCH 2/3] Fix: Handle missing d model tags in find_largest_model --- nanochat/checkpoint_manager.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nanochat/checkpoint_manager.py b/nanochat/checkpoint_manager.py index 961ebd5..f400d47 100644 --- a/nanochat/checkpoint_manager.py +++ b/nanochat/checkpoint_manager.py @@ -104,8 +104,8 @@ def find_largest_model(checkpoint_dir): candidates.sort(key=lambda x: x[0], reverse=True) return candidates[0][1] # 2) if that failed, take the most recently updated model: - candidates.sort(key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x[1])), reverse=True) - return candidates[0][1] + model_tags.sort(key=lambda x: os.path.getmtime(os.path.join(checkpoint_dir, x)), reverse=True) + return model_tags[0] def find_last_step(checkpoint_dir): From f0855cbcc77cc08307b83a24701bfa587ccd6b4b Mon Sep 17 00:00:00 2001 From: Zach Mueller Date: Tue, 14 Oct 2025 14:12:01 -0400 Subject: [PATCH 3/3] Update speedrun.sh --- speedrun.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/speedrun.sh b/speedrun.sh index d2498ee..a9b579a 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -12,7 +12,7 @@ # Default intermediate artifacts directory is in ~/.cache/nanochat export OMP_NUM_THREADS=1 -NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" +export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" mkdir -p $NANOCHAT_BASE_DIR # -----------------------------------------------------------------------------