Reduce eval problem counts for MMLU, HumanEval, GSM8K, and SpellingBee (evals were running super slow)

This commit is contained in:
Sushrut Karnik 2026-03-13 23:23:21 +01:00
parent ef9718d2c7
commit 4468824a6e
6 changed files with 512 additions and 484 deletions

View File

@ -31,7 +31,7 @@ def _detect_compute_dtype():
if torch.backends.mps.is_available():
# torch.float16
# return torch.float16, "auto-detected: mps, float16"
return torch.float32, "auto-detected: mps, float16"
return torch.float32, "auto-detected: mps, float32"
return torch.float32, "auto-detected: no CUDA (CPU/MPS)"
COMPUTE_DTYPE, COMPUTE_DTYPE_REASON = _detect_compute_dtype()

View File

@ -306,7 +306,6 @@ def execute_code(
>>> result.stdout
'hello world\\n'
"""
manager = multiprocessing.Manager()
result_dict = manager.dict()
@ -347,3 +346,6 @@ def execute_code(
memory_exceeded=result_dict["memory_exceeded"],
)
# if __name__ == '__main__':
# multiprocessing.freeze_support()
# # manager = multiprocessing.Manager()

View File

@ -31,16 +31,16 @@ if [ -z "$WANDB_RUN" ]; then
WANDB_RUN=dummy
fi
# train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
# # train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
python -m nanochat.dataset -n 8
python -m scripts.tok_train --max-chars=2000000000
python -m scripts.tok_eval
# train a small 4 layer model
# I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max.
# To get better results, try increasing num_iterations, or get other ideas from your favorite LLM.
# # train a small 4 layer model
# # I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max.
# # To get better results, try increasing num_iterations, or get other ideas from your favorite LLM.
python -m scripts.base_train \
--depth=6 \
--depth=24 \
--head-dim=64 \
--window-pattern=L \
--max-seq-len=512 \
@ -50,7 +50,7 @@ python -m scripts.base_train \
--eval-tokens=524288 \
--core-metric-every=-1 \
--sample-every=100 \
--num-iterations=100 \
--num-iterations=1000 \
--run=$WANDB_RUN
python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16
@ -60,16 +60,19 @@ python -m scripts.chat_sft \
--max-seq-len=512 \
--device-batch-size=32 \
--total-batch-size=16384 \
--eval-every=200 \
--eval-every=-1 \
--eval-tokens=524288 \
--num-iterations=1500 \
--run=$WANDB_RUN
# Reduced MMLU, GSM8K, HumanEval, and SpellingBee to just 10 problems each on non-CUDA devices; eval was slower than expected
python -m scripts.chat_eval -i sft
# Chat with the model over CLI
# The model should be able to say that it is Paris.
# It might even know that the color of the sky is blue.
# Sometimes the model likes it if you first say Hi before you ask it questions.
# python -m scripts.chat_cli -p "What is the capital of France?"
python -m scripts.chat_cli -p "What is the capital of France?"
# Chat with the model over a pretty WebUI ChatGPT style
# python -m scripts.chat_web

View File

@ -88,6 +88,8 @@ device_type = autodetect_device_type() if args.device_type == "" else args.devic
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
synchronize = torch.mps.synchronize if device_type == "mps" else synchronize
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
if device_type == "cuda":
gpu_device_name = torch.cuda.get_device_name(0)

View File

@ -119,6 +119,10 @@ def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems
# The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
# letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
for idx, conversation in enumerate(conversations):
if i % 10 == 0 and idx % 10 == 0:
print0(f"run_categorical_eval Batch {i}/{num_batches} Processing conversation idx: {idx}/{batch_size}")
# get the token ids of all the available letters of this problem
letters = conversation['letters']
letter_ids = []
@ -215,6 +219,8 @@ if __name__ == "__main__":
# Run all the task evaluations sequentially
results = {}
for task_name in task_names:
max_problems = 10 if (device_type != "cuda" and task_name in ["MMLU", "GSM8K", "HumanEval", "SpellingBee"]) else args.max_problems
acc = run_chat_eval(
task_name,
model, tokenizer, engine,
@ -223,7 +229,7 @@ if __name__ == "__main__":
max_new_tokens=args.max_new_tokens,
temperature=args.temperature,
top_k=args.top_k,
max_problems=args.max_problems,
max_problems=max_problems,
)
results[task_name] = acc
print0(f"{task_name} accuracy: {100 * acc:.2f}%")

File diff suppressed because it is too large Load Diff