Reduce eval problem counts for MMLU, HumanEval, GSM8K, and SpellingBee (evals were running super slow)

This commit is contained in:
Sushrut Karnik 2026-03-13 23:23:21 +01:00
parent ef9718d2c7
commit 4468824a6e
6 changed files with 512 additions and 484 deletions

View File

@ -31,7 +31,7 @@ def _detect_compute_dtype():
if torch.backends.mps.is_available():
# torch.float16
# return torch.float16, "auto-detected: mps, float16"
return torch.float32, "auto-detected: mps, float16"
return torch.float32, "auto-detected: mps, float32"
return torch.float32, "auto-detected: no CUDA (CPU/MPS)"
COMPUTE_DTYPE, COMPUTE_DTYPE_REASON = _detect_compute_dtype()

View File

@ -306,7 +306,6 @@ def execute_code(
>>> result.stdout
'hello world\\n'
"""
manager = multiprocessing.Manager()
result_dict = manager.dict()
@ -347,3 +346,6 @@ def execute_code(
memory_exceeded=result_dict["memory_exceeded"],
)
# if __name__ == '__main__':
# multiprocessing.freeze_support()
# # manager = multiprocessing.Manager()

View File

@ -31,16 +31,16 @@ if [ -z "$WANDB_RUN" ]; then
WANDB_RUN=dummy
fi
# train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
# # train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
python -m nanochat.dataset -n 8
python -m scripts.tok_train --max-chars=2000000000
python -m scripts.tok_eval
# train a small 4 layer model
# I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max.
# To get better results, try increasing num_iterations, or get other ideas from your favorite LLM.
# # train a small 4 layer model
# # I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max.
# # To get better results, try increasing num_iterations, or get other ideas from your favorite LLM.
python -m scripts.base_train \
--depth=6 \
--depth=24 \
--head-dim=64 \
--window-pattern=L \
--max-seq-len=512 \
@ -50,7 +50,7 @@ python -m scripts.base_train \
--eval-tokens=524288 \
--core-metric-every=-1 \
--sample-every=100 \
--num-iterations=100 \
--num-iterations=1000 \
--run=$WANDB_RUN
python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16
@ -60,16 +60,19 @@ python -m scripts.chat_sft \
--max-seq-len=512 \
--device-batch-size=32 \
--total-batch-size=16384 \
--eval-every=200 \
--eval-every=-1 \
--eval-tokens=524288 \
--num-iterations=1500 \
--run=$WANDB_RUN
# Reduced MMLU, GSM8K, HumanEval, and SpellingBee to just 10 problems each on non-CUDA devices; eval was slower than expected
python -m scripts.chat_eval -i sft
# Chat with the model over CLI
# The model should be able to say that it is Paris.
# It might even know that the color of the sky is blue.
# Sometimes the model likes it if you first say Hi before you ask it questions.
# python -m scripts.chat_cli -p "What is the capital of France?"
python -m scripts.chat_cli -p "What is the capital of France?"
# Chat with the model over a pretty WebUI ChatGPT style
# python -m scripts.chat_web

View File

@ -88,6 +88,8 @@ device_type = autodetect_device_type() if args.device_type == "" else args.devic
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
synchronize = torch.mps.synchronize if device_type == "mps" else synchronize
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
if device_type == "cuda":
gpu_device_name = torch.cuda.get_device_name(0)

View File

@ -119,6 +119,10 @@ def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems
# The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
# letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
for idx, conversation in enumerate(conversations):
if i % 10 == 0 and idx % 10 == 0:
print0(f"run_categorical_eval Batch {i}/{num_batches} Processing conversation idx: {idx}/{batch_size}")
# get the token ids of all the available letters of this problem
letters = conversation['letters']
letter_ids = []
@ -215,6 +219,8 @@ if __name__ == "__main__":
# Run all the task evaluations sequentially
results = {}
for task_name in task_names:
max_problems = 10 if (device_type != "cuda" and task_name in ["MMLU", "GSM8K", "HumanEval", "SpellingBee"]) else args.max_problems
acc = run_chat_eval(
task_name,
model, tokenizer, engine,
@ -223,7 +229,7 @@ if __name__ == "__main__":
max_new_tokens=args.max_new_tokens,
temperature=args.temperature,
top_k=args.top_k,
max_problems=args.max_problems,
max_problems=max_problems,
)
results[task_name] = acc
print0(f"{task_name} accuracy: {100 * acc:.2f}%")

File diff suppressed because it is too large Load Diff