mirror of
https://github.com/karpathy/nanochat.git
synced 2026-04-02 05:35:19 +00:00
reduce eval problems for mmlu, humaneval, gsm8k, spellingbee, super slow
This commit is contained in:
parent
ef9718d2c7
commit
4468824a6e
|
|
@ -31,7 +31,7 @@ def _detect_compute_dtype():
|
|||
if torch.backends.mps.is_available():
|
||||
# torch.float16
|
||||
# return torch.float16, "auto-detected: mps, float16"
|
||||
return torch.float32, "auto-detected: mps, float16"
|
||||
return torch.float32, "auto-detected: mps, float32"
|
||||
|
||||
return torch.float32, "auto-detected: no CUDA (CPU/MPS)"
|
||||
COMPUTE_DTYPE, COMPUTE_DTYPE_REASON = _detect_compute_dtype()
|
||||
|
|
|
|||
|
|
@ -306,7 +306,6 @@ def execute_code(
|
|||
>>> result.stdout
|
||||
'hello world\\n'
|
||||
"""
|
||||
|
||||
manager = multiprocessing.Manager()
|
||||
result_dict = manager.dict()
|
||||
|
||||
|
|
@ -347,3 +346,6 @@ def execute_code(
|
|||
memory_exceeded=result_dict["memory_exceeded"],
|
||||
)
|
||||
|
||||
# if __name__ == '__main__':
|
||||
# multiprocessing.freeze_support()
|
||||
# # manager = multiprocessing.Manager()
|
||||
|
|
|
|||
|
|
@ -31,16 +31,16 @@ if [ -z "$WANDB_RUN" ]; then
|
|||
WANDB_RUN=dummy
|
||||
fi
|
||||
|
||||
# train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
|
||||
# # train tokenizer on ~2B characters (~34 seconds on my MacBook Pro M3 Max)
|
||||
python -m nanochat.dataset -n 8
|
||||
python -m scripts.tok_train --max-chars=2000000000
|
||||
python -m scripts.tok_eval
|
||||
|
||||
# train a small 4 layer model
|
||||
# I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max.
|
||||
# To get better results, try increasing num_iterations, or get other ideas from your favorite LLM.
|
||||
# # train a small 4 layer model
|
||||
# # I tuned this run to complete in about 30 minutes on my MacBook Pro M3 Max.
|
||||
# # To get better results, try increasing num_iterations, or get other ideas from your favorite LLM.
|
||||
python -m scripts.base_train \
|
||||
--depth=6 \
|
||||
--depth=24 \
|
||||
--head-dim=64 \
|
||||
--window-pattern=L \
|
||||
--max-seq-len=512 \
|
||||
|
|
@ -50,7 +50,7 @@ python -m scripts.base_train \
|
|||
--eval-tokens=524288 \
|
||||
--core-metric-every=-1 \
|
||||
--sample-every=100 \
|
||||
--num-iterations=100 \
|
||||
--num-iterations=1000 \
|
||||
--run=$WANDB_RUN
|
||||
python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16
|
||||
|
||||
|
|
@ -60,16 +60,19 @@ python -m scripts.chat_sft \
|
|||
--max-seq-len=512 \
|
||||
--device-batch-size=32 \
|
||||
--total-batch-size=16384 \
|
||||
--eval-every=200 \
|
||||
--eval-every=-1 \
|
||||
--eval-tokens=524288 \
|
||||
--num-iterations=1500 \
|
||||
--run=$WANDB_RUN
|
||||
|
||||
# MMLU, GSM8K, HumanEval and SpellingBee are capped at 10 problems each on non-CUDA devices; the full evals were slower than expected
|
||||
python -m scripts.chat_eval -i sft
|
||||
|
||||
# Chat with the model over CLI
|
||||
# The model should be able to say that it is Paris.
|
||||
# It might even know that the color of the sky is blue.
|
||||
# Sometimes the model likes it if you first say Hi before you ask it questions.
|
||||
# python -m scripts.chat_cli -p "What is the capital of France?"
|
||||
python -m scripts.chat_cli -p "What is the capital of France?"
|
||||
|
||||
# Chat with the model over a pretty WebUI ChatGPT style
|
||||
# python -m scripts.chat_web
|
||||
|
|
|
|||
|
|
@ -88,6 +88,8 @@ device_type = autodetect_device_type() if args.device_type == "" else args.devic
|
|||
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
|
||||
master_process = ddp_rank == 0 # this process will do logging, checkpointing etc.
|
||||
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
|
||||
synchronize = torch.mps.synchronize if device_type == "mps" else synchronize
|
||||
|
||||
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
|
||||
if device_type == "cuda":
|
||||
gpu_device_name = torch.cuda.get_device_name(0)
|
||||
|
|
|
|||
|
|
@ -119,6 +119,10 @@ def run_categorical_eval(task_object, tokenizer, model, batch_size, max_problems
|
|||
# The much harder alternative would be to just generate from the Assistant and check if it responded with the correct
|
||||
# letter (e.g. A, B, C, D), but evaluations typically make the task easier in this way.
|
||||
for idx, conversation in enumerate(conversations):
|
||||
|
||||
if i % 10 == 0 and idx % 10 == 0:
|
||||
print0(f"run_categorical_eval Batch {i}/{num_batches} Processing conversation idx: {idx}/{batch_size}")
|
||||
|
||||
# get the token ids of all the available letters of this problem
|
||||
letters = conversation['letters']
|
||||
letter_ids = []
|
||||
|
|
@ -215,6 +219,8 @@ if __name__ == "__main__":
|
|||
# Run all the task evaluations sequentially
|
||||
results = {}
|
||||
for task_name in task_names:
|
||||
max_problems = 10 if (device_type != "cuda" and task_name in ["MMLU", "GSM8K", "HumanEval", "SpellingBee"]) else args.max_problems
|
||||
|
||||
acc = run_chat_eval(
|
||||
task_name,
|
||||
model, tokenizer, engine,
|
||||
|
|
@ -223,7 +229,7 @@ if __name__ == "__main__":
|
|||
max_new_tokens=args.max_new_tokens,
|
||||
temperature=args.temperature,
|
||||
top_k=args.top_k,
|
||||
max_problems=args.max_problems,
|
||||
max_problems=max_problems,
|
||||
)
|
||||
results[task_name] = acc
|
||||
print0(f"{task_name} accuracy: {100 * acc:.2f}%")
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user