This commit is contained in:
Matt Van Horn 2026-03-28 00:35:39 +00:00 committed by GitHub
commit acc6ae2d37
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 9 additions and 8 deletions

View File

@@ -42,11 +42,12 @@ python -m scripts.base_train \
--sample-every=100 \
--num-iterations=5000 \
--run=$WANDB_RUN
python -m scripts.base_eval --device-batch-size=1 --split-tokens=16384 --max-per-task=16
python -m scripts.base_eval --model-tag=d6 --device-batch-size=1 --split-tokens=16384 --max-per-task=16
# SFT (~10 minutes on my MacBook Pro M3 Max)
curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
python -m scripts.chat_sft \
--model-tag=d6 \
--max-seq-len=512 \
--device-batch-size=32 \
--total-batch-size=16384 \
@@ -59,7 +60,7 @@ python -m scripts.chat_sft \
# The model should be able to say that it is Paris.
# It might even know that the color of the sky is blue.
# Sometimes the model likes it if you first say Hi before you ask it questions.
# python -m scripts.chat_cli -p "What is the capital of France?"
# python -m scripts.chat_cli --model-tag=d6 -p "What is the capital of France?"
# Chat with the model over a pretty WebUI ChatGPT style
# python -m scripts.chat_web
# python -m scripts.chat_web --model-tag=d6

View File

@@ -72,7 +72,7 @@ wait $DATASET_DOWNLOAD_PID
# d24 model (slightly undertrained to beat GPT-2 => decrease data:params ratio from compute optimal 10.5 (default) to 8)
torchrun --standalone --nproc_per_node=8 -m scripts.base_train -- --depth=24 --target-param-data-ratio=8 --device-batch-size=16 --fp8 --run=$WANDB_RUN
# evaluate the model: CORE metric, BPB on train/val, and draw samples
torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-size=16
torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --model-tag=d24 --device-batch-size=16
# -----------------------------------------------------------------------------
# SFT (teach the model conversation special tokens, tool use, multiple choice)
@@ -82,14 +82,14 @@ torchrun --standalone --nproc_per_node=8 -m scripts.base_eval -- --device-batch-
curl -L -o $NANOCHAT_BASE_DIR/identity_conversations.jsonl https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl
# run SFT and eval the model
torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --device-batch-size=16 --run=$WANDB_RUN
torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft
torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --model-tag=d24 --device-batch-size=16 --run=$WANDB_RUN
torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- --model-tag=d24 -i sft
# chat with the model over CLI! Leave out the -p to chat interactively
# python -m scripts.chat_cli -p "Why is the sky blue?"
# python -m scripts.chat_cli --model-tag=d24 -p "Why is the sky blue?"
# even better, chat with your model over a pretty WebUI ChatGPT style
# python -m scripts.chat_web
# python -m scripts.chat_web --model-tag=d24
# -----------------------------------------------------------------------------
# Generate the full report by putting together all the sections