diff --git a/run_reports/run3_rl.md b/run_reports/run3_rl.md new file mode 100644 index 0000000..d45a754 --- /dev/null +++ b/run_reports/run3_rl.md @@ -0,0 +1,309 @@ +# nanochat training report + +Generated: 2025-10-16 21:29:36 + +## Environment + +### Git Information +- Branch: master +- Commit: ec11d39 (dirty) +- Message: rename + +### Hardware +- Platform: Linux +- CPUs: 112 cores (224 logical) +- Memory: 2015.6 GB +- GPUs: 8x NVIDIA H100 80GB HBM3 +- GPU Memory: 632.8 GB total +- CUDA Version: 12.8 +- Hourly Rate: $24.00/hour + +### Software +- Python: 3.10.12 +- PyTorch: 2.8.0+cu128 + + +### Bloat +- Characters: 347,276 +- Lines: 8,712 +- Files: 45 +- Tokens (approx): 86,819 +- Dependencies (uv.lock lines): 2,004 + +Run started: 2025-10-16 21:29:44 + +--- + +## Tokenizer training +timestamp: 2025-10-16 21:31:11 + +- max_chars: 2,000,000,000 +- doc_cap: 10,000 +- vocab_size: 65,536 +- train_time: 73.4629 +- num_special_tokens: 9 +- token_bytes_min: 1 +- token_bytes_max: 32 +- token_bytes_mean: 6.9151 +- token_bytes_std: 2.8736 + + +## Tokenizer evaluation +timestamp: 2025-10-16 21:31:25 + +### Comparison with GPT-2 + +| Text Type | Bytes | GPT-2 Tokens | GPT-2 Ratio | Ours Tokens | Ours Ratio | Relative Diff % | +|-----------|-------|--------------|--------------|-------------|------------|-----------------| +| news | 1819 | 404 | 4.50 | 375 | 4.85 | +7.2% | +| korean | 893 | 745 | 1.20 | 721 | 1.24 | +3.2% | +| code | 1259 | 576 | 2.19 | 493 | 2.55 | +14.4% | +| math | 1834 | 936 | 1.96 | 966 | 1.90 | -3.2% | +| science | 1112 | 260 | 4.28 | 225 | 4.94 | +13.5% | +| fwe-train | 4208518 | 900364 | 4.67 | 856901 | 4.91 | +4.8% | +| fwe-val | 4908443 | 1059062 | 4.63 | 1010356 | 4.86 | +4.6% | + +### Comparison with GPT-4 + +| Text Type | Bytes | GPT-4 Tokens | GPT-4 Ratio | Ours Tokens | Ours Ratio | Relative Diff % | +|-----------|-------|--------------|--------------|-------------|------------|-----------------| +| news | 1819 | 387 | 4.70 | 375 | 4.85 | +3.1% | +| korean | 893 | 364 | 2.45 | 721 | 1.24 | -98.1% | +| code | 1259 | 309 | 4.07 | 493 | 2.55 | -59.5% | +| math | 1834 | 832 | 2.20 | 966 | 1.90 | -16.1% | +| science | 1112 | 249 | 4.47 | 225 | 4.94 | +9.6% | +| fwe-train | 4208518 | 874799 | 4.81 | 856901 | 4.91 | +2.0% | +| fwe-val | 4908443 | 1029691 | 4.77 | 1010356 | 4.86 | +1.9% | + + +## Base model training +timestamp: 2025-10-17 00:57:03 + +- run: run3_rl +- depth: 20 +- max_seq_len: 2048 +- num_iterations: -1 +- target_flops: -1.0000 +- target_param_data_ratio: 20 +- device_batch_size: 32 +- total_batch_size: 524,288 +- embedding_lr: 0.2000 +- unembedding_lr: 0.0040 +- weight_decay: 0.0000 +- matrix_lr: 0.0200 +- grad_clip: 1.0000 +- eval_every: 250 +- eval_tokens: 10,485,760 +- core_metric_every: 2000 +- core_metric_max_per_task: 500 +- sample_every: 2000 +- model_tag: +- Number of parameters: 560,988,160 +- Number of FLOPs per token: 3.491758e+09 +- Calculated number of iterations: 21,400 +- Number of training tokens: 11,219,763,200 +- Tokens : Params ratio: 20.0000 +- DDP world size: 8 +- warmup_ratio: 0.0000 +- warmdown_ratio: 0.2000 +- final_lr_frac: 0.0000 +- Minimum validation bpb: 0.8120 +- Final validation bpb: 0.8120 +- CORE metric estimate: 0.2197 +- MFU %: 48.37% +- Total training flops: 3.917670e+19 +- Total training time: 190.03m +- Peak memory usage: 75422.02MiB + + +## Base model loss +timestamp: 2025-10-17 00:58:27 + +- train bpb: 0.8149 +- val bpb: 0.8121 +- sample 0: <|bos|>The capital of France is Paris. It is the largest city in France and the capital of the country. +- sample 1: <|bos|>The chemical symbol of gold is Au. It is a soft, malleable, ductile, and malleable metal. It +- sample 2: <|bos|>If yesterday was Friday, then tomorrow will be Saturday. If today is Monday, then tomorrow will be Tuesday. If today is +- sample 3: <|bos|>The opposite of hot is cold. The opposite of hot is cold. The opposite of hot is cold. +- sample 4: <|bos|>The planets of the solar system are: Mercury, Venus, Earth, Mars, Jupiter, Saturn, Uranus, Neptune, +- sample 5: <|bos|>My favorite color is red. I love to paint with red. I love to paint with red. +- sample 6: <|bos|>If 5*x + 3 = 13, then x is a perfect square. If 5*x + 3 = 13, + + +## Base model evaluation +timestamp: 2025-10-17 01:02:29 + +- Model: base_model (step 21400) +- CORE metric: 0.2044 +- hellaswag_zeroshot: 0.2635 +- jeopardy: 0.1025 +- bigbench_qa_wikidata: 0.5300 +- arc_easy: 0.5140 +- arc_challenge: 0.1195 +- copa: 0.3000 +- commonsense_qa: 0.1605 +- piqa: 0.3732 +- openbook_qa: 0.1173 +- lambada_openai: 0.3742 +- hellaswag: 0.2602 +- winograd: 0.2821 +- winogrande: 0.0876 +- bigbench_dyck_languages: 0.1150 +- agi_eval_lsat_ar: 0.0870 +- bigbench_cs_algorithms: 0.4189 +- bigbench_operators: 0.1810 +- bigbench_repeat_copy_logic: 0.0312 +- squad: 0.1979 +- coqa: 0.2043 +- boolq: -0.4011 +- bigbench_language_identification: 0.1779 + + +## Midtraining +timestamp: 2025-10-17 01:18:09 + +- run: run3_rl +- dtype: bfloat16 +- max_seq_len: 2048 +- device_batch_size: 32 +- unembedding_lr: 0.0040 +- embedding_lr: 0.2000 +- matrix_lr: 0.0200 +- init_lr_frac: 1.0000 +- weight_decay: 0.0000 +- final_lr_frac: 0.0000 +- eval_every: 150 +- eval_tokens: 10,485,760 +- total_batch_size: 524,288 +- Number of iterations: 765 +- DDP world size: 8 +- Minimum validation bpb: 0.4156 + + +## Chat evaluation mid +timestamp: 2025-10-17 01:25:17 + +- source: mid +- task_name: None +- dtype: bfloat16 +- temperature: 0.0000 +- max_new_tokens: 512 +- num_samples: 1 +- top_k: 50 +- batch_size: 8 +- model_tag: None +- step: None +- max_problems: None +- ARC-Easy: 0.4381 +- ARC-Challenge: 0.3174 +- MMLU: 0.3195 +- GSM8K: 0.0182 +- HumanEval: 0.0732 +- ChatCORE metric: 0.1049 + + +## Chat SFT +timestamp: 2025-10-17 01:31:52 + +- run: run3_rl +- source: mid +- dtype: bfloat16 +- device_batch_size: 4 +- num_epochs: 1 +- max_iterations: -1 +- target_examples_per_step: 32 +- unembedding_lr: 0.0040 +- embedding_lr: 0.2000 +- matrix_lr: 0.0200 +- weight_decay: 0.0000 +- init_lr_frac: 0.0200 +- eval_every: 100 +- eval_steps: 100 +- eval_metrics_every: 200 +- Training rows: 20,843 +- Number of iterations: 651 +- Training loss: 1.1824 +- Validation loss: 1.0677 + + +## Chat evaluation sft +timestamp: 2025-10-17 01:37:35 + +- source: sft +- task_name: None +- dtype: bfloat16 +- temperature: 0.0000 +- max_new_tokens: 512 +- num_samples: 1 +- top_k: 50 +- batch_size: 8 +- model_tag: None +- step: None +- max_problems: None +- ARC-Easy: 0.4554 +- ARC-Challenge: 0.3063 +- MMLU: 0.3291 +- GSM8K: 0.0432 +- HumanEval: 0.0793 +- ChatCORE metric: 0.1154 + + +## Chat RL +timestamp: 2025-10-17 02:21:30 + +- run: run3_rl +- source: sft +- dtype: bfloat16 +- device_batch_size: 8 +- examples_per_step: 16 +- num_samples: 16 +- max_new_tokens: 256 +- temperature: 1.0000 +- top_k: 50 +- unembedding_lr: 0.0040 +- embedding_lr: 0.2000 +- matrix_lr: 0.0200 +- weight_decay: 0.0000 +- init_lr_frac: 0.0500 +- num_epochs: 1 +- save_every: 60 +- eval_every: 60 +- eval_examples: 400 + + +## Chat evaluation rl +timestamp: 2025-10-17 02:23:46 + +- source: rl +- task_name: GSM8K +- dtype: bfloat16 +- temperature: 0.0000 +- max_new_tokens: 512 +- num_samples: 1 +- top_k: 50 +- batch_size: 8 +- model_tag: None +- step: None +- max_problems: None +- GSM8K: 0.0925 + + +## Summary + +- Characters: 347,276 +- Lines: 8,712 +- Files: 45 +- Tokens (approx): 86,819 +- Dependencies (uv.lock lines): 2,004 + +| Metric | BASE | MID | SFT | RL | +|-----------------|----------|----------|----------|----------| +| CORE | 0.2044 | - | - | - | +| ARC-Challenge | - | 0.3174 | 0.3063 | - | +| ARC-Easy | - | 0.4381 | 0.4554 | - | +| GSM8K | - | 0.0182 | 0.0432 | 0.0925 | +| HumanEval | - | 0.0732 | 0.0793 | - | +| MMLU | - | 0.3195 | 0.3291 | - | +| ChatCORE | - | 0.1049 | 0.1154 | - | + +Total wall clock time: 4h7m diff --git a/run_rl.sh b/run_rl.sh deleted file mode 100755 index 7ed5ffa..0000000 --- a/run_rl.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -# This script is the "Best ChatGPT clone that $100 can buy", -# It is designed to run in ~4 hours on 8XH100 node at $3/GPU/hour. - -# 1) Example launch (simplest): -# bash speedrun.sh -# 2) Example launch in a screen session (because the run takes ~4 hours): -# screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh -# 3) Example launch with wandb logging, but see below for setting up wandb first: -# WANDB_RUN=speedrun screen -L -Logfile speedrun.log -S speedrun bash speedrun.sh - -# Default intermediate artifacts directory is in ~/.cache/nanochat -export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 -export OMP_NUM_THREADS=1 -export NANOCHAT_BASE_DIR="/home/users/nus/ob1/scratch/.cache/nanochat" -export WANDB_RUN=rl - -# ----------------------------------------------------------------------------- -# wandb setup -# If you wish to use wandb for logging (it's nice!, recommended). -# 1) Make sure to first log in to wandb, e.g. run: -# `wandb login` -# 2) Set the WANDB_RUN environment variable when running this script, e.g.: -# `WANDB_RUN=d26 bash speedrun.sh` -if [ -z "$WANDB_RUN" ]; then - # by default use "dummy" : it's handled as a special case, skips logging to wandb - WANDB_RUN=dummy -fi - -# run reinforcement learning -torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN -# eval the RL model only on GSM8K -torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K - -# ----------------------------------------------------------------------------- -# Generate the full report by putting together all the sections -# report.md is the output and will be copied to current directory for convenience -python -m nanochat.report generate diff --git a/speedrun.sh b/speedrun.sh index dbdf460..7e2b448 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -14,6 +14,7 @@ export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 export OMP_NUM_THREADS=1 export NANOCHAT_BASE_DIR="/home/users/nus/ob1/scratch/.cache/nanochat" +export WANDB_RUN="run4_w_rl" # ----------------------------------------------------------------------------- # Python venv setup with uv @@ -113,7 +114,7 @@ torchrun --standalone --nproc_per_node=8 -m scripts.chat_sft -- --run=$WANDB_RUN torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft # chat with the model over CLI! Leave out the -p to chat interactively -# python -m scripts.chat_cli -p "Why is the sky blue?" +python -m scripts.chat_cli -p "Why is the sky blue?" # even better, chat with your model over a pretty WebUI ChatGPT style # python -m scripts.chat_web @@ -123,11 +124,11 @@ torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i sft # (optional) # run reinforcement learning -# torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN +torchrun --standalone --nproc_per_node=8 -m scripts.chat_rl -- --run=$WANDB_RUN # eval the RL model only on GSM8K -# torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K +torchrun --standalone --nproc_per_node=8 -m scripts.chat_eval -- -i rl -a GSM8K # ----------------------------------------------------------------------------- # Generate the full report by putting together all the sections # report.md is the output and will be copied to current directory for convenience -python -m nanochat.report generate +python -m nanochat.report generate \ No newline at end of file