mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-07 08:19:52 +00:00
batch baseline speedrun.sh scripts
This commit is contained in:
parent
0aaca56805
commit
c5e8ce370c
42
runs/pace_stage1_tokenizer.sh
Normal file
42
runs/pace_stage1_tokenizer.sh
Normal file
|
|
@ -0,0 +1,42 @@
|
|||
#!/bin/bash
#SBATCH -N 1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=24
#SBATCH --mem=64G
#SBATCH -t 2:00:00
#SBATCH -J nanochat-stage1-tokenizer
#SBATCH -o runs/logs/stage1_%j.out
#SBATCH -e runs/logs/stage1_%j.err

# Stage 1 (CPU): bootstrap the environment, train + eval the tokenizer on a
# small slice of the dataset, and download the full dataset in the background.

# pipefail so a failed 'curl | sh' below is not masked by the trailing 'sh'.
set -eo pipefail
cd "$HOME/scratch/nanochat"

export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
mkdir -p "$NANOCHAT_BASE_DIR"
mkdir -p runs/logs

echo "=== Stage 1: Tokenizer ==="
echo "Base dir: $NANOCHAT_BASE_DIR"
echo "Started: $(date)"

# Install uv if it is not already on PATH.
command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh
export PATH="$HOME/.local/bin:$PATH"
[ -d ".venv" ] || uv venv
uv sync --extra gpu
source .venv/bin/activate
python -m nanochat.report reset
# The first 8 shards are enough for tokenizer training; fetch the rest async.
python -m nanochat.dataset -n 8
python -m nanochat.dataset -n 170 &
DATASET_DOWNLOAD_PID=$!

python -m scripts.tok_train
python -m scripts.tok_eval

echo "Waiting for full dataset download..."
# 'wait' propagates the background job's exit status; set -e aborts on failure.
wait "$DATASET_DOWNLOAD_PID"

echo "=== Stage 1 complete: $(date) ==="
echo "Dataset and tokenizer ready in $NANOCHAT_BASE_DIR"
43
runs/pace_stage2a_pretrain.sh
Normal file
43
runs/pace_stage2a_pretrain.sh
Normal file
|
|
@ -0,0 +1,43 @@
|
|||
#!/bin/bash
#SBATCH -N 1
#SBATCH -p ice-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:2
#SBATCH --constraint="gpu-h100|gpu-h200"
#SBATCH --mem-per-gpu=48G
#SBATCH -t 3:55:00
#SBATCH -J nanochat-stage2a
#SBATCH -o runs/logs/stage2a_%j.out
#SBATCH -e runs/logs/stage2a_%j.err

# Stage 2a (GPU): first pretraining chunk. If training finishes within this
# job's walltime, a done marker is written so later chunks become no-ops.

set -eo pipefail
cd "$HOME/scratch/nanochat"

export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
mkdir -p runs/logs

WANDB_RUN="${WANDB_RUN:-dummy}"
CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
DONE_MARKER="$CHECKPOINT_DIR/.training_complete"

echo "=== Stage 2a: Pretraining (chunk 1) ==="
echo "Base dir: $NANOCHAT_BASE_DIR"
echo "WANDB_RUN: $WANDB_RUN"
echo "Started: $(date)"

source .venv/bin/activate

torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- \
  --depth=24 \
  --target-param-data-ratio=8 \
  --device-batch-size=16 \
  --save-every=200 \
  --run="$WANDB_RUN"

# Only reached if torchrun exited 0 (set -e), i.e. training completed within
# the walltime. A SLURM timeout kill skips this, so stage 2b will resume.
mkdir -p "$CHECKPOINT_DIR"
touch "$DONE_MARKER"
echo "=== Stage 2a complete: $(date) ==="
68
runs/pace_stage2b_pretrain.sh
Normal file
68
runs/pace_stage2b_pretrain.sh
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/bash
#SBATCH -N 1
#SBATCH -p ice-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:2
#SBATCH --constraint="gpu-h100|gpu-h200"
#SBATCH --mem-per-gpu=48G
#SBATCH -t 3:55:00
#SBATCH -J nanochat-stage2b
#SBATCH -o runs/logs/stage2b_%j.out
#SBATCH -e runs/logs/stage2b_%j.err

# Stage 2b (GPU): pretraining chunk 2. Resumes from the latest checkpoint
# left by stage 2a (or starts fresh if none); exits immediately if a prior
# chunk already finished training (done marker present).

set -eo pipefail
cd "$HOME/scratch/nanochat"

export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
mkdir -p runs/logs

WANDB_RUN="${WANDB_RUN:-dummy}"
CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
DONE_MARKER="$CHECKPOINT_DIR/.training_complete"

echo "=== Stage 2b: Pretraining (chunk 2 / auto-resume) ==="
echo "Base dir: $NANOCHAT_BASE_DIR"
echo "Started: $(date)"

if [ -f "$DONE_MARKER" ]; then
  echo "Training already complete (marker: $DONE_MARKER). Nothing to do."
  echo "=== Stage 2b skipped: $(date) ==="
  exit 0
fi

source .venv/bin/activate

# Highest checkpoint step found on disk, or 0 if none. The directory is
# passed via the environment rather than interpolated into the Python
# source, so special characters in the path cannot break the snippet.
LAST_STEP=$(CKPT_DIR="$CHECKPOINT_DIR" python -c "
import glob, os
files = glob.glob(os.path.join(os.environ['CKPT_DIR'], 'model_*.pt'))
steps = [int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files]
print(max(steps) if steps else 0)
")

# Build the training command once; only the resume flag differs by branch.
TRAIN_ARGS=(
  --depth=24
  --target-param-data-ratio=8
  --device-batch-size=16
  --save-every=200
  --run="$WANDB_RUN"
)
if [ "$LAST_STEP" -eq 0 ]; then
  echo "No checkpoint found — starting from scratch"
else
  echo "Resuming from step $LAST_STEP"
  TRAIN_ARGS+=(--resume-from-step="$LAST_STEP")
fi
torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- "${TRAIN_ARGS[@]}"

# Only reached if torchrun exited 0 (set -e), i.e. training completed
# within this job's walltime.
mkdir -p "$CHECKPOINT_DIR"
touch "$DONE_MARKER"
echo "=== Stage 2b complete: $(date) ==="
68
runs/pace_stage2c_pretrain.sh
Normal file
68
runs/pace_stage2c_pretrain.sh
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/bash
#SBATCH -N 1
#SBATCH -p ice-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:2
#SBATCH --constraint="gpu-h100|gpu-h200"
#SBATCH --mem-per-gpu=48G
#SBATCH -t 3:55:00
#SBATCH -J nanochat-stage2c
#SBATCH -o runs/logs/stage2c_%j.out
#SBATCH -e runs/logs/stage2c_%j.err

# Stage 2c (GPU): pretraining chunk 3. Resumes from the latest checkpoint
# (or starts fresh if none); exits immediately if a prior chunk already
# finished training (done marker present).

set -eo pipefail
cd "$HOME/scratch/nanochat"

export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
mkdir -p runs/logs

WANDB_RUN="${WANDB_RUN:-dummy}"
CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
DONE_MARKER="$CHECKPOINT_DIR/.training_complete"

echo "=== Stage 2c: Pretraining (chunk 3 / auto-resume) ==="
echo "Base dir: $NANOCHAT_BASE_DIR"
echo "Started: $(date)"

if [ -f "$DONE_MARKER" ]; then
  echo "Training already complete (marker: $DONE_MARKER). Nothing to do."
  echo "=== Stage 2c skipped: $(date) ==="
  exit 0
fi

source .venv/bin/activate

# Highest checkpoint step found on disk, or 0 if none. The directory is
# passed via the environment rather than interpolated into the Python
# source, so special characters in the path cannot break the snippet.
LAST_STEP=$(CKPT_DIR="$CHECKPOINT_DIR" python -c "
import glob, os
files = glob.glob(os.path.join(os.environ['CKPT_DIR'], 'model_*.pt'))
steps = [int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files]
print(max(steps) if steps else 0)
")

# Build the training command once; only the resume flag differs by branch.
TRAIN_ARGS=(
  --depth=24
  --target-param-data-ratio=8
  --device-batch-size=16
  --save-every=200
  --run="$WANDB_RUN"
)
if [ "$LAST_STEP" -eq 0 ]; then
  echo "No checkpoint found — starting from scratch"
else
  echo "Resuming from step $LAST_STEP"
  TRAIN_ARGS+=(--resume-from-step="$LAST_STEP")
fi
torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- "${TRAIN_ARGS[@]}"

# Only reached if torchrun exited 0 (set -e), i.e. training completed
# within this job's walltime.
mkdir -p "$CHECKPOINT_DIR"
touch "$DONE_MARKER"
echo "=== Stage 2c complete: $(date) ==="
68
runs/pace_stage2d_pretrain.sh
Normal file
68
runs/pace_stage2d_pretrain.sh
Normal file
|
|
@ -0,0 +1,68 @@
|
|||
#!/bin/bash
#SBATCH -N 1
#SBATCH -p ice-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:2
#SBATCH --constraint="gpu-h100|gpu-h200"
#SBATCH --mem-per-gpu=48G
#SBATCH -t 3:55:00
#SBATCH -J nanochat-stage2d
#SBATCH -o runs/logs/stage2d_%j.out
#SBATCH -e runs/logs/stage2d_%j.err

# Stage 2d (GPU): pretraining chunk 4 (final chunk). Resumes from the
# latest checkpoint (or starts fresh if none); exits immediately if a prior
# chunk already finished training (done marker present).

set -eo pipefail
cd "$HOME/scratch/nanochat"

export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
mkdir -p runs/logs

WANDB_RUN="${WANDB_RUN:-dummy}"
CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
DONE_MARKER="$CHECKPOINT_DIR/.training_complete"

echo "=== Stage 2d: Pretraining (chunk 4 / auto-resume) ==="
echo "Base dir: $NANOCHAT_BASE_DIR"
echo "Started: $(date)"

if [ -f "$DONE_MARKER" ]; then
  echo "Training already complete (marker: $DONE_MARKER). Nothing to do."
  echo "=== Stage 2d skipped: $(date) ==="
  exit 0
fi

source .venv/bin/activate

# Highest checkpoint step found on disk, or 0 if none. The directory is
# passed via the environment rather than interpolated into the Python
# source, so special characters in the path cannot break the snippet.
LAST_STEP=$(CKPT_DIR="$CHECKPOINT_DIR" python -c "
import glob, os
files = glob.glob(os.path.join(os.environ['CKPT_DIR'], 'model_*.pt'))
steps = [int(os.path.basename(f).split('_')[-1].split('.')[0]) for f in files]
print(max(steps) if steps else 0)
")

# Build the training command once; only the resume flag differs by branch.
TRAIN_ARGS=(
  --depth=24
  --target-param-data-ratio=8
  --device-batch-size=16
  --save-every=200
  --run="$WANDB_RUN"
)
if [ "$LAST_STEP" -eq 0 ]; then
  echo "No checkpoint found — starting from scratch"
else
  echo "Resuming from step $LAST_STEP"
  TRAIN_ARGS+=(--resume-from-step="$LAST_STEP")
fi
torchrun --standalone --nproc_per_node=2 -m scripts.base_train -- "${TRAIN_ARGS[@]}"

# Only reached if torchrun exited 0 (set -e), i.e. training completed
# within this job's walltime.
mkdir -p "$CHECKPOINT_DIR"
touch "$DONE_MARKER"
echo "=== Stage 2d complete: $(date) ==="
55
runs/pace_stage3_sft.sh
Normal file
55
runs/pace_stage3_sft.sh
Normal file
|
|
@ -0,0 +1,55 @@
|
|||
#!/bin/bash
#SBATCH -N 1
#SBATCH -p ice-gpu
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=8
#SBATCH --gres=gpu:2
#SBATCH --constraint="gpu-h100|gpu-h200"
#SBATCH --mem-per-gpu=48G
#SBATCH -t 3:55:00
#SBATCH -J nanochat-stage3-sft
#SBATCH -o runs/logs/stage3_%j.out
#SBATCH -e runs/logs/stage3_%j.err

# Stage 3 (GPU): base-model eval, SFT, chat eval, and final report.
# Refuses to run unless pretraining (stages 2a-2d) left its done marker.

set -eo pipefail
cd "$HOME/scratch/nanochat"

export OMP_NUM_THREADS=1
export NANOCHAT_BASE_DIR="$HOME/scratch/nanochat"
mkdir -p runs/logs

WANDB_RUN="${WANDB_RUN:-dummy}"

echo "=== Stage 3: Eval + SFT ==="
echo "Base dir: $NANOCHAT_BASE_DIR"
echo "WANDB_RUN: $WANDB_RUN"
echo "Started: $(date)"

CHECKPOINT_DIR="$NANOCHAT_BASE_DIR/base_checkpoints/d24"
DONE_MARKER="$CHECKPOINT_DIR/.training_complete"
if [ ! -f "$DONE_MARKER" ]; then
  # Diagnostics go to stderr so they land in the SLURM error log.
  echo "ERROR: pretraining did not finish — missing $DONE_MARKER" >&2
  echo "Re-run pretrain chunks 2a–2d until the marker is created before running stage 3." >&2
  exit 1
fi

source .venv/bin/activate

torchrun --standalone --nproc_per_node=2 -m scripts.base_eval -- \
  --device-batch-size=16

# -f makes curl fail on an HTTP error instead of saving the error page as
# the dataset file.
curl -fL -o "$NANOCHAT_BASE_DIR/identity_conversations.jsonl" \
  https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl

torchrun --standalone --nproc_per_node=2 -m scripts.chat_sft -- \
  --device-batch-size=16 \
  --run="$WANDB_RUN"

torchrun --standalone --nproc_per_node=2 -m scripts.chat_eval -- -i sft

python -m nanochat.report generate

echo "=== Stage 3 complete: $(date) ==="
81
runs/pace_submit.sh
Normal file
81
runs/pace_submit.sh
Normal file
|
|
@ -0,0 +1,81 @@
|
|||
#!/bin/bash

# Submit the full nanochat training pipeline as a chain of dependent SLURM jobs.
#
# Pipeline:
#   Stage 1  — CPU: tokenizer + dataset
#   Stage 2a — GPU: pretraining chunk 1
#   Stage 2b — GPU: auto-resume chunk 2
#   Stage 2c — GPU: auto-resume chunk 3
#   Stage 2d — GPU: auto-resume chunk 4
#   Stage 3  — GPU: base eval + SFT + chat eval + report
#
# Usage (from repo root):
#   bash runs/pace_submit.sh
#
# Optional W&B logging:
#   WANDB_RUN=my-run bash runs/pace_submit.sh

set -eo pipefail
cd "$HOME/scratch/nanochat"

mkdir -p runs/logs

WANDB_RUN="${WANDB_RUN:-dummy}"
export WANDB_RUN

echo "Submitting nanochat full pipeline..."
echo "WANDB_RUN=$WANDB_RUN"
echo ""

# Stage 1: CPU-only tokenizer training + dataset download.
JOB1=$(sbatch --parsable \
  --export=ALL,WANDB_RUN="$WANDB_RUN" \
  runs/pace_stage1_tokenizer.sh)
echo "Stage 1 submitted: job $JOB1 (tokenizer + dataset)"

# Stage 2a requires stage 1 to succeed (afterok).
JOB2A=$(sbatch --parsable \
  --dependency=afterok:"$JOB1" \
  --export=ALL,WANDB_RUN="$WANDB_RUN" \
  runs/pace_stage2a_pretrain.sh)
echo "Stage 2a submitted: job $JOB2A (pretrain chunk 1, depends on $JOB1)"

# Stages 2b-2d use 'afterany': the previous chunk is EXPECTED to exit
# non-zero when it hits its walltime before training finishes.
JOB2B=$(sbatch --parsable \
  --dependency=afterany:"$JOB2A" \
  --export=ALL,WANDB_RUN="$WANDB_RUN" \
  runs/pace_stage2b_pretrain.sh)
echo "Stage 2b submitted: job $JOB2B (pretrain chunk 2, depends on $JOB2A)"

JOB2C=$(sbatch --parsable \
  --dependency=afterany:"$JOB2B" \
  --export=ALL,WANDB_RUN="$WANDB_RUN" \
  runs/pace_stage2c_pretrain.sh)
echo "Stage 2c submitted: job $JOB2C (pretrain chunk 3, depends on $JOB2B)"

JOB2D=$(sbatch --parsable \
  --dependency=afterany:"$JOB2C" \
  --export=ALL,WANDB_RUN="$WANDB_RUN" \
  runs/pace_stage2d_pretrain.sh)
echo "Stage 2d submitted: job $JOB2D (pretrain chunk 4, depends on $JOB2C)"

# Stage 3 only runs if the final pretrain chunk exits cleanly (afterok);
# the stage 3 script additionally verifies the training done marker.
JOB3=$(sbatch --parsable \
  --dependency=afterok:"$JOB2D" \
  --export=ALL,WANDB_RUN="$WANDB_RUN" \
  runs/pace_stage3_sft.sh)
echo "Stage 3 submitted: job $JOB3 (eval + SFT, depends on $JOB2D)"

echo ""
echo "All jobs queued. Monitor with:"
echo "  squeue -u $USER"
echo "  tail -f runs/logs/stage1_${JOB1}.out"
echo "  tail -f runs/logs/stage2a_${JOB2A}.out"
echo "  tail -f runs/logs/stage2b_${JOB2B}.out"
echo "  tail -f runs/logs/stage2c_${JOB2C}.out"
echo "  tail -f runs/logs/stage2d_${JOB2D}.out"
echo "  tail -f runs/logs/stage3_${JOB3}.out"
echo ""
echo "To cancel everything:"
echo "  scancel $JOB1 $JOB2A $JOB2B $JOB2C $JOB2D $JOB3"
Loading…
Reference in New Issue
Block a user