mirror of
https://github.com/karpathy/nanochat.git
synced 2026-05-11 02:10:13 +00:00
82 lines
2.3 KiB
Bash
82 lines
2.3 KiB
Bash
#!/bin/bash
#
# Submit the full nanochat pipeline to Slurm as a chain of dependent jobs.
#
# Pipeline:
#   Stage 1  — CPU: tokenizer + dataset
#   Stage 2a — GPU: pretraining chunk 1
#   Stage 2b — GPU: auto-resume chunk 2
#   Stage 2c — GPU: auto-resume chunk 3
#   Stage 2d — GPU: auto-resume chunk 4
#   Stage 3  — GPU: base eval + SFT + chat eval + report
#
# Usage (from repo root):
#   bash runs/pace_submit.sh
#
# Optional W&B logging:
#   WANDB_RUN=my-run bash runs/pace_submit.sh
#
# Note: regardless of the caller's working directory, the script changes into
# "$HOME/scratch/nanochat" and resolves every runs/... path relative to it.

set -euo pipefail

cd "$HOME/scratch/nanochat"

mkdir -p runs/logs

# Default the W&B run name to "dummy" when the caller did not provide one.
WANDB_RUN="${WANDB_RUN:-dummy}"
export WANDB_RUN

# Space-prefixed list of job ids queued so far; used only for failure reporting.
submitted_jobs=""

#######################################
# Print already-queued job ids if the script exits with an error, so a
# half-submitted pipeline can be cancelled by hand.
# Globals:   submitted_jobs (read)
# Outputs:   diagnostic text on stderr (only on non-zero exit)
#######################################
report_partial_submission() {
  local status=$?
  if [[ $status -ne 0 && -n $submitted_jobs ]]; then
    echo "ERROR: submission aborted; jobs already queued:$submitted_jobs" >&2
    echo "  To cancel them: scancel$submitted_jobs" >&2
  fi
}
trap report_partial_submission EXIT

#######################################
# Submit one stage script with sbatch and print its numeric job id.
# Globals:   WANDB_RUN (read)
# Arguments: $1 - path of the stage script to submit
#            $2 - optional dependency spec, e.g. "afterok:12345"
# Outputs:   the job id on stdout; diagnostics on stderr
# Returns:   0 on success, 1 if sbatch fails or returns no usable job id
#######################################
submit_stage() {
  local script=$1
  local dependency=${2:-}
  local -a sbatch_args=(--parsable)

  if [[ -n $dependency ]]; then
    sbatch_args+=("--dependency=$dependency")
  fi
  sbatch_args+=("--export=ALL,WANDB_RUN=$WANDB_RUN" "$script")

  # Explicit status check: errexit is not reliably inherited inside the
  # command substitution this function is called from.
  local job_id
  job_id=$(sbatch "${sbatch_args[@]}") || {
    echo "ERROR: sbatch failed for $script" >&2
    return 1
  }

  # --parsable may print "jobid;cluster"; keep only the job id so it is safe
  # to use in --dependency, scancel and log file names.
  job_id=${job_id%%;*}
  if [[ ! $job_id =~ ^[0-9]+$ ]]; then
    echo "ERROR: unexpected sbatch output for $script: '$job_id'" >&2
    return 1
  fi

  printf '%s\n' "$job_id"
}

echo "Submitting nanochat full pipeline..."
echo "WANDB_RUN=$WANDB_RUN"
echo ""

# Stage 1
JOB1=$(submit_stage runs/pace_stage1_tokenizer.sh)
submitted_jobs+=" $JOB1"
echo "Stage 1 submitted: job $JOB1 (tokenizer + dataset)"

# Stage 2a — starts only if stage 1 succeeded (afterok).
JOB2A=$(submit_stage runs/pace_stage2a_pretrain.sh "afterok:$JOB1")
submitted_jobs+=" $JOB2A"
echo "Stage 2a submitted: job $JOB2A (pretrain chunk 1, depends on $JOB1)"

# Stages 2b-2d use afterany: each chunk starts once the previous one has ended,
# whatever its exit state.
# NOTE(review): presumably so a chunk that ends abnormally (e.g. hits its time
# limit) is still followed by the auto-resume chunk — confirm against the
# stage scripts.

# Stage 2b
JOB2B=$(submit_stage runs/pace_stage2b_pretrain.sh "afterany:$JOB2A")
submitted_jobs+=" $JOB2B"
echo "Stage 2b submitted: job $JOB2B (pretrain chunk 2, depends on $JOB2A)"

# Stage 2c
JOB2C=$(submit_stage runs/pace_stage2c_pretrain.sh "afterany:$JOB2B")
submitted_jobs+=" $JOB2C"
echo "Stage 2c submitted: job $JOB2C (pretrain chunk 3, depends on $JOB2B)"

# Stage 2d
JOB2D=$(submit_stage runs/pace_stage2d_pretrain.sh "afterany:$JOB2C")
submitted_jobs+=" $JOB2D"
echo "Stage 2d submitted: job $JOB2D (pretrain chunk 4, depends on $JOB2C)"

# Stage 3 — starts only if the final pretraining chunk succeeded (afterok).
JOB3=$(submit_stage runs/pace_stage3_sft.sh "afterok:$JOB2D")
submitted_jobs+=" $JOB3"
echo "Stage 3 submitted: job $JOB3 (eval + SFT, depends on $JOB2D)"

echo ""
echo "All jobs queued. Monitor with:"
# USER may be unset in minimal environments; fall back to id(1) under set -u.
echo " squeue -u ${USER:-$(id -un)}"
echo " tail -f runs/logs/stage1_${JOB1}.out"
echo " tail -f runs/logs/stage2a_${JOB2A}.out"
echo " tail -f runs/logs/stage2b_${JOB2B}.out"
echo " tail -f runs/logs/stage2c_${JOB2C}.out"
echo " tail -f runs/logs/stage2d_${JOB2D}.out"
echo " tail -f runs/logs/stage3_${JOB3}.out"
echo ""
echo "To cancel everything:"
echo " scancel $JOB1 $JOB2A $JOB2B $JOB2C $JOB2D $JOB3"