#!/bin/bash # See speedrun.sh for more comments export OMP_NUM_THREADS=1 export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" mkdir -p $NANOCHAT_BASE_DIR # uv command -v uv &> /dev/null || curl -LsSf https://astral.sh/uv/install.sh | sh [ -d ".venv" ] || uv venv uv sync --extra gpu source .venv/bin/activate # Tokenizer python -m nanochat.dataset -n 240 python -m scripts.tok_train --max_chars=2000000000 --vocab_size=32768 # Depths to train (the "miniseries") DEPTHS=(10 11 12 13 14 15 16 17 18 19 20) # Hardware NPROC_PER_NODE="${NPROC_PER_NODE:-8}" # Logging WANDB_RUN="${WANDB_RUN:-jan7_miniseries}" RESULTS_DIR="$NANOCHAT_BASE_DIR/jan7_miniseries_results" mkdir -p "$RESULTS_DIR" RESULTS_FILE="$RESULTS_DIR/results.csv" # Write CSV header only if file doesn't exist if [ ! -f "$RESULTS_FILE" ]; then echo "depth,model_dim,num_params,num_scaling_params,num_iterations,tokens_trained,param_data_ratio,val_bpb,core_score,train_time_sec" > "$RESULTS_FILE" fi log() { echo "[$(date '+%Y-%m-%d %H:%M:%S')] $1" } log "==============================================" log "Jan 7 Miniseries Training" log "==============================================" for d in "${DEPTHS[@]}"; do log "Training d=$d..." TAG="jan7_miniseries_d${d}" START_TIME=$(date +%s) # Train the model with natural horizon (target_param_data_ratio default) # No --target_flops, let it use the default ratio from base_train torchrun --standalone --nproc_per_node=$NPROC_PER_NODE -m scripts.base_train -- \ --depth=$d \ --target_param_data_ratio=8 \ --run="${WANDB_RUN}_d${d}" \ --model_tag="${TAG}" \ --core_metric_every=999999 \ --core_metric_max_per_task=-1 \ --sample_every=-1 \ --save_every=-1 \ 2>&1 | tee "$RESULTS_DIR/${TAG}_train.log" END_TIME=$(date +%s) TRAIN_TIME=$((END_TIME - START_TIME)) # Extract stats from log LOG_FILE="$RESULTS_DIR/${TAG}_train.log" NUM_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP '[\d,]+' | head -1 | tr -d ',') NUM_SCALING_PARAMS=$(grep "Number of parameters:" "$LOG_FILE" | tail -1 | grep -oP 'scaling: [\d,]+' | grep -oP '[\d,]+' | tr -d ',') NUM_ITERS=$(grep "Calculated number of iterations" "$LOG_FILE" | tail -1 | sed 's/.*: //' | tr -d ',') TOKENS_TRAINED=$((NUM_ITERS * 524288)) PARAM_DATA_RATIO=$(python -c "print(f'{$TOKENS_TRAINED / $NUM_SCALING_PARAMS:.2f}')") MODEL_DIM=$((d * 64)) VAL_BPB=$(grep "Validation bpb:" "$LOG_FILE" | tail -1 | grep -oP '[\d.]+$') CORE_SCORE=$(grep "CORE metric:" "$LOG_FILE" | tail -1 | awk '{print $NF}') if [ -z "$CORE_SCORE" ]; then CORE_SCORE="0.0" fi log " d=$d: params=$NUM_PARAMS, scaling=$NUM_SCALING_PARAMS, ratio=$PARAM_DATA_RATIO, bpb=$VAL_BPB, CORE=$CORE_SCORE, time=${TRAIN_TIME}s" # Append to CSV echo "$d,$MODEL_DIM,$NUM_PARAMS,$NUM_SCALING_PARAMS,$NUM_ITERS,$TOKENS_TRAINED,$PARAM_DATA_RATIO,$VAL_BPB,$CORE_SCORE,$TRAIN_TIME" >> "$RESULTS_FILE" done log "==============================================" log "Jan 7 Miniseries Complete!" log "==============================================" log "Results saved to: $RESULTS_FILE" echo "" echo "Results:" column -t -s',' "$RESULTS_FILE"