nanochat/dev/runmac_overnight.sh
Jason Kneen 3e184d343e Improve Mac/MPS compatibility and device handling
Added dev/runmac_overnight.sh for optimized Mac training. Updated device-specific logic throughout dataloader, GPT, Muon optimizer, and training scripts to avoid CUDA-only features on MPS/CPU (e.g., torch.compile, pin_memory, non_blocking, bfloat16). Relaxed torch version constraints in pyproject.toml and removed Linux/CUDA-specific PyTorch config for better macOS support.
2025-10-22 01:55:38 +01:00

127 lines
3.4 KiB
Bash
Executable File

#!/bin/bash
# runmac_overnight.sh - long-running nanochat training pipeline sized for a
# Mac (MPS / Apple Silicon).
#
# Author's estimate: 8-12 hours of wall-clock time.
# Expected result (per the author): a much better chatbot with coherent
# responses.
#
# Launch this from the directory that contains .venv/, because the virtual
# environment below is activated through a relative path.

# Stop at the first command that returns a non-zero status.
set -o errexit

# Opening banner with the start timestamp, so the log shows when the run began.
banner_rule="=================================="
printf '%s\n' \
  "$banner_rule" \
  "nanochat Mac Overnight Training" \
  "$banner_rule" \
  "Started: $(date)" \
  ""

# Bring the project's virtual environment into this shell.
. .venv/bin/activate
# ---- Run configuration ------------------------------------------------------
# The later stages of this script read these values.
DEPTH=6              # model depth: 6 layers instead of 4
BASE_ITERATIONS=500  # --num_iterations for base training
MID_ITERATIONS=150   # --num_iterations for midtraining
SFT_ITERATIONS=150   # --num_iterations for chat fine-tuning (SFT)
DATA_SHARDS=50       # number of training-data shards to download

# Print the settings once so they are recorded at the top of the run log.
cat <<EOF
Configuration:
 Model depth: $DEPTH (36.7M → 82M params)
 Base iterations: $BASE_ITERATIONS
 Mid iterations: $MID_ITERATIONS
 SFT iterations: $SFT_ITERATIONS
 Data shards: $DATA_SHARDS

EOF
# Reset state left over from an earlier run: remove a stale report.md from the
# working directory, then call the report module with --reset.
printf '%s\n' "Cleaning up previous training..."
rm -f -- report.md
python -m scripts.report --reset

# Step 1: download $DATA_SHARDS shards of training data via the dataset module.
printf '\n%s\n' "Step 1/6: Downloading training data ($DATA_SHARDS shards)..."
python -m nanochat.dataset -n "$DATA_SHARDS"
# Step 2: fetch the identity conversations file unless a usable copy is
# already cached.
echo ""
echo "Step 2/6: Downloading identity conversations..."
identity_file="$HOME/.cache/nanochat/identity_conversations.jsonl"
identity_url="https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
# -s instead of -f: a zero-byte file is not a usable download, so fetch again.
if [ ! -s "$identity_file" ]; then
  # curl -o does not create missing parent directories.
  mkdir -p "$(dirname "$identity_file")"
  # -f makes curl exit non-zero on an HTTP error instead of saving the error
  # page as if it were the dataset. The transfer goes to a side file that is
  # renamed only on success, so a failed or interrupted download can never be
  # mistaken for a complete one by the cache check on the next run.
  if ! curl -fL -o "${identity_file}.part" "$identity_url"; then
    rm -f "${identity_file}.part"
    echo "ERROR: failed to download $identity_url" >&2
    exit 1
  fi
  mv "${identity_file}.part" "$identity_file"
else
  echo " Already downloaded, skipping."
fi
# Step 3: tokenizer stage.
# NOTE(review): this executes the nanochat.tokenizer module as a script;
# confirm that running it with -m actually performs tokenizer training.
printf '\n%s\n' "Step 3/6: Training tokenizer..."
python -m nanochat.tokenizer
# Step 4: base-model training.
# The flags live in an array so each sits on its own line without backslash
# continuations; depth and iteration count come from the configuration above.
printf '\n%s\n%s\n' \
  "Step 4/6: Training base model ($BASE_ITERATIONS iterations)..." \
  " This will take ~2-4 hours..."
base_train_flags=(
  --depth="$DEPTH"
  --max_seq_len=1024
  --device_batch_size=1
  --total_batch_size=1024
  --num_iterations="$BASE_ITERATIONS"
  --eval_every=100
  --eval_tokens=8192
  --core_metric_every=250
  --core_metric_max_per_task=20
  --sample_every=100
)
python -m scripts.base_train "${base_train_flags[@]}"
# Evaluate the base model: run base_loss first, then base_eval.
printf '\n%s\n' "Evaluating base model..."
for eval_module in scripts.base_loss scripts.base_eval; do
  python -m "$eval_module"
done
# Step 5: midtraining, using the same batch and sequence-length settings as
# base training.
printf '\n%s\n%s\n' \
  "Step 5/6: Midtraining ($MID_ITERATIONS iterations)..." \
  " This will take ~2-3 hours..."
mid_train_flags=(
  --num_iterations="$MID_ITERATIONS"
  --device_batch_size=1
  --max_seq_len=1024
  --total_batch_size=1024
  --eval_every=50
)
python -m scripts.mid_train "${mid_train_flags[@]}"
# Step 6: supervised fine-tuning (SFT) for chat.
printf '\n%s\n%s\n' \
  "Step 6/6: Chat fine-tuning (SFT) ($SFT_ITERATIONS iterations)..." \
  " This will take ~2-3 hours..."
chat_sft_flags=(
  --num_iterations="$SFT_ITERATIONS"
  --device_batch_size=1
  --target_examples_per_step=8
  --eval_steps=10
)
python -m scripts.chat_sft "${chat_sft_flags[@]}"
# Post-training wrap-up: chat evaluation, report generation, report copy.
printf '\n%s\n' "Running final evaluations..."
# Chat evaluation is best-effort: a failure is announced and the run continues
# instead of aborting.
if ! python -m scripts.chat_eval -i sft; then
  echo "Chat eval had issues, skipping..."
fi

printf '\n%s\n' "Generating final report..."
python -m scripts.report

# Keep a copy of the generated report in the current directory.
cp ~/.cache/nanochat/report/report.md ./report_overnight.md
# Closing banner: finish timestamp, how to chat with the model, and where the
# report copy was written.
cat <<EOF

==================================
Training Complete!
==================================
Finished: $(date)

Your chatbot is ready! Chat with it:
 python -m scripts.chat_cli -i sft

Or start the web UI:
 python -m scripts.chat_web -i sft

Report saved to: report_overnight.md
==================================
EOF