#!/bin/bash
# Optimized overnight training for Mac (MPS/Apple Silicon)
# Expected runtime: 8-12 hours
# Expected result: Much better chatbot with coherent responses
#
# Usage (run from the directory that contains .venv/):
#   bash dev/runmac_overnight.sh
#
# Optional environment overrides:
#   MEMORY_SIZE        System memory in GB; selects the batch-size profile
#                      (auto-detected with sysctl when unset).
#   DEVICE_BATCH_SIZE  Per-device batch size; replaces the profile's value.
#   DEPTH              Model depth (default 6).
#   BASE_ITERATIONS    Base training iterations (default 500).
#   MID_ITERATIONS     Midtraining iterations (default 150).
#   SFT_ITERATIONS     SFT iterations (default 150).
#   DATA_SHARDS        Number of data shards to download (default 50).

# Exit on error. `-u` is deliberately not enabled: the optional overrides
# above are read while unset, and the sourced activate script is not under
# this script's control.
set -e

# Print an error message to stderr and abort the run.
die() {
  echo "Error: $*" >&2
  exit 1
}

# Sequence length passed to every training stage; the batch-size arithmetic
# below depends on it.
MAX_SEQ_LEN=1024

IDENTITY_URL="https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
IDENTITY_FILE="$HOME/.cache/nanochat/identity_conversations.jsonl"
REPORT_FILE="$HOME/.cache/nanochat/report/report.md"

echo "=================================="
echo "nanochat Mac Overnight Training"
echo "=================================="
echo "Started: $(date)"
echo ""

# Activate virtual environment
source .venv/bin/activate

# Memory-based configuration
# Detect system memory (in GB) or allow manual override
if [ -z "$MEMORY_SIZE" ]; then
  if mem_bytes=$(sysctl -n hw.memsize 2>/dev/null) && [[ "$mem_bytes" =~ ^[0-9]+$ ]]; then
    MEMORY_SIZE=$((mem_bytes / 1024 / 1024 / 1024))
    echo "Auto-detected memory: ${MEMORY_SIZE}GB"
  else
    # Detection failed (e.g. sysctl has no hw.memsize key here): fall back to
    # the smallest profile instead of feeding an empty value to the tests below.
    MEMORY_SIZE=0
    echo "Warning: could not read hw.memsize via sysctl; using the conservative profile." >&2
    echo "         Set MEMORY_SIZE (in GB) to choose a profile manually." >&2
  fi
else
  [[ "$MEMORY_SIZE" =~ ^[0-9]+$ ]] \
    || die "MEMORY_SIZE must be a whole number of GB (got '$MEMORY_SIZE')"
  echo "Using specified memory: ${MEMORY_SIZE}GB"
fi

# Calculate optimal batch sizes based on available memory
# Conservative estimates for MPS (unified memory shared with system)
# Note: total_batch_size must be divisible by (device_batch_size * max_seq_len)
if [ "$MEMORY_SIZE" -ge 128 ]; then
  profile_device_batch=16
  profile_eval_tokens=16384
  echo "Memory profile: 128GB+ (High performance)"
elif [ "$MEMORY_SIZE" -ge 64 ]; then
  profile_device_batch=8
  profile_eval_tokens=8192
  echo "Memory profile: 64GB (Good performance)"
elif [ "$MEMORY_SIZE" -ge 32 ]; then
  profile_device_batch=4
  profile_eval_tokens=4096
  echo "Memory profile: 32GB (Moderate performance)"
else
  profile_device_batch=1
  profile_eval_tokens=2048
  echo "Memory profile: <32GB (Conservative)"
fi

# Honour a DEVICE_BATCH_SIZE supplied by the caller (advertised in the help
# text below); otherwise take the value from the selected memory profile.
if [ -n "$DEVICE_BATCH_SIZE" ]; then
  [[ "$DEVICE_BATCH_SIZE" =~ ^[1-9][0-9]*$ ]] \
    || die "DEVICE_BATCH_SIZE must be a positive integer (got '$DEVICE_BATCH_SIZE')"
  echo "Using specified device batch size: $DEVICE_BATCH_SIZE"
else
  DEVICE_BATCH_SIZE=$profile_device_batch
fi

# One device batch per optimizer step: device_batch_size * max_seq_len always
# divides this total, so the divisibility rule holds for overrides too.
TOTAL_BATCH_SIZE=$((DEVICE_BATCH_SIZE * MAX_SEQ_LEN))

# Evaluation budgets come from the profile, raised to one full device batch
# when an overridden DEVICE_BATCH_SIZE would exceed them.
# NOTE(review): assumes the eval scripts need at least one full device batch
# of tokens -- confirm against scripts.base_train / scripts.base_loss.
EVAL_TOKENS=$profile_eval_tokens
if [ "$EVAL_TOKENS" -lt "$TOTAL_BATCH_SIZE" ]; then
  EVAL_TOKENS=$TOTAL_BATCH_SIZE
fi
SPLIT_TOKENS=$EVAL_TOKENS

# Allow manual overrides
DEPTH=${DEPTH:-6}                           # Bigger model (6 layers vs 4)
BASE_ITERATIONS=${BASE_ITERATIONS:-500}     # More base training
MID_ITERATIONS=${MID_ITERATIONS:-150}       # More midtraining
SFT_ITERATIONS=${SFT_ITERATIONS:-150}       # More SFT
DATA_SHARDS=${DATA_SHARDS:-50}              # More training data

echo ""
echo "Configuration:"
echo " System Memory: ${MEMORY_SIZE}GB"
echo " Model depth: $DEPTH (~82M params for d6)"
echo " Device batch size: $DEVICE_BATCH_SIZE"
echo " Total batch size: $TOTAL_BATCH_SIZE"
echo " Eval tokens: $EVAL_TOKENS"
echo " Base iterations: $BASE_ITERATIONS"
echo " Mid iterations: $MID_ITERATIONS"
echo " SFT iterations: $SFT_ITERATIONS"
echo " Data shards: $DATA_SHARDS"
echo ""
echo "To override, set environment variables:"
echo " MEMORY_SIZE=64 bash dev/runmac_overnight.sh"
echo " DEVICE_BATCH_SIZE=8 bash dev/runmac_overnight.sh"
echo ""

# Clean up old run
echo "Cleaning up previous training..."
rm -f report.md
python -m nanochat.report reset

# Download training data
echo ""
echo "Step 1/6: Downloading training data ($DATA_SHARDS shards)..."
python -m nanochat.dataset -n "$DATA_SHARDS"

# Download identity conversations
echo ""
echo "Step 2/6: Downloading identity conversations..."
# -s (exists and non-empty) so a zero-byte leftover is fetched again.
if [ ! -s "$IDENTITY_FILE" ]; then
  mkdir -p "$(dirname "$IDENTITY_FILE")"
  # --fail makes curl exit non-zero on HTTP errors instead of saving the error
  # page; writing to a temp name first means a partial or failed download is
  # never mistaken for the finished file on the next run.
  if ! curl -fL -o "$IDENTITY_FILE.tmp" "$IDENTITY_URL"; then
    rm -f "$IDENTITY_FILE.tmp"
    die "failed to download identity conversations from $IDENTITY_URL"
  fi
  mv "$IDENTITY_FILE.tmp" "$IDENTITY_FILE"
else
  echo " Already downloaded, skipping."
fi

# Build tokenizer
echo ""
echo "Step 3/6: Training tokenizer..."
python -m nanochat.tokenizer

# Base model training
echo ""
echo "Step 4/6: Training base model ($BASE_ITERATIONS iterations)..."
echo " Device batch size: $DEVICE_BATCH_SIZE, Total batch size: $TOTAL_BATCH_SIZE"
echo " This will take ~2-4 hours..."
python -m scripts.base_train \
  --depth="$DEPTH" \
  --max_seq_len="$MAX_SEQ_LEN" \
  --device_batch_size="$DEVICE_BATCH_SIZE" \
  --total_batch_size="$TOTAL_BATCH_SIZE" \
  --num_iterations="$BASE_ITERATIONS" \
  --eval_every=100 \
  --eval_tokens="$EVAL_TOKENS" \
  --core_metric_every=250 \
  --core_metric_max_per_task=20 \
  --sample_every=100

# Evaluate base model
echo ""
echo "Evaluating base model..."
python -m scripts.base_loss \
  --device_batch_size="$DEVICE_BATCH_SIZE" \
  --split_tokens="$SPLIT_TOKENS"
python -m scripts.base_eval

# Midtraining
echo ""
echo "Step 5/6: Midtraining ($MID_ITERATIONS iterations)..."
echo " Device batch size: $DEVICE_BATCH_SIZE, Total batch size: $TOTAL_BATCH_SIZE"
echo " This will take ~2-3 hours..."
python -m scripts.mid_train \
  --num_iterations="$MID_ITERATIONS" \
  --device_batch_size="$DEVICE_BATCH_SIZE" \
  --max_seq_len="$MAX_SEQ_LEN" \
  --total_batch_size="$TOTAL_BATCH_SIZE" \
  --eval_every=50 \
  --eval_tokens="$EVAL_TOKENS"

# SFT training
echo ""
echo "Step 6/6: Chat fine-tuning (SFT) ($SFT_ITERATIONS iterations)..."
echo " Device batch size: $DEVICE_BATCH_SIZE"
echo " This will take ~2-3 hours..."
python -m scripts.chat_sft \
  --num_iterations="$SFT_ITERATIONS" \
  --device_batch_size="$DEVICE_BATCH_SIZE" \
  --target_examples_per_step="$((DEVICE_BATCH_SIZE * 2))" \
  --eval_steps=10

# Final evaluation (best-effort: a failure here must not abort the run
# before the report is generated)
echo ""
echo "Running final evaluations..."
python -m scripts.chat_eval -i sft || echo "Chat eval had issues, skipping..."

# Generate report
echo ""
echo "Generating final report..."
python -m nanochat.report generate

# Copy report to current directory
cp "$REPORT_FILE" ./report_overnight.md

echo ""
echo "=================================="
echo "Training Complete!"
echo "=================================="
echo "Finished: $(date)"
echo ""
echo "Your chatbot is ready! Chat with it:"
echo " python -m scripts.chat_cli -i sft"
echo ""
echo "Or start the web UI:"
echo " python -m scripts.chat_web -i sft"
echo ""
echo "Report saved to: report_overnight.md"
echo "=================================="