#!/bin/bash

# Showing an example run for exercising some of the code paths on the CPU (or MPS on Macbooks)
# Run as:
# bash dev/runcpu.sh

# NOTE: Training LLMs requires GPU compute and $$$. You will not get far on your Macbook.
# Think of this run as educational/fun demo, not something you should expect to work well.
# This is also why I hide this script away in dev/

# Environment setup.
# OMP_NUM_THREADS is exported so every process launched below inherits it.
export OMP_NUM_THREADS=1
# Directory this script uses for nanochat artifacts (the eval bundle is moved
# into it further down). Exported so child processes see the same location;
# presumably the nanochat Python modules honor this variable -- confirm.
export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat"
# Quoted so a $HOME containing spaces or glob characters stays a single path.
mkdir -p "$NANOCHAT_BASE_DIR"

# Resolve MEMORY_SIZE (whole GB) unless the caller already provided one.
#   - macOS: read hw.memsize (bytes) via sysctl and truncate to whole GB.
#   - anything else: no detection is attempted; a conservative 16GB is assumed.
if [[ -z "${MEMORY_SIZE}" ]]; then
    case "$OSTYPE" in
        darwin*)
            MEMORY_SIZE=$(sysctl hw.memsize | awk '{print int($2/1024/1024/1024)}')
            echo "Auto-detected macOS memory: ${MEMORY_SIZE}GB"
            ;;
        *)
            MEMORY_SIZE=16
            echo "Non-macOS system, using conservative: ${MEMORY_SIZE}GB"
            ;;
    esac
fi

# Choose batch/eval sizes for a given amount of system memory.
#
# Arguments:
#   $1 - memory size in whole GB (non-negative integer)
# Globals written:
#   DEVICE_BATCH_SIZE, TOTAL_BATCH_SIZE, EVAL_TOKENS, SPLIT_TOKENS
# Outputs:
#   the selected memory profile on stdout; a warning on stderr when $1 is not
#   a whole number (the conservative profile is used in that case)
#
# Note: total_batch_size must be divisible by (device_batch_size * max_seq_len).
# With max_seq_len=1024, every tier uses total_batch_size = device_batch_size * 1024.
select_memory_profile() {
    local mem_gb="${1:-}"

    # An empty or non-numeric value (failed detection, or an override such as
    # "64GB") would otherwise make every comparison below error out.
    if [[ ! "$mem_gb" =~ ^[0-9]+$ ]]; then
        echo "Warning: MEMORY_SIZE='${mem_gb}' is not a whole number of GB; using the conservative profile" >&2
        mem_gb=0
    fi
    # Force base 10 so a value with leading zeros (e.g. 064) is not read as octal.
    mem_gb=$((10#$mem_gb))

    if (( mem_gb >= 128 )); then
        DEVICE_BATCH_SIZE=16
        TOTAL_BATCH_SIZE=16384    # 16 * 1024 = 16384
        EVAL_TOKENS=16384
        SPLIT_TOKENS=16384
        echo "Memory profile: 128GB+ (High performance)"
    elif (( mem_gb >= 64 )); then
        DEVICE_BATCH_SIZE=8
        TOTAL_BATCH_SIZE=8192     # 8 * 1024 = 8192
        EVAL_TOKENS=8192
        SPLIT_TOKENS=8192
        echo "Memory profile: 64GB (Good performance)"
    elif (( mem_gb >= 32 )); then
        DEVICE_BATCH_SIZE=4
        TOTAL_BATCH_SIZE=4096     # 4 * 1024 = 4096
        EVAL_TOKENS=4096
        SPLIT_TOKENS=4096
        echo "Memory profile: 32GB (Moderate performance)"
    else
        DEVICE_BATCH_SIZE=1
        TOTAL_BATCH_SIZE=1024     # 1 * 1024 = 1024
        EVAL_TOKENS=2048
        SPLIT_TOKENS=2048
        echo "Memory profile: <32GB (Conservative)"
    fi
}

select_memory_profile "$MEMORY_SIZE"

echo "Using: device_batch_size=$DEVICE_BATCH_SIZE, total_batch_size=$TOTAL_BATCH_SIZE"
echo ""
# Toolchain bootstrap: uv + virtualenv first, then Rust and the rustbpe crate.

# Run the uv installer only when uv is not already on PATH.
if ! command -v uv > /dev/null 2>&1; then
    curl -LsSf https://astral.sh/uv/install.sh | sh
fi

# Create .venv on first run, sync dependencies, then activate it in this shell.
if [ ! -d ".venv" ]; then
    uv venv
fi
uv sync
source .venv/bin/activate

# WANDB_RUN falls back to "dummy" when unset or empty
# (it is not referenced again in the visible part of this script).
: "${WANDB_RUN:=dummy}"

# Run the rustup installer (passing -y), load cargo's environment into this
# shell, and build the crate at rustbpe/Cargo.toml in release mode via maturin.
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
source "$HOME/.cargo/env"
uv run maturin develop --release --manifest-path rustbpe/Cargo.toml
# Fetch the evaluation bundle once; the whole step is skipped when
# $NANOCHAT_BASE_DIR/eval_bundle already exists.
EVAL_BUNDLE_URL=https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip
if [ ! -d "$NANOCHAT_BASE_DIR/eval_bundle" ]; then
    # -f makes curl fail on an HTTP error instead of saving the error page as
    # the zip. The bundle is moved into place only if both the download and
    # the unpack succeeded, so a stale or partial directory is never installed.
    if curl -fL -o eval_bundle.zip "$EVAL_BUNDLE_URL" && unzip -q eval_bundle.zip; then
        # Quoted so a base dir containing spaces stays a single argument.
        mv eval_bundle "$NANOCHAT_BASE_DIR"
    else
        echo "Warning: failed to download or unpack the eval bundle from $EVAL_BUNDLE_URL" >&2
    fi
    # Remove the archive on both paths; -f keeps this quiet if it was never created.
    rm -f eval_bundle.zip
fi

# Start from a clean report.
python -m nanochat.report reset

# Tokenizer stage: run the dataset module, train the tokenizer on ~1B
# characters, then run the tokenizer evaluation.
dataset_n=4                    # value passed to nanochat.dataset via -n (presumably a shard count -- confirm)
tok_train_chars=1000000000     # ~1B characters
python -m nanochat.dataset -n "$dataset_n"
python -m scripts.tok_train --max_chars="$tok_train_chars"
python -m scripts.tok_eval

# Base training: a very small 4-layer model on the CPU/MPS.
# Batch sizes come from the memory profile selected earlier in this script.
# Only 50 optimization steps are run (bump --num_iterations for better results);
# evaluation, the core metric and sampling are each scheduled every 50 steps.
base_train_args=(
    --depth=4
    --max_seq_len=1024
    "--device_batch_size=$DEVICE_BATCH_SIZE"
    "--total_batch_size=$TOTAL_BATCH_SIZE"
    --eval_every=50
    "--eval_tokens=$EVAL_TOKENS"
    --core_metric_every=50
    --core_metric_max_per_task=12
    --sample_every=50
    --num_iterations=50
)
python -m scripts.base_train "${base_train_args[@]}"

# Follow-up passes over the base model: loss, then eval capped at 5 per task.
base_loss_args=(
    "--device_batch_size=$DEVICE_BATCH_SIZE"
    "--split_tokens=$SPLIT_TOKENS"
)
python -m scripts.base_loss "${base_loss_args[@]}"
python -m scripts.base_eval --max-per-task=5

# Midtraining: 100 iterations, reusing the batch/eval sizes chosen earlier.
mid_train_args=(
    --max_seq_len=1024
    "--device_batch_size=$DEVICE_BATCH_SIZE"
    --eval_every=50
    "--eval_tokens=$EVAL_TOKENS"
    "--total_batch_size=$TOTAL_BATCH_SIZE"
    --num_iterations=100
)
python -m scripts.mid_train "${mid_train_args[@]}"

# Eval results will be terrible; this only exercises the code paths.
# NOTE(review): an earlier comment here said the execution memory limit is
# lowered to 1MB to avoid warnings on smaller systems. No flag on the command
# below does that -- confirm where (or whether) that limit is actually set.
chat_eval_args=(
    -i mid
    --max-new-tokens=128
    --max-problems=20
)
python -m scripts.chat_eval "${chat_eval_args[@]}"

# SFT: 100 iterations; the per-step example target is twice the device batch size.
sft_examples_per_step=$((DEVICE_BATCH_SIZE * 2))
sft_args=(
    "--device_batch_size=$DEVICE_BATCH_SIZE"
    "--target_examples_per_step=$sft_examples_per_step"
    --num_iterations=100
    --eval_steps=4
    --eval_metrics_max_problems=16
)
python -m scripts.chat_sft "${sft_args[@]}"

# Chat CLI (optional, disabled by default: uncomment to try scripts.chat_cli
# with the sample prompt below)
# python -m scripts.chat_cli -p "Why is the sky blue?"

# Chat Web (optional, disabled by default: uncomment to run scripts.chat_web)
# python -m scripts.chat_web

# Final step: invoke the report module's "generate" command
# (the report was reset earlier in this script, before the tokenizer stage).
python -m nanochat.report generate