diff --git a/cloud/README.md b/cloud/README.md new file mode 100644 index 0000000..a1f48f5 --- /dev/null +++ b/cloud/README.md @@ -0,0 +1,71 @@ +# Running nanochat on the Cloud with SkyPilot + +This directory contains [SkyPilot](https://skypilot.readthedocs.io/) configurations for easily launching nanochat on major cloud providers (AWS, GCP, Azure), GPU clouds (Lambda, Nebius, RunPod, etc.), and Kubernetes clusters. + +## Prerequisites + +1. Install SkyPilot and configure it with your cloud provider(s) or Kubernetes cluster: + - Follow the [SkyPilot installation guide](https://docs.skypilot.co/en/latest/getting-started/installation.html) + - Configure your cloud credentials (AWS, GCP, Azure, Lambda, Nebius, etc.) OR + - Configure Kubernetes access via [SkyPilot's Kubernetes support](https://docs.skypilot.co/en/latest/reference/kubernetes/index.html) + +## Training: Running the Speedrun Pipeline + +Launch the speedrun training pipeline on any cloud provider with a single command: + +```bash +sky launch -c nanochat-speedrun cloud/speedrun.sky.yaml --infra aws  # or gcp, azure, lambda, nebius, k8s, ... +``` + +This will: +- Provision an 8xH100 GPU node +- Set up the environment +- Run the complete training pipeline via `speedrun.sh` +- Save trained model checkpoints to `s3://nanochat-data` (change this to your own bucket) +- Complete in approximately 4 hours (~$100 on most providers) + +### Monitoring Training Progress + +After launching, you can SSH into the cluster and monitor progress: + +```bash +# SSH into the cluster +ssh nanochat-speedrun + +# View the speedrun logs +sky logs nanochat-speedrun +``` + +## Serving: Deploy Your Trained Model + +Once training is complete, serve your trained model with the web UI: + +```bash +sky launch -c nanochat-serve cloud/serve.sky.yaml --infra aws  # or gcp, azure, lambda, nebius, k8s, ... +``` + +This will: +- Provision a 1xH100 GPU node (much cheaper than the 8xH100 VM used for training) +- Load model weights from the same `s3://nanochat-data` bucket used during training +- Serve the web chat interface on port 8000 +- 
Cost ~$2-3/hour on most providers + +### Accessing the Web UI + +Get the endpoint URL to access the chat interface: + +```bash +sky status --endpoint 8000 nanochat-serve +``` + +Open the displayed URL in your browser to chat with your trained model! +image + + +### Shared Storage + +Both training and serving tasks use [SkyPilot's bucket mounting functionality](https://docs.skypilot.co/en/latest/reference/storage.html) to preserve and share model weights. This allows you to: +- Train once, serve multiple times without re-downloading weights +- Share trained models across different serving instances + + diff --git a/cloud/serve.sky.yaml b/cloud/serve.sky.yaml new file mode 100644 index 0000000..d2c388f --- /dev/null +++ b/cloud/serve.sky.yaml @@ -0,0 +1,35 @@ +# Serve a trained nanochat model with the web UI +# +# Launch: +# sky launch -c nanochat-serve cloud/serve.sky.yaml --infra aws  (or gcp, azure, lambda, nebius, k8s, ...) +# +# Access the web UI: +# sky status --endpoint 8000 nanochat-serve +# +# Then open the URL in your browser to chat with your model! + +name: nanochat-serve + +resources: + accelerators: H100:1 # A single GPU is sufficient for inference + ports: 8000 # Expose port 8000 for the web UI + disk_size: 100 + +file_mounts: + /tmp/nanochat: + source: s3://nanochat-data + +workdir: . 
+ +setup: | + uv sync + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source "$HOME/.cargo/env" + source .venv/bin/activate + unset CONDA_PREFIX + uv run maturin develop --release --manifest-path rustbpe/Cargo.toml + +run: | + export NANOCHAT_BASE_DIR=/tmp/nanochat + source .venv/bin/activate + python -m scripts.chat_web --host 0.0.0.0 --port 8000 diff --git a/cloud/speedrun.sky.yaml b/cloud/speedrun.sky.yaml new file mode 100644 index 0000000..7e525eb --- /dev/null +++ b/cloud/speedrun.sky.yaml @@ -0,0 +1,28 @@ +# Run the full nanochat training speedrun +# +# Launch: +# sky launch -c nanochat-speedrun cloud/speedrun.sky.yaml --infra aws  (or gcp, azure, lambda, nebius, k8s, ...) +# +# Monitor progress: +# sky logs nanochat-speedrun +# +# This will train the model using 8x H100 GPUs and save results to S3. + +name: nanochat-speedrun + +resources: + accelerators: H100:8 + disk_size: 512 + +file_mounts: + /tmp/nanochat: + source: s3://nanochat-data + +workdir: . + +setup: | + sudo apt-get update && sudo apt-get install -y unzip + +run: | + export NANOCHAT_BASE_DIR=/tmp/nanochat + bash speedrun.sh diff --git a/speedrun.sh b/speedrun.sh index a9b579a..7e07766 100644 --- a/speedrun.sh +++ b/speedrun.sh @@ -12,7 +12,7 @@ # Default intermediate artifacts directory is in ~/.cache/nanochat export OMP_NUM_THREADS=1 -export NANOCHAT_BASE_DIR="$HOME/.cache/nanochat" +export NANOCHAT_BASE_DIR="${NANOCHAT_BASE_DIR:-$HOME/.cache/nanochat}"  # keep a caller-provided value, else use the default; exported so child processes see it mkdir -p $NANOCHAT_BASE_DIR # -----------------------------------------------------------------------------