"""
Modal deployment for nanochat - serves the existing chat_web.py FastAPI app on H100.
Usage:
modal deploy modal_serve.py
This will:
1. Build a container image with PyTorch, FastAPI, and the nanochat module
2. Load the best available checkpoint (from sft by default)
3. Serve the chat UI and API endpoints from scripts/chat_web.py
The web UI will be available at the URL printed by Modal after deployment.
Note: Before deploying, upload your model checkpoints to the volume.
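
For example, using the Modal CLI (the chatsft_checkpoints subdirectory is an
assumption based on nanochat's default SFT output layout; adjust both paths
to match your training run):

    modal volume put nanochat-data ~/.cache/nanochat/chatsft_checkpoints \
        /.cache/nanochat/chatsft_checkpoints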
"""
import modal
from pathlib import Path

APP_NAME = "nanochat-serve"
VOLUME_NAME = "nanochat-data"  # Reuse the same volume as modal_speedrun.py

app = modal.App(APP_NAME)

# Reuse volume from modal_speedrun (or create if missing)
vol = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
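# (To see what's already on the volume: `modal volume ls nanochat-data`.)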
# Get the local directory path
LOCAL_DIR = Path(__file__).parent

# Build the Modal image with an environment identical to modal_speedrun.py.
# This ensures consistency between training and serving.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("curl", "build-essential", "pkg-config", "unzip")
    .add_local_dir("dev", "/nanochat/dev", copy=True)
    .add_local_dir("nanochat", "/nanochat/nanochat", copy=True)
    .add_local_dir("rustbpe", "/nanochat/rustbpe", copy=True)
    .add_local_dir("scripts", "/nanochat/scripts", copy=True)
    .add_local_dir("tasks", "/nanochat/tasks", copy=True)
    .add_local_dir("tests", "/nanochat/tests", copy=True)
    .add_local_file("pyproject.toml", "/nanochat/pyproject.toml", copy=True)
    .add_local_file(".python-version", "/nanochat/.python-version", copy=True)
    .add_local_file("README.md", "/nanochat/README.md", copy=True)
    .add_local_file("LICENSE", "/nanochat/LICENSE", copy=True)
    .workdir("/nanochat")
    .run_commands(
        # Install uv (Python package manager)
        "curl -LsSf https://astral.sh/uv/install.sh | sh",
        # Install Rust and set the default toolchain
        "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable",
    )
    .env({"PATH": "/root/.cargo/bin:/root/.local/bin:$PATH"})
    .uv_sync(extras=["gpu"])
    .run_commands(
        # Build the Rust tokenizer (the slow part of the image build)
        "uv run maturin develop --release --manifest-path rustbpe/Cargo.toml",
    )
)

@app.function(
    image=image,
    gpu="H100",
    volumes={"/data": vol},
    timeout=3600,  # 1 hour timeout
    scaledown_window=300,  # Keep the container alive for 5 min after the last request
)
@modal.asgi_app()
def fastapi_app():
    """
    Import and return the FastAPI app from chat_web.py.

    This reuses all the existing logic: endpoints, streaming, validation, etc.
    The only difference is that it runs on Modal infrastructure with an H100 GPU.
    """
    import os
    import sys

    # Set the base directory to where checkpoints are mounted (same as modal_speedrun)
    BASE_DIR = "/data/.cache/nanochat"
    os.environ['NANOCHAT_BASE_DIR'] = BASE_DIR

    # Mock the command-line arguments that chat_web.py expects
    sys.argv = [
        'chat_web.py',
        '--num-gpus', '1',        # Single GPU (Modal handles scaling)
        '--source', 'sft',        # Load from sft checkpoints
        '--temperature', '0.8',   # Default temperature
        '--top-k', '50',          # Default top-k
        '--max-tokens', '512',    # Default max tokens
        '--device-type', 'cuda',  # Use CUDA
        '--dtype', 'bfloat16',    # Use bfloat16 for efficiency
    ]

    # Import the FastAPI app from chat_web.
    # This triggers model loading via the lifespan context manager.
    from scripts.chat_web import app

    print("✅ NanoChat server initialized!")
    print(f"   Checkpoint directory: {BASE_DIR}")
    print("   GPU: H100 x 1")
    return app
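
# A rough smoke test once deployed (the URL follows Modal's usual
# <workspace>--<app-name>-<function-name>.modal.run pattern; the
# /chat/completions route and request body are assumptions based on
# scripts/chat_web.py - check that file for the exact API):
#
#   curl -N https://<your-workspace>--nanochat-serve-fastapi-app.modal.run/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{"messages": [{"role": "user", "content": "Hello!"}]}'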

# Convenience local entrypoint for testing
@app.local_entrypoint()
def main():
    """
    Convenience entrypoint for `modal run`. Note that this only prints
    deployment info; to actually deploy the serving endpoint, run:

        modal deploy modal_serve.py
    """
    print("Deploying nanochat serving endpoint...")
    print(f"Using volume: {VOLUME_NAME}")
    print("GPU: H100 x 1")
    print("\nThe app will be available at the URL printed by Modal.")