"""
|
|
Modal deployment for nanochat - serves the existing chat_web.py FastAPI app on H100.
|
|
|
|
Usage:
|
|
modal deploy modal_serve.py
|
|
|
|
This will:
|
|
1. Build a container image with PyTorch, FastAPI, and the nanochat module
|
|
2. Load the best available checkpoint (from sft by default)
|
|
3. Serve the chat UI and API endpoints from scripts/chat_web.py
|
|
|
|
The web UI will be available at the URL printed by Modal after deployment.
|
|
|
|
Note: Before deploying, upload your model checkpoints to the volume.
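For example, assuming your local checkpoints live under ~/.cache/nanochat
(this local path is an assumption; adjust it to your setup):

    modal volume put nanochat-data ~/.cache/nanochat /.cache/nanochat

The remote path mirrors NANOCHAT_BASE_DIR below: the volume is mounted at
/data, so checkpoints end up at /data/.cache/nanochat.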
"""

from pathlib import Path

import modal

APP_NAME = "nanochat-serve"
VOLUME_NAME = "nanochat-data"  # Reuse the same volume as modal_speedrun.py

app = modal.App(APP_NAME)

# Reuse the volume from modal_speedrun.py (or create it if missing)
vol = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True)
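# Tip: check that checkpoints are already on the volume before deploying,
# e.g. with the Modal CLI (exact invocation assumed): modal volume ls nanochat-data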

# Get the local directory path (this file's parent)
LOCAL_DIR = Path(__file__).parent

# Build the Modal image with an environment identical to modal_speedrun.py.
# This ensures consistency between training and serving.
image = (
    modal.Image.debian_slim(python_version="3.11")
    .apt_install("curl", "build-essential", "pkg-config", "unzip")
    .add_local_dir("dev", "/nanochat/dev", copy=True)
    .add_local_dir("nanochat", "/nanochat/nanochat", copy=True)
    .add_local_dir("rustbpe", "/nanochat/rustbpe", copy=True)
    .add_local_dir("scripts", "/nanochat/scripts", copy=True)
    .add_local_dir("tasks", "/nanochat/tasks", copy=True)
    .add_local_dir("tests", "/nanochat/tests", copy=True)
    .add_local_file("pyproject.toml", "/nanochat/pyproject.toml", copy=True)
    .add_local_file(".python-version", "/nanochat/.python-version", copy=True)
    .add_local_file("README.md", "/nanochat/README.md", copy=True)
    .add_local_file("LICENSE", "/nanochat/LICENSE", copy=True)
    .workdir("/nanochat")
    .run_commands(
        # Install uv (Python package manager)
        "curl -LsSf https://astral.sh/uv/install.sh | sh",
        # Install Rust and set the default toolchain
        "curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable",
    )
    .env({"PATH": "/root/.cargo/bin:/root/.local/bin:$PATH"})
    .uv_sync(extras=["gpu"])
    .run_commands(
        # Build the Rust tokenizer (the slow part of the image build)
        "uv run maturin develop --release --manifest-path rustbpe/Cargo.toml",
    )
)
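
# If the image build misbehaves, an interactive shell inside it can help;
# the exact CLI form here is an assumption: modal shell modal_serve.py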


@app.function(
    image=image,
    gpu="H100",
    volumes={"/data": vol},
    timeout=3600,  # 1 hour timeout
    scaledown_window=300,  # Keep alive for 5 min after the last request
)
@modal.asgi_app()
def fastapi_app():
    """
    Import and return the FastAPI app from chat_web.py.

    This reuses all the existing logic: endpoints, streaming, validation, etc.
    The only difference is that we run on Modal infrastructure with an H100 GPU.
    """
    import sys
    import os

    # Set the base directory to where checkpoints are mounted (same as modal_speedrun.py)
    BASE_DIR = "/data/.cache/nanochat"
    os.environ['NANOCHAT_BASE_DIR'] = BASE_DIR

    # Mock the command-line arguments that chat_web.py expects
    sys.argv = [
        'chat_web.py',
        '--num-gpus', '1',        # Single GPU (Modal handles scaling)
        '--source', 'sft',        # Load from sft checkpoints
        '--temperature', '0.8',   # Default temperature
        '--top-k', '50',          # Default top-k
        '--max-tokens', '512',    # Default max tokens
        '--device-type', 'cuda',  # Use CUDA
        '--dtype', 'bfloat16',    # Use bfloat16 for efficiency
    ]

    # Import the FastAPI app from chat_web.
    # This triggers model loading via the lifespan context manager.
    from scripts.chat_web import app

    print("✅ NanoChat server initialized!")
    print(f"   Checkpoint directory: {BASE_DIR}")
    print("   GPU: H100 x 1")

    return app
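
# Quick smoke test once deployed (the UI is assumed to be served at the root
# path; see scripts/chat_web.py). The URL shape below is also an assumption,
# based on Modal's usual <workspace>--<app>-<function>.modal.run pattern;
# use whatever URL `modal deploy` actually prints:
#
#   curl -s https://<workspace>--nanochat-serve-fastapi-app.modal.run/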


# Convenience local entrypoint for testing
@app.local_entrypoint()
def main():
    """
    Convenience entrypoint, invoked via `modal run modal_serve.py`.

    This only prints deployment info; to actually deploy the serving
    endpoint, run:

        modal deploy modal_serve.py
    """
    print("Deploying nanochat serving endpoint...")
    print(f"Using volume: {VOLUME_NAME}")
    print("GPU: H100 x 1")
    print("\nThe app will be available at the URL printed by Modal.")