mirror of
https://github.com/karpathy/nanochat.git
synced 2026-03-28 15:45:15 +00:00
This refactoring enables the nanochat project to be executed as a scalable and robust pipeline on Vertex AI. The monolithic `speedrun.sh` script has been decomposed into a series of containerized components orchestrated by a Kubeflow pipeline. The codebase has been updated to use Google Cloud Storage for artifact management, allowing for seamless data sharing between pipeline steps. A `Dockerfile` and Python wrappers for each pipeline step have been added to the `vertex_pipelines` directory. The `README.md` has been updated with instructions on how to build the Docker image and run the Vertex AI pipeline.
38 lines
1.1 KiB
Python
38 lines
1.1 KiB
Python
import os
|
|
import subprocess
|
|
import argparse
|
|
from nanochat.common import get_base_dir
|
|
|
|
def main():
    """Run the nanochat mid-training + evaluation step of the Vertex AI pipeline.

    Points nanochat's base directory at the given GCS bucket, downloads the
    identity-conversations dataset into it, runs mid-training under torchrun,
    and finally evaluates the resulting "mid" checkpoint.

    Raises:
        subprocess.CalledProcessError: if any of the launched commands
            (curl / torchrun) exits with a non-zero status (check=True).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
    parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
    # Previously hard-coded to 8 in both torchrun invocations; exposed as a
    # flag (default unchanged) so the step can run on differently-sized nodes.
    parser.add_argument("--nproc-per-node", type=int, default=8,
                        help="Number of GPUs per node for torchrun (default: 8)")
    args = parser.parse_args()

    # Set the base directory to the GCS bucket BEFORE any get_base_dir()
    # call, so all artifacts are read/written through the bucket.
    os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket

    # Download the identity conversations dataset.
    _download_identity_dataset()

    # Run mid-training.
    _torchrun(args.nproc_per_node, "scripts.mid_train", f"--run={args.wandb_run}")

    # Evaluate the model.
    _torchrun(args.nproc_per_node, "scripts.chat_eval", "-i", "mid")


def _download_identity_dataset():
    """Fetch the identity conversations dataset into the nanochat base dir."""
    # NOTE(review): if NANOCHAT_BASE_DIR is a raw gs:// URI rather than a
    # mounted path, curl cannot write to it directly — confirm the bucket is
    # FUSE-mounted (or a local staging path) in the pipeline container.
    subprocess.run([
        "curl", "-L", "-o",
        f"{get_base_dir()}/identity_conversations.jsonl",
        "https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl",
    ], check=True)


def _torchrun(nproc_per_node, module, *module_args):
    """Launch a Python module under torchrun on a single node.

    Args:
        nproc_per_node: number of worker processes (GPUs) to spawn.
        module: dotted module path passed to ``-m``.
        *module_args: arguments forwarded to the module after ``--``.
    """
    subprocess.run([
        "torchrun", "--standalone", f"--nproc_per_node={nproc_per_node}",
        "-m", module, "--",
        *module_args,
    ], check=True)


if __name__ == "__main__":
    main()
|