mirror of
https://github.com/karpathy/nanochat.git
synced 2026-04-23 09:18:52 +00:00
This refactoring enables the nanochat project to be executed as a scalable and robust pipeline on Vertex AI. The monolithic `speedrun.sh` script has been decomposed into a series of containerized components orchestrated by a Kubeflow pipeline. The codebase has been updated to use Google Cloud Storage for artifact management, allowing for seamless data sharing between pipeline steps. A `Dockerfile` and Python wrappers for each pipeline step have been added to the `vertex_pipelines` directory. The `README.md` has been updated with instructions on how to build the Docker image and run the Vertex AI pipeline.
36 lines · 1.1 KiB · Python
import os
|
|
import subprocess
|
|
import argparse
|
|
from nanochat.common import get_base_dir
|
|
|
|
def main():
    """Run the nanochat base-model stage as a Vertex AI pipeline step.

    Pre-trains the depth-20 base model, evaluates train/val loss, and runs the
    CORE evaluation, all via torchrun. Artifacts are shared between pipeline
    steps through the GCS bucket passed on the command line.

    Command-line arguments:
        --gcs-bucket      (required) GCS bucket used as NANOCHAT_BASE_DIR.
        --wandb-run       Wandb run name forwarded to base_train (default "dummy").
        --nproc-per-node  GPUs per node for torchrun (default 8, matching the
                          original hard-coded launch configuration).

    Raises:
        subprocess.CalledProcessError: if any torchrun step exits non-zero
            (check=True), so the pipeline step fails fast.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
    parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
    parser.add_argument("--nproc-per-node", type=int, default=8,
                        help="Number of processes (GPUs) per node for torchrun")
    args = parser.parse_args()

    # Set the base directory to the GCS bucket so every nanochat script in this
    # step (and later pipeline steps) reads/writes artifacts from the same place.
    os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket

    def _torchrun(module, *extra_args):
        # Launch a nanochat script under torchrun; fail the step on any error.
        subprocess.run([
            "torchrun", "--standalone", f"--nproc_per_node={args.nproc_per_node}",
            "-m", module, *extra_args,
        ], check=True)

    # Pre-train the d20 model ("--" separates torchrun args from script args).
    _torchrun("scripts.base_train", "--", "--depth=20", f"--run={args.wandb_run}")

    # Evaluate the model on a larger chunk of train/val data and draw some samples.
    _torchrun("scripts.base_loss")

    # Evaluate the model on CORE tasks.
    _torchrun("scripts.base_eval")
# Script entry point: run the pipeline step only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()