This refactoring enables the nanochat project to be executed as a scalable and robust pipeline on Vertex AI. The monolithic `speedrun.sh` script has been decomposed into a series of containerized components orchestrated by a Kubeflow pipeline. The codebase has been updated to use Google Cloud Storage for artifact management, allowing for seamless data sharing between pipeline steps. A `Dockerfile` and Python wrappers for each pipeline step have been added to the `vertex_pipelines` directory. The `README.md` has been updated with instructions on how to build the Docker image and run the Vertex AI pipeline.
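For orientation, below is a minimal sketch of how one of these containerized steps might be declared and composed with the Kubeflow Pipelines (kfp v2) SDK; the image URI, wrapper path, and component wiring are illustrative assumptions, not the actual definitions in `vertex_pipelines`.

```python
# Hypothetical sketch, not the repository's actual pipeline definition.
# The image URI and wrapper path below are placeholders.
from kfp import dsl, compiler


@dsl.container_component
def tokenizer_step(gcs_bucket: str):
    # Each pipeline step runs one of the Python wrappers inside the nanochat image.
    return dsl.ContainerSpec(
        image="us-docker.pkg.dev/MY_PROJECT/nanochat/nanochat:latest",  # placeholder
        command=["python", "vertex_pipelines/tokenizer.py"],            # placeholder
        args=["--gcs-bucket", gcs_bucket],
    )


@dsl.pipeline(name="nanochat-speedrun")
def nanochat_pipeline(gcs_bucket: str):
    # Additional steps would be chained here in the same way.
    tokenizer_step(gcs_bucket=gcs_bucket)


if __name__ == "__main__":
    compiler.Compiler().compile(nanochat_pipeline, "nanochat_pipeline.json")
```

The wrapper below, which downloads the dataset and then trains and evaluates the tokenizer, is an example of the per-step scripts such a component invokes.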
import os
import subprocess
import argparse

from nanochat.common import get_base_dir


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
    args = parser.parse_args()

    # Point the nanochat base directory at the GCS bucket so artifacts are
    # shared with the other pipeline steps.
    os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket

    # Download the dataset: the first 8 shards, then the full 240 shards.
    subprocess.run(["python", "-m", "nanochat.dataset", "-n", "8"], check=True)
    subprocess.run(["python", "-m", "nanochat.dataset", "-n", "240"], check=True)

    # Train the tokenizer.
    subprocess.run(["python", "-m", "scripts.tok_train", "--max_chars=2000000000"], check=True)

    # Evaluate the tokenizer.
    subprocess.run(["python", "-m", "scripts.tok_eval"], check=True)


if __name__ == "__main__":
    main()
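Once the Docker image is built and the pipeline spec is compiled, the run can be submitted to Vertex AI Pipelines. A minimal sketch using the `google-cloud-aiplatform` SDK follows; the project ID, region, bucket, and file names are placeholder assumptions, and the authoritative instructions are in the updated `README.md`.

```python
# Hypothetical submission script; all identifiers below are placeholders.
from google.cloud import aiplatform

aiplatform.init(
    project="my-gcp-project",                     # placeholder
    location="us-central1",                       # placeholder
    staging_bucket="gs://my-nanochat-artifacts",  # placeholder
)

job = aiplatform.PipelineJob(
    display_name="nanochat-speedrun",
    template_path="nanochat_pipeline.json",       # compiled pipeline spec
    parameter_values={"gcs_bucket": "gs://my-nanochat-artifacts"},
)
job.run(sync=False)  # submit the run and return without blocking
```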