nanochat/vertex_pipelines/tokenizer_step.py
google-labs-jules[bot] 2781d216c6 feat: Refactor nanochat to run on Vertex AI Pipelines
This refactoring enables the nanochat project to be executed as a scalable and robust pipeline on Vertex AI.

The monolithic `speedrun.sh` script has been decomposed into a series of containerized components orchestrated by a Kubeflow pipeline.

The codebase has been updated to use Google Cloud Storage for artifact management, allowing for seamless data sharing between pipeline steps.
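The switch between local storage and GCS is driven by the `NANOCHAT_BASE_DIR` environment variable, which the step below sets before running any nanochat module. A minimal sketch of how a base-dir helper might honor that override (the helper body and fallback path here are assumptions, not nanochat's actual implementation):

```python
import os

def get_base_dir() -> str:
    # Hypothetical resolution order: an explicit NANOCHAT_BASE_DIR override
    # first, falling back to a local cache directory for non-pipeline runs.
    override = os.environ.get("NANOCHAT_BASE_DIR")
    if override:
        return override  # e.g. a gs:// bucket path set by a pipeline step
    return os.path.expanduser("~/.cache/nanochat")

# With the variable set, every pipeline step resolves artifacts to the same bucket.
os.environ["NANOCHAT_BASE_DIR"] = "gs://example-bucket/nanochat"  # hypothetical bucket
print(get_base_dir())
```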

A `Dockerfile` and Python wrappers for each pipeline step have been added to the `vertex_pipelines` directory.

The `README.md` has been updated with instructions on how to build the Docker image and run the Vertex AI pipeline.
2025-11-04 01:26:51 +00:00
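Each wrapper in `vertex_pipelines` follows the same pattern as the tokenizer step below: point artifact storage at the bucket, then shell out to an existing nanochat entry point. A hedged sketch of that shared pattern (the `run_step` helper and its signature are illustrative, not part of the repo):

```python
import os
import subprocess
import sys

def run_step(module: str, *extra_args: str, gcs_bucket: str) -> None:
    """Run one nanochat module as a pipeline step, with artifacts on GCS."""
    env = dict(os.environ, NANOCHAT_BASE_DIR=gcs_bucket)
    # check=True makes the containerized step fail loudly, so the
    # orchestrator can mark the pipeline task failed and halt downstream steps.
    subprocess.run([sys.executable, "-m", module, *extra_args], env=env, check=True)

# Illustration with a stand-in stdlib module; a real step would call e.g.
# "scripts.tok_train" with a gs:// bucket path.
run_step("platform", gcs_bucket="gs://example-bucket")  # hypothetical bucket
```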


import os
import subprocess
import argparse

from nanochat.common import get_base_dir


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
    args = parser.parse_args()

    # Set the base directory to the GCS bucket.
    os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket

    # Download the dataset (first 8 shards, then the full 240).
    subprocess.run(["python", "-m", "nanochat.dataset", "-n", "8"], check=True)
    subprocess.run(["python", "-m", "nanochat.dataset", "-n", "240"], check=True)

    # Train the tokenizer.
    subprocess.run(["python", "-m", "scripts.tok_train", "--max_chars=2000000000"], check=True)

    # Evaluate the tokenizer.
    subprocess.run(["python", "-m", "scripts.tok_eval"], check=True)


if __name__ == "__main__":
    main()
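For a quick local check of the step's interface, the flag parsing can be exercised directly; a small sketch using a hypothetical bucket name:

```python
import argparse

# Mirror of the step's CLI: a single required flag naming the artifact bucket.
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")

# Hypothetical bucket path, for illustration only.
args = parser.parse_args(["--gcs-bucket", "gs://example-bucket/nanochat"])
print(args.gcs_bucket)
```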