Success on Vertex Pipelines

Nuno Pereira 2025-12-01 19:59:58 -05:00
parent a88e7ec21f
commit 13001597c2
40 changed files with 3770 additions and 264 deletions

614
demo_pipeline.json Normal file
View File

@@ -0,0 +1,614 @@
{
"components": {
"comp-custom-training-job": {
"executorLabel": "exec-custom-training-job",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-data-download-step": {
"executorLabel": "exec-data-download-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"num_shards": {
"defaultValue": 50.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
}
}
}
},
"comp-midtraining-step": {
"executorLabel": "exec-midtraining-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"vertex_experiment": {
"parameterType": "STRING"
},
"vertex_tensorboard": {
"parameterType": "STRING"
},
"wandb_run": {
"parameterType": "STRING"
}
}
}
},
"comp-report-step": {
"executorLabel": "exec-report-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
},
"comp-sft-step": {
"executorLabel": "exec-sft-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"vertex_experiment": {
"parameterType": "STRING"
},
"vertex_tensorboard": {
"parameterType": "STRING"
},
"wandb_run": {
"parameterType": "STRING"
}
}
}
},
"comp-tokenizer-step": {
"executorLabel": "exec-tokenizer-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
}
},
"deploymentSpec": {
"executors": {
"exec-custom-training-job": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-data-download-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--num-shards",
"{{$.inputs.parameters['num_shards']}}"
],
"command": [
"python",
"vertex_pipelines/data_download_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-midtraining-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/midtraining_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"accelerator": {
"count": "1",
"resourceCount": "1",
"resourceType": "NVIDIA_TESLA_A100",
"type": "NVIDIA_TESLA_A100"
},
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-report-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/report_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"cpuLimit": 2.0,
"memoryLimit": 8.0,
"resourceCpuLimit": "2",
"resourceMemoryLimit": "8G"
}
}
},
"exec-sft-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/sft_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"accelerator": {
"count": "1",
"resourceCount": "1",
"resourceType": "NVIDIA_L4",
"type": "NVIDIA_L4"
},
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-tokenizer-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/tokenizer_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
}
}
},
"pipelineInfo": {
"description": "A pipeline to train NanoChat",
"name": "nanochat-pipeline"
},
"root": {
"dag": {
"tasks": {
"custom-training-job": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job"
},
"dependentTasks": [
"tokenizer-step"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-pretraining-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"restart_job_on_worker_restart": {
"runtimeValue": {
"constant": true
}
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"timeout": {
"runtimeValue": {
"constant": "604800s"
}
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/pretraining_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:latest"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job"
}
},
"data-download-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-data-download-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"num_shards": {
"componentInputParameter": "num_data_shards"
}
}
},
"taskInfo": {
"name": "data-download-step"
}
},
"midtraining-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-midtraining-step"
},
"dependentTasks": [
"custom-training-job"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"wandb_run": {
"componentInputParameter": "wandb_run"
}
}
},
"taskInfo": {
"name": "midtraining-step"
}
},
"report-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-report-step"
},
"dependentTasks": [
"sft-step"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "report-step"
}
},
"sft-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-sft-step"
},
"dependentTasks": [
"midtraining-step"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"wandb_run": {
"componentInputParameter": "wandb_run"
}
}
},
"taskInfo": {
"name": "sft-step"
}
},
"tokenizer-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-tokenizer-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "tokenizer-step"
}
}
}
},
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"location": {
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "0s",
"isOptional": true,
"parameterType": "STRING"
},
"num_data_shards": {
"defaultValue": 20.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
},
"project": {
"parameterType": "STRING"
},
"scheduling_strategy": {
"defaultValue": "FLEX_START",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_experiment": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_tensorboard": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"wandb_run": {
"defaultValue": "dummy",
"isOptional": true,
"parameterType": "STRING"
}
}
}
},
"schemaVersion": "2.1.0",
"sdkVersion": "kfp-2.14.6"
}
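Note on the spec above: scheduling_strategy and max_wait_duration are exposed as root pipeline parameters and routed into the strategy and max_wait_duration inputs of the prebuilt CustomTrainingJobOp, which is what makes the strategy changeable at submission time. The DSL that produces this wiring lives in vertex_pipelines/pipeline.py and is not part of this diff, so the following is only a minimal, illustrative sketch of the pattern; the function name, defaults, and the omitted steps are assumptions inferred from the compiled spec.

# Illustrative sketch only -- the real vertex_pipelines/pipeline.py is not shown in this diff.
from kfp import compiler, dsl
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp

@dsl.pipeline(name="nanochat-pipeline", description="A pipeline to train NanoChat")
def nanochat_pipeline(
    project: str,
    location: str,
    gcs_bucket: str,
    scheduling_strategy: str = "FLEX_START",  # overridable at submission time
    max_wait_duration: str = "0s",            # overridable at submission time
):
    # Tokenizer, data-download, midtraining, sft, and report steps plus .after(...) dependencies omitted for brevity.
    CustomTrainingJobOp(
        display_name="nanochat-pretraining-job",
        project=project,
        location=location,
        base_output_directory=f"{gcs_bucket}/pipeline_root",
        restart_job_on_worker_restart=True,
        timeout="604800s",
        strategy=scheduling_strategy,          # resolved from the pipeline parameter at runtime
        max_wait_duration=max_wait_duration,   # resolved from the pipeline parameter at runtime
        worker_pool_specs=[{
            "machine_spec": {
                "machine_type": "a2-highgpu-8g",
                "accelerator_type": "NVIDIA_TESLA_A100",
                "accelerator_count": 8,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": "gcr.io/nzp-nanochat/nanochat:latest",
                "command": ["python", "vertex_pipelines/pretraining_step.py"],
                "args": ["--gcs-bucket", gcs_bucket],
            },
        }],
    )

compiler.Compiler().compile(nanochat_pipeline, package_path="demo_pipeline.json")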

68
demo_runtime_scheduling.sh Executable file
View File

@@ -0,0 +1,68 @@
#!/bin/bash
# Demonstration: Submitting the same compiled pipeline with different scheduling strategies
# without recompilation
set -e
echo "=== Demo: Runtime Scheduling Strategy Changes ==="
echo ""
echo "This demonstrates that we can now change scheduling strategies"
echo "without recompiling the pipeline or rebuilding the Docker image."
echo ""
# Compile the pipeline once
echo "1. Compiling pipeline (one time)..."
python3 vertex_pipelines/pipeline.py \
--gcp-project nzp-nanochat \
--gcs-bucket gs://nzp-nanochat \
--pipeline-root gs://nzp-nanochat/pipeline-root \
--region us-central1 \
--wandb-run test-run \
--vertex-experiment nanochat-experiment \
--vertex-tensorboard projects/247010501180/locations/us-central1/tensorboards/8180826106513850368 \
--accelerator-type NVIDIA_TESLA_A100 \
--accelerator-count 8 \
--preemptible true \
--num-data-shards 20 \
--service-account 247010501180-compute@developer.gserviceaccount.com \
--template_path demo_pipeline.json \
2>&1 | grep -v "^Creating\|^To use\|^View\|state:"
echo "✓ Pipeline compiled successfully"
echo ""
# Show the scheduling parameters in the compiled pipeline
echo "2. Checking compiled pipeline parameters..."
python3 -c "
import json
data = json.load(open('demo_pipeline.json'))
params = data['root']['inputDefinitions']['parameters']
print(' scheduling_strategy: default =', params['scheduling_strategy']['defaultValue'])
print(' max_wait_duration: default =', params['max_wait_duration']['defaultValue'])
"
echo ""
echo "3. Demonstrating runtime parameter override..."
echo " You can now submit this compiled pipeline with different strategies:"
echo ""
echo " Option A (DWS - wait indefinitely):"
echo " --scheduling-strategy FLEX_START --max-wait-duration 0s"
echo ""
echo " Option B (DWS - wait 1 hour):"
echo " --scheduling-strategy FLEX_START --max-wait-duration 3600s"
echo ""
echo " Option C (Standard on-demand):"
echo " --scheduling-strategy STANDARD --max-wait-duration 86400s"
echo ""
echo " Option D (Legacy Spot):"
echo " --scheduling-strategy SPOT --max-wait-duration 0s"
echo ""
echo "=== Summary ==="
echo "✓ Pipeline compilation is DECOUPLED from scheduling configuration"
echo "✓ No recompilation needed when changing FLEX_START ↔ SPOT ↔ STANDARD"
echo "✓ No Docker rebuild needed for deployment strategy changes"
echo ""
echo "To submit with a different strategy, just pass:"
echo " --scheduling-strategy <VALUE> --max-wait-duration <VALUE>"
echo "to pipeline.py or add them to run_pipeline.sh"

3
inspect_custom_job.py Normal file
View File

@@ -0,0 +1,3 @@
import google_cloud_pipeline_components.v1.custom_job as custom_job_module
print(f"Module file: {custom_job_module.__file__}")

13
inspect_dws.py Normal file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
"""Inspect CustomTrainingJobOp for DWS parameters."""
import inspect
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
print("CustomTrainingJobOp signature:")
print(inspect.signature(CustomTrainingJobOp))
print("\n" + "="*80 + "\n")
# Inspect the component spec (CustomTrainingJobOp is a prebuilt KFP component, not a plain function)
component_spec = CustomTrainingJobOp.component_spec
print("Component spec:")
print(component_spec)

7
inspect_kfp.py Normal file
View File

@@ -0,0 +1,7 @@
import inspect
from kfp import dsl
try:
print("Available methods:", [m for m in dir(dsl.PipelineTask) if 'pod' in m or 'label' in m or 'annotation' in m or 'env' in m])
except Exception as e:
print(e)

View File

@@ -153,8 +153,14 @@ def find_largest_model(checkpoint_dir):
storage_client = storage.Client()
bucket_name, prefix = checkpoint_dir[5:].split("/", 1)
bucket = storage_client.bucket(bucket_name)
if not prefix.endswith("/"):
prefix += "/"
blobs = bucket.list_blobs(prefix=prefix, delimiter='/')
model_tags = [b.name.split('/')[-2] for b in blobs.prefixes]
list(blobs) # Iterate to populate prefixes
log0(f"DEBUG: prefix={prefix}")
log0(f"DEBUG: blobs.prefixes={list(blobs.prefixes)}")
model_tags = [p.split('/')[-2] for p in blobs.prefixes]
log0(f"DEBUG: model_tags={model_tags}")
else:
# attempt to guess the model tag: take the biggest model available
model_tags = [f for f in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, f))]
@@ -218,6 +224,15 @@ def load_model(source, *args, **kwargs):
"sft": "chatsft_checkpoints",
"rl": "chatrl_checkpoints",
}[source]
base_dir = get_base_dir()
checkpoints_dir = os.path.join(base_dir, model_dir)
# Check if running in Vertex AI with GCS data directory
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
# Use GCS checkpoint directory
checkpoints_dir = data_dir.replace("/base_data", f"/{model_dir}")
else:
# Use local checkpoint directory
base_dir = get_base_dir()
checkpoints_dir = os.path.join(base_dir, model_dir)
return load_model_from_dir(checkpoints_dir, *args, **kwargs)
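Background on the list(blobs) line added in the find_largest_model hunk above: bucket.list_blobs(prefix=..., delimiter='/') returns an iterator whose prefixes set is only filled in as result pages are consumed, so the iterator must be drained before the checkpoint "subdirectories" can be read. A small standalone illustration, with a hypothetical bucket and made-up tags:

# Standalone illustration of the list_blobs/.prefixes behavior; bucket name is hypothetical.
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("example-bucket")

blobs = bucket.list_blobs(prefix="base_checkpoints/", delimiter="/")
print(blobs.prefixes)   # usually empty: no result pages fetched yet
list(blobs)             # drain the iterator so .prefixes gets populated
print(blobs.prefixes)   # e.g. {'base_checkpoints/d12/', 'base_checkpoints/d24/'}
model_tags = [p.split("/")[-2] for p in blobs.prefixes]  # e.g. ['d12', 'd24'] (order not guaranteed)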

View File

@@ -138,6 +138,18 @@ def get_dist_info():
def autodetect_device_type():
# prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU
print0(f"DEBUG: torch.cuda.is_available(): {torch.cuda.is_available()}")
if torch.cuda.is_available():
print0(f"DEBUG: torch.version.cuda: {torch.version.cuda}")
print0(f"DEBUG: torch.backends.cudnn.version(): {torch.backends.cudnn.version()}")
print0(f"DEBUG: torch.cuda.device_count(): {torch.cuda.device_count()}")
print0(f"DEBUG: torch.cuda.get_device_name(0): {torch.cuda.get_device_name(0)}")
# Print environment variables relevant to CUDA
env_vars = ["LD_LIBRARY_PATH", "PATH", "CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "NVIDIA_DRIVER_CAPABILITIES"]
for var in env_vars:
print0(f"DEBUG: env {var}: {os.environ.get(var, 'NOT SET')}")
if torch.cuda.is_available():
device_type = "cuda"
elif torch.backends.mps.is_available():
@@ -191,7 +203,116 @@ class DummyWandb:
"""Useful if we wish to not use wandb but have all the same signatures"""
def __init__(self):
pass
def init(self, *args, **kwargs):
return self
def log(self, *args, **kwargs):
pass
def finish(self):
pass
class VertexLogger:
"""Logs metrics to Vertex AI Experiments."""
def __init__(self, experiment_name, tensorboard_resource_name=None):
from google.cloud import aiplatform
self.aiplatform = aiplatform
self.experiment_name = experiment_name
self.tensorboard_resource_name = tensorboard_resource_name
self._run = None
self.summary_writer = None  # created in init(); stays None until then
def init(self, project=None, name=None, config=None, **kwargs):
# Map wandb 'project' to Vertex 'experiment'
experiment = project or self.experiment_name
self.aiplatform.init(
experiment=experiment,
experiment_tensorboard=self.tensorboard_resource_name
)
try:
self._run = self.aiplatform.start_run(run=name, resume=True)
except Exception as e:
print(f"Could not resume run {name}: {e}. Creating a new run.")
self._run = self.aiplatform.start_run(run=name, resume=False)
# Initialize a TensorBoard SummaryWriter if a TensorBoard resource is provided.
# Logs must be written to a GCS location that the TensorBoard resource can read.
# On Vertex AI Custom Jobs, the AIP_TENSORBOARD_LOG_DIR env var points at such a
# location; otherwise we fall back to 'gs://nzp-nanochat/tensorboard_logs/{name}',
# assuming the 'nzp-nanochat' bucket exists (it is hardcoded elsewhere in this repo).
try:
from torch.utils.tensorboard import SummaryWriter
import os
# Use AIP_TENSORBOARD_LOG_DIR if available (set by Vertex AI)
log_dir = os.environ.get('AIP_TENSORBOARD_LOG_DIR')
if not log_dir:
# Fallback for local runs or if env var is missing
log_dir = f"gs://nzp-nanochat/tensorboard_logs/{name}"
print(f"AIP_TENSORBOARD_LOG_DIR not found. Using fallback: {log_dir}")
self.summary_writer = SummaryWriter(log_dir=log_dir)
print(f"TensorBoard logging enabled to: {log_dir}")
except Exception as e:
print(f"Failed to initialize TensorBoard SummaryWriter: {e}")
self.summary_writer = None
if config:
self.aiplatform.log_params(config)
return self
def log(self, data, step=None):
# Only log from rank 0 to avoid concurrency conflicts with Vertex AI Experiments
import os
rank = int(os.environ.get('RANK', 0))
# Vertex AI log_metrics doesn't support 'step' directly in the same way.
# It logs a new data point.
# We must flatten the dictionary because log_metrics only accepts scalars.
def flatten(d, parent_key='', sep='.'):
items = []
for k, v in d.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, dict):
items.extend(flatten(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
flat_data = flatten(data)
# Extract step for TensorBoard if present in the data
global_step = flat_data.get('step', step if step is not None else 0)
# Only rank 0 should log to Vertex AI Experiments to prevent etag conflicts
if rank == 0:
self.aiplatform.log_metrics(flat_data)
# Log to TensorBoard from all ranks (TensorBoard can handle concurrent writes)
if self.summary_writer:
for k, v in flat_data.items():
if isinstance(v, (int, float)) and k != 'step': # Don't log 'step' as a metric
self.summary_writer.add_scalar(k, v, global_step=global_step)
self.summary_writer.flush()
def finish(self):
if self.summary_writer:
self.summary_writer.close()
self.aiplatform.end_run()
def get_experiment_logger(args):
"""Returns a logger compatible with wandb interface."""
if hasattr(args, 'wandb_run') and args.wandb_run != "dummy":
import wandb
return wandb
elif hasattr(args, 'vertex_experiment') and args.vertex_experiment:
return VertexLogger(
experiment_name=args.vertex_experiment,
tensorboard_resource_name=getattr(args, 'vertex_tensorboard', None)
)
else:
return DummyWandb()
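DummyWandb, VertexLogger, and the wandb module are intended to be interchangeable behind get_experiment_logger. The step scripts that consume this are not shown in this hunk, so the following is only an illustrative usage sketch; the import path and flag wiring are assumptions.

# Illustrative usage sketch; the real vertex_pipelines/*_step.py files are not shown here.
import argparse
from nanochat.common import get_experiment_logger  # assumed module path

parser = argparse.ArgumentParser()
parser.add_argument("--wandb-run", dest="wandb_run", default="dummy")
parser.add_argument("--vertex-experiment", dest="vertex_experiment", default="")
parser.add_argument("--vertex-tensorboard", dest="vertex_tensorboard", default="")
args = parser.parse_args()

logger = get_experiment_logger(args)            # wandb module, VertexLogger, or DummyWandb
run = logger.init(name=args.wandb_run, config={"depth": 20})
for step in range(3):
    run.log({"step": step, "train/loss": 1.0 / (step + 1)})
run.finish()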

View File

@@ -84,6 +84,8 @@ def download_single_file(index):
print(f"Skipping {filename} (already exists in GCS)")
return True
else:
# Ensure the directory exists
os.makedirs(DATA_DIR, exist_ok=True)
filepath = os.path.join(DATA_DIR, filename)
if os.path.exists(filepath):
print(f"Skipping {filepath} (already exists)")

952
nanochat_pipeline.json Normal file
View File

@@ -0,0 +1,952 @@
{
"components": {
"comp-custom-training-job": {
"executorLabel": "exec-custom-training-job",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-custom-training-job-2": {
"executorLabel": "exec-custom-training-job-2",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-custom-training-job-3": {
"executorLabel": "exec-custom-training-job-3",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-data-download-step": {
"executorLabel": "exec-data-download-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"num_shards": {
"defaultValue": 50.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
}
}
}
},
"comp-report-step": {
"executorLabel": "exec-report-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
},
"comp-tokenizer-step": {
"executorLabel": "exec-tokenizer-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
}
},
"deploymentSpec": {
"executors": {
"exec-custom-training-job": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-custom-training-job-2": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-custom-training-job-3": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-data-download-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--num-shards",
"{{$.inputs.parameters['num_shards']}}"
],
"command": [
"python",
"vertex_pipelines/data_download_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:20251128144517",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-report-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/report_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:20251128144517",
"resources": {
"cpuLimit": 2.0,
"memoryLimit": 8.0,
"resourceCpuLimit": "2",
"resourceMemoryLimit": "8G"
}
}
},
"exec-tokenizer-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/tokenizer_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:20251128144517",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
}
}
},
"pipelineInfo": {
"description": "A pipeline to train NanoChat",
"name": "nanochat-pipeline"
},
"root": {
"dag": {
"tasks": {
"custom-training-job": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job"
},
"dependentTasks": [
"tokenizer-step"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-pretraining-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--device_batch_size": {
"componentInputParameter": "device_batch_size"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"restart_job_on_worker_restart": {
"runtimeValue": {
"constant": true
}
},
"service_account": {
"componentInputParameter": "service_account"
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"timeout": {
"runtimeValue": {
"constant": "604800s"
}
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}",
"--device-batch-size",
"{{$.inputs.parameters['pipelinechannel--device_batch_size']}}"
],
"command": [
"python",
"vertex_pipelines/pretraining_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517"
},
"disk_spec": {
"boot_disk_size_gb": 500.0,
"boot_disk_type": "pd-ssd"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job"
}
},
"custom-training-job-2": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job-2"
},
"dependentTasks": [
"custom-training-job"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-midtraining-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--device_batch_size": {
"componentInputParameter": "device_batch_size"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"service_account": {
"componentInputParameter": "service_account"
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}",
"--device-batch-size",
"{{$.inputs.parameters['pipelinechannel--device_batch_size']}}"
],
"command": [
"python",
"vertex_pipelines/midtraining_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517"
},
"disk_spec": {
"boot_disk_size_gb": 500.0,
"boot_disk_type": "pd-ssd"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job-2"
}
},
"custom-training-job-3": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job-3"
},
"dependentTasks": [
"custom-training-job-2"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-sft-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"service_account": {
"componentInputParameter": "service_account"
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/sft_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517"
},
"disk_spec": {
"boot_disk_size_gb": 500.0,
"boot_disk_type": "pd-ssd"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job-3"
}
},
"data-download-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-data-download-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"num_shards": {
"componentInputParameter": "num_data_shards"
}
}
},
"taskInfo": {
"name": "data-download-step"
}
},
"report-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-report-step"
},
"dependentTasks": [
"custom-training-job-3"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "report-step"
}
},
"tokenizer-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-tokenizer-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "tokenizer-step"
}
}
}
},
"inputDefinitions": {
"parameters": {
"device_batch_size": {
"defaultValue": 8.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
},
"gcs_bucket": {
"parameterType": "STRING"
},
"location": {
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "0s",
"isOptional": true,
"parameterType": "STRING"
},
"num_data_shards": {
"defaultValue": 20.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
},
"project": {
"parameterType": "STRING"
},
"scheduling_strategy": {
"defaultValue": "FLEX_START",
"isOptional": true,
"parameterType": "STRING"
},
"service_account": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_experiment": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_tensorboard": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"wandb_run": {
"defaultValue": "dummy",
"isOptional": true,
"parameterType": "STRING"
}
}
}
},
"schemaVersion": "2.1.0",
"sdkVersion": "kfp-2.15.1"
}
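
The JSON above is the compiled KFP spec; its root `inputDefinitions` list the runtime parameters the pipeline expects. As a minimal sketch of how such a spec might be submitted with the google-cloud-aiplatform SDK — project, region, bucket, and service-account values below are placeholders, not values taken from this repo:

# Sketch: submit the compiled demo_pipeline.json to Vertex AI Pipelines.
# All project/region/bucket/service-account strings are illustrative placeholders.
from google.cloud import aiplatform

aiplatform.init(project="my-gcp-project", location="us-central1")

job = aiplatform.PipelineJob(
    display_name="nanochat-pipeline",
    template_path="demo_pipeline.json",
    pipeline_root="gs://my-bucket/pipeline_root",
    parameter_values={
        "project": "my-gcp-project",
        "location": "us-central1",
        "gcs_bucket": "gs://my-bucket",
        "device_batch_size": 8,
        "num_data_shards": 20,
        "scheduling_strategy": "FLEX_START",
    },
    enable_caching=True,
)
job.submit(service_account="my-sa@my-gcp-project.iam.gserviceaccount.com")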

View File

@ -13,12 +13,14 @@ dependencies = [
"setuptools>=80.9.0",
"tiktoken>=0.11.0",
"tokenizers>=0.22.0",
"torch>=2.8.0",
"torch>=2.5.0",
"uvicorn>=0.36.0",
"wandb>=0.21.3",
"google-cloud-storage>=2.10.0",
"kfp>=2.0.0",
"google-cloud-aiplatform>=1.25.0",
"google-cloud-storage>=2.14.0",
"kfp==2.8.0",
"google-cloud-aiplatform>=1.38.0",
"gcsfs>=2023.6.0",
"tensorboard>=2.14.0",
]
[build-system]
@ -46,35 +48,35 @@ python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
# target torch to cuda 12.8 or CPU
# target torch to cuda 12.4 or CPU
[tool.uv.sources]
torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu128", extra = "gpu" },
torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu124", extra = "gpu" },
]
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[project.optional-dependencies]
cpu = [
"torch>=2.8.0",
]
gpu = [
"torch>=2.8.0",
]
[tool.uv]
conflicts = [
[
{ extra = "cpu" },
{ extra = "gpu" },
],
]
[[tool.uv.index]]
name = "pytorch-cu124"
url = "https://download.pytorch.org/whl/cu124"
explicit = true
[project.optional-dependencies]
cpu = [
"torch>=2.5.0",
]
gpu = [
"torch>=2.5.0",
]
[tool.uv]
conflicts = [
[
{ extra = "cpu" },
{ extra = "gpu" },
],
]

View File

@ -0,0 +1,28 @@
## Base model evaluation
timestamp: 2025-11-29 02:14:42
- Model: base_model (step 21400)
- CORE metric: 0.1710
- hellaswag_zeroshot: 0.2364
- jeopardy: 0.0487
- bigbench_qa_wikidata: 0.4287
- arc_easy: 0.4815
- arc_challenge: 0.1217
- copa: 0.2800
- commonsense_qa: 0.0469
- piqa: 0.3308
- openbook_qa: 0.1173
- lambada_openai: 0.3346
- hellaswag: 0.2348
- winograd: 0.2161
- winogrande: 0.0450
- bigbench_dyck_languages: 0.1240
- agi_eval_lsat_ar: 0.0543
- bigbench_cs_algorithms: 0.3962
- bigbench_operators: 0.1381
- bigbench_repeat_copy_logic: 0.0000
- squad: 0.1213
- coqa: 0.1469
- boolq: -0.3182
- bigbench_language_identification: 0.1759

15
report/base-model-loss.md Normal file
View File

@ -0,0 +1,15 @@
## Base model loss
timestamp: 2025-11-29 01:17:40
- train bpb: 0.7200
- val bpb: 0.8992
- sample 0: <|bos|>The capital of France is Paris. The capital of the United Kingdom is London. The capital of the United
- sample 1: <|bos|>The chemical symbol of gold is Au. The symbol of gold is Au. The symbol of gold is Au.
- sample 2: <|bos|>If yesterday was Friday, then tomorrow will be Saturday.
If you are a parent, you know that your child is not ready
- sample 3: <|bos|>The opposite of hot is cold. The opposite of cold is hot. The opposite of hot is cold.
- sample 4: <|bos|>The planets of the solar system are: Earth, Venus, Mars, Jupiter, Saturn, Uranus, and Neptune. The
- sample 5: <|bos|>My favorite color is blue. I love the color blue. It is the color of the sky,
- sample 6: <|bos|>If 5*x + 3 = 13, then x is the same as 5*x + 3 = 13. If

View File

@ -0,0 +1,45 @@
## Base model training
timestamp: 2025-11-29 01:12:50
- wandb_run_name: dummy
- vertex_experiment: nanochat-experiment
- vertex_tensorboard: projects/247010501180/locations/us-central1/tensorboards/8180826106513850368
- device_type:
- depth: 20
- max_seq_len: 2048
- num_iterations: -1
- target_flops: -1.0000
- target_param_data_ratio: 20
- device_batch_size: 8
- total_batch_size: 524,288
- embedding_lr: 0.2000
- unembedding_lr: 0.0040
- weight_decay: 0.0000
- matrix_lr: 0.0200
- grad_clip: 1.0000
- warmup_ratio: 0.0000
- warmdown_ratio: 0.2000
- final_lr_frac: 0.0000
- eval_every: 250
- eval_tokens: 10,485,760
- core_metric_every: 2000
- core_metric_max_per_task: 500
- sample_every: 2000
- model_tag:
- Number of parameters: 560,988,160
- Number of FLOPs per token: 3.491758e+09
- Calculated number of iterations: 21,400
- Number of training tokens: 11,219,763,200
- Tokens : Params ratio: 20.0000
- DDP world size: 8
- warmup_ratio: 0.0000
- warmdown_ratio: 0.2000
- final_lr_frac: 0.0000
- Minimum validation bpb: 0.8985
- Final validation bpb: 0.8985
- CORE metric estimate: 0.1732
- MFU %: 0.00%
- Total training flops: 3.917670e+19
- Total training time: 0.00m
- Peak memory usage: 8574.47MiB
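
The derived quantities in this report follow directly from the parameter count, the target tokens:params ratio, and the total batch size; a quick arithmetic check using only the numbers reported above:

# Sanity-check the derived values in the report above.
num_params = 560_988_160
ratio = 20                       # target_param_data_ratio
total_batch_size = 524_288       # tokens per optimizer step
flops_per_token = 3.491758e9

train_tokens = num_params * ratio                  # 11,219,763,200
num_iterations = train_tokens // total_batch_size  # 21,400
total_flops = flops_per_token * train_tokens       # ~3.9177e19

print(train_tokens, num_iterations, f"{total_flops:.3e}")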

View File

@ -0,0 +1,23 @@
## Chat evaluation mid
timestamp: 2025-11-30 23:51:33
- source: mid
- task_name: None
- dtype: bfloat16
- temperature: 0.0000
- max_new_tokens: 512
- num_samples: 1
- top_k: 50
- batch_size: 8
- model_tag: None
- step: None
- max_problems: None
- device_type:
- ARC-Easy: 0.3847
- ARC-Challenge: 0.2944
- MMLU: 0.3079
- GSM8K: 0.0303
- HumanEval: 0.0610
- SpellingBee: 0.9688
- ChatCORE metric: 0.2293

View File

@ -0,0 +1,23 @@
## Chat evaluation sft
timestamp: 2025-12-01 16:29:17
- source: sft
- task_name: None
- dtype: bfloat16
- temperature: 0.0000
- max_new_tokens: 512
- num_samples: 1
- top_k: 50
- batch_size: 8
- model_tag: None
- step: None
- max_problems: None
- device_type:
- ARC-Easy: 0.3994
- ARC-Challenge: 0.2833
- MMLU: 0.3169
- GSM8K: 0.0417
- HumanEval: 0.0366
- SpellingBee: 0.9766
- ChatCORE metric: 0.2313

27
report/chat-sft.md Normal file
View File

@ -0,0 +1,27 @@
## Chat SFT
timestamp: 2025-12-01 14:45:18
- wandb_run_name: dummy
- vertex_experiment: nanochat-experiment
- vertex_tensorboard: projects/247010501180/locations/us-central1/tensorboards/8180826106513850368
- source: mid
- device_type:
- dtype: bfloat16
- device_batch_size: 4
- num_epochs: 1
- num_iterations: -1
- target_examples_per_step: 32
- unembedding_lr: 0.0040
- embedding_lr: 0.2000
- matrix_lr: 0.0200
- weight_decay: 0.0000
- init_lr_frac: 0.0200
- eval_every: 100
- eval_steps: 100
- eval_metrics_every: 200
- eval_metrics_max_problems: 1024
- Training rows: 22,439
- Number of iterations: 701
- Training loss: 1.1208
- Validation loss: 1.0811

24
report/midtraining.md Normal file
View File

@ -0,0 +1,24 @@
## Midtraining
timestamp: 2025-11-30 21:47:41
- wandb_run_name: dummy
- vertex_experiment: nanochat-experiment
- vertex_tensorboard: projects/247010501180/locations/us-central1/tensorboards/8180826106513850368
- device_type:
- dtype: bfloat16
- num_iterations: -1
- max_seq_len: 2048
- device_batch_size: 8
- unembedding_lr: 0.0040
- embedding_lr: 0.2000
- matrix_lr: 0.0200
- init_lr_frac: 1.0000
- weight_decay: 0.0000
- eval_every: 150
- eval_tokens: 10,485,760
- total_batch_size: 524,288
- dry_run: 0
- Number of iterations: 813
- DDP world size: 1
- Minimum validation bpb: 0.4203

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
google-cloud-pipeline-components
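
requirements.txt pulls in google-cloud-pipeline-components, which supplies the CustomTrainingJobOp behind the comp-custom-training-job components in the compiled JSON above. The pipeline definition file itself is not part of this excerpt; a rough sketch of how one such task could be authored and compiled with kfp, with argument names mirroring the component inputs in the JSON (the exact CustomTrainingJobOp keyword set depends on the installed GCPC version):

# Sketch only — not the repo's actual pipeline definition, which is not shown here.
from kfp import dsl, compiler
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp

@dsl.pipeline(name="nanochat-pipeline", description="A pipeline to train NanoChat")
def nanochat_pipeline(project: str, location: str, gcs_bucket: str,
                      scheduling_strategy: str = "FLEX_START",
                      max_wait_duration: str = "0s"):
    CustomTrainingJobOp(
        project=project,
        location=location,
        display_name="nanochat-pretraining-job",
        base_output_directory=f"{gcs_bucket}/pipeline_root",
        strategy=scheduling_strategy,
        max_wait_duration=max_wait_duration,
        timeout="604800s",
        restart_job_on_worker_restart=True,
        worker_pool_specs=[{
            "machine_spec": {"machine_type": "a2-highgpu-8g",
                             "accelerator_type": "NVIDIA_TESLA_A100",
                             "accelerator_count": 8},
            "replica_count": 1,
            "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 500},
            "container_spec": {"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517",
                               "command": ["python", "vertex_pipelines/pretraining_step.py"],
                               "args": ["--gcs-bucket", gcs_bucket]},
        }],
    )

compiler.Compiler().compile(nanochat_pipeline, "demo_pipeline.json")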

View File

@ -55,7 +55,23 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
# Download the eval bundle to disk (and unzip if needed)
if not os.path.exists(eval_bundle_dir):
download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
# Try to download from GCS first (faster and more reliable in Vertex AI)
# UPDATE: GCS copy seems corrupted, disabling for now to force S3 fallback
# try:
# import gcsfs
# # Assuming the data is in gs://nzp-nanochat/eval_bundle
# gcs_eval_bundle = os.environ.get('NANOCHAT_DATA_DIR', 'gs://nzp-nanochat').replace('base_data', 'eval_bundle')
# print0(f"Trying to download eval_bundle from GCS: {gcs_eval_bundle}")
# fs = gcsfs.GCSFileSystem()
# if fs.exists(gcs_eval_bundle):
# print0(f"Found eval_bundle in GCS, downloading...")
# fs.get(gcs_eval_bundle, eval_bundle_dir, recursive=True)
# print0(f"Downloaded eval_bundle from GCS to {eval_bundle_dir}")
# else:
# raise FileNotFoundError("Eval bundle not found in GCS")
# except Exception as e:
# print0(f"Could not download from GCS ({e}), falling back to AWS S3...")
download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
config_path = os.path.join(eval_bundle_dir, "core.yaml")
data_base_path = os.path.join(eval_bundle_dir, "eval_data")
eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")

View File

@ -21,7 +21,7 @@ import torch
from nanochat.gpt import GPT, GPTConfig
from nanochat.dataloader import tokenizing_distributed_data_loader
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type, get_experiment_logger
from nanochat.tokenizer import get_tokenizer, get_token_bytes
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.loss_eval import evaluate_bpb
@ -31,7 +31,9 @@ print_banner()
# -----------------------------------------------------------------------------
# User settings
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
wandb_run_name = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
vertex_experiment = "" # Vertex AI experiment name
vertex_tensorboard = "" # Vertex AI TensorBoard resource name
# Runtime
device_type = "" # cuda|cpu|mps (empty => autodetect good device type default, in order: CUDA > MPS > CPU)
# Model architecture
@ -74,9 +76,18 @@ autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", name=run, config=user_config)
# logging init
use_dummy_logger = (wandb_run_name == "dummy" and not vertex_experiment) or not master_process
if use_dummy_logger:
wandb_run = DummyWandb()
else:
class Args: pass
args = Args()
args.wandb_run = wandb_run_name
args.vertex_experiment = vertex_experiment
args.vertex_tensorboard = vertex_tensorboard
wandb_run = get_experiment_logger(args)
wandb_run.init(project="nanochat", name=wandb_run_name, config=user_config)
# Tokenizer will be useful for evaluation, also we need the vocab size
tokenizer = get_tokenizer()
@ -118,6 +129,25 @@ print0(f"Number of parameters: {num_params:,}")
num_flops_per_token = model.estimate_flops()
print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
# Try to resume from latest checkpoint in GCS
start_step = 0
output_dirname = model_tag if model_tag else f"d{depth}"
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
checkpoint_dir = data_dir.replace("/base_data", "/base_checkpoints") + f"/{output_dirname}"
try:
from nanochat.checkpoint_manager import find_last_step, load_checkpoint
last_step = find_last_step(checkpoint_dir)
print0(f"Found checkpoint at step {last_step} in {checkpoint_dir}, resuming...")
model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, last_step, device, load_optimizer=True)
orig_model.load_state_dict(model_data, strict=True, assign=True)
start_step = last_step
print0(f"✓ Resumed from step {start_step}")
except Exception as e:
print0(f"No checkpoint found or failed to load ({e}), starting from scratch")
start_step = 0
# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order)
assert num_iterations > 0 or target_param_data_ratio > 0 or target_flops > 0
if num_iterations > 0:
@ -178,7 +208,11 @@ smooth_train_loss = 0 # EMA of training loss
ema_beta = 0.9 # EMA decay factor
total_training_time = 0 # total wall-clock time of training
# note that we run +1 steps only so that we can eval and save at the end
for step in range(num_iterations + 1):
mfu = 0.0
val_bpb = 0.0
flops_so_far = 0.0
results = {}
for step in range(start_step, num_iterations + 1):
last_step = step == num_iterations
flops_so_far = num_flops_per_token * total_batch_size * step
@ -240,7 +274,13 @@ for step in range(num_iterations + 1):
# save checkpoint at the end of the run (only on master process)
if master_process and last_step:
output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
# Use GCS for checkpoints to ensure persistence across job failures
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
# Extract bucket and construct checkpoint path in GCS
checkpoint_dir = data_dir.replace("/base_data", "/base_checkpoints") + f"/{output_dirname}"
else:
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
save_checkpoint(
checkpoint_dir,
step,
@ -256,6 +296,31 @@ for step in range(num_iterations + 1):
}
)
# Periodic checkpointing (every 1000 steps)
if master_process and step > 0 and step % 1000 == 0:
output_dirname = model_tag if model_tag else f"d{depth}"
# Use GCS for checkpoints to ensure persistence across job failures
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
# Extract bucket and construct checkpoint path in GCS
checkpoint_dir = data_dir.replace("/base_data", "/base_checkpoints") + f"/{output_dirname}"
else:
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
save_checkpoint(
checkpoint_dir,
step,
orig_model.state_dict(),
[opt.state_dict() for opt in optimizers],
{
"step": step,
"val_bpb": val_bpb,
"model_config": model_config_kwargs,
"user_config": user_config,
"device_batch_size": device_batch_size,
"max_seq_len": max_seq_len,
}
)
if last_step:
break
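
The hunks above replace the bare wandb init with `get_experiment_logger`, whose implementation lives in nanochat/common.py and is not included in this diff. Purely as an illustration of what a wandb-compatible shim over Vertex AI Experiments could look like (hypothetical class and method names, not the repo's code):

# Hypothetical sketch: a wandb-style wrapper over Vertex AI Experiments.
# The repo's real get_experiment_logger is not shown in this diff.
from google.cloud import aiplatform

class VertexExperimentLogger:
    def __init__(self, args):
        self.run_name = args.wandb_run
        self.experiment = args.vertex_experiment
        self.tensorboard = args.vertex_tensorboard or None

    def init(self, project, name, config):
        # "project" here is the logical run project name; GCP project/location
        # are assumed to come from the environment or a prior aiplatform.init().
        aiplatform.init(experiment=self.experiment,
                        experiment_tensorboard=self.tensorboard)
        aiplatform.start_run(name)
        aiplatform.log_params({k: v for k, v in config.items()
                               if isinstance(v, (int, float, str))})

    def log(self, metrics, step=None):
        aiplatform.log_metrics({k: v for k, v in metrics.items()
                                if isinstance(v, (int, float))})

    def finish(self):
        aiplatform.end_run()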

View File

@ -17,7 +17,7 @@ import torch
import torch.distributed as dist
from contextlib import nullcontext
from nanochat.common import compute_init, compute_cleanup, get_base_dir, print0, DummyWandb, autodetect_device_type
from nanochat.common import compute_init, compute_cleanup, get_base_dir, print0, DummyWandb, autodetect_device_type, get_experiment_logger
from nanochat.checkpoint_manager import load_model
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.engine import Engine
@ -31,8 +31,10 @@ from tasks.customjson import CustomJSON
from tasks.spellingbee import SimpleSpelling, SpellingBee
# -----------------------------------------------------------------------------
# SFT Hyperparameters
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
# User settings
wandb_run_name = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
vertex_experiment = "" # Vertex AI experiment name
vertex_tensorboard = "" # Vertex AI TensorBoard resource name
# input model options
source = "mid" # base|mid , which checkpoint to load the model from (base model or midtrained model)
model_tag = None # model tag to load the model from (base model or midtrained model)
@ -68,9 +70,18 @@ master_process = ddp_rank == 0
ptdtype = torch.float32 if dtype == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-sft", name=run, config=user_config, save_code=True)
# logging init
use_dummy_logger = (wandb_run_name == "dummy" and not vertex_experiment) or not master_process
if use_dummy_logger:
wandb_run = DummyWandb()
else:
class Args: pass
args = Args()
args.wandb_run = wandb_run_name
args.vertex_experiment = vertex_experiment
args.vertex_tensorboard = vertex_tensorboard
wandb_run = get_experiment_logger(args)
wandb_run.init(project="nanochat-sft", name=wandb_run_name, config=user_config)
# Load the model and tokenizer
model, tokenizer, meta = load_model(source, device, phase="train", model_tag=model_tag, step=step)

View File

@ -16,7 +16,7 @@ import time
import wandb
import torch
from contextlib import nullcontext
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type, get_experiment_logger
from nanochat.tokenizer import get_token_bytes
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.loss_eval import evaluate_bpb
@ -31,7 +31,9 @@ from tasks.customjson import CustomJSON
from tasks.spellingbee import SimpleSpelling, SpellingBee
# -----------------------------------------------------------------------------
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
wandb_run_name = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
vertex_experiment = "" # Vertex AI experiment name
vertex_tensorboard = "" # Vertex AI TensorBoard resource name
device_type = "" # cuda|cpu|mps (empty => autodetect)
model_tag = None # model tag to load the model from (base model or midtrained model)
step = None # step to load the model from (base model or midtrained model)
@ -58,12 +60,20 @@ device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-mid", name=run, config=user_config)
# logging init
use_dummy_logger = (wandb_run_name == "dummy" and not vertex_experiment) or not master_process
if use_dummy_logger:
wandb_run = DummyWandb()
else:
class Args: pass
args = Args()
args.wandb_run = wandb_run_name
args.vertex_experiment = vertex_experiment
args.vertex_tensorboard = vertex_tensorboard
wandb_run = get_experiment_logger(args)
wandb_run.init(project="nanochat-mid", name=wandb_run_name, config=user_config)
# Load the model and tokenizer
model, tokenizer, meta = load_model("base", device, phase="train", model_tag=model_tag, step=step)
@ -170,6 +180,11 @@ def get_muon_momentum(it):
momentum = (1 - frac) * 0.85 + frac * 0.95
return momentum
def get_max_memory():
if torch.cuda.is_available():
return torch.cuda.max_memory_allocated()
return 0
# -----------------------------------------------------------------------------
# Training loop
x, y = next(train_loader) # prefetch the very first batch of data

25
test_custom_job_args.py Normal file
View File

@ -0,0 +1,25 @@
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
try:
op = CustomTrainingJobOp(
project="p",
location="l",
display_name="d",
worker_pool_specs=[],
scheduling={"strategy": "SPOT"}
)
print("Success with scheduling")
except TypeError as e:
print(f"Failed with scheduling: {e}")
try:
op = CustomTrainingJobOp(
project="p",
location="l",
display_name="d",
worker_pool_specs=[],
timeout="1s"
)
print("Success with timeout")
except TypeError as e:
print(f"Failed with timeout: {e}")

35
test_torchrun.py Normal file
View File

@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""Test torchrun command locally"""
import subprocess
import sys
# Simulate the exact command that will run
cmd = [
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.base_train",
"--depth=4",
"--device_batch_size=1",
"--num_iterations=2",
"--run=test_local",
"--vertex_experiment=",
"--vertex_tensorboard="
]
print("Testing command:")
print(" ".join(cmd))
print()
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
print("STDOUT:")
print(result.stdout[:1000])
print("\nSTDERR:")
print(result.stderr[:1000])
print(f"\nExit code: {result.returncode}")
sys.exit(result.returncode)
except subprocess.TimeoutExpired:
print("Command timed out (expected for training)")
sys.exit(0)
except Exception as e:
print(f"Error: {e}")
sys.exit(1)

202
uv.lock
View File

@ -287,18 +287,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188, upload-time = "2024-12-21T18:38:41.666Z" },
]
[[package]]
name = "click-option-group"
version = "0.5.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b9/9f/1f917934da4e07ae7715a982347e3c2179556d8a58d1108c5da3e8f09c76/click_option_group-0.5.7.tar.gz", hash = "sha256:8dc780be038712fc12c9fecb3db4fe49e0d0723f9c171d7cda85c20369be693c", size = 22110, upload-time = "2025-03-24T13:24:55.897Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/93/27/bf74dc1494625c3b14dbcdb93740defd7b8c58dae3736be8d264f2a643fb/click_option_group-0.5.7-py3-none-any.whl", hash = "sha256:96b9f52f397ef4d916f81929bd6c1f85e89046c7a401a64e72a61ae74ad35c24", size = 11483, upload-time = "2025-03-24T13:24:54.611Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
@ -529,16 +517,77 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
]
[[package]]
name = "google-api-core"
version = "2.25.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
]
dependencies = [
{ name = "google-auth", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "googleapis-common-protos", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "proto-plus", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "protobuf", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "requests", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/09/cd/63f1557235c2440fe0577acdbc32577c5c002684c58c7f4d770a92366a24/google_api_core-2.25.2.tar.gz", hash = "sha256:1c63aa6af0d0d5e37966f157a77f9396d820fba59f9e43e9415bc3dc5baff300", size = 166266, upload-time = "2025-10-03T00:07:34.778Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c8/d8/894716a5423933f5c8d2d5f04b16f052a515f78e815dab0c2c6f1fd105dc/google_api_core-2.25.2-py3-none-any.whl", hash = "sha256:e9a8f62d363dc8424a8497f4c2a47d6bcda6c16514c935629c257ab5d10210e7", size = 162489, upload-time = "2025-10-03T00:07:32.924Z" },
]
[package.optional-dependencies]
grpc = [
{ name = "grpcio", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "grpcio-status", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
[[package]]
name = "google-api-core"
version = "2.28.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
]
dependencies = [
{ name = "google-auth" },
{ name = "googleapis-common-protos" },
{ name = "proto-plus" },
{ name = "protobuf" },
{ name = "requests" },
{ name = "google-auth", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "googleapis-common-protos", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "proto-plus", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "protobuf", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "requests", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/61/da/83d7043169ac2c8c7469f0e375610d78ae2160134bf1b80634c482fa079c/google_api_core-2.28.1.tar.gz", hash = "sha256:2b405df02d68e68ce0fbc138559e6036559e685159d148ae5861013dc201baf8", size = 176759, upload-time = "2025-10-28T21:34:51.529Z" }
wheels = [
@ -547,8 +596,8 @@ wheels = [
[package.optional-dependencies]
grpc = [
{ name = "grpcio" },
{ name = "grpcio-status" },
{ name = "grpcio", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "grpcio-status", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
[[package]]
@ -571,7 +620,8 @@ version = "1.124.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "docstring-parser" },
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-bigquery" },
{ name = "google-cloud-resource-manager" },
@ -594,7 +644,8 @@ name = "google-cloud-bigquery"
version = "3.38.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-core" },
{ name = "google-resumable-media" },
@ -612,7 +663,8 @@ name = "google-cloud-core"
version = "2.5.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core" },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a6/03/ef0bc99d0e0faf4fdbe67ac445e18cdaa74824fd93cd069e7bb6548cb52d/google_cloud_core-2.5.0.tar.gz", hash = "sha256:7c1b7ef5c92311717bd05301aa1a91ffbc565673d3b0b4163a52d8413a186963", size = 36027, upload-time = "2025-10-29T23:17:39.513Z" }
@ -625,7 +677,8 @@ name = "google-cloud-resource-manager"
version = "1.15.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "grpc-google-iam-v1" },
{ name = "grpcio" },
@ -639,19 +692,20 @@ wheels = [
[[package]]
name = "google-cloud-storage"
version = "3.4.1"
version = "2.19.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core" },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-core" },
{ name = "google-crc32c" },
{ name = "google-resumable-media" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/bd/ef/7cefdca67a6c8b3af0ec38612f9e78e5a9f6179dd91352772ae1a9849246/google_cloud_storage-3.4.1.tar.gz", hash = "sha256:6f041a297e23a4b485fad8c305a7a6e6831855c208bcbe74d00332a909f82268", size = 17238203, upload-time = "2025-10-08T18:43:39.665Z" }
sdist = { url = "https://files.pythonhosted.org/packages/36/76/4d965702e96bb67976e755bed9828fa50306dca003dbee08b67f41dd265e/google_cloud_storage-2.19.0.tar.gz", hash = "sha256:cd05e9e7191ba6cb68934d8eb76054d9be4562aa89dbc4236feee4d7d51342b2", size = 5535488, upload-time = "2024-12-05T01:35:06.49Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/83/6e/b47d83d3a35231c6232566341b0355cce78fd4e6988a7343725408547b2c/google_cloud_storage-3.4.1-py3-none-any.whl", hash = "sha256:972764cc0392aa097be8f49a5354e22eb47c3f62370067fb1571ffff4a1c1189", size = 290142, upload-time = "2025-10-08T18:43:37.524Z" },
{ url = "https://files.pythonhosted.org/packages/d5/94/6db383d8ee1adf45dc6c73477152b82731fa4c4a46d9c1932cc8757e0fd4/google_cloud_storage-2.19.0-py2.py3-none-any.whl", hash = "sha256:aeb971b5c29cf8ab98445082cbfe7b161a1f48ed275822f59ed3f1524ea54fba", size = 131787, upload-time = "2024-12-05T01:35:04.736Z" },
]
[[package]]
@ -814,16 +868,16 @@ wheels = [
[[package]]
name = "grpcio-status"
version = "1.76.0"
version = "1.62.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "googleapis-common-protos" },
{ name = "grpcio" },
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3f/46/e9f19d5be65e8423f886813a2a9d0056ba94757b0c5007aa59aed1a961fa/grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd", size = 13679, upload-time = "2025-10-21T16:28:52.545Z" }
sdist = { url = "https://files.pythonhosted.org/packages/7c/d7/013ef01c5a1c2fd0932c27c904934162f69f41ca0f28396d3ffe4d386123/grpcio-status-1.62.3.tar.gz", hash = "sha256:289bdd7b2459794a12cf95dc0cb727bd4a1742c37bd823f760236c937e53a485", size = 13063, upload-time = "2024-08-06T00:37:08.003Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8c/cc/27ba60ad5a5f2067963e6a858743500df408eb5855e98be778eaef8c9b02/grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18", size = 14425, upload-time = "2025-10-21T16:28:40.853Z" },
{ url = "https://files.pythonhosted.org/packages/90/40/972271de05f9315c0d69f9f7ebbcadd83bc85322f538637d11bb8c67803d/grpcio_status-1.62.3-py3-none-any.whl", hash = "sha256:f9049b762ba8de6b1086789d8315846e094edac2c50beaf462338b301a8fd4b8", size = 14448, upload-time = "2024-08-06T00:30:15.702Z" },
]
[[package]]
@ -929,13 +983,13 @@ wheels = [
[[package]]
name = "kfp"
version = "2.14.6"
version = "2.8.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "click-option-group" },
{ name = "docstring-parser" },
{ name = "google-api-core" },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-storage" },
{ name = "kfp-pipeline-spec" },
@ -947,26 +1001,22 @@ dependencies = [
{ name = "tabulate" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3b/c1/d01724ccb7faaf3ecf2a8109de1d7eebb0afa1f292d6dcd650755b990d59/kfp-2.14.6.tar.gz", hash = "sha256:9e94ff2e74465c27393736c295b6dc478b29cf9d0264950019b5167c7c53fd2e", size = 274267, upload-time = "2025-10-13T20:08:46.072Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/46/789f883750b0f6c321450832e2f07203139716cb9422cad6f3d286298915/kfp-2.14.6-py3-none-any.whl", hash = "sha256:2d76aff91d8461e837989c2dc966c9dddaba7fcc37b7b8be4b0564282b1f613d", size = 374048, upload-time = "2025-10-13T20:08:44.275Z" },
]
sdist = { url = "https://files.pythonhosted.org/packages/51/ee/dbf636afac86c7701245ea4f424e2b38038eee51f3731e22f2777e232bbb/kfp-2.8.0.tar.gz", hash = "sha256:06ad584eecbe80318c6cd0231c95a432e91fec56f201def9d511b6e6664235ce", size = 594413, upload-time = "2024-06-22T09:03:47.265Z" }
[[package]]
name = "kfp-pipeline-spec"
version = "2.14.6"
version = "0.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7b/be/a8aa41bbe65c0578f141f615f30829e68bdc087542248d20a84316252228/kfp_pipeline_spec-2.14.6.tar.gz", hash = "sha256:a4943b0bdf6d991db35ca3a261caf77997676512970959bf9909742df58e2a87", size = 10255, upload-time = "2025-10-13T20:06:29.544Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/56/c7/a331cdb987d5c1764c309e6c9f596a695cfd8fe86ea95fc8a9fbc052cf52/kfp_pipeline_spec-2.14.6-py3-none-any.whl", hash = "sha256:82cbad2976f248f7049be37d241f1e47ecb3d99e720dfd0cab3e0881be458516", size = 9550, upload-time = "2025-10-13T20:06:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/63/0a/269a792545cf8a87a30b84bebe69a2b07c483b2887690e8f48c9a91e8060/kfp_pipeline_spec-0.3.0-py3-none-any.whl", hash = "sha256:1db84524a0a2d6c9d36e7e87e6fa0e181bf1ba1513d29dcd54f7b8822e7a52a2", size = 12598, upload-time = "2024-01-10T00:24:34.83Z" },
]
[[package]]
name = "kfp-server-api"
version = "2.14.6"
version = "2.0.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
@ -974,27 +1024,27 @@ dependencies = [
{ name = "six" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7b/9d/47f38ed0914bbf6c7e70693b805d822b0848d2f79cce0aa2addb2a7b2f67/kfp-server-api-2.14.6.tar.gz", hash = "sha256:eabf673f384186968d88cff9674cd39c655537aad1abacda78086575924d6bfc", size = 64327, upload-time = "2025-10-15T15:43:52.999Z" }
sdist = { url = "https://files.pythonhosted.org/packages/d9/4b/1b1c9286047e78ebc9de2a9d4d43921d6efb5e6550fdc38229127a03aa53/kfp-server-api-2.0.5.tar.gz", hash = "sha256:c9cfbf0e87271d3bfe96e5ecc9ffbdd6ab566bc1c9a9ddc2a39d7698a16e26ff", size = 63401, upload-time = "2023-12-08T19:21:48.908Z" }
[[package]]
name = "kubernetes"
version = "30.1.0"
version = "26.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "google-auth" },
{ name = "oauthlib" },
{ name = "python-dateutil" },
{ name = "pyyaml" },
{ name = "requests" },
{ name = "requests-oauthlib" },
{ name = "setuptools" },
{ name = "six" },
{ name = "urllib3" },
{ name = "websocket-client" },
]
sdist = { url = "https://files.pythonhosted.org/packages/82/3c/9f29f6cab7f35df8e54f019e5719465fa97b877be2454e99f989270b4f34/kubernetes-30.1.0.tar.gz", hash = "sha256:41e4c77af9f28e7a6c314e3bd06a8c6229ddd787cad684e0ab9f69b498e98ebc", size = 887810, upload-time = "2024-06-06T15:58:30.031Z" }
sdist = { url = "https://files.pythonhosted.org/packages/34/19/2f351c0eaf05234dc33a6e0ffc7894e9dedab0ff341311c5b4ba44f2d8ac/kubernetes-26.1.0.tar.gz", hash = "sha256:5854b0c508e8d217ca205591384ab58389abdae608576f9c9afc35a3c76a366c", size = 736370, upload-time = "2023-02-16T01:04:37.088Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/62/a1/2027ddede72d33be2effc087580aeba07e733a7360780ae87226f1f91bd8/kubernetes-30.1.0-py2.py3-none-any.whl", hash = "sha256:e212e8b7579031dd2e512168b617373bc1e03888d41ac4e04039240a292d478d", size = 1706042, upload-time = "2024-06-06T15:58:27.13Z" },
{ url = "https://files.pythonhosted.org/packages/74/21/ada0c5eedb678ab663f8e387734418fdd1a26be28fc919a0c32e52964047/kubernetes-26.1.0-py2.py3-none-any.whl", hash = "sha256:e3db6800abf7e36c38d2629b5cb6b74d10988ee0cba6fba45595a7cbe60c0042", size = 1446361, upload-time = "2023-02-16T01:04:34.33Z" },
]
[[package]]
@ -1057,26 +1107,26 @@ wheels = [
[[package]]
name = "maturin"
version = "1.9.4"
version = "1.9.6"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/13/7c/b11b870fc4fd84de2099906314ce45488ae17be32ff5493519a6cddc518a/maturin-1.9.4.tar.gz", hash = "sha256:235163a0c99bc6f380fb8786c04fd14dcf6cd622ff295ea3de525015e6ac40cf", size = 213647, upload-time = "2025-08-27T11:37:57.079Z" }
sdist = { url = "https://files.pythonhosted.org/packages/9a/35/c3370188492f4c139c7a318f438d01b8185c216303c49c4bc885c98b6afb/maturin-1.9.6.tar.gz", hash = "sha256:2c2ae37144811d365509889ed7220b0598487f1278c2441829c3abf56cc6324a", size = 214846, upload-time = "2025-10-07T12:45:08.408Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f2/90/0d99389eea1939116fca841cad0763600c8d3183a02a9478d066736c60e8/maturin-1.9.4-py3-none-linux_armv6l.whl", hash = "sha256:6ff37578e3f5fdbe685110d45f60af1f5a7dfce70a1e26dfe3810af66853ecae", size = 8276133, upload-time = "2025-08-27T11:37:23.325Z" },
{ url = "https://files.pythonhosted.org/packages/f4/ed/c8ec68b383e50f084bf1fa9605e62a90cd32a3f75d9894ed3a6e5d4cc5b3/maturin-1.9.4-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f3837bb53611b2dafa1c090436c330f2d743ba305ef00d8801a371f4495e7e1b", size = 15994496, upload-time = "2025-08-27T11:37:27.092Z" },
{ url = "https://files.pythonhosted.org/packages/84/4e/401ff5f3cfc6b123364d4b94379bf910d7baee32c9c95b72784ff2329357/maturin-1.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:4227d627d8e3bfe45877a8d65e9d8351a9d01434549f0da75d2c06a1b570de58", size = 8362228, upload-time = "2025-08-27T11:37:31.181Z" },
{ url = "https://files.pythonhosted.org/packages/51/8e/c56176dd360da9650c62b8a5ecfb85432cf011e97e46c186901e6996002e/maturin-1.9.4-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:1bb2aa0fa29032e9c5aac03ac400396ddea12cadef242f8967e9c8ef715313a1", size = 8271397, upload-time = "2025-08-27T11:37:33.672Z" },
{ url = "https://files.pythonhosted.org/packages/d2/46/001fcc5c6ad509874896418d6169a61acd619df5b724f99766308c44a99f/maturin-1.9.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:a0868d52934c8a5d1411b42367633fdb5cd5515bec47a534192282167448ec30", size = 8775625, upload-time = "2025-08-27T11:37:35.86Z" },
{ url = "https://files.pythonhosted.org/packages/b4/2e/26fa7574f01c19b7a74680fd70e5bae2e8c40fed9683d1752e765062cc2b/maturin-1.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:68b7b833b25741c0f553b78e8b9e095b31ae7c6611533b3c7b71f84c2cb8fc44", size = 8051117, upload-time = "2025-08-27T11:37:38.278Z" },
{ url = "https://files.pythonhosted.org/packages/73/ee/ca7308832d4f5b521c1aa176d9265f6f93e0bd1ad82a90fd9cd799f6b28c/maturin-1.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:08dc86312afee55af778af919818632e35d8d0464ccd79cb86700d9ea560ccd7", size = 8132122, upload-time = "2025-08-27T11:37:40.499Z" },
{ url = "https://files.pythonhosted.org/packages/45/e8/c623955da75e801a06942edf1fdc4e772a9e8fbc1ceebbdc85d59584dc10/maturin-1.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:ef20ffdd943078c4c3699c29fb2ed722bb6b4419efdade6642d1dbf248f94a70", size = 10586762, upload-time = "2025-08-27T11:37:42.718Z" },
{ url = "https://files.pythonhosted.org/packages/3c/4b/19ad558fdf54e151b1b4916ed45f1952ada96684ee6db64f9cd91cabec09/maturin-1.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:368e958468431dfeec80f75eea9639b4356d8c42428b0128444424b083fecfb0", size = 8926988, upload-time = "2025-08-27T11:37:45.492Z" },
{ url = "https://files.pythonhosted.org/packages/7e/27/153ad15eccae26921e8a01812da9f3b7f9013368f8f92c36853f2043b2a3/maturin-1.9.4-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:273f879214f63f79bfe851cd7d541f8150bdbfae5dfdc3c0c4d125d02d1f41b4", size = 8536758, upload-time = "2025-08-27T11:37:48.213Z" },
{ url = "https://files.pythonhosted.org/packages/43/e3/f304c3bdc3fba9adebe5348d4d2dd015f1152c0a9027aaf52cae0bb182c8/maturin-1.9.4-py3-none-win32.whl", hash = "sha256:ed2e54d132ace7e61829bd49709331007dd9a2cc78937f598aa76a4f69b6804d", size = 7265200, upload-time = "2025-08-27T11:37:50.881Z" },
{ url = "https://files.pythonhosted.org/packages/14/14/f86d0124bf1816b99005c058a1dbdca7cb5850d9cf4b09dcae07a1bc6201/maturin-1.9.4-py3-none-win_amd64.whl", hash = "sha256:8e450bb2c9afdf38a0059ee2e1ec2b17323f152b59c16f33eb9c74edaf1f9f79", size = 8237391, upload-time = "2025-08-27T11:37:53.23Z" },
{ url = "https://files.pythonhosted.org/packages/3f/25/8320fc2591e45b750c3ae71fa596b47aefa802d07d6abaaa719034a85160/maturin-1.9.4-py3-none-win_arm64.whl", hash = "sha256:7a6f980a9b67a5c13c844c268eabd855b54a6a765df4b4bb07d15a990572a4c9", size = 6988277, upload-time = "2025-08-27T11:37:55.429Z" },
{ url = "https://files.pythonhosted.org/packages/55/5c/b435418ba4ba2647a1f7a95d53314991b1e556e656ae276dea993c3bce1d/maturin-1.9.6-py3-none-linux_armv6l.whl", hash = "sha256:26e3ab1a42a7145824210e9d763f6958f2c46afb1245ddd0bab7d78b1f59bb3f", size = 8134483, upload-time = "2025-10-07T12:44:44.274Z" },
{ url = "https://files.pythonhosted.org/packages/4d/1c/8e58eda6601f328b412cdeeaa88a9b6a10e591e2a73f313e8c0154d68385/maturin-1.9.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5263dda3f71feef2e4122baf5c4620e4b3710dbb7f2121f85a337182de214369", size = 15776470, upload-time = "2025-10-07T12:44:47.476Z" },
{ url = "https://files.pythonhosted.org/packages/6c/33/8c967cce6848cdd87a2e442c86120ac644b80c5ed4c32e3291bde6a17df8/maturin-1.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fe78262c2800c92f67d1ce3c0f6463f958a692cc67bfb572e5dbf5b4b696a8ba", size = 8226557, upload-time = "2025-10-07T12:44:49.844Z" },
{ url = "https://files.pythonhosted.org/packages/58/bd/3e2675cdc8b7270700ba30c663c852a35694441732a107ac30ebd6878bd8/maturin-1.9.6-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:7ab827c6e8c022eb2e1e7fb6deede54549c8460b20ccc2e9268cc6e8cde957a8", size = 8166544, upload-time = "2025-10-07T12:44:51.396Z" },
{ url = "https://files.pythonhosted.org/packages/58/1f/a2047ddf2230e700d5f8a13dd4b9af5ce806ad380c32e58105888205926e/maturin-1.9.6-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:0246202377c49449315305209f45c8ecef6e2d6bd27a04b5b6f1ab3e4ea47238", size = 8641010, upload-time = "2025-10-07T12:44:53.658Z" },
{ url = "https://files.pythonhosted.org/packages/be/1f/265d63c7aa6faf363d4a3f23396f51bc6b4d5c7680a4190ae68dba25dea2/maturin-1.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:f5bac167700fbb6f8c8ed1a97b494522554b4432d7578e11403b894b6a91d99f", size = 7965945, upload-time = "2025-10-07T12:44:55.248Z" },
{ url = "https://files.pythonhosted.org/packages/4c/ca/a8e61979ccfe080948bcc1bddd79356157aee687134df7fb013050cec783/maturin-1.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:7f53d3b1d8396d3fea3e1ee5fd37558bca5719090f3d194ba1c02b0b56327ae3", size = 7978820, upload-time = "2025-10-07T12:44:56.919Z" },
{ url = "https://files.pythonhosted.org/packages/bf/4a/81b412f8ad02a99801ef19ec059fba0822d1d28fb44cb6a92e722f05f278/maturin-1.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:7f506eb358386d94d6ec3208c003130cf4b69cab26034fc0cbbf8bf83afa4c2e", size = 10452064, upload-time = "2025-10-07T12:44:58.232Z" },
{ url = "https://files.pythonhosted.org/packages/5b/12/cc96c7a8cb51d8dcc9badd886c361caa1526fba7fa69d1e7892e613b71d4/maturin-1.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2d6984ab690af509f525dbd2b130714207c06ebb14a5814edbe1e42b17ae0de", size = 8852401, upload-time = "2025-10-07T12:44:59.8Z" },
{ url = "https://files.pythonhosted.org/packages/51/8e/653ac3c9f2c25cdd81aefb0a2d17ff140ca5a14504f5e3c7f94dcfe4dbb7/maturin-1.9.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5c2252b0956bb331460ac750c805ddf0d9b44442449fc1f16e3b66941689d0bc", size = 8425057, upload-time = "2025-10-07T12:45:01.711Z" },
{ url = "https://files.pythonhosted.org/packages/db/29/f13490328764ae9bfc1da55afc5b707cebe4fa75ad7a1573bfa82cfae0c6/maturin-1.9.6-py3-none-win32.whl", hash = "sha256:f2c58d29ebdd4346fd004e6be213d071fdd94a77a16aa91474a21a4f9dbf6309", size = 7165956, upload-time = "2025-10-07T12:45:03.766Z" },
{ url = "https://files.pythonhosted.org/packages/db/9f/dd51e5ac1fce47581b8efa03d77a03f928c0ef85b6e48a61dfa37b6b85a2/maturin-1.9.6-py3-none-win_amd64.whl", hash = "sha256:1b39a5d82572c240d20d9e8be024d722dfb311d330c5e28ddeb615211755941a", size = 8145722, upload-time = "2025-10-07T12:45:05.487Z" },
{ url = "https://files.pythonhosted.org/packages/65/f2/e97aaba6d0d78c5871771bf9dd71d4eb8dac15df9109cf452748d2207412/maturin-1.9.6-py3-none-win_arm64.whl", hash = "sha256:ac02a30083553d2a781c10cd6f5480119bf6692fd177e743267406cad2ad198c", size = 6857006, upload-time = "2025-10-07T12:45:06.813Z" },
]
[[package]]
@ -1254,7 +1304,7 @@ requires-dist = [
{ name = "files-to-prompt", specifier = ">=0.6" },
{ name = "google-cloud-aiplatform", specifier = ">=1.25.0" },
{ name = "google-cloud-storage", specifier = ">=2.10.0" },
{ name = "kfp", specifier = ">=2.0.0" },
{ name = "kfp", specifier = "==2.8.0" },
{ name = "psutil", specifier = ">=7.1.0" },
{ name = "regex", specifier = ">=2025.9.1" },
{ name = "setuptools", specifier = ">=80.9.0" },
@ -1702,16 +1752,16 @@ wheels = [
[[package]]
name = "protobuf"
version = "6.32.0"
version = "4.25.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/c0/df/fb4a8eeea482eca989b51cffd274aac2ee24e825f0bf3cbce5281fa1567b/protobuf-6.32.0.tar.gz", hash = "sha256:a81439049127067fc49ec1d36e25c6ee1d1a2b7be930675f919258d03c04e7d2", size = 440614, upload-time = "2025-08-14T21:21:25.015Z" }
sdist = { url = "https://files.pythonhosted.org/packages/df/01/34c8d2b6354906d728703cb9d546a0e534de479e25f1b581e4094c4a85cc/protobuf-4.25.8.tar.gz", hash = "sha256:6135cf8affe1fc6f76cced2641e4ea8d3e59518d1f24ae41ba97bcad82d397cd", size = 380920, upload-time = "2025-05-28T14:22:25.153Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/33/18/df8c87da2e47f4f1dcc5153a81cd6bca4e429803f4069a299e236e4dd510/protobuf-6.32.0-cp310-abi3-win32.whl", hash = "sha256:84f9e3c1ff6fb0308dbacb0950d8aa90694b0d0ee68e75719cb044b7078fe741", size = 424409, upload-time = "2025-08-14T21:21:12.366Z" },
{ url = "https://files.pythonhosted.org/packages/e1/59/0a820b7310f8139bd8d5a9388e6a38e1786d179d6f33998448609296c229/protobuf-6.32.0-cp310-abi3-win_amd64.whl", hash = "sha256:a8bdbb2f009cfc22a36d031f22a625a38b615b5e19e558a7b756b3279723e68e", size = 435735, upload-time = "2025-08-14T21:21:15.046Z" },
{ url = "https://files.pythonhosted.org/packages/cc/5b/0d421533c59c789e9c9894683efac582c06246bf24bb26b753b149bd88e4/protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d52691e5bee6c860fff9a1c86ad26a13afbeb4b168cd4445c922b7e2cf85aaf0", size = 426449, upload-time = "2025-08-14T21:21:16.687Z" },
{ url = "https://files.pythonhosted.org/packages/ec/7b/607764ebe6c7a23dcee06e054fd1de3d5841b7648a90fd6def9a3bb58c5e/protobuf-6.32.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:501fe6372fd1c8ea2a30b4d9be8f87955a64d6be9c88a973996cef5ef6f0abf1", size = 322869, upload-time = "2025-08-14T21:21:18.282Z" },
{ url = "https://files.pythonhosted.org/packages/40/01/2e730bd1c25392fc32e3268e02446f0d77cb51a2c3a8486b1798e34d5805/protobuf-6.32.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:75a2aab2bd1aeb1f5dc7c5f33bcb11d82ea8c055c9becbb41c26a8c43fd7092c", size = 322009, upload-time = "2025-08-14T21:21:19.893Z" },
{ url = "https://files.pythonhosted.org/packages/9c/f2/80ffc4677aac1bc3519b26bc7f7f5de7fce0ee2f7e36e59e27d8beb32dd1/protobuf-6.32.0-py3-none-any.whl", hash = "sha256:ba377e5b67b908c8f3072a57b63e2c6a4cbd18aea4ed98d2584350dbf46f2783", size = 169287, upload-time = "2025-08-14T21:21:23.515Z" },
{ url = "https://files.pythonhosted.org/packages/45/ff/05f34305fe6b85bbfbecbc559d423a5985605cad5eda4f47eae9e9c9c5c5/protobuf-4.25.8-cp310-abi3-win32.whl", hash = "sha256:504435d831565f7cfac9f0714440028907f1975e4bed228e58e72ecfff58a1e0", size = 392745, upload-time = "2025-05-28T14:22:10.524Z" },
{ url = "https://files.pythonhosted.org/packages/08/35/8b8a8405c564caf4ba835b1fdf554da869954712b26d8f2a98c0e434469b/protobuf-4.25.8-cp310-abi3-win_amd64.whl", hash = "sha256:bd551eb1fe1d7e92c1af1d75bdfa572eff1ab0e5bf1736716814cdccdb2360f9", size = 413736, upload-time = "2025-05-28T14:22:13.156Z" },
{ url = "https://files.pythonhosted.org/packages/28/d7/ab27049a035b258dab43445eb6ec84a26277b16105b277cbe0a7698bdc6c/protobuf-4.25.8-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:ca809b42f4444f144f2115c4c1a747b9a404d590f18f37e9402422033e464e0f", size = 394537, upload-time = "2025-05-28T14:22:14.768Z" },
{ url = "https://files.pythonhosted.org/packages/bd/6d/a4a198b61808dd3d1ee187082ccc21499bc949d639feb948961b48be9a7e/protobuf-4.25.8-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:9ad7ef62d92baf5a8654fbb88dac7fa5594cfa70fd3440488a5ca3bfc6d795a7", size = 294005, upload-time = "2025-05-28T14:22:16.052Z" },
{ url = "https://files.pythonhosted.org/packages/d6/c6/c9deaa6e789b6fc41b88ccbdfe7a42d2b82663248b715f55aa77fbc00724/protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:83e6e54e93d2b696a92cad6e6efc924f3850f82b52e1563778dfab8b355101b0", size = 294924, upload-time = "2025-05-28T14:22:17.105Z" },
{ url = "https://files.pythonhosted.org/packages/0c/c1/6aece0ab5209981a70cd186f164c133fdba2f51e124ff92b73de7fd24d78/protobuf-4.25.8-py3-none-any.whl", hash = "sha256:15a0af558aa3b13efef102ae6e4f3efac06f1eea11afb3a57db2901447d9fb59", size = 156757, upload-time = "2025-05-28T14:22:24.135Z" },
]
[[package]]
@ -2097,14 +2147,14 @@ wheels = [
[[package]]
name = "requests-toolbelt"
version = "1.0.0"
version = "0.10.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f3/61/d7545dafb7ac2230c70d38d31cbfe4cc64f7144dc41f6e4e4b78ecd9f5bb/requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6", size = 206888, upload-time = "2023-05-01T04:11:33.229Z" }
sdist = { url = "https://files.pythonhosted.org/packages/0c/4c/07f01c6ac44f7784fa399137fbc8d0cdc1b5d35304e8c0f278ad82105b58/requests-toolbelt-0.10.1.tar.gz", hash = "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d", size = 208956, upload-time = "2022-10-25T03:14:58.576Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" },
{ url = "https://files.pythonhosted.org/packages/05/d3/bf87a36bff1cb88fd30a509fd366c70ec30676517ee791b2f77e0e29817a/requests_toolbelt-0.10.1-py2.py3-none-any.whl", hash = "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7", size = 54525, upload-time = "2022-10-25T03:14:55.289Z" },
]
[[package]]
@ -2632,11 +2682,11 @@ wheels = [
[[package]]
name = "urllib3"
version = "2.5.0"
version = "1.26.20"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" }
sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" },
{ url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" },
]
[[package]]

69
verify_runtime_params.py Normal file
View File

@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""
Verify that scheduling_strategy and max_wait_duration are runtime parameters
"""
import json
import sys
def verify_runtime_parameters():
print("=== Verifying Runtime Scheduling Parameters ===\n")
# Load compiled pipeline
try:
with open('nanochat_pipeline.json', 'r') as f:
pipeline = json.load(f)
except FileNotFoundError:
print("❌ Error: nanochat_pipeline.json not found")
print(" Run: python3 vertex_pipelines/pipeline.py --gcp-project nzp-nanochat ...")
return False
# Check root input parameters
root_params = pipeline['root']['inputDefinitions']['parameters']
print("✓ Pipeline root parameters:")
for param in ['scheduling_strategy', 'max_wait_duration']:
if param in root_params:
info = root_params[param]
print(f"{param}:")
print(f" Type: {info['parameterType']}")
print(f" Default: {info.get('defaultValue', 'N/A')}")
print(f" Optional: {info.get('isOptional', False)}")
else:
print(f" ❌ Missing: {param}")
return False
print()
# Check custom-training-job task parameters
custom_job_task = pipeline['root']['dag']['tasks']['custom-training-job']
task_params = custom_job_task['inputs']['parameters']
print("✓ Custom Job task parameter bindings:")
for param in ['strategy', 'max_wait_duration']:
if param in task_params:
binding = task_params[param]
if 'componentInputParameter' in binding:
print(f"{param}{binding['componentInputParameter']}")
elif 'runtimeValue' in binding:
print(f"{param} → runtime constant (not parameterized!)")
return False
else:
print(f" ❌ Missing: {param}")
return False
print()
print("=== Verification Summary ===")
print("✅ scheduling_strategy is a RUNTIME parameter")
print("✅ max_wait_duration is a RUNTIME parameter")
print("✅ Both are correctly bound to Custom Job inputs")
print()
print("Benefits:")
print(" • No recompilation needed to change FLEX_START ↔ SPOT ↔ STANDARD")
print(" • No Docker rebuild needed for deployment strategy changes")
print(" • Single pipeline JSON can be reused with different strategies")
print()
return True
if __name__ == "__main__":
success = verify_runtime_parameters()
sys.exit(0 if success else 1)

View File

@ -1,5 +1,6 @@
# Use the official Python 3.10 image.
FROM python:3.10-slim
# Use Google Cloud's Deep Learning Container for PyTorch with GPU support
# This image is optimized for Vertex AI and includes CUDA, cuDNN, and PyTorch
FROM gcr.io/deeplearning-platform-release/pytorch-gpu.2-2.py310
# Set the working directory.
WORKDIR /app
@ -20,6 +21,9 @@ RUN uv venv
# Install Python dependencies using uv.
RUN uv sync --extra gpu
# Install the nanochat package in editable mode
RUN uv pip install -e .
# Install maturin, which is a build dependency.
RUN uv pip install maturin
@ -27,5 +31,8 @@ RUN uv pip install maturin
# The maturin executable from the venv should be on the PATH now.
RUN maturin develop --release --manifest-path rustbpe/Cargo.toml
# Set the entrypoint.
ENTRYPOINT ["python"]
# Set PYTHONPATH to include /app so that nanochat module can be imported
ENV PYTHONPATH="/app:${PYTHONPATH}"
# Set the entrypoint to use the virtual environment's Python.
ENTRYPOINT ["/app/.venv/bin/python"]

View File

@ -0,0 +1,135 @@
#!/bin/bash
PROJECT="nzp-nanochat"
MACHINE_TYPE="g2-standard-4" # Smallest L4 machine type
IMAGE_FAMILY="debian-12"
IMAGE_PROJECT="debian-cloud"
# Parse debug flag
DEBUG=false
if [[ "${1:-}" == "--debug" ]]; then
DEBUG=true
echo "Debug mode enabled - will show GCP error messages"
echo ""
fi
echo "=== Testing L4 GPU Availability Across ALL Global Regions ==="
echo "This will attempt to create small L4 instances and immediately delete them"
echo "Order: US -> Europe -> Others. Stops at first success."
echo ""
# Get all regions dynamically
echo "Fetching all GCP regions..."
ALL_REGIONS=$(gcloud compute regions list --project="$PROJECT" --format="value(name)" 2>/dev/null | sort)
REGION_COUNT=$(echo "$ALL_REGIONS" | wc -l | tr -d ' ')
echo "Found $REGION_COUNT regions to test"
echo ""
RESULTS_FILE=$(mktemp)
ERROR_LOG=$(mktemp)
# Order regions: US first, then Europe, then others
ordered_regions=$(echo "$ALL_REGIONS" | tr ' ' '\n' | grep '^us-' || true)
ordered_regions+=$'\n'
ordered_regions+=$(echo "$ALL_REGIONS" | tr ' ' '\n' | grep '^europe-' || true)
ordered_regions+=$'\n'
ordered_regions+=$(echo "$ALL_REGIONS" | tr ' ' '\n' | grep -vE '^(us-|europe-)' || true)
# Remove empty lines
ordered_regions=$(echo "$ordered_regions" | sed '/^$/d')
current=0
found_any=false
# Iterate over ordered list
for region in $ordered_regions; do
current=$((current + 1))
echo "[$current/$REGION_COUNT] Testing region: $region"
# Get zones for region
zones=$(gcloud compute zones list --project="$PROJECT" --filter="region:$region" --format="value(name)" 2>/dev/null)
if [ -z "$zones" ]; then
echo " ⚠️ No zones found for region $region"
continue
fi
found_capacity=false
available_zone=""
for zone in $zones; do
echo -n " Checking zone $zone... "
instance_name="test-l4-capacity-$$-$(date +%s)"
# Try to create instance - capture stderr
error_output=$(mktemp)
if gcloud compute instances create "$instance_name" \
--zone="$zone" \
--machine-type="$MACHINE_TYPE" \
--accelerator="type=nvidia-l4,count=1" \
--image-family="$IMAGE_FAMILY" \
--image-project="$IMAGE_PROJECT" \
--boot-disk-size=200GB \
--boot-disk-type=pd-standard \
--network="nanochat-network" \
--no-address \
--shielded-secure-boot \
--maintenance-policy=TERMINATE \
--project="$PROJECT" \
--quiet \
2>"$error_output"; then
echo "✅ AVAILABLE"
available_zone="$zone"
found_capacity=true
# Delete instance
gcloud compute instances delete "$instance_name" --zone="$zone" --project="$PROJECT" --quiet 2>/dev/null || true
rm -f "$error_output"
break
else
echo "❌ No capacity"
if [ "$DEBUG" = true ]; then
echo " ERROR DETAILS:"
sed 's/^/ /' "$error_output"
cat "$error_output" >> "$ERROR_LOG"
fi
rm -f "$error_output"
fi
done
if [ "$found_capacity" = true ]; then
echo "$region: ✅ Available in $available_zone" >> "$RESULTS_FILE"
echo ""
echo "✅ Found capacity in $region ($available_zone). Stopping further checks."
found_any=true
break
else
echo "$region: ❌ No capacity in any zone" >> "$RESULTS_FILE"
fi
echo ""
done
# Print summary (will only contain up to first successful region)
echo "=========================================================="
echo " L4 GPU AVAILABILITY SUMMARY (GLOBAL) "
echo "=========================================================="
cat "$RESULTS_FILE" | sort
echo "=========================================================="
echo ""
if [ "$found_any" = true ]; then
echo "✅ Recommendation: Use the region marked with ✅ above."
else
echo "❌ No L4 capacity found in any tested region."
fi
# Cleanup
rm -f "$RESULTS_FILE"
if [ "$DEBUG" = true ]; then
echo "Debug log: $ERROR_LOG"
else
rm -f "$ERROR_LOG"
fi
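Creating and deleting instances is the definitive capacity probe, but a cheaper first pass is to list which zones even offer the accelerator. A minimal sketch (same project assumed) that shells out to gcloud compute accelerator-types list; note this only shows where nvidia-l4 exists, not whether capacity is currently free:

# Sketch: list zones that advertise nvidia-l4 before running the capacity probe above.
import subprocess

out = subprocess.run(
    ["gcloud", "compute", "accelerator-types", "list",
     "--project", "nzp-nanochat",
     "--filter", "name=nvidia-l4",
     "--format", "value(zone)"],
    check=True, capture_output=True, text=True,
).stdout
zones = sorted(set(out.split()))
print(f"nvidia-l4 offered in {len(zones)} zones:")
for z in zones:
    print(" ", z)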

View File

@ -3,9 +3,9 @@ steps:
args:
- 'build'
- '-t'
- 'gcr.io/$PROJECT_ID/nanochat:latest'
- '$_IMAGE_NAME'
- '.'
- '-f'
- 'vertex_pipelines/Dockerfile'
images:
- 'gcr.io/$PROJECT_ID/nanochat:latest'
- '$_IMAGE_NAME'

View File

@ -0,0 +1,127 @@
#!/usr/bin/env python3
"""
Data download step for Vertex AI Pipeline.
Downloads training data shards from HuggingFace and uploads to GCS.
"""
import argparse
import os
import subprocess
import tempfile
from google.cloud import storage
def download_and_upload_data(gcs_bucket: str, num_shards: int = 50):
"""
Download training data shards and upload to GCS.
Args:
gcs_bucket: GCS bucket path (e.g., 'gs://nzp-nanochat')
num_shards: Number of parquet shards to download (default: 50 for testing)
"""
# Extract bucket name from gs:// path
bucket_name = gcs_bucket.replace("gs://", "").split("/")[0]
prefix = "/".join(gcs_bucket.replace("gs://", "").split("/")[1:]) if "/" in gcs_bucket.replace("gs://", "") else ""
# Check if data already exists
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_data_path = f"{prefix}/base_data" if prefix else "base_data"
blobs = list(bucket.list_blobs(prefix=gcs_data_path))
parquet_blobs = [b for b in blobs if b.name.endswith('.parquet')]
if len(parquet_blobs) >= num_shards:
print(f"Found {len(parquet_blobs)} parquet files in gs://{bucket_name}/{gcs_data_path}")
print(f"Skipping download as {num_shards} shards were requested and sufficient data exists.")
return
# Create temporary directory for downloads
with tempfile.TemporaryDirectory() as temp_dir:
print(f"Downloading {num_shards} data shards to {temp_dir}...")
local_data_dir = os.path.join(temp_dir, "base_data")
os.makedirs(local_data_dir, exist_ok=True)
# Set environment variable for nanochat dataset module
os.environ["NANOCHAT_DATA_DIR"] = local_data_dir
# Download data using nanochat's dataset module
print(f"Running: python -m nanochat.dataset -n {num_shards}")
subprocess.run([
"python", "-m", "nanochat.dataset", "-n", str(num_shards)
], check=True)
# Upload to GCS
print(f"Uploading data to gs://{bucket_name}/{prefix}/base_data/...")
# Upload all parquet files
parquet_files = [f for f in os.listdir(local_data_dir) if f.endswith('.parquet')]
print(f"Found {len(parquet_files)} parquet files to upload")
for i, filename in enumerate(parquet_files):
local_path = os.path.join(local_data_dir, filename)
gcs_path = f"{prefix}/base_data/{filename}" if prefix else f"base_data/{filename}"
blob = bucket.blob(gcs_path)
print(f"Uploading {i+1}/{len(parquet_files)}: {filename}")
blob.upload_from_filename(local_path)
print(f"Successfully uploaded {len(parquet_files)} data shards to GCS")
# Verify upload
gcs_data_path = f"gs://{bucket_name}/{prefix}/base_data" if prefix else f"gs://{bucket_name}/base_data"
print(f"Data is now available at: {gcs_data_path}")
# Download and upload eval bundle
print("Downloading eval bundle from Karpathy's S3...")
import urllib.request
import zipfile
eval_bundle_url = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
eval_bundle_path = "/tmp/eval_bundle.zip"
eval_bundle_extracted = "/tmp/eval_bundle"
urllib.request.urlretrieve(eval_bundle_url, eval_bundle_path)
print(f"Downloaded eval_bundle.zip to {eval_bundle_path}")
# Extract and upload to GCS
with zipfile.ZipFile(eval_bundle_path, 'r') as zip_ref:
zip_ref.extractall("/tmp")
# Upload eval_bundle directory to GCS
print("Uploading eval bundle to GCS...")
eval_bundle_files = []
for root, dirs, files in os.walk(eval_bundle_extracted):
for file in files:
local_file_path = os.path.join(root, file)
relative_path = os.path.relpath(local_file_path, "/tmp")
eval_bundle_files.append((local_file_path, relative_path))
for local_file_path, relative_path in eval_bundle_files:
gcs_path = f"{prefix}/{relative_path}" if prefix else relative_path
blob = bucket.blob(gcs_path)
blob.upload_from_filename(local_file_path)
print(f"Uploaded {len(eval_bundle_files)} eval bundle files to GCS")
def main():
parser = argparse.ArgumentParser(description="Download and upload training data to GCS")
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket path")
parser.add_argument("--num-shards", type=int, default=50, help="Number of data shards to download")
args = parser.parse_args()
print("=" * 80)
print("DATA DOWNLOAD STEP")
print("=" * 80)
print(f"GCS Bucket: {args.gcs_bucket}")
print(f"Number of shards: {args.num_shards}")
print("=" * 80)
download_and_upload_data(args.gcs_bucket, args.num_shards)
print("=" * 80)
print("DATA DOWNLOAD COMPLETE")
print("=" * 80)
if __name__ == "__main__":
main()
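After this step runs, the shard count in the bucket can be checked independently. A minimal sketch, assuming the same gs://<bucket>/base_data layout written above (bucket name is illustrative):

# Sketch: count parquet shards under base_data/ after the download step.
from google.cloud import storage

bucket = storage.Client().bucket("nzp-nanochat")  # assumed bucket name
shards = [b.name for b in bucket.list_blobs(prefix="base_data/") if b.name.endswith(".parquet")]
print(f"{len(shards)} parquet shards in gs://nzp-nanochat/base_data/")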

View File

@ -1,37 +1,149 @@
import os
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
parser.add_argument("--vertex-experiment", type=str, default="", help="Vertex AI experiment name")
parser.add_argument("--vertex-tensorboard", type=str, default="", help="Vertex AI TensorBoard resource name")
parser.add_argument("--device-batch-size", type=int, default=16, help="Device batch size")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Download the identity conversations dataset.
subprocess.run([
"curl", "-L", "-o",
f"{get_base_dir()}/identity_conversations.jsonl",
"https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
], check=True)
# Check if midtraining checkpoint already exists (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_mid_ckpt_path = os.path.join(prefix, "mid_checkpoints") if prefix else "mid_checkpoints"
# Check for model.pt (the key checkpoint file)
# Note: mid_train.py saves to f"d{depth}" where depth defaults to 20 (inherited from base model)
depth = 20
gcs_mid_ckpt_path = os.path.join(gcs_mid_ckpt_path, f"d{depth}")
checkpoint_exists = bucket.blob(os.path.join(gcs_mid_ckpt_path, "model.pt")).exists()
if checkpoint_exists:
print(f"✓ Midtraining checkpoint already exists in gs://{bucket_name}/{gcs_mid_ckpt_path}")
print("Skipping midtraining (already completed)")
return
# Run mid-training.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.mid_train", "--",
f"--run={args.wandb_run}"
], check=True)
print(f"Midtraining checkpoint not found. Running midtraining...")
# Evaluate the model.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.chat_eval", "--",
"-i", "mid"
], check=True)
# Set local tmp dir for temporary files
local_base_dir = "/tmp/nanochat"
os.makedirs(local_base_dir, exist_ok=True)
# Download tokenizer from GCS
print("Downloading tokenizer from GCS...")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
download_directory_from_gcs(bucket_name, gcs_tokenizer_path, local_tokenizer_dir)
# Download base checkpoints from GCS
print("Downloading base checkpoints from GCS...")
gcs_base_checkpoints_path = os.path.join(prefix, "base_checkpoints") if prefix else "base_checkpoints"
local_base_checkpoints_dir = os.path.join(local_base_dir, "base_checkpoints")
download_directory_from_gcs(bucket_name, gcs_base_checkpoints_path, local_base_checkpoints_dir)
# Download report dir from GCS
print("Downloading report dir from GCS...")
gcs_report_path = os.path.join(prefix, "report") if prefix else "report"
local_report_dir = os.path.join(local_base_dir, "report")
download_directory_from_gcs(bucket_name, gcs_report_path, local_report_dir)
# Ensure report directory exists even if nothing was downloaded
os.makedirs(local_report_dir, exist_ok=True)
try:
# Download the identity conversations dataset; midtraining needs it and
# scripts/mid_train.py does not appear to fetch it itself, so pull it into the
# local base dir here (as the previous version of this step did).
print("Downloading identity conversations...")
subprocess.run([
"curl", "-L", "-o",
f"{local_base_dir}/identity_conversations.jsonl",
"https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
], check=True)
# Mid-train the model.
print("Starting midtraining...")
env = os.environ.copy()
env["NANOCHAT_BASE_DIR"] = local_base_dir
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.mid_train",
f"--device_batch_size={args.device_batch_size}",
f"--wandb_run_name={args.wandb_run}",
f"--vertex_experiment={args.vertex_experiment}",
f"--vertex_tensorboard={args.vertex_tensorboard}"
], check=True, env=env)
# Evaluate the model.
print("Running chat_eval (mid)...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.chat_eval", "--",
"-i", "mid"
], check=True, env=env)
except subprocess.CalledProcessError as e:
print(f"Error during midtraining steps: {e}")
raise
# Upload checkpoints to GCS
print("Uploading artifacts to GCS...")
# Upload mid_checkpoints
local_checkpoints_dir = os.path.join(local_base_dir, "mid_checkpoints")
gcs_checkpoints_path = os.path.join(prefix, "mid_checkpoints") if prefix else "mid_checkpoints"
if os.path.exists(local_checkpoints_dir):
upload_directory_to_gcs(local_checkpoints_dir, bucket_name, gcs_checkpoints_path)
else:
print(f"Warning: {local_checkpoints_dir} does not exist.")
# Upload report dir
if os.path.exists(local_report_dir):
upload_directory_to_gcs(local_report_dir, bucket_name, gcs_report_path)
if __name__ == "__main__":
main()
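The checkpoint-detection pattern above (look for model.pt in GCS, skip the stage if present) also appears in the pretraining, SFT, and report steps. A minimal sketch of it factored into a shared helper; the helper name and call are illustrative, not part of this commit:

# Sketch: shared "already done?" check used by the pipeline steps.
from google.cloud import storage

def stage_checkpoint_exists(bucket_name: str, prefix: str, subdir: str, depth: int = 20) -> bool:
    """Return True if <prefix>/<subdir>/d<depth>/model.pt already exists in GCS."""
    parts = [p for p in (prefix, subdir, f"d{depth}", "model.pt") if p]
    return storage.Client().bucket(bucket_name).blob("/".join(parts)).exists()

# Example: skip midtraining if its checkpoint is already in the bucket.
if stage_checkpoint_exists("nzp-nanochat", "", "mid_checkpoints"):
    print("Midtraining already completed, skipping.")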

View File

@ -1,74 +1,325 @@
import os
import kfp
from kfp.v2 import dsl
from kfp.v2.compiler import Compiler
from kfp import dsl
from kfp.compiler import Compiler
from google.cloud import aiplatform
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
@dsl.pipeline(name="nanochat-pipeline")
def nanochat_pipeline(gcs_bucket: str, docker_image_uri: str, wandb_run: str = "dummy"):
# Global configuration for accelerator type
ACCELERATOR_TYPE = 'NVIDIA_L4'
# Read image URI from environment variable.
# This allows compiling the pipeline with a specific image without passing it as a PipelineParam,
# which avoids issues with dsl.ContainerSpec.
DOCKER_IMAGE_URI = os.environ.get("DOCKER_IMAGE_URI", "gcr.io/nzp-nanochat/nanochat:latest")
@dsl.container_component
def tokenizer_step(gcs_bucket: str) -> dsl.ContainerSpec:
"""
A Vertex AI pipeline for training and evaluating a nanochat model.
Tokenizer component.
"""
tokenizer_op = dsl.ContainerOp(
name="tokenizer",
image=docker_image_uri,
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/tokenizer_step.py"],
arguments=["--gcs-bucket", gcs_bucket],
args=["--gcs-bucket", gcs_bucket],
)
pretraining_op = dsl.ContainerOp(
name="pretraining",
image=docker_image_uri,
command=["python", "vertex_pipelines/pretraining_step.py"],
arguments=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run],
).after(tokenizer_op)
midtraining_op = dsl.ContainerOp(
name="midtraining",
image=docker_image_uri,
@dsl.container_component
def midtraining_step(gcs_bucket: str, wandb_run: str, vertex_experiment: str, vertex_tensorboard: str) -> dsl.ContainerSpec:
"""
Midtraining component.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/midtraining_step.py"],
arguments=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run],
).after(pretraining_op)
args=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run, "--vertex-experiment", vertex_experiment, "--vertex-tensorboard", vertex_tensorboard],
)
sft_op = dsl.ContainerOp(
name="sft",
image=docker_image_uri,
@dsl.container_component
def sft_step(gcs_bucket: str, wandb_run: str, vertex_experiment: str, vertex_tensorboard: str) -> dsl.ContainerSpec:
"""
SFT component.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/sft_step.py"],
arguments=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run],
).after(midtraining_op)
args=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run, "--vertex-experiment", vertex_experiment, "--vertex-tensorboard", vertex_tensorboard],
)
report_op = dsl.ContainerOp(
name="report",
image=docker_image_uri,
@dsl.container_component
def data_download_step(gcs_bucket: str, num_shards: int = 50):
"""
Data download component - downloads training data from HuggingFace to GCS.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/data_download_step.py"],
args=["--gcs-bucket", gcs_bucket, "--num-shards", str(num_shards)],
)
@dsl.container_component
def report_step(gcs_bucket: str) -> dsl.ContainerSpec:
"""
Report component.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/report_step.py"],
arguments=["--gcs-bucket", gcs_bucket],
).after(sft_op)
args=["--gcs-bucket", gcs_bucket],
)
# dsl.pipeline is a decorator that runs at module import time, so a pipeline defined at
# module level is fixed before CLI arguments are seen. To vary the accelerator
# configuration per invocation, the pipeline function is built inside a factory
# (create_pipeline_func below).
def create_pipeline_func(accelerator_type, accelerator_count, is_preemptible):
@dsl.pipeline(
name="nanochat-pipeline",
description="A pipeline to train NanoChat",
)
def nanochat_pipeline(
gcs_bucket: str,
project: str,
location: str,
wandb_run: str = "dummy",
vertex_experiment: str = "",
vertex_tensorboard: str = "",
num_data_shards: int = 20,
scheduling_strategy: str = "FLEX_START",
max_wait_duration: str = "0s",
service_account: str = "",
device_batch_size: int = 8
):
# Data download step
data_download_task = data_download_step(
gcs_bucket=gcs_bucket,
num_shards=num_data_shards
)
data_download_task.set_cpu_limit('8').set_memory_limit('32G')
# Tokenizer step
tokenizer_task = tokenizer_step(gcs_bucket=gcs_bucket)
tokenizer_task.set_cpu_limit('8').set_memory_limit('32G')
# Pretraining step using CustomTrainingJobOp
# Define worker pool specs
# Note: We use the same image and command as before
worker_pool_specs = [{
"machine_spec": {
"machine_type": "a2-highgpu-1g" if accelerator_type == "NVIDIA_TESLA_A100" and accelerator_count == 1 else "a2-highgpu-8g" if accelerator_type == "NVIDIA_TESLA_A100" and accelerator_count == 8 else "n1-standard-16", # Fallback/Logic needs to be robust
"accelerator_type": accelerator_type,
"accelerator_count": accelerator_count,
},
"replica_count": 1,
"disk_spec": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 500,
},
"container_spec": {
"image_uri": DOCKER_IMAGE_URI,
"command": ["python", "vertex_pipelines/pretraining_step.py"],
"args": [
"--gcs-bucket", gcs_bucket,
"--wandb-run", wandb_run,
"--vertex-experiment", vertex_experiment,
"--vertex-tensorboard", vertex_tensorboard,
"--device-batch-size", str(device_batch_size)
],
},
}]
# Map the accelerator type/count to a compatible machine type.
# A100 40GB: a2-highgpu-1g/2g/4g/8g; L4: g2-standard-4 (1 GPU) or g2-standard-96 (8 GPUs).
# Unrecognized combinations fall back to n1-standard-16.
machine_type = "n1-standard-16" # Default
if accelerator_type == "NVIDIA_TESLA_A100":
if accelerator_count == 1: machine_type = "a2-highgpu-1g"
elif accelerator_count == 2: machine_type = "a2-highgpu-2g"
elif accelerator_count == 4: machine_type = "a2-highgpu-4g"
elif accelerator_count == 8: machine_type = "a2-highgpu-8g"
elif accelerator_type == "NVIDIA_L4":
if accelerator_count == 1: machine_type = "g2-standard-4"
elif accelerator_count == 8: machine_type = "g2-standard-96"
worker_pool_specs[0]["machine_spec"]["machine_type"] = machine_type
# Scheduling strategy is now a runtime parameter
# Common values:
# FLEX_START: Dynamic Workload Scheduler - queues jobs when resources unavailable
# SPOT: Preemptible instances (deprecated in favor of FLEX_START)
# STANDARD: Standard on-demand instances
# max_wait_duration: "0s" = wait indefinitely, "3600s" = 1 hour, "86400s" = 24 hours
pretraining_task = CustomTrainingJobOp(
project=project,
location=location,
display_name="nanochat-pretraining-job",
worker_pool_specs=worker_pool_specs,
base_output_directory=f"{gcs_bucket}/pipeline_root",
timeout="604800s", # 7 days
restart_job_on_worker_restart=True,
strategy=scheduling_strategy,
max_wait_duration=max_wait_duration,
service_account=service_account,
tensorboard=vertex_tensorboard,
).after(tokenizer_task)
# CustomTrainingJobOp returns a Model (if configured) or just the job resource.
# We don't need to set resources/accelerators on the task itself because they are in worker_pool_specs.
# Mid-training step - use same resources as pretraining (A100s on FLEX)
mid_worker_pool_specs = [{
"machine_spec": worker_pool_specs[0]["machine_spec"],
"replica_count": 1,
"disk_spec": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 500,
},
"container_spec": {
"image_uri": DOCKER_IMAGE_URI,
"command": ["python", "vertex_pipelines/midtraining_step.py"],
"args": [
"--gcs-bucket", gcs_bucket,
"--wandb-run", wandb_run,
"--vertex-experiment", vertex_experiment,
"--vertex-tensorboard", vertex_tensorboard,
"--device-batch-size", str(device_batch_size),
],
},
}]
midtraining_task = CustomTrainingJobOp(
project=project,
location=location,
display_name="nanochat-midtraining-job",
worker_pool_specs=mid_worker_pool_specs,
base_output_directory=f"{gcs_bucket}/pipeline_root",
service_account=service_account,
strategy=scheduling_strategy,
max_wait_duration=max_wait_duration,
).after(pretraining_task)
# SFT step - use same resources as pretraining (A100s on FLEX)
sft_worker_pool_specs = [{
"machine_spec": worker_pool_specs[0]["machine_spec"],
"replica_count": 1,
"disk_spec": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 500,
},
"container_spec": {
"image_uri": DOCKER_IMAGE_URI,
"command": ["python", "vertex_pipelines/sft_step.py"],
"args": [
"--gcs-bucket", gcs_bucket,
"--wandb-run", wandb_run,
"--vertex-experiment", vertex_experiment,
"--vertex-tensorboard", vertex_tensorboard,
],
},
}]
sft_task = CustomTrainingJobOp(
project=project,
location=location,
display_name="nanochat-sft-job",
worker_pool_specs=sft_worker_pool_specs,
base_output_directory=f"{gcs_bucket}/pipeline_root",
service_account=service_account,
strategy=scheduling_strategy,
max_wait_duration=max_wait_duration,
).after(midtraining_task)
report_task = report_step(gcs_bucket=gcs_bucket).after(sft_task)
report_task.set_cpu_limit('2').set_memory_limit('8G')
return nanochat_pipeline
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--gcp-project", type=str, required=True)
parser.add_argument("--gcp-project", type=str, required=False) # Optional if we don't run it here
parser.add_argument("--gcs-bucket", type=str, required=True)
parser.add_argument("--pipeline-root", type=str, required=True)
parser.add_argument("--docker-image-uri", type=str, required=True)
parser.add_argument("--pipeline-root", type=str, required=False)
parser.add_argument("--region", type=str, default="us-central1")
parser.add_argument("--wandb-run", type=str, default="dummy")
parser.add_argument("--vertex-experiment", type=str, default="")
parser.add_argument("--vertex-tensorboard", type=str, default="")
parser.add_argument("--accelerator-type", type=str, default="NVIDIA_L4")
parser.add_argument("--accelerator-count", type=int, default=1)
parser.add_argument("--num-data-shards", type=int, default=20)
parser.add_argument("--preemptible", type=str, default="false")
parser.add_argument("--scheduling-strategy", type=str, default=None, help="Scheduling strategy: FLEX_START, SPOT, or STANDARD")
parser.add_argument("--max-wait-duration", type=str, default=None, help="Max wait duration for FLEX_START, e.g., '0s', '3600s'")
parser.add_argument("--service-account", type=str, required=False, help="Service account to run the pipeline")
parser.add_argument("--device-batch-size", type=int, default=8, help="Batch size per device")
parser.add_argument("--template_path", type=str, default="nanochat_pipeline.json")
args = parser.parse_args()
is_preemptible = args.preemptible.lower() == "true"
# Set smart defaults for scheduling strategy based on preemptible flag
if args.scheduling_strategy is None:
scheduling_strategy = "FLEX_START" if is_preemptible else "STANDARD"
else:
scheduling_strategy = args.scheduling_strategy
if args.max_wait_duration is None:
max_wait_duration = "0s" if is_preemptible else "86400s"
else:
max_wait_duration = args.max_wait_duration
# Create the pipeline function dynamically with captured arguments
pipeline_func = create_pipeline_func(
accelerator_type=args.accelerator_type,
accelerator_count=args.accelerator_count,
is_preemptible=is_preemptible
)
Compiler().compile(
pipeline_func=nanochat_pipeline,
package_path="nanochat_pipeline.json",
pipeline_func=pipeline_func,
package_path=args.template_path,
)
aiplatform.init(project=args.gcp_project, location=args.region)
job = aiplatform.PipelineJob(
display_name="nanochat-pipeline",
template_path="nanochat_pipeline.json",
pipeline_root=args.pipeline_root,
parameter_values={
"gcs_bucket": args.gcs_bucket,
"docker_image_uri": args.docker_image_uri,
},
)
job.run()
# Initialize Vertex AI SDK
if args.gcp_project:
aiplatform.init(project=args.gcp_project, location=args.region)
job = aiplatform.PipelineJob(
display_name="nanochat-pipeline",
template_path=args.template_path,
pipeline_root=args.pipeline_root,
parameter_values={
"gcs_bucket": args.gcs_bucket,
"project": args.gcp_project,
"location": args.region,
"wandb_run": args.wandb_run,
"vertex_experiment": args.vertex_experiment,
"vertex_tensorboard": args.vertex_tensorboard,
"num_data_shards": args.num_data_shards,
"scheduling_strategy": scheduling_strategy,
"max_wait_duration": max_wait_duration,
"service_account": args.service_account,
"device_batch_size": args.device_batch_size,
},
)
# Run the pipeline
# service_account is optional but recommended
job.run(
service_account=args.service_account,
sync=True # Block until completion or failure to ensure we see logs
)
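The accelerator-to-machine-type mapping inside create_pipeline_func could also live in a small lookup helper, which makes the supported combinations explicit. A sketch under the same assumptions (only the A100 and L4 shapes listed above are covered; the helper name is illustrative):

# Sketch: explicit accelerator -> machine type mapping for the worker pool specs.
_MACHINE_TYPES = {
    ("NVIDIA_TESLA_A100", 1): "a2-highgpu-1g",
    ("NVIDIA_TESLA_A100", 2): "a2-highgpu-2g",
    ("NVIDIA_TESLA_A100", 4): "a2-highgpu-4g",
    ("NVIDIA_TESLA_A100", 8): "a2-highgpu-8g",
    ("NVIDIA_L4", 1): "g2-standard-4",
    ("NVIDIA_L4", 8): "g2-standard-96",
}

def machine_type_for(accelerator_type: str, accelerator_count: int) -> str:
    """Return a machine type for the given accelerator, falling back to n1-standard-16."""
    return _MACHINE_TYPES.get((accelerator_type, accelerator_count), "n1-standard-16")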

View File

@ -1,35 +1,189 @@
import os
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
parser.add_argument("--vertex-experiment", type=str, default="", help="Vertex AI experiment name")
parser.add_argument("--vertex-tensorboard", type=str, default="", help="Vertex AI TensorBoard resource name")
parser.add_argument("--device-batch-size", type=int, default=8, help="Batch size per device")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Pre-train the d20 model.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.base_train", "--",
"--depth=20", f"--run={args.wandb_run}"
], check=True)
# Check if pretraining checkpoint already exists (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_base_ckpt_path = os.path.join(prefix, "base_checkpoints") if prefix else "base_checkpoints"
# Check for model.pt (the key checkpoint file)
# Note: base_train.py saves to f"d{depth}" where depth defaults to 20
depth = 20
gcs_base_ckpt_path = os.path.join(gcs_base_ckpt_path, f"d{depth}")
checkpoint_exists = bucket.blob(os.path.join(gcs_base_ckpt_path, "model.pt")).exists()
if checkpoint_exists:
print(f"✓ Pretraining checkpoint already exists in gs://{bucket_name}/{gcs_base_ckpt_path}")
print("Skipping pretraining (already completed)")
return
# Evaluate the model on a larger chunk of train/val data and draw some samples.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.base_loss"
], check=True)
print(f"Pretraining checkpoint not found. Running pretraining...")
# Evaluate the model on CORE tasks.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.base_eval"
], check=True)
# Set local base dir
local_base_dir = "/tmp/nanochat"
os.environ["NANOCHAT_BASE_DIR"] = local_base_dir
os.makedirs(local_base_dir, exist_ok=True)
# Set data dir to GCS so we stream/cache data there
gcs_data_path = f"gs://{bucket_name}/{prefix}/base_data" if prefix else f"gs://{bucket_name}/base_data"
# Clean up double slashes if any
gcs_data_path = gcs_data_path.replace("//base_data", "/base_data")
os.environ["NANOCHAT_DATA_DIR"] = gcs_data_path
print(f"Set NANOCHAT_DATA_DIR to {gcs_data_path}")
# Download tokenizer from GCS to local disk
print("Downloading tokenizer from GCS...")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
download_directory_from_gcs(bucket_name, gcs_tokenizer_path, local_tokenizer_dir)
try:
# Diagnostic: Check if PyTorch can see CUDA
import torch
print(f"PRE-TRAINING DIAGNOSTICS:")
print(f" torch.cuda.is_available(): {torch.cuda.is_available()}")
print(f" torch.__version__: {torch.__version__}")
if torch.cuda.is_available():
print(f" torch.version.cuda: {torch.version.cuda}")
print(f" torch.cuda.device_count(): {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
# Print environment variables
env_vars = ["LD_LIBRARY_PATH", "PATH", "CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES"]
for var in env_vars:
print(f" env {var}: {os.environ.get(var, 'NOT SET')}")
# Per-GPU batch size guidance: A100 80GB handles ~32 for good MFU (~38-40GB used),
# A100 40GB needs ~8. The default of 8 (overridable via --device-batch-size) is safe
# for the 40GB distributed case. speedrun.sh trains depth 20, as the torchrun call below does.
# Detect how many GPUs torchrun should drive.
gpu_count = torch.cuda.device_count()
print(f"Detected {gpu_count} GPUs. Configuring distributed training...")
print("Starting pretraining...")
subprocess.run([
"torchrun", "--standalone", f"--nproc_per_node={gpu_count}",
"-m", "scripts.base_train",
"--depth=20", f"--device_batch_size={args.device_batch_size}",
f"--wandb_run_name={args.wandb_run}",
f"--vertex_experiment={args.vertex_experiment}",
f"--vertex_tensorboard={args.vertex_tensorboard}"
], check=True)
# Evaluate the model on a larger chunk of train/val data and draw some samples.
print("Running base_loss evaluation...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.base_loss",
"--device_batch_size=8"
], check=True)
# Evaluate the model on CORE tasks.
print("Running base_eval...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.base_eval"
], check=True)
except subprocess.CalledProcessError as e:
print(f"Error during pretraining steps: {e}")
raise
# Upload checkpoints and report to GCS
print("Uploading artifacts to GCS...")
# Upload base_checkpoints
local_checkpoints_dir = os.path.join(local_base_dir, "base_checkpoints")
gcs_checkpoints_path = os.path.join(prefix, "base_checkpoints") if prefix else "base_checkpoints"
if os.path.exists(local_checkpoints_dir):
upload_directory_to_gcs(local_checkpoints_dir, bucket_name, gcs_checkpoints_path)
else:
print(f"Warning: {local_checkpoints_dir} does not exist.")
# Upload any remaining artifacts (e.g. the report output) by walking local_base_dir,
# skipping tokenizer, base_checkpoints (already uploaded above), and tokenized_data.
for root, dirs, files in os.walk(local_base_dir):
# Skip directories we don't want to re-upload or are empty
if "tokenizer" in dirs:
dirs.remove("tokenizer")
if "base_checkpoints" in dirs:
dirs.remove("base_checkpoints")
if "tokenized_data" in dirs:
dirs.remove("tokenized_data")
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_base_dir)
blob_path = os.path.join(prefix, relative_path) if prefix else relative_path
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
if __name__ == "__main__":
main()

View File

@ -1,18 +1,100 @@
import os
import sys
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Generate the full report.
subprocess.run(["python", "-m", "nanochat.report", "generate"], check=True)
# Check if report already exists (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_report_file = os.path.join(prefix, "report.md") if prefix else "report.md"
report_exists = bucket.blob(gcs_report_file).exists()
if report_exists:
print(f"✓ Report already exists at gs://{bucket_name}/{gcs_report_file}")
print("Skipping report generation (already completed)")
return
print(f"Report not found. Generating report...")
# Set local base dir
local_base_dir = "/tmp/nanochat"
os.environ["NANOCHAT_BASE_DIR"] = local_base_dir
os.makedirs(local_base_dir, exist_ok=True)
# Download report dir from GCS
print("Downloading report dir from GCS...")
gcs_report_path = os.path.join(prefix, "report") if prefix else "report"
local_report_dir = os.path.join(local_base_dir, "report")
download_directory_from_gcs(bucket_name, gcs_report_path, local_report_dir)
try:
# Generate the full report.
print("Generating report...")
subprocess.run([sys.executable, "-m", "nanochat.report", "generate"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error generating report: {e}")
raise
# Upload report.md to GCS
print("Uploading report to GCS...")
# report.py generates report.md in local_base_dir/report/report.md AND copies it to current dir.
# We want to upload it to the bucket root or prefix root.
local_report_file = "report.md"
if os.path.exists(local_report_file):
blob_path = os.path.join(prefix, "report.md") if prefix else "report.md"
bucket = storage.Client().bucket(bucket_name)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_report_file)
print(f"Uploaded {local_report_file} to gs://{bucket_name}/{blob_path}")
else:
print("Warning: report.md not found in current directory.")
# Also upload the report dir just in case
if os.path.exists(local_report_dir):
upload_directory_to_gcs(local_report_dir, bucket_name, gcs_report_path)
if __name__ == "__main__":
main()

84
vertex_pipelines/run_pipeline.sh Normal file → Executable file
View File

@ -1,39 +1,101 @@
#!/bin/bash
set -euo pipefail
# Check for optional flags
SKIP_BUILD=false
if [ "${1:-}" == "--skip-build" ]; then
SKIP_BUILD=true
shift
fi
# Check for required arguments
if [ "$#" -ne 1 ]; then
echo "Usage: $0 gs://YOUR_GCS_BUCKET"
if [ "$#" -lt 1 ] || [ "$#" -gt 10 ]; then
echo "Usage: $0 [--skip-build] gs://YOUR_GCS_BUCKET [WANDB_RUN_ID] [VERTEX_EXPERIMENT] [VERTEX_TENSORBOARD] [REGION] [ACCELERATOR_TYPE] [ACCELERATOR_COUNT] [NUM_DATA_SHARDS] [PREEMPTIBLE] [DEVICE_BATCH_SIZE]"
echo " REGION defaults to \$VERTEX_REGION env var or 'us-central1'"
echo " ACCELERATOR_TYPE defaults to 'NVIDIA_L4'"
echo " ACCELERATOR_COUNT defaults to 1"
echo " NUM_DATA_SHARDS defaults to 20 (number of HuggingFace data shards to download)"
echo " PREEMPTIBLE defaults to false"
echo " DEVICE_BATCH_SIZE defaults to 8"
exit 1
fi
if [[ ! "$1" =~ ^gs:// ]]; then
echo "Error: GCS bucket must be a valid gs:// path."
echo "Usage: $0 gs://YOUR_GCS_BUCKET"
echo "Usage: $0 [--skip-build] gs://YOUR_GCS_BUCKET [WANDB_RUN_ID] [VERTEX_EXPERIMENT] [VERTEX_TENSORBOARD] [REGION] [ACCELERATOR_TYPE] [ACCELERATOR_COUNT] [NUM_DATA_SHARDS] [PREEMPTIBLE] [DEVICE_BATCH_SIZE]"
exit 1
fi
GCS_BUCKET=$1
PIPELINE_ROOT="$GCS_BUCKET/pipeline-root"
GCP_PROJECT=$(gcloud config get-value project)
REGION="us-central1"
WANDB_RUN=${2:-"dummy"} # Default to "dummy" if not provided
VERTEX_EXPERIMENT=${3:-""}
VERTEX_TENSORBOARD=${4:-""}
REGION=${5:-${VERTEX_REGION:-us-central1}} # Use arg, then env var, then default
ACCELERATOR_TYPE=${6:-NVIDIA_L4}
ACCELERATOR_COUNT=${7:-1}
NUM_DATA_SHARDS=${8:-20}
PREEMPTIBLE=${9:-false}
DEVICE_BATCH_SIZE=${10:-8}
echo "Using GCP Project: $GCP_PROJECT"
echo "Using GCS Bucket: $GCS_BUCKET"
echo "Using Region: $REGION"
echo "Using Accelerator: $ACCELERATOR_TYPE"
echo "Using WANDB Run ID: $WANDB_RUN"
if [ -n "$VERTEX_EXPERIMENT" ]; then
echo "Using Vertex Experiment: $VERTEX_EXPERIMENT"
fi
if [ -n "$VERTEX_TENSORBOARD" ]; then
echo "Using Vertex TensorBoard: $VERTEX_TENSORBOARD"
fi
# Submit the build to Cloud Build and get the image URI with digest
echo "Submitting build to Cloud Build..."
IMAGE_URI=$(gcloud builds submit --config vertex_pipelines/cloudbuild.yaml --format="value(results.images[0].name)" . --project=$GCP_PROJECT)
echo "Cloud Build completed. Using image URI: $IMAGE_URI"
# Use a timestamp tag to avoid caching issues with 'latest'
if [ -z "${DOCKER_IMAGE_URI:-}" ]; then
TIMESTAMP=$(date +%Y%m%d%H%M%S)
IMAGE_URI="gcr.io/$GCP_PROJECT/nanochat:$TIMESTAMP"
else
TIMESTAMP="custom"
IMAGE_URI="$DOCKER_IMAGE_URI"
fi
if [ "$SKIP_BUILD" = false ]; then
echo "Submitting build to Cloud Build with tag $TIMESTAMP..."
gcloud builds submit --config vertex_pipelines/cloudbuild.yaml --substitutions=_IMAGE_NAME="$IMAGE_URI" . --project=$GCP_PROJECT
echo "Cloud Build completed."
else
echo "Skipping Cloud Build."
fi
echo "Using image URI: $IMAGE_URI"
# Run the Vertex AI pipeline
# Install dependencies for pipeline compilation
echo "Installing dependencies..."
if [ ! -d ".venv_pipeline" ]; then
python3 -m venv .venv_pipeline
fi
source .venv_pipeline/bin/activate
python3 -m pip install -r requirements.txt
echo "Running Vertex AI pipeline..."
python vertex_pipelines/pipeline.py \
export DOCKER_IMAGE_URI="$IMAGE_URI"
# Use the default compute service account for the project
SERVICE_ACCOUNT="247010501180-compute@developer.gserviceaccount.com"
python3 vertex_pipelines/pipeline.py \
--gcp-project "$GCP_PROJECT" \
--gcs-bucket "$GCS_BUCKET" \
--pipeline-root "$PIPELINE_ROOT" \
--docker-image-uri "$IMAGE_URI" \
--region "$REGION"
--region "$REGION" \
--wandb-run "$WANDB_RUN" \
--vertex-experiment "$VERTEX_EXPERIMENT" \
--vertex-tensorboard "$VERTEX_TENSORBOARD" \
--accelerator-type "$ACCELERATOR_TYPE" \
--accelerator-count "$ACCELERATOR_COUNT" \
--preemptible "$PREEMPTIBLE" \
--num-data-shards "$NUM_DATA_SHARDS" \
--service-account "$SERVICE_ACCOUNT" \
--device-batch-size "$DEVICE_BATCH_SIZE"
echo "Pipeline submitted."
echo "Pipeline submitted."

View File

@ -0,0 +1,34 @@
#!/bin/bash
set -e
PROJECT="nzp-nanochat"
NETWORK_NAME="nanochat-network"
echo "Setting up network resources for project $PROJECT..."
# 1. Create the VPC network (auto mode creates subnets in all regions)
if ! gcloud compute networks describe "$NETWORK_NAME" --project="$PROJECT" &>/dev/null; then
echo "Creating VPC network '$NETWORK_NAME'..."
gcloud compute networks create "$NETWORK_NAME" \
--project="$PROJECT" \
--subnet-mode=auto \
--bgp-routing-mode=global
echo "✅ Network created."
else
echo "✅ Network '$NETWORK_NAME' already exists."
fi
# 2. Create firewall rule to allow internal communication
if ! gcloud compute firewall-rules describe "${NETWORK_NAME}-allow-internal" --project="$PROJECT" &>/dev/null; then
echo "Creating firewall rule '${NETWORK_NAME}-allow-internal'..."
gcloud compute firewall-rules create "${NETWORK_NAME}-allow-internal" \
--project="$PROJECT" \
--network="$NETWORK_NAME" \
--allow=tcp,udp,icmp \
--source-ranges=10.128.0.0/9
echo "✅ Firewall rule created."
else
echo "✅ Firewall rule '${NETWORK_NAME}-allow-internal' already exists."
fi
echo "Network setup complete!"

View File

@ -0,0 +1,68 @@
#!/bin/bash
set -euo pipefail
# Usage: ./setup_resources.sh <PROJECT_ID> <REGION> <BUCKET_NAME> [EXPERIMENT_NAME] [TENSORBOARD_DISPLAY_NAME]
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <PROJECT_ID> <REGION> <BUCKET_NAME> [EXPERIMENT_NAME] [TENSORBOARD_DISPLAY_NAME]"
exit 1
fi
PROJECT_ID=$1
REGION=$2
BUCKET_NAME=$3
EXPERIMENT_NAME=${4:-"nanochat-experiment"}
TENSORBOARD_DISPLAY_NAME=${5:-"nanochat-tensorboard"}
echo "Setting up resources in Project: $PROJECT_ID, Region: $REGION"
# 1. Create GCS Bucket
echo "Checking bucket gs://$BUCKET_NAME..."
if gcloud storage buckets describe "gs://$BUCKET_NAME" --project="$PROJECT_ID" &>/dev/null; then
echo "Bucket gs://$BUCKET_NAME already exists."
else
echo "Creating bucket gs://$BUCKET_NAME..."
gcloud storage buckets create "gs://$BUCKET_NAME" --project="$PROJECT_ID" --location="$REGION" --uniform-bucket-level-access
echo "Bucket created."
fi
# 2. Create Vertex AI TensorBoard
echo "Checking for existing TensorBoard with display name: $TENSORBOARD_DISPLAY_NAME..."
EXISTING_TB=$(gcloud ai tensorboards list --region="$REGION" --project="$PROJECT_ID" --filter="displayName=$TENSORBOARD_DISPLAY_NAME" --format="value(name)" 2>/dev/null || true)
if [ -n "$EXISTING_TB" ]; then
echo "TensorBoard '$TENSORBOARD_DISPLAY_NAME' already exists: $EXISTING_TB"
TENSORBOARD_ID=$EXISTING_TB
else
echo "Creating Vertex AI TensorBoard: $TENSORBOARD_DISPLAY_NAME..."
# Create and capture the output. The output usually contains the name.
# We use --format="value(name)" to get just the resource name.
TENSORBOARD_ID=$(gcloud ai tensorboards create --display-name="$TENSORBOARD_DISPLAY_NAME" --region="$REGION" --project="$PROJECT_ID" --format="value(name)")
echo "TensorBoard created: $TENSORBOARD_ID"
fi
# 3. Create Vertex AI Experiment
echo "Creating Vertex AI Experiment: $EXPERIMENT_NAME..."
# Experiments are often implicitly created, but we can explicitly create it.
# We check if it exists first to avoid errors.
if gcloud ai experiments list --region="$REGION" --project="$PROJECT_ID" --filter="name=$EXPERIMENT_NAME" --format="value(name)" 2>/dev/null | grep -q "$EXPERIMENT_NAME"; then
echo "Experiment '$EXPERIMENT_NAME' already exists."
else
# Try to create the experiment. 'gcloud ai experiments create' may fail if the experiment already exists
# but was missed by the list filter, or if the command syntax differs across gcloud versions; let it fail gracefully.
gcloud ai experiments create --experiment="$EXPERIMENT_NAME" --region="$REGION" --project="$PROJECT_ID" || echo "Experiment creation returned status $? (might already exist)."
echo "Experiment setup complete."
fi
echo "----------------------------------------------------------------"
echo "Setup Complete!"
echo "----------------------------------------------------------------"
echo "Use the following values for run_pipeline.sh:"
echo ""
echo "GCS_BUCKET: gs://$BUCKET_NAME"
echo "VERTEX_EXPERIMENT: $EXPERIMENT_NAME"
echo "VERTEX_TENSORBOARD: $TENSORBOARD_ID"
echo ""
echo "Example Command:"
echo "./vertex_pipelines/run_pipeline.sh gs://$BUCKET_NAME <WANDB_RUN> $EXPERIMENT_NAME $TENSORBOARD_ID"
echo "----------------------------------------------------------------"

View File

@ -1,30 +1,137 @@
import os
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)  # avoids leaking the file handle left open by upload_from_file(open(...))
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
parser.add_argument("--vertex-experiment", type=str, default="", help="Vertex AI experiment name")
parser.add_argument("--vertex-tensorboard", type=str, default="", help="Vertex AI TensorBoard resource name")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Check if the SFT checkpoint already exists in GCS (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_sft_ckpt_path = os.path.join(prefix, "chatsft_checkpoints") if prefix else "chatsft_checkpoints"
# Check for model.pt (the key checkpoint file)
checkpoint_exists = bucket.blob(os.path.join(gcs_sft_ckpt_path, "model.pt")).exists()
if checkpoint_exists:
    print(f"✓ SFT checkpoint already exists in gs://{bucket_name}/{gcs_sft_ckpt_path}")
    print("Skipping SFT training (already completed)")
    return
print("SFT checkpoint not found. Running SFT training...")
# Set local tmp dir for temporary files
local_base_dir = "/tmp/nanochat"
os.makedirs(local_base_dir, exist_ok=True)
# Download tokenizer from GCS
print("Downloading tokenizer from GCS...")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
download_directory_from_gcs(bucket_name, gcs_tokenizer_path, local_tokenizer_dir)
# Download mid checkpoints from GCS
print("Downloading mid checkpoints from GCS...")
gcs_mid_checkpoints_path = os.path.join(prefix, "mid_checkpoints") if prefix else "mid_checkpoints"
local_mid_checkpoints_dir = os.path.join(local_base_dir, "mid_checkpoints")
download_directory_from_gcs(bucket_name, gcs_mid_checkpoints_path, local_mid_checkpoints_dir)
# Download report dir from GCS
print("Downloading report dir from GCS...")
gcs_report_path = os.path.join(prefix, "report") if prefix else "report"
local_report_dir = os.path.join(local_base_dir, "report")
download_directory_from_gcs(bucket_name, gcs_report_path, local_report_dir)
# Ensure report directory exists even if nothing was downloaded
os.makedirs(local_report_dir, exist_ok=True)
try:
# Download the identity conversations dataset.
print("Downloading identity conversations...")
subprocess.run([
"curl", "-L", "-o",
f"{local_base_dir}/identity_conversations.jsonl",
"https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
], check=True)
# Run supervised finetuning.
print("Starting SFT...")
env = os.environ.copy()
env["NANOCHAT_BASE_DIR"] = local_base_dir
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.chat_sft",
f"--wandb_run_name={args.wandb_run}",
f"--vertex_experiment={args.vertex_experiment}",
f"--vertex_tensorboard={args.vertex_tensorboard}"
], check=True, env=env)
# Evaluate the model.
print("Running chat_eval (sft)...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.chat_eval", "--",
"-i", "sft"
], check=True, env=env)
except subprocess.CalledProcessError as e:
print(f"Error during SFT steps: {e}")
raise
# Upload checkpoints to GCS
print("Uploading artifacts to GCS...")
# Upload chatsft_checkpoints
local_checkpoints_dir = os.path.join(local_base_dir, "chatsft_checkpoints")
gcs_checkpoints_path = os.path.join(prefix, "chatsft_checkpoints") if prefix else "chatsft_checkpoints"
if os.path.exists(local_checkpoints_dir):
upload_directory_to_gcs(local_checkpoints_dir, bucket_name, gcs_checkpoints_path)
else:
print(f"Warning: {local_checkpoints_dir} does not exist.")
# Upload report dir
if os.path.exists(local_report_dir):
upload_directory_to_gcs(local_report_dir, bucket_name, gcs_report_path)
if __name__ == "__main__":
main()
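
The checkpoint detection above keys on a single model.pt blob. A more general sketch (an illustration, not part of the pipeline code) would treat any object under the stage's prefix as evidence that it already completed, using a single listing call:

    from google.cloud import storage

    def gcs_prefix_exists(bucket_name: str, prefix: str) -> bool:
        # True if at least one object exists under gs://<bucket_name>/<prefix>.
        blobs = storage.Client().list_blobs(bucket_name, prefix=prefix, max_results=1)
        return any(True for _ in blobs)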

View File

@ -0,0 +1,24 @@
FROM python:3.10-slim
WORKDIR /app
RUN apt-get update && apt-get install -y curl build-essential
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.local/bin:/root/.cargo/bin:/app/.venv/bin:${PATH}"
COPY . .
RUN uv venv
RUN uv sync --extra gpu
RUN uv pip install maturin
RUN maturin develop --release --manifest-path rustbpe/Cargo.toml
# Install gcloud
RUN apt-get install -y apt-transport-https ca-certificates gnupg
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
RUN apt-get update && apt-get install -y google-cloud-sdk
ENTRYPOINT ["bash"]

View File

@ -1,25 +1,99 @@
import os
import sys
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)  # avoids leaking the file handle left open by upload_from_file(open(...))
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix from args.gcs_bucket
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
# Handle cases where there might be a prefix
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Check if tokenizer artifacts already exist (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
# Check for key tokenizer files
tokenizer_files = ["model.json", "vocab.json", "merges.txt"]
all_exist = all(bucket.blob(os.path.join(gcs_tokenizer_path, f)).exists() for f in tokenizer_files)
if all_exist:
print(f"✓ Tokenizer artifacts already exist in gs://{bucket_name}/{gcs_tokenizer_path}")
print("Skipping tokenizer training (already completed)")
return
print("Tokenizer artifacts not found. Running tokenizer training...")
# Set the base directory to a local temporary directory.
# We cannot use GCS directly because the tokenizer training script (Rust) expects local files.
local_base_dir = "/tmp/nanochat"
os.environ["NANOCHAT_BASE_DIR"] = local_base_dir
os.makedirs(local_base_dir, exist_ok=True)
try:
# Download the dataset.
# nanochat.dataset supports GCS, so NANOCHAT_DATA_DIR could point at a gs:// path,
# but for simplicity we download from HF to the local temp dir here.
print("Downloading dataset (n=8)...")
subprocess.run([sys.executable, "-m", "nanochat.dataset", "-n", "8"], check=True)
print("Downloading dataset (n=240)...")
subprocess.run([sys.executable, "-m", "nanochat.dataset", "-n", "240"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error downloading dataset: {e}")
raise
try:
# Train the tokenizer.
print("Training tokenizer...")
subprocess.run([sys.executable, "scripts/tok_train.py", "--max_chars=2000000000"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error training tokenizer: {e}")
raise
try:
# Evaluate the tokenizer.
print("Evaluating tokenizer...")
subprocess.run([sys.executable, "scripts/tok_eval.py"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error evaluating tokenizer: {e}")
raise
# Upload artifacts to GCS
print("Uploading artifacts to GCS...")
# Upload tokenizer
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
upload_directory_to_gcs(local_tokenizer_dir, bucket_name, gcs_tokenizer_path)
# We don't upload the raw data here; tok_train may also produce token_bytes.pt, which lives in the
# tokenizer dir and is therefore covered by the upload above.
if __name__ == "__main__":
main()
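
Uploading many small artifact files one blob at a time can be slow from inside the container. A sketch of a parallel alternative, assuming a recent google-cloud-storage release that ships storage.transfer_manager (the helper itself is hypothetical):

    import os
    from google.cloud import storage
    from google.cloud.storage import transfer_manager

    def upload_directory_parallel(local_path, bucket_name, gcs_path, max_workers=8):
        bucket = storage.Client().bucket(bucket_name)
        # transfer_manager expects file names relative to source_directory.
        filenames = [
            os.path.relpath(os.path.join(root, f), local_path)
            for root, _, files in os.walk(local_path)
            for f in files
        ]
        results = transfer_manager.upload_many_from_filenames(
            bucket,
            filenames,
            source_directory=local_path,
            blob_name_prefix=gcs_path.rstrip("/") + "/",
            max_workers=max_workers,
        )
        # With the default raise_exception=False, failures come back as Exception objects.
        for name, result in zip(filenames, results):
            if isinstance(result, Exception):
                print(f"Failed to upload {name}: {result}")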