Success on Vertex Pipelines

Nuno Pereira 2025-12-01 19:59:58 -05:00
parent a88e7ec21f
commit 13001597c2
40 changed files with 3770 additions and 264 deletions

614
demo_pipeline.json Normal file
View File

@@ -0,0 +1,614 @@
{
"components": {
"comp-custom-training-job": {
"executorLabel": "exec-custom-training-job",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-data-download-step": {
"executorLabel": "exec-data-download-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"num_shards": {
"defaultValue": 50.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
}
}
}
},
"comp-midtraining-step": {
"executorLabel": "exec-midtraining-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"vertex_experiment": {
"parameterType": "STRING"
},
"vertex_tensorboard": {
"parameterType": "STRING"
},
"wandb_run": {
"parameterType": "STRING"
}
}
}
},
"comp-report-step": {
"executorLabel": "exec-report-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
},
"comp-sft-step": {
"executorLabel": "exec-sft-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"vertex_experiment": {
"parameterType": "STRING"
},
"vertex_tensorboard": {
"parameterType": "STRING"
},
"wandb_run": {
"parameterType": "STRING"
}
}
}
},
"comp-tokenizer-step": {
"executorLabel": "exec-tokenizer-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
}
},
"deploymentSpec": {
"executors": {
"exec-custom-training-job": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-data-download-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--num-shards",
"{{$.inputs.parameters['num_shards']}}"
],
"command": [
"python",
"vertex_pipelines/data_download_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-midtraining-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/midtraining_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"accelerator": {
"count": "1",
"resourceCount": "1",
"resourceType": "NVIDIA_TESLA_A100",
"type": "NVIDIA_TESLA_A100"
},
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-report-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/report_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"cpuLimit": 2.0,
"memoryLimit": 8.0,
"resourceCpuLimit": "2",
"resourceMemoryLimit": "8G"
}
}
},
"exec-sft-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/sft_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"accelerator": {
"count": "1",
"resourceCount": "1",
"resourceType": "NVIDIA_L4",
"type": "NVIDIA_L4"
},
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-tokenizer-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/tokenizer_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:latest",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
}
}
},
"pipelineInfo": {
"description": "A pipeline to train NanoChat",
"name": "nanochat-pipeline"
},
"root": {
"dag": {
"tasks": {
"custom-training-job": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job"
},
"dependentTasks": [
"tokenizer-step"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-pretraining-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"restart_job_on_worker_restart": {
"runtimeValue": {
"constant": true
}
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"timeout": {
"runtimeValue": {
"constant": "604800s"
}
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/pretraining_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:latest"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job"
}
},
"data-download-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-data-download-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"num_shards": {
"componentInputParameter": "num_data_shards"
}
}
},
"taskInfo": {
"name": "data-download-step"
}
},
"midtraining-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-midtraining-step"
},
"dependentTasks": [
"custom-training-job"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"wandb_run": {
"componentInputParameter": "wandb_run"
}
}
},
"taskInfo": {
"name": "midtraining-step"
}
},
"report-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-report-step"
},
"dependentTasks": [
"sft-step"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "report-step"
}
},
"sft-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-sft-step"
},
"dependentTasks": [
"midtraining-step"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"wandb_run": {
"componentInputParameter": "wandb_run"
}
}
},
"taskInfo": {
"name": "sft-step"
}
},
"tokenizer-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-tokenizer-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "tokenizer-step"
}
}
}
},
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"location": {
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "0s",
"isOptional": true,
"parameterType": "STRING"
},
"num_data_shards": {
"defaultValue": 20.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
},
"project": {
"parameterType": "STRING"
},
"scheduling_strategy": {
"defaultValue": "FLEX_START",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_experiment": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_tensorboard": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"wandb_run": {
"defaultValue": "dummy",
"isOptional": true,
"parameterType": "STRING"
}
}
}
},
"schemaVersion": "2.1.0",
"sdkVersion": "kfp-2.14.6"
}
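Note on the spec above: scheduling_strategy and max_wait_duration are exposed as root pipeline parameters and routed into the strategy and max_wait_duration inputs of the prebuilt CustomTrainingJobOp, which is what makes the strategy changeable at submission time. The DSL that produces this wiring lives in vertex_pipelines/pipeline.py and is not part of this diff, so the following is only a minimal, illustrative sketch of the pattern; the function name, defaults, and the omitted steps are assumptions inferred from the compiled spec.

# Illustrative sketch only -- the real vertex_pipelines/pipeline.py is not shown in this diff.
from kfp import compiler, dsl
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp

@dsl.pipeline(name="nanochat-pipeline", description="A pipeline to train NanoChat")
def nanochat_pipeline(
    project: str,
    location: str,
    gcs_bucket: str,
    scheduling_strategy: str = "FLEX_START",  # overridable at submission time
    max_wait_duration: str = "0s",            # overridable at submission time
):
    # Tokenizer, data-download, midtraining, sft, and report steps plus .after(...) dependencies omitted for brevity.
    CustomTrainingJobOp(
        display_name="nanochat-pretraining-job",
        project=project,
        location=location,
        base_output_directory=f"{gcs_bucket}/pipeline_root",
        restart_job_on_worker_restart=True,
        timeout="604800s",
        strategy=scheduling_strategy,          # resolved from the pipeline parameter at runtime
        max_wait_duration=max_wait_duration,   # resolved from the pipeline parameter at runtime
        worker_pool_specs=[{
            "machine_spec": {
                "machine_type": "a2-highgpu-8g",
                "accelerator_type": "NVIDIA_TESLA_A100",
                "accelerator_count": 8,
            },
            "replica_count": 1,
            "container_spec": {
                "image_uri": "gcr.io/nzp-nanochat/nanochat:latest",
                "command": ["python", "vertex_pipelines/pretraining_step.py"],
                "args": ["--gcs-bucket", gcs_bucket],
            },
        }],
    )

compiler.Compiler().compile(nanochat_pipeline, package_path="demo_pipeline.json")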

68
demo_runtime_scheduling.sh Executable file
View File

@@ -0,0 +1,68 @@
#!/bin/bash
# Demonstration: Submitting the same compiled pipeline with different scheduling strategies
# without recompilation
set -e
echo "=== Demo: Runtime Scheduling Strategy Changes ==="
echo ""
echo "This demonstrates that we can now change scheduling strategies"
echo "without recompiling the pipeline or rebuilding the Docker image."
echo ""
# Compile the pipeline once
echo "1. Compiling pipeline (one time)..."
python3 vertex_pipelines/pipeline.py \
--gcp-project nzp-nanochat \
--gcs-bucket gs://nzp-nanochat \
--pipeline-root gs://nzp-nanochat/pipeline-root \
--region us-central1 \
--wandb-run test-run \
--vertex-experiment nanochat-experiment \
--vertex-tensorboard projects/247010501180/locations/us-central1/tensorboards/8180826106513850368 \
--accelerator-type NVIDIA_TESLA_A100 \
--accelerator-count 8 \
--preemptible true \
--num-data-shards 20 \
--service-account 247010501180-compute@developer.gserviceaccount.com \
--template_path demo_pipeline.json \
2>&1 | grep -v "^Creating\|^To use\|^View\|state:"
echo "✓ Pipeline compiled successfully"
echo ""
# Show the scheduling parameters in the compiled pipeline
echo "2. Checking compiled pipeline parameters..."
python3 -c "
import json
data = json.load(open('demo_pipeline.json'))
params = data['root']['inputDefinitions']['parameters']
print(' scheduling_strategy: default =', params['scheduling_strategy']['defaultValue'])
print(' max_wait_duration: default =', params['max_wait_duration']['defaultValue'])
"
echo ""
echo "3. Demonstrating runtime parameter override..."
echo " You can now submit this compiled pipeline with different strategies:"
echo ""
echo " Option A (DWS - wait indefinitely):"
echo " --scheduling-strategy FLEX_START --max-wait-duration 0s"
echo ""
echo " Option B (DWS - wait 1 hour):"
echo " --scheduling-strategy FLEX_START --max-wait-duration 3600s"
echo ""
echo " Option C (Standard on-demand):"
echo " --scheduling-strategy STANDARD --max-wait-duration 86400s"
echo ""
echo " Option D (Legacy Spot):"
echo " --scheduling-strategy SPOT --max-wait-duration 0s"
echo ""
echo "=== Summary ==="
echo "✓ Pipeline compilation is DECOUPLED from scheduling configuration"
echo "✓ No recompilation needed when changing FLEX_START ↔ SPOT ↔ STANDARD"
echo "✓ No Docker rebuild needed for deployment strategy changes"
echo ""
echo "To submit with a different strategy, just pass:"
echo " --scheduling-strategy <VALUE> --max-wait-duration <VALUE>"
echo "to pipeline.py or add them to run_pipeline.sh"

3
inspect_custom_job.py Normal file
View File

@@ -0,0 +1,3 @@
import google_cloud_pipeline_components.v1.custom_job as custom_job_module
print(f"Module file: {custom_job_module.__file__}")

13
inspect_dws.py Normal file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env python3
"""Inspect CustomTrainingJobOp for DWS parameters."""
import inspect
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
print("CustomTrainingJobOp signature:")
print(inspect.signature(CustomTrainingJobOp))
print("\n" + "="*80 + "\n")
# Inspect the component spec (CustomTrainingJobOp is a prebuilt KFP component, not a plain function)
component_spec = CustomTrainingJobOp.component_spec
print("Component spec:")
print(component_spec)

7
inspect_kfp.py Normal file
View File

@@ -0,0 +1,7 @@
import inspect
from kfp import dsl
try:
print("Available methods:", [m for m in dir(dsl.PipelineTask) if 'pod' in m or 'label' in m or 'annotation' in m or 'env' in m])
except Exception as e:
print(e)

View File

@@ -153,8 +153,14 @@ def find_largest_model(checkpoint_dir):
storage_client = storage.Client()
bucket_name, prefix = checkpoint_dir[5:].split("/", 1)
bucket = storage_client.bucket(bucket_name)
if not prefix.endswith("/"):
prefix += "/"
blobs = bucket.list_blobs(prefix=prefix, delimiter='/')
model_tags = [b.name.split('/')[-2] for b in blobs.prefixes]
list(blobs) # Iterate to populate prefixes
log0(f"DEBUG: prefix={prefix}")
log0(f"DEBUG: blobs.prefixes={list(blobs.prefixes)}")
model_tags = [p.split('/')[-2] for p in blobs.prefixes]
log0(f"DEBUG: model_tags={model_tags}")
else:
# attempt to guess the model tag: take the biggest model available
model_tags = [f for f in os.listdir(checkpoint_dir) if os.path.isdir(os.path.join(checkpoint_dir, f))]
@@ -218,6 +224,15 @@ def load_model(source, *args, **kwargs):
"sft": "chatsft_checkpoints",
"rl": "chatrl_checkpoints",
}[source]
base_dir = get_base_dir()
checkpoints_dir = os.path.join(base_dir, model_dir)
# Check if running in Vertex AI with GCS data directory
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
# Use GCS checkpoint directory
checkpoints_dir = data_dir.replace("/base_data", f"/{model_dir}")
else:
# Use local checkpoint directory
base_dir = get_base_dir()
checkpoints_dir = os.path.join(base_dir, model_dir)
return load_model_from_dir(checkpoints_dir, *args, **kwargs)
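Background on the list(blobs) line added in the find_largest_model hunk above: bucket.list_blobs(prefix=..., delimiter='/') returns an iterator whose prefixes set is only filled in as result pages are consumed, so the iterator must be drained before the checkpoint "subdirectories" can be read. A small standalone illustration, with a hypothetical bucket and made-up tags:

# Standalone illustration of the list_blobs/.prefixes behavior; bucket name is hypothetical.
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("example-bucket")

blobs = bucket.list_blobs(prefix="base_checkpoints/", delimiter="/")
print(blobs.prefixes)   # usually empty: no result pages fetched yet
list(blobs)             # drain the iterator so .prefixes gets populated
print(blobs.prefixes)   # e.g. {'base_checkpoints/d12/', 'base_checkpoints/d24/'}
model_tags = [p.split("/")[-2] for p in blobs.prefixes]  # e.g. ['d12', 'd24'] (order not guaranteed)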

View File

@@ -138,6 +138,18 @@ def get_dist_info():
def autodetect_device_type():
# prefer to use CUDA if available, otherwise use MPS, otherwise fallback on CPU
print0(f"DEBUG: torch.cuda.is_available(): {torch.cuda.is_available()}")
if torch.cuda.is_available():
print0(f"DEBUG: torch.version.cuda: {torch.version.cuda}")
print0(f"DEBUG: torch.backends.cudnn.version(): {torch.backends.cudnn.version()}")
print0(f"DEBUG: torch.cuda.device_count(): {torch.cuda.device_count()}")
print0(f"DEBUG: torch.cuda.get_device_name(0): {torch.cuda.get_device_name(0)}")
# Print environment variables relevant to CUDA
env_vars = ["LD_LIBRARY_PATH", "PATH", "CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES", "NVIDIA_DRIVER_CAPABILITIES"]
for var in env_vars:
print0(f"DEBUG: env {var}: {os.environ.get(var, 'NOT SET')}")
if torch.cuda.is_available():
device_type = "cuda"
elif torch.backends.mps.is_available():
@@ -191,7 +203,116 @@ class DummyWandb:
"""Useful if we wish to not use wandb but have all the same signatures"""
def __init__(self):
pass
def init(self, *args, **kwargs):
return self
def log(self, *args, **kwargs):
pass
def finish(self):
pass
class VertexLogger:
"""Logs metrics to Vertex AI Experiments."""
def __init__(self, experiment_name, tensorboard_resource_name=None):
from google.cloud import aiplatform
self.aiplatform = aiplatform
self.experiment_name = experiment_name
self.tensorboard_resource_name = tensorboard_resource_name
self._run = None
self.summary_writer = None  # created in init(); stays None until then
def init(self, project=None, name=None, config=None, **kwargs):
# Map wandb 'project' to Vertex 'experiment'
experiment = project or self.experiment_name
self.aiplatform.init(
experiment=experiment,
experiment_tensorboard=self.tensorboard_resource_name
)
try:
self._run = self.aiplatform.start_run(run=name, resume=True)
except Exception as e:
print(f"Could not resume run {name}: {e}. Creating a new run.")
self._run = self.aiplatform.start_run(run=name, resume=False)
# Initialize a TensorBoard SummaryWriter if a TensorBoard resource is provided.
# Logs must be written to a GCS location that the TensorBoard resource can read.
# On Vertex AI Custom Jobs, the AIP_TENSORBOARD_LOG_DIR env var points at such a
# location; otherwise we fall back to 'gs://nzp-nanochat/tensorboard_logs/{name}',
# assuming the 'nzp-nanochat' bucket exists (it is hardcoded elsewhere in this repo).
try:
from torch.utils.tensorboard import SummaryWriter
import os
# Use AIP_TENSORBOARD_LOG_DIR if available (set by Vertex AI)
log_dir = os.environ.get('AIP_TENSORBOARD_LOG_DIR')
if not log_dir:
# Fallback for local runs or if env var is missing
log_dir = f"gs://nzp-nanochat/tensorboard_logs/{name}"
print(f"AIP_TENSORBOARD_LOG_DIR not found. Using fallback: {log_dir}")
self.summary_writer = SummaryWriter(log_dir=log_dir)
print(f"TensorBoard logging enabled to: {log_dir}")
except Exception as e:
print(f"Failed to initialize TensorBoard SummaryWriter: {e}")
self.summary_writer = None
if config:
self.aiplatform.log_params(config)
return self
def log(self, data, step=None):
# Only log from rank 0 to avoid concurrency conflicts with Vertex AI Experiments
import os
rank = int(os.environ.get('RANK', 0))
# Vertex AI log_metrics doesn't support 'step' directly in the same way.
# It logs a new data point.
# We must flatten the dictionary because log_metrics only accepts scalars.
def flatten(d, parent_key='', sep='.'):
items = []
for k, v in d.items():
new_key = f"{parent_key}{sep}{k}" if parent_key else k
if isinstance(v, dict):
items.extend(flatten(v, new_key, sep=sep).items())
else:
items.append((new_key, v))
return dict(items)
flat_data = flatten(data)
# Extract step for TensorBoard if present in the data
global_step = flat_data.get('step', step if step is not None else 0)
# Only rank 0 should log to Vertex AI Experiments to prevent etag conflicts
if rank == 0:
self.aiplatform.log_metrics(flat_data)
# Log to TensorBoard from all ranks (TensorBoard can handle concurrent writes)
if self.summary_writer:
for k, v in flat_data.items():
if isinstance(v, (int, float)) and k != 'step': # Don't log 'step' as a metric
self.summary_writer.add_scalar(k, v, global_step=global_step)
self.summary_writer.flush()
def finish(self):
if self.summary_writer:
self.summary_writer.close()
self.aiplatform.end_run()
def get_experiment_logger(args):
"""Returns a logger compatible with wandb interface."""
if hasattr(args, 'wandb_run') and args.wandb_run != "dummy":
import wandb
return wandb
elif hasattr(args, 'vertex_experiment') and args.vertex_experiment:
return VertexLogger(
experiment_name=args.vertex_experiment,
tensorboard_resource_name=getattr(args, 'vertex_tensorboard', None)
)
else:
return DummyWandb()
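DummyWandb, VertexLogger, and the wandb module are intended to be interchangeable behind get_experiment_logger. The step scripts that consume this are not shown in this hunk, so the following is only an illustrative usage sketch; the import path and flag wiring are assumptions.

# Illustrative usage sketch; the real vertex_pipelines/*_step.py files are not shown here.
import argparse
from nanochat.common import get_experiment_logger  # assumed module path

parser = argparse.ArgumentParser()
parser.add_argument("--wandb-run", dest="wandb_run", default="dummy")
parser.add_argument("--vertex-experiment", dest="vertex_experiment", default="")
parser.add_argument("--vertex-tensorboard", dest="vertex_tensorboard", default="")
args = parser.parse_args()

logger = get_experiment_logger(args)            # wandb module, VertexLogger, or DummyWandb
run = logger.init(name=args.wandb_run, config={"depth": 20})
for step in range(3):
    run.log({"step": step, "train/loss": 1.0 / (step + 1)})
run.finish()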

View File

@@ -84,6 +84,8 @@ def download_single_file(index):
print(f"Skipping {filename} (already exists in GCS)")
return True
else:
# Ensure the directory exists
os.makedirs(DATA_DIR, exist_ok=True)
filepath = os.path.join(DATA_DIR, filename)
if os.path.exists(filepath):
print(f"Skipping {filepath} (already exists)")

952
nanochat_pipeline.json Normal file
View File

@@ -0,0 +1,952 @@
{
"components": {
"comp-custom-training-job": {
"executorLabel": "exec-custom-training-job",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-custom-training-job-2": {
"executorLabel": "exec-custom-training-job-2",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-custom-training-job-3": {
"executorLabel": "exec-custom-training-job-3",
"inputDefinitions": {
"parameters": {
"base_output_directory": {
"defaultValue": "",
"description": "The Cloud Storage location to store the output of this CustomJob or HyperparameterTuningJob. See [more information ](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/GcsDestination).",
"isOptional": true,
"parameterType": "STRING"
},
"display_name": {
"description": "The name of the CustomJob.",
"parameterType": "STRING"
},
"enable_web_access": {
"defaultValue": false,
"description": "Whether you want Vertex AI to enable [interactive shell access ](https://cloud.google.com/vertex-ai/docs/training/monitor-debug-interactive-shell) to training containers. If `True`, you can access interactive shells at the URIs given by [CustomJob.web_access_uris][].",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"encryption_spec_key_name": {
"defaultValue": "",
"description": "Customer-managed encryption key options for the CustomJob. If this is set, then all resources created by the CustomJob will be encrypted with the provided encryption key.",
"isOptional": true,
"parameterType": "STRING"
},
"labels": {
"defaultValue": {},
"description": "The labels with user-defined metadata to organize the CustomJob. See [more information](https://goo.gl/xmQnxf).",
"isOptional": true,
"parameterType": "STRUCT"
},
"location": {
"defaultValue": "{{$.pipeline_google_cloud_location}}",
"description": "Location for creating the custom training job. If not set, default to the location where the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "86400s",
"description": "The maximum time to wait for the custom training job to be scheduled only if the scheduling strategy is set to FLEX_START. If set to 0, the job will wait indefinitely. The default is 24 hours. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"network": {
"defaultValue": "",
"description": "The full name of the Compute Engine network to which the job should be peered. For example, `projects/12345/global/networks/myVPC`. Format is of the form `projects/{project}/global/networks/{network}`. Where `{project}` is a project number, as in `12345`, and `{network}` is a network name. Private services access must already be configured for the network. If left unspecified, the job is not peered with any network.",
"isOptional": true,
"parameterType": "STRING"
},
"persistent_resource_id": {
"defaultValue": "{{$.pipeline_persistent_resource_id}}",
"description": "The ID of the PersistentResource in the same Project and Location which to run. The default value is a placeholder that will be resolved to the PipelineJob [RuntimeConfig](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/projects.locations.pipelineJobs#PipelineJob.RuntimeConfig)'s persistent resource id at runtime. However, if the PipelineJob doesn't set Persistent Resource as the job level runtime, the placedholder will be resolved to an empty string and the custom job will be run on demand. If the value is set explicitly, the custom job will runs in the specified persistent resource, in this case, please note the network and CMEK configs on the job should be consistent with those on the PersistentResource, otherwise, the job will be rejected.",
"isOptional": true,
"parameterType": "STRING"
},
"project": {
"defaultValue": "{{$.pipeline_google_cloud_project_id}}",
"description": "Project to create the custom training job in. Defaults to the project in which the PipelineJob is run.",
"isOptional": true,
"parameterType": "STRING"
},
"psc_interface_config": {
"defaultValue": {},
"description": "Configuration CustomJob with PSC-I. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#PscInterfaceConfig).",
"isOptional": true,
"parameterType": "STRUCT"
},
"reserved_ip_ranges": {
"defaultValue": [],
"description": "A list of names for the reserved IP ranges under the VPC network that can be used for this job. If set, we will deploy the job within the provided IP ranges. Otherwise, the job will be deployed to any IP ranges under the provided VPC network.",
"isOptional": true,
"parameterType": "LIST"
},
"restart_job_on_worker_restart": {
"defaultValue": false,
"description": "Restarts the entire CustomJob if a worker gets restarted. This feature can be used by distributed training jobs that are not resilient to workers leaving and joining a job.",
"isOptional": true,
"parameterType": "BOOLEAN"
},
"service_account": {
"defaultValue": "",
"description": "Sets the default service account for workload run-as account. The [service account ](https://cloud.google.com/vertex-ai/docs/pipelines/configure-project#service-account) running the pipeline submitting jobs must have act-as permission on this run-as account. If unspecified, the Vertex AI Custom Code [Service Agent ](https://cloud.google.com/vertex-ai/docs/general/access-control#service-agents) for the CustomJob's project.",
"isOptional": true,
"parameterType": "STRING"
},
"strategy": {
"defaultValue": "STANDARD",
"description": "The strategy to use for the custom training job. The default is 'STANDARD'. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#Strategy).",
"isOptional": true,
"parameterType": "STRING"
},
"tensorboard": {
"defaultValue": "",
"description": "The name of a Vertex AI TensorBoard resource to which this CustomJob will upload TensorBoard logs.",
"isOptional": true,
"parameterType": "STRING"
},
"timeout": {
"defaultValue": "604800s",
"description": "The maximum job running time. The default is 7 days. A duration in seconds with up to nine fractional digits, terminated by 's', for example: \"3.5s\".",
"isOptional": true,
"parameterType": "STRING"
},
"worker_pool_specs": {
"defaultValue": [],
"description": "Serialized json spec of the worker pools including machine type and Docker image. All worker pools except the first one are optional and can be skipped by providing an empty value. See [more information](https://cloud.google.com/vertex-ai/docs/reference/rest/v1/CustomJobSpec#WorkerPoolSpec).",
"isOptional": true,
"parameterType": "LIST"
}
}
},
"outputDefinitions": {
"parameters": {
"gcp_resources": {
"description": "Serialized JSON of `gcp_resources` [proto](https://github.com/kubeflow/pipelines/tree/master/components/google-cloud/google_cloud_pipeline_components/proto) which tracks the CustomJob.",
"parameterType": "STRING"
}
}
}
},
"comp-data-download-step": {
"executorLabel": "exec-data-download-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
},
"num_shards": {
"defaultValue": 50.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
}
}
}
},
"comp-report-step": {
"executorLabel": "exec-report-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
},
"comp-tokenizer-step": {
"executorLabel": "exec-tokenizer-step",
"inputDefinitions": {
"parameters": {
"gcs_bucket": {
"parameterType": "STRING"
}
}
}
}
},
"deploymentSpec": {
"executors": {
"exec-custom-training-job": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-custom-training-job-2": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-custom-training-job-3": {
"container": {
"args": [
"--type",
"CustomJob",
"--payload",
"{\"display_name\": \"{{$.inputs.parameters['display_name']}}\", \"job_spec\": {\"worker_pool_specs\": {{$.inputs.parameters['worker_pool_specs']}}, \"scheduling\": {\"timeout\": \"{{$.inputs.parameters['timeout']}}\", \"restart_job_on_worker_restart\": {{$.inputs.parameters['restart_job_on_worker_restart']}}, \"strategy\": \"{{$.inputs.parameters['strategy']}}\", \"max_wait_duration\": \"{{$.inputs.parameters['max_wait_duration']}}\"}, \"service_account\": \"{{$.inputs.parameters['service_account']}}\", \"tensorboard\": \"{{$.inputs.parameters['tensorboard']}}\", \"enable_web_access\": {{$.inputs.parameters['enable_web_access']}}, \"network\": \"{{$.inputs.parameters['network']}}\", \"reserved_ip_ranges\": {{$.inputs.parameters['reserved_ip_ranges']}}, \"base_output_directory\": {\"output_uri_prefix\": \"{{$.inputs.parameters['base_output_directory']}}\"}, \"persistent_resource_id\": \"{{$.inputs.parameters['persistent_resource_id']}}\", \"psc_interface_config\": {{$.inputs.parameters['psc_interface_config']}}}, \"labels\": {{$.inputs.parameters['labels']}}, \"encryption_spec\": {\"kms_key_name\": \"{{$.inputs.parameters['encryption_spec_key_name']}}\"}}",
"--project",
"{{$.inputs.parameters['project']}}",
"--location",
"{{$.inputs.parameters['location']}}",
"--gcp_resources",
"{{$.outputs.parameters['gcp_resources'].output_file}}"
],
"command": [
"python3",
"-u",
"-m",
"google_cloud_pipeline_components.container.v1.custom_job.launcher"
],
"image": "gcr.io/ml-pipeline/google-cloud-pipeline-components:2.22.0"
}
},
"exec-data-download-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}",
"--num-shards",
"{{$.inputs.parameters['num_shards']}}"
],
"command": [
"python",
"vertex_pipelines/data_download_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:20251128144517",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
},
"exec-report-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/report_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:20251128144517",
"resources": {
"cpuLimit": 2.0,
"memoryLimit": 8.0,
"resourceCpuLimit": "2",
"resourceMemoryLimit": "8G"
}
}
},
"exec-tokenizer-step": {
"container": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['gcs_bucket']}}"
],
"command": [
"python",
"vertex_pipelines/tokenizer_step.py"
],
"image": "gcr.io/nzp-nanochat/nanochat:20251128144517",
"resources": {
"cpuLimit": 8.0,
"memoryLimit": 32.0,
"resourceCpuLimit": "8",
"resourceMemoryLimit": "32G"
}
}
}
}
},
"pipelineInfo": {
"description": "A pipeline to train NanoChat",
"name": "nanochat-pipeline"
},
"root": {
"dag": {
"tasks": {
"custom-training-job": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job"
},
"dependentTasks": [
"tokenizer-step"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-pretraining-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--device_batch_size": {
"componentInputParameter": "device_batch_size"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"restart_job_on_worker_restart": {
"runtimeValue": {
"constant": true
}
},
"service_account": {
"componentInputParameter": "service_account"
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"timeout": {
"runtimeValue": {
"constant": "604800s"
}
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}",
"--device-batch-size",
"{{$.inputs.parameters['pipelinechannel--device_batch_size']}}"
],
"command": [
"python",
"vertex_pipelines/pretraining_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517"
},
"disk_spec": {
"boot_disk_size_gb": 500.0,
"boot_disk_type": "pd-ssd"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job"
}
},
"custom-training-job-2": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job-2"
},
"dependentTasks": [
"custom-training-job"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-midtraining-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--device_batch_size": {
"componentInputParameter": "device_batch_size"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"service_account": {
"componentInputParameter": "service_account"
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}",
"--device-batch-size",
"{{$.inputs.parameters['pipelinechannel--device_batch_size']}}"
],
"command": [
"python",
"vertex_pipelines/midtraining_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517"
},
"disk_spec": {
"boot_disk_size_gb": 500.0,
"boot_disk_type": "pd-ssd"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job-2"
}
},
"custom-training-job-3": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-custom-training-job-3"
},
"dependentTasks": [
"custom-training-job-2"
],
"inputs": {
"parameters": {
"base_output_directory": {
"runtimeValue": {
"constant": "{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}/pipeline_root"
}
},
"display_name": {
"runtimeValue": {
"constant": "nanochat-sft-job"
}
},
"location": {
"componentInputParameter": "location"
},
"max_wait_duration": {
"componentInputParameter": "max_wait_duration"
},
"pipelinechannel--gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"pipelinechannel--vertex_experiment": {
"componentInputParameter": "vertex_experiment"
},
"pipelinechannel--vertex_tensorboard": {
"componentInputParameter": "vertex_tensorboard"
},
"pipelinechannel--wandb_run": {
"componentInputParameter": "wandb_run"
},
"project": {
"componentInputParameter": "project"
},
"service_account": {
"componentInputParameter": "service_account"
},
"strategy": {
"componentInputParameter": "scheduling_strategy"
},
"worker_pool_specs": {
"runtimeValue": {
"constant": [
{
"container_spec": {
"args": [
"--gcs-bucket",
"{{$.inputs.parameters['pipelinechannel--gcs_bucket']}}",
"--wandb-run",
"{{$.inputs.parameters['pipelinechannel--wandb_run']}}",
"--vertex-experiment",
"{{$.inputs.parameters['pipelinechannel--vertex_experiment']}}",
"--vertex-tensorboard",
"{{$.inputs.parameters['pipelinechannel--vertex_tensorboard']}}"
],
"command": [
"python",
"vertex_pipelines/sft_step.py"
],
"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517"
},
"disk_spec": {
"boot_disk_size_gb": 500.0,
"boot_disk_type": "pd-ssd"
},
"machine_spec": {
"accelerator_count": 8.0,
"accelerator_type": "NVIDIA_TESLA_A100",
"machine_type": "a2-highgpu-8g"
},
"replica_count": 1.0
}
]
}
}
}
},
"taskInfo": {
"name": "custom-training-job-3"
}
},
"data-download-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-data-download-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
},
"num_shards": {
"componentInputParameter": "num_data_shards"
}
}
},
"taskInfo": {
"name": "data-download-step"
}
},
"report-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-report-step"
},
"dependentTasks": [
"custom-training-job-3"
],
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "report-step"
}
},
"tokenizer-step": {
"cachingOptions": {
"enableCache": true
},
"componentRef": {
"name": "comp-tokenizer-step"
},
"inputs": {
"parameters": {
"gcs_bucket": {
"componentInputParameter": "gcs_bucket"
}
}
},
"taskInfo": {
"name": "tokenizer-step"
}
}
}
},
"inputDefinitions": {
"parameters": {
"device_batch_size": {
"defaultValue": 8.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
},
"gcs_bucket": {
"parameterType": "STRING"
},
"location": {
"parameterType": "STRING"
},
"max_wait_duration": {
"defaultValue": "0s",
"isOptional": true,
"parameterType": "STRING"
},
"num_data_shards": {
"defaultValue": 20.0,
"isOptional": true,
"parameterType": "NUMBER_INTEGER"
},
"project": {
"parameterType": "STRING"
},
"scheduling_strategy": {
"defaultValue": "FLEX_START",
"isOptional": true,
"parameterType": "STRING"
},
"service_account": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_experiment": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"vertex_tensorboard": {
"defaultValue": "",
"isOptional": true,
"parameterType": "STRING"
},
"wandb_run": {
"defaultValue": "dummy",
"isOptional": true,
"parameterType": "STRING"
}
}
}
},
"schemaVersion": "2.1.0",
"sdkVersion": "kfp-2.15.1"
}
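
The JSON above is the compiled KFP spec; its root `inputDefinitions` list the runtime parameters the pipeline expects. As a minimal sketch of how such a spec might be submitted with the google-cloud-aiplatform SDK — project, region, bucket, and service-account values below are placeholders, not values taken from this repo:

# Sketch: submit the compiled demo_pipeline.json to Vertex AI Pipelines.
# All project/region/bucket/service-account strings are illustrative placeholders.
from google.cloud import aiplatform

aiplatform.init(project="my-gcp-project", location="us-central1")

job = aiplatform.PipelineJob(
    display_name="nanochat-pipeline",
    template_path="demo_pipeline.json",
    pipeline_root="gs://my-bucket/pipeline_root",
    parameter_values={
        "project": "my-gcp-project",
        "location": "us-central1",
        "gcs_bucket": "gs://my-bucket",
        "device_batch_size": 8,
        "num_data_shards": 20,
        "scheduling_strategy": "FLEX_START",
    },
    enable_caching=True,
)
job.submit(service_account="my-sa@my-gcp-project.iam.gserviceaccount.com")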

View File

@ -13,12 +13,14 @@ dependencies = [
"setuptools>=80.9.0",
"tiktoken>=0.11.0",
"tokenizers>=0.22.0",
"torch>=2.8.0",
"torch>=2.5.0",
"uvicorn>=0.36.0",
"wandb>=0.21.3",
"google-cloud-storage>=2.10.0",
"kfp>=2.0.0",
"google-cloud-aiplatform>=1.25.0",
"google-cloud-storage>=2.14.0",
"kfp==2.8.0",
"google-cloud-aiplatform>=1.38.0",
"gcsfs>=2023.6.0",
"tensorboard>=2.14.0",
]
[build-system]
@ -46,35 +48,35 @@ python_files = ["test_*.py"]
python_classes = ["Test*"]
python_functions = ["test_*"]
# target torch to cuda 12.8 or CPU
# target torch to cuda 12.4 or CPU
[tool.uv.sources]
torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu128", extra = "gpu" },
torch = [
{ index = "pytorch-cpu", extra = "cpu" },
{ index = "pytorch-cu124", extra = "gpu" },
]
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[[tool.uv.index]]
name = "pytorch-cu128"
url = "https://download.pytorch.org/whl/cu128"
[[tool.uv.index]]
name = "pytorch-cpu"
url = "https://download.pytorch.org/whl/cpu"
explicit = true
[project.optional-dependencies]
cpu = [
"torch>=2.8.0",
]
gpu = [
"torch>=2.8.0",
]
[tool.uv]
conflicts = [
[
{ extra = "cpu" },
{ extra = "gpu" },
],
]
[[tool.uv.index]]
name = "pytorch-cu124"
url = "https://download.pytorch.org/whl/cu124"
explicit = true
[project.optional-dependencies]
cpu = [
"torch>=2.5.0",
]
gpu = [
"torch>=2.5.0",
]
[tool.uv]
conflicts = [
[
{ extra = "cpu" },
{ extra = "gpu" },
],
]

View File

@ -0,0 +1,28 @@
## Base model evaluation
timestamp: 2025-11-29 02:14:42
- Model: base_model (step 21400)
- CORE metric: 0.1710
- hellaswag_zeroshot: 0.2364
- jeopardy: 0.0487
- bigbench_qa_wikidata: 0.4287
- arc_easy: 0.4815
- arc_challenge: 0.1217
- copa: 0.2800
- commonsense_qa: 0.0469
- piqa: 0.3308
- openbook_qa: 0.1173
- lambada_openai: 0.3346
- hellaswag: 0.2348
- winograd: 0.2161
- winogrande: 0.0450
- bigbench_dyck_languages: 0.1240
- agi_eval_lsat_ar: 0.0543
- bigbench_cs_algorithms: 0.3962
- bigbench_operators: 0.1381
- bigbench_repeat_copy_logic: 0.0000
- squad: 0.1213
- coqa: 0.1469
- boolq: -0.3182
- bigbench_language_identification: 0.1759

15
report/base-model-loss.md Normal file
View File

@ -0,0 +1,15 @@
## Base model loss
timestamp: 2025-11-29 01:17:40
- train bpb: 0.7200
- val bpb: 0.8992
- sample 0: <|bos|>The capital of France is Paris. The capital of the United Kingdom is London. The capital of the United
- sample 1: <|bos|>The chemical symbol of gold is Au. The symbol of gold is Au. The symbol of gold is Au.
- sample 2: <|bos|>If yesterday was Friday, then tomorrow will be Saturday.
If you are a parent, you know that your child is not ready
- sample 3: <|bos|>The opposite of hot is cold. The opposite of cold is hot. The opposite of hot is cold.
- sample 4: <|bos|>The planets of the solar system are: Earth, Venus, Mars, Jupiter, Saturn, Uranus, and Neptune. The
- sample 5: <|bos|>My favorite color is blue. I love the color blue. It is the color of the sky,
- sample 6: <|bos|>If 5*x + 3 = 13, then x is the same as 5*x + 3 = 13. If

View File

@ -0,0 +1,45 @@
## Base model training
timestamp: 2025-11-29 01:12:50
- wandb_run_name: dummy
- vertex_experiment: nanochat-experiment
- vertex_tensorboard: projects/247010501180/locations/us-central1/tensorboards/8180826106513850368
- device_type:
- depth: 20
- max_seq_len: 2048
- num_iterations: -1
- target_flops: -1.0000
- target_param_data_ratio: 20
- device_batch_size: 8
- total_batch_size: 524,288
- embedding_lr: 0.2000
- unembedding_lr: 0.0040
- weight_decay: 0.0000
- matrix_lr: 0.0200
- grad_clip: 1.0000
- warmup_ratio: 0.0000
- warmdown_ratio: 0.2000
- final_lr_frac: 0.0000
- eval_every: 250
- eval_tokens: 10,485,760
- core_metric_every: 2000
- core_metric_max_per_task: 500
- sample_every: 2000
- model_tag:
- Number of parameters: 560,988,160
- Number of FLOPs per token: 3.491758e+09
- Calculated number of iterations: 21,400
- Number of training tokens: 11,219,763,200
- Tokens : Params ratio: 20.0000
- DDP world size: 8
- warmup_ratio: 0.0000
- warmdown_ratio: 0.2000
- final_lr_frac: 0.0000
- Minimum validation bpb: 0.8985
- Final validation bpb: 0.8985
- CORE metric estimate: 0.1732
- MFU %: 0.00%
- Total training flops: 3.917670e+19
- Total training time: 0.00m
- Peak memory usage: 8574.47MiB
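
The derived quantities in this report follow directly from the parameter count, the target tokens:params ratio, and the total batch size; a quick arithmetic check using only the numbers reported above:

# Sanity-check the derived values in the report above.
num_params = 560_988_160
ratio = 20                       # target_param_data_ratio
total_batch_size = 524_288       # tokens per optimizer step
flops_per_token = 3.491758e9

train_tokens = num_params * ratio                  # 11,219,763,200
num_iterations = train_tokens // total_batch_size  # 21,400
total_flops = flops_per_token * train_tokens       # ~3.9177e19

print(train_tokens, num_iterations, f"{total_flops:.3e}")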

View File

@ -0,0 +1,23 @@
## Chat evaluation mid
timestamp: 2025-11-30 23:51:33
- source: mid
- task_name: None
- dtype: bfloat16
- temperature: 0.0000
- max_new_tokens: 512
- num_samples: 1
- top_k: 50
- batch_size: 8
- model_tag: None
- step: None
- max_problems: None
- device_type:
- ARC-Easy: 0.3847
- ARC-Challenge: 0.2944
- MMLU: 0.3079
- GSM8K: 0.0303
- HumanEval: 0.0610
- SpellingBee: 0.9688
- ChatCORE metric: 0.2293

View File

@ -0,0 +1,23 @@
## Chat evaluation sft
timestamp: 2025-12-01 16:29:17
- source: sft
- task_name: None
- dtype: bfloat16
- temperature: 0.0000
- max_new_tokens: 512
- num_samples: 1
- top_k: 50
- batch_size: 8
- model_tag: None
- step: None
- max_problems: None
- device_type:
- ARC-Easy: 0.3994
- ARC-Challenge: 0.2833
- MMLU: 0.3169
- GSM8K: 0.0417
- HumanEval: 0.0366
- SpellingBee: 0.9766
- ChatCORE metric: 0.2313

27
report/chat-sft.md Normal file
View File

@ -0,0 +1,27 @@
## Chat SFT
timestamp: 2025-12-01 14:45:18
- wandb_run_name: dummy
- vertex_experiment: nanochat-experiment
- vertex_tensorboard: projects/247010501180/locations/us-central1/tensorboards/8180826106513850368
- source: mid
- device_type:
- dtype: bfloat16
- device_batch_size: 4
- num_epochs: 1
- num_iterations: -1
- target_examples_per_step: 32
- unembedding_lr: 0.0040
- embedding_lr: 0.2000
- matrix_lr: 0.0200
- weight_decay: 0.0000
- init_lr_frac: 0.0200
- eval_every: 100
- eval_steps: 100
- eval_metrics_every: 200
- eval_metrics_max_problems: 1024
- Training rows: 22,439
- Number of iterations: 701
- Training loss: 1.1208
- Validation loss: 1.0811

24
report/midtraining.md Normal file
View File

@ -0,0 +1,24 @@
## Midtraining
timestamp: 2025-11-30 21:47:41
- wandb_run_name: dummy
- vertex_experiment: nanochat-experiment
- vertex_tensorboard: projects/247010501180/locations/us-central1/tensorboards/8180826106513850368
- device_type:
- dtype: bfloat16
- num_iterations: -1
- max_seq_len: 2048
- device_batch_size: 8
- unembedding_lr: 0.0040
- embedding_lr: 0.2000
- matrix_lr: 0.0200
- init_lr_frac: 1.0000
- weight_decay: 0.0000
- eval_every: 150
- eval_tokens: 10,485,760
- total_batch_size: 524,288
- dry_run: 0
- Number of iterations: 813
- DDP world size: 1
- Minimum validation bpb: 0.4203

1
requirements.txt Normal file
View File

@ -0,0 +1 @@
google-cloud-pipeline-components
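
requirements.txt pulls in google-cloud-pipeline-components, which supplies the CustomTrainingJobOp behind the comp-custom-training-job components in the compiled JSON above. The pipeline definition file itself is not part of this excerpt; a rough sketch of how one such task could be authored and compiled with kfp, with argument names mirroring the component inputs in the JSON (the exact CustomTrainingJobOp keyword set depends on the installed GCPC version):

# Sketch only — not the repo's actual pipeline definition, which is not shown here.
from kfp import dsl, compiler
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp

@dsl.pipeline(name="nanochat-pipeline", description="A pipeline to train NanoChat")
def nanochat_pipeline(project: str, location: str, gcs_bucket: str,
                      scheduling_strategy: str = "FLEX_START",
                      max_wait_duration: str = "0s"):
    CustomTrainingJobOp(
        project=project,
        location=location,
        display_name="nanochat-pretraining-job",
        base_output_directory=f"{gcs_bucket}/pipeline_root",
        strategy=scheduling_strategy,
        max_wait_duration=max_wait_duration,
        timeout="604800s",
        restart_job_on_worker_restart=True,
        worker_pool_specs=[{
            "machine_spec": {"machine_type": "a2-highgpu-8g",
                             "accelerator_type": "NVIDIA_TESLA_A100",
                             "accelerator_count": 8},
            "replica_count": 1,
            "disk_spec": {"boot_disk_type": "pd-ssd", "boot_disk_size_gb": 500},
            "container_spec": {"image_uri": "gcr.io/nzp-nanochat/nanochat:20251128144517",
                               "command": ["python", "vertex_pipelines/pretraining_step.py"],
                               "args": ["--gcs-bucket", gcs_bucket]},
        }],
    )

compiler.Compiler().compile(nanochat_pipeline, "demo_pipeline.json")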

View File

@ -55,7 +55,23 @@ def evaluate_model(model, tokenizer, device, max_per_task=-1):
eval_bundle_dir = os.path.join(base_dir, "eval_bundle")
# Download the eval bundle to disk (and unzip if needed)
if not os.path.exists(eval_bundle_dir):
download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
# Try to download from GCS first (faster and more reliable in Vertex AI)
# UPDATE: GCS copy seems corrupted, disabling for now to force S3 fallback
# try:
# import gcsfs
# # Assuming the data is in gs://nzp-nanochat/eval_bundle
# gcs_eval_bundle = os.environ.get('NANOCHAT_DATA_DIR', 'gs://nzp-nanochat').replace('base_data', 'eval_bundle')
# print0(f"Trying to download eval_bundle from GCS: {gcs_eval_bundle}")
# fs = gcsfs.GCSFileSystem()
# if fs.exists(gcs_eval_bundle):
# print0(f"Found eval_bundle in GCS, downloading...")
# fs.get(gcs_eval_bundle, eval_bundle_dir, recursive=True)
# print0(f"Downloaded eval_bundle from GCS to {eval_bundle_dir}")
# else:
# raise FileNotFoundError("Eval bundle not found in GCS")
# except Exception as e:
# print0(f"Could not download from GCS ({e}), falling back to AWS S3...")
download_file_with_lock(EVAL_BUNDLE_URL, "eval_bundle.zip", postprocess_fn=place_eval_bundle)
config_path = os.path.join(eval_bundle_dir, "core.yaml")
data_base_path = os.path.join(eval_bundle_dir, "eval_data")
eval_meta_data = os.path.join(eval_bundle_dir, "eval_meta_data.csv")

View File

@ -21,7 +21,7 @@ import torch
from nanochat.gpt import GPT, GPTConfig
from nanochat.dataloader import tokenizing_distributed_data_loader
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, print_banner, get_base_dir, autodetect_device_type, get_experiment_logger
from nanochat.tokenizer import get_tokenizer, get_token_bytes
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.loss_eval import evaluate_bpb
@ -31,7 +31,9 @@ print_banner()
# -----------------------------------------------------------------------------
# User settings
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
wandb_run_name = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
vertex_experiment = "" # Vertex AI experiment name
vertex_tensorboard = "" # Vertex AI TensorBoard resource name
# Runtime
device_type = "" # cuda|cpu|mps (empty => autodetect good device type default, in order: CUDA > MPS > CPU)
# Model architecture
@ -74,9 +76,18 @@ autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16)
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat", name=run, config=user_config)
# logging init
use_dummy_logger = (wandb_run_name == "dummy" and not vertex_experiment) or not master_process
if use_dummy_logger:
wandb_run = DummyWandb()
else:
class Args: pass
args = Args()
args.wandb_run = wandb_run_name
args.vertex_experiment = vertex_experiment
args.vertex_tensorboard = vertex_tensorboard
wandb_run = get_experiment_logger(args)
wandb_run.init(project="nanochat", name=wandb_run_name, config=user_config)
# Tokenizer will be useful for evaluation, also we need the vocab size
tokenizer = get_tokenizer()
@ -118,6 +129,25 @@ print0(f"Number of parameters: {num_params:,}")
num_flops_per_token = model.estimate_flops()
print0(f"Estimated FLOPs per token: {num_flops_per_token:e}")
# Try to resume from latest checkpoint in GCS
start_step = 0
output_dirname = model_tag if model_tag else f"d{depth}"
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
checkpoint_dir = data_dir.replace("/base_data", "/base_checkpoints") + f"/{output_dirname}"
try:
from nanochat.checkpoint_manager import find_last_step, load_checkpoint
last_step = find_last_step(checkpoint_dir)
print0(f"Found checkpoint at step {last_step} in {checkpoint_dir}, resuming...")
model_data, optimizer_data, meta_data = load_checkpoint(checkpoint_dir, last_step, device, load_optimizer=True)
orig_model.load_state_dict(model_data, strict=True, assign=True)
start_step = last_step
print0(f"✓ Resumed from step {start_step}")
except Exception as e:
print0(f"No checkpoint found or failed to load ({e}), starting from scratch")
start_step = 0
# Calculate number of iterations. Either it is given, or from target flops, or from target data:param ratio (in that order)
assert num_iterations > 0 or target_param_data_ratio > 0 or target_flops > 0
if num_iterations > 0:
@ -178,7 +208,11 @@ smooth_train_loss = 0 # EMA of training loss
ema_beta = 0.9 # EMA decay factor
total_training_time = 0 # total wall-clock time of training
# note that we run +1 steps only so that we can eval and save at the end
for step in range(num_iterations + 1):
mfu = 0.0
val_bpb = 0.0
flops_so_far = 0.0
results = {}
for step in range(start_step, num_iterations + 1):
last_step = step == num_iterations
flops_so_far = num_flops_per_token * total_batch_size * step
@ -240,7 +274,13 @@ for step in range(num_iterations + 1):
# save checkpoint at the end of the run (only on master process)
if master_process and last_step:
output_dirname = model_tag if model_tag else f"d{depth}" # e.g. d12
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
# Use GCS for checkpoints to ensure persistence across job failures
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
# Extract bucket and construct checkpoint path in GCS
checkpoint_dir = data_dir.replace("/base_data", "/base_checkpoints") + f"/{output_dirname}"
else:
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
save_checkpoint(
checkpoint_dir,
step,
@ -256,6 +296,31 @@ for step in range(num_iterations + 1):
}
)
# Periodic checkpointing (every 1000 steps)
if master_process and step > 0 and step % 1000 == 0:
output_dirname = model_tag if model_tag else f"d{depth}"
# Use GCS for checkpoints to ensure persistence across job failures
data_dir = os.environ.get("NANOCHAT_DATA_DIR", "")
if data_dir.startswith("gs://"):
# Extract bucket and construct checkpoint path in GCS
checkpoint_dir = data_dir.replace("/base_data", "/base_checkpoints") + f"/{output_dirname}"
else:
checkpoint_dir = os.path.join(base_dir, "base_checkpoints", output_dirname)
save_checkpoint(
checkpoint_dir,
step,
orig_model.state_dict(),
[opt.state_dict() for opt in optimizers],
{
"step": step,
"val_bpb": val_bpb,
"model_config": model_config_kwargs,
"user_config": user_config,
"device_batch_size": device_batch_size,
"max_seq_len": max_seq_len,
}
)
if last_step:
break
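
The hunks above replace the bare wandb init with `get_experiment_logger`, whose implementation lives in nanochat/common.py and is not included in this diff. Purely as an illustration of what a wandb-compatible shim over Vertex AI Experiments could look like (hypothetical class and method names, not the repo's code):

# Hypothetical sketch: a wandb-style wrapper over Vertex AI Experiments.
# The repo's real get_experiment_logger is not shown in this diff.
from google.cloud import aiplatform

class VertexExperimentLogger:
    def __init__(self, args):
        self.run_name = args.wandb_run
        self.experiment = args.vertex_experiment
        self.tensorboard = args.vertex_tensorboard or None

    def init(self, project, name, config):
        # "project" here is the logical run project name; GCP project/location
        # are assumed to come from the environment or a prior aiplatform.init().
        aiplatform.init(experiment=self.experiment,
                        experiment_tensorboard=self.tensorboard)
        aiplatform.start_run(name)
        aiplatform.log_params({k: v for k, v in config.items()
                               if isinstance(v, (int, float, str))})

    def log(self, metrics, step=None):
        aiplatform.log_metrics({k: v for k, v in metrics.items()
                                if isinstance(v, (int, float))})

    def finish(self):
        aiplatform.end_run()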

View File

@ -17,7 +17,7 @@ import torch
import torch.distributed as dist
from contextlib import nullcontext
from nanochat.common import compute_init, compute_cleanup, get_base_dir, print0, DummyWandb, autodetect_device_type
from nanochat.common import compute_init, compute_cleanup, get_base_dir, print0, DummyWandb, autodetect_device_type, get_experiment_logger
from nanochat.checkpoint_manager import load_model
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.engine import Engine
@ -31,8 +31,10 @@ from tasks.customjson import CustomJSON
from tasks.spellingbee import SimpleSpelling, SpellingBee
# -----------------------------------------------------------------------------
# SFT Hyperparameters
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
# User settings
wandb_run_name = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
vertex_experiment = "" # Vertex AI experiment name
vertex_tensorboard = "" # Vertex AI TensorBoard resource name
# input model options
source = "mid" # base|mid , which checkpoint to load the model from (base model or midtrained model)
model_tag = None # model tag to load the model from (base model or midtrained model)
@ -68,9 +70,18 @@ master_process = ddp_rank == 0
ptdtype = torch.float32 if dtype == 'float32' else torch.bfloat16
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if device_type == "cuda" else nullcontext()
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-sft", name=run, config=user_config, save_code=True)
# logging init
use_dummy_logger = (wandb_run_name == "dummy" and not vertex_experiment) or not master_process
if use_dummy_logger:
wandb_run = DummyWandb()
else:
class Args: pass
args = Args()
args.wandb_run = wandb_run_name
args.vertex_experiment = vertex_experiment
args.vertex_tensorboard = vertex_tensorboard
wandb_run = get_experiment_logger(args)
wandb_run.init(project="nanochat-sft", name=wandb_run_name, config=user_config)
# Load the model and tokenizer
model, tokenizer, meta = load_model(source, device, phase="train", model_tag=model_tag, step=step)

View File

@ -16,7 +16,7 @@ import time
import wandb
import torch
from contextlib import nullcontext
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type
from nanochat.common import compute_init, compute_cleanup, print0, DummyWandb, get_base_dir, autodetect_device_type, get_experiment_logger
from nanochat.tokenizer import get_token_bytes
from nanochat.checkpoint_manager import save_checkpoint
from nanochat.loss_eval import evaluate_bpb
@ -31,7 +31,9 @@ from tasks.customjson import CustomJSON
from tasks.spellingbee import SimpleSpelling, SpellingBee
# -----------------------------------------------------------------------------
run = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
wandb_run_name = "dummy" # wandb run name default ("dummy" is special - we won't log to wandb)
vertex_experiment = "" # Vertex AI experiment name
vertex_tensorboard = "" # Vertex AI TensorBoard resource name
device_type = "" # cuda|cpu|mps (empty => autodetect)
model_tag = None # model tag to load the model from (base model or midtrained model)
step = None # step to load the model from (base model or midtrained model)
@ -58,12 +60,20 @@ device_type = autodetect_device_type() if device_type == "" else device_type
ddp, ddp_rank, ddp_local_rank, ddp_world_size, device = compute_init(device_type)
master_process = ddp_rank == 0
autocast_ctx = torch.amp.autocast(device_type=device_type, dtype=torch.bfloat16) if device_type == "cuda" else nullcontext()
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
get_max_memory = torch.cuda.max_memory_allocated if device_type == "cuda" else lambda: 0
synchronize = torch.cuda.synchronize if device_type == "cuda" else lambda: None
# wandb logging init
use_dummy_wandb = run == "dummy" or not master_process
wandb_run = DummyWandb() if use_dummy_wandb else wandb.init(project="nanochat-mid", name=run, config=user_config)
# logging init
use_dummy_logger = (wandb_run_name == "dummy" and not vertex_experiment) or not master_process
if use_dummy_logger:
wandb_run = DummyWandb()
else:
class Args: pass
args = Args()
args.wandb_run = wandb_run_name
args.vertex_experiment = vertex_experiment
args.vertex_tensorboard = vertex_tensorboard
wandb_run = get_experiment_logger(args)
wandb_run.init(project="nanochat-mid", name=wandb_run_name, config=user_config)
# Load the model and tokenizer
model, tokenizer, meta = load_model("base", device, phase="train", model_tag=model_tag, step=step)
@ -170,6 +180,11 @@ def get_muon_momentum(it):
momentum = (1 - frac) * 0.85 + frac * 0.95
return momentum
def get_max_memory():
if torch.cuda.is_available():
return torch.cuda.max_memory_allocated()
return 0
# -----------------------------------------------------------------------------
# Training loop
x, y = next(train_loader) # prefetch the very first batch of data

25
test_custom_job_args.py Normal file
View File

@ -0,0 +1,25 @@
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
try:
op = CustomTrainingJobOp(
project="p",
location="l",
display_name="d",
worker_pool_specs=[],
scheduling={"strategy": "SPOT"}
)
print("Success with scheduling")
except TypeError as e:
print(f"Failed with scheduling: {e}")
try:
op = CustomTrainingJobOp(
project="p",
location="l",
display_name="d",
worker_pool_specs=[],
timeout="1s"
)
print("Success with timeout")
except TypeError as e:
print(f"Failed with timeout: {e}")

35
test_torchrun.py Normal file
View File

@ -0,0 +1,35 @@
#!/usr/bin/env python3
"""Test torchrun command locally"""
import subprocess
import sys
# Simulate the exact command that will run
cmd = [
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.base_train",
"--depth=4",
"--device_batch_size=1",
"--num_iterations=2",
"--run=test_local",
"--vertex_experiment=",
"--vertex_tensorboard="
]
print("Testing command:")
print(" ".join(cmd))
print()
try:
result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
print("STDOUT:")
print(result.stdout[:1000])
print("\nSTDERR:")
print(result.stderr[:1000])
print(f"\nExit code: {result.returncode}")
sys.exit(result.returncode)
except subprocess.TimeoutExpired:
print("Command timed out (expected for training)")
sys.exit(0)
except Exception as e:
print(f"Error: {e}")
sys.exit(1)

202
uv.lock
View File

@ -287,18 +287,6 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188, upload-time = "2024-12-21T18:38:41.666Z" },
]
[[package]]
name = "click-option-group"
version = "0.5.7"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
]
sdist = { url = "https://files.pythonhosted.org/packages/b9/9f/1f917934da4e07ae7715a982347e3c2179556d8a58d1108c5da3e8f09c76/click_option_group-0.5.7.tar.gz", hash = "sha256:8dc780be038712fc12c9fecb3db4fe49e0d0723f9c171d7cda85c20369be693c", size = 22110, upload-time = "2025-03-24T13:24:55.897Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/93/27/bf74dc1494625c3b14dbcdb93740defd7b8c58dae3736be8d264f2a643fb/click_option_group-0.5.7-py3-none-any.whl", hash = "sha256:96b9f52f397ef4d916f81929bd6c1f85e89046c7a401a64e72a61ae74ad35c24", size = 11483, upload-time = "2025-03-24T13:24:54.611Z" },
]
[[package]]
name = "colorama"
version = "0.4.6"
@ -529,16 +517,77 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/01/61/d4b89fec821f72385526e1b9d9a3a0385dda4a72b206d28049e2c7cd39b8/gitpython-3.1.45-py3-none-any.whl", hash = "sha256:8908cb2e02fb3b93b7eb0f2827125cb699869470432cc885f019b8fd0fccff77", size = 208168, upload-time = "2025-07-24T03:45:52.517Z" },
]
[[package]]
name = "google-api-core"
version = "2.25.2"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version >= '3.14' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
]
dependencies = [
{ name = "google-auth", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "googleapis-common-protos", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "proto-plus", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "protobuf", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "requests", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/09/cd/63f1557235c2440fe0577acdbc32577c5c002684c58c7f4d770a92366a24/google_api_core-2.25.2.tar.gz", hash = "sha256:1c63aa6af0d0d5e37966f157a77f9396d820fba59f9e43e9415bc3dc5baff300", size = 166266, upload-time = "2025-10-03T00:07:34.778Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/c8/d8/894716a5423933f5c8d2d5f04b16f052a515f78e815dab0c2c6f1fd105dc/google_api_core-2.25.2-py3-none-any.whl", hash = "sha256:e9a8f62d363dc8424a8497f4c2a47d6bcda6c16514c935629c257ab5d10210e7", size = 162489, upload-time = "2025-10-03T00:07:32.924Z" },
]
[package.optional-dependencies]
grpc = [
{ name = "grpcio", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "grpcio-status", marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
[[package]]
name = "google-api-core"
version = "2.28.1"
source = { registry = "https://pypi.org/simple" }
resolution-markers = [
"python_full_version == '3.13.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform != 'darwin' and sys_platform != 'linux' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'darwin' and extra == 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.13.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.12.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform == 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version == '3.11.*' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
"python_full_version < '3.11' and sys_platform != 'linux' and extra != 'extra-8-nanochat-cpu' and extra != 'extra-8-nanochat-gpu'",
]
dependencies = [
{ name = "google-auth" },
{ name = "googleapis-common-protos" },
{ name = "proto-plus" },
{ name = "protobuf" },
{ name = "requests" },
{ name = "google-auth", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "googleapis-common-protos", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "proto-plus", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "protobuf", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "requests", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/61/da/83d7043169ac2c8c7469f0e375610d78ae2160134bf1b80634c482fa079c/google_api_core-2.28.1.tar.gz", hash = "sha256:2b405df02d68e68ce0fbc138559e6036559e685159d148ae5861013dc201baf8", size = 176759, upload-time = "2025-10-28T21:34:51.529Z" }
wheels = [
@ -547,8 +596,8 @@ wheels = [
[package.optional-dependencies]
grpc = [
{ name = "grpcio" },
{ name = "grpcio-status" },
{ name = "grpcio", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "grpcio-status", marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
[[package]]
@ -571,7 +620,8 @@ version = "1.124.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "docstring-parser" },
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-bigquery" },
{ name = "google-cloud-resource-manager" },
@ -594,7 +644,8 @@ name = "google-cloud-bigquery"
version = "3.38.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-core" },
{ name = "google-resumable-media" },
@ -612,7 +663,8 @@ name = "google-cloud-core"
version = "2.5.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core" },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
]
sdist = { url = "https://files.pythonhosted.org/packages/a6/03/ef0bc99d0e0faf4fdbe67ac445e18cdaa74824fd93cd069e7bb6548cb52d/google_cloud_core-2.5.0.tar.gz", hash = "sha256:7c1b7ef5c92311717bd05301aa1a91ffbc565673d3b0b4163a52d8413a186963", size = 36027, upload-time = "2025-10-29T23:17:39.513Z" }
@ -625,7 +677,8 @@ name = "google-cloud-resource-manager"
version = "1.15.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core", extra = ["grpc"] },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, extra = ["grpc"], marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "grpc-google-iam-v1" },
{ name = "grpcio" },
@ -639,19 +692,20 @@ wheels = [
[[package]]
name = "google-cloud-storage"
version = "3.4.1"
version = "2.19.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "google-api-core" },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-core" },
{ name = "google-crc32c" },
{ name = "google-resumable-media" },
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/bd/ef/7cefdca67a6c8b3af0ec38612f9e78e5a9f6179dd91352772ae1a9849246/google_cloud_storage-3.4.1.tar.gz", hash = "sha256:6f041a297e23a4b485fad8c305a7a6e6831855c208bcbe74d00332a909f82268", size = 17238203, upload-time = "2025-10-08T18:43:39.665Z" }
sdist = { url = "https://files.pythonhosted.org/packages/36/76/4d965702e96bb67976e755bed9828fa50306dca003dbee08b67f41dd265e/google_cloud_storage-2.19.0.tar.gz", hash = "sha256:cd05e9e7191ba6cb68934d8eb76054d9be4562aa89dbc4236feee4d7d51342b2", size = 5535488, upload-time = "2024-12-05T01:35:06.49Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/83/6e/b47d83d3a35231c6232566341b0355cce78fd4e6988a7343725408547b2c/google_cloud_storage-3.4.1-py3-none-any.whl", hash = "sha256:972764cc0392aa097be8f49a5354e22eb47c3f62370067fb1571ffff4a1c1189", size = 290142, upload-time = "2025-10-08T18:43:37.524Z" },
{ url = "https://files.pythonhosted.org/packages/d5/94/6db383d8ee1adf45dc6c73477152b82731fa4c4a46d9c1932cc8757e0fd4/google_cloud_storage-2.19.0-py2.py3-none-any.whl", hash = "sha256:aeb971b5c29cf8ab98445082cbfe7b161a1f48ed275822f59ed3f1524ea54fba", size = 131787, upload-time = "2024-12-05T01:35:04.736Z" },
]
[[package]]
@ -814,16 +868,16 @@ wheels = [
[[package]]
name = "grpcio-status"
version = "1.76.0"
version = "1.62.3"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "googleapis-common-protos" },
{ name = "grpcio" },
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3f/46/e9f19d5be65e8423f886813a2a9d0056ba94757b0c5007aa59aed1a961fa/grpcio_status-1.76.0.tar.gz", hash = "sha256:25fcbfec74c15d1a1cb5da3fab8ee9672852dc16a5a9eeb5baf7d7a9952943cd", size = 13679, upload-time = "2025-10-21T16:28:52.545Z" }
sdist = { url = "https://files.pythonhosted.org/packages/7c/d7/013ef01c5a1c2fd0932c27c904934162f69f41ca0f28396d3ffe4d386123/grpcio-status-1.62.3.tar.gz", hash = "sha256:289bdd7b2459794a12cf95dc0cb727bd4a1742c37bd823f760236c937e53a485", size = 13063, upload-time = "2024-08-06T00:37:08.003Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/8c/cc/27ba60ad5a5f2067963e6a858743500df408eb5855e98be778eaef8c9b02/grpcio_status-1.76.0-py3-none-any.whl", hash = "sha256:380568794055a8efbbd8871162df92012e0228a5f6dffaf57f2a00c534103b18", size = 14425, upload-time = "2025-10-21T16:28:40.853Z" },
{ url = "https://files.pythonhosted.org/packages/90/40/972271de05f9315c0d69f9f7ebbcadd83bc85322f538637d11bb8c67803d/grpcio_status-1.62.3-py3-none-any.whl", hash = "sha256:f9049b762ba8de6b1086789d8315846e094edac2c50beaf462338b301a8fd4b8", size = 14448, upload-time = "2024-08-06T00:30:15.702Z" },
]
[[package]]
@ -929,13 +983,13 @@ wheels = [
[[package]]
name = "kfp"
version = "2.14.6"
version = "2.8.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "click" },
{ name = "click-option-group" },
{ name = "docstring-parser" },
{ name = "google-api-core" },
{ name = "google-api-core", version = "2.25.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-api-core", version = "2.28.1", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.14' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
{ name = "google-auth" },
{ name = "google-cloud-storage" },
{ name = "kfp-pipeline-spec" },
@ -947,26 +1001,22 @@ dependencies = [
{ name = "tabulate" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/3b/c1/d01724ccb7faaf3ecf2a8109de1d7eebb0afa1f292d6dcd650755b990d59/kfp-2.14.6.tar.gz", hash = "sha256:9e94ff2e74465c27393736c295b6dc478b29cf9d0264950019b5167c7c53fd2e", size = 274267, upload-time = "2025-10-13T20:08:46.072Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/1f/46/789f883750b0f6c321450832e2f07203139716cb9422cad6f3d286298915/kfp-2.14.6-py3-none-any.whl", hash = "sha256:2d76aff91d8461e837989c2dc966c9dddaba7fcc37b7b8be4b0564282b1f613d", size = 374048, upload-time = "2025-10-13T20:08:44.275Z" },
]
sdist = { url = "https://files.pythonhosted.org/packages/51/ee/dbf636afac86c7701245ea4f424e2b38038eee51f3731e22f2777e232bbb/kfp-2.8.0.tar.gz", hash = "sha256:06ad584eecbe80318c6cd0231c95a432e91fec56f201def9d511b6e6664235ce", size = 594413, upload-time = "2024-06-22T09:03:47.265Z" }
[[package]]
name = "kfp-pipeline-spec"
version = "2.14.6"
version = "0.3.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "protobuf" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7b/be/a8aa41bbe65c0578f141f615f30829e68bdc087542248d20a84316252228/kfp_pipeline_spec-2.14.6.tar.gz", hash = "sha256:a4943b0bdf6d991db35ca3a261caf77997676512970959bf9909742df58e2a87", size = 10255, upload-time = "2025-10-13T20:06:29.544Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/56/c7/a331cdb987d5c1764c309e6c9f596a695cfd8fe86ea95fc8a9fbc052cf52/kfp_pipeline_spec-2.14.6-py3-none-any.whl", hash = "sha256:82cbad2976f248f7049be37d241f1e47ecb3d99e720dfd0cab3e0881be458516", size = 9550, upload-time = "2025-10-13T20:06:28.544Z" },
{ url = "https://files.pythonhosted.org/packages/63/0a/269a792545cf8a87a30b84bebe69a2b07c483b2887690e8f48c9a91e8060/kfp_pipeline_spec-0.3.0-py3-none-any.whl", hash = "sha256:1db84524a0a2d6c9d36e7e87e6fa0e181bf1ba1513d29dcd54f7b8822e7a52a2", size = 12598, upload-time = "2024-01-10T00:24:34.83Z" },
]
[[package]]
name = "kfp-server-api"
version = "2.14.6"
version = "2.0.5"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
@ -974,27 +1024,27 @@ dependencies = [
{ name = "six" },
{ name = "urllib3" },
]
sdist = { url = "https://files.pythonhosted.org/packages/7b/9d/47f38ed0914bbf6c7e70693b805d822b0848d2f79cce0aa2addb2a7b2f67/kfp-server-api-2.14.6.tar.gz", hash = "sha256:eabf673f384186968d88cff9674cd39c655537aad1abacda78086575924d6bfc", size = 64327, upload-time = "2025-10-15T15:43:52.999Z" }
sdist = { url = "https://files.pythonhosted.org/packages/d9/4b/1b1c9286047e78ebc9de2a9d4d43921d6efb5e6550fdc38229127a03aa53/kfp-server-api-2.0.5.tar.gz", hash = "sha256:c9cfbf0e87271d3bfe96e5ecc9ffbdd6ab566bc1c9a9ddc2a39d7698a16e26ff", size = 63401, upload-time = "2023-12-08T19:21:48.908Z" }
[[package]]
name = "kubernetes"
version = "30.1.0"
version = "26.1.0"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "certifi" },
{ name = "google-auth" },
{ name = "oauthlib" },
{ name = "python-dateutil" },
{ name = "pyyaml" },
{ name = "requests" },
{ name = "requests-oauthlib" },
{ name = "setuptools" },
{ name = "six" },
{ name = "urllib3" },
{ name = "websocket-client" },
]
sdist = { url = "https://files.pythonhosted.org/packages/82/3c/9f29f6cab7f35df8e54f019e5719465fa97b877be2454e99f989270b4f34/kubernetes-30.1.0.tar.gz", hash = "sha256:41e4c77af9f28e7a6c314e3bd06a8c6229ddd787cad684e0ab9f69b498e98ebc", size = 887810, upload-time = "2024-06-06T15:58:30.031Z" }
sdist = { url = "https://files.pythonhosted.org/packages/34/19/2f351c0eaf05234dc33a6e0ffc7894e9dedab0ff341311c5b4ba44f2d8ac/kubernetes-26.1.0.tar.gz", hash = "sha256:5854b0c508e8d217ca205591384ab58389abdae608576f9c9afc35a3c76a366c", size = 736370, upload-time = "2023-02-16T01:04:37.088Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/62/a1/2027ddede72d33be2effc087580aeba07e733a7360780ae87226f1f91bd8/kubernetes-30.1.0-py2.py3-none-any.whl", hash = "sha256:e212e8b7579031dd2e512168b617373bc1e03888d41ac4e04039240a292d478d", size = 1706042, upload-time = "2024-06-06T15:58:27.13Z" },
{ url = "https://files.pythonhosted.org/packages/74/21/ada0c5eedb678ab663f8e387734418fdd1a26be28fc919a0c32e52964047/kubernetes-26.1.0-py2.py3-none-any.whl", hash = "sha256:e3db6800abf7e36c38d2629b5cb6b74d10988ee0cba6fba45595a7cbe60c0042", size = 1446361, upload-time = "2023-02-16T01:04:34.33Z" },
]
[[package]]
@ -1057,26 +1107,26 @@ wheels = [
[[package]]
name = "maturin"
version = "1.9.4"
version = "1.9.6"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "tomli", marker = "python_full_version < '3.11' or (extra == 'extra-8-nanochat-cpu' and extra == 'extra-8-nanochat-gpu')" },
]
sdist = { url = "https://files.pythonhosted.org/packages/13/7c/b11b870fc4fd84de2099906314ce45488ae17be32ff5493519a6cddc518a/maturin-1.9.4.tar.gz", hash = "sha256:235163a0c99bc6f380fb8786c04fd14dcf6cd622ff295ea3de525015e6ac40cf", size = 213647, upload-time = "2025-08-27T11:37:57.079Z" }
sdist = { url = "https://files.pythonhosted.org/packages/9a/35/c3370188492f4c139c7a318f438d01b8185c216303c49c4bc885c98b6afb/maturin-1.9.6.tar.gz", hash = "sha256:2c2ae37144811d365509889ed7220b0598487f1278c2441829c3abf56cc6324a", size = 214846, upload-time = "2025-10-07T12:45:08.408Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/f2/90/0d99389eea1939116fca841cad0763600c8d3183a02a9478d066736c60e8/maturin-1.9.4-py3-none-linux_armv6l.whl", hash = "sha256:6ff37578e3f5fdbe685110d45f60af1f5a7dfce70a1e26dfe3810af66853ecae", size = 8276133, upload-time = "2025-08-27T11:37:23.325Z" },
{ url = "https://files.pythonhosted.org/packages/f4/ed/c8ec68b383e50f084bf1fa9605e62a90cd32a3f75d9894ed3a6e5d4cc5b3/maturin-1.9.4-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:f3837bb53611b2dafa1c090436c330f2d743ba305ef00d8801a371f4495e7e1b", size = 15994496, upload-time = "2025-08-27T11:37:27.092Z" },
{ url = "https://files.pythonhosted.org/packages/84/4e/401ff5f3cfc6b123364d4b94379bf910d7baee32c9c95b72784ff2329357/maturin-1.9.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:4227d627d8e3bfe45877a8d65e9d8351a9d01434549f0da75d2c06a1b570de58", size = 8362228, upload-time = "2025-08-27T11:37:31.181Z" },
{ url = "https://files.pythonhosted.org/packages/51/8e/c56176dd360da9650c62b8a5ecfb85432cf011e97e46c186901e6996002e/maturin-1.9.4-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:1bb2aa0fa29032e9c5aac03ac400396ddea12cadef242f8967e9c8ef715313a1", size = 8271397, upload-time = "2025-08-27T11:37:33.672Z" },
{ url = "https://files.pythonhosted.org/packages/d2/46/001fcc5c6ad509874896418d6169a61acd619df5b724f99766308c44a99f/maturin-1.9.4-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:a0868d52934c8a5d1411b42367633fdb5cd5515bec47a534192282167448ec30", size = 8775625, upload-time = "2025-08-27T11:37:35.86Z" },
{ url = "https://files.pythonhosted.org/packages/b4/2e/26fa7574f01c19b7a74680fd70e5bae2e8c40fed9683d1752e765062cc2b/maturin-1.9.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:68b7b833b25741c0f553b78e8b9e095b31ae7c6611533b3c7b71f84c2cb8fc44", size = 8051117, upload-time = "2025-08-27T11:37:38.278Z" },
{ url = "https://files.pythonhosted.org/packages/73/ee/ca7308832d4f5b521c1aa176d9265f6f93e0bd1ad82a90fd9cd799f6b28c/maturin-1.9.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:08dc86312afee55af778af919818632e35d8d0464ccd79cb86700d9ea560ccd7", size = 8132122, upload-time = "2025-08-27T11:37:40.499Z" },
{ url = "https://files.pythonhosted.org/packages/45/e8/c623955da75e801a06942edf1fdc4e772a9e8fbc1ceebbdc85d59584dc10/maturin-1.9.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:ef20ffdd943078c4c3699c29fb2ed722bb6b4419efdade6642d1dbf248f94a70", size = 10586762, upload-time = "2025-08-27T11:37:42.718Z" },
{ url = "https://files.pythonhosted.org/packages/3c/4b/19ad558fdf54e151b1b4916ed45f1952ada96684ee6db64f9cd91cabec09/maturin-1.9.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:368e958468431dfeec80f75eea9639b4356d8c42428b0128444424b083fecfb0", size = 8926988, upload-time = "2025-08-27T11:37:45.492Z" },
{ url = "https://files.pythonhosted.org/packages/7e/27/153ad15eccae26921e8a01812da9f3b7f9013368f8f92c36853f2043b2a3/maturin-1.9.4-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:273f879214f63f79bfe851cd7d541f8150bdbfae5dfdc3c0c4d125d02d1f41b4", size = 8536758, upload-time = "2025-08-27T11:37:48.213Z" },
{ url = "https://files.pythonhosted.org/packages/43/e3/f304c3bdc3fba9adebe5348d4d2dd015f1152c0a9027aaf52cae0bb182c8/maturin-1.9.4-py3-none-win32.whl", hash = "sha256:ed2e54d132ace7e61829bd49709331007dd9a2cc78937f598aa76a4f69b6804d", size = 7265200, upload-time = "2025-08-27T11:37:50.881Z" },
{ url = "https://files.pythonhosted.org/packages/14/14/f86d0124bf1816b99005c058a1dbdca7cb5850d9cf4b09dcae07a1bc6201/maturin-1.9.4-py3-none-win_amd64.whl", hash = "sha256:8e450bb2c9afdf38a0059ee2e1ec2b17323f152b59c16f33eb9c74edaf1f9f79", size = 8237391, upload-time = "2025-08-27T11:37:53.23Z" },
{ url = "https://files.pythonhosted.org/packages/3f/25/8320fc2591e45b750c3ae71fa596b47aefa802d07d6abaaa719034a85160/maturin-1.9.4-py3-none-win_arm64.whl", hash = "sha256:7a6f980a9b67a5c13c844c268eabd855b54a6a765df4b4bb07d15a990572a4c9", size = 6988277, upload-time = "2025-08-27T11:37:55.429Z" },
{ url = "https://files.pythonhosted.org/packages/55/5c/b435418ba4ba2647a1f7a95d53314991b1e556e656ae276dea993c3bce1d/maturin-1.9.6-py3-none-linux_armv6l.whl", hash = "sha256:26e3ab1a42a7145824210e9d763f6958f2c46afb1245ddd0bab7d78b1f59bb3f", size = 8134483, upload-time = "2025-10-07T12:44:44.274Z" },
{ url = "https://files.pythonhosted.org/packages/4d/1c/8e58eda6601f328b412cdeeaa88a9b6a10e591e2a73f313e8c0154d68385/maturin-1.9.6-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:5263dda3f71feef2e4122baf5c4620e4b3710dbb7f2121f85a337182de214369", size = 15776470, upload-time = "2025-10-07T12:44:47.476Z" },
{ url = "https://files.pythonhosted.org/packages/6c/33/8c967cce6848cdd87a2e442c86120ac644b80c5ed4c32e3291bde6a17df8/maturin-1.9.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:fe78262c2800c92f67d1ce3c0f6463f958a692cc67bfb572e5dbf5b4b696a8ba", size = 8226557, upload-time = "2025-10-07T12:44:49.844Z" },
{ url = "https://files.pythonhosted.org/packages/58/bd/3e2675cdc8b7270700ba30c663c852a35694441732a107ac30ebd6878bd8/maturin-1.9.6-py3-none-manylinux_2_12_i686.manylinux2010_i686.musllinux_1_1_i686.whl", hash = "sha256:7ab827c6e8c022eb2e1e7fb6deede54549c8460b20ccc2e9268cc6e8cde957a8", size = 8166544, upload-time = "2025-10-07T12:44:51.396Z" },
{ url = "https://files.pythonhosted.org/packages/58/1f/a2047ddf2230e700d5f8a13dd4b9af5ce806ad380c32e58105888205926e/maturin-1.9.6-py3-none-manylinux_2_12_x86_64.manylinux2010_x86_64.musllinux_1_1_x86_64.whl", hash = "sha256:0246202377c49449315305209f45c8ecef6e2d6bd27a04b5b6f1ab3e4ea47238", size = 8641010, upload-time = "2025-10-07T12:44:53.658Z" },
{ url = "https://files.pythonhosted.org/packages/be/1f/265d63c7aa6faf363d4a3f23396f51bc6b4d5c7680a4190ae68dba25dea2/maturin-1.9.6-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.musllinux_1_1_aarch64.whl", hash = "sha256:f5bac167700fbb6f8c8ed1a97b494522554b4432d7578e11403b894b6a91d99f", size = 7965945, upload-time = "2025-10-07T12:44:55.248Z" },
{ url = "https://files.pythonhosted.org/packages/4c/ca/a8e61979ccfe080948bcc1bddd79356157aee687134df7fb013050cec783/maturin-1.9.6-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.musllinux_1_1_armv7l.whl", hash = "sha256:7f53d3b1d8396d3fea3e1ee5fd37558bca5719090f3d194ba1c02b0b56327ae3", size = 7978820, upload-time = "2025-10-07T12:44:56.919Z" },
{ url = "https://files.pythonhosted.org/packages/bf/4a/81b412f8ad02a99801ef19ec059fba0822d1d28fb44cb6a92e722f05f278/maturin-1.9.6-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.musllinux_1_1_ppc64le.whl", hash = "sha256:7f506eb358386d94d6ec3208c003130cf4b69cab26034fc0cbbf8bf83afa4c2e", size = 10452064, upload-time = "2025-10-07T12:44:58.232Z" },
{ url = "https://files.pythonhosted.org/packages/5b/12/cc96c7a8cb51d8dcc9badd886c361caa1526fba7fa69d1e7892e613b71d4/maturin-1.9.6-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f2d6984ab690af509f525dbd2b130714207c06ebb14a5814edbe1e42b17ae0de", size = 8852401, upload-time = "2025-10-07T12:44:59.8Z" },
{ url = "https://files.pythonhosted.org/packages/51/8e/653ac3c9f2c25cdd81aefb0a2d17ff140ca5a14504f5e3c7f94dcfe4dbb7/maturin-1.9.6-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:5c2252b0956bb331460ac750c805ddf0d9b44442449fc1f16e3b66941689d0bc", size = 8425057, upload-time = "2025-10-07T12:45:01.711Z" },
{ url = "https://files.pythonhosted.org/packages/db/29/f13490328764ae9bfc1da55afc5b707cebe4fa75ad7a1573bfa82cfae0c6/maturin-1.9.6-py3-none-win32.whl", hash = "sha256:f2c58d29ebdd4346fd004e6be213d071fdd94a77a16aa91474a21a4f9dbf6309", size = 7165956, upload-time = "2025-10-07T12:45:03.766Z" },
{ url = "https://files.pythonhosted.org/packages/db/9f/dd51e5ac1fce47581b8efa03d77a03f928c0ef85b6e48a61dfa37b6b85a2/maturin-1.9.6-py3-none-win_amd64.whl", hash = "sha256:1b39a5d82572c240d20d9e8be024d722dfb311d330c5e28ddeb615211755941a", size = 8145722, upload-time = "2025-10-07T12:45:05.487Z" },
{ url = "https://files.pythonhosted.org/packages/65/f2/e97aaba6d0d78c5871771bf9dd71d4eb8dac15df9109cf452748d2207412/maturin-1.9.6-py3-none-win_arm64.whl", hash = "sha256:ac02a30083553d2a781c10cd6f5480119bf6692fd177e743267406cad2ad198c", size = 6857006, upload-time = "2025-10-07T12:45:06.813Z" },
]
[[package]]
@ -1254,7 +1304,7 @@ requires-dist = [
{ name = "files-to-prompt", specifier = ">=0.6" },
{ name = "google-cloud-aiplatform", specifier = ">=1.25.0" },
{ name = "google-cloud-storage", specifier = ">=2.10.0" },
{ name = "kfp", specifier = ">=2.0.0" },
{ name = "kfp", specifier = "==2.8.0" },
{ name = "psutil", specifier = ">=7.1.0" },
{ name = "regex", specifier = ">=2025.9.1" },
{ name = "setuptools", specifier = ">=80.9.0" },
@ -1702,16 +1752,16 @@ wheels = [
[[package]]
name = "protobuf"
version = "6.32.0"
version = "4.25.8"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/c0/df/fb4a8eeea482eca989b51cffd274aac2ee24e825f0bf3cbce5281fa1567b/protobuf-6.32.0.tar.gz", hash = "sha256:a81439049127067fc49ec1d36e25c6ee1d1a2b7be930675f919258d03c04e7d2", size = 440614, upload-time = "2025-08-14T21:21:25.015Z" }
sdist = { url = "https://files.pythonhosted.org/packages/df/01/34c8d2b6354906d728703cb9d546a0e534de479e25f1b581e4094c4a85cc/protobuf-4.25.8.tar.gz", hash = "sha256:6135cf8affe1fc6f76cced2641e4ea8d3e59518d1f24ae41ba97bcad82d397cd", size = 380920, upload-time = "2025-05-28T14:22:25.153Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/33/18/df8c87da2e47f4f1dcc5153a81cd6bca4e429803f4069a299e236e4dd510/protobuf-6.32.0-cp310-abi3-win32.whl", hash = "sha256:84f9e3c1ff6fb0308dbacb0950d8aa90694b0d0ee68e75719cb044b7078fe741", size = 424409, upload-time = "2025-08-14T21:21:12.366Z" },
{ url = "https://files.pythonhosted.org/packages/e1/59/0a820b7310f8139bd8d5a9388e6a38e1786d179d6f33998448609296c229/protobuf-6.32.0-cp310-abi3-win_amd64.whl", hash = "sha256:a8bdbb2f009cfc22a36d031f22a625a38b615b5e19e558a7b756b3279723e68e", size = 435735, upload-time = "2025-08-14T21:21:15.046Z" },
{ url = "https://files.pythonhosted.org/packages/cc/5b/0d421533c59c789e9c9894683efac582c06246bf24bb26b753b149bd88e4/protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d52691e5bee6c860fff9a1c86ad26a13afbeb4b168cd4445c922b7e2cf85aaf0", size = 426449, upload-time = "2025-08-14T21:21:16.687Z" },
{ url = "https://files.pythonhosted.org/packages/ec/7b/607764ebe6c7a23dcee06e054fd1de3d5841b7648a90fd6def9a3bb58c5e/protobuf-6.32.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:501fe6372fd1c8ea2a30b4d9be8f87955a64d6be9c88a973996cef5ef6f0abf1", size = 322869, upload-time = "2025-08-14T21:21:18.282Z" },
{ url = "https://files.pythonhosted.org/packages/40/01/2e730bd1c25392fc32e3268e02446f0d77cb51a2c3a8486b1798e34d5805/protobuf-6.32.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:75a2aab2bd1aeb1f5dc7c5f33bcb11d82ea8c055c9becbb41c26a8c43fd7092c", size = 322009, upload-time = "2025-08-14T21:21:19.893Z" },
{ url = "https://files.pythonhosted.org/packages/9c/f2/80ffc4677aac1bc3519b26bc7f7f5de7fce0ee2f7e36e59e27d8beb32dd1/protobuf-6.32.0-py3-none-any.whl", hash = "sha256:ba377e5b67b908c8f3072a57b63e2c6a4cbd18aea4ed98d2584350dbf46f2783", size = 169287, upload-time = "2025-08-14T21:21:23.515Z" },
{ url = "https://files.pythonhosted.org/packages/45/ff/05f34305fe6b85bbfbecbc559d423a5985605cad5eda4f47eae9e9c9c5c5/protobuf-4.25.8-cp310-abi3-win32.whl", hash = "sha256:504435d831565f7cfac9f0714440028907f1975e4bed228e58e72ecfff58a1e0", size = 392745, upload-time = "2025-05-28T14:22:10.524Z" },
{ url = "https://files.pythonhosted.org/packages/08/35/8b8a8405c564caf4ba835b1fdf554da869954712b26d8f2a98c0e434469b/protobuf-4.25.8-cp310-abi3-win_amd64.whl", hash = "sha256:bd551eb1fe1d7e92c1af1d75bdfa572eff1ab0e5bf1736716814cdccdb2360f9", size = 413736, upload-time = "2025-05-28T14:22:13.156Z" },
{ url = "https://files.pythonhosted.org/packages/28/d7/ab27049a035b258dab43445eb6ec84a26277b16105b277cbe0a7698bdc6c/protobuf-4.25.8-cp37-abi3-macosx_10_9_universal2.whl", hash = "sha256:ca809b42f4444f144f2115c4c1a747b9a404d590f18f37e9402422033e464e0f", size = 394537, upload-time = "2025-05-28T14:22:14.768Z" },
{ url = "https://files.pythonhosted.org/packages/bd/6d/a4a198b61808dd3d1ee187082ccc21499bc949d639feb948961b48be9a7e/protobuf-4.25.8-cp37-abi3-manylinux2014_aarch64.whl", hash = "sha256:9ad7ef62d92baf5a8654fbb88dac7fa5594cfa70fd3440488a5ca3bfc6d795a7", size = 294005, upload-time = "2025-05-28T14:22:16.052Z" },
{ url = "https://files.pythonhosted.org/packages/d6/c6/c9deaa6e789b6fc41b88ccbdfe7a42d2b82663248b715f55aa77fbc00724/protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl", hash = "sha256:83e6e54e93d2b696a92cad6e6efc924f3850f82b52e1563778dfab8b355101b0", size = 294924, upload-time = "2025-05-28T14:22:17.105Z" },
{ url = "https://files.pythonhosted.org/packages/0c/c1/6aece0ab5209981a70cd186f164c133fdba2f51e124ff92b73de7fd24d78/protobuf-4.25.8-py3-none-any.whl", hash = "sha256:15a0af558aa3b13efef102ae6e4f3efac06f1eea11afb3a57db2901447d9fb59", size = 156757, upload-time = "2025-05-28T14:22:24.135Z" },
]
[[package]]
@ -2097,14 +2147,14 @@ wheels = [
[[package]]
name = "requests-toolbelt"
version = "1.0.0"
version = "0.10.1"
source = { registry = "https://pypi.org/simple" }
dependencies = [
{ name = "requests" },
]
sdist = { url = "https://files.pythonhosted.org/packages/f3/61/d7545dafb7ac2230c70d38d31cbfe4cc64f7144dc41f6e4e4b78ecd9f5bb/requests-toolbelt-1.0.0.tar.gz", hash = "sha256:7681a0a3d047012b5bdc0ee37d7f8f07ebe76ab08caeccfc3921ce23c88d5bc6", size = 206888, upload-time = "2023-05-01T04:11:33.229Z" }
sdist = { url = "https://files.pythonhosted.org/packages/0c/4c/07f01c6ac44f7784fa399137fbc8d0cdc1b5d35304e8c0f278ad82105b58/requests-toolbelt-0.10.1.tar.gz", hash = "sha256:62e09f7ff5ccbda92772a29f394a49c3ad6cb181d568b1337626b2abb628a63d", size = 208956, upload-time = "2022-10-25T03:14:58.576Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/3f/51/d4db610ef29373b879047326cbf6fa98b6c1969d6f6dc423279de2b1be2c/requests_toolbelt-1.0.0-py2.py3-none-any.whl", hash = "sha256:cccfdd665f0a24fcf4726e690f65639d272bb0637b9b92dfd91a5568ccf6bd06", size = 54481, upload-time = "2023-05-01T04:11:28.427Z" },
{ url = "https://files.pythonhosted.org/packages/05/d3/bf87a36bff1cb88fd30a509fd366c70ec30676517ee791b2f77e0e29817a/requests_toolbelt-0.10.1-py2.py3-none-any.whl", hash = "sha256:18565aa58116d9951ac39baa288d3adb5b3ff975c4f25eee78555d89e8f247f7", size = 54525, upload-time = "2022-10-25T03:14:55.289Z" },
]
[[package]]
@ -2632,11 +2682,11 @@ wheels = [
[[package]]
name = "urllib3"
version = "2.5.0"
version = "1.26.20"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/15/22/9ee70a2574a4f4599c47dd506532914ce044817c7752a79b6a51286319bc/urllib3-2.5.0.tar.gz", hash = "sha256:3fc47733c7e419d4bc3f6b3dc2b4f890bb743906a30d56ba4a5bfa4bbff92760", size = 393185, upload-time = "2025-06-18T14:07:41.644Z" }
sdist = { url = "https://files.pythonhosted.org/packages/e4/e8/6ff5e6bc22095cfc59b6ea711b687e2b7ed4bdb373f7eeec370a97d7392f/urllib3-1.26.20.tar.gz", hash = "sha256:40c2dc0c681e47eb8f90e7e27bf6ff7df2e677421fd46756da1161c39ca70d32", size = 307380, upload-time = "2024-08-29T15:43:11.37Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a7/c2/fe1e52489ae3122415c51f387e221dd0773709bad6c6cdaa599e8a2c5185/urllib3-2.5.0-py3-none-any.whl", hash = "sha256:e6b01673c0fa6a13e374b50871808eb3bf7046c4b125b216f6bf1cc604cff0dc", size = 129795, upload-time = "2025-06-18T14:07:40.39Z" },
{ url = "https://files.pythonhosted.org/packages/33/cf/8435d5a7159e2a9c83a95896ed596f68cf798005fe107cc655b5c5c14704/urllib3-1.26.20-py2.py3-none-any.whl", hash = "sha256:0ed14ccfbf1c30a9072c7ca157e4319b70d65f623e91e7b32fadb2853431016e", size = 144225, upload-time = "2024-08-29T15:43:08.921Z" },
]
[[package]]

69
verify_runtime_params.py Normal file
View File

@ -0,0 +1,69 @@
#!/usr/bin/env python3
"""
Verify that scheduling_strategy and max_wait_duration are runtime parameters
"""
import json
import sys
def verify_runtime_parameters():
print("=== Verifying Runtime Scheduling Parameters ===\n")
# Load compiled pipeline
try:
with open('nanochat_pipeline.json', 'r') as f:
pipeline = json.load(f)
except FileNotFoundError:
print("❌ Error: nanochat_pipeline.json not found")
print(" Run: python3 vertex_pipelines/pipeline.py --gcp-project nzp-nanochat ...")
return False
# Check root input parameters
root_params = pipeline['root']['inputDefinitions']['parameters']
print("✓ Pipeline root parameters:")
for param in ['scheduling_strategy', 'max_wait_duration']:
if param in root_params:
info = root_params[param]
print(f"{param}:")
print(f" Type: {info['parameterType']}")
print(f" Default: {info.get('defaultValue', 'N/A')}")
print(f" Optional: {info.get('isOptional', False)}")
else:
print(f" ❌ Missing: {param}")
return False
print()
# Check custom-training-job task parameters
custom_job_task = pipeline['root']['dag']['tasks']['custom-training-job']
task_params = custom_job_task['inputs']['parameters']
print("✓ Custom Job task parameter bindings:")
for param in ['strategy', 'max_wait_duration']:
if param in task_params:
binding = task_params[param]
if 'componentInputParameter' in binding:
print(f"{param}{binding['componentInputParameter']}")
elif 'runtimeValue' in binding:
print(f"{param} → runtime constant (not parameterized!)")
return False
else:
print(f" ❌ Missing: {param}")
return False
print()
print("=== Verification Summary ===")
print("✅ scheduling_strategy is a RUNTIME parameter")
print("✅ max_wait_duration is a RUNTIME parameter")
print("✅ Both are correctly bound to Custom Job inputs")
print()
print("Benefits:")
print(" • No recompilation needed to change FLEX_START ↔ SPOT ↔ STANDARD")
print(" • No Docker rebuild needed for deployment strategy changes")
print(" • Single pipeline JSON can be reused with different strategies")
print()
return True
if __name__ == "__main__":
success = verify_runtime_parameters()
sys.exit(0 if success else 1)

View File

@ -1,5 +1,6 @@
# Use the official Python 3.10 image.
FROM python:3.10-slim
# Use Google Cloud's Deep Learning Container for PyTorch with GPU support
# This image is optimized for Vertex AI and includes CUDA, cuDNN, and PyTorch
FROM gcr.io/deeplearning-platform-release/pytorch-gpu.2-2.py310
# Set the working directory.
WORKDIR /app
@ -20,6 +21,9 @@ RUN uv venv
# Install Python dependencies using uv.
RUN uv sync --extra gpu
# Install the nanochat package in editable mode
RUN uv pip install -e .
# Install maturin, which is a build dependency.
RUN uv pip install maturin
@ -27,5 +31,8 @@ RUN uv pip install maturin
# The maturin executable from the venv should be on the PATH now.
RUN maturin develop --release --manifest-path rustbpe/Cargo.toml
# Set the entrypoint.
ENTRYPOINT ["python"]
# Set PYTHONPATH to include /app so that nanochat module can be imported
ENV PYTHONPATH="/app:${PYTHONPATH}"
# Set the entrypoint to use the virtual environment's Python.
ENTRYPOINT ["/app/.venv/bin/python"]

View File

@ -0,0 +1,135 @@
#!/bin/bash
PROJECT="nzp-nanochat"
MACHINE_TYPE="g2-standard-4" # Smallest L4 machine type
IMAGE_FAMILY="debian-12"
IMAGE_PROJECT="debian-cloud"
# Parse debug flag
DEBUG=false
if [[ "${1:-}" == "--debug" ]]; then
DEBUG=true
echo "Debug mode enabled - will show GCP error messages"
echo ""
fi
echo "=== Testing L4 GPU Availability Across ALL Global Regions ==="
echo "This will attempt to create small L4 instances and immediately delete them"
echo "Order: US -> Europe -> Others. Stops at first success."
echo ""
# Get all regions dynamically
echo "Fetching all GCP regions..."
ALL_REGIONS=$(gcloud compute regions list --project="$PROJECT" --format="value(name)" 2>/dev/null | sort)
REGION_COUNT=$(echo "$ALL_REGIONS" | wc -l | tr -d ' ')
echo "Found $REGION_COUNT regions to test"
echo ""
RESULTS_FILE=$(mktemp)
ERROR_LOG=$(mktemp)
# Order regions: US first, then Europe, then others
ordered_regions=$(echo "$ALL_REGIONS" | tr ' ' '\n' | grep '^us-' || true)
ordered_regions+=$'\n'
ordered_regions+=$(echo "$ALL_REGIONS" | tr ' ' '\n' | grep '^europe-' || true)
ordered_regions+=$'\n'
ordered_regions+=$(echo "$ALL_REGIONS" | tr ' ' '\n' | grep -vE '^(us-|europe-)' || true)
# Remove empty lines
ordered_regions=$(echo "$ordered_regions" | sed '/^$/d')
current=0
found_any=false
# Iterate over ordered list
for region in $ordered_regions; do
current=$((current + 1))
echo "[$current/$REGION_COUNT] Testing region: $region"
# Get zones for region
zones=$(gcloud compute zones list --project="$PROJECT" --filter="region:$region" --format="value(name)" 2>/dev/null)
if [ -z "$zones" ]; then
echo " ⚠️ No zones found for region $region"
continue
fi
found_capacity=false
available_zone=""
for zone in $zones; do
echo -n " Checking zone $zone... "
instance_name="test-l4-capacity-$$-$(date +%s)"
# Try to create instance - capture stderr
error_output=$(mktemp)
if gcloud compute instances create "$instance_name" \
--zone="$zone" \
--machine-type="$MACHINE_TYPE" \
--accelerator="type=nvidia-l4,count=1" \
--image-family="$IMAGE_FAMILY" \
--image-project="$IMAGE_PROJECT" \
--boot-disk-size=200GB \
--boot-disk-type=pd-standard \
--network="nanochat-network" \
--no-address \
--shielded-secure-boot \
--maintenance-policy=TERMINATE \
--project="$PROJECT" \
--quiet \
2>"$error_output"; then
echo "✅ AVAILABLE"
available_zone="$zone"
found_capacity=true
# Delete instance
gcloud compute instances delete "$instance_name" --zone="$zone" --project="$PROJECT" --quiet 2>/dev/null || true
rm -f "$error_output"
break
else
echo "❌ No capacity"
if [ "$DEBUG" = true ]; then
echo " ERROR DETAILS:"
sed 's/^/ /' "$error_output"
cat "$error_output" >> "$ERROR_LOG"
fi
rm -f "$error_output"
fi
done
if [ "$found_capacity" = true ]; then
echo "$region: ✅ Available in $available_zone" >> "$RESULTS_FILE"
echo ""
echo "✅ Found capacity in $region ($available_zone). Stopping further checks."
found_any=true
break
else
echo "$region: ❌ No capacity in any zone" >> "$RESULTS_FILE"
fi
echo ""
done
# Print summary (will only contain up to first successful region)
echo "=========================================================="
echo " L4 GPU AVAILABILITY SUMMARY (GLOBAL) "
echo "=========================================================="
cat "$RESULTS_FILE" | sort
echo "=========================================================="
echo ""
if [ "$found_any" = true ]; then
echo "✅ Recommendation: Use the region marked with ✅ above."
else
echo "❌ No L4 capacity found in any tested region."
fi
# Cleanup
rm -f "$RESULTS_FILE"
if [ "$DEBUG" = true ]; then
echo "Debug log: $ERROR_LOG"
else
rm -f "$ERROR_LOG"
fi
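Creating and deleting instances is the definitive capacity probe, but a cheaper first pass is to list which zones even offer the accelerator. A minimal sketch (same project assumed) that shells out to gcloud compute accelerator-types list; note this only shows where nvidia-l4 exists, not whether capacity is currently free:

# Sketch: list zones that advertise nvidia-l4 before running the capacity probe above.
import subprocess

out = subprocess.run(
    ["gcloud", "compute", "accelerator-types", "list",
     "--project", "nzp-nanochat",
     "--filter", "name=nvidia-l4",
     "--format", "value(zone)"],
    check=True, capture_output=True, text=True,
).stdout
zones = sorted(set(out.split()))
print(f"nvidia-l4 offered in {len(zones)} zones:")
for z in zones:
    print(" ", z)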

View File

@ -3,9 +3,9 @@ steps:
args:
- 'build'
- '-t'
- 'gcr.io/$PROJECT_ID/nanochat:latest'
- '$_IMAGE_NAME'
- '.'
- '-f'
- 'vertex_pipelines/Dockerfile'
images:
- 'gcr.io/$PROJECT_ID/nanochat:latest'
- '$_IMAGE_NAME'

View File

@ -0,0 +1,127 @@
#!/usr/bin/env python3
"""
Data download step for Vertex AI Pipeline.
Downloads training data shards from HuggingFace and uploads to GCS.
"""
import argparse
import os
import subprocess
import tempfile
from google.cloud import storage
def download_and_upload_data(gcs_bucket: str, num_shards: int = 50):
"""
Download training data shards and upload to GCS.
Args:
gcs_bucket: GCS bucket path (e.g., 'gs://nzp-nanochat')
num_shards: Number of parquet shards to download (default: 50 for testing)
"""
# Extract bucket name from gs:// path
bucket_name = gcs_bucket.replace("gs://", "").split("/")[0]
prefix = "/".join(gcs_bucket.replace("gs://", "").split("/")[1:]) if "/" in gcs_bucket.replace("gs://", "") else ""
# Check if data already exists
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_data_path = f"{prefix}/base_data" if prefix else "base_data"
blobs = list(bucket.list_blobs(prefix=gcs_data_path))
parquet_blobs = [b for b in blobs if b.name.endswith('.parquet')]
if len(parquet_blobs) >= num_shards:
print(f"Found {len(parquet_blobs)} parquet files in gs://{bucket_name}/{gcs_data_path}")
print(f"Skipping download as {num_shards} shards were requested and sufficient data exists.")
return
# Create temporary directory for downloads
with tempfile.TemporaryDirectory() as temp_dir:
print(f"Downloading {num_shards} data shards to {temp_dir}...")
local_data_dir = os.path.join(temp_dir, "base_data")
os.makedirs(local_data_dir, exist_ok=True)
# Set environment variable for nanochat dataset module
os.environ["NANOCHAT_DATA_DIR"] = local_data_dir
# Download data using nanochat's dataset module
print(f"Running: python -m nanochat.dataset -n {num_shards}")
subprocess.run([
"python", "-m", "nanochat.dataset", "-n", str(num_shards)
], check=True)
# Upload to GCS
print(f"Uploading data to gs://{bucket_name}/{prefix}/base_data/...")
# Upload all parquet files
parquet_files = [f for f in os.listdir(local_data_dir) if f.endswith('.parquet')]
print(f"Found {len(parquet_files)} parquet files to upload")
for i, filename in enumerate(parquet_files):
local_path = os.path.join(local_data_dir, filename)
gcs_path = f"{prefix}/base_data/{filename}" if prefix else f"base_data/{filename}"
blob = bucket.blob(gcs_path)
print(f"Uploading {i+1}/{len(parquet_files)}: {filename}")
blob.upload_from_filename(local_path)
print(f"Successfully uploaded {len(parquet_files)} data shards to GCS")
# Verify upload
gcs_data_path = f"gs://{bucket_name}/{prefix}/base_data" if prefix else f"gs://{bucket_name}/base_data"
print(f"Data is now available at: {gcs_data_path}")
# Download and upload eval bundle
print("Downloading eval bundle from Karpathy's S3...")
import urllib.request
import zipfile
eval_bundle_url = "https://karpathy-public.s3.us-west-2.amazonaws.com/eval_bundle.zip"
eval_bundle_path = "/tmp/eval_bundle.zip"
eval_bundle_extracted = "/tmp/eval_bundle"
urllib.request.urlretrieve(eval_bundle_url, eval_bundle_path)
print(f"Downloaded eval_bundle.zip to {eval_bundle_path}")
# Extract and upload to GCS
with zipfile.ZipFile(eval_bundle_path, 'r') as zip_ref:
zip_ref.extractall("/tmp")
# Upload eval_bundle directory to GCS
print("Uploading eval bundle to GCS...")
eval_bundle_files = []
for root, dirs, files in os.walk(eval_bundle_extracted):
for file in files:
local_file_path = os.path.join(root, file)
relative_path = os.path.relpath(local_file_path, "/tmp")
eval_bundle_files.append((local_file_path, relative_path))
for local_file_path, relative_path in eval_bundle_files:
gcs_path = f"{prefix}/{relative_path}" if prefix else relative_path
blob = bucket.blob(gcs_path)
blob.upload_from_filename(local_file_path)
print(f"Uploaded {len(eval_bundle_files)} eval bundle files to GCS")
def main():
parser = argparse.ArgumentParser(description="Download and upload training data to GCS")
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket path")
parser.add_argument("--num-shards", type=int, default=50, help="Number of data shards to download")
args = parser.parse_args()
print("=" * 80)
print("DATA DOWNLOAD STEP")
print("=" * 80)
print(f"GCS Bucket: {args.gcs_bucket}")
print(f"Number of shards: {args.num_shards}")
print("=" * 80)
download_and_upload_data(args.gcs_bucket, args.num_shards)
print("=" * 80)
print("DATA DOWNLOAD COMPLETE")
print("=" * 80)
if __name__ == "__main__":
main()
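After this step runs, the shard count in the bucket can be checked independently. A minimal sketch, assuming the same gs://<bucket>/base_data layout written above (bucket name is illustrative):

# Sketch: count parquet shards under base_data/ after the download step.
from google.cloud import storage

bucket = storage.Client().bucket("nzp-nanochat")  # assumed bucket name
shards = [b.name for b in bucket.list_blobs(prefix="base_data/") if b.name.endswith(".parquet")]
print(f"{len(shards)} parquet shards in gs://nzp-nanochat/base_data/")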

View File

@ -1,37 +1,149 @@
import os
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
parser.add_argument("--vertex-experiment", type=str, default="", help="Vertex AI experiment name")
parser.add_argument("--vertex-tensorboard", type=str, default="", help="Vertex AI TensorBoard resource name")
parser.add_argument("--device-batch-size", type=int, default=16, help="Device batch size")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Download the identity conversations dataset.
subprocess.run([
"curl", "-L", "-o",
f"{get_base_dir()}/identity_conversations.jsonl",
"https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
], check=True)
# Check if midtraining checkpoint already exists (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_mid_ckpt_path = os.path.join(prefix, "mid_checkpoints") if prefix else "mid_checkpoints"
# Check for model.pt (the key checkpoint file)
# Note: mid_train.py saves to f"d{depth}" where depth defaults to 20 (inherited from base model)
depth = 20
gcs_mid_ckpt_path = os.path.join(gcs_mid_ckpt_path, f"d{depth}")
checkpoint_exists = bucket.blob(os.path.join(gcs_mid_ckpt_path, "model.pt")).exists()
if checkpoint_exists:
print(f"✓ Midtraining checkpoint already exists in gs://{bucket_name}/{gcs_mid_ckpt_path}")
print("Skipping midtraining (already completed)")
return
# Run mid-training.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.mid_train", "--",
f"--run={args.wandb_run}"
], check=True)
print(f"Midtraining checkpoint not found. Running midtraining...")
# Evaluate the model.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.chat_eval", "--",
"-i", "mid"
], check=True)
# Set local tmp dir for temporary files
local_base_dir = "/tmp/nanochat"
os.makedirs(local_base_dir, exist_ok=True)
# Download tokenizer from GCS
print("Downloading tokenizer from GCS...")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
download_directory_from_gcs(bucket_name, gcs_tokenizer_path, local_tokenizer_dir)
# Download base checkpoints from GCS
print("Downloading base checkpoints from GCS...")
gcs_base_checkpoints_path = os.path.join(prefix, "base_checkpoints") if prefix else "base_checkpoints"
local_base_checkpoints_dir = os.path.join(local_base_dir, "base_checkpoints")
download_directory_from_gcs(bucket_name, gcs_base_checkpoints_path, local_base_checkpoints_dir)
# Download report dir from GCS
print("Downloading report dir from GCS...")
gcs_report_path = os.path.join(prefix, "report") if prefix else "report"
local_report_dir = os.path.join(local_base_dir, "report")
download_directory_from_gcs(bucket_name, gcs_report_path, local_report_dir)
# Ensure report directory exists even if nothing was downloaded
os.makedirs(local_report_dir, exist_ok=True)
try:
# Download the identity conversations dataset; midtraining needs it and
# scripts/mid_train.py does not appear to fetch it itself, so pull it into the
# local base dir here (as the previous version of this step did).
print("Downloading identity conversations...")
subprocess.run([
"curl", "-L", "-o",
f"{local_base_dir}/identity_conversations.jsonl",
"https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
], check=True)
# Mid-train the model.
print("Starting midtraining...")
env = os.environ.copy()
env["NANOCHAT_BASE_DIR"] = local_base_dir
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.mid_train",
f"--device_batch_size={args.device_batch_size}",
f"--wandb_run_name={args.wandb_run}",
f"--vertex_experiment={args.vertex_experiment}",
f"--vertex_tensorboard={args.vertex_tensorboard}"
], check=True, env=env)
# Evaluate the model.
print("Running chat_eval (mid)...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.chat_eval", "--",
"-i", "mid"
], check=True, env=env)
except subprocess.CalledProcessError as e:
print(f"Error during midtraining steps: {e}")
raise
# Upload checkpoints to GCS
print("Uploading artifacts to GCS...")
# Upload mid_checkpoints
local_checkpoints_dir = os.path.join(local_base_dir, "mid_checkpoints")
gcs_checkpoints_path = os.path.join(prefix, "mid_checkpoints") if prefix else "mid_checkpoints"
if os.path.exists(local_checkpoints_dir):
upload_directory_to_gcs(local_checkpoints_dir, bucket_name, gcs_checkpoints_path)
else:
print(f"Warning: {local_checkpoints_dir} does not exist.")
# Upload report dir
if os.path.exists(local_report_dir):
upload_directory_to_gcs(local_report_dir, bucket_name, gcs_report_path)
if __name__ == "__main__":
main()
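The checkpoint-detection pattern above (look for model.pt in GCS, skip the stage if present) also appears in the pretraining, SFT, and report steps. A minimal sketch of it factored into a shared helper; the helper name and call are illustrative, not part of this commit:

# Sketch: shared "already done?" check used by the pipeline steps.
from google.cloud import storage

def stage_checkpoint_exists(bucket_name: str, prefix: str, subdir: str, depth: int = 20) -> bool:
    """Return True if <prefix>/<subdir>/d<depth>/model.pt already exists in GCS."""
    parts = [p for p in (prefix, subdir, f"d{depth}", "model.pt") if p]
    return storage.Client().bucket(bucket_name).blob("/".join(parts)).exists()

# Example: skip midtraining if its checkpoint is already in the bucket.
if stage_checkpoint_exists("nzp-nanochat", "", "mid_checkpoints"):
    print("Midtraining already completed, skipping.")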

View File

@ -1,74 +1,325 @@
import os
import kfp
from kfp.v2 import dsl
from kfp.v2.compiler import Compiler
from kfp import dsl
from kfp.compiler import Compiler
from google.cloud import aiplatform
from google_cloud_pipeline_components.v1.custom_job import CustomTrainingJobOp
@dsl.pipeline(name="nanochat-pipeline")
def nanochat_pipeline(gcs_bucket: str, docker_image_uri: str, wandb_run: str = "dummy"):
# Global configuration for accelerator type
ACCELERATOR_TYPE = 'NVIDIA_L4'
# Read image URI from environment variable.
# This allows compiling the pipeline with a specific image without passing it as a PipelineParam,
# which avoids issues with dsl.ContainerSpec.
DOCKER_IMAGE_URI = os.environ.get("DOCKER_IMAGE_URI", "gcr.io/nzp-nanochat/nanochat:latest")
@dsl.container_component
def tokenizer_step(gcs_bucket: str) -> dsl.ContainerSpec:
"""
A Vertex AI pipeline for training and evaluating a nanochat model.
Tokenizer component.
"""
tokenizer_op = dsl.ContainerOp(
name="tokenizer",
image=docker_image_uri,
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/tokenizer_step.py"],
arguments=["--gcs-bucket", gcs_bucket],
args=["--gcs-bucket", gcs_bucket],
)
pretraining_op = dsl.ContainerOp(
name="pretraining",
image=docker_image_uri,
command=["python", "vertex_pipelines/pretraining_step.py"],
arguments=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run],
).after(tokenizer_op)
midtraining_op = dsl.ContainerOp(
name="midtraining",
image=docker_image_uri,
@dsl.container_component
def midtraining_step(gcs_bucket: str, wandb_run: str, vertex_experiment: str, vertex_tensorboard: str) -> dsl.ContainerSpec:
"""
Midtraining component.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/midtraining_step.py"],
arguments=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run],
).after(pretraining_op)
args=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run, "--vertex-experiment", vertex_experiment, "--vertex-tensorboard", vertex_tensorboard],
)
sft_op = dsl.ContainerOp(
name="sft",
image=docker_image_uri,
@dsl.container_component
def sft_step(gcs_bucket: str, wandb_run: str, vertex_experiment: str, vertex_tensorboard: str) -> dsl.ContainerSpec:
"""
SFT component.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/sft_step.py"],
arguments=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run],
).after(midtraining_op)
args=["--gcs-bucket", gcs_bucket, "--wandb-run", wandb_run, "--vertex-experiment", vertex_experiment, "--vertex-tensorboard", vertex_tensorboard],
)
report_op = dsl.ContainerOp(
name="report",
image=docker_image_uri,
@dsl.container_component
def data_download_step(gcs_bucket: str, num_shards: int = 50):
"""
Data download component - downloads training data from HuggingFace to GCS.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/data_download_step.py"],
args=["--gcs-bucket", gcs_bucket, "--num-shards", str(num_shards)],
)
@dsl.container_component
def report_step(gcs_bucket: str) -> dsl.ContainerSpec:
"""
Report component.
"""
return dsl.ContainerSpec(
image=DOCKER_IMAGE_URI,
command=["python", "vertex_pipelines/report_step.py"],
arguments=["--gcs-bucket", gcs_bucket],
).after(sft_op)
args=["--gcs-bucket", gcs_bucket],
)
# dsl.pipeline is a decorator that runs at module import time, so a pipeline defined at
# module level is fixed before CLI arguments are seen. To vary the accelerator
# configuration per invocation, the pipeline function is built inside a factory
# (create_pipeline_func below).
def create_pipeline_func(accelerator_type, accelerator_count, is_preemptible):
@dsl.pipeline(
name="nanochat-pipeline",
description="A pipeline to train NanoChat",
)
def nanochat_pipeline(
gcs_bucket: str,
project: str,
location: str,
wandb_run: str = "dummy",
vertex_experiment: str = "",
vertex_tensorboard: str = "",
num_data_shards: int = 20,
scheduling_strategy: str = "FLEX_START",
max_wait_duration: str = "0s",
service_account: str = "",
device_batch_size: int = 8
):
# Data download step
data_download_task = data_download_step(
gcs_bucket=gcs_bucket,
num_shards=num_data_shards
)
data_download_task.set_cpu_limit('8').set_memory_limit('32G')
# Tokenizer step
tokenizer_task = tokenizer_step(gcs_bucket=gcs_bucket)
tokenizer_task.set_cpu_limit('8').set_memory_limit('32G')
# Pretraining step using CustomTrainingJobOp
# Define worker pool specs
# Note: We use the same image and command as before
worker_pool_specs = [{
"machine_spec": {
"machine_type": "a2-highgpu-1g" if accelerator_type == "NVIDIA_TESLA_A100" and accelerator_count == 1 else "a2-highgpu-8g" if accelerator_type == "NVIDIA_TESLA_A100" and accelerator_count == 8 else "n1-standard-16", # Fallback/Logic needs to be robust
"accelerator_type": accelerator_type,
"accelerator_count": accelerator_count,
},
"replica_count": 1,
"disk_spec": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 500,
},
"container_spec": {
"image_uri": DOCKER_IMAGE_URI,
"command": ["python", "vertex_pipelines/pretraining_step.py"],
"args": [
"--gcs-bucket", gcs_bucket,
"--wandb-run", wandb_run,
"--vertex-experiment", vertex_experiment,
"--vertex-tensorboard", vertex_tensorboard,
"--device-batch-size", str(device_batch_size)
],
},
}]
# Map the accelerator type/count to a compatible machine type.
# A100 40GB: a2-highgpu-1g/2g/4g/8g; L4: g2-standard-4 (1 GPU) or g2-standard-96 (8 GPUs).
# Unrecognized combinations fall back to n1-standard-16.
machine_type = "n1-standard-16" # Default
if accelerator_type == "NVIDIA_TESLA_A100":
if accelerator_count == 1: machine_type = "a2-highgpu-1g"
elif accelerator_count == 2: machine_type = "a2-highgpu-2g"
elif accelerator_count == 4: machine_type = "a2-highgpu-4g"
elif accelerator_count == 8: machine_type = "a2-highgpu-8g"
elif accelerator_type == "NVIDIA_L4":
if accelerator_count == 1: machine_type = "g2-standard-4"
elif accelerator_count == 8: machine_type = "g2-standard-96"
worker_pool_specs[0]["machine_spec"]["machine_type"] = machine_type
# Scheduling strategy is now a runtime parameter
# Common values:
# FLEX_START: Dynamic Workload Scheduler - queues jobs when resources unavailable
# SPOT: Preemptible instances (deprecated in favor of FLEX_START)
# STANDARD: Standard on-demand instances
# max_wait_duration: "0s" = wait indefinitely, "3600s" = 1 hour, "86400s" = 24 hours
pretraining_task = CustomTrainingJobOp(
project=project,
location=location,
display_name="nanochat-pretraining-job",
worker_pool_specs=worker_pool_specs,
base_output_directory=f"{gcs_bucket}/pipeline_root",
timeout="604800s", # 7 days
restart_job_on_worker_restart=True,
strategy=scheduling_strategy,
max_wait_duration=max_wait_duration,
service_account=service_account,
tensorboard=vertex_tensorboard,
).after(tokenizer_task)
# CustomTrainingJobOp returns a Model (if configured) or just the job resource.
# We don't need to set resources/accelerators on the task itself because they are in worker_pool_specs.
# Mid-training step - use same resources as pretraining (A100s on FLEX)
mid_worker_pool_specs = [{
"machine_spec": worker_pool_specs[0]["machine_spec"],
"replica_count": 1,
"disk_spec": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 500,
},
"container_spec": {
"image_uri": DOCKER_IMAGE_URI,
"command": ["python", "vertex_pipelines/midtraining_step.py"],
"args": [
"--gcs-bucket", gcs_bucket,
"--wandb-run", wandb_run,
"--vertex-experiment", vertex_experiment,
"--vertex-tensorboard", vertex_tensorboard,
"--device-batch-size", str(device_batch_size),
],
},
}]
midtraining_task = CustomTrainingJobOp(
project=project,
location=location,
display_name="nanochat-midtraining-job",
worker_pool_specs=mid_worker_pool_specs,
base_output_directory=f"{gcs_bucket}/pipeline_root",
service_account=service_account,
strategy=scheduling_strategy,
max_wait_duration=max_wait_duration,
).after(pretraining_task)
# SFT step - use same resources as pretraining (A100s on FLEX)
sft_worker_pool_specs = [{
"machine_spec": worker_pool_specs[0]["machine_spec"],
"replica_count": 1,
"disk_spec": {
"boot_disk_type": "pd-ssd",
"boot_disk_size_gb": 500,
},
"container_spec": {
"image_uri": DOCKER_IMAGE_URI,
"command": ["python", "vertex_pipelines/sft_step.py"],
"args": [
"--gcs-bucket", gcs_bucket,
"--wandb-run", wandb_run,
"--vertex-experiment", vertex_experiment,
"--vertex-tensorboard", vertex_tensorboard,
],
},
}]
sft_task = CustomTrainingJobOp(
project=project,
location=location,
display_name="nanochat-sft-job",
worker_pool_specs=sft_worker_pool_specs,
base_output_directory=f"{gcs_bucket}/pipeline_root",
service_account=service_account,
strategy=scheduling_strategy,
max_wait_duration=max_wait_duration,
).after(midtraining_task)
report_task = report_step(gcs_bucket=gcs_bucket).after(sft_task)
report_task.set_cpu_limit('2').set_memory_limit('8G')
return nanochat_pipeline
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser()
parser.add_argument("--gcp-project", type=str, required=True)
parser.add_argument("--gcp-project", type=str, required=False) # Optional if we don't run it here
parser.add_argument("--gcs-bucket", type=str, required=True)
parser.add_argument("--pipeline-root", type=str, required=True)
parser.add_argument("--docker-image-uri", type=str, required=True)
parser.add_argument("--pipeline-root", type=str, required=False)
parser.add_argument("--region", type=str, default="us-central1")
parser.add_argument("--wandb-run", type=str, default="dummy")
parser.add_argument("--vertex-experiment", type=str, default="")
parser.add_argument("--vertex-tensorboard", type=str, default="")
parser.add_argument("--accelerator-type", type=str, default="NVIDIA_L4")
parser.add_argument("--accelerator-count", type=int, default=1)
parser.add_argument("--num-data-shards", type=int, default=20)
parser.add_argument("--preemptible", type=str, default="false")
parser.add_argument("--scheduling-strategy", type=str, default=None, help="Scheduling strategy: FLEX_START, SPOT, or STANDARD")
parser.add_argument("--max-wait-duration", type=str, default=None, help="Max wait duration for FLEX_START, e.g., '0s', '3600s'")
parser.add_argument("--service-account", type=str, required=False, help="Service account to run the pipeline")
parser.add_argument("--device-batch-size", type=int, default=8, help="Batch size per device")
parser.add_argument("--template_path", type=str, default="nanochat_pipeline.json")
args = parser.parse_args()
is_preemptible = args.preemptible.lower() == "true"
# Set smart defaults for scheduling strategy based on preemptible flag
if args.scheduling_strategy is None:
scheduling_strategy = "FLEX_START" if is_preemptible else "STANDARD"
else:
scheduling_strategy = args.scheduling_strategy
if args.max_wait_duration is None:
max_wait_duration = "0s" if is_preemptible else "86400s"
else:
max_wait_duration = args.max_wait_duration
# Create the pipeline function dynamically with captured arguments
pipeline_func = create_pipeline_func(
accelerator_type=args.accelerator_type,
accelerator_count=args.accelerator_count,
is_preemptible=is_preemptible
)
Compiler().compile(
pipeline_func=nanochat_pipeline,
package_path="nanochat_pipeline.json",
pipeline_func=pipeline_func,
package_path=args.template_path,
)
aiplatform.init(project=args.gcp_project, location=args.region)
job = aiplatform.PipelineJob(
display_name="nanochat-pipeline",
template_path="nanochat_pipeline.json",
pipeline_root=args.pipeline_root,
parameter_values={
"gcs_bucket": args.gcs_bucket,
"docker_image_uri": args.docker_image_uri,
},
)
job.run()
# Initialize Vertex AI SDK
if args.gcp_project:
aiplatform.init(project=args.gcp_project, location=args.region)
job = aiplatform.PipelineJob(
display_name="nanochat-pipeline",
template_path=args.template_path,
pipeline_root=args.pipeline_root,
parameter_values={
"gcs_bucket": args.gcs_bucket,
"project": args.gcp_project,
"location": args.region,
"wandb_run": args.wandb_run,
"vertex_experiment": args.vertex_experiment,
"vertex_tensorboard": args.vertex_tensorboard,
"num_data_shards": args.num_data_shards,
"scheduling_strategy": scheduling_strategy,
"max_wait_duration": max_wait_duration,
"service_account": args.service_account,
"device_batch_size": args.device_batch_size,
},
)
# Run the pipeline
# service_account is optional but recommended
job.run(
service_account=args.service_account,
sync=True # Block until completion or failure to ensure we see logs
)
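The accelerator-to-machine-type mapping inside create_pipeline_func could also live in a small lookup helper, which makes the supported combinations explicit. A sketch under the same assumptions (only the A100 and L4 shapes listed above are covered; the helper name is illustrative):

# Sketch: explicit accelerator -> machine type mapping for the worker pool specs.
_MACHINE_TYPES = {
    ("NVIDIA_TESLA_A100", 1): "a2-highgpu-1g",
    ("NVIDIA_TESLA_A100", 2): "a2-highgpu-2g",
    ("NVIDIA_TESLA_A100", 4): "a2-highgpu-4g",
    ("NVIDIA_TESLA_A100", 8): "a2-highgpu-8g",
    ("NVIDIA_L4", 1): "g2-standard-4",
    ("NVIDIA_L4", 8): "g2-standard-96",
}

def machine_type_for(accelerator_type: str, accelerator_count: int) -> str:
    """Return a machine type for the given accelerator, falling back to n1-standard-16."""
    return _MACHINE_TYPES.get((accelerator_type, accelerator_count), "n1-standard-16")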

View File

@ -1,35 +1,189 @@
import os
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
parser.add_argument("--vertex-experiment", type=str, default="", help="Vertex AI experiment name")
parser.add_argument("--vertex-tensorboard", type=str, default="", help="Vertex AI TensorBoard resource name")
parser.add_argument("--device-batch-size", type=int, default=8, help="Batch size per device")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Pre-train the d20 model.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.base_train", "--",
"--depth=20", f"--run={args.wandb_run}"
], check=True)
# Check if pretraining checkpoint already exists (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_base_ckpt_path = os.path.join(prefix, "base_checkpoints") if prefix else "base_checkpoints"
# Check for model.pt (the key checkpoint file)
# Note: base_train.py saves to f"d{depth}" where depth defaults to 20
depth = 20
gcs_base_ckpt_path = os.path.join(gcs_base_ckpt_path, f"d{depth}")
checkpoint_exists = bucket.blob(os.path.join(gcs_base_ckpt_path, "model.pt")).exists()
if checkpoint_exists:
print(f"✓ Pretraining checkpoint already exists in gs://{bucket_name}/{gcs_base_ckpt_path}")
print("Skipping pretraining (already completed)")
return
# Evaluate the model on a larger chunk of train/val data and draw some samples.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.base_loss"
], check=True)
print(f"Pretraining checkpoint not found. Running pretraining...")
# Evaluate the model on CORE tasks.
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=8",
"-m", "scripts.base_eval"
], check=True)
# Set local base dir
local_base_dir = "/tmp/nanochat"
os.environ["NANOCHAT_BASE_DIR"] = local_base_dir
os.makedirs(local_base_dir, exist_ok=True)
# Set data dir to GCS so we stream/cache data there
gcs_data_path = f"gs://{bucket_name}/{prefix}/base_data" if prefix else f"gs://{bucket_name}/base_data"
# Clean up double slashes if any
gcs_data_path = gcs_data_path.replace("//base_data", "/base_data")
os.environ["NANOCHAT_DATA_DIR"] = gcs_data_path
print(f"Set NANOCHAT_DATA_DIR to {gcs_data_path}")
# Download tokenizer from GCS to local disk
print("Downloading tokenizer from GCS...")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
download_directory_from_gcs(bucket_name, gcs_tokenizer_path, local_tokenizer_dir)
try:
# Diagnostic: Check if PyTorch can see CUDA
import torch
print(f"PRE-TRAINING DIAGNOSTICS:")
print(f" torch.cuda.is_available(): {torch.cuda.is_available()}")
print(f" torch.__version__: {torch.__version__}")
if torch.cuda.is_available():
print(f" torch.version.cuda: {torch.version.cuda}")
print(f" torch.cuda.device_count(): {torch.cuda.device_count()}")
for i in range(torch.cuda.device_count()):
print(f" GPU {i}: {torch.cuda.get_device_name(i)}")
# Print environment variables
env_vars = ["LD_LIBRARY_PATH", "PATH", "CUDA_VISIBLE_DEVICES", "NVIDIA_VISIBLE_DEVICES"]
for var in env_vars:
print(f" env {var}: {os.environ.get(var, 'NOT SET')}")
# Per-GPU batch size guidance: A100 80GB handles ~32 for good MFU (~38-40GB used),
# A100 40GB needs ~8. The default of 8 (overridable via --device-batch-size) is safe
# for the 40GB distributed case. speedrun.sh trains depth 20, as the torchrun call below does.
# Detect how many GPUs torchrun should drive.
gpu_count = torch.cuda.device_count()
print(f"Detected {gpu_count} GPUs. Configuring distributed training...")
print("Starting pretraining...")
subprocess.run([
"torchrun", "--standalone", f"--nproc_per_node={gpu_count}",
"-m", "scripts.base_train",
"--depth=20", f"--device_batch_size={args.device_batch_size}",
f"--wandb_run_name={args.wandb_run}",
f"--vertex_experiment={args.vertex_experiment}",
f"--vertex_tensorboard={args.vertex_tensorboard}"
], check=True)
# Evaluate the model on a larger chunk of train/val data and draw some samples.
print("Running base_loss evaluation...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.base_loss",
"--device_batch_size=8"
], check=True)
# Evaluate the model on CORE tasks.
print("Running base_eval...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.base_eval"
], check=True)
except subprocess.CalledProcessError as e:
print(f"Error during pretraining steps: {e}")
raise
# Upload checkpoints and report to GCS
print("Uploading artifacts to GCS...")
# Upload base_checkpoints
local_checkpoints_dir = os.path.join(local_base_dir, "base_checkpoints")
gcs_checkpoints_path = os.path.join(prefix, "base_checkpoints") if prefix else "base_checkpoints"
if os.path.exists(local_checkpoints_dir):
upload_directory_to_gcs(local_checkpoints_dir, bucket_name, gcs_checkpoints_path)
else:
print(f"Warning: {local_checkpoints_dir} does not exist.")
# Upload any remaining artifacts (e.g. the report output) by walking local_base_dir,
# skipping tokenizer, base_checkpoints (already uploaded above), and tokenized_data.
for root, dirs, files in os.walk(local_base_dir):
# Skip directories we don't want to re-upload or are empty
if "tokenizer" in dirs:
dirs.remove("tokenizer")
if "base_checkpoints" in dirs:
dirs.remove("base_checkpoints")
if "tokenized_data" in dirs:
dirs.remove("tokenized_data")
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_base_dir)
blob_path = os.path.join(prefix, relative_path) if prefix else relative_path
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
if __name__ == "__main__":
main()

View File

@ -1,18 +1,100 @@
import os
import sys
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Generate the full report.
subprocess.run(["python", "-m", "nanochat.report", "generate"], check=True)
# Check if report already exists (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_report_file = os.path.join(prefix, "report.md") if prefix else "report.md"
report_exists = bucket.blob(gcs_report_file).exists()
if report_exists:
print(f"✓ Report already exists at gs://{bucket_name}/{gcs_report_file}")
print("Skipping report generation (already completed)")
return
print(f"Report not found. Generating report...")
# Set local base dir
local_base_dir = "/tmp/nanochat"
os.environ["NANOCHAT_BASE_DIR"] = local_base_dir
os.makedirs(local_base_dir, exist_ok=True)
# Download report dir from GCS
print("Downloading report dir from GCS...")
gcs_report_path = os.path.join(prefix, "report") if prefix else "report"
local_report_dir = os.path.join(local_base_dir, "report")
download_directory_from_gcs(bucket_name, gcs_report_path, local_report_dir)
try:
# Generate the full report.
print("Generating report...")
subprocess.run([sys.executable, "-m", "nanochat.report", "generate"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error generating report: {e}")
raise
# Upload report.md to GCS
print("Uploading report to GCS...")
# report.py generates report.md in local_base_dir/report/report.md AND copies it to current dir.
# We want to upload it to the bucket root or prefix root.
local_report_file = "report.md"
if os.path.exists(local_report_file):
blob_path = os.path.join(prefix, "report.md") if prefix else "report.md"
bucket = storage.Client().bucket(bucket_name)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_report_file)
print(f"Uploaded {local_report_file} to gs://{bucket_name}/{blob_path}")
else:
print("Warning: report.md not found in current directory.")
# Also upload the report dir just in case
if os.path.exists(local_report_dir):
upload_directory_to_gcs(local_report_dir, bucket_name, gcs_report_path)
if __name__ == "__main__":
main()

84
vertex_pipelines/run_pipeline.sh Normal file → Executable file
View File

@ -1,39 +1,101 @@
#!/bin/bash
set -euo pipefail
# Check for optional flags
SKIP_BUILD=false
if [ "${1:-}" == "--skip-build" ]; then
SKIP_BUILD=true
shift
fi
# Check for required arguments
if [ "$#" -ne 1 ]; then
echo "Usage: $0 gs://YOUR_GCS_BUCKET"
if [ "$#" -lt 1 ] || [ "$#" -gt 10 ]; then
echo "Usage: $0 [--skip-build] gs://YOUR_GCS_BUCKET [WANDB_RUN_ID] [VERTEX_EXPERIMENT] [VERTEX_TENSORBOARD] [REGION] [ACCELERATOR_TYPE] [ACCELERATOR_COUNT] [NUM_DATA_SHARDS] [PREEMPTIBLE] [DEVICE_BATCH_SIZE]"
echo " REGION defaults to \$VERTEX_REGION env var or 'us-central1'"
echo " ACCELERATOR_TYPE defaults to 'NVIDIA_L4'"
echo " ACCELERATOR_COUNT defaults to 1"
echo " NUM_DATA_SHARDS defaults to 20 (number of HuggingFace data shards to download)"
echo " PREEMPTIBLE defaults to false"
echo " DEVICE_BATCH_SIZE defaults to 8"
exit 1
fi
if [[ ! "$1" =~ ^gs:// ]]; then
echo "Error: GCS bucket must be a valid gs:// path."
echo "Usage: $0 gs://YOUR_GCS_BUCKET"
echo "Usage: $0 [--skip-build] gs://YOUR_GCS_BUCKET [WANDB_RUN_ID] [VERTEX_EXPERIMENT] [VERTEX_TENSORBOARD] [REGION] [ACCELERATOR_TYPE] [ACCELERATOR_COUNT] [NUM_DATA_SHARDS] [PREEMPTIBLE] [DEVICE_BATCH_SIZE]"
exit 1
fi
GCS_BUCKET=$1
PIPELINE_ROOT="$GCS_BUCKET/pipeline-root"
GCP_PROJECT=$(gcloud config get-value project)
REGION="us-central1"
WANDB_RUN=${2:-"dummy"} # Default to "dummy" if not provided
VERTEX_EXPERIMENT=${3:-""}
VERTEX_TENSORBOARD=${4:-""}
REGION=${5:-${VERTEX_REGION:-us-central1}} # Use arg, then env var, then default
ACCELERATOR_TYPE=${6:-NVIDIA_L4}
ACCELERATOR_COUNT=${7:-1}
NUM_DATA_SHARDS=${8:-20}
PREEMPTIBLE=${9:-false}
DEVICE_BATCH_SIZE=${10:-8}
echo "Using GCP Project: $GCP_PROJECT"
echo "Using GCS Bucket: $GCS_BUCKET"
echo "Using Region: $REGION"
echo "Using Accelerator: $ACCELERATOR_TYPE"
echo "Using WANDB Run ID: $WANDB_RUN"
if [ -n "$VERTEX_EXPERIMENT" ]; then
echo "Using Vertex Experiment: $VERTEX_EXPERIMENT"
fi
if [ -n "$VERTEX_TENSORBOARD" ]; then
echo "Using Vertex TensorBoard: $VERTEX_TENSORBOARD"
fi
# Submit the build to Cloud Build and get the image URI with digest
echo "Submitting build to Cloud Build..."
IMAGE_URI=$(gcloud builds submit --config vertex_pipelines/cloudbuild.yaml --format="value(results.images[0].name)" . --project=$GCP_PROJECT)
echo "Cloud Build completed. Using image URI: $IMAGE_URI"
# Use a timestamp tag to avoid caching issues with 'latest'
if [ -z "${DOCKER_IMAGE_URI:-}" ]; then
TIMESTAMP=$(date +%Y%m%d%H%M%S)
IMAGE_URI="gcr.io/$GCP_PROJECT/nanochat:$TIMESTAMP"
else
TIMESTAMP="custom"
IMAGE_URI="$DOCKER_IMAGE_URI"
fi
if [ "$SKIP_BUILD" = false ]; then
echo "Submitting build to Cloud Build with tag $TIMESTAMP..."
gcloud builds submit --config vertex_pipelines/cloudbuild.yaml --substitutions=_IMAGE_NAME="$IMAGE_URI" . --project=$GCP_PROJECT
echo "Cloud Build completed."
else
echo "Skipping Cloud Build."
fi
echo "Using image URI: $IMAGE_URI"
# Run the Vertex AI pipeline
# Install dependencies for pipeline compilation
echo "Installing dependencies..."
if [ ! -d ".venv_pipeline" ]; then
python3 -m venv .venv_pipeline
fi
source .venv_pipeline/bin/activate
python3 -m pip install -r requirements.txt
echo "Running Vertex AI pipeline..."
python vertex_pipelines/pipeline.py \
export DOCKER_IMAGE_URI="$IMAGE_URI"
# Use the default compute service account for the project
SERVICE_ACCOUNT="247010501180-compute@developer.gserviceaccount.com"
python3 vertex_pipelines/pipeline.py \
--gcp-project "$GCP_PROJECT" \
--gcs-bucket "$GCS_BUCKET" \
--pipeline-root "$PIPELINE_ROOT" \
--docker-image-uri "$IMAGE_URI" \
--region "$REGION"
--region "$REGION" \
--wandb-run "$WANDB_RUN" \
--vertex-experiment "$VERTEX_EXPERIMENT" \
--vertex-tensorboard "$VERTEX_TENSORBOARD" \
--accelerator-type "$ACCELERATOR_TYPE" \
--accelerator-count "$ACCELERATOR_COUNT" \
--preemptible "$PREEMPTIBLE" \
--num-data-shards "$NUM_DATA_SHARDS" \
--service-account "$SERVICE_ACCOUNT" \
--device-batch-size "$DEVICE_BATCH_SIZE"
echo "Pipeline submitted."
echo "Pipeline submitted."

View File

@ -0,0 +1,34 @@
#!/bin/bash
set -e
PROJECT="nzp-nanochat"
NETWORK_NAME="nanochat-network"
echo "Setting up network resources for project $PROJECT..."
# 1. Create the VPC network (auto mode creates subnets in all regions)
if ! gcloud compute networks describe "$NETWORK_NAME" --project="$PROJECT" &>/dev/null; then
echo "Creating VPC network '$NETWORK_NAME'..."
gcloud compute networks create "$NETWORK_NAME" \
--project="$PROJECT" \
--subnet-mode=auto \
--bgp-routing-mode=global
echo "✅ Network created."
else
echo "✅ Network '$NETWORK_NAME' already exists."
fi
# 2. Create firewall rule to allow internal communication
if ! gcloud compute firewall-rules describe "${NETWORK_NAME}-allow-internal" --project="$PROJECT" &>/dev/null; then
echo "Creating firewall rule '${NETWORK_NAME}-allow-internal'..."
gcloud compute firewall-rules create "${NETWORK_NAME}-allow-internal" \
--project="$PROJECT" \
--network="$NETWORK_NAME" \
--allow=tcp,udp,icmp \
--source-ranges=10.128.0.0/9
echo "✅ Firewall rule created."
else
echo "✅ Firewall rule '${NETWORK_NAME}-allow-internal' already exists."
fi
echo "Network setup complete!"

View File

@ -0,0 +1,68 @@
#!/bin/bash
set -euo pipefail
# Usage: ./setup_resources.sh <PROJECT_ID> <REGION> <BUCKET_NAME> [EXPERIMENT_NAME] [TENSORBOARD_DISPLAY_NAME]
if [ "$#" -lt 3 ]; then
echo "Usage: $0 <PROJECT_ID> <REGION> <BUCKET_NAME> [EXPERIMENT_NAME] [TENSORBOARD_DISPLAY_NAME]"
exit 1
fi
PROJECT_ID=$1
REGION=$2
BUCKET_NAME=$3
EXPERIMENT_NAME=${4:-"nanochat-experiment"}
TENSORBOARD_DISPLAY_NAME=${5:-"nanochat-tensorboard"}
echo "Setting up resources in Project: $PROJECT_ID, Region: $REGION"
# 1. Create GCS Bucket
echo "Checking bucket gs://$BUCKET_NAME..."
if gcloud storage buckets describe "gs://$BUCKET_NAME" --project="$PROJECT_ID" &>/dev/null; then
echo "Bucket gs://$BUCKET_NAME already exists."
else
echo "Creating bucket gs://$BUCKET_NAME..."
gcloud storage buckets create "gs://$BUCKET_NAME" --project="$PROJECT_ID" --location="$REGION" --uniform-bucket-level-access
echo "Bucket created."
fi
# 2. Create Vertex AI TensorBoard
echo "Checking for existing TensorBoard with display name: $TENSORBOARD_DISPLAY_NAME..."
EXISTING_TB=$(gcloud ai tensorboards list --region="$REGION" --project="$PROJECT_ID" --filter="displayName=$TENSORBOARD_DISPLAY_NAME" --format="value(name)" 2>/dev/null || true)
if [ -n "$EXISTING_TB" ]; then
echo "TensorBoard '$TENSORBOARD_DISPLAY_NAME' already exists: $EXISTING_TB"
TENSORBOARD_ID=$EXISTING_TB
else
echo "Creating Vertex AI TensorBoard: $TENSORBOARD_DISPLAY_NAME..."
# Create and capture the output. The output usually contains the name.
# We use --format="value(name)" to get just the resource name.
TENSORBOARD_ID=$(gcloud ai tensorboards create --display-name="$TENSORBOARD_DISPLAY_NAME" --region="$REGION" --project="$PROJECT_ID" --format="value(name)")
echo "TensorBoard created: $TENSORBOARD_ID"
fi
# 3. Create Vertex AI Experiment
echo "Creating Vertex AI Experiment: $EXPERIMENT_NAME..."
# Experiments are often implicitly created, but we can explicitly create it.
# We check if it exists first to avoid errors.
if gcloud ai experiments list --region="$REGION" --project="$PROJECT_ID" --filter="name=$EXPERIMENT_NAME" --format="value(name)" 2>/dev/null | grep -q "$EXPERIMENT_NAME"; then
echo "Experiment '$EXPERIMENT_NAME' already exists."
else
# Try to create the experiment. 'gcloud ai experiments create' may fail if the experiment already exists
# but was missed by the list filter, or if the command syntax differs across gcloud versions; let it fail gracefully.
gcloud ai experiments create --experiment="$EXPERIMENT_NAME" --region="$REGION" --project="$PROJECT_ID" || echo "Experiment creation returned status $? (might already exist)."
echo "Experiment setup complete."
fi
echo "----------------------------------------------------------------"
echo "Setup Complete!"
echo "----------------------------------------------------------------"
echo "Use the following values for run_pipeline.sh:"
echo ""
echo "GCS_BUCKET: gs://$BUCKET_NAME"
echo "VERTEX_EXPERIMENT: $EXPERIMENT_NAME"
echo "VERTEX_TENSORBOARD: $TENSORBOARD_ID"
echo ""
echo "Example Command:"
echo "./vertex_pipelines/run_pipeline.sh gs://$BUCKET_NAME <WANDB_RUN> $EXPERIMENT_NAME $TENSORBOARD_ID"
echo "----------------------------------------------------------------"

View File

@ -1,30 +1,137 @@
import os
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def download_directory_from_gcs(bucket_name, gcs_path, local_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
blobs = bucket.list_blobs(prefix=gcs_path)
for blob in blobs:
if blob.name.endswith("/"):
continue
relative_path = os.path.relpath(blob.name, gcs_path)
local_file = os.path.join(local_path, relative_path)
os.makedirs(os.path.dirname(local_file), exist_ok=True)
blob.download_to_filename(local_file)
print(f"Downloaded gs://{bucket_name}/{blob.name} to {local_file}")
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)  # avoids leaking the file handle left open by upload_from_file(open(...))
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
parser.add_argument("--wandb-run", type=str, default="dummy", help="Wandb run name")
parser.add_argument("--vertex-experiment", type=str, default="", help="Vertex AI experiment name")
parser.add_argument("--vertex-tensorboard", type=str, default="", help="Vertex AI TensorBoard resource name")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Check if the SFT checkpoint already exists in GCS (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_sft_ckpt_path = os.path.join(prefix, "chatsft_checkpoints") if prefix else "chatsft_checkpoints"
# Check for model.pt (the key checkpoint file)
checkpoint_exists = bucket.blob(os.path.join(gcs_sft_ckpt_path, "model.pt")).exists()
if checkpoint_exists:
    print(f"✓ SFT checkpoint already exists in gs://{bucket_name}/{gcs_sft_ckpt_path}")
    print("Skipping SFT training (already completed)")
    return
print("SFT checkpoint not found. Running SFT training...")
# Set local tmp dir for temporary files
local_base_dir = "/tmp/nanochat"
os.makedirs(local_base_dir, exist_ok=True)
# Download tokenizer from GCS
print("Downloading tokenizer from GCS...")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
download_directory_from_gcs(bucket_name, gcs_tokenizer_path, local_tokenizer_dir)
# Download mid checkpoints from GCS
print("Downloading mid checkpoints from GCS...")
gcs_mid_checkpoints_path = os.path.join(prefix, "mid_checkpoints") if prefix else "mid_checkpoints"
local_mid_checkpoints_dir = os.path.join(local_base_dir, "mid_checkpoints")
download_directory_from_gcs(bucket_name, gcs_mid_checkpoints_path, local_mid_checkpoints_dir)
# Download report dir from GCS
print("Downloading report dir from GCS...")
gcs_report_path = os.path.join(prefix, "report") if prefix else "report"
local_report_dir = os.path.join(local_base_dir, "report")
download_directory_from_gcs(bucket_name, gcs_report_path, local_report_dir)
# Ensure report directory exists even if nothing was downloaded
os.makedirs(local_report_dir, exist_ok=True)
try:
# Download the identity conversations dataset.
print("Downloading identity conversations...")
subprocess.run([
"curl", "-L", "-o",
f"{local_base_dir}/identity_conversations.jsonl",
"https://karpathy-public.s3.us-west-2.amazonaws.com/identity_conversations.jsonl"
], check=True)
# Run supervised finetuning.
print("Starting SFT...")
env = os.environ.copy()
env["NANOCHAT_BASE_DIR"] = local_base_dir
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.chat_sft",
f"--wandb_run_name={args.wandb_run}",
f"--vertex_experiment={args.vertex_experiment}",
f"--vertex_tensorboard={args.vertex_tensorboard}"
], check=True, env=env)
# Evaluate the model.
print("Running chat_eval (sft)...")
subprocess.run([
"torchrun", "--standalone", "--nproc_per_node=1",
"-m", "scripts.chat_eval", "--",
"-i", "sft"
], check=True, env=env)
except subprocess.CalledProcessError as e:
print(f"Error during SFT steps: {e}")
raise
# Upload checkpoints to GCS
print("Uploading artifacts to GCS...")
# Upload chatsft_checkpoints
local_checkpoints_dir = os.path.join(local_base_dir, "chatsft_checkpoints")
gcs_checkpoints_path = os.path.join(prefix, "chatsft_checkpoints") if prefix else "chatsft_checkpoints"
if os.path.exists(local_checkpoints_dir):
upload_directory_to_gcs(local_checkpoints_dir, bucket_name, gcs_checkpoints_path)
else:
print(f"Warning: {local_checkpoints_dir} does not exist.")
# Upload report dir
if os.path.exists(local_report_dir):
upload_directory_to_gcs(local_report_dir, bucket_name, gcs_report_path)
if __name__ == "__main__":
main()
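
The checkpoint detection above keys on a single model.pt blob. A more general sketch (an illustration, not part of the pipeline code) would treat any object under the stage's prefix as evidence that it already completed, using a single listing call:

    from google.cloud import storage

    def gcs_prefix_exists(bucket_name: str, prefix: str) -> bool:
        # True if at least one object exists under gs://<bucket_name>/<prefix>.
        blobs = storage.Client().list_blobs(bucket_name, prefix=prefix, max_results=1)
        return any(True for _ in blobs)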

View File

@ -0,0 +1,24 @@
FROM python:3.10-slim
WORKDIR /app
RUN apt-get update && apt-get install -y curl build-essential
RUN curl -LsSf https://astral.sh/uv/install.sh | sh
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
ENV PATH="/root/.local/bin:/root/.cargo/bin:/app/.venv/bin:${PATH}"
COPY . .
RUN uv venv
RUN uv sync --extra gpu
RUN uv pip install maturin
RUN maturin develop --release --manifest-path rustbpe/Cargo.toml
# Install gcloud
RUN apt-get install -y apt-transport-https ca-certificates gnupg
RUN echo "deb [signed-by=/usr/share/keyrings/cloud.google.gpg] https://packages.cloud.google.com/apt cloud-sdk main" | tee -a /etc/apt/sources.list.d/google-cloud-sdk.list
RUN curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key --keyring /usr/share/keyrings/cloud.google.gpg add -
RUN apt-get update && apt-get install -y google-cloud-sdk
ENTRYPOINT ["bash"]

View File

@ -1,25 +1,99 @@
import os
import sys
import subprocess
import argparse
from nanochat.common import get_base_dir
import shutil
from google.cloud import storage
def upload_directory_to_gcs(local_path, bucket_name, gcs_path):
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
for root, _, files in os.walk(local_path):
for file in files:
local_file = os.path.join(root, file)
relative_path = os.path.relpath(local_file, local_path)
blob_path = os.path.join(gcs_path, relative_path)
blob = bucket.blob(blob_path)
blob.upload_from_filename(local_file)  # avoids leaking the file handle left open by upload_from_file(open(...))
print(f"Uploaded {local_file} to gs://{bucket_name}/{blob_path}")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--gcs-bucket", type=str, required=True, help="GCS bucket for artifacts")
args = parser.parse_args()
# Set the base directory to the GCS bucket.
os.environ["NANOCHAT_BASE_DIR"] = args.gcs_bucket
# Parse bucket name and prefix from args.gcs_bucket
if args.gcs_bucket.startswith("gs://"):
bucket_name = args.gcs_bucket.replace("gs://", "").split("/")[0]
# Handle cases where there might be a prefix
prefix_parts = args.gcs_bucket.replace("gs://", "").split("/")[1:]
prefix = "/".join(prefix_parts) if prefix_parts else ""
else:
bucket_name = args.gcs_bucket
prefix = ""
# Check if tokenizer artifacts already exist (checkpoint detection)
storage_client = storage.Client()
bucket = storage_client.bucket(bucket_name)
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
# Check for key tokenizer files
tokenizer_files = ["model.json", "vocab.json", "merges.txt"]
all_exist = all(bucket.blob(os.path.join(gcs_tokenizer_path, f)).exists() for f in tokenizer_files)
if all_exist:
print(f"✓ Tokenizer artifacts already exist in gs://{bucket_name}/{gcs_tokenizer_path}")
print("Skipping tokenizer training (already completed)")
return
print("Tokenizer artifacts not found. Running tokenizer training...")
# Set the base directory to a local temporary directory.
# We cannot use GCS directly because the tokenizer training script (Rust) expects local files.
local_base_dir = "/tmp/nanochat"
os.environ["NANOCHAT_BASE_DIR"] = local_base_dir
os.makedirs(local_base_dir, exist_ok=True)
try:
# Download the dataset.
# nanochat.dataset supports GCS, so NANOCHAT_DATA_DIR could point at a gs:// path,
# but for simplicity we download from HF to the local temp dir here.
print("Downloading dataset (n=8)...")
subprocess.run([sys.executable, "-m", "nanochat.dataset", "-n", "8"], check=True)
print("Downloading dataset (n=240)...")
subprocess.run([sys.executable, "-m", "nanochat.dataset", "-n", "240"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error downloading dataset: {e}")
raise
try:
# Train the tokenizer.
print("Training tokenizer...")
subprocess.run([sys.executable, "scripts/tok_train.py", "--max_chars=2000000000"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error training tokenizer: {e}")
raise
try:
# Evaluate the tokenizer.
print("Evaluating tokenizer...")
subprocess.run([sys.executable, "scripts/tok_eval.py"], check=True)
except subprocess.CalledProcessError as e:
print(f"Error evaluating tokenizer: {e}")
raise
# Upload artifacts to GCS
print("Uploading artifacts to GCS...")
# Upload tokenizer
local_tokenizer_dir = os.path.join(local_base_dir, "tokenizer")
gcs_tokenizer_path = os.path.join(prefix, "tokenizer") if prefix else "tokenizer"
upload_directory_to_gcs(local_tokenizer_dir, bucket_name, gcs_tokenizer_path)
# We don't upload the raw data here; tok_train may also produce token_bytes.pt, which lives in the
# tokenizer dir and is therefore covered by the upload above.
if __name__ == "__main__":
main()
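
Uploading many small artifact files one blob at a time can be slow from inside the container. A sketch of a parallel alternative, assuming a recent google-cloud-storage release that ships storage.transfer_manager (the helper itself is hypothetical):

    import os
    from google.cloud import storage
    from google.cloud.storage import transfer_manager

    def upload_directory_parallel(local_path, bucket_name, gcs_path, max_workers=8):
        bucket = storage.Client().bucket(bucket_name)
        # transfer_manager expects file names relative to source_directory.
        filenames = [
            os.path.relpath(os.path.join(root, f), local_path)
            for root, _, files in os.walk(local_path)
            for f in files
        ]
        results = transfer_manager.upload_many_from_filenames(
            bucket,
            filenames,
            source_directory=local_path,
            blob_name_prefix=gcs_path.rstrip("/") + "/",
            max_workers=max_workers,
        )
        # With the default raise_exception=False, failures come back as Exception objects.
        for name, result in zip(filenames, results):
            if isinstance(result, Exception):
                print(f"Failed to upload {name}: {result}")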