From 34b86447ee5b3a680cc0efe82ab3d84ce9ce3497 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Tue, 23 Jun 2026 05:53:32 +0000 Subject: [PATCH 01/13] Ansible changes: Integration of Failure-Triage-agent --- .../htcondor-integration-test.yml | 6 + .../multigroup-integration-test.yml | 3 + .../tasks/rescue_gcluster_failure.yml | 3 + .../tasks/trigger_failure_triage_agent.yml | 110 ++++++++++++++++++ .../daily-tests/builds/ansible-vm.yaml | 3 +- .../daily-tests/builds/batch-mpi.yaml | 3 +- .../cloud-build/daily-tests/builds/batch.yaml | 3 +- .../builds/chrome-remote-desktop-ubuntu.yaml | 3 +- .../builds/chrome-remote-desktop.yaml | 3 +- .../builds/gke-a2-highgpu-kueue-onspot.yaml | 4 +- .../builds/gke-a3-highgpu-onspot.yaml | 4 +- .../daily-tests/builds/gke-a3-highgpu.yaml | 3 +- .../builds/gke-a3-megagpu-onspot.yaml | 4 +- .../daily-tests/builds/gke-a3-megagpu.yaml | 3 +- .../builds/gke-a3-ultragpu-onspot.yaml | 4 +- .../daily-tests/builds/gke-a3-ultragpu.yaml | 3 +- .../daily-tests/builds/gke-a4-onspot.yaml | 4 +- .../daily-tests/builds/gke-a4x.yaml | 3 +- .../daily-tests/builds/gke-g4-onspot.yaml | 4 +- .../daily-tests/builds/gke-g4.yaml | 3 +- .../daily-tests/builds/gke-h4d-onspot.yaml | 4 +- .../daily-tests/builds/gke-h4d.yaml | 3 +- .../builds/gke-inactive-reservation.yaml | 3 +- .../builds/gke-managed-hyperdisk.yaml | 3 +- .../builds/gke-managed-lustre.yaml | 3 +- .../daily-tests/builds/gke-storage.yaml | 3 +- .../daily-tests/builds/gke-tpu-7x.yaml | 4 +- .../daily-tests/builds/gke-tpu-v6e-flex.yaml | 4 +- .../daily-tests/builds/gke-tpu-v6e.yaml | 4 +- tools/cloud-build/daily-tests/builds/gke.yaml | 3 +- .../daily-tests/builds/h4d-vm.yaml | 4 +- .../cloud-build/daily-tests/builds/hcls.yaml | 3 +- .../builds/hpc-build-slurm-image.yaml | 3 +- .../builds/hpc-enterprise-slurm.yaml | 3 +- .../daily-tests/builds/htc-slurm.yaml | 3 +- .../daily-tests/builds/htcondor.yaml | 3 +- .../builds/ml-a3-highgpu-onspot-slurm.yaml | 4 +- .../builds/ml-a3-highgpu-slurm.yaml | 
3 +- .../ml-a3-megagpu-onspot-slurm-ubuntu.yaml | 4 +- .../builds/ml-a3-megagpu-slurm-ubuntu.yaml | 3 +- ...3-ultragpu-custom-2404-blueprint-test.yaml | 4 +- .../ml-a3-ultragpu-custom-blueprint-test.yaml | 4 +- .../builds/ml-a3-ultragpu-jbvms.yaml | 3 +- .../builds/ml-a3-ultragpu-onspot-jbvms.yaml | 4 +- .../builds/ml-a3-ultragpu-onspot-slurm.yaml | 4 +- .../builds/ml-a3-ultragpu-slurm.yaml | 3 +- ...a4-highgpu-custom-2404-blueprint-test.yaml | 4 +- .../ml-a4-highgpu-custom-blueprint-test.yaml | 4 +- .../builds/ml-a4-highgpu-onspot-slurm.yaml | 4 +- .../ml-a4x-highgpu-custom-blueprint-test.yaml | 3 +- .../builds/ml-a4x-highgpu-slurm.yaml | 3 +- .../builds/ml-g4-onspot-slurm.yaml | 4 +- .../daily-tests/builds/ml-gke-e2e.yaml | 3 +- .../daily-tests/builds/ml-gke.yaml | 3 +- .../builds/ml-h4d-onspot-slurm.yaml | 4 +- .../daily-tests/builds/ml-slurm.yaml | 3 +- .../daily-tests/builds/monitoring.yaml | 3 +- .../daily-tests/builds/netapp-volumes.yaml | 3 +- .../daily-tests/builds/ofe-deployment.yaml | 3 +- .../daily-tests/builds/packer.yaml | 3 +- .../builds/pfs-managed-lustre-slurm.yaml | 3 +- .../builds/pfs-managed-lustre-vm.yaml | 3 +- .../cloud-build/daily-tests/builds/slinky.yml | 3 +- .../builds/slurm-gcp-v6-debian.yaml | 3 +- .../builds/slurm-gcp-v6-rocky8.yaml | 3 +- .../daily-tests/builds/slurm-gcp-v6-ssd.yaml | 3 +- .../builds/slurm-gcp-v6-startup-scripts.yaml | 3 +- .../daily-tests/builds/slurm-gcp-v6-tpu.yaml | 4 +- .../builds/slurm-gcp-v6-ubuntu.yaml | 3 +- .../daily-tests/builds/slurm-gke.yaml | 3 +- .../builds/slurm-rapid-storage.yaml | 3 +- .../daily-tests/builds/spack-gromacs.yaml | 3 +- 72 files changed, 258 insertions(+), 91 deletions(-) create mode 100644 tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml index 4a71e601d5..79c16e4855 100644 --- 
a/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml @@ -119,6 +119,9 @@ - firewall-rules - delete - "{{ deployment_name }}" + - name: Include Failure Triage Agent trigger tasks + ansible.builtin.include_tasks: tasks/trigger_failure_triage_agent.yml + - name: Destroy deployment register: gcluster_destroy changed_when: gcluster_destroy.changed @@ -179,6 +182,9 @@ - firewall-rules - delete - "{{ deployment_name }}" + - name: Include Failure Triage Agent trigger tasks + ansible.builtin.include_tasks: tasks/trigger_failure_triage_agent.yml + - name: Destroy deployment delegate_to: localhost register: gcluster_destroy diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml index 30923c1fd1..1d7e9fd7a1 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml @@ -34,6 +34,9 @@ environment: TF_IN_AUTOMATION: "TRUE" always: + - name: Include Failure Triage Agent trigger tasks + ansible.builtin.include_tasks: tasks/trigger_failure_triage_agent.yml + - name: Destroy deployment register: gcluster_destroy changed_when: gcluster_destroy.changed diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml index a68fef2811..983f0b7479 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml @@ -18,6 +18,9 @@ - deployment_name is defined - workspace is defined +- name: Include Failure Triage Agent trigger tasks + ansible.builtin.include_tasks: tasks/trigger_failure_triage_agent.yml + - name: 
Delete Firewall Rule register: fw_deleted changed_when: fw_deleted.rc == 0
diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml
new file mode 100644
index 0000000000..543f2591c6
--- /dev/null
+++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml
@@ -0,0 +1,130 @@
+# Copyright 2026 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Best-effort hook: asks the Failure Triage Agent to analyze a build, then
+# prints its report. The playbooks include this file ahead of their cleanup
+# tasks, so every step that can fail sets ignore_errors -- a triage problem
+# must never keep the caller from tearing the deployment down.
+#
+# Optional input:
+#   full_build_id: build ID sent to the agent; nothing is triggered when it
+#     is empty or undefined.
+---
+- name: Set Triage Agent Configuration
+  ansible.builtin.set_fact:
+    triage_gcs_bucket: "g-ift-agent-bucket"
+    triage_project_id: "508417052821"
+    triage_invoker_sa: "triage-invoker@hpc-toolkit-dev.iam.gserviceaccount.com"
+    triage_cloud_run_url: "https://failure-triage-agent-508417052821.us-central1.run.app"
+
+# Exits 0 on every path; stdout starts with PROCEED only when a build ID was
+# supplied and the ENABLE_TRIAGE_AGENT object can be listed in the bucket.
+- name: Check Triage Agent Prerequisites
+  delegate_to: localhost
+  changed_when: false
+  args:
+    executable: /bin/bash
+  ansible.builtin.shell: |
+    BUILD_ID="{{ full_build_id | default('') }}"
+
+    if [ -z "$BUILD_ID" ]; then
+      echo "SKIPPED: No full_build_id provided. Append full_build_id=... to extra-vars to trigger."
+      exit 0
+    fi
+
+    if ! gcloud storage ls gs://{{ triage_gcs_bucket }}/ENABLE_TRIAGE_AGENT >/dev/null 2>&1; then
+      echo "SKIPPED: Failure Triage Agent is currently disabled (no ENABLE_TRIAGE_AGENT flag found in GCS)."
+      exit 0
+    fi
+
+    echo "PROCEED: Agent is enabled and build ID is present."
+  register: triage_init
+  ignore_errors: true
+
+- name: Execute Triage Agent Pipeline
+  when: "'PROCEED' in (triage_init.stdout | default(''))"
+  vars:
+    triage_build_id: "{{ full_build_id | default('') }}"
+  block:
+    - name: Trigger Failure Triage Agent
+      delegate_to: localhost
+      changed_when: false
+      args:
+        executable: /bin/bash
+      ansible.builtin.shell: |
+        TOKEN=$(gcloud auth print-identity-token --impersonate-service-account="{{ triage_invoker_sa }}" --audiences="{{ triage_cloud_run_url }}" 2>/dev/null)
+        if [ -z "$TOKEN" ]; then
+          echo "Failed to get identity token." >&2
+          exit 1
+        fi
+
+        # --max-time bounds the request so a hung endpoint cannot stall the cleanup that follows.
+        HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" --max-time 120 -X POST "{{ triage_cloud_run_url }}/trigger" \
+          -H "Authorization: Bearer $TOKEN" \
+          -H "Content-Type: application/json" \
+          -d '{"build_id": "{{ triage_build_id }}", "project_id": "{{ triage_project_id }}"}')
+
+        if [ "$HTTP_STATUS" != "200" ] && [ "$HTTP_STATUS" != "202" ]; then
+          echo "Failed to trigger agent. HTTP Status: $HTTP_STATUS" >&2
+          exit 1
+        fi
+      register: triage_trigger
+      ignore_errors: true
+
+    # Skipped when the trigger request failed: there is no analysis to poll for.
+    - name: Wait for Analysis to Complete
+      when: triage_trigger is succeeded
+      delegate_to: localhost
+      changed_when: false
+      args:
+        executable: /bin/bash
+      ansible.builtin.shell: |
+        sleep 300
+        for i in {1..30}; do
+          STATE_JSON=$(gcloud storage cat "gs://{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json" 2>/dev/null || echo '{}')
+          STATUS=$(echo "$STATE_JSON" | python3 -c "import sys, json; print(json.load(sys.stdin).get('status', ''))" 2>/dev/null)
+          if [ "$STATUS" = "completed" ] || [ "$STATUS" = "failed" ]; then
+            if [ "$STATUS" = "completed" ]; then
+              sleep 300
+              gcloud storage cat "gs://{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json" 2>/dev/null || echo '{}'
+            else
+              echo "$STATE_JSON"
+            fi
+            exit 0
+          fi
+          sleep 60
+        done
+        exit 1
+      register: agent_state
+      ignore_errors: true
+
+    - name: Print Triage Report
+      delegate_to: localhost
+      ignore_errors: true
+      ansible.builtin.debug:
+        msg: |
+          {% if triage_trigger is failed %}
+          Failure Triage Agent could not be triggered; see the output of the trigger task above.
+          {% elif agent_state is failed or (agent_state.stdout | default('{}', true) | from_json).status | default('') != 'completed' %}
+          Failure Triage Agent testing did not complete in time or failed internally.
+
+          State File Link:
+          https://console.cloud.google.com/storage/browser/_details/{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json
+          {% else %}
+          TRIAGE EXECUTIVE SUMMARY:
+          {{ (agent_state.stdout | default('{}', true) | from_json).executive_summary | default('No summary available.') }}
+
+          Full forensic report available at:
+          https://storage.cloud.google.com/{{ triage_gcs_bucket }}/{{ triage_build_id }}/report.txt
+          {% endif %}
diff --git a/tools/cloud-build/daily-tests/builds/ansible-vm.yaml b/tools/cloud-build/daily-tests/builds/ansible-vm.yaml index 64b9f40513..6b7b0b78ed 100644 --- a/tools/cloud-build/daily-tests/builds/ansible-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/ansible-vm.yaml @@ -34,6 +34,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -48,7 +49,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ansible-vm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml index b60844fa12..d65223ea6f 100644 --- a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml +++ b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml @@ -47,6 +47,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" -
"ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" secretEnv: ['SPACK_CACHE_WRF', 'GCLUSTER_GCS_PATH'] @@ -72,5 +73,5 @@ steps: echo ' timeout: 10800' >> $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/batch-mpi.yml" diff --git a/tools/cloud-build/daily-tests/builds/batch.yaml b/tools/cloud-build/daily-tests/builds/batch.yaml index 81abc0a9a5..151f90689e 100644 --- a/tools/cloud-build/daily-tests/builds/batch.yaml +++ b/tools/cloud-build/daily-tests/builds/batch.yaml @@ -35,6 +35,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -50,7 +51,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/batch.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml index 7cc95f1e14..d8a7faf705 100644 --- a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml @@ -33,6 +33,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - 
"BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -47,7 +48,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} os=ubuntu" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID os=ubuntu" \ --extra-vars="@tools/cloud-build/daily-tests/tests/chrome-remote-desktop.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml index be85523044..45e362f453 100644 --- a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml +++ b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml @@ -34,6 +34,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -48,7 +49,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} os=default" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID os=default" \ --extra-vars="@tools/cloud-build/daily-tests/tests/chrome-remote-desktop.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml index 6ea0b83eb1..8e00273079 100644 --- 
a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" @@ -45,7 +46,6 @@ steps: - "PROVISIONING_MODEL=SPOT" - "MACHINE_TYPE=a2-highgpu-2g" - "INSTANCE_PREFIX=a2hspgke" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a2hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -109,7 +109,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml index 6f9c8f63ad..54936ded7f 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-highgpu-8g" - "INSTANCE_PREFIX=a3hspgke" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -98,7 +98,7 @@ steps: sed -i '/^ reservation:/d' $${EXAMPLE_BP} ansible-playbook 
tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml index d97fdea39a..740969e02e 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -69,7 +70,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml index 347321ebe9..123d461efd 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - 
"ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=a3-megagpu-8g" - "NUM_NODES=4" - "INSTANCE_PREFIX=a3mspgke" - "PROJECT_ID=$PROJECT_ID" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3moptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -97,7 +97,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index 52337df9c0..77b93203d8 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -69,7 +70,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml index ddc653aa78..86b4b70e63 100644 --- 
a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml @@ -39,13 +39,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3uspgke" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -100,7 +100,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} chs_repo=$${CHS_REPO}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml index 42fb9d5615..6b2913a86c 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml @@ -39,6 +39,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -72,7 +73,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + 
--user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml index 5330c73a03..99df32fff7 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml @@ -39,13 +39,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4spgke" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -101,7 +101,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} chs_repo=$${CHS_REPO}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-a4x.yaml b/tools/cloud-build/daily-tests/builds/gke-a4x.yaml index 578a39d18e..7bd570565d 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a4x.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a4x.yaml @@ -40,6 +40,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ 
-78,7 +79,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a4x.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml index 35aa35ad73..3d91e60821 100644 --- a/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "MACHINE_TYPE=g4-standard-48" - "INSTANCE_PREFIX=g4spgke" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/g4options.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -91,7 +91,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX}" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-g4.yaml b/tools/cloud-build/daily-tests/builds/gke-g4.yaml index f76b9d4a85..57cbef85df 100644 --- a/tools/cloud-build/daily-tests/builds/gke-g4.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-g4.yaml @@ -36,6 
+36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -67,7 +68,7 @@ steps: echo ' outputs: [instructions]' >> $${EXAMPLE_BP} bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-g4.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml index 44a8634ab9..c311e149f1 100644 --- a/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml @@ -39,13 +39,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "MACHINE_TYPE=h4d-highmem-192-lssd" - "INSTANCE_PREFIX=h4dspgke" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/h4doptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -94,7 +94,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ 
--extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-h4d.yaml b/tools/cloud-build/daily-tests/builds/gke-h4d.yaml index 44e822c9c2..76da954c1c 100644 --- a/tools/cloud-build/daily-tests/builds/gke-h4d.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-h4d.yaml @@ -39,6 +39,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -75,7 +76,7 @@ steps: python3 tools/fix_vpc_name.py $${EXAMPLE_BP} "${_TEST_PREFIX}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-h4d.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml b/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml index b444fdf39e..6fcc2e5cea 100644 --- a/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -72,7 +73,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} 
build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-inactive-reservation.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml b/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml index 9894c5875c..5b353d3206 100644 --- a/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml @@ -38,6 +38,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -61,7 +62,7 @@ steps: sed -i "s//$${IP}/" $${SG_EXAMPLE} bash tools/add_ttl_label.sh "$${SG_EXAMPLE}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml b/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml index fc27f1c7dd..034f848a4d 100644 --- a/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml @@ -39,6 +39,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -63,7 +64,7 @@ steps: sed -i 
"s//$${IP}/" $${EXAMPLE_BP} bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-lustre.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 75ea16efc0..2fc273cbee 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -41,6 +41,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -64,7 +65,7 @@ steps: python3 tools/fix_vpc_name.py $${SG_EXAMPLE} "${_TEST_PREFIX}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml index 3b30ccc196..5753bfef8c 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml @@ -37,13 +37,13 @@ steps: name: 
us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=tpu7x-standard-4t" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "INSTANCE_PREFIX=tpu7xsp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/tpu7xoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -85,7 +85,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} chs_repo=$${CHS_REPO}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=us-central1 zone=us-central1-c" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml index f13491e4cb..a2dfa12a42 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml @@ -39,10 +39,10 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - - "BUILD_ID=$BUILD_ID" args: - -c - | @@ -72,7 +72,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" # Run the test ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ 
--extra-vars="@tools/cloud-build/daily-tests/tests/gke-tpu-v6e-flex.yml" secretEnv: ['GCLUSTER_GCS_PATH'] diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml index e47744c58b..7302d2d219 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=tpu" @@ -44,7 +45,6 @@ steps: - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "INSTANCE_PREFIX=v6esp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/tpuv6eoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -100,7 +100,7 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} chs_repo=$${CHS_REPO}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${GKE_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index 1a9bab23de..04273d6d5a 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -63,7 +64,7 @@ steps: bash tools/add_ttl_label.sh "$${SG_EXAMPLE}" 
ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/gke.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml index 88c00e8f03..eecd78084a 100644 --- a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml @@ -35,13 +35,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=h4d-highmem-192-lssd" - "INSTANCE_PREFIX=h4dsp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/h4doptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -80,7 +80,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${H4D_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/hcls.yaml b/tools/cloud-build/daily-tests/builds/hcls.yaml index 2e0b849d28..4b9eae2ae7 100644 --- a/tools/cloud-build/daily-tests/builds/hcls.yaml +++ b/tools/cloud-build/daily-tests/builds/hcls.yaml @@ -49,6 +49,7 @@ steps: name: 
us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -64,7 +65,7 @@ steps: python3 tools/fix_vpc_name.py $${BLUEPRINT} "${_TEST_PREFIX}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/hcls.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml index 7420957fa4..07da3641c2 100644 --- a/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml +++ b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -49,7 +50,7 @@ steps: BLUEPRINT="community/examples/hpc-build-slurm-image.yaml" bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git 
a/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml b/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml index 8cc60dc284..dff2520b47 100644 --- a/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml @@ -40,6 +40,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -54,7 +55,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/htc-slurm.yaml b/tools/cloud-build/daily-tests/builds/htc-slurm.yaml index f42d1a41cd..e80ec4dd07 100644 --- a/tools/cloud-build/daily-tests/builds/htc-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/htc-slurm.yaml @@ -38,6 +38,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -52,7 +53,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ 
--extra-vars="@tools/cloud-build/daily-tests/tests/htc-slurm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/htcondor.yaml b/tools/cloud-build/daily-tests/builds/htcondor.yaml index 2a16b60880..18fea66645 100644 --- a/tools/cloud-build/daily-tests/builds/htcondor.yaml +++ b/tools/cloud-build/daily-tests/builds/htcondor.yaml @@ -41,6 +41,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -55,7 +56,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" --extra-vars="@tools/cloud-build/daily-tests/tests/htcondor.yml" + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" --extra-vars="@tools/cloud-build/daily-tests/tests/htcondor.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: secretManager: diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml index 5a33a9ae54..b754ff82eb 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml @@ -40,13 +40,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=a3-highgpu-8g" - "NUM_NODES=4" - "INSTANCE_PREFIX=a3hsp" - "PROJECT_ID=$PROJECT_ID" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3hoptions.txt" - 
"ENABLE_SPOT_FALLBACK=true" args: @@ -83,7 +83,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} "\ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID "\ --extra-vars="region=$${REGION} zone=$${ZONE}"\ --extra-vars="enable_spot=$${ENABLE_SPOT} "\ --extra-vars="tcpx_kernel_login=$${TCPX_KERNEL_LOGIN} tcpx_kernel_password=$${TCPX_KERNEL_PASSWORD} keyserver_ubuntu_key=$${KEYSERVER_UBUNTU_KEY} "\ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml index da077ec4c8..9a75f29500 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml @@ -40,6 +40,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -59,7 +60,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} "\ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID "\ --extra-vars="region=$${REGION} zone=$${ZONE} "\ --extra-vars="tcpx_kernel_login=$${TCPX_KERNEL_LOGIN} tcpx_kernel_password=$${TCPX_KERNEL_PASSWORD} keyserver_ubuntu_key=$${KEYSERVER_UBUNTU_KEY} "\ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm.yml" diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml index 645ff085b7..d0c084226e 100644 
--- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml @@ -41,6 +41,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" # General Ansible configuration - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" @@ -48,7 +49,6 @@ steps: - "NUM_NODES=4" - "INSTANCE_PREFIX=a3msp" - "PROJECT_ID=$PROJECT_ID" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3moptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -87,7 +87,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${SLURM_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml index 17816847ef..134090cc99 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml @@ -42,6 +42,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -64,7 +65,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - 
--extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-ubuntu.yml" secretEnv: ['GCLUSTER_GCS_PATH'] diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml index a4126fe5c2..b8eb64c1b1 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -78,7 +78,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml index a87de19fc9..87f4a3f560 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml +++ 
b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -78,7 +78,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml index 36d81dcd0d..5498eb9ef2 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -56,7 +57,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} 
zone=$${ZONE}" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-jbvms.yml" secretEnv: ['GCLUSTER_GCS_PATH'] diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml index c5234076a2..76f5feb863 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp-jbvms" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -81,7 +81,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${JBVM_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml index 54489a2150..39152d9fe6 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml @@ -42,13 +42,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - 
"ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -89,7 +89,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} chs_repo=$${CHS_REPO}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${SLURM_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml index 86dfc77421..661726a114 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml @@ -42,6 +42,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -62,7 +63,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-slurm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] diff --git 
a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml index cb31f60305..96b11bc057 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4hsp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -77,7 +77,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml index 25c453a605..be2fc40802 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - 
"MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4hsp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -77,7 +77,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml index 98b27773af..4c77e5c711 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml @@ -41,13 +41,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4hsp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -87,7 +87,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} chs_repo=$${CHS_REPO}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${SLURM_VARS_FILE}" diff 
--git a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml index 55803611b8..59622b4636 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml @@ -38,6 +38,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -60,7 +61,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX}" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} "\ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID "\ --extra-vars="region=$${REGION} zone=$${ZONE} "\ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ --extra-vars="instance_image_family=$${CUSTOM_IMAGE_FAMILY}" \ diff --git a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml index 87af556b40..656fb44ead 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml @@ -41,6 +41,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -66,7 +67,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} 
build=$${BUILD_ID_SHORT} "\ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID "\ --extra-vars="region=$${REGION} zone=$${ZONE} "\ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a4x-highgpu-slurm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] diff --git a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml index bcdc7de6bf..ec2688d7b0 100644 --- a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml @@ -38,13 +38,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "MACHINE_TYPE=g4-standard-48" - "INSTANCE_PREFIX=g4osp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/g4options.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -80,7 +80,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${SLURM_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml index 38bb944772..132b86d405 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - 
"ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -65,7 +66,7 @@ steps: sed -i "s//$${IP}/" $${SG_EXAMPLE} bash tools/add_ttl_label.sh $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index 6884f8cebf..a11d8c407d 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -66,7 +67,7 @@ steps: bash tools/add_ttl_label.sh $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml index c5481dd7c6..7d131f27d5 100644 --- a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml @@ -36,6 +36,7 @@ steps: name: 
us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" # General Ansible configuration - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" @@ -43,7 +44,6 @@ steps: - "NUM_NODES=4" - "INSTANCE_PREFIX=h4dspon" - "PROJECT_ID=$PROJECT_ID" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/h4doptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: @@ -79,7 +79,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 \ - --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="@$${SLURM_VARS_FILE}" diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml index 26ac07632e..9c564634c2 100644 --- a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml @@ -41,6 +41,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -57,7 +58,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ml-slurm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] 
availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/monitoring.yaml b/tools/cloud-build/daily-tests/builds/monitoring.yaml index 833787b734..9aee9a521a 100644 --- a/tools/cloud-build/daily-tests/builds/monitoring.yaml +++ b/tools/cloud-build/daily-tests/builds/monitoring.yaml @@ -38,6 +38,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -52,7 +53,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/monitoring.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml b/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml index 39008f3576..b4212b1003 100644 --- a/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml +++ b/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -50,7 +51,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ 
--extra-vars="@tools/cloud-build/daily-tests/tests/netapp-volumes.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml b/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml index 417c88d91d..d384c0c327 100644 --- a/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml +++ b/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml @@ -29,6 +29,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -43,7 +44,7 @@ steps: git init . # ofe deploymemt requires some git repo to figure out top level directory ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/ofe-deployment-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/ofe-deployment.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/packer.yaml b/tools/cloud-build/daily-tests/builds/packer.yaml index 8d58fed215..9fe1075e63 100644 --- a/tools/cloud-build/daily-tests/builds/packer.yaml +++ b/tools/cloud-build/daily-tests/builds/packer.yaml @@ -40,6 +40,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -54,7 +55,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + 
--user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/packer.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml index 7615564857..6da54a8cbf 100644 --- a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml @@ -37,6 +37,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -50,7 +51,7 @@ steps: BLUEPRINT="examples/pfs-managed-lustre-slurm.yaml" bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/pfs-managed-lustre-slurm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml index 01ddc932ed..73c24e4b76 100644 --- a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml @@ -34,6 +34,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -47,7 +48,7 @@ steps: 
BLUEPRINT="examples/pfs-managed-lustre-vm.yaml" bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/pfs-managed-lustre-vm.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slinky.yml b/tools/cloud-build/daily-tests/builds/slinky.yml index bdcc5a7136..0e1069aaaa 100644 --- a/tools/cloud-build/daily-tests/builds/slinky.yml +++ b/tools/cloud-build/daily-tests/builds/slinky.yml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -59,7 +60,7 @@ steps: echo ' add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slinky.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml index eecb13fecd..7d7bd24c4d 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml @@ -37,6 +37,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - 
"BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -51,7 +52,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml index b0f8ffccf5..e74acdf5d3 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml @@ -37,6 +37,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -53,7 +54,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml index 70f5621cec..e538ff356a 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml +++ 
b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml @@ -38,6 +38,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -52,7 +53,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-ssd.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml index 1e5d81daca..4d3efa4cb0 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml @@ -38,6 +38,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -52,7 +53,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-startup-scripts.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml 
b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml index 39d1f76a2f..06600857ec 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml @@ -36,6 +36,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=tpu" @@ -44,7 +45,6 @@ steps: - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "INSTANCE_PREFIX=v3sp" - - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/tpuv3options.txt" args: - -c @@ -67,7 +67,7 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 \ - --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml" secretEnv: ['GCLUSTER_GCS_PATH'] diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml index c9f0428f0a..76594b0106 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml @@ -37,6 +37,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -51,7 +52,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 
--extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-ubuntu.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slurm-gke.yaml b/tools/cloud-build/daily-tests/builds/slurm-gke.yaml index 6a3692af89..d5a4c9d9fb 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gke.yaml @@ -44,6 +44,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -58,7 +59,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook -v tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-gke.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml b/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml index 295bb3b59c..a7a213f382 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml @@ -38,6 +38,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -51,7 +52,7 @@ steps: BLUEPRINT=examples/rapid-storage-slurm.yaml bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook 
tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-rapid-storage.yaml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: diff --git a/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml b/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml index 9de11f0a43..72682b664e 100644 --- a/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml +++ b/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml @@ -41,6 +41,7 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: + - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" args: @@ -55,7 +56,7 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \ + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="@tools/cloud-build/daily-tests/tests/spack-gromacs.yml" secretEnv: ['GCLUSTER_GCS_PATH'] availableSecrets: From 2d8cfb05e9ff896101a7c14684054ce15ec89204 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Tue, 23 Jun 2026 08:55:04 +0000 Subject: [PATCH 02/13] Fix include path and improve triage agent trigger playbook --- .../tasks/rescue_gcluster_failure.yml | 2 +- .../tasks/trigger_failure_triage_agent.yml | 33 +++++++++---------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml 
b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml index 983f0b7479..e27ca4a236 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/rescue_gcluster_failure.yml @@ -19,7 +19,7 @@ - workspace is defined - name: Include Failure Triage Agent trigger tasks - ansible.builtin.include_tasks: tasks/trigger_failure_triage_agent.yml + ansible.builtin.include_tasks: trigger_failure_triage_agent.yml - name: Delete Firewall Rule register: fw_deleted diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index 543f2591c6..3d0984bfdc 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -14,25 +14,25 @@ --- - name: Set Triage Agent Configuration ansible.builtin.set_fact: - triage_gcs_bucket: "g-ift-agent-bucket" - triage_project_id: "508417052821" - triage_invoker_sa: "triage-invoker@hpc-toolkit-dev.iam.gserviceaccount.com" - triage_cloud_run_url: "https://failure-triage-agent-508417052821.us-central1.run.app" + triage_gcs_bucket: "{{ triage_gcs_bucket_override | default('g-ift-agent-bucket') }}" + triage_project_id: "{{ triage_project_id_override | default('508417052821') }}" + triage_invoker_sa: "{{ triage_invoker_sa_override | default('triage-invoker@hpc-toolkit-dev.iam.gserviceaccount.com') }}" + triage_cloud_run_url: "{{ triage_cloud_run_url_override | default('https://failure-triage-agent-508417052821.us-central1.run.app') }}" - name: Check Triage Agent Prerequisites delegate_to: localhost changed_when: false args: executable: /bin/bash + environment: + TRIAGE_BUILD_ID: "{{ full_build_id | default('') }}" ansible.builtin.shell: | - BUILD_ID="{{ full_build_id | 
default('') }}" - - if [ -z "$BUILD_ID" ]; then + if [ -z "$TRIAGE_BUILD_ID" ]; then echo "SKIPPED: No full_build_id provided. Append full_build_id=... to extra-vars to trigger." exit 0 fi - if ! gcloud storage ls gs://{{ triage_gcs_bucket }}/ENABLE_TRIAGE_AGENT >/dev/null 2>&1; then + if ! gcloud storage ls "gs://{{ triage_gcs_bucket }}/ENABLE_TRIAGE_AGENT" >/dev/null 2>&1; then echo "SKIPPED: Failure Triage Agent is currently disabled (no ENABLE_TRIAGE_AGENT flag found in GCS)." exit 0 fi @@ -50,6 +50,8 @@ changed_when: false args: executable: /bin/bash + environment: + TRIAGE_BUILD_ID: "{{ triage_build_id }}" ansible.builtin.shell: | TOKEN=$(gcloud auth print-identity-token --impersonate-service-account="{{ triage_invoker_sa }}" --audiences="{{ triage_cloud_run_url }}" 2>/dev/null) if [ -z "$TOKEN" ]; then @@ -60,7 +62,7 @@ HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST "{{ triage_cloud_run_url }}/trigger" \ -H "Authorization: Bearer $TOKEN" \ -H "Content-Type: application/json" \ - -d '{"build_id": "{{ triage_build_id }}", "project_id": "{{ triage_project_id }}"}') + -d "{\"build_id\": \"$TRIAGE_BUILD_ID\", \"project_id\": \"{{ triage_project_id }}\"}") if [ "$HTTP_STATUS" != "200" ] && [ "$HTTP_STATUS" != "202" ]; then echo "Failed to trigger agent. 
HTTP Status: $HTTP_STATUS" >&2 @@ -72,18 +74,15 @@ changed_when: false args: executable: /bin/bash + environment: + TRIAGE_BUILD_ID: "{{ triage_build_id }}" ansible.builtin.shell: | - sleep 300 + sleep 60 for i in {1..30}; do - STATE_JSON=$(gcloud storage cat "gs://{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json" 2>/dev/null || echo '{}') + STATE_JSON=$(gcloud storage cat "gs://{{ triage_gcs_bucket }}/$TRIAGE_BUILD_ID/state.json" 2>/dev/null || echo '{}') STATUS=$(echo "$STATE_JSON" | python3 -c "import sys, json; print(json.load(sys.stdin).get('status', ''))" 2>/dev/null) if [ "$STATUS" = "completed" ] || [ "$STATUS" = "failed" ]; then - if [ "$STATUS" = "completed" ]; then - sleep 300 - gcloud storage cat "gs://{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json" 2>/dev/null || echo '{}' - else - echo "$STATE_JSON" - fi + echo "$STATE_JSON" exit 0 fi sleep 60 From c2b066852153282386d1d85ab0362b22c51e91f7 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Tue, 23 Jun 2026 09:33:33 +0000 Subject: [PATCH 03/13] chore: remove stderr suppression from gcloud and curl commands --- .../ansible_playbooks/tasks/trigger_failure_triage_agent.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index 3d0984bfdc..26497fc762 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -53,13 +53,13 @@ environment: TRIAGE_BUILD_ID: "{{ triage_build_id }}" ansible.builtin.shell: | - TOKEN=$(gcloud auth print-identity-token --impersonate-service-account="{{ triage_invoker_sa }}" --audiences="{{ triage_cloud_run_url }}" 2>/dev/null) + TOKEN=$(gcloud auth print-identity-token --impersonate-service-account="{{ triage_invoker_sa }}" 
--audiences="{{ triage_cloud_run_url }}") if [ -z "$TOKEN" ]; then echo "Failed to get identity token." >&2 exit 1 fi - HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST "{{ triage_cloud_run_url }}/trigger" \ + HTTP_STATUS=$(curl -sS -o /dev/null -w "%{http_code}" -X POST "{{ triage_cloud_run_url }}/trigger" \ -H "Authorization: Bearer $TOKEN" \ -H "Content-Type: application/json" \ -d "{\"build_id\": \"$TRIAGE_BUILD_ID\", \"project_id\": \"{{ triage_project_id }}\"}") From 6e9582c506d20e52578661de4779de1bee2fc03b Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Wed, 24 Jun 2026 03:50:42 +0000 Subject: [PATCH 04/13] Updating the Environment vars --- .../tasks/trigger_failure_triage_agent.yml | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index 26497fc762..e21ee0dc85 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -15,7 +15,7 @@ - name: Set Triage Agent Configuration ansible.builtin.set_fact: triage_gcs_bucket: "{{ triage_gcs_bucket_override | default('g-ift-agent-bucket') }}" - triage_project_id: "{{ triage_project_id_override | default('508417052821') }}" + triage_project_number: "{{ triage_project_number_override | default('508417052821') }}" triage_invoker_sa: "{{ triage_invoker_sa_override | default('triage-invoker@hpc-toolkit-dev.iam.gserviceaccount.com') }}" triage_cloud_run_url: "{{ triage_cloud_run_url_override | default('https://failure-triage-agent-508417052821.us-central1.run.app') }}" @@ -26,13 +26,14 @@ executable: /bin/bash environment: TRIAGE_BUILD_ID: "{{ full_build_id | default('') }}" + TRIAGE_GCS_BUCKET: "{{ triage_gcs_bucket }}" ansible.builtin.shell: | if [ -z 
"$TRIAGE_BUILD_ID" ]; then echo "SKIPPED: No full_build_id provided. Append full_build_id=... to extra-vars to trigger." exit 0 fi - if ! gcloud storage ls "gs://{{ triage_gcs_bucket }}/ENABLE_TRIAGE_AGENT" >/dev/null 2>&1; then + if ! gcloud storage ls "gs://$TRIAGE_GCS_BUCKET/ENABLE_TRIAGE_AGENT" >/dev/null 2>&1; then echo "SKIPPED: Failure Triage Agent is currently disabled (no ENABLE_TRIAGE_AGENT flag found in GCS)." exit 0 fi @@ -52,17 +53,20 @@ executable: /bin/bash environment: TRIAGE_BUILD_ID: "{{ triage_build_id }}" + TRIAGE_INVOKER_SA: "{{ triage_invoker_sa }}" + TRIAGE_CLOUD_RUN_URL: "{{ triage_cloud_run_url }}" + TRIAGE_PROJECT_NUMBER: "{{ triage_project_number }}" ansible.builtin.shell: | - TOKEN=$(gcloud auth print-identity-token --impersonate-service-account="{{ triage_invoker_sa }}" --audiences="{{ triage_cloud_run_url }}") + TOKEN=$(gcloud auth print-identity-token --impersonate-service-account="$TRIAGE_INVOKER_SA" --audiences="$TRIAGE_CLOUD_RUN_URL") if [ -z "$TOKEN" ]; then echo "Failed to get identity token." >&2 exit 1 fi - HTTP_STATUS=$(curl -sS -o /dev/null -w "%{http_code}" -X POST "{{ triage_cloud_run_url }}/trigger" \ + HTTP_STATUS=$(curl -sS -o /dev/null -w "%{http_code}" -X POST "$TRIAGE_CLOUD_RUN_URL/trigger" \ -H "Authorization: Bearer $TOKEN" \ -H "Content-Type: application/json" \ - -d "{\"build_id\": \"$TRIAGE_BUILD_ID\", \"project_id\": \"{{ triage_project_id }}\"}") + -d "{\"build_id\": \"$TRIAGE_BUILD_ID\", \"project_number\": \"$TRIAGE_PROJECT_NUMBER\"}") if [ "$HTTP_STATUS" != "200" ] && [ "$HTTP_STATUS" != "202" ]; then echo "Failed to trigger agent. 
HTTP Status: $HTTP_STATUS" >&2 @@ -76,10 +80,11 @@ executable: /bin/bash environment: TRIAGE_BUILD_ID: "{{ triage_build_id }}" + TRIAGE_GCS_BUCKET: "{{ triage_gcs_bucket }}" ansible.builtin.shell: | sleep 60 for i in {1..30}; do - STATE_JSON=$(gcloud storage cat "gs://{{ triage_gcs_bucket }}/$TRIAGE_BUILD_ID/state.json" 2>/dev/null || echo '{}') + STATE_JSON=$(gcloud storage cat "gs://$TRIAGE_GCS_BUCKET/$TRIAGE_BUILD_ID/state.json" 2>/dev/null || echo '{}') STATUS=$(echo "$STATE_JSON" | python3 -c "import sys, json; print(json.load(sys.stdin).get('status', ''))" 2>/dev/null) if [ "$STATUS" = "completed" ] || [ "$STATUS" = "failed" ]; then echo "$STATE_JSON" From abc662570f86ea552299a838978751ec078e29d1 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Wed, 24 Jun 2026 07:05:36 +0000 Subject: [PATCH 05/13] Kill switch updated --- .../ansible_playbooks/tasks/trigger_failure_triage_agent.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index e21ee0dc85..bcce628235 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -33,8 +33,9 @@ exit 0 fi - if ! gcloud storage ls "gs://$TRIAGE_GCS_BUCKET/ENABLE_TRIAGE_AGENT" >/dev/null 2>&1; then - echo "SKIPPED: Failure Triage Agent is currently disabled (no ENABLE_TRIAGE_AGENT flag found in GCS)." + CONFIG_CONTENT=$(gcloud storage cat "gs://$TRIAGE_GCS_BUCKET/config_triage_agent.env" 2>/dev/null || echo "enable_agent=false") + if ! echo "$CONFIG_CONTENT" | grep -qi "^enable_agent=true"; then + echo "SKIPPED: Failure Triage Agent is currently disabled in config_triage_agent.env." 
exit 0 fi From 6882199b04ace2f1a33d75862c29c751f270d332 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Wed, 24 Jun 2026 12:49:17 +0000 Subject: [PATCH 06/13] resolved few comments --- .../tasks/trigger_failure_triage_agent.yml | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index bcce628235..14eee2b614 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -29,13 +29,18 @@ TRIAGE_GCS_BUCKET: "{{ triage_gcs_bucket }}" ansible.builtin.shell: | if [ -z "$TRIAGE_BUILD_ID" ]; then - echo "SKIPPED: No full_build_id provided. Append full_build_id=... to extra-vars to trigger." + echo "SKIPPED: The 'full_build_id' variable is missing." >&2 + exit 0 + fi + + if ! gcloud storage buckets describe "gs://$TRIAGE_GCS_BUCKET" >/dev/null 2>&1; then + echo "SKIPPED: Triage Agent bucket '$TRIAGE_GCS_BUCKET' does not exist." >&2 exit 0 fi CONFIG_CONTENT=$(gcloud storage cat "gs://$TRIAGE_GCS_BUCKET/config_triage_agent.env" 2>/dev/null || echo "enable_agent=false") - if ! echo "$CONFIG_CONTENT" | grep -qi "^enable_agent=true"; then - echo "SKIPPED: Failure Triage Agent is currently disabled in config_triage_agent.env." + if ! echo "$CONFIG_CONTENT" | grep -qi '^[[:space:]]*enable_agent[[:space:]]*=[[:space:]]*true'; then + echo "SKIPPED: Failure Triage Agent is currently disabled in config_triage_agent.env." 
>&2 exit 0 fi @@ -64,13 +69,17 @@ exit 1 fi - HTTP_STATUS=$(curl -sS -o /dev/null -w "%{http_code}" -X POST "$TRIAGE_CLOUD_RUN_URL/trigger" \ + RESPONSE=$(curl -sS -w "\n%{http_code}" -X POST "$TRIAGE_CLOUD_RUN_URL/trigger" \ -H "Authorization: Bearer $TOKEN" \ -H "Content-Type: application/json" \ -d "{\"build_id\": \"$TRIAGE_BUILD_ID\", \"project_number\": \"$TRIAGE_PROJECT_NUMBER\"}") - if [ "$HTTP_STATUS" != "200" ] && [ "$HTTP_STATUS" != "202" ]; then + HTTP_STATUS=$(echo "$RESPONSE" | tail -n1) + BODY=$(echo "$RESPONSE" | sed '$d') + + if [ "$HTTP_STATUS" != "202" ]; then echo "Failed to trigger agent. HTTP Status: $HTTP_STATUS" >&2 + echo "Response Body: $BODY" >&2 exit 1 fi @@ -83,6 +92,7 @@ TRIAGE_BUILD_ID: "{{ triage_build_id }}" TRIAGE_GCS_BUCKET: "{{ triage_gcs_bucket }}" ansible.builtin.shell: | + # Wait for Cloud Run to start and for the initial state file to be copied sleep 60 for i in {1..30}; do STATE_JSON=$(gcloud storage cat "gs://$TRIAGE_GCS_BUCKET/$TRIAGE_BUILD_ID/state.json" 2>/dev/null || echo '{}') @@ -91,6 +101,7 @@ echo "$STATE_JSON" exit 0 fi + # Time delay between polling attempts sleep 60 done exit 1 @@ -103,13 +114,13 @@ msg: | {% if agent_state.failed or (agent_state.stdout | default('{}', true) | from_json).status | default('') != 'completed' %} Failure Triage Agent testing did not complete in time or failed internally. 
- - State File Link: - https://console.cloud.google.com/storage/browser/_details/{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json {% else %} TRIAGE EXECUTIVE SUMMARY: - {{ (agent_state.stdout | default('{}', true) | from_json).executive_summary | default('No summary available.') }} + {{ (agent_state.stdout | default('{}', true) | from_json).executive_summary | default('No summary available.') | wordwrap(100) }} - Full forensic report available at: + Full diagnostic report available at: https://storage.cloud.google.com/{{ triage_gcs_bucket }}/{{ triage_build_id }}/report.txt {% endif %} + + For detailed intermediate state information, please review the state file: + https://console.cloud.google.com/storage/browser/_details/{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json From 8dd7d1cca94997b06da7e635a1f1c9bd6aa079a7 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Thu, 25 Jun 2026 04:42:59 +0000 Subject: [PATCH 07/13] Added a check for state file existence to fail fast --- .../tasks/trigger_failure_triage_agent.yml | 11 +++++++++-- .../cloud-build/daily-tests/builds/gke-a4-onspot.yaml | 2 +- .../cloud-build/daily-tests/builds/gke-g4-onspot.yaml | 2 +- .../daily-tests/builds/gke-h4d-onspot.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml | 2 +- .../daily-tests/builds/gke-tpu-v6e-flex.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml | 2 +- tools/cloud-build/daily-tests/builds/h4d-vm.yaml | 2 +- .../builds/ml-a3-highgpu-onspot-slurm.yaml | 2 +- .../builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml | 2 +- .../builds/ml-a3-ultragpu-custom-blueprint-test.yaml | 2 +- .../builds/ml-a3-ultragpu-onspot-jbvms.yaml | 2 +- .../builds/ml-a3-ultragpu-onspot-slurm.yaml | 2 +- .../ml-a4-highgpu-custom-2404-blueprint-test.yaml | 2 +- .../builds/ml-a4-highgpu-custom-blueprint-test.yaml | 2 +- .../builds/ml-a4-highgpu-onspot-slurm.yaml | 2 +- .../daily-tests/builds/ml-g4-onspot-slurm.yaml | 2 +- 
.../daily-tests/builds/ml-h4d-onspot-slurm.yaml | 2 +- .../daily-tests/builds/slurm-gcp-v6-tpu.yaml | 2 +- 19 files changed, 27 insertions(+), 20 deletions(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index 14eee2b614..8c0c9c5747 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -94,6 +94,13 @@ ansible.builtin.shell: | # Wait for Cloud Run to start and for the initial state file to be copied sleep 60 + + # Fail fast if state.json is not present after initial buffer + if ! gcloud storage ls "gs://$TRIAGE_GCS_BUCKET/$TRIAGE_BUILD_ID/state.json" >/dev/null 2>&1; then + echo "Agent failed to start: state.json was not created within 60 seconds." >&2 + exit 1 + fi + for i in {1..30}; do STATE_JSON=$(gcloud storage cat "gs://$TRIAGE_GCS_BUCKET/$TRIAGE_BUILD_ID/state.json" 2>/dev/null || echo '{}') STATUS=$(echo "$STATE_JSON" | python3 -c "import sys, json; print(json.load(sys.stdin).get('status', ''))" 2>/dev/null) @@ -102,7 +109,7 @@ exit 0 fi # Time delay between polling attempts - sleep 60 + sleep 30 done exit 1 register: agent_state @@ -115,7 +122,7 @@ {% if agent_state.failed or (agent_state.stdout | default('{}', true) | from_json).status | default('') != 'completed' %} Failure Triage Agent testing did not complete in time or failed internally. 
{% else %} - TRIAGE EXECUTIVE SUMMARY: + TRIAGE AGENT SUMMARY: {{ (agent_state.stdout | default('{}', true) | from_json).executive_summary | default('No summary available.') | wordwrap(100) }} Full diagnostic report available at: diff --git a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml index 99df32fff7..82df55b1d3 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml @@ -39,13 +39,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4spgke" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml index 3d91e60821..de7d3564b1 100644 --- a/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "MACHINE_TYPE=g4-standard-48" - "INSTANCE_PREFIX=g4spgke" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/g4options.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml index c311e149f1..af4c87e5a8 100644 --- a/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml +++ 
b/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml @@ -39,13 +39,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "MACHINE_TYPE=h4d-highmem-192-lssd" - "INSTANCE_PREFIX=h4dspgke" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/h4doptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml index 5753bfef8c..fd606bafe0 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml @@ -37,13 +37,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=tpu7x-standard-4t" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "INSTANCE_PREFIX=tpu7xsp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/tpu7xoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml index a2dfa12a42..89dbeb2ee4 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml @@ -39,10 +39,10 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" + - "BUILD_ID=$BUILD_ID" args: - -c - | diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml 
b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml index 7302d2d219..53c79dd3f7 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml @@ -36,7 +36,6 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=tpu" @@ -45,6 +44,7 @@ steps: - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "INSTANCE_PREFIX=v6esp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/tpuv6eoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml index eecd78084a..b4d7da55d0 100644 --- a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml @@ -35,13 +35,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=h4d-highmem-192-lssd" - "INSTANCE_PREFIX=h4dsp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/h4doptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml index b754ff82eb..7684417886 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml @@ -40,13 +40,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - 
"ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=a3-highgpu-8g" - "NUM_NODES=4" - "INSTANCE_PREFIX=a3hsp" - "PROJECT_ID=$PROJECT_ID" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml index d0c084226e..2a81deeabb 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml @@ -41,7 +41,6 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" # General Ansible configuration - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" @@ -49,6 +48,7 @@ steps: - "NUM_NODES=4" - "INSTANCE_PREFIX=a3msp" - "PROJECT_ID=$PROJECT_ID" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3moptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml index 87f4a3f560..78cf3b31b2 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git 
a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml index 76f5feb863..f9cc32ef18 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp-jbvms" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml index 39152d9fe6..7cb332c505 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml @@ -42,13 +42,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml index 96b11bc057..d430309b35 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml +++ 
b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4hsp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml index be2fc40802..600a676c8c 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4hsp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml index 4c77e5c711..30544f47a6 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml @@ -41,13 +41,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - 
"PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a4-highgpu-8g" - "INSTANCE_PREFIX=a4hsp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a4hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml index ec2688d7b0..a98396d59d 100644 --- a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml @@ -38,13 +38,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "MACHINE_TYPE=g4-standard-48" - "INSTANCE_PREFIX=g4osp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/g4options.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml index 7d131f27d5..ee702461e4 100644 --- a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml @@ -36,7 +36,6 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" # General Ansible configuration - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" @@ -44,6 +43,7 @@ steps: - "NUM_NODES=4" - "INSTANCE_PREFIX=h4dspon" - "PROJECT_ID=$PROJECT_ID" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/h4doptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml index 06600857ec..39664df2a4 100644 --- 
a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml @@ -36,7 +36,6 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=tpu" @@ -45,6 +44,7 @@ steps: - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=2" - "INSTANCE_PREFIX=v3sp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/tpuv3options.txt" args: - -c From 1e3463015b6a09d9422a51fd89e0641a9dcb2d55 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Thu, 25 Jun 2026 04:56:44 +0000 Subject: [PATCH 08/13] Updated ansible message for state file --- .../ansible_playbooks/tasks/trigger_failure_triage_agent.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index 8c0c9c5747..81c41cb73f 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -129,5 +129,5 @@ https://storage.cloud.google.com/{{ triage_gcs_bucket }}/{{ triage_build_id }}/report.txt {% endif %} - For detailed intermediate state information, please review the state file: + For detailed intermediate state information, please review the diagnostic state file: https://console.cloud.google.com/storage/browser/_details/{{ triage_gcs_bucket }}/{{ triage_build_id }}/state.json From ebd355aa17a95e7f5b93c5e80b9576df7ef40ed0 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Thu, 25 Jun 2026 06:20:10 +0000 Subject: [PATCH 09/13] corrected the position of env var --- .../daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml | 2 +- 
tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml | 2 +- .../cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml | 2 +- .../builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml index 8e00273079..dea1b52a82 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml @@ -36,7 +36,6 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" @@ -46,6 +45,7 @@ steps: - "PROVISIONING_MODEL=SPOT" - "MACHINE_TYPE=a2-highgpu-2g" - "INSTANCE_PREFIX=a2hspgke" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a2hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml index 123d461efd..77af9c6699 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "MACHINE_TYPE=a3-megagpu-8g" - "NUM_NODES=4" - "INSTANCE_PREFIX=a3mspgke" - "PROJECT_ID=$PROJECT_ID" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3moptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml 
b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml index 86b4b70e63..7996ff9607 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml @@ -39,13 +39,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3uspgke" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml index b8eb64c1b1..928138004e 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-ultragpu-8g" - "INSTANCE_PREFIX=a3usp" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3uoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: From 4f30632de183c72a3d577397596049134a4c0bc4 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Thu, 25 Jun 2026 06:24:21 +0000 Subject: [PATCH 10/13] corrected the position of env var --- tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml 
b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml index 54936ded7f..10a5b17aeb 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml @@ -36,13 +36,13 @@ steps: name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner entrypoint: /bin/bash env: - - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - "PROJECT_ID=$PROJECT_ID" - "NUM_NODES=4" - "MACHINE_TYPE=a3-highgpu-8g" - "INSTANCE_PREFIX=a3hspgke" + - "BUILD_ID=$BUILD_ID" - "OPTIONS_GCS_PATH=gs://hpc-ctk1357/a3hoptions.txt" - "ENABLE_SPOT_FALLBACK=true" args: From 4ab6c54a447476239bc88c58046d207829547358 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Thu, 25 Jun 2026 18:21:30 +0000 Subject: [PATCH 11/13] chore(ci): configure failure triage agent with Secret Manager --- .../tasks/trigger_failure_triage_agent.yml | 18 ++++-- .../daily-tests/builds/ansible-vm.yaml | 16 ++++- .../daily-tests/builds/batch-mpi.yaml | 16 ++++- .../cloud-build/daily-tests/builds/batch.yaml | 16 ++++- .../builds/chrome-remote-desktop-ubuntu.yaml | 16 ++++- .../builds/chrome-remote-desktop.yaml | 16 ++++- .../builds/gke-a2-highgpu-kueue-onspot.yaml | 16 ++++- .../builds/gke-a3-highgpu-onspot.yaml | 16 ++++- .../daily-tests/builds/gke-a3-highgpu.yaml | 16 ++++- .../builds/gke-a3-megagpu-onspot.yaml | 16 ++++- .../daily-tests/builds/gke-a3-megagpu.yaml | 16 ++++- .../builds/gke-a3-ultragpu-onspot.yaml | 14 ++++- .../daily-tests/builds/gke-a3-ultragpu.yaml | 16 ++++- .../daily-tests/builds/gke-a4-onspot.yaml | 14 ++++- .../daily-tests/builds/gke-a4x.yaml | 16 ++++- .../daily-tests/builds/gke-g4-onspot.yaml | 16 ++++- .../daily-tests/builds/gke-g4.yaml | 16 ++++- .../daily-tests/builds/gke-h4d-onspot.yaml | 16 ++++- .../daily-tests/builds/gke-h4d.yaml | 16 ++++- .../builds/gke-inactive-reservation.yaml | 16 ++++- .../builds/gke-managed-hyperdisk.yaml 
| 16 ++++- .../builds/gke-managed-lustre.yaml | 16 ++++- .../daily-tests/builds/gke-storage.yaml | 16 ++++- .../daily-tests/builds/gke-tpu-7x.yaml | 14 ++++- .../daily-tests/builds/gke-tpu-v6e-flex.yaml | 16 ++++- .../daily-tests/builds/gke-tpu-v6e.yaml | 14 ++++- tools/cloud-build/daily-tests/builds/gke.yaml | 16 ++++- .../daily-tests/builds/h4d-vm.yaml | 16 ++++- .../cloud-build/daily-tests/builds/hcls.yaml | 16 ++++- .../builds/hpc-build-slurm-image.yaml | 16 ++++- .../builds/hpc-enterprise-slurm.yaml | 16 ++++- .../daily-tests/builds/htc-slurm.yaml | 16 ++++- .../daily-tests/builds/htcondor.yaml | 16 ++++- .../builds/ml-a3-highgpu-onspot-slurm.yaml | 14 ++++- .../builds/ml-a3-highgpu-slurm.yaml | 14 ++++- .../ml-a3-megagpu-onspot-slurm-ubuntu.yaml | 16 ++++- .../builds/ml-a3-megagpu-slurm-ubuntu.yaml | 16 ++++- ...3-ultragpu-custom-2404-blueprint-test.yaml | 16 ++++- .../ml-a3-ultragpu-custom-blueprint-test.yaml | 14 ++++- .../builds/ml-a3-ultragpu-jbvms.yaml | 16 ++++- .../builds/ml-a3-ultragpu-onspot-jbvms.yaml | 16 ++++- .../builds/ml-a3-ultragpu-onspot-slurm.yaml | 14 ++++- .../builds/ml-a3-ultragpu-slurm.yaml | 16 ++++- ...a4-highgpu-custom-2404-blueprint-test.yaml | 16 ++++- .../ml-a4-highgpu-custom-blueprint-test.yaml | 14 ++++- .../builds/ml-a4-highgpu-onspot-slurm.yaml | 14 ++++- .../ml-a4x-highgpu-custom-blueprint-test.yaml | 14 ++++- .../builds/ml-a4x-highgpu-slurm.yaml | 16 ++++- .../builds/ml-g4-onspot-slurm.yaml | 16 ++++- .../daily-tests/builds/ml-gke-e2e.yaml | 16 ++++- .../daily-tests/builds/ml-gke.yaml | 16 ++++- .../builds/ml-h4d-onspot-slurm.yaml | 16 ++++- .../daily-tests/builds/ml-slurm.yaml | 16 ++++- .../daily-tests/builds/monitoring.yaml | 16 ++++- .../daily-tests/builds/netapp-volumes.yaml | 16 ++++- .../daily-tests/builds/ofe-deployment.yaml | 16 ++++- .../daily-tests/builds/packer.yaml | 16 ++++- .../builds/pfs-managed-lustre-slurm.yaml | 16 ++++- .../builds/pfs-managed-lustre-vm.yaml | 16 ++++- 
.../cloud-build/daily-tests/builds/slinky.yml | 16 ++++- .../builds/slurm-gcp-v6-debian.yaml | 16 ++++- .../builds/slurm-gcp-v6-rocky8.yaml | 16 ++++- .../daily-tests/builds/slurm-gcp-v6-ssd.yaml | 16 ++++- .../builds/slurm-gcp-v6-startup-scripts.yaml | 16 ++++- .../daily-tests/builds/slurm-gcp-v6-tpu.yaml | 16 ++++- .../builds/slurm-gcp-v6-ubuntu.yaml | 16 ++++- .../daily-tests/builds/slurm-gke.yaml | 16 ++++- .../builds/slurm-rapid-storage.yaml | 16 ++++- .../daily-tests/builds/spack-gromacs.yaml | 16 ++++- validate.py | 58 +++++++++++++++++++ 70 files changed, 1013 insertions(+), 129 deletions(-) create mode 100644 validate.py diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index 81c41cb73f..40076755ef 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -14,10 +14,10 @@ --- - name: Set Triage Agent Configuration ansible.builtin.set_fact: - triage_gcs_bucket: "{{ triage_gcs_bucket_override | default('g-ift-agent-bucket') }}" - triage_project_number: "{{ triage_project_number_override | default('508417052821') }}" - triage_invoker_sa: "{{ triage_invoker_sa_override | default('triage-invoker@hpc-toolkit-dev.iam.gserviceaccount.com') }}" - triage_cloud_run_url: "{{ triage_cloud_run_url_override | default('https://failure-triage-agent-508417052821.us-central1.run.app') }}" + triage_gcs_bucket: "{{ triage_gcs_bucket_override | default('') }}" + triage_project_number: "{{ triage_project_number_override | default('') }}" + triage_invoker_sa: "{{ triage_invoker_sa_override | default('') }}" + triage_cloud_run_url: "{{ triage_cloud_run_url_override | default('') }}" - name: Check Triage Agent Prerequisites delegate_to: localhost @@ -27,7 +27,15 @@ environment: TRIAGE_BUILD_ID: "{{ 
full_build_id | default('') }}" TRIAGE_GCS_BUCKET: "{{ triage_gcs_bucket }}" + TRIAGE_PROJECT_NUMBER: "{{ triage_project_number }}" + TRIAGE_INVOKER_SA: "{{ triage_invoker_sa }}" + TRIAGE_CLOUD_RUN_URL: "{{ triage_cloud_run_url }}" ansible.builtin.shell: | + if [ -z "$TRIAGE_GCS_BUCKET" ] || [ -z "$TRIAGE_PROJECT_NUMBER" ] || [ -z "$TRIAGE_INVOKER_SA" ] || [ -z "$TRIAGE_CLOUD_RUN_URL" ]; then + echo "SKIPPED: One or more Triage Agent configuration variables are missing." >&2 + exit 0 + fi + if [ -z "$TRIAGE_BUILD_ID" ]; then echo "SKIPPED: The 'full_build_id' variable is missing." >&2 exit 0 @@ -46,6 +54,7 @@ echo "PROCEED: Agent is enabled and build ID is present." register: triage_init + ignore_errors: true - name: Execute Triage Agent Pipeline when: "'PROCEED' in triage_init.stdout" @@ -82,6 +91,7 @@ echo "Response Body: $BODY" >&2 exit 1 fi + ignore_errors: true - name: Wait for Analysis to Complete delegate_to: localhost diff --git a/tools/cloud-build/daily-tests/builds/ansible-vm.yaml b/tools/cloud-build/daily-tests/builds/ansible-vm.yaml index 6b7b0b78ed..2136bf0df7 100644 --- a/tools/cloud-build/daily-tests/builds/ansible-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/ansible-vm.yaml @@ -50,9 +50,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ansible-vm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ansible-vm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 
'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml index d65223ea6f..9dc47296b9 100644 --- a/tools/cloud-build/daily-tests/builds/batch-mpi.yaml +++ b/tools/cloud-build/daily-tests/builds/batch-mpi.yaml @@ -35,6 +35,14 @@ availableSecrets: env: SPACK_CACHE_WRF - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' steps: # While using static network names we are guarding against more than 1 instance running at a time (for multi-group tests) @@ -50,7 +58,7 @@ steps: - "BUILD_ID=$BUILD_ID" - "ANSIBLE_HOST_KEY_CHECKING=false" - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg" - secretEnv: ['SPACK_CACHE_WRF', 'GCLUSTER_GCS_PATH'] + secretEnv: ['SPACK_CACHE_WRF', 'GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 
'TRIAGE_CLOUD_RUN_URL'] args: - -c - | @@ -74,4 +82,8 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/batch-mpi.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/batch-mpi.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" diff --git a/tools/cloud-build/daily-tests/builds/batch.yaml b/tools/cloud-build/daily-tests/builds/batch.yaml index 151f90689e..50384dc928 100644 --- a/tools/cloud-build/daily-tests/builds/batch.yaml +++ b/tools/cloud-build/daily-tests/builds/batch.yaml @@ -52,9 +52,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/batch.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/batch.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 
'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml index d8a7faf705..566a70d1ca 100644 --- a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop-ubuntu.yaml @@ -49,9 +49,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID os=ubuntu" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/chrome-remote-desktop.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/chrome-remote-desktop.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: 
projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml index 45e362f453..b1de19c23a 100644 --- a/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml +++ b/tools/cloud-build/daily-tests/builds/chrome-remote-desktop.yaml @@ -50,9 +50,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID os=default" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/chrome-remote-desktop.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/chrome-remote-desktop.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git 
a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml index dea1b52a82..683082bf77 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a2-highgpu-kueue-onspot.yaml @@ -112,9 +112,21 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml index 10a5b17aeb..49d0eda113 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu-onspot.yaml @@ -101,9 +101,21 @@ 
steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml index 740969e02e..1b05ae8f41 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-highgpu.yaml @@ -71,9 +71,21 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + 
--extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-highgpu.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml index 77af9c6699..1e8bbbed26 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu-onspot.yaml @@ -100,9 +100,21 @@ steps: --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + 
secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml index 77b93203d8..f143a28573 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-megagpu.yaml @@ -71,9 +71,21 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-megagpu.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: 
projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml index 7996ff9607..d02f767821 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml @@ -103,11 +103,23 @@ steps: --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest +
env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/cluster-health-scanner/versions/latest env: 'CHS_REPO' diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml index 6b2913a86c..fa1df10498 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml @@ -74,9 +74,21 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 
'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml index 82df55b1d3..911356d6ca 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml @@ -104,11 +104,23 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/cluster-health-scanner/versions/latest env: 'CHS_REPO' diff --git a/tools/cloud-build/daily-tests/builds/gke-a4x.yaml b/tools/cloud-build/daily-tests/builds/gke-a4x.yaml index 7bd570565d..a1c85d8b9a 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a4x.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a4x.yaml @@ -80,9 +80,21 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook
tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a4x.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a4x.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml index de7d3564b1..c3cf55a1d4 100644 --- a/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-g4-onspot.yaml @@ -94,9 +94,21 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${GKE_VARS_FILE}" \ + 
--extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-g4.yaml b/tools/cloud-build/daily-tests/builds/gke-g4.yaml index 57cbef85df..968788fa5d 100644 --- a/tools/cloud-build/daily-tests/builds/gke-g4.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-g4.yaml @@ -69,9 +69,21 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-g4.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-g4.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + 
secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml index af4c87e5a8..47fc37fced 100644 --- a/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-h4d-onspot.yaml @@ -97,9 +97,21 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + 
- versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-h4d.yaml b/tools/cloud-build/daily-tests/builds/gke-h4d.yaml index 76da954c1c..d30c742e1b 100644 --- a/tools/cloud-build/daily-tests/builds/gke-h4d.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-h4d.yaml @@ -77,9 +77,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-h4d.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-h4d.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: 
projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml b/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml index 6fcc2e5cea..52d610c8da 100644 --- a/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-inactive-reservation.yaml @@ -74,9 +74,21 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-inactive-reservation.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-inactive-reservation.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml 
b/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml index 5b353d3206..e1220c09c8 100644 --- a/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-managed-hyperdisk.yaml @@ -63,9 +63,21 @@ steps: bash tools/add_ttl_label.sh "$${SG_EXAMPLE}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-hyperdisk.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml b/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml index 034f848a4d..51591245bd 100644 --- a/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml +++ 
b/tools/cloud-build/daily-tests/builds/gke-managed-lustre.yaml @@ -65,9 +65,21 @@ steps: bash tools/add_ttl_label.sh "$${EXAMPLE_BP}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-lustre.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-managed-lustre.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-storage.yaml b/tools/cloud-build/daily-tests/builds/gke-storage.yaml index 2fc273cbee..32788cf6e1 100644 --- a/tools/cloud-build/daily-tests/builds/gke-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-storage.yaml @@ -66,9 +66,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} 
use_fixed_vpc=true" \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-storage.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml index fd606bafe0..faa03dee29 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml @@ -88,11 +88,23 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=us-central1 zone=us-central1-c" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + 
--extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/cluster-health-scanner/versions/latest env: 'CHS_REPO' diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml index 89dbeb2ee4..69ac3d9c00 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e-flex.yaml @@ -74,9 +74,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke-tpu-v6e-flex.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke-tpu-v6e-flex.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ +
--extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml index 53c79dd3f7..320da33bc8 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml @@ -103,11 +103,23 @@ steps: --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${GKE_VARS_FILE}" + --extra-vars="@$${GKE_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + -
versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/cluster-health-scanner/versions/latest env: 'CHS_REPO' diff --git a/tools/cloud-build/daily-tests/builds/gke.yaml b/tools/cloud-build/daily-tests/builds/gke.yaml index 04273d6d5a..b96855bcd0 100644 --- a/tools/cloud-build/daily-tests/builds/gke.yaml +++ b/tools/cloud-build/daily-tests/builds/gke.yaml @@ -65,9 +65,21 @@ steps: bash tools/add_ttl_label.sh "$${SG_EXAMPLE}" ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/gke.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/gke.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 
'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml index b4d7da55d0..36acefcebd 100644 --- a/tools/cloud-build/daily-tests/builds/h4d-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/h4d-vm.yaml @@ -83,9 +83,21 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${H4D_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${H4D_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/hcls.yaml b/tools/cloud-build/daily-tests/builds/hcls.yaml index 4b9eae2ae7..92bea43f55 100644 --- a/tools/cloud-build/daily-tests/builds/hcls.yaml +++ b/tools/cloud-build/daily-tests/builds/hcls.yaml @@ -66,9 +66,21 @@ steps: 
ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hcls.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/hcls.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml index 07da3641c2..68e0bff2e7 100644 --- a/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml +++ b/tools/cloud-build/daily-tests/builds/hpc-build-slurm-image.yaml @@ -51,9 +51,21 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} 
build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-build-slurm-image.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml b/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml index dff2520b47..cf066f9b3f 100644 --- a/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/hpc-enterprise-slurm.yaml @@ -56,9 +56,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + 
--extra-vars="@tools/cloud-build/daily-tests/tests/hpc-enterprise-slurm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/htc-slurm.yaml b/tools/cloud-build/daily-tests/builds/htc-slurm.yaml index e80ec4dd07..aef05add59 100644 --- a/tools/cloud-build/daily-tests/builds/htc-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/htc-slurm.yaml @@ -54,9 +54,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/htc-slurm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/htc-slurm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + 
--extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/htcondor.yaml b/tools/cloud-build/daily-tests/builds/htcondor.yaml index 18fea66645..66add7b9ad 100644 --- a/tools/cloud-build/daily-tests/builds/htcondor.yaml +++ b/tools/cloud-build/daily-tests/builds/htcondor.yaml @@ -56,9 +56,21 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/htcondor-integration-test.yml \ - --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" --extra-vars="@tools/cloud-build/daily-tests/tests/htcondor.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" --extra-vars="@tools/cloud-build/daily-tests/tests/htcondor.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 
'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml index 7684417886..903bfd60ce 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml @@ -87,12 +87,24 @@ steps: --extra-vars="region=$${REGION} zone=$${ZONE}"\ --extra-vars="enable_spot=$${ENABLE_SPOT} "\ --extra-vars="tcpx_kernel_login=$${TCPX_KERNEL_LOGIN} tcpx_kernel_password=$${TCPX_KERNEL_PASSWORD} keyserver_ubuntu_key=$${KEYSERVER_UBUNTU_KEY} "\ - --extra-vars="@$${SLURM_VARS_FILE}" + --extra-vars="@$${SLURM_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName:
projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/tcpx-kernel-ppa-login/versions/latest env: 'TCPX_KERNEL_LOGIN' - versionName: projects/${PROJECT_ID}/secrets/tcpx-kernel-ppa-password/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml index 9a75f29500..440abd45e9 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml @@ -63,12 +63,24 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID "\ --extra-vars="region=$${REGION} zone=$${ZONE} "\ --extra-vars="tcpx_kernel_login=$${TCPX_KERNEL_LOGIN} tcpx_kernel_password=$${TCPX_KERNEL_PASSWORD} keyserver_ubuntu_key=$${KEYSERVER_UBUNTU_KEY} "\ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-highgpu-slurm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName:
projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/tcpx-kernel-ppa-login/versions/latest env: 'TCPX_KERNEL_LOGIN' - versionName: projects/${PROJECT_ID}/secrets/tcpx-kernel-ppa-password/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml index 2a81deeabb..6b46e2efdb 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-onspot-slurm-ubuntu.yaml @@ -90,9 +90,21 @@ steps: --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${SLURM_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${SLURM_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - 
versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml index 134090cc99..b775c8cc35 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-megagpu-slurm-ubuntu.yaml @@ -67,9 +67,21 @@ steps: --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-ubuntu.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-megagpu-slurm-ubuntu.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git 
a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml index 928138004e..ca83f78ece 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-2404-blueprint-test.yaml @@ -83,11 +83,23 @@ steps: --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ --extra-vars="instance_image_family=$${CUSTOM_IMAGE_FAMILY}" \ - --extra-vars="@$${VARS_FILE}" - secretEnv: ['CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + --extra-vars="@$${VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/custom-image-project/versions/latest env: 'CUSTOM_IMAGE_PROJECT' - versionName: projects/${PROJECT_ID}/secrets/custom-image-family-2404/versions/latest env: 'CUSTOM_IMAGE_FAMILY' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml 
b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml index 78cf3b31b2..2e5b0423f7 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml @@ -83,12 +83,24 @@ steps: --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ --extra-vars="instance_image_family=$${CUSTOM_IMAGE_FAMILY}" \ - --extra-vars="@$${VARS_FILE}" + --extra-vars="@$${VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/custom-image-project/versions/latest env: 'CUSTOM_IMAGE_PROJECT' - versionName: projects/${PROJECT_ID}/secrets/custom-image-family/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml index 5498eb9ef2..b71264926a 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml +++
b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-jbvms.yaml @@ -59,9 +59,21 @@ steps: --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-jbvms.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-jbvms.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml index f9cc32ef18..c7018ed917 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-jbvms.yaml @@ -84,9 +84,21 @@ steps: --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ 
--extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${JBVM_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${JBVM_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml index 7cb332c505..33bcbf7405 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml @@ -92,11 +92,23 @@ steps: --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${SLURM_VARS_FILE}" + --extra-vars="@$${SLURM_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + 
--extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/cluster-health-scanner/versions/latest env: 'CHS_REPO' diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml index 661726a114..91efa41702 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-slurm.yaml @@ -65,9 +65,21 @@ steps: --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-slurm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a3-ultragpu-slurm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL']
availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml index d430309b35..ccf3f46eb0 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-2404-blueprint-test.yaml @@ -82,11 +82,23 @@ steps: --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ --extra-vars="instance_image_family=$${CUSTOM_IMAGE_FAMILY}" \ - --extra-vars="@$${VARS_FILE}" - secretEnv: ['CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + --extra-vars="@$${VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/custom-image-project/versions/latest env: 'CUSTOM_IMAGE_PROJECT' - versionName: projects/${PROJECT_ID}/secrets/custom-image-family-2404/versions/latest env: 
'CUSTOM_IMAGE_FAMILY' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml index 600a676c8c..168d13f7d8 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml @@ -82,12 +82,24 @@ steps: --extra-vars="enable_spot=$${ENABLE_SPOT}" \ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ --extra-vars="instance_image_family=$${CUSTOM_IMAGE_FAMILY}" \ - --extra-vars="@$${VARS_FILE}" + --extra-vars="@$${VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName:
projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/custom-image-project/versions/latest env: 'CUSTOM_IMAGE_PROJECT' - versionName: projects/${PROJECT_ID}/secrets/custom-image-family/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml index 30544f47a6..abc860e20b 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml @@ -90,11 +90,23 @@ steps: --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID chs_repo=$${CHS_REPO}" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${SLURM_VARS_FILE}" + --extra-vars="@$${SLURM_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/cluster-health-scanner/versions/latest env: 'CHS_REPO' diff --git
a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml index 59622b4636..ba1c1fec28 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml @@ -65,12 +65,24 @@ steps: --extra-vars="region=$${REGION} zone=$${ZONE} "\ --extra-vars="instance_image_project=$${CUSTOM_IMAGE_PROJECT}" \ --extra-vars="instance_image_family=$${CUSTOM_IMAGE_FAMILY}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a4x-highgpu-custom-blueprint-test.yml" + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a4x-highgpu-custom-blueprint-test.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' - versionName: projects/${PROJECT_ID}/secrets/custom-image-project/versions/latest env: 'CUSTOM_IMAGE_PROJECT' - versionName: projects/${PROJECT_ID}/secrets/custom-image-family-a4x/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml
b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml index 656fb44ead..924daa0008 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-slurm.yaml @@ -69,9 +69,21 @@ steps: --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID "\ --extra-vars="region=$${REGION} zone=$${ZONE} "\ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a4x-highgpu-slurm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-a4x-highgpu-slurm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml index a98396d59d..6a9cb4f79b 100644 --- a/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-g4-onspot-slurm.yaml @@ -83,9 +83,21 @@ steps: 
--extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${SLURM_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${SLURM_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml index 132b86d405..5be8759563 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke-e2e.yaml @@ -67,9 +67,21 @@ steps: bash tools/add_ttl_label.sh $${SG_EXAMPLE} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + 
--extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke-e2e.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-gke.yaml b/tools/cloud-build/daily-tests/builds/ml-gke.yaml index a11d8c407d..478c6cd352 100644 --- a/tools/cloud-build/daily-tests/builds/ml-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-gke.yaml @@ -68,9 +68,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-gke.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + 
--extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml index ee702461e4..510d8a5c00 100644 --- a/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-h4d-onspot-slurm.yaml @@ -82,9 +82,21 @@ steps: --extra-vars="project=$${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ --extra-vars="enable_spot=$${ENABLE_SPOT}" \ - --extra-vars="@$${SLURM_VARS_FILE}" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@$${SLURM_VARS_FILE}" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: 
projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml index 9c564634c2..e4eabcefe8 100644 --- a/tools/cloud-build/daily-tests/builds/ml-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-slurm.yaml @@ -59,9 +59,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ml-slurm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ml-slurm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: 
projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/monitoring.yaml b/tools/cloud-build/daily-tests/builds/monitoring.yaml index 9aee9a521a..80c7d878fd 100644 --- a/tools/cloud-build/daily-tests/builds/monitoring.yaml +++ b/tools/cloud-build/daily-tests/builds/monitoring.yaml @@ -54,9 +54,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/monitoring.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/monitoring.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml 
b/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml index b4212b1003..76eed3ec31 100644 --- a/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml +++ b/tools/cloud-build/daily-tests/builds/netapp-volumes.yaml @@ -52,9 +52,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/netapp-volumes.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/netapp-volumes.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml b/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml index d384c0c327..98cfad5879 100644 --- a/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml +++ b/tools/cloud-build/daily-tests/builds/ofe-deployment.yaml @@ -45,9 +45,21 @@ steps: ansible-playbook 
tools/cloud-build/daily-tests/ansible_playbooks/ofe-deployment-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/ofe-deployment.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/ofe-deployment.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/packer.yaml b/tools/cloud-build/daily-tests/builds/packer.yaml index 9fe1075e63..fdd9cab880 100644 --- a/tools/cloud-build/daily-tests/builds/packer.yaml +++ b/tools/cloud-build/daily-tests/builds/packer.yaml @@ -56,9 +56,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/multigroup-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/packer.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + 
--extra-vars="@tools/cloud-build/daily-tests/tests/packer.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml index 6da54a8cbf..8c6e1b74c3 100644 --- a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-slurm.yaml @@ -52,9 +52,21 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/pfs-managed-lustre-slurm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/pfs-managed-lustre-slurm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + 
--extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml index 73c24e4b76..68c5f47387 100644 --- a/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml +++ b/tools/cloud-build/daily-tests/builds/pfs-managed-lustre-vm.yaml @@ -49,9 +49,21 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/pfs-managed-lustre-vm.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/pfs-managed-lustre-vm.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + 
--extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slinky.yml b/tools/cloud-build/daily-tests/builds/slinky.yml index 0e1069aaaa..9808bc7737 100644 --- a/tools/cloud-build/daily-tests/builds/slinky.yml +++ b/tools/cloud-build/daily-tests/builds/slinky.yml @@ -61,9 +61,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slinky.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slinky.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 
'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml index 7d7bd24c4d..b39f123506 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-debian.yaml @@ -53,9 +53,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-debian.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - 
versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml index e74acdf5d3..9dbaeb2672 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-rocky8.yaml @@ -55,9 +55,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --extra-vars="test_prefix=${_TEST_PREFIX} use_fixed_vpc=true" \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-rocky8.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + 
env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml index e538ff356a..a1bbc3f6b1 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ssd.yaml @@ -54,9 +54,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-ssd.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-ssd.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml index 4d3efa4cb0..406612be06 100644 --- 
a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-startup-scripts.yaml @@ -54,9 +54,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-startup-scripts.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-startup-scripts.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml index 39664df2a4..46d6038970 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-tpu.yaml @@ -69,9 +69,21 @@ steps: --user=sa_106486320838376751393 \ --extra-vars="project=${PROJECT_ID} 
build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ --extra-vars="region=$${REGION} zone=$${ZONE}" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-tpu.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml index 76594b0106..3afe46c2da 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gcp-v6-ubuntu.yaml @@ -53,9 +53,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-ubuntu.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + 
--extra-vars="@tools/cloud-build/daily-tests/tests/slurm-v6-ubuntu.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-gke.yaml b/tools/cloud-build/daily-tests/builds/slurm-gke.yaml index d5a4c9d9fb..016bbd5da8 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-gke.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-gke.yaml @@ -60,9 +60,21 @@ steps: ansible-playbook -v tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-gke.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-gke.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + 
--extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml b/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml index a7a213f382..2005ecec80 100644 --- a/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml +++ b/tools/cloud-build/daily-tests/builds/slurm-rapid-storage.yaml @@ -53,9 +53,21 @@ steps: bash tools/add_ttl_label.sh $${BLUEPRINT} ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-rapid-storage.yaml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/slurm-rapid-storage.yaml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] 
availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml b/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml index 72682b664e..26346d9cee 100644 --- a/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml +++ b/tools/cloud-build/daily-tests/builds/spack-gromacs.yaml @@ -57,9 +57,21 @@ steps: ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/slurm-integration-test.yml \ --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT} full_build_id=$BUILD_ID" \ - --extra-vars="@tools/cloud-build/daily-tests/tests/spack-gromacs.yml" - secretEnv: ['GCLUSTER_GCS_PATH'] + --extra-vars="@tools/cloud-build/daily-tests/tests/spack-gromacs.yml" \ + --extra-vars="triage_gcs_bucket_override=$$TRIAGE_GCS_BUCKET" \ + --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ + --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ + --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" + secretEnv: ['GCLUSTER_GCS_PATH', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest env: 'GCLUSTER_GCS_PATH' + - versionName: projects/${PROJECT_ID}/secrets/triage-gcs-bucket/versions/latest + env: 'TRIAGE_GCS_BUCKET' + - versionName: 
projects/${PROJECT_ID}/secrets/triage-project-number/versions/latest + env: 'TRIAGE_PROJECT_NUMBER' + - versionName: projects/${PROJECT_ID}/secrets/triage-invoker-sa/versions/latest + env: 'TRIAGE_INVOKER_SA' + - versionName: projects/${PROJECT_ID}/secrets/triage-cloud-run-url/versions/latest + env: 'TRIAGE_CLOUD_RUN_URL' diff --git a/validate.py b/validate.py new file mode 100644 index 0000000000..b4de8de35c --- /dev/null +++ b/validate.py @@ -0,0 +1,58 @@ +# Copyright 2026 "Google LLC" +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import yaml +import glob +import os +import subprocess + +def get_git_modified(): + out = subprocess.check_output(['git', 'diff', '--name-only']).decode('utf-8') + return [line.strip() for line in out.splitlines() if line.strip()] + +modified_files = get_git_modified() +yaml_files = glob.glob('tools/cloud-build/daily-tests/builds/*.yaml') +yaml_files += glob.glob('tools/cloud-build/daily-tests/builds/*.yml') + +required_secrets = ['TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] +missing_files = [] +invalid_yaml = [] +missing_secrets = [] + +for f in yaml_files: + try: + with open(f) as file: + content = file.read() + # check if yaml is valid + try: + data = yaml.safe_load(content) + except Exception as e: + invalid_yaml.append(f) + continue + + if 'ansible-playbook' in content: + # it should have been modified + if f not in modified_files: + missing_files.append(f) + + # Check for secrets + has_all_secrets = all(sec in content for sec in required_secrets) + if not has_all_secrets: + missing_secrets.append(f) + except Exception as e: + print(f"Error reading {f}: {e}") + +print(f"Invalid YAML: {invalid_yaml}") +print(f"Files with ansible-playbook but not modified: {missing_files}") +print(f"Files with ansible-playbook but missing secrets: {missing_secrets}") From 0bafeba3b0a9f2240b6beb3e5892d74c18309df7 Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Thu, 25 Jun 2026 18:30:46 +0000 Subject: [PATCH 12/13] resolved a pr review --- .../tasks/trigger_failure_triage_agent.yml | 8 ++- validate.py | 58 ------------------- 2 files changed, 6 insertions(+), 60 deletions(-) delete mode 100644 validate.py diff --git a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml index 40076755ef..90ad8e428b 100644 --- a/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml +++ 
b/tools/cloud-build/daily-tests/ansible_playbooks/tasks/trigger_failure_triage_agent.yml @@ -103,9 +103,13 @@ TRIAGE_GCS_BUCKET: "{{ triage_gcs_bucket }}" ansible.builtin.shell: | # Wait for Cloud Run to start and for the initial state file to be copied - sleep 60 + for i in {1..12}; do + if gcloud storage ls "gs://$TRIAGE_GCS_BUCKET/$TRIAGE_BUILD_ID/state.json" >/dev/null 2>&1; then + break + fi + sleep 5 + done - # Fail fast if state.json is not present after initial buffer if ! gcloud storage ls "gs://$TRIAGE_GCS_BUCKET/$TRIAGE_BUILD_ID/state.json" >/dev/null 2>&1; then echo "Agent failed to start: state.json was not created within 60 seconds." >&2 exit 1 diff --git a/validate.py b/validate.py deleted file mode 100644 index b4de8de35c..0000000000 --- a/validate.py +++ /dev/null @@ -1,58 +0,0 @@ -# Copyright 2026 "Google LLC" -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import yaml -import glob -import os -import subprocess - -def get_git_modified(): - out = subprocess.check_output(['git', 'diff', '--name-only']).decode('utf-8') - return [line.strip() for line in out.splitlines() if line.strip()] - -modified_files = get_git_modified() -yaml_files = glob.glob('tools/cloud-build/daily-tests/builds/*.yaml') -yaml_files += glob.glob('tools/cloud-build/daily-tests/builds/*.yml') - -required_secrets = ['TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] -missing_files = [] -invalid_yaml = [] -missing_secrets = [] - -for f in yaml_files: - try: - with open(f) as file: - content = file.read() - # check if yaml is valid - try: - data = yaml.safe_load(content) - except Exception as e: - invalid_yaml.append(f) - continue - - if 'ansible-playbook' in content: - # it should have been modified - if f not in modified_files: - missing_files.append(f) - - # Check for secrets - has_all_secrets = all(sec in content for sec in required_secrets) - if not has_all_secrets: - missing_secrets.append(f) - except Exception as e: - print(f"Error reading {f}: {e}") - -print(f"Invalid YAML: {invalid_yaml}") -print(f"Files with ansible-playbook but not modified: {missing_files}") -print(f"Files with ansible-playbook but missing secrets: {missing_secrets}") From 6181922330406c2d96263f7d42bdf9e631b90cdc Mon Sep 17 00:00:00 2001 From: Hritik Agarwal Date: Thu, 25 Jun 2026 18:49:33 +0000 Subject: [PATCH 13/13] Updating secret vars for some build files --- .../cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml | 2 +- tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml | 2 +- .../daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml | 2 +- tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml | 2 +- .../builds/ml-a3-ultragpu-custom-blueprint-test.yaml | 2 +- 
.../daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml | 2 +- .../daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml | 2 +- .../daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml | 2 +- .../builds/ml-a4x-highgpu-custom-blueprint-test.yaml | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml index d02f767821..e23672630d 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a3-ultragpu-onspot.yaml @@ -108,7 +108,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml index 911356d6ca..a3e48ebc60 100644 --- a/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-a4-onspot.yaml @@ -109,7 +109,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: 
projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml index faa03dee29..15a34e6cab 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-7x.yaml @@ -93,7 +93,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml index 320da33bc8..a93b713b52 100644 --- a/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml +++ b/tools/cloud-build/daily-tests/builds/gke-tpu-v6e.yaml @@ -108,7 +108,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml index 903bfd60ce..9090445faa 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml +++ 
b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-onspot-slurm.yaml @@ -92,7 +92,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml index 440abd45e9..981bd98675 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-highgpu-slurm.yaml @@ -68,7 +68,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'TCPX_KERNEL_LOGIN', 'TCPX_KERNEL_PASSWORD', 'KEYSERVER_UBUNTU_KEY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml index 2e5b0423f7..6c8b740950 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml +++ 
b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-custom-blueprint-test.yaml @@ -88,7 +88,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml index 33bcbf7405..31efff9fce 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a3-ultragpu-onspot-slurm.yaml @@ -97,7 +97,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml index 168d13f7d8..bb40f41fa9 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-custom-blueprint-test.yaml @@ -87,7 +87,7 @@ steps: 
--extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml index abc860e20b..82b48a44d0 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4-highgpu-onspot-slurm.yaml @@ -95,7 +95,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ --extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CHS_REPO', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest diff --git a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml index ba1c1fec28..5d729e41d2 100644 --- a/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml +++ b/tools/cloud-build/daily-tests/builds/ml-a4x-highgpu-custom-blueprint-test.yaml @@ -70,7 +70,7 @@ steps: --extra-vars="triage_project_number_override=$$TRIAGE_PROJECT_NUMBER" \ 
--extra-vars="triage_invoker_sa_override=$$TRIAGE_INVOKER_SA" \ --extra-vars="triage_cloud_run_url_override=$$TRIAGE_CLOUD_RUN_URL" - secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY'] + secretEnv: ['GCLUSTER_GCS_PATH', 'CUSTOM_IMAGE_PROJECT', 'CUSTOM_IMAGE_FAMILY', 'TRIAGE_GCS_BUCKET', 'TRIAGE_PROJECT_NUMBER', 'TRIAGE_INVOKER_SA', 'TRIAGE_CLOUD_RUN_URL'] availableSecrets: secretManager: - versionName: projects/${PROJECT_ID}/secrets/gcluster-develop-release-bucket/versions/latest