From ef9f4dd3d01d89b40c9dd14bcd94e90a129c62d2 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 4 Jun 2026 03:12:14 +0000 Subject: [PATCH 01/10] fix: use managed identity for ACR push in CI pipeline Local auth (username/password) has been disabled on the ACR registry. Switch CI to use managed identity with az acr login instead. Co-Authored-By: Claude Opus 4 --- .github/workflows/build-deploy-changes.yaml | 8 +++++++- build/pai_build.py | 2 ++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 7f847b59..8692bbaa 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -130,6 +130,11 @@ jobs: -c $GITHUB_WORKSPACE/config/cluster-configuration \ -s $changed_services + - name: Login to Azure with Managed Identity + if: steps.check.outputs.has_changed == 'true' + run: | + az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }} + - name: Push Images of Changed Services to ACR if: steps.check.outputs.has_changed == 'true' run: | @@ -137,7 +142,8 @@ jobs: echo "Pushing: $changed_services" $GITHUB_WORKSPACE/build/pai_build.py push \ -c $GITHUB_WORKSPACE/config/cluster-configuration \ - -s $changed_services + -s $changed_services \ + --managed-identity-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }} - name: Push Images of Changed Service to GHCR if: steps.check.outputs.has_changed == 'true' diff --git a/build/pai_build.py b/build/pai_build.py index 6970a370..3b5060ba 100755 --- a/build/pai_build.py +++ b/build/pai_build.py @@ -177,6 +177,8 @@ def main(): config_model['dockerRegistryInfo']['dockerPassword'] = args.docker_password if hasattr(args, 'managed_identity_id') and args.managed_identity_id is not None: config_model['dockerRegistryInfo']['managedIdentityId'] = args.managed_identity_id + config_model['dockerRegistryInfo']['dockerUserName'] = '' + config_model['dockerRegistryInfo']['dockerPassword'] = '' if hasattr(args, 'docker_tag') and args.docker_tag is not None: config_model['dockerRegistryInfo']['dockerTag'] = args.docker_tag From fd702f81559d9e5dd83e7f65c7d854ced223b44c Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 4 Jun 2026 03:22:58 +0000 Subject: [PATCH 02/10] test: trigger CI to verify managed identity ACR push This comment will be reverted after CI verification. Co-Authored-By: Claude Opus 4 --- src/watchdog/build/watchdog.common.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/src/watchdog/build/watchdog.common.dockerfile b/src/watchdog/build/watchdog.common.dockerfile index 949f0d34..e3a98d1d 100644 --- a/src/watchdog/build/watchdog.common.dockerfile +++ b/src/watchdog/build/watchdog.common.dockerfile @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +# ci-test: verify managed identity ACR push FROM golang:1.25.10-alpine3.23 as builder From 4d7998f84b718a1a0640a6ebd595221ecb77bbd8 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 4 Jun 2026 04:58:24 +0000 Subject: [PATCH 03/10] debug: print docker registry config in CI (temporary) To inspect current config structure before updating managed identity. Will be reverted after verification. Co-Authored-By: Claude Opus 4 --- .github/workflows/build-deploy-changes.yaml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 8692bbaa..761c1551 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -104,6 +104,16 @@ jobs: mv $GITHUB_WORKSPACE/config/auth-configuration /tmp/ ls -l /tmp/auth-configuration + - name: Debug - Print docker registry config (temporary) + if: steps.check.outputs.has_changed == 'true' + run: | + echo "=== Docker registry config ===" + find $GITHUB_WORKSPACE/config/cluster-configuration -type f | xargs grep -l "docker-registry" || true + for f in $(find $GITHUB_WORKSPACE/config/cluster-configuration -type f -name "*.yaml" -o -name "*.yml"); do + echo "--- $f ---" + cat "$f" | sed 's/docker-password:.*/docker-password: ***REDACTED***/' + done + - name: Login to GHCR if: steps.check.outputs.has_changed == 'true' run: | From dd9307549cf148a4dd5343817df8c8e132dcbdd5 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 4 Jun 2026 06:16:29 +0000 Subject: [PATCH 04/10] fix: clean up CI workflow after migrating ACR auth to managed identity in config Removed debug steps, CLI --managed-identity-id override, and test trigger. ACR auth now uses managed-identity-id from config secret. Co-Authored-By: Claude Opus 4 --- .github/workflows/build-deploy-changes.yaml | 13 +------------ build/pai_build.py | 4 +--- src/watchdog/build/watchdog.common.dockerfile | 1 - 3 files changed, 2 insertions(+), 16 deletions(-) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 761c1551..8d67f3c5 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -104,16 +104,6 @@ jobs: mv $GITHUB_WORKSPACE/config/auth-configuration /tmp/ ls -l /tmp/auth-configuration - - name: Debug - Print docker registry config (temporary) - if: steps.check.outputs.has_changed == 'true' - run: | - echo "=== Docker registry config ===" - find $GITHUB_WORKSPACE/config/cluster-configuration -type f | xargs grep -l "docker-registry" || true - for f in $(find $GITHUB_WORKSPACE/config/cluster-configuration -type f -name "*.yaml" -o -name "*.yml"); do - echo "--- $f ---" - cat "$f" | sed 's/docker-password:.*/docker-password: ***REDACTED***/' - done - - name: Login to GHCR if: steps.check.outputs.has_changed == 'true' run: | @@ -152,8 +142,7 @@ jobs: echo "Pushing: $changed_services" $GITHUB_WORKSPACE/build/pai_build.py push \ -c $GITHUB_WORKSPACE/config/cluster-configuration \ - -s $changed_services \ - --managed-identity-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }} + -s $changed_services - name: Push Images of Changed Service to GHCR if: steps.check.outputs.has_changed == 'true' diff --git a/build/pai_build.py b/build/pai_build.py index 3b5060ba..2fffec25 100755 --- a/build/pai_build.py +++ b/build/pai_build.py @@ -177,11 +177,9 @@ def main(): config_model['dockerRegistryInfo']['dockerPassword'] = args.docker_password if hasattr(args, 'managed_identity_id') and args.managed_identity_id is not None: config_model['dockerRegistryInfo']['managedIdentityId'] = args.managed_identity_id - config_model['dockerRegistryInfo']['dockerUserName'] = '' - config_model['dockerRegistryInfo']['dockerPassword'] = '' if hasattr(args, 'docker_tag') and args.docker_tag is not None: config_model['dockerRegistryInfo']['dockerTag'] = args.docker_tag - + args.func(args, config_model) endtime = datetime.datetime.now() diff --git a/src/watchdog/build/watchdog.common.dockerfile b/src/watchdog/build/watchdog.common.dockerfile index e3a98d1d..949f0d34 100644 --- a/src/watchdog/build/watchdog.common.dockerfile +++ b/src/watchdog/build/watchdog.common.dockerfile @@ -1,6 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# ci-test: verify managed identity ACR push FROM golang:1.25.10-alpine3.23 as builder From eec93ab5a71d875498d3e37cde80b2983f1113cf Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 4 Jun 2026 06:43:15 +0000 Subject: [PATCH 05/10] fix: pin CI container to ubuntu:24.04 to avoid mirror sync issues Co-Authored-By: Claude Opus 4 --- .github/workflows/build-deploy-changes.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 8d67f3c5..75e41234 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -18,7 +18,7 @@ jobs: timeout-minutes: 120 environment: auto-test container: - image: ubuntu:latest + image: ubuntu:24.04 volumes: - /var/run/docker.sock:/var/run/docker.sock env: From f642ce04308378ad7ddcb1f78b372bc148246749 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 4 Jun 2026 06:48:53 +0000 Subject: [PATCH 06/10] test: trigger watchdog build to verify managed identity ACR push Will be reverted after verification. Co-Authored-By: Claude Opus 4 --- src/watchdog/build/watchdog.common.dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/src/watchdog/build/watchdog.common.dockerfile b/src/watchdog/build/watchdog.common.dockerfile index 949f0d34..e3a98d1d 100644 --- a/src/watchdog/build/watchdog.common.dockerfile +++ b/src/watchdog/build/watchdog.common.dockerfile @@ -1,5 +1,6 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. +# ci-test: verify managed identity ACR push FROM golang:1.25.10-alpine3.23 as builder From b01b4c9dba26cf7f461a53b12f37009b27d71141 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Thu, 4 Jun 2026 07:47:27 +0000 Subject: [PATCH 07/10] fix: use config-based managed identity for ACR push, restore deploy identity - Remove explicit az login step before ACR push (docker_login handles it via managed-identity-id from config) - Restore pai_build.py to original state - Add shell: bash for GHCR push step to fix [[ syntax - Keep watchdog trigger for CI verification Co-Authored-By: Claude Opus 4 --- .github/workflows/build-deploy-changes.yaml | 6 +----- build/pai_build.py | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 75e41234..87216c74 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -130,11 +130,6 @@ jobs: -c $GITHUB_WORKSPACE/config/cluster-configuration \ -s $changed_services - - name: Login to Azure with Managed Identity - if: steps.check.outputs.has_changed == 'true' - run: | - az login --identity --client-id ${{ secrets.AZURE_MANAGED_IDENTITY_CLIENT_ID }} - - name: Push Images of Changed Services to ACR if: steps.check.outputs.has_changed == 'true' run: | @@ -146,6 +141,7 @@ jobs: - name: Push Images of Changed Service to GHCR if: steps.check.outputs.has_changed == 'true' + shell: bash run: | changed_services="${{ steps.changes.outputs.folders }}" echo "Pushing: $changed_services" diff --git a/build/pai_build.py b/build/pai_build.py index 2fffec25..6970a370 100755 --- a/build/pai_build.py +++ b/build/pai_build.py @@ -179,7 +179,7 @@ def main(): config_model['dockerRegistryInfo']['managedIdentityId'] = args.managed_identity_id if hasattr(args, 'docker_tag') and args.docker_tag is not None: config_model['dockerRegistryInfo']['dockerTag'] = args.docker_tag - + args.func(args, config_model) endtime = datetime.datetime.now() From 2689867f335c9b28b19512187fecc168559b07aa Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Fri, 5 Jun 2026 00:49:04 +0000 Subject: [PATCH 08/10] fix: resolve CI warnings - upgrade checkout, fix shell and python syntax - Upgrade actions/checkout@v4 to @v5 for Node.js 24 compatibility - Add shell: bash to steps using [[ ]] syntax - Fix "is" string comparison to == in machine.py - Fix invalid escape sequence in pai_version.py Co-Authored-By: Claude Opus 4 --- .github/workflows/build-deploy-changes.yaml | 3 ++- deployment/clusterObjectModel/mainParser/machine.py | 2 +- deployment/utility/pai_version.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 87216c74..594b50fb 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -30,7 +30,7 @@ jobs: DEBIAN_FRONTEND=noninteractive apt install -y git - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v5 with: fetch-depth: 0 submodules: false @@ -111,6 +111,7 @@ jobs: - name: Build Images of Changed Services if: steps.check.outputs.has_changed == 'true' + shell: bash run: | changed_services="${{ steps.changes.outputs.folders }}" echo "Building: $changed_services" diff --git a/deployment/clusterObjectModel/mainParser/machine.py b/deployment/clusterObjectModel/mainParser/machine.py index a7c7ad54..6ed7f043 100644 --- a/deployment/clusterObjectModel/mainParser/machine.py +++ b/deployment/clusterObjectModel/mainParser/machine.py @@ -82,7 +82,7 @@ def validation_host_properties(self): if "k8s-role" not in host: return False, "k8s-role is miss in the host [{0}]".format(str(host)) - if "k8s-role" is "master": + if "k8s-role" == "master": if "etcdid" not in host: return False, "etcdid is miss in one of the host with the [k8s-role: master]." if host["etcdid"] in etcd_id_visited: diff --git a/deployment/utility/pai_version.py b/deployment/utility/pai_version.py index 266a73fe..00e8e870 100644 --- a/deployment/utility/pai_version.py +++ b/deployment/utility/pai_version.py @@ -22,7 +22,7 @@ def cluster_version(): try: # redicret stderr to devnull DEVNULL = open(os.devnull, 'w') - version = subprocess.check_output("kubectl get configmap pai-version -o jsonpath='{.data.PAI\.VERSION}'", shell=True, stderr=DEVNULL) + version = subprocess.check_output(r"kubectl get configmap pai-version -o jsonpath='{.data.PAI\.VERSION}'", shell=True, stderr=DEVNULL) logger.info("Cluster version: %s", version) except subprocess.CalledProcessError: logger.warning("Can't fetch cluster version!") From fab87713fb414ff0eaae4326691ff72ec2064ce7 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Fri, 5 Jun 2026 01:05:09 +0000 Subject: [PATCH 09/10] fix: remove watchdog CI test trigger Co-Authored-By: Claude Opus 4 --- src/watchdog/build/watchdog.common.dockerfile | 1 - 1 file changed, 1 deletion(-) diff --git a/src/watchdog/build/watchdog.common.dockerfile b/src/watchdog/build/watchdog.common.dockerfile index e3a98d1d..949f0d34 100644 --- a/src/watchdog/build/watchdog.common.dockerfile +++ b/src/watchdog/build/watchdog.common.dockerfile @@ -1,6 +1,5 @@ # Copyright (c) Microsoft Corporation. # Licensed under the MIT License. -# ci-test: verify managed identity ACR push FROM golang:1.25.10-alpine3.23 as builder From 48540b37c052023f79193f1a9606e521ddcd7990 Mon Sep 17 00:00:00 2001 From: Rui Gao Date: Mon, 8 Jun 2026 02:00:23 +0000 Subject: [PATCH 10/10] add no cache switch for CICD --- .github/workflows/build-all.yaml | 1 + .github/workflows/build-deploy-changes.yaml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.github/workflows/build-all.yaml b/.github/workflows/build-all.yaml index 89ecdd43..43ae8330 100644 --- a/.github/workflows/build-all.yaml +++ b/.github/workflows/build-all.yaml @@ -97,6 +97,7 @@ jobs: # Build specific images in alert-manager echo "Building specific alert-manager images" $GITHUB_WORKSPACE/build/pai_build.py build \ + -n \ -c $GITHUB_WORKSPACE/config/cluster-configuration \ -s alert-manager \ -i abnormal-detector,alert-handler,alert-parser,cert-expiration-checker,cluster-utilization,job-data-recorder,job-status-change-notification,node-failure-detection,node-issue-classifier,nvidia-gpu-low-perf-fixer,redis-monitoring diff --git a/.github/workflows/build-deploy-changes.yaml b/.github/workflows/build-deploy-changes.yaml index 594b50fb..e61a0b5a 100644 --- a/.github/workflows/build-deploy-changes.yaml +++ b/.github/workflows/build-deploy-changes.yaml @@ -121,6 +121,7 @@ jobs: # build specific images in alert-manager echo "Building specific alert-manager images" $GITHUB_WORKSPACE/build/pai_build.py build \ + -n \ -c $GITHUB_WORKSPACE/config/cluster-configuration \ -s alert-manager -i abnormal-detector,alert-handler,alert-parser,cert-expiration-checker,cluster-utilization,job-data-recorder,job-status-change-notification,node-failure-detection,node-issue-classifier,nvidia-gpu-low-perf-fixer,redis-monitoring @@ -128,6 +129,7 @@ jobs: echo "Changed services after removing alert-manager: $changed_services" $GITHUB_WORKSPACE/build/pai_build.py build \ + -n \ -c $GITHUB_WORKSPACE/config/cluster-configuration \ -s $changed_services