From dd5b2d70e5ace558ca2a23efc2a165d012f554bb Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Mon, 22 Jun 2026 16:29:46 -0400 Subject: [PATCH 01/12] Make Orca integration deploy tooling deployment-neutral Rename the Orca integration-cluster deploy tooling so it no longer bakes in the 'stable' channel, since it is now shared by both unbounded-stable and the new unbounded-nightly cluster: hack/orca/deploy-stable.sh -> hack/orca/deploy-integration.sh hack/orca/smoke-stable.sh -> hack/orca/smoke-integration.sh hack/orca/stable/ -> hack/orca/integration/ Neutralize the Garage template labels (part-of: orca-stable -> orca) and admin-token default, and update all references (release-upgrade.yaml, create-credentials-secret.sh, and the manifests render test). The Garage labels are not used as selectors and the admin token is internal to the template, so existing clusters are unaffected on re-apply. --- .github/workflows/release-upgrade.yaml | 4 ++-- hack/orca/create-credentials-secret.sh | 4 ++-- ...deploy-stable.sh => deploy-integration.sh} | 23 +++++++++++-------- .../{stable => integration}/garage.yaml.tmpl | 23 ++++++++++--------- .../{smoke-stable.sh => smoke-integration.sh} | 7 +++--- internal/orca/manifests/manifests_test.go | 16 ++++++------- 6 files changed, 41 insertions(+), 36 deletions(-) rename hack/orca/{deploy-stable.sh => deploy-integration.sh} (90%) rename hack/orca/{stable => integration}/garage.yaml.tmpl (88%) rename hack/orca/{smoke-stable.sh => smoke-integration.sh} (97%) diff --git a/.github/workflows/release-upgrade.yaml b/.github/workflows/release-upgrade.yaml index 06a28b77..5ef3867b 100644 --- a/.github/workflows/release-upgrade.yaml +++ b/.github/workflows/release-upgrade.yaml @@ -460,7 +460,7 @@ jobs: # part of the shippable release tarball. 
release.yaml DOES build and # push the Orca image (/orca:), but its manifests plus # the test-only Garage cachestore are deployed here from hack/orca via - # hack/orca/deploy-stable.sh. + # hack/orca/deploy-integration.sh. # # Origin is real Azure Blob storage; cachestore is an in-cluster # single-node Garage backed by a PVC. Confidential values (Azure @@ -547,7 +547,7 @@ jobs: set -euo pipefail IMAGE="${REGISTRY,,}/orca:${TAG}" echo "Deploying Orca image: ${IMAGE}" - ./hack/orca/deploy-stable.sh \ + ./hack/orca/deploy-integration.sh \ --image "${IMAGE}" \ --azure-account "${ORCA_AZURE_ACCOUNT}" \ --azure-container "${ORCA_AZURE_CONTAINER}" \ diff --git a/hack/orca/create-credentials-secret.sh b/hack/orca/create-credentials-secret.sh index 02f51cf6..3d12f39a 100755 --- a/hack/orca/create-credentials-secret.sh +++ b/hack/orca/create-credentials-secret.sh @@ -10,7 +10,7 @@ # - ORCA_CACHESTORE_S3_ACCESS_KEY Garage S3 access key id (generated here) # - ORCA_CACHESTORE_S3_SECRET_KEY Garage S3 secret key (generated here) # -# The Garage S3 keys are the single source of truth: hack/orca/deploy-stable.sh +# The Garage S3 keys are the single source of truth: hack/orca/deploy-integration.sh # imports them into Garage (via bootstrap-garage.sh) and injects them into # Orca via envFrom. This script generates fresh ones in the format Garage # requires (access id = "GK" + 12 hex bytes; secret = 32 hex bytes) unless @@ -100,5 +100,5 @@ cat >&2 < Date: Mon, 22 Jun 2026 16:29:55 -0400 Subject: [PATCH 02/12] Add unbounded-nightly build and deploy workflow Introduce an unbounded-nightly integration cluster that mirrors unbounded-stable but deploys a from-source snapshot of main HEAD every morning at 06:00 UTC instead of a published release. 
The new .github/workflows/nightly.yaml: - resolves the snapshot commit and derives a nightly- tag - builds and pushes amd64 images (net-controller, net-node, machina, orca) for that tag, without the release path's cosign/SBOM/Trivy gating (the nightly cluster is a throwaway soak target) - renders manifests against the nightly tags and deploys them to the unbounded-nightly GitHub Environment (init on first bootstrap, upgrade-apply thereafter), reusing the #235 machina-config merge - deploys Orca via the shared deploy-integration.sh - runs the shared hack/release/smoke suite The target cluster is configured via the unbounded-nightly Environment using the existing hack/scripts/setup-deploy-environment.sh (no change needed there); the workflow header documents the one-time setup. --- .github/workflows/nightly.yaml | 666 +++++++++++++++++++++++++++++++++ 1 file changed, 666 insertions(+) create mode 100644 .github/workflows/nightly.yaml diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml new file mode 100644 index 00000000..deaf8d55 --- /dev/null +++ b/.github/workflows/nightly.yaml @@ -0,0 +1,666 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +name: nightly + +# --------------------------------------------------------------------------- +# Builds a snapshot of `main` from source every morning and deploys it to the +# integration cluster (unbounded-nightly) so the tip of the tree gets the same +# soak treatment that releases get on unbounded-stable. +# +# Relationship to release-upgrade.yaml +# ------------------------------------ +# This is the nightly sibling of .github/workflows/release-upgrade.yaml. That +# workflow deploys a PUBLISHED, signed release (downloaded as a manifests +# tarball from a GitHub Release) to unbounded-stable. 
This one has no release +# and no tag: it builds the current main HEAD from source, pushes images +# tagged `nightly-`, renders manifests against those tags, and +# deploys them to unbounded-nightly. There is deliberately no cosign signing, +# SBOM, or Trivy gating here: the nightly cluster is a throwaway soak target, +# not a customer artifact. The full supply chain stays on the release path. +# +# Schedule +# -------- +# Runs at 06:00 UTC daily. GitHub Actions cron is UTC-only and does not follow +# US daylight saving, so this lands at 01:00 ET (EST) / 02:00 ET (EDT). We run +# early on purpose: a from-source build of every component plus a deploy and +# smoke pass needs to finish well before the US working day, so 06:00 UTC is +# chosen over a literal 4am-ET cron to leave that headroom. +# +# Target cluster +# -------------- +# The target cluster is configured via the `unbounded-nightly` GitHub +# Environment, which provides: +# - secret KUBECONFIG (raw kubeconfig file contents) +# - vars SITE_NAME, CLUSTER_NODE_CIDR, CLUSTER_POD_CIDR, +# SITE_NODE_CIDR, SITE_POD_CIDR, MANAGE_CNI_PLUGIN, +# ORCA_AZURE_ACCOUNT, ORCA_AZURE_CONTAINER, ORCA_AZURE_ENDPOINT +# - cluster Secret unbounded-kube/orca-credentials for confidential Orca +# values (Azure account key and Garage S3 credentials) +# +# First-time setup (run once, out of band): +# 1. Provision the AKS cluster and configure the Environment: +# hack/scripts/setup-deploy-environment.sh \ +# --env-name unbounded-nightly --site-name nightly \ +# --kubeconfig --cluster-node-cidr ... --cluster-pod-cidr ... \ +# --site-node-cidr ... --site-pod-cidr ... \ +# --orca-azure-account ... --orca-azure-container ... +# Use CIDRs distinct from unbounded-stable. +# 2. Pre-create the orca-credentials Secret: +# hack/orca/create-credentials-secret.sh ... (against the nightly cluster) +# 3. Label at least one gateway node: +# kubectl label node unbounded-cloud.io/unbounded-net-gateway=true --overwrite +# 4. 
Bootstrap once (or let the first scheduled run auto-init, since the +# Site CRD is absent on a fresh cluster): +# gh workflow run nightly.yaml -f force_init=true +# +# Customization points (search for "CUSTOMIZE:" in this file) +# ------------------------------------------------------------ +# - Add a smoke test -> drop a script in hack/release/smoke/ +# (shared with release-upgrade.yaml) +# - Change the schedule -> edit the cron below +# - Change target cluster -> change `environment:` on the relevant jobs +# --------------------------------------------------------------------------- + +on: + schedule: + # 06:00 UTC daily. See the "Schedule" note in the header above. + - cron: "0 6 * * *" + + # Manual trigger for re-deploys, first bootstrap, and testing a specific ref. + workflow_dispatch: + inputs: + ref: + description: "Git ref to snapshot (default: the default branch)" + required: false + type: string + force_init: + description: "Run 'site init' instead of upgrade-apply (use for first-ever bootstrap)" + type: boolean + default: false + +permissions: + contents: read + packages: write + +# Only the freshest snapshot matters. If a new nightly starts while an old one +# is still running, cancel the old one. +concurrency: + group: deploy-nightly + cancel-in-progress: true + +env: + REGISTRY: ghcr.io/${{ github.repository_owner }} + CNI_PLUGINS_VERSION: v1.9.1 + +jobs: + # --------------------------------------------------------------------------- + # Resolve the commit to snapshot and derive the nightly image tag. + # + # The downstream build and deploy jobs all pin to `resolve.outputs.sha` so a + # branch advance mid-run cannot split the snapshot across two commits. 
+ # --------------------------------------------------------------------------- + resolve: + if: github.repository == 'Azure/unbounded' + runs-on: ubuntu-latest + outputs: + sha: ${{ steps.resolve.outputs.sha }} + tag: ${{ steps.resolve.outputs.tag }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: ${{ inputs.ref || github.event.repository.default_branch }} + fetch-depth: 0 + + - name: Resolve snapshot commit and tag + id: resolve + run: | + set -euo pipefail + SHA="$(git rev-parse HEAD)" + SHORT="$(git rev-parse --short HEAD)" + TAG="nightly-${SHORT}" + { + echo "sha=${SHA}" + echo "tag=${TAG}" + } >> "$GITHUB_OUTPUT" + echo "Snapshot commit: ${SHA}" + echo "Nightly tag: ${TAG}" + + # --------------------------------------------------------------------------- + # Build and push the net controller/node images for the snapshot (amd64). + # + # Mirrors release.yaml's net-images job minus the multi-arch matrix, Trivy + # gating, cosign signing, and SBOM attestation. The frontend is built inline + # (release.yaml shares it across a per-arch matrix via an artifact; we have a + # single amd64 build, so inline is simpler) and staged into the embed dir. 
+ # --------------------------------------------------------------------------- + net-images: + needs: resolve + runs-on: ubuntu-latest + env: + TAG: ${{ needs.resolve.outputs.tag }} + SHA: ${{ needs.resolve.outputs.sha }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Normalize container registry to lowercase + run: echo "REGISTRY=${REGISTRY,,}" >> "$GITHUB_ENV" + + - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.0.0 + with: + node-version: "24" + cache: npm + cache-dependency-path: frontend/package-lock.json + + - name: Build frontend and stage into the net embed dir + run: | + set -euo pipefail + npm ci --prefer-offline --no-audit + npm run build + # internal/net/html/pages.go embeds internal/net/html/dist via + # go:embed; the Containerfile copies the whole context, so the Go + # build picks up exactly this snapshot's UI. + rm -rf ../internal/net/html/dist + mkdir -p ../internal/net/html/dist + cp -R dist/. ../internal/net/html/dist/ + working-directory: frontend + + - name: Cache CNI plugins + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 + with: + path: resources + key: cni-plugins-amd64-${{ env.CNI_PLUGINS_VERSION }} + + - name: Download CNI plugins + run: | + set -euo pipefail + mkdir -p resources + file="resources/cni-plugins-linux-amd64-${CNI_PLUGINS_VERSION}.tgz" + if [ ! 
-s "$file" ]; then + curl -fsSL \ + "https://github.com/containernetworking/plugins/releases/download/${CNI_PLUGINS_VERSION}/cni-plugins-linux-amd64-${CNI_PLUGINS_VERSION}.tgz" \ + -o "$file" + fi + + - uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0 + + - name: Log in to ghcr.io + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Compute Go version for Docker + id: goversion + run: echo "go_version=$(awk '/^go /{split($2,v,"."); print v[1]"."v[2]}' go.mod)" >> "$GITHUB_OUTPUT" + + - name: Build and push controller image + uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0 + with: + context: . + file: images/net/Containerfile + target: controller + platforms: linux/amd64 + push: true + provenance: false + build-args: | + GO_VERSION=${{ steps.goversion.outputs.go_version }} + VERSION=${{ env.TAG }} + GIT_COMMIT=${{ env.SHA }} + CNI_PLUGINS_VERSION=${{ env.CNI_PLUGINS_VERSION }} + tags: ${{ env.REGISTRY }}/unbounded-net-controller:${{ env.TAG }} + cache-from: type=gha,scope=nightly-net-controller + cache-to: type=gha,scope=nightly-net-controller,mode=max + + - name: Build and push node image + uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0 + with: + context: . 
+ file: images/net/Containerfile + target: node + platforms: linux/amd64 + push: true + provenance: false + build-args: | + GO_VERSION=${{ steps.goversion.outputs.go_version }} + VERSION=${{ env.TAG }} + GIT_COMMIT=${{ env.SHA }} + CNI_PLUGINS_VERSION=${{ env.CNI_PLUGINS_VERSION }} + tags: ${{ env.REGISTRY }}/unbounded-net-node:${{ env.TAG }} + cache-from: type=gha,scope=nightly-net-node + cache-to: type=gha,scope=nightly-net-node,mode=max + + # --------------------------------------------------------------------------- + # Build and push the remaining images the deploy needs (amd64). + # - machina : applied as part of the core deploy + # - orca : deployed separately by the deploy-orca job + # machine-ops-controller is intentionally omitted: like release-upgrade.yaml, + # the deploy applies only net/ and machina/, so machine-ops is never rolled + # out here and would be dead weight to build. + # --------------------------------------------------------------------------- + component-images: + needs: resolve + runs-on: ubuntu-latest + strategy: + matrix: + component: + - name: machina + file: images/machina/Containerfile + - name: orca + file: images/orca/Containerfile + env: + TAG: ${{ needs.resolve.outputs.tag }} + SHA: ${{ needs.resolve.outputs.sha }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Normalize container registry to lowercase + run: echo "REGISTRY=${REGISTRY,,}" >> "$GITHUB_ENV" + + - uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0 + + - name: Log in to ghcr.io + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push ${{ matrix.component.name }} image + uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0 + with: + context: . 
+ file: ${{ matrix.component.file }} + platforms: linux/amd64 + push: true + provenance: false + build-args: | + VERSION=${{ env.TAG }} + GIT_COMMIT=${{ env.SHA }} + tags: ${{ env.REGISTRY }}/${{ matrix.component.name }}:${{ env.TAG }} + cache-from: type=gha,scope=nightly-${{ matrix.component.name }} + cache-to: type=gha,scope=nightly-${{ matrix.component.name }},mode=max + + # --------------------------------------------------------------------------- + # Deploy the snapshot to unbounded-nightly. + # + # MODE=init -> 'kubectl unbounded site init' using manifests embedded + # into a plugin built from this snapshot (stamped with the + # nightly image tags). Used for the first bootstrap. + # MODE=upgrade -> render the machina + net manifests for the nightly tags + # and server-side apply them. + # + # Unlike release-upgrade.yaml the manifests are rendered here from the + # snapshot checkout (no tarball download, no cosign verify): there is no + # release artifact for a nightly. + # --------------------------------------------------------------------------- + deploy: + needs: [resolve, net-images, component-images] + if: github.repository == 'Azure/unbounded' + runs-on: ubuntu-latest + environment: unbounded-nightly + env: + TAG: ${{ needs.resolve.outputs.tag }} + SITE_NAME: ${{ vars.SITE_NAME }} + CLUSTER_NODE_CIDR: ${{ vars.CLUSTER_NODE_CIDR }} + CLUSTER_POD_CIDR: ${{ vars.CLUSTER_POD_CIDR }} + SITE_NODE_CIDR: ${{ vars.SITE_NODE_CIDR }} + SITE_POD_CIDR: ${{ vars.SITE_POD_CIDR }} + MANAGE_CNI_PLUGIN: ${{ vars.MANAGE_CNI_PLUGIN }} + outputs: + mode: ${{ steps.mode.outputs.mode }} + steps: + - name: Validate environment configuration + run: | + set -euo pipefail + missing=() + for v in SITE_NAME CLUSTER_NODE_CIDR CLUSTER_POD_CIDR SITE_NODE_CIDR SITE_POD_CIDR MANAGE_CNI_PLUGIN; do + if [[ -z "${!v}" ]]; then + missing+=("$v") + fi + done + if (( ${#missing[@]} > 0 )); then + echo "::error::Missing required Environment variables: ${missing[*]}" + echo "::error::Configure 
with hack/scripts/setup-deploy-environment.sh" + exit 1 + fi + + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install kubectl + uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0 + + - name: Write kubeconfig + env: + KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }} + run: | + set -euo pipefail + if [[ -z "${KUBECONFIG_CONTENT}" ]]; then + echo "::error::Environment secret KUBECONFIG is empty" + exit 1 + fi + umask 077 + printf '%s' "$KUBECONFIG_CONTENT" > "$RUNNER_TEMP/kubeconfig" + echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV" + + - name: Verify cluster connectivity + run: | + set -euo pipefail + kubectl version + kubectl cluster-info + + - name: Determine deploy mode + id: mode + run: | + set -euo pipefail + MODE=upgrade + if [[ "${{ inputs.force_init }}" == "true" ]]; then + MODE=init + echo "force_init=true; selecting init mode" + elif ! kubectl get crd sites.net.unbounded-cloud.io >/dev/null 2>&1; then + MODE=init + echo "Site CRD not present; selecting init mode" + elif ! kubectl get site.net.unbounded-cloud.io "${SITE_NAME}" >/dev/null 2>&1; then + MODE=init + echo "Site '${SITE_NAME}' not present; selecting init mode" + else + echo "Site '${SITE_NAME}' exists; selecting upgrade mode" + fi + echo "MODE=${MODE}" >> "$GITHUB_ENV" + echo "mode=${MODE}" >> "$GITHUB_OUTPUT" + + - name: Build kubectl-unbounded plugin (init) + if: env.MODE == 'init' + # Building the plugin re-renders the embedded net + machina manifests + # stamped with this snapshot's nightly tags, so 'site init' installs + # manifests that reference the images built above. 
+ run: | + set -euo pipefail + make kubectl-unbounded-build VERSION="${TAG}" CONTAINER_REGISTRY="${REGISTRY,,}" + + - name: Run site init (first install) + if: env.MODE == 'init' + run: | + set -euo pipefail + ./bin/kubectl-unbounded site init \ + --name "${SITE_NAME}" \ + --cluster-node-cidr "${CLUSTER_NODE_CIDR}" \ + --cluster-pod-cidr "${CLUSTER_POD_CIDR}" \ + --node-cidr "${SITE_NODE_CIDR}" \ + --pod-cidr "${SITE_POD_CIDR}" \ + --manage-cni-plugin="${MANAGE_CNI_PLUGIN}" + + - name: Render manifests (upgrade) + if: env.MODE == 'upgrade' + run: | + set -euo pipefail + make machina-manifests net-manifests \ + VERSION="${TAG}" \ + CONTAINER_REGISTRY="${REGISTRY,,}" + + - name: Apply CRDs (upgrade) + if: env.MODE == 'upgrade' + run: | + set -euo pipefail + kubectl apply --server-side --force-conflicts -f deploy/net/rendered/crd/ + kubectl apply --server-side --force-conflicts -f deploy/machina/rendered/crd/ + kubectl wait --for=condition=Established crd --all --timeout=120s + + - name: Workaround for #235 - merge bundled machina-config with live ConfigMap + if: env.MODE == 'upgrade' + # Same #235 workaround as release-upgrade.yaml: preserve the live + # per-cluster apiServerEndpoint (only set by 'site init') when applying + # the freshly rendered machina-config. The merge script lives in-tree + # on the snapshot, so no default-branch sparse-checkout is needed. + # + # REMOVE once https://github.com/Azure/unbounded/issues/235 lands. + env: + BUNDLE_FILE: deploy/machina/rendered/03-config.yaml + MERGE_SCRIPT: hack/release/merge-machina-config.py + run: | + set -euo pipefail + if [[ ! 
-f "$BUNDLE_FILE" ]]; then + echo "::warning::Workaround for #235 expected ${BUNDLE_FILE} but it was not present; the workaround may no longer be needed" + exit 0 + fi + + LIVE_INNER=$(kubectl -n unbounded-kube get cm machina-config \ + -o jsonpath='{.data.config\.yaml}' --ignore-not-found) + export LIVE_INNER + + python3 "$MERGE_SCRIPT" + + - name: Apply manifests (upgrade) + if: env.MODE == 'upgrade' + run: | + set -euo pipefail + kubectl apply --server-side --force-conflicts -R -f deploy/net/rendered/ + kubectl apply --server-side --force-conflicts -R -f deploy/machina/rendered/ + + - name: Wait for rollouts + run: | + set -euo pipefail + kubectl -n unbounded-net rollout status deploy/unbounded-net-controller --timeout=5m + kubectl -n unbounded-net rollout status ds/unbounded-net-node --timeout=5m + kubectl -n unbounded-kube rollout status deploy/machina-controller --timeout=5m + + - name: Summarize deploy + if: always() + run: | + set -euo pipefail + { + echo "## Nightly deploy" + echo "" + echo "- Tag: \`${TAG}\`" + echo "- Mode: \`${MODE:-unknown}\`" + echo "- Environment: \`unbounded-nightly\`" + echo "- Site: \`${SITE_NAME}\`" + echo "" + echo "### Workload images" + echo "" + echo '```' + kubectl -n unbounded-net get deploy unbounded-net-controller -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + kubectl -n unbounded-net get ds unbounded-net-node -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + kubectl -n unbounded-kube get deploy machina-controller -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + + # --------------------------------------------------------------------------- + # Deploy Orca (origin cache) to unbounded-nightly. Reuses the shared, + # deployment-neutral hack/orca/deploy-integration.sh (same script the stable + # deploy uses); it targets whatever cluster KUBECONFIG points at. 
+ # --------------------------------------------------------------------------- + deploy-orca: + needs: [resolve, deploy] + if: github.repository == 'Azure/unbounded' + runs-on: ubuntu-latest + environment: unbounded-nightly + env: + TAG: ${{ needs.resolve.outputs.tag }} + ORCA_AZURE_ACCOUNT: ${{ vars.ORCA_AZURE_ACCOUNT }} + ORCA_AZURE_CONTAINER: ${{ vars.ORCA_AZURE_CONTAINER }} + ORCA_AZURE_ENDPOINT: ${{ vars.ORCA_AZURE_ENDPOINT }} + steps: + - name: Validate Orca environment configuration + run: | + set -euo pipefail + missing=() + for v in ORCA_AZURE_ACCOUNT ORCA_AZURE_CONTAINER; do + if [[ -z "${!v}" ]]; then + missing+=("$v") + fi + done + if (( ${#missing[@]} > 0 )); then + echo "::error::Missing required Environment variables: ${missing[*]}" + echo "::error::Configure with hack/scripts/setup-deploy-environment.sh" + exit 1 + fi + + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install kubectl + uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0 + + - name: Write kubeconfig + env: + KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }} + run: | + set -euo pipefail + if [[ -z "${KUBECONFIG_CONTENT}" ]]; then + echo "::error::Environment secret KUBECONFIG is empty" + exit 1 + fi + umask 077 + printf '%s' "$KUBECONFIG_CONTENT" > "$RUNNER_TEMP/kubeconfig" + echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV" + + - name: Verify cluster connectivity + run: | + set -euo pipefail + kubectl version + kubectl cluster-info + + - name: Verify orca-credentials Secret exists + run: | + set -euo pipefail + if ! 
kubectl -n unbounded-kube get secret orca-credentials >/dev/null 2>&1; then + echo "::error::Secret orca-credentials not found in unbounded-kube; pre-create it with ORCA_AZUREBLOB_ACCOUNT_KEY, ORCA_CACHESTORE_S3_ACCESS_KEY, ORCA_CACHESTORE_S3_SECRET_KEY" + exit 1 + fi + + - name: Deploy Orca + run: | + set -euo pipefail + IMAGE="${REGISTRY,,}/orca:${TAG}" + echo "Deploying Orca image: ${IMAGE}" + ./hack/orca/deploy-integration.sh \ + --image "${IMAGE}" \ + --azure-account "${ORCA_AZURE_ACCOUNT}" \ + --azure-container "${ORCA_AZURE_CONTAINER}" \ + --azure-endpoint "${ORCA_AZURE_ENDPOINT}" + + - name: Summarize Orca deploy + if: always() + run: | + set -euo pipefail + { + echo "## Orca deploy" + echo "" + echo "- Tag: \`${TAG}\`" + echo "- Image: \`${REGISTRY,,}/orca:${TAG}\`" + echo "- Environment: \`unbounded-nightly\`" + echo "" + echo '```' + kubectl -n unbounded-kube get deploy orca -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + kubectl -n unbounded-kube get deploy garage -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + + # --------------------------------------------------------------------------- + # Discover smoke tests (shared with release-upgrade.yaml: hack/release/smoke). + # Runs from the snapshot checkout. Emits a JSON matrix; skips cleanly if the + # directory is absent or empty (GitHub errors on an empty matrix). + # --------------------------------------------------------------------------- + smoke-discover: + needs: [resolve, deploy] + runs-on: ubuntu-latest + outputs: + tasks: ${{ steps.list.outputs.tasks }} + has_tasks: ${{ steps.list.outputs.has_tasks }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Discover smoke tasks + id: list + run: | + set -euo pipefail + if [[ ! 
-d hack/release/smoke ]]; then + echo "has_tasks=false" >> "$GITHUB_OUTPUT" + echo "tasks=[]" >> "$GITHUB_OUTPUT" + echo "hack/release/smoke does not exist at this ref; skipping smoke tests" + exit 0 + fi + tasks=$( + find hack/release/smoke -maxdepth 1 -type f -name '*.sh' \ + | sort \ + | jq -R -s -c 'split("\n") | map(select(length>0)) | + map({name: (. | sub(".*/"; "") | sub(".sh$"; "")), script: .})' + ) + if [[ -z "$tasks" || "$tasks" == "[]" ]]; then + echo "has_tasks=false" >> "$GITHUB_OUTPUT" + echo "tasks=[]" >> "$GITHUB_OUTPUT" + echo "No smoke tasks found in hack/release/smoke/" + else + echo "has_tasks=true" >> "$GITHUB_OUTPUT" + echo "tasks=${tasks}" >> "$GITHUB_OUTPUT" + echo "Discovered smoke tasks:" + echo "${tasks}" | jq . + fi + + # --------------------------------------------------------------------------- + # Smoke tests. Runs once per deploy, after core + Orca deploys complete. Each + # task gets TAG, KUBECONFIG, and SITE_NAME exported. See the CUSTOMIZE block + # in release-upgrade.yaml for the smoke-script contract (it is shared). 
+ # --------------------------------------------------------------------------- + smoke-tests: + needs: [resolve, deploy, deploy-orca, smoke-discover] + if: needs.smoke-discover.outputs.has_tasks == 'true' + runs-on: ubuntu-latest + environment: unbounded-nightly + strategy: + fail-fast: false + matrix: + task: ${{ fromJSON(needs.smoke-discover.outputs.tasks) }} + name: smoke (${{ matrix.task.name }}) + env: + TAG: ${{ needs.resolve.outputs.tag }} + SITE_NAME: ${{ vars.SITE_NAME }} + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Install kubectl + uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0 + + - name: Write kubeconfig + env: + KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }} + run: | + set -euo pipefail + if [[ -z "${KUBECONFIG_CONTENT}" ]]; then + echo "::error::Environment secret KUBECONFIG is empty" + exit 1 + fi + umask 077 + printf '%s' "$KUBECONFIG_CONTENT" > "$RUNNER_TEMP/kubeconfig" + echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV" + + - name: Run smoke task ${{ matrix.task.name }} + timeout-minutes: 15 + env: + SMOKE_SCRIPT: ${{ matrix.task.script }} + run: | + set -euo pipefail + chmod +x "$SMOKE_SCRIPT" + "$SMOKE_SCRIPT" From 22bbd6ad8fd46b7d83b28dc5f83722c566b59c75 Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 00:21:54 -0400 Subject: [PATCH 03/12] Add one-shot setup-nightly-cluster.sh provisioner Automate the full unbounded-nightly operator runbook in a single idempotent script: - builds forge and runs 'forge cluster create' (which also makes the gateway pool labeled unbounded-cloud.io/unbounded-net-gateway=true, opens WireGuard ports, lays down the bootstrap token, and writes a kubeconfig); parses forge's JSON stdout for the RG / node RG / subscription / kubeconfig path - auto-detects the cluster node/pod CIDRs from AKS (mirroring 
aks-quickstart.sh:detect_cluster_cidrs); site CIDRs default to constants - creates the Orca origin storage account (default ub01) + container and reads its key - configures the unbounded-nightly Environment via setup-deploy-environment.sh - creates the unbounded-kube namespace + orca-credentials Secret - triggers nightly.yaml with force_init=true and watches it Point the nightly.yaml header's first-time-setup notes at the script (forge handles gateway labeling, CIDRs are auto-detected). --- .github/workflows/nightly.yaml | 27 +- hack/scripts/setup-nightly-cluster.sh | 501 ++++++++++++++++++++++++++ 2 files changed, 514 insertions(+), 14 deletions(-) create mode 100755 hack/scripts/setup-nightly-cluster.sh diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index deaf8d55..e6aafb9f 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -39,20 +39,19 @@ name: nightly # values (Azure account key and Garage S3 credentials) # # First-time setup (run once, out of band): -# 1. Provision the AKS cluster and configure the Environment: -# hack/scripts/setup-deploy-environment.sh \ -# --env-name unbounded-nightly --site-name nightly \ -# --kubeconfig --cluster-node-cidr ... --cluster-pod-cidr ... \ -# --site-node-cidr ... --site-pod-cidr ... \ -# --orca-azure-account ... --orca-azure-container ... -# Use CIDRs distinct from unbounded-stable. -# 2. Pre-create the orca-credentials Secret: -# hack/orca/create-credentials-secret.sh ... (against the nightly cluster) -# 3. Label at least one gateway node: -# kubectl label node unbounded-cloud.io/unbounded-net-gateway=true --overwrite -# 4. 
Bootstrap once (or let the first scheduled run auto-init, since the -# Site CRD is absent on a fresh cluster): -# gh workflow run nightly.yaml -f force_init=true +# Run the one-shot provisioner, which creates the AKS cluster with forge, +# creates the Orca origin, configures this Environment, pre-creates the +# orca-credentials Secret, and triggers the first (force_init) deploy: +# +# hack/scripts/setup-nightly-cluster.sh \ +# --subscription [--location ] +# +# forge provisions the gateway node pool already labeled +# unbounded-cloud.io/unbounded-net-gateway=true, so no manual node +# labeling is needed. The cluster node/pod CIDRs are auto-detected from +# AKS; the site CIDRs default to constants. See the script's --help for +# all flags. This workflow must already be on the default branch for the +# trigger step to work. # # Customization points (search for "CUSTOMIZE:" in this file) # ------------------------------------------------------------ diff --git a/hack/scripts/setup-nightly-cluster.sh b/hack/scripts/setup-nightly-cluster.sh new file mode 100755 index 00000000..bb68e208 --- /dev/null +++ b/hack/scripts/setup-nightly-cluster.sh @@ -0,0 +1,501 @@ +#!/usr/bin/env bash +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. +# +# setup-nightly-cluster.sh - One-shot provisioning of the unbounded-nightly +# integration cluster and everything the nightly deploy workflow needs. +# +# This script automates the operator runbook for the unbounded-nightly +# cluster (the nightly sibling of unbounded-stable; see +# .github/workflows/nightly.yaml). It: +# +# 1. Builds the forge tool (go build -o bin/forge). +# 2. Creates an AKS cluster with `forge cluster create`. forge also makes +# the gateway node pool (already labeled +# unbounded-cloud.io/unbounded-net-gateway=true), opens the WireGuard +# ports, lays down the unbounded bootstrap token, and writes a +# kubeconfig. 
Its stdout is a JSON object with ResourceGroup, +# NodePoolsResourceGroup, SubscriptionID, ClusterName, KubeconfigPath. +# 3. Auto-detects the cluster node/pod CIDRs from AKS (the same way +# hack/scripts/aks-quickstart.sh does). The site node/pod CIDRs are +# unbounded facts and default to constants (overridable). +# 4. Creates the Orca origin: an Azure Blob storage account (default +# ub<site>01, e.g. ubnightly01) + container, and reads its key. +# 5. Configures the unbounded-nightly GitHub Environment via +# hack/scripts/setup-deploy-environment.sh. +# 6. Creates the unbounded-kube namespace and the orca-credentials Secret +# on the cluster (hack/orca/create-credentials-secret.sh). +# 7. Triggers the nightly workflow (force_init=true) and watches it to +# completion. +# +# It is idempotent: re-running skips an existing cluster / storage account / +# namespace, and the Environment + Secret are create-or-update. +# +# Prerequisites: +# - az CLI, logged in (az login) to the target tenant/subscription. +# - kubectl, gh (authenticated with admin on --repo), go, openssl, jq. +# - The nightly workflow must already be on the repo's default branch +# (merge the PR that adds .github/workflows/nightly.yaml first), or +# step 7 has nothing to trigger. +# +# Usage: +# hack/scripts/setup-nightly-cluster.sh \ +# --subscription <id> \ +# --origin-container orca-origin \ +# [flags] +# +# See --help for all flags. + +set -euo pipefail + +# --------------------------------------------------------------------------- +# Defaults. +# --------------------------------------------------------------------------- +ENV_NAME="unbounded-nightly" +SITE_NAME="nightly" +CLUSTER_NAME="unbounded-nightly" +LOCATION="canadacentral" +REPO="Azure/unbounded" +SUBSCRIPTION="${AZURE_SUBSCRIPTION_ID:-}" +MANAGE_CNI_PLUGIN="true" + +# Site (unbounded overlay) CIDRs - unbounded facts, default constants.
+SITE_NODE_CIDR="10.1.0.0/16" +SITE_POD_CIDR="100.125.0.0/16" +# Standard Kubernetes pod CIDR fallback when AKS reports none (BYO CNI). +DEFAULT_POD_CIDR="10.244.0.0/16" + +# Cluster CIDRs - auto-detected from AKS unless overridden. +CLUSTER_NODE_CIDR="" +CLUSTER_POD_CIDR="" + +# forge cluster sizing (pass-through). +SYSTEM_POOL_NODE_COUNT="" +GATEWAY_POOL_NODE_COUNT="" +SYSTEM_POOL_NODE_SKU="" +GATEWAY_POOL_NODE_SKU="" + +# Orca origin. +ORIGIN_ACCOUNT="" +ORIGIN_CONTAINER="orca-origin" +ORIGIN_RG="" +ORIGIN_KEY="${ORCA_AZUREBLOB_ACCOUNT_KEY:-}" +ORCA_AZURE_ENDPOINT="" + +ASSUME_YES="false" +WATCH="true" + +# Populated by ensure_cluster. +RESOURCE_GROUP="" +NODE_RESOURCE_GROUP="" +KUBECONFIG_PATH="" + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)" + +# --------------------------------------------------------------------------- +# Logging helpers. +# --------------------------------------------------------------------------- +log() { echo -e ">> $*" >&2; } +info() { echo -e " $*" >&2; } +warn() { echo -e "!! $*" >&2; } +die() { echo -e "!! 
$*" >&2; exit 1; } + +usage() { + awk ' + /^#!/ { next } + /^#/ { sub(/^# ?/, ""); print; next } + { exit } + ' "${BASH_SOURCE[0]}" + cat >&2 <<'EOF' + +Flags: + --subscription ID Azure subscription ID (or env AZURE_SUBSCRIPTION_ID) + --location LOC Azure location (default: canadacentral) + --cluster-name NAME AKS cluster / resource group name (default: unbounded-nightly) + --env-name NAME GitHub Environment name (default: unbounded-nightly) + --site-name NAME unbounded site name (default: nightly) + --repo OWNER/NAME Target repository (default: Azure/unbounded) + --manage-cni-plugin BOOL Whether unbounded manages the CNI (default: true) + + --site-node-cidr CIDR Site node CIDR (default: 10.1.0.0/16) + --site-pod-cidr CIDR Site pod CIDR (default: 100.125.0.0/16) + --cluster-node-cidr CIDR Override auto-detected cluster node CIDR + --cluster-pod-cidr CIDR Override auto-detected cluster pod CIDR + + --origin-account NAME Orca origin storage account (default: ub01) + --origin-container NAME Orca origin blob container (default: orca-origin) + --origin-rg NAME Resource group for the origin account (default: cluster RG) + --origin-key KEY Origin account key (default: env ORCA_AZUREBLOB_ACCOUNT_KEY or fetched via az) + --orca-azure-endpoint URL Azure blob endpoint (default: *.blob.core.windows.net) + + --system-pool-node-count N forge system pool node count + --gateway-pool-node-count N forge gateway pool node count + --system-pool-node-sku SKU forge system pool VM SKU + --gateway-pool-node-sku SKU forge gateway pool VM SKU + + --no-watch Trigger the deploy run but do not wait for it + --yes Skip confirmation prompts + --help Show this help +EOF + exit "${1:-0}" +} + +# --------------------------------------------------------------------------- +# CIDR helpers (mirrors hack/scripts/aks-quickstart.sh). 
+# --------------------------------------------------------------------------- +is_valid_cidr() { + local cidr="$1" + [[ "$cidr" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+/[0-9]+$ ]] || return 1 + local prefix="${cidr#*/}" + (( prefix >= 0 && prefix <= 32 )) || return 1 + return 0 +} + +ip4_to_int() { + local IFS=. + read -r a b c d <<< "$1" + echo $(( (a << 24) | (b << 16) | (c << 8) | d )) +} + +# subnet_contains_all +subnet_contains_all() { + local prefix="${1%/*}" + local len="${1#*/}" + local mask=$(( 0xFFFFFFFF << (32 - len) & 0xFFFFFFFF )) + local net_int + net_int=$(ip4_to_int "$prefix") + local network=$(( net_int & mask )) + while IFS= read -r ip; do + [[ -z "$ip" ]] && continue + local ip_int + ip_int=$(ip4_to_int "$ip") + [[ $(( ip_int & mask )) -eq $network ]] || return 1 + done <<< "$2" + return 0 +} + +KCTL() { kubectl --kubeconfig "${KUBECONFIG_PATH}" "$@"; } + +# --------------------------------------------------------------------------- +# Argument parsing. +# --------------------------------------------------------------------------- +require_value() { + if [[ -z "${2:-}" || "${2:0:2}" == "--" ]]; then + die "flag $1 requires a value" + fi +} + +while [[ $# -gt 0 ]]; do + case "$1" in + --subscription) require_value "$1" "${2:-}"; SUBSCRIPTION="$2"; shift 2 ;; + --location) require_value "$1" "${2:-}"; LOCATION="$2"; shift 2 ;; + --cluster-name) require_value "$1" "${2:-}"; CLUSTER_NAME="$2"; shift 2 ;; + --env-name) require_value "$1" "${2:-}"; ENV_NAME="$2"; shift 2 ;; + --site-name) require_value "$1" "${2:-}"; SITE_NAME="$2"; shift 2 ;; + --repo) require_value "$1" "${2:-}"; REPO="$2"; shift 2 ;; + --manage-cni-plugin) require_value "$1" "${2:-}"; MANAGE_CNI_PLUGIN="$2"; shift 2 ;; + --site-node-cidr) require_value "$1" "${2:-}"; SITE_NODE_CIDR="$2"; shift 2 ;; + --site-pod-cidr) require_value "$1" "${2:-}"; SITE_POD_CIDR="$2"; shift 2 ;; + --cluster-node-cidr) require_value "$1" "${2:-}"; CLUSTER_NODE_CIDR="$2"; shift 2 ;; + --cluster-pod-cidr) 
require_value "$1" "${2:-}"; CLUSTER_POD_CIDR="$2"; shift 2 ;; + --origin-account) require_value "$1" "${2:-}"; ORIGIN_ACCOUNT="$2"; shift 2 ;; + --origin-container) require_value "$1" "${2:-}"; ORIGIN_CONTAINER="$2"; shift 2 ;; + --origin-rg) require_value "$1" "${2:-}"; ORIGIN_RG="$2"; shift 2 ;; + --origin-key) require_value "$1" "${2:-}"; ORIGIN_KEY="$2"; shift 2 ;; + --orca-azure-endpoint) require_value "$1" "${2:-}"; ORCA_AZURE_ENDPOINT="$2"; shift 2 ;; + --system-pool-node-count) require_value "$1" "${2:-}"; SYSTEM_POOL_NODE_COUNT="$2"; shift 2 ;; + --gateway-pool-node-count) require_value "$1" "${2:-}"; GATEWAY_POOL_NODE_COUNT="$2"; shift 2 ;; + --system-pool-node-sku) require_value "$1" "${2:-}"; SYSTEM_POOL_NODE_SKU="$2"; shift 2 ;; + --gateway-pool-node-sku) require_value "$1" "${2:-}"; GATEWAY_POOL_NODE_SKU="$2"; shift 2 ;; + --no-watch) WATCH="false"; shift ;; + --yes) ASSUME_YES="true"; shift ;; + --help|-h) usage 0 ;; + *) die "unknown argument: $1 (try --help)" ;; + esac +done + +# Derive defaults that depend on other flags. +[[ -n "${ORIGIN_ACCOUNT}" ]] || ORIGIN_ACCOUNT="ub${SITE_NAME}01" + +# --------------------------------------------------------------------------- +# Preflight. 
+# --------------------------------------------------------------------------- +preflight() { + log "Preflight checks" + + for tool in az kubectl gh go openssl jq; do + command -v "$tool" >/dev/null 2>&1 || die "'$tool' not found on PATH" + done + + az account show >/dev/null 2>&1 || die "az is not logged in; run 'az login' first" + gh auth status >/dev/null 2>&1 || die "gh is not authenticated; run 'gh auth login' first" + + [[ -n "${SUBSCRIPTION}" ]] || SUBSCRIPTION="$(az account show --query id -o tsv)" + [[ -n "${SUBSCRIPTION}" ]] || die "--subscription is required (or set AZURE_SUBSCRIPTION_ID)" + + is_valid_cidr "${SITE_NODE_CIDR}" || die "invalid --site-node-cidr: ${SITE_NODE_CIDR}" + is_valid_cidr "${SITE_POD_CIDR}" || die "invalid --site-pod-cidr: ${SITE_POD_CIDR}" + case "${MANAGE_CNI_PLUGIN}" in true|false) ;; *) die "--manage-cni-plugin must be true|false" ;; esac + + # Storage account names: 3-24 lowercase alphanumeric. + [[ "${ORIGIN_ACCOUNT}" =~ ^[a-z0-9]{3,24}$ ]] \ + || die "origin account '${ORIGIN_ACCOUNT}' must be 3-24 lowercase alphanumeric chars (override with --origin-account)" + + # The nightly workflow must exist on the default branch to be triggerable. + if ! gh workflow view nightly.yaml --repo "${REPO}" >/dev/null 2>&1; then + die "workflow 'nightly.yaml' not found on ${REPO}'s default branch; merge the PR that adds it before running this script" + fi + + info "subscription: ${SUBSCRIPTION}" + info "cluster: ${CLUSTER_NAME} (${LOCATION})" + info "environment: ${ENV_NAME} / site ${SITE_NAME}" + info "origin: ${ORIGIN_ACCOUNT}/${ORIGIN_CONTAINER}" +} + +confirm() { + [[ "${ASSUME_YES}" == "true" ]] && return 0 + echo >&2 + read -r -p "Proceed with provisioning ${ENV_NAME}? [y/N] " reply + case "${reply}" in y|Y|yes|YES) ;; *) die "aborted" ;; esac +} + +# --------------------------------------------------------------------------- +# Build forge. 
+# --------------------------------------------------------------------------- +build_forge() { + log "Building forge (go build -o bin/forge ./hack/cmd/forge)" + ( cd "${REPO_ROOT}" && go build -o bin/forge ./hack/cmd/forge ) +} + +# --------------------------------------------------------------------------- +# Create (or reuse) the AKS cluster. Sets RESOURCE_GROUP, +# NODE_RESOURCE_GROUP, KUBECONFIG_PATH. +# --------------------------------------------------------------------------- +ensure_cluster() { + if az aks show -g "${CLUSTER_NAME}" -n "${CLUSTER_NAME}" --subscription "${SUBSCRIPTION}" >/dev/null 2>&1; then + log "Cluster ${CLUSTER_NAME} already exists; reusing it" + RESOURCE_GROUP="${CLUSTER_NAME}" + NODE_RESOURCE_GROUP="$(az aks show -g "${CLUSTER_NAME}" -n "${CLUSTER_NAME}" \ + --subscription "${SUBSCRIPTION}" --query nodeResourceGroup -o tsv)" + KUBECONFIG_PATH="${HOME}/.unbounded-forge/${CLUSTER_NAME}/kubeconfig" + mkdir -p "$(dirname "${KUBECONFIG_PATH}")" + az aks get-credentials -g "${CLUSTER_NAME}" -n "${CLUSTER_NAME}" \ + --subscription "${SUBSCRIPTION}" --file "${KUBECONFIG_PATH}" --overwrite-existing >/dev/null + else + log "Creating AKS cluster ${CLUSTER_NAME} with forge" + local args=(cluster create --name "${CLUSTER_NAME}" --location "${LOCATION}" --subscription "${SUBSCRIPTION}") + [[ -n "${SYSTEM_POOL_NODE_COUNT}" ]] && args+=(--system-pool-node-count "${SYSTEM_POOL_NODE_COUNT}") + [[ -n "${GATEWAY_POOL_NODE_COUNT}" ]] && args+=(--gateway-pool-node-count "${GATEWAY_POOL_NODE_COUNT}") + [[ -n "${SYSTEM_POOL_NODE_SKU}" ]] && args+=(--system-pool-node-sku "${SYSTEM_POOL_NODE_SKU}") + [[ -n "${GATEWAY_POOL_NODE_SKU}" ]] && args+=(--gateway-pool-node-sku "${GATEWAY_POOL_NODE_SKU}") + + # forge prints progress logs to stderr and the result JSON to stdout. 
+ local out + out="$(AZURE_AUTH_CHAIN_ORDER=CLI "${REPO_ROOT}/bin/forge" "${args[@]}")" + + RESOURCE_GROUP="$(jq -r '.ResourceGroup' <<<"${out}")" + NODE_RESOURCE_GROUP="$(jq -r '.NodePoolsResourceGroup' <<<"${out}")" + KUBECONFIG_PATH="$(jq -r '.KubeconfigPath' <<<"${out}")" + fi + + [[ -n "${RESOURCE_GROUP}" && "${RESOURCE_GROUP}" != "null" ]] || die "could not determine cluster resource group" + [[ -n "${NODE_RESOURCE_GROUP}" && "${NODE_RESOURCE_GROUP}" != "null" ]] || die "could not determine node resource group" + [[ -f "${KUBECONFIG_PATH}" ]] || die "kubeconfig not found at ${KUBECONFIG_PATH}" + + [[ -z "${ORIGIN_RG}" ]] && ORIGIN_RG="${RESOURCE_GROUP}" + + info "resource group: ${RESOURCE_GROUP}" + info "node resource group: ${NODE_RESOURCE_GROUP}" + info "kubeconfig: ${KUBECONFIG_PATH}" +} + +# --------------------------------------------------------------------------- +# Detect cluster node/pod CIDRs from AKS (skipped if both overridden). +# Mirrors hack/scripts/aks-quickstart.sh:detect_cluster_cidrs. +# --------------------------------------------------------------------------- +detect_cluster_cidrs() { + if [[ -n "${CLUSTER_POD_CIDR}" && -n "${CLUSTER_NODE_CIDR}" ]]; then + info "using provided cluster CIDRs (node=${CLUSTER_NODE_CIDR}, pod=${CLUSTER_POD_CIDR})" + return 0 + fi + + log "Detecting cluster CIDRs from AKS" + + if [[ -z "${CLUSTER_POD_CIDR}" ]]; then + CLUSTER_POD_CIDR="$(az aks show --subscription "${SUBSCRIPTION}" \ + --resource-group "${CLUSTER_NAME}" --name "${CLUSTER_NAME}" \ + --query "networkProfile.podCidr" -o tsv)" + [[ "${CLUSTER_POD_CIDR}" == "None" ]] && CLUSTER_POD_CIDR="" + if [[ -z "${CLUSTER_POD_CIDR}" ]]; then + CLUSTER_POD_CIDR="${DEFAULT_POD_CIDR}" + info "no pod CIDR in AKS network profile (expected with BYO CNI); using default ${CLUSTER_POD_CIDR}" + fi + fi + + if [[ -z "${CLUSTER_NODE_CIDR}" ]]; then + # Nodes register with an InternalIP even before the CNI makes them Ready. 
+ local node_ips="" + local elapsed=0 + while (( elapsed < 300 )); do + node_ips="$(KCTL get nodes \ + -o jsonpath='{range .items[?(@.spec.providerID)]}{range .status.addresses[?(@.type=="InternalIP")]}{.address}{"\n"}{end}{end}' \ + 2>/dev/null | grep -v '^$' || true)" + [[ -n "${node_ips}" ]] && break + info "waiting for node IPs to appear..." + sleep 10 + (( elapsed += 10 )) + done + [[ -n "${node_ips}" ]] || die "could not retrieve node internal IPs" + + CLUSTER_NODE_CIDR="$(az network vnet list \ + --subscription "${SUBSCRIPTION}" --resource-group "${NODE_RESOURCE_GROUP}" \ + --query "[].subnets[].addressPrefix" -o tsv | while IFS= read -r prefix; do + [[ -z "${prefix}" ]] && continue + subnet_contains_all "${prefix}" "${node_ips}" && echo "${prefix}" && break + done)" + [[ -n "${CLUSTER_NODE_CIDR}" ]] || die "could not find a VNet subnet containing all node IPs" + fi + + is_valid_cidr "${CLUSTER_NODE_CIDR}" || die "detected invalid cluster node CIDR: ${CLUSTER_NODE_CIDR}" + is_valid_cidr "${CLUSTER_POD_CIDR}" || die "invalid cluster pod CIDR: ${CLUSTER_POD_CIDR}" + info "cluster node CIDR: ${CLUSTER_NODE_CIDR}" + info "cluster pod CIDR: ${CLUSTER_POD_CIDR}" +} + +# --------------------------------------------------------------------------- +# Create the Orca origin storage account + container; resolve its key. 
+# --------------------------------------------------------------------------- +ensure_origin() { + log "Ensuring Orca origin storage account ${ORIGIN_ACCOUNT} (rg ${ORIGIN_RG})" + + if az storage account show -n "${ORIGIN_ACCOUNT}" -g "${ORIGIN_RG}" \ + --subscription "${SUBSCRIPTION}" >/dev/null 2>&1; then + info "storage account ${ORIGIN_ACCOUNT} already exists" + else + az storage account create \ + --name "${ORIGIN_ACCOUNT}" --resource-group "${ORIGIN_RG}" \ + --location "${LOCATION}" --subscription "${SUBSCRIPTION}" \ + --sku Standard_LRS --kind StorageV2 --min-tls-version TLS1_2 \ + --allow-blob-public-access false --only-show-errors >/dev/null + info "created storage account ${ORIGIN_ACCOUNT}" + fi + + if [[ -z "${ORIGIN_KEY}" ]]; then + ORIGIN_KEY="$(az storage account keys list \ + --account-name "${ORIGIN_ACCOUNT}" --resource-group "${ORIGIN_RG}" \ + --subscription "${SUBSCRIPTION}" --query "[0].value" -o tsv)" + fi + [[ -n "${ORIGIN_KEY}" ]] || die "could not resolve storage account key for ${ORIGIN_ACCOUNT}" + + log "Ensuring blob container ${ORIGIN_CONTAINER}" + az storage container create \ + --name "${ORIGIN_CONTAINER}" \ + --account-name "${ORIGIN_ACCOUNT}" --account-key "${ORIGIN_KEY}" \ + --only-show-errors >/dev/null +} + +# --------------------------------------------------------------------------- +# Configure the GitHub Environment. 
+# --------------------------------------------------------------------------- +configure_environment() { + log "Configuring GitHub Environment ${ENV_NAME}" + local args=( + --env-name "${ENV_NAME}" + --kubeconfig "${KUBECONFIG_PATH}" + --site-name "${SITE_NAME}" + --cluster-node-cidr "${CLUSTER_NODE_CIDR}" + --cluster-pod-cidr "${CLUSTER_POD_CIDR}" + --site-node-cidr "${SITE_NODE_CIDR}" + --site-pod-cidr "${SITE_POD_CIDR}" + --manage-cni-plugin "${MANAGE_CNI_PLUGIN}" + --orca-azure-account "${ORIGIN_ACCOUNT}" + --orca-azure-container "${ORIGIN_CONTAINER}" + --repo "${REPO}" + --yes + ) + [[ -n "${ORCA_AZURE_ENDPOINT}" ]] && args+=(--orca-azure-endpoint "${ORCA_AZURE_ENDPOINT}") + + "${SCRIPT_DIR}/setup-deploy-environment.sh" "${args[@]}" +} + +# --------------------------------------------------------------------------- +# Create the unbounded-kube namespace + orca-credentials Secret on cluster. +# --------------------------------------------------------------------------- +ensure_secret() { + log "Ensuring unbounded-kube namespace and orca-credentials Secret" + KCTL get namespace unbounded-kube >/dev/null 2>&1 || KCTL create namespace unbounded-kube + + KUBECONFIG="${KUBECONFIG_PATH}" "${REPO_ROOT}/hack/orca/create-credentials-secret.sh" \ + --azure-account-key "${ORIGIN_KEY}" +} + +# --------------------------------------------------------------------------- +# Trigger the nightly workflow (force_init) and optionally watch it. +# --------------------------------------------------------------------------- +trigger_deploy() { + log "Triggering nightly workflow (force_init=true)" + + # Record the newest run id before dispatch so we can identify the new one. + local before + before="$(gh run list --repo "${REPO}" --workflow nightly.yaml \ + --limit 1 --json databaseId --jq '.[0].databaseId // 0' 2>/dev/null || echo 0)" + + gh workflow run nightly.yaml --repo "${REPO}" -f force_init=true + + log "Waiting for the run to register..." 
+ local run_id="" elapsed=0 + while (( elapsed < 60 )); do + run_id="$(gh run list --repo "${REPO}" --workflow nightly.yaml \ + --limit 1 --json databaseId --jq '.[0].databaseId // 0' 2>/dev/null || echo 0)" + [[ -n "${run_id}" && "${run_id}" != "0" && "${run_id}" != "${before}" ]] && break + sleep 3 + (( elapsed += 3 )) + done + + if [[ -z "${run_id}" || "${run_id}" == "0" || "${run_id}" == "${before}" ]]; then + warn "could not identify the new run; check: gh run list --repo ${REPO} --workflow nightly.yaml" + return 0 + fi + + local run_url + run_url="$(gh run view "${run_id}" --repo "${REPO}" --json url --jq '.url' 2>/dev/null || true)" + info "run: ${run_url:-${run_id}}" + + if [[ "${WATCH}" == "true" ]]; then + log "Watching run ${run_id} to completion" + gh run watch "${run_id}" --repo "${REPO}" --exit-status + fi +} + +# --------------------------------------------------------------------------- +# Print a verification summary. +# --------------------------------------------------------------------------- +verify() { + log "Cluster state" + KCTL -n unbounded-net rollout status deploy/unbounded-net-controller --timeout=60s 2>/dev/null || true + KCTL -n unbounded-net rollout status ds/unbounded-net-node --timeout=60s 2>/dev/null || true + KCTL -n unbounded-kube rollout status deploy/machina-controller --timeout=60s 2>/dev/null || true + KCTL -n unbounded-kube get deploy orca garage 2>/dev/null || true +} + +# --------------------------------------------------------------------------- +# Main. +# --------------------------------------------------------------------------- +main() { + preflight + confirm + build_forge + ensure_cluster + detect_cluster_cidrs + ensure_origin + configure_environment + ensure_secret + trigger_deploy + verify + + log "Done. unbounded-nightly is provisioned and the first deploy was triggered." + info "Subsequent runs deploy automatically every morning at 06:00 UTC." 
+} + +main "$@" From eac9442a949a8d7251df0942498fc81991952412 Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:05:17 -0400 Subject: [PATCH 04/12] nightly: enable pre-merge testing from the branch workflow_dispatch and schedule only run from the default branch, so the nightly workflow could not be exercised before merge. Add a TEMPORARY push trigger on the nightly-deploy branch (marked for removal before merge) so pushing the branch runs its own workflow file end-to-end (build -> init deploy -> Orca -> smoke) against the unbounded-nightly cluster, giving reviewers a real run to inspect. Also fix the resolve job to snapshot github.sha instead of the default branch, so a push-triggered run builds the pushed commit rather than main. This is a correctness fix and stays after merge. setup-nightly-cluster.sh gains --no-trigger: provision the cluster, origin, Environment, and Secret without dispatching, then push the branch to fire the run. The default-branch workflow preflight is skipped in that mode. --- .github/workflows/nightly.yaml | 14 ++++++++- hack/scripts/setup-nightly-cluster.sh | 45 +++++++++++++++++++++++---- 2 files changed, 52 insertions(+), 7 deletions(-) diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index e6aafb9f..93617cbb 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -66,6 +66,15 @@ on: # 06:00 UTC daily. See the "Schedule" note in the header above. - cron: "0 6 * * *" + # TEMPORARY (remove before merge): allows this workflow to be tested from + # the nightly-deploy branch. workflow_dispatch and schedule only run from + # the default branch, so a push trigger is the only way to exercise a + # branch's own workflow file before it is merged. Pushing nightly-deploy + # runs the full build -> deploy -> smoke against the unbounded-nightly + # cluster (init mode is auto-selected while the Site CRD is absent). 
+ push: + branches: [nightly-deploy] + # Manual trigger for re-deploys, first bootstrap, and testing a specific ref. workflow_dispatch: inputs: @@ -108,7 +117,10 @@ jobs: steps: - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 with: - ref: ${{ inputs.ref || github.event.repository.default_branch }} + # Snapshot the commit this run is for: the pushed commit on a push + # event, or the default-branch head on schedule/dispatch. inputs.ref + # overrides for a manual snapshot of a specific ref. + ref: ${{ inputs.ref || github.sha }} fetch-depth: 0 - name: Resolve snapshot commit and tag diff --git a/hack/scripts/setup-nightly-cluster.sh b/hack/scripts/setup-nightly-cluster.sh index bb68e208..7dfba8cb 100755 --- a/hack/scripts/setup-nightly-cluster.sh +++ b/hack/scripts/setup-nightly-cluster.sh @@ -84,6 +84,7 @@ ORCA_AZURE_ENDPOINT="" ASSUME_YES="false" WATCH="true" +TRIGGER="true" # Populated by ensure_cluster. RESOURCE_GROUP="" @@ -135,6 +136,10 @@ Flags: --gateway-pool-node-sku SKU forge gateway pool VM SKU --no-watch Trigger the deploy run but do not wait for it + --no-trigger Provision only; do not trigger the deploy run. + Use this to test the workflow from a branch before + it is on the default branch: provision, then push + the branch to fire its push-triggered run. 
--yes Skip confirmation prompts --help Show this help EOF @@ -209,6 +214,7 @@ while [[ $# -gt 0 ]]; do --system-pool-node-sku) require_value "$1" "${2:-}"; SYSTEM_POOL_NODE_SKU="$2"; shift 2 ;; --gateway-pool-node-sku) require_value "$1" "${2:-}"; GATEWAY_POOL_NODE_SKU="$2"; shift 2 ;; --no-watch) WATCH="false"; shift ;; + --no-trigger) TRIGGER="false"; shift ;; --yes) ASSUME_YES="true"; shift ;; --help|-h) usage 0 ;; *) die "unknown argument: $1 (try --help)" ;; @@ -242,9 +248,13 @@ preflight() { [[ "${ORIGIN_ACCOUNT}" =~ ^[a-z0-9]{3,24}$ ]] \ || die "origin account '${ORIGIN_ACCOUNT}' must be 3-24 lowercase alphanumeric chars (override with --origin-account)" - # The nightly workflow must exist on the default branch to be triggerable. - if ! gh workflow view nightly.yaml --repo "${REPO}" >/dev/null 2>&1; then - die "workflow 'nightly.yaml' not found on ${REPO}'s default branch; merge the PR that adds it before running this script" + # The nightly workflow must exist on the default branch to be triggerable + # via workflow_dispatch. Skipped when --no-trigger (e.g. pre-merge testing, + # where the run is fired by pushing the branch instead). + if [[ "${TRIGGER}" == "true" ]]; then + if ! gh workflow view nightly.yaml --repo "${REPO}" >/dev/null 2>&1; then + die "workflow 'nightly.yaml' not found on ${REPO}'s default branch; merge the PR that adds it, or pass --no-trigger to provision and test from a branch" + fi fi info "subscription: ${SUBSCRIPTION}" @@ -432,8 +442,23 @@ ensure_secret() { # --------------------------------------------------------------------------- # Trigger the nightly workflow (force_init) and optionally watch it. +# +# With --no-trigger we provision only and tell the operator how to fire the +# run by pushing the branch (used to test the workflow before it is on the +# default branch, where workflow_dispatch is not available). 
# --------------------------------------------------------------------------- trigger_deploy() { + if [[ "${TRIGGER}" != "true" ]]; then + local branch + branch="$(git -C "${REPO_ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || echo '')" + log "Skipping deploy trigger (--no-trigger)" + info "Provisioning is complete. To run the workflow from this branch, push it:" + info " git push origin ${branch}" + info "The push-triggered run does the build, init deploy, Orca deploy, and smoke." + info "Watch it: gh run watch \$(gh run list --repo ${REPO} --workflow nightly.yaml --limit 1 --json databaseId --jq '.[0].databaseId') --repo ${REPO}" + return 0 + fi + log "Triggering nightly workflow (force_init=true)" # Record the newest run id before dispatch so we can identify the new one. @@ -492,10 +517,18 @@ main() { configure_environment ensure_secret trigger_deploy - verify - log "Done. unbounded-nightly is provisioned and the first deploy was triggered." - info "Subsequent runs deploy automatically every morning at 06:00 UTC." + # Cluster state is only meaningful once a deploy has actually completed. + if [[ "${TRIGGER}" == "true" && "${WATCH}" == "true" ]]; then + verify + fi + + if [[ "${TRIGGER}" == "true" ]]; then + log "Done. unbounded-nightly is provisioned and the first deploy was triggered." + info "Subsequent runs deploy automatically every morning at 06:00 UTC." + else + log "Done. unbounded-nightly is provisioned (deploy not triggered)." + fi } main "$@" From db1a0779de35badc0b9d4fcabac0edae346a14f4 Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 01:27:20 -0400 Subject: [PATCH 05/12] setup-deploy-environment: don't prompt on empty variable values set_var ran 'gh variable set --body ""' for empty values (e.g. a blank ORCA_AZURE_ENDPOINT, which means 'use the default *.blob.core.windows.net'). 
gh treats an empty --body as no value on a TTY and prompts interactively ('Paste your variable'), hanging non-interactive callers like setup-nightly-cluster.sh. Pipe the value via stdin instead; stdin is never a TTY here, so empty values are stored without prompting. --- hack/scripts/setup-deploy-environment.sh | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hack/scripts/setup-deploy-environment.sh b/hack/scripts/setup-deploy-environment.sh index c128bc8b..f34648e3 100755 --- a/hack/scripts/setup-deploy-environment.sh +++ b/hack/scripts/setup-deploy-environment.sh @@ -228,10 +228,13 @@ set_var() { local name="$1" local value="$2" echo "==> Setting variable $name" - if ! gh variable set "$name" \ + # Pipe the value via stdin instead of --body: `gh variable set --body ""` + # treats an empty value as "no value" on a TTY and prompts interactively + # ("Paste your variable"), which would hang non-interactive callers. Stdin + # is never a TTY here, so an empty value is stored without prompting. + if ! printf '%s' "$value" | gh variable set "$name" \ --repo "$REPO" \ - --env "$ENV_NAME" \ - --body "$value"; then + --env "$ENV_NAME"; then echo "error: failed to set variable $name" >&2 exit 3 fi From ad74ff7895d8b891f9b1df26fa4394e08366e696 Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 02:00:18 -0400 Subject: [PATCH 06/12] setup-deploy-environment: skip empty variables instead of failing GitHub Actions variables cannot be empty: 'gh variable set' with an empty value returns HTTP 422 (missing required key 'value'). A blank ORCA_AZURE_ENDPOINT (meaning 'use the default *.blob.core.windows.net') therefore broke setup. Skip empty values in set_var; an unset variable already resolves to "" in the workflow, which is the intended behavior. Supersedes the earlier stdin approach (which avoided the interactive prompt but still sent an empty value and hit the 422). 
--- hack/scripts/setup-deploy-environment.sh | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/hack/scripts/setup-deploy-environment.sh b/hack/scripts/setup-deploy-environment.sh index f34648e3..45cf1adb 100755 --- a/hack/scripts/setup-deploy-environment.sh +++ b/hack/scripts/setup-deploy-environment.sh @@ -227,14 +227,19 @@ fi set_var() { local name="$1" local value="$2" + # GitHub Actions variables cannot be empty (the API returns HTTP 422 on a + # missing value). An unset variable already resolves to "" in the workflow, + # which is the intended behavior (e.g. a blank ORCA_AZURE_ENDPOINT => the + # Orca driver uses the default *.blob.core.windows.net). So skip empties. + if [[ -z "$value" ]]; then + echo "==> Skipping empty variable $name" + return 0 + fi echo "==> Setting variable $name" - # Pipe the value via stdin instead of --body: `gh variable set --body ""` - # treats an empty value as "no value" on a TTY and prompts interactively - # ("Paste your variable"), which would hang non-interactive callers. Stdin - # is never a TTY here, so an empty value is stored without prompting. - if ! printf '%s' "$value" | gh variable set "$name" \ + if ! gh variable set "$name" \ --repo "$REPO" \ - --env "$ENV_NAME"; then + --env "$ENV_NAME" \ + --body "$value"; then echo "error: failed to set variable $name" >&2 exit 3 fi From 6abdbd3835be9a9a66ef330a61aa11ee94e01625 Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 02:07:02 -0400 Subject: [PATCH 07/12] setup-deploy-environment: disable gh pager for non-interactive runs The 'Configured secrets'/'Configured variables' summaries call 'gh secret list' / 'gh variable list', which page their table output through an interactive pager when stdout is a TTY, dropping the caller (including setup-nightly-cluster.sh) into a pager that needs manual 'q' to exit. Export GH_PAGER=cat so the whole script runs without paging. 
--- hack/scripts/setup-deploy-environment.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/hack/scripts/setup-deploy-environment.sh b/hack/scripts/setup-deploy-environment.sh index 45cf1adb..637ff1ad 100755 --- a/hack/scripts/setup-deploy-environment.sh +++ b/hack/scripts/setup-deploy-environment.sh @@ -23,6 +23,11 @@ set -euo pipefail IFS=$'\n\t' +# This script is meant to run non-interactively (it is also invoked by +# setup-nightly-cluster.sh). Disable gh's pager so the secret/variable list +# summaries below never drop the caller into an interactive pager. +export GH_PAGER=cat + REPO="Azure/unbounded" MANAGE_CNI_PLUGIN="true" ASSUME_YES="false" From e42260d35352011eb2dc089d2f783a34113a5e08 Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 12:03:06 -0400 Subject: [PATCH 08/12] setup-deploy-environment: add --channel and drop obsolete gateway step The closing 'Next steps' hint was stable/release-specific and partly wrong: it told operators to label a gateway node (forge already labels the gwmain pool on both stable and nightly clusters) and to trigger release-upgrade.yaml with a tag (nightly deploys a snapshot of main via nightly.yaml on a schedule, not tags). Add --channel stable|nightly (default stable, so stable output is unchanged apart from removing the obsolete gateway-labeling step) and print channel-appropriate trigger guidance. setup-nightly-cluster.sh passes --channel nightly. 
--- hack/scripts/setup-deploy-environment.sh | 36 ++++++++++++++++++++---- hack/scripts/setup-nightly-cluster.sh | 1 + 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/hack/scripts/setup-deploy-environment.sh b/hack/scripts/setup-deploy-environment.sh index 637ff1ad..cf7bac70 100755 --- a/hack/scripts/setup-deploy-environment.sh +++ b/hack/scripts/setup-deploy-environment.sh @@ -31,6 +31,7 @@ export GH_PAGER=cat REPO="Azure/unbounded" MANAGE_CNI_PLUGIN="true" ASSUME_YES="false" +DEPLOY_CHANNEL="stable" ENV_NAME="" KUBECONFIG_PATH="" @@ -64,6 +65,7 @@ Required: Optional: --manage-cni-plugin BOOL Whether unbounded manages the CNI (true|false). Default: true + --channel CHANNEL Deploy channel for the "next steps" hint: stable|nightly. Default: stable --orca-azure-account NAME Azure storage account for the Orca origin (enables Orca deploy) --orca-azure-container NAME Azure blob container for the Orca origin --orca-azure-endpoint URL Azure blob endpoint (optional; blank => *.blob.core.windows.net) @@ -101,6 +103,7 @@ while [[ $# -gt 0 ]]; do --site-node-cidr) require_value "$1" "${2:-}"; SITE_NODE_CIDR="$2"; shift 2 ;; --site-pod-cidr) require_value "$1" "${2:-}"; SITE_POD_CIDR="$2"; shift 2 ;; --manage-cni-plugin) require_value "$1" "${2:-}"; MANAGE_CNI_PLUGIN="$2"; shift 2 ;; + --channel) require_value "$1" "${2:-}"; DEPLOY_CHANNEL="$2"; shift 2 ;; --orca-azure-account) require_value "$1" "${2:-}"; ORCA_AZURE_ACCOUNT="$2"; shift 2 ;; --orca-azure-container) require_value "$1" "${2:-}"; ORCA_AZURE_CONTAINER="$2"; shift 2 ;; --orca-azure-endpoint) require_value "$1" "${2:-}"; ORCA_AZURE_ENDPOINT="$2"; shift 2 ;; @@ -140,6 +143,12 @@ case "$MANAGE_CNI_PLUGIN" in *) die "--manage-cni-plugin must be 'true' or 'false', got '$MANAGE_CNI_PLUGIN'" ;; esac +# Validate deploy channel (only affects the "next steps" hint printed below). 
+case "$DEPLOY_CHANNEL" in + stable|nightly) ;; + *) die "--channel must be 'stable' or 'nightly', got '$DEPLOY_CHANNEL'" ;; +esac + # Validate Orca config: account and container go together (endpoint is # optional). If neither is set, the Orca deploy job is left unconfigured. if [[ -n "$ORCA_AZURE_ACCOUNT" && -z "$ORCA_AZURE_CONTAINER" ]]; then @@ -277,17 +286,34 @@ cat < \\ - unbounded-cloud.io/unbounded-net-gateway=true --overwrite + 1. Trigger the first install (run once per cluster): + gh workflow run nightly.yaml \\ + --repo $REPO \\ + -f force_init=true + + 2. Subsequent nightly snapshots of main deploy automatically to $ENV_NAME + at 06:00 UTC. +EOF +else + cat < Date: Tue, 23 Jun 2026 19:21:39 -0400 Subject: [PATCH 09/12] nightly: bump actions/checkout to v7.0.0 to match main main moved actions/checkout from v6.0.3 to v7.0.0 repo-wide (incl. release.yaml and release-upgrade.yaml). Bump the 7 pins in nightly.yaml to stay in lockstep. v7 is compatible with the ref/fetch-depth usage here; no behavior change. --- .github/workflows/nightly.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index 93617cbb..3f3e1b69 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -115,7 +115,7 @@ jobs: sha: ${{ steps.resolve.outputs.sha }} tag: ${{ steps.resolve.outputs.tag }} steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: # Snapshot the commit this run is for: the pushed commit on a push # event, or the default-branch head on schedule/dispatch. 
inputs.ref @@ -152,7 +152,7 @@ jobs: TAG: ${{ needs.resolve.outputs.tag }} SHA: ${{ needs.resolve.outputs.sha }} steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: ref: ${{ needs.resolve.outputs.sha }} @@ -266,7 +266,7 @@ jobs: TAG: ${{ needs.resolve.outputs.tag }} SHA: ${{ needs.resolve.outputs.sha }} steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: ref: ${{ needs.resolve.outputs.sha }} @@ -341,7 +341,7 @@ jobs: exit 1 fi - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: ref: ${{ needs.resolve.outputs.sha }} @@ -518,7 +518,7 @@ jobs: exit 1 fi - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: ref: ${{ needs.resolve.outputs.sha }} @@ -597,7 +597,7 @@ jobs: tasks: ${{ steps.list.outputs.tasks }} has_tasks: ${{ steps.list.outputs.has_tasks }} steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: ref: ${{ needs.resolve.outputs.sha }} @@ -647,7 +647,7 @@ jobs: TAG: ${{ needs.resolve.outputs.tag }} SITE_NAME: ${{ vars.SITE_NAME }} steps: - - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: ref: ${{ needs.resolve.outputs.sha }} From 57fc2dd85f12d3dd87b6e345ea58e8cbad9f20ea Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 20:46:44 -0400 Subject: [PATCH 10/12] nightly: deploy Orca with a single replica The default forge cluster has a 2-node 
system pool (the gateway pool is tainted), which cannot fit Orca's default 3 replicas alongside Garage and the net/machina workloads, so deploy-orca timed out waiting for the orca rollout. unbounded-nightly is a soak target, not HA, so run a single Orca replica via deploy-integration.sh --replicas 1. --- .github/workflows/nightly.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index 3f3e1b69..fcc8b27e 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -562,8 +562,12 @@ jobs: set -euo pipefail IMAGE="${REGISTRY,,}/orca:${TAG}" echo "Deploying Orca image: ${IMAGE}" + # Single replica: unbounded-nightly is a soak target, not HA, and + # the default forge cluster (2-node system pool) cannot fit Orca's + # default 3 replicas alongside Garage + net/machina. ./hack/orca/deploy-integration.sh \ --image "${IMAGE}" \ + --replicas 1 \ --azure-account "${ORCA_AZURE_ACCOUNT}" \ --azure-container "${ORCA_AZURE_CONTAINER}" \ --azure-endpoint "${ORCA_AZURE_ENDPOINT}" From 92370afc4e6f6f251dc019e220276bf08294438f Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 22:23:11 -0400 Subject: [PATCH 11/12] orca: grant Garage bucket by access key id; stop regenerating keys Orca failed at startup with a Garage 403 on GetBucketVersioning ('Operation is not allowed for this key'). Root cause: the one-shot re-ran create-credentials-secret.sh on every invocation, which mints fresh Garage S3 keys, while bootstrap-garage.sh imports each under the same name 'orca' and grants the bucket via --key orca. With several keys sharing that name the grant became ambiguous and landed on a stale key, leaving the key Orca actually uses unauthorized. 
Fixes: - bootstrap-garage.sh grants (key allow / bucket allow) by the unique access key id instead of the name, so the current Secret's key always gets authorized (self-heals clusters that already drifted on the next deploy-orca run). Benefits stable too. - setup-nightly-cluster.sh leaves an existing orca-credentials Secret untouched instead of regenerating its keys; delete the Secret to rotate. --- hack/orca/bootstrap-garage.sh | 10 ++++++++-- hack/scripts/setup-nightly-cluster.sh | 9 +++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/hack/orca/bootstrap-garage.sh b/hack/orca/bootstrap-garage.sh index 087748f2..d6021524 100755 --- a/hack/orca/bootstrap-garage.sh +++ b/hack/orca/bootstrap-garage.sh @@ -121,11 +121,17 @@ secret_key="$(secret_value ORCA_CACHESTORE_S3_SECRET_KEY)" if ! gexec key list 2>/dev/null | grep -q "${access_key}"; then gexec key import "${access_key}" "${secret_key}" -n "${KEY_NAME}" --yes fi -gexec key allow --create-bucket "${KEY_NAME}" >/dev/null 2>&1 || true +# Grant by the unique access key id, not the human-readable name: if the +# Secret's keys were ever regenerated, Garage ends up with multiple keys +# sharing the name "${KEY_NAME}", and a name-based grant is ambiguous (it can +# land on a stale key, leaving the key Orca actually uses unauthorized and +# Orca failing with a 403 on its first cachestore call). The access key id is +# unique, so granting on it always targets the key currently in the Secret. +gexec key allow --create-bucket "${access_key}" >/dev/null 2>&1 || true # Ensure the cachestore bucket exists and is owned by the key. gexec bucket info "${BUCKET}" >/dev/null 2>&1 || gexec bucket create "${BUCKET}" -gexec bucket allow --read --write --owner --key "${KEY_NAME}" "${BUCKET}" >/dev/null 2>&1 || true +gexec bucket allow --read --write --owner --key "${access_key}" "${BUCKET}" >/dev/null 2>&1 || true # Verify the bucket is queryable before declaring success. 
gexec bucket info "${BUCKET}" >/dev/null 2>&1 \ diff --git a/hack/scripts/setup-nightly-cluster.sh b/hack/scripts/setup-nightly-cluster.sh index 62dee5e6..0e04b265 100755 --- a/hack/scripts/setup-nightly-cluster.sh +++ b/hack/scripts/setup-nightly-cluster.sh @@ -437,6 +437,15 @@ ensure_secret() { log "Ensuring unbounded-kube namespace and orca-credentials Secret" KCTL get namespace unbounded-kube >/dev/null 2>&1 || KCTL create namespace unbounded-kube + # Leave an existing Secret untouched. create-credentials-secret.sh generates + # fresh Garage S3 keys on every invocation; regenerating them on a re-run + # after Garage already granted the bucket to the previous key strands Orca's + # key (Garage 403). To rotate keys, delete the Secret first. + if KCTL -n unbounded-kube get secret orca-credentials >/dev/null 2>&1; then + info "orca-credentials Secret already exists; leaving its keys unchanged" + return 0 + fi + KUBECONFIG="${KUBECONFIG_PATH}" "${REPO_ROOT}/hack/orca/create-credentials-secret.sh" \ --azure-account-key "${ORIGIN_KEY}" } From 4ad7c84ebdf2e566b95b7bfc607efa9e7e931229 Mon Sep 17 00:00:00 2001 From: Philip Lombardi <893096+plombardi89@users.noreply.github.com> Date: Tue, 23 Jun 2026 23:12:20 -0400 Subject: [PATCH 12/12] nightly: remove temporary branch push trigger Pre-merge testing from the nightly-deploy branch is complete, so drop the temporary push trigger. The workflow now runs only on its 06:00 UTC schedule (against the default-branch head) and via manual workflow_dispatch. It does NOT run on merges/pushes to main. Keeps the github.sha resolve fix and the --no-trigger provisioning support. --- .github/workflows/nightly.yaml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml index fcc8b27e..a7583f06 100644 --- a/.github/workflows/nightly.yaml +++ b/.github/workflows/nightly.yaml @@ -66,15 +66,6 @@ on: # 06:00 UTC daily. 
See the "Schedule" note in the header above. - cron: "0 6 * * *" - # TEMPORARY (remove before merge): allows this workflow to be tested from - # the nightly-deploy branch. workflow_dispatch and schedule only run from - # the default branch, so a push trigger is the only way to exercise a - # branch's own workflow file before it is merged. Pushing nightly-deploy - # runs the full build -> deploy -> smoke against the unbounded-nightly - # cluster (init mode is auto-selected while the Site CRD is absent). - push: - branches: [nightly-deploy] - # Manual trigger for re-deploys, first bootstrap, and testing a specific ref. workflow_dispatch: inputs: @@ -117,9 +108,8 @@ jobs: steps: - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 with: - # Snapshot the commit this run is for: the pushed commit on a push - # event, or the default-branch head on schedule/dispatch. inputs.ref - # overrides for a manual snapshot of a specific ref. + # Snapshot the default-branch head (github.sha) on schedule/dispatch. + # inputs.ref overrides for a manual snapshot of a specific ref. ref: ${{ inputs.ref || github.sha }} fetch-depth: 0