diff --git a/.github/workflows/nightly.yaml b/.github/workflows/nightly.yaml new file mode 100644 index 00000000..a7583f06 --- /dev/null +++ b/.github/workflows/nightly.yaml @@ -0,0 +1,671 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +name: nightly + +# --------------------------------------------------------------------------- +# Builds a snapshot of `main` from source every morning and deploys it to the +# integration cluster (unbounded-nightly) so the tip of the tree gets the same +# soak treatment that releases get on unbounded-stable. +# +# Relationship to release-upgrade.yaml +# ------------------------------------ +# This is the nightly sibling of .github/workflows/release-upgrade.yaml. That +# workflow deploys a PUBLISHED, signed release (downloaded as a manifests +# tarball from a GitHub Release) to unbounded-stable. This one has no release +# and no tag: it builds the current main HEAD from source, pushes images +# tagged `nightly-<short-sha>`, renders manifests against those tags, and +# deploys them to unbounded-nightly. There is deliberately no cosign signing, +# SBOM, or Trivy gating here: the nightly cluster is a throwaway soak target, +# not a customer artifact. The full supply chain stays on the release path. +# +# Schedule +# -------- +# Runs at 06:00 UTC daily. GitHub Actions cron is UTC-only and does not follow +# US daylight saving, so this lands at 01:00 ET (EST) / 02:00 ET (EDT). We run +# early on purpose: a from-source build of every component plus a deploy and +# smoke pass needs to finish well before the US working day, so 06:00 UTC is +# chosen over a literal 4am-ET cron to leave that headroom. 
+# +# Target cluster +# -------------- +# The target cluster is configured via the `unbounded-nightly` GitHub +# Environment, which provides: +# - secret KUBECONFIG (raw kubeconfig file contents) +# - vars SITE_NAME, CLUSTER_NODE_CIDR, CLUSTER_POD_CIDR, +# SITE_NODE_CIDR, SITE_POD_CIDR, MANAGE_CNI_PLUGIN, +# ORCA_AZURE_ACCOUNT, ORCA_AZURE_CONTAINER, ORCA_AZURE_ENDPOINT +# - cluster Secret unbounded-kube/orca-credentials for confidential Orca +# values (Azure account key and Garage S3 credentials) +# +# First-time setup (run once, out of band): +# Run the one-shot provisioner, which creates the AKS cluster with forge, +# creates the Orca origin, configures this Environment, pre-creates the +# orca-credentials Secret, and triggers the first (force_init) deploy: +# +# hack/scripts/setup-nightly-cluster.sh \ +# --subscription <ID> [--location <LOC>] +# +# forge provisions the gateway node pool already labeled +# unbounded-cloud.io/unbounded-net-gateway=true, so no manual node +# labeling is needed. The cluster node/pod CIDRs are auto-detected from +# AKS; the site CIDRs default to constants. See the script's --help for +# all flags. This workflow must already be on the default branch for the +# trigger step to work. +# +# Customization points (search for "CUSTOMIZE:" in this file) +# ------------------------------------------------------------ +# - Add a smoke test -> drop a script in hack/release/smoke/ +# (shared with release-upgrade.yaml) +# - Change the schedule -> edit the cron below +# - Change target cluster -> change `environment:` on the relevant jobs +# --------------------------------------------------------------------------- + +on: + schedule: + # 06:00 UTC daily. See the "Schedule" note in the header above. + - cron: "0 6 * * *" + + # Manual trigger for re-deploys, first bootstrap, and testing a specific ref. 
+ workflow_dispatch: + inputs: + ref: + description: "Git ref to snapshot (default: the default branch)" + required: false + type: string + force_init: + description: "Run 'site init' instead of upgrade-apply (use for first-ever bootstrap)" + type: boolean + default: false + +permissions: + contents: read + packages: write + +# Only the freshest snapshot matters. If a new nightly starts while an old one +# is still running, cancel the old one. +concurrency: + group: deploy-nightly + cancel-in-progress: true + +env: + REGISTRY: ghcr.io/${{ github.repository_owner }} + CNI_PLUGINS_VERSION: v1.9.1 + +jobs: + # --------------------------------------------------------------------------- + # Resolve the commit to snapshot and derive the nightly image tag. + # + # The downstream build and deploy jobs all pin to `resolve.outputs.sha` so a + # branch advance mid-run cannot split the snapshot across two commits. + # --------------------------------------------------------------------------- + resolve: + if: github.repository == 'Azure/unbounded' + runs-on: ubuntu-latest + outputs: + sha: ${{ steps.resolve.outputs.sha }} + tag: ${{ steps.resolve.outputs.tag }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + # Snapshot the default-branch head (github.sha) on schedule/dispatch. + # inputs.ref overrides for a manual snapshot of a specific ref. + ref: ${{ inputs.ref || github.sha }} + fetch-depth: 0 + + - name: Resolve snapshot commit and tag + id: resolve + run: | + set -euo pipefail + SHA="$(git rev-parse HEAD)" + SHORT="$(git rev-parse --short HEAD)" + TAG="nightly-${SHORT}" + { + echo "sha=${SHA}" + echo "tag=${TAG}" + } >> "$GITHUB_OUTPUT" + echo "Snapshot commit: ${SHA}" + echo "Nightly tag: ${TAG}" + + # --------------------------------------------------------------------------- + # Build and push the net controller/node images for the snapshot (amd64). 
+ # + # Mirrors release.yaml's net-images job minus the multi-arch matrix, Trivy + # gating, cosign signing, and SBOM attestation. The frontend is built inline + # (release.yaml shares it across a per-arch matrix via an artifact; we have a + # single amd64 build, so inline is simpler) and staged into the embed dir. + # --------------------------------------------------------------------------- + net-images: + needs: resolve + runs-on: ubuntu-latest + env: + TAG: ${{ needs.resolve.outputs.tag }} + SHA: ${{ needs.resolve.outputs.sha }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Normalize container registry to lowercase + run: echo "REGISTRY=${REGISTRY,,}" >> "$GITHUB_ENV" + + - uses: actions/setup-node@48b55a011bda9f5d6aeb4c2d9c7362e8dae4041e # v6.0.0 + with: + node-version: "24" + cache: npm + cache-dependency-path: frontend/package-lock.json + + - name: Build frontend and stage into the net embed dir + run: | + set -euo pipefail + npm ci --prefer-offline --no-audit + npm run build + # internal/net/html/pages.go embeds internal/net/html/dist via + # go:embed; the Containerfile copies the whole context, so the Go + # build picks up exactly this snapshot's UI. + rm -rf ../internal/net/html/dist + mkdir -p ../internal/net/html/dist + cp -R dist/. ../internal/net/html/dist/ + working-directory: frontend + + - name: Cache CNI plugins + uses: actions/cache@27d5ce7f107fe9357f9df03efb73ab90386fccae # v5.0.5 + with: + path: resources + key: cni-plugins-amd64-${{ env.CNI_PLUGINS_VERSION }} + + - name: Download CNI plugins + run: | + set -euo pipefail + mkdir -p resources + file="resources/cni-plugins-linux-amd64-${CNI_PLUGINS_VERSION}.tgz" + if [ ! 
-s "$file" ]; then + curl -fsSL \ + "https://github.com/containernetworking/plugins/releases/download/${CNI_PLUGINS_VERSION}/cni-plugins-linux-amd64-${CNI_PLUGINS_VERSION}.tgz" \ + -o "$file" + fi + + - uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0 + + - name: Log in to ghcr.io + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Compute Go version for Docker + id: goversion + run: echo "go_version=$(awk '/^go /{split($2,v,"."); print v[1]"."v[2]}' go.mod)" >> "$GITHUB_OUTPUT" + + - name: Build and push controller image + uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0 + with: + context: . + file: images/net/Containerfile + target: controller + platforms: linux/amd64 + push: true + provenance: false + build-args: | + GO_VERSION=${{ steps.goversion.outputs.go_version }} + VERSION=${{ env.TAG }} + GIT_COMMIT=${{ env.SHA }} + CNI_PLUGINS_VERSION=${{ env.CNI_PLUGINS_VERSION }} + tags: ${{ env.REGISTRY }}/unbounded-net-controller:${{ env.TAG }} + cache-from: type=gha,scope=nightly-net-controller + cache-to: type=gha,scope=nightly-net-controller,mode=max + + - name: Build and push node image + uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0 + with: + context: . 
+ file: images/net/Containerfile + target: node + platforms: linux/amd64 + push: true + provenance: false + build-args: | + GO_VERSION=${{ steps.goversion.outputs.go_version }} + VERSION=${{ env.TAG }} + GIT_COMMIT=${{ env.SHA }} + CNI_PLUGINS_VERSION=${{ env.CNI_PLUGINS_VERSION }} + tags: ${{ env.REGISTRY }}/unbounded-net-node:${{ env.TAG }} + cache-from: type=gha,scope=nightly-net-node + cache-to: type=gha,scope=nightly-net-node,mode=max + + # --------------------------------------------------------------------------- + # Build and push the remaining images the deploy needs (amd64). + # - machina : applied as part of the core deploy + # - orca : deployed separately by the deploy-orca job + # machine-ops-controller is intentionally omitted: like release-upgrade.yaml, + # the deploy applies only net/ and machina/, so machine-ops is never rolled + # out here and would be dead weight to build. + # --------------------------------------------------------------------------- + component-images: + needs: resolve + runs-on: ubuntu-latest + strategy: + matrix: + component: + - name: machina + file: images/machina/Containerfile + - name: orca + file: images/orca/Containerfile + env: + TAG: ${{ needs.resolve.outputs.tag }} + SHA: ${{ needs.resolve.outputs.sha }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Normalize container registry to lowercase + run: echo "REGISTRY=${REGISTRY,,}" >> "$GITHUB_ENV" + + - uses: docker/setup-buildx-action@d7f5e7f509e45cec5c76c4d5afdd7de93d0b3df5 # v4.1.0 + + - name: Log in to ghcr.io + uses: docker/login-action@650006c6eb7dba73a995cc03b0b2d7f5ca915bee # v4.2.0 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push ${{ matrix.component.name }} image + uses: docker/build-push-action@f9f3042f7e2789586610d6e8b85c8f03e5195baf # v7.2.0 + with: + context: . 
+ file: ${{ matrix.component.file }} + platforms: linux/amd64 + push: true + provenance: false + build-args: | + VERSION=${{ env.TAG }} + GIT_COMMIT=${{ env.SHA }} + tags: ${{ env.REGISTRY }}/${{ matrix.component.name }}:${{ env.TAG }} + cache-from: type=gha,scope=nightly-${{ matrix.component.name }} + cache-to: type=gha,scope=nightly-${{ matrix.component.name }},mode=max + + # --------------------------------------------------------------------------- + # Deploy the snapshot to unbounded-nightly. + # + # MODE=init -> 'kubectl unbounded site init' using manifests embedded + # into a plugin built from this snapshot (stamped with the + # nightly image tags). Used for the first bootstrap. + # MODE=upgrade -> render the machina + net manifests for the nightly tags + # and server-side apply them. + # + # Unlike release-upgrade.yaml the manifests are rendered here from the + # snapshot checkout (no tarball download, no cosign verify): there is no + # release artifact for a nightly. + # --------------------------------------------------------------------------- + deploy: + needs: [resolve, net-images, component-images] + if: github.repository == 'Azure/unbounded' + runs-on: ubuntu-latest + environment: unbounded-nightly + env: + TAG: ${{ needs.resolve.outputs.tag }} + SITE_NAME: ${{ vars.SITE_NAME }} + CLUSTER_NODE_CIDR: ${{ vars.CLUSTER_NODE_CIDR }} + CLUSTER_POD_CIDR: ${{ vars.CLUSTER_POD_CIDR }} + SITE_NODE_CIDR: ${{ vars.SITE_NODE_CIDR }} + SITE_POD_CIDR: ${{ vars.SITE_POD_CIDR }} + MANAGE_CNI_PLUGIN: ${{ vars.MANAGE_CNI_PLUGIN }} + outputs: + mode: ${{ steps.mode.outputs.mode }} + steps: + - name: Validate environment configuration + run: | + set -euo pipefail + missing=() + for v in SITE_NAME CLUSTER_NODE_CIDR CLUSTER_POD_CIDR SITE_NODE_CIDR SITE_POD_CIDR MANAGE_CNI_PLUGIN; do + if [[ -z "${!v}" ]]; then + missing+=("$v") + fi + done + if (( ${#missing[@]} > 0 )); then + echo "::error::Missing required Environment variables: ${missing[*]}" + echo "::error::Configure 
with hack/scripts/setup-deploy-environment.sh" + exit 1 + fi + + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install kubectl + uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0 + + - name: Write kubeconfig + env: + KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }} + run: | + set -euo pipefail + if [[ -z "${KUBECONFIG_CONTENT}" ]]; then + echo "::error::Environment secret KUBECONFIG is empty" + exit 1 + fi + umask 077 + printf '%s' "$KUBECONFIG_CONTENT" > "$RUNNER_TEMP/kubeconfig" + echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV" + + - name: Verify cluster connectivity + run: | + set -euo pipefail + kubectl version + kubectl cluster-info + + - name: Determine deploy mode + id: mode + run: | + set -euo pipefail + MODE=upgrade + if [[ "${{ inputs.force_init }}" == "true" ]]; then + MODE=init + echo "force_init=true; selecting init mode" + elif ! kubectl get crd sites.net.unbounded-cloud.io >/dev/null 2>&1; then + MODE=init + echo "Site CRD not present; selecting init mode" + elif ! kubectl get site.net.unbounded-cloud.io "${SITE_NAME}" >/dev/null 2>&1; then + MODE=init + echo "Site '${SITE_NAME}' not present; selecting init mode" + else + echo "Site '${SITE_NAME}' exists; selecting upgrade mode" + fi + echo "MODE=${MODE}" >> "$GITHUB_ENV" + echo "mode=${MODE}" >> "$GITHUB_OUTPUT" + + - name: Build kubectl-unbounded plugin (init) + if: env.MODE == 'init' + # Building the plugin re-renders the embedded net + machina manifests + # stamped with this snapshot's nightly tags, so 'site init' installs + # manifests that reference the images built above. 
+ run: | + set -euo pipefail + make kubectl-unbounded-build VERSION="${TAG}" CONTAINER_REGISTRY="${REGISTRY,,}" + + - name: Run site init (first install) + if: env.MODE == 'init' + run: | + set -euo pipefail + ./bin/kubectl-unbounded site init \ + --name "${SITE_NAME}" \ + --cluster-node-cidr "${CLUSTER_NODE_CIDR}" \ + --cluster-pod-cidr "${CLUSTER_POD_CIDR}" \ + --node-cidr "${SITE_NODE_CIDR}" \ + --pod-cidr "${SITE_POD_CIDR}" \ + --manage-cni-plugin="${MANAGE_CNI_PLUGIN}" + + - name: Render manifests (upgrade) + if: env.MODE == 'upgrade' + run: | + set -euo pipefail + make machina-manifests net-manifests \ + VERSION="${TAG}" \ + CONTAINER_REGISTRY="${REGISTRY,,}" + + - name: Apply CRDs (upgrade) + if: env.MODE == 'upgrade' + run: | + set -euo pipefail + kubectl apply --server-side --force-conflicts -f deploy/net/rendered/crd/ + kubectl apply --server-side --force-conflicts -f deploy/machina/rendered/crd/ + kubectl wait --for=condition=Established crd --all --timeout=120s + + - name: Workaround for #235 - merge bundled machina-config with live ConfigMap + if: env.MODE == 'upgrade' + # Same #235 workaround as release-upgrade.yaml: preserve the live + # per-cluster apiServerEndpoint (only set by 'site init') when applying + # the freshly rendered machina-config. The merge script lives in-tree + # on the snapshot, so no default-branch sparse-checkout is needed. + # + # REMOVE once https://github.com/Azure/unbounded/issues/235 lands. + env: + BUNDLE_FILE: deploy/machina/rendered/03-config.yaml + MERGE_SCRIPT: hack/release/merge-machina-config.py + run: | + set -euo pipefail + if [[ ! 
-f "$BUNDLE_FILE" ]]; then + echo "::warning::Workaround for #235 expected ${BUNDLE_FILE} but it was not present; the workaround may no longer be needed" + exit 0 + fi + + LIVE_INNER=$(kubectl -n unbounded-kube get cm machina-config \ + -o jsonpath='{.data.config\.yaml}' --ignore-not-found) + export LIVE_INNER + + python3 "$MERGE_SCRIPT" + + - name: Apply manifests (upgrade) + if: env.MODE == 'upgrade' + run: | + set -euo pipefail + kubectl apply --server-side --force-conflicts -R -f deploy/net/rendered/ + kubectl apply --server-side --force-conflicts -R -f deploy/machina/rendered/ + + - name: Wait for rollouts + run: | + set -euo pipefail + kubectl -n unbounded-net rollout status deploy/unbounded-net-controller --timeout=5m + kubectl -n unbounded-net rollout status ds/unbounded-net-node --timeout=5m + kubectl -n unbounded-kube rollout status deploy/machina-controller --timeout=5m + + - name: Summarize deploy + if: always() + run: | + set -euo pipefail + { + echo "## Nightly deploy" + echo "" + echo "- Tag: \`${TAG}\`" + echo "- Mode: \`${MODE:-unknown}\`" + echo "- Environment: \`unbounded-nightly\`" + echo "- Site: \`${SITE_NAME}\`" + echo "" + echo "### Workload images" + echo "" + echo '```' + kubectl -n unbounded-net get deploy unbounded-net-controller -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + kubectl -n unbounded-net get ds unbounded-net-node -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + kubectl -n unbounded-kube get deploy machina-controller -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + + # --------------------------------------------------------------------------- + # Deploy Orca (origin cache) to unbounded-nightly. Reuses the shared, + # deployment-neutral hack/orca/deploy-integration.sh (same script the stable + # deploy uses); it targets whatever cluster KUBECONFIG points at. 
+ # --------------------------------------------------------------------------- + deploy-orca: + needs: [resolve, deploy] + if: github.repository == 'Azure/unbounded' + runs-on: ubuntu-latest + environment: unbounded-nightly + env: + TAG: ${{ needs.resolve.outputs.tag }} + ORCA_AZURE_ACCOUNT: ${{ vars.ORCA_AZURE_ACCOUNT }} + ORCA_AZURE_CONTAINER: ${{ vars.ORCA_AZURE_CONTAINER }} + ORCA_AZURE_ENDPOINT: ${{ vars.ORCA_AZURE_ENDPOINT }} + steps: + - name: Validate Orca environment configuration + run: | + set -euo pipefail + missing=() + for v in ORCA_AZURE_ACCOUNT ORCA_AZURE_CONTAINER; do + if [[ -z "${!v}" ]]; then + missing+=("$v") + fi + done + if (( ${#missing[@]} > 0 )); then + echo "::error::Missing required Environment variables: ${missing[*]}" + echo "::error::Configure with hack/scripts/setup-deploy-environment.sh" + exit 1 + fi + + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Set up Go + uses: actions/setup-go@4a3601121dd01d1626a1e23e37211e3254c1c06c # v6.4.0 + with: + go-version-file: go.mod + + - name: Install kubectl + uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0 + + - name: Write kubeconfig + env: + KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }} + run: | + set -euo pipefail + if [[ -z "${KUBECONFIG_CONTENT}" ]]; then + echo "::error::Environment secret KUBECONFIG is empty" + exit 1 + fi + umask 077 + printf '%s' "$KUBECONFIG_CONTENT" > "$RUNNER_TEMP/kubeconfig" + echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV" + + - name: Verify cluster connectivity + run: | + set -euo pipefail + kubectl version + kubectl cluster-info + + - name: Verify orca-credentials Secret exists + run: | + set -euo pipefail + if ! 
kubectl -n unbounded-kube get secret orca-credentials >/dev/null 2>&1; then + echo "::error::Secret orca-credentials not found in unbounded-kube; pre-create it with ORCA_AZUREBLOB_ACCOUNT_KEY, ORCA_CACHESTORE_S3_ACCESS_KEY, ORCA_CACHESTORE_S3_SECRET_KEY" + exit 1 + fi + + - name: Deploy Orca + run: | + set -euo pipefail + IMAGE="${REGISTRY,,}/orca:${TAG}" + echo "Deploying Orca image: ${IMAGE}" + # Single replica: unbounded-nightly is a soak target, not HA, and + # the default forge cluster (2-node system pool) cannot fit Orca's + # default 3 replicas alongside Garage + net/machina. + ./hack/orca/deploy-integration.sh \ + --image "${IMAGE}" \ + --replicas 1 \ + --azure-account "${ORCA_AZURE_ACCOUNT}" \ + --azure-container "${ORCA_AZURE_CONTAINER}" \ + --azure-endpoint "${ORCA_AZURE_ENDPOINT}" + + - name: Summarize Orca deploy + if: always() + run: | + set -euo pipefail + { + echo "## Orca deploy" + echo "" + echo "- Tag: \`${TAG}\`" + echo "- Image: \`${REGISTRY,,}/orca:${TAG}\`" + echo "- Environment: \`unbounded-nightly\`" + echo "" + echo '```' + kubectl -n unbounded-kube get deploy orca -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + kubectl -n unbounded-kube get deploy garage -o jsonpath='{.spec.template.spec.containers[*].image}{"\n"}' 2>/dev/null || true + echo '```' + } >> "$GITHUB_STEP_SUMMARY" + + # --------------------------------------------------------------------------- + # Discover smoke tests (shared with release-upgrade.yaml: hack/release/smoke). + # Runs from the snapshot checkout. Emits a JSON matrix; skips cleanly if the + # directory is absent or empty (GitHub errors on an empty matrix). 
+ # --------------------------------------------------------------------------- + smoke-discover: + needs: [resolve, deploy] + runs-on: ubuntu-latest + outputs: + tasks: ${{ steps.list.outputs.tasks }} + has_tasks: ${{ steps.list.outputs.has_tasks }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Discover smoke tasks + id: list + run: | + set -euo pipefail + if [[ ! -d hack/release/smoke ]]; then + echo "has_tasks=false" >> "$GITHUB_OUTPUT" + echo "tasks=[]" >> "$GITHUB_OUTPUT" + echo "hack/release/smoke does not exist at this ref; skipping smoke tests" + exit 0 + fi + tasks=$( + find hack/release/smoke -maxdepth 1 -type f -name '*.sh' \ + | sort \ + | jq -R -s -c 'split("\n") | map(select(length>0)) | + map({name: (. | sub(".*/"; "") | sub(".sh$"; "")), script: .})' + ) + if [[ -z "$tasks" || "$tasks" == "[]" ]]; then + echo "has_tasks=false" >> "$GITHUB_OUTPUT" + echo "tasks=[]" >> "$GITHUB_OUTPUT" + echo "No smoke tasks found in hack/release/smoke/" + else + echo "has_tasks=true" >> "$GITHUB_OUTPUT" + echo "tasks=${tasks}" >> "$GITHUB_OUTPUT" + echo "Discovered smoke tasks:" + echo "${tasks}" | jq . + fi + + # --------------------------------------------------------------------------- + # Smoke tests. Runs once per deploy, after core + Orca deploys complete. Each + # task gets TAG, KUBECONFIG, and SITE_NAME exported. See the CUSTOMIZE block + # in release-upgrade.yaml for the smoke-script contract (it is shared). 
+ # --------------------------------------------------------------------------- + smoke-tests: + needs: [resolve, deploy, deploy-orca, smoke-discover] + if: needs.smoke-discover.outputs.has_tasks == 'true' + runs-on: ubuntu-latest + environment: unbounded-nightly + strategy: + fail-fast: false + matrix: + task: ${{ fromJSON(needs.smoke-discover.outputs.tasks) }} + name: smoke (${{ matrix.task.name }}) + env: + TAG: ${{ needs.resolve.outputs.tag }} + SITE_NAME: ${{ vars.SITE_NAME }} + steps: + - uses: actions/checkout@9c091bb21b7c1c1d1991bb908d89e4e9dddfe3e0 # v7.0.0 + with: + ref: ${{ needs.resolve.outputs.sha }} + + - name: Install kubectl + uses: azure/setup-kubectl@829323503d1be3d00ca8346e5391ca0b07a9ab0d # v5.1.0 + + - name: Write kubeconfig + env: + KUBECONFIG_CONTENT: ${{ secrets.KUBECONFIG }} + run: | + set -euo pipefail + if [[ -z "${KUBECONFIG_CONTENT}" ]]; then + echo "::error::Environment secret KUBECONFIG is empty" + exit 1 + fi + umask 077 + printf '%s' "$KUBECONFIG_CONTENT" > "$RUNNER_TEMP/kubeconfig" + echo "KUBECONFIG=$RUNNER_TEMP/kubeconfig" >> "$GITHUB_ENV" + + - name: Run smoke task ${{ matrix.task.name }} + timeout-minutes: 15 + env: + SMOKE_SCRIPT: ${{ matrix.task.script }} + run: | + set -euo pipefail + chmod +x "$SMOKE_SCRIPT" + "$SMOKE_SCRIPT" diff --git a/.github/workflows/release-upgrade.yaml b/.github/workflows/release-upgrade.yaml index 7c821653..14e6a300 100644 --- a/.github/workflows/release-upgrade.yaml +++ b/.github/workflows/release-upgrade.yaml @@ -465,7 +465,7 @@ jobs: # part of the shippable release tarball. release.yaml DOES build and # push the Orca image (/orca:), but its manifests plus # the test-only Garage cachestore are deployed here from hack/orca via - # hack/orca/deploy-stable.sh. + # hack/orca/deploy-integration.sh. # # Origin is real Azure Blob storage; cachestore is an in-cluster # single-node Garage backed by a PVC. 
Confidential values (Azure @@ -552,7 +552,7 @@ jobs: set -euo pipefail IMAGE="${REGISTRY,,}/orca:${TAG}" echo "Deploying Orca image: ${IMAGE}" - ./hack/orca/deploy-stable.sh \ + ./hack/orca/deploy-integration.sh \ --image "${IMAGE}" \ --azure-account "${ORCA_AZURE_ACCOUNT}" \ --azure-container "${ORCA_AZURE_CONTAINER}" \ diff --git a/hack/orca/bootstrap-garage.sh b/hack/orca/bootstrap-garage.sh index 087748f2..d6021524 100755 --- a/hack/orca/bootstrap-garage.sh +++ b/hack/orca/bootstrap-garage.sh @@ -121,11 +121,17 @@ secret_key="$(secret_value ORCA_CACHESTORE_S3_SECRET_KEY)" if ! gexec key list 2>/dev/null | grep -q "${access_key}"; then gexec key import "${access_key}" "${secret_key}" -n "${KEY_NAME}" --yes fi -gexec key allow --create-bucket "${KEY_NAME}" >/dev/null 2>&1 || true +# Grant by the unique access key id, not the human-readable name: if the +# Secret's keys were ever regenerated, Garage ends up with multiple keys +# sharing the name "${KEY_NAME}", and a name-based grant is ambiguous (it can +# land on a stale key, leaving the key Orca actually uses unauthorized and +# Orca failing with a 403 on its first cachestore call). The access key id is +# unique, so granting on it always targets the key currently in the Secret. +gexec key allow --create-bucket "${access_key}" >/dev/null 2>&1 || true # Ensure the cachestore bucket exists and is owned by the key. gexec bucket info "${BUCKET}" >/dev/null 2>&1 || gexec bucket create "${BUCKET}" -gexec bucket allow --read --write --owner --key "${KEY_NAME}" "${BUCKET}" >/dev/null 2>&1 || true +gexec bucket allow --read --write --owner --key "${access_key}" "${BUCKET}" >/dev/null 2>&1 || true # Verify the bucket is queryable before declaring success. 
gexec bucket info "${BUCKET}" >/dev/null 2>&1 \ diff --git a/hack/orca/create-credentials-secret.sh b/hack/orca/create-credentials-secret.sh index 02f51cf6..3d12f39a 100755 --- a/hack/orca/create-credentials-secret.sh +++ b/hack/orca/create-credentials-secret.sh @@ -10,7 +10,7 @@ # - ORCA_CACHESTORE_S3_ACCESS_KEY Garage S3 access key id (generated here) # - ORCA_CACHESTORE_S3_SECRET_KEY Garage S3 secret key (generated here) # -# The Garage S3 keys are the single source of truth: hack/orca/deploy-stable.sh +# The Garage S3 keys are the single source of truth: hack/orca/deploy-integration.sh # imports them into Garage (via bootstrap-garage.sh) and injects them into # Orca via envFrom. This script generates fresh ones in the format Garage # requires (access id = "GK" + 12 hex bytes; secret = 32 hex bytes) unless @@ -100,5 +100,5 @@ cat >&2 < *.blob.core.windows.net) @@ -96,6 +103,7 @@ while [[ $# -gt 0 ]]; do --site-node-cidr) require_value "$1" "${2:-}"; SITE_NODE_CIDR="$2"; shift 2 ;; --site-pod-cidr) require_value "$1" "${2:-}"; SITE_POD_CIDR="$2"; shift 2 ;; --manage-cni-plugin) require_value "$1" "${2:-}"; MANAGE_CNI_PLUGIN="$2"; shift 2 ;; + --channel) require_value "$1" "${2:-}"; DEPLOY_CHANNEL="$2"; shift 2 ;; --orca-azure-account) require_value "$1" "${2:-}"; ORCA_AZURE_ACCOUNT="$2"; shift 2 ;; --orca-azure-container) require_value "$1" "${2:-}"; ORCA_AZURE_CONTAINER="$2"; shift 2 ;; --orca-azure-endpoint) require_value "$1" "${2:-}"; ORCA_AZURE_ENDPOINT="$2"; shift 2 ;; @@ -135,6 +143,12 @@ case "$MANAGE_CNI_PLUGIN" in *) die "--manage-cni-plugin must be 'true' or 'false', got '$MANAGE_CNI_PLUGIN'" ;; esac +# Validate deploy channel (only affects the "next steps" hint printed below). +case "$DEPLOY_CHANNEL" in + stable|nightly) ;; + *) die "--channel must be 'stable' or 'nightly', got '$DEPLOY_CHANNEL'" ;; +esac + # Validate Orca config: account and container go together (endpoint is # optional). If neither is set, the Orca deploy job is left unconfigured. 
if [[ -n "$ORCA_AZURE_ACCOUNT" && -z "$ORCA_AZURE_CONTAINER" ]]; then @@ -227,6 +241,14 @@ fi set_var() { local name="$1" local value="$2" + # GitHub Actions variables cannot be empty (the API returns HTTP 422 on a + # missing value). An unset variable already resolves to "" in the workflow, + # which is the intended behavior (e.g. a blank ORCA_AZURE_ENDPOINT => the + # Orca driver uses the default *.blob.core.windows.net). So skip empties. + if [[ -z "$value" ]]; then + echo "==> Skipping empty variable $name" + return 0 + fi echo "==> Setting variable $name" if ! gh variable set "$name" \ --repo "$REPO" \ @@ -264,17 +286,34 @@ cat < \\ - unbounded-cloud.io/unbounded-net-gateway=true --overwrite + 1. Trigger the first install (run once per cluster): + gh workflow run nightly.yaml \\ + --repo $REPO \\ + -f force_init=true + + 2. Subsequent nightly snapshots of main deploy automatically to $ENV_NAME + at 06:00 UTC. +EOF +else + cat <01, e.g. ubnightly01) + container, and reads its key. +# 5. Configures the unbounded-nightly GitHub Environment via +# hack/scripts/setup-deploy-environment.sh. +# 6. Creates the unbounded-kube namespace and the orca-credentials Secret +# on the cluster (hack/orca/create-credentials-secret.sh). +# 7. Triggers the nightly workflow (force_init=true) and watches it to +# completion. +# +# It is idempotent: re-running skips an existing cluster / storage account / +# namespace, and the Environment + Secret are create-or-update. +# +# Prerequisites: +# - az CLI, logged in (az login) to the target tenant/subscription. +# - kubectl, gh (authenticated with admin on --repo), go, openssl, jq. +# - The nightly workflow must already be on the repo's default branch +# (merge the PR that adds .github/workflows/nightly.yaml first), or +# step 7 has nothing to trigger. +# +# Usage: +# hack/scripts/setup-nightly-cluster.sh \ +# --subscription \ +# --orca-azure-container orca-origin \ +# [flags] +# +# See --help for all flags. 

set -euo pipefail

# ---------------------------------------------------------------------------
# Defaults.
# ---------------------------------------------------------------------------
ENV_NAME="unbounded-nightly"
SITE_NAME="nightly"
CLUSTER_NAME="unbounded-nightly"
LOCATION="canadacentral"
REPO="Azure/unbounded"
SUBSCRIPTION="${AZURE_SUBSCRIPTION_ID:-}"
MANAGE_CNI_PLUGIN="true"

# Site (unbounded overlay) CIDRs - unbounded facts, default constants.
SITE_NODE_CIDR="10.1.0.0/16"
SITE_POD_CIDR="100.125.0.0/16"
# Standard Kubernetes pod CIDR fallback when AKS reports none (BYO CNI).
DEFAULT_POD_CIDR="10.244.0.0/16"

# Cluster CIDRs - auto-detected from AKS unless overridden.
CLUSTER_NODE_CIDR=""
CLUSTER_POD_CIDR=""

# forge cluster sizing (pass-through).
SYSTEM_POOL_NODE_COUNT=""
GATEWAY_POOL_NODE_COUNT=""
SYSTEM_POOL_NODE_SKU=""
GATEWAY_POOL_NODE_SKU=""

# Orca origin.
ORIGIN_ACCOUNT=""
ORIGIN_CONTAINER="orca-origin"
ORIGIN_RG=""
ORIGIN_KEY="${ORCA_AZUREBLOB_ACCOUNT_KEY:-}"
ORCA_AZURE_ENDPOINT=""

ASSUME_YES="false"
WATCH="true"
TRIGGER="true"

# Populated by ensure_cluster.
RESOURCE_GROUP=""
NODE_RESOURCE_GROUP=""
KUBECONFIG_PATH=""

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "${SCRIPT_DIR}/../.." && pwd)"

# ---------------------------------------------------------------------------
# Logging helpers. All of them write to stderr so stdout stays free for data.
# ---------------------------------------------------------------------------
log()  { echo -e ">> $*" >&2; }
info() { echo -e " $*" >&2; }
warn() { echo -e "!! $*" >&2; }
die()  { echo -e "!! $*" >&2; exit 1; }

# usage [exit-code]
# Prints the leading comment header of this script followed by the flag
# reference, then exits with the given code (default 0). Help that was asked
# for (code 0) goes to stdout so it can be piped; help shown because of an
# error goes to stderr. Both parts always go to the same stream.
usage() {
  local code="${1:-0}"
  local fd=1
  (( code == 0 )) || fd=2
  {
    awk '
      /^#!/ { next }
      /^#/  { sub(/^# ?/, ""); print; next }
      { exit }
    ' "${BASH_SOURCE[0]}"
    cat <<'EOF'

Flags:
  --subscription ID            Azure subscription ID (or env AZURE_SUBSCRIPTION_ID)
  --location LOC               Azure location (default: canadacentral)
  --cluster-name NAME          AKS cluster / resource group name (default: unbounded-nightly)
  --env-name NAME              GitHub Environment name (default: unbounded-nightly)
  --site-name NAME             unbounded site name (default: nightly)
  --repo OWNER/NAME            Target repository (default: Azure/unbounded)
  --manage-cni-plugin BOOL     Whether unbounded manages the CNI (default: true)

  --site-node-cidr CIDR        Site node CIDR (default: 10.1.0.0/16)
  --site-pod-cidr CIDR         Site pod CIDR (default: 100.125.0.0/16)
  --cluster-node-cidr CIDR     Override auto-detected cluster node CIDR
  --cluster-pod-cidr CIDR      Override auto-detected cluster pod CIDR

  --origin-account NAME        Orca origin storage account (default: ub<site>01)
  --origin-container NAME      Orca origin blob container (default: orca-origin)
  --origin-rg NAME             Resource group for the origin account (default: cluster RG)
  --origin-key KEY             Origin account key (default: env ORCA_AZUREBLOB_ACCOUNT_KEY or fetched via az)
  --orca-azure-endpoint URL    Azure blob endpoint (default: *.blob.core.windows.net)

  --system-pool-node-count N   forge system pool node count
  --gateway-pool-node-count N  forge gateway pool node count
  --system-pool-node-sku SKU   forge system pool VM SKU
  --gateway-pool-node-sku SKU  forge gateway pool VM SKU

  --no-watch                   Trigger the deploy run but do not wait for it
  --no-trigger                 Provision only; do not trigger the deploy run.
                               Use this to test the workflow from a branch before
                               it is on the default branch: provision, then push
                               the branch to fire its push-triggered run.
  --yes                        Skip confirmation prompts
  --help                       Show this help
EOF
  } >&"${fd}"
  exit "${code}"
}

# ---------------------------------------------------------------------------
# CIDR helpers (mirrors hack/scripts/aks-quickstart.sh).
# ---------------------------------------------------------------------------

# is_valid_cidr <cidr>
# Succeeds when the argument is a dotted-quad IPv4 address with every octet in
# 0-255, followed by "/" and a prefix length in 0-32. Numbers are forced to
# base 10 so a leading zero ("08") is not misread as an invalid octal literal.
is_valid_cidr() {
  local cidr="$1"
  [[ "$cidr" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})/([0-9]{1,2})$ ]] || return 1
  local i
  for i in 1 2 3 4; do
    (( 10#${BASH_REMATCH[i]} <= 255 )) || return 1
  done
  (( 10#${BASH_REMATCH[5]} <= 32 )) || return 1
  return 0
}

# ip4_to_int <a.b.c.d>
# Prints the address as a single 32-bit integer. The octet variables are local
# so the helper does not clobber globals, and are parsed as decimal.
ip4_to_int() {
  local IFS=.
  local a b c d
  read -r a b c d <<< "$1"
  echo $(( (10#${a:-0} << 24) | (10#${b:-0} << 16) | (10#${c:-0} << 8) | 10#${d:-0} ))
}

# subnet_contains_all <cidr> <newline-separated-ips>
# Succeeds only if every non-empty line of the second argument is an address
# inside the given CIDR.
subnet_contains_all() {
  local prefix="${1%/*}"
  local len=$(( 10#${1#*/} ))
  local mask=$(( 0xFFFFFFFF << (32 - len) & 0xFFFFFFFF ))
  local net_int
  net_int=$(ip4_to_int "$prefix")
  local network=$(( net_int & mask ))
  local ip ip_int
  while IFS= read -r ip; do
    [[ -z "$ip" ]] && continue
    ip_int=$(ip4_to_int "$ip")
    [[ $(( ip_int & mask )) -eq $network ]] || return 1
  done <<< "$2"
  return 0
}

# kubectl pinned to the kubeconfig resolved by ensure_cluster.
KCTL() { kubectl --kubeconfig "${KUBECONFIG_PATH}" "$@"; }

# ---------------------------------------------------------------------------
# Argument parsing.
# ---------------------------------------------------------------------------

# require_value <flag> <value>
# Dies when the value is missing or looks like the next flag.
require_value() {
  if [[ -z "${2:-}" || "${2:0:2}" == "--" ]]; then
    die "flag $1 requires a value"
  fi
}

while [[ $# -gt 0 ]]; do
  case "$1" in
    --subscription)            require_value "$1" "${2:-}"; SUBSCRIPTION="$2"; shift 2 ;;
    --location)                require_value "$1" "${2:-}"; LOCATION="$2"; shift 2 ;;
    --cluster-name)            require_value "$1" "${2:-}"; CLUSTER_NAME="$2"; shift 2 ;;
    --env-name)                require_value "$1" "${2:-}"; ENV_NAME="$2"; shift 2 ;;
    --site-name)               require_value "$1" "${2:-}"; SITE_NAME="$2"; shift 2 ;;
    --repo)                    require_value "$1" "${2:-}"; REPO="$2"; shift 2 ;;
    --manage-cni-plugin)       require_value "$1" "${2:-}"; MANAGE_CNI_PLUGIN="$2"; shift 2 ;;
    --site-node-cidr)          require_value "$1" "${2:-}"; SITE_NODE_CIDR="$2"; shift 2 ;;
    --site-pod-cidr)           require_value "$1" "${2:-}"; SITE_POD_CIDR="$2"; shift 2 ;;
    --cluster-node-cidr)       require_value "$1" "${2:-}"; CLUSTER_NODE_CIDR="$2"; shift 2 ;;
    --cluster-pod-cidr)        require_value "$1" "${2:-}"; CLUSTER_POD_CIDR="$2"; shift 2 ;;
    --origin-account)          require_value "$1" "${2:-}"; ORIGIN_ACCOUNT="$2"; shift 2 ;;
    --origin-container)        require_value "$1" "${2:-}"; ORIGIN_CONTAINER="$2"; shift 2 ;;
    --origin-rg)               require_value "$1" "${2:-}"; ORIGIN_RG="$2"; shift 2 ;;
    --origin-key)              require_value "$1" "${2:-}"; ORIGIN_KEY="$2"; shift 2 ;;
    --orca-azure-endpoint)     require_value "$1" "${2:-}"; ORCA_AZURE_ENDPOINT="$2"; shift 2 ;;
    --system-pool-node-count)  require_value "$1" "${2:-}"; SYSTEM_POOL_NODE_COUNT="$2"; shift 2 ;;
    --gateway-pool-node-count) require_value "$1" "${2:-}"; GATEWAY_POOL_NODE_COUNT="$2"; shift 2 ;;
    --system-pool-node-sku)    require_value "$1" "${2:-}"; SYSTEM_POOL_NODE_SKU="$2"; shift 2 ;;
    --gateway-pool-node-sku)   require_value "$1" "${2:-}"; GATEWAY_POOL_NODE_SKU="$2"; shift 2 ;;
    --no-watch)                WATCH="false"; shift ;;
    --no-trigger)              TRIGGER="false"; shift ;;
    --yes)                     ASSUME_YES="true"; shift ;;
    --help|-h)                 usage 0 ;;
    *)                         die "unknown argument: $1 (try --help)" ;;
  esac
done

# Derive defaults that depend on other flags.
[[ -n "${ORIGIN_ACCOUNT}" ]] || ORIGIN_ACCOUNT="ub${SITE_NAME}01"

# ---------------------------------------------------------------------------
# Preflight.
# ---------------------------------------------------------------------------
# Checks required tools and logins, resolves the subscription, validates the
# site CIDRs / CNI flag / storage account name, and (unless --no-trigger)
# confirms the nightly workflow is visible to gh on the target repository.
preflight() {
  log "Preflight checks"

  local tool
  for tool in az kubectl gh go openssl jq; do
    command -v "$tool" >/dev/null 2>&1 || die "'$tool' not found on PATH"
  done

  az account show >/dev/null 2>&1 || die "az is not logged in; run 'az login' first"
  gh auth status >/dev/null 2>&1 || die "gh is not authenticated; run 'gh auth login' first"

  [[ -n "${SUBSCRIPTION}" ]] || SUBSCRIPTION="$(az account show --query id -o tsv)"
  [[ -n "${SUBSCRIPTION}" ]] || die "--subscription is required (or set AZURE_SUBSCRIPTION_ID)"

  is_valid_cidr "${SITE_NODE_CIDR}" || die "invalid --site-node-cidr: ${SITE_NODE_CIDR}"
  is_valid_cidr "${SITE_POD_CIDR}" || die "invalid --site-pod-cidr: ${SITE_POD_CIDR}"
  case "${MANAGE_CNI_PLUGIN}" in true|false) ;; *) die "--manage-cni-plugin must be true|false" ;; esac

  # Storage account names: 3-24 lowercase alphanumeric.
  [[ "${ORIGIN_ACCOUNT}" =~ ^[a-z0-9]{3,24}$ ]] \
    || die "origin account '${ORIGIN_ACCOUNT}' must be 3-24 lowercase alphanumeric chars (override with --origin-account)"

  # The nightly workflow must exist on the default branch to be triggerable
  # via workflow_dispatch. Skipped when --no-trigger (e.g. pre-merge testing,
  # where the run is fired by pushing the branch instead).
  if [[ "${TRIGGER}" == "true" ]]; then
    if ! gh workflow view nightly.yaml --repo "${REPO}" >/dev/null 2>&1; then
      die "workflow 'nightly.yaml' not found on ${REPO}'s default branch; merge the PR that adds it, or pass --no-trigger to provision and test from a branch"
    fi
  fi

  info "subscription: ${SUBSCRIPTION}"
  info "cluster: ${CLUSTER_NAME} (${LOCATION})"
  info "environment: ${ENV_NAME} / site ${SITE_NAME}"
  info "origin: ${ORIGIN_ACCOUNT}/${ORIGIN_CONTAINER}"
}

# Ask for confirmation unless --yes was given. A closed or empty stdin counts
# as "no": without the `|| true`, `read` hitting EOF would abort the script
# through `set -e` before the explicit "aborted" message.
confirm() {
  [[ "${ASSUME_YES}" == "true" ]] && return 0
  local reply=""
  echo >&2
  read -r -p "Proceed with provisioning ${ENV_NAME}? [y/N] " reply || true
  case "${reply}" in y|Y|yes|YES) ;; *) die "aborted" ;; esac
}

# ---------------------------------------------------------------------------
# Build forge.
# ---------------------------------------------------------------------------
build_forge() {
  log "Building forge (go build -o bin/forge ./hack/cmd/forge)"
  ( cd "${REPO_ROOT}" && go build -o bin/forge ./hack/cmd/forge )
}

# ---------------------------------------------------------------------------
# Create (or reuse) the AKS cluster. Sets RESOURCE_GROUP,
# NODE_RESOURCE_GROUP, KUBECONFIG_PATH.
# ---------------------------------------------------------------------------
ensure_cluster() {
  if az aks show -g "${CLUSTER_NAME}" -n "${CLUSTER_NAME}" --subscription "${SUBSCRIPTION}" >/dev/null 2>&1; then
    # Reuse path: the cluster is looked up in a resource group named after it.
    log "Cluster ${CLUSTER_NAME} already exists; reusing it"
    RESOURCE_GROUP="${CLUSTER_NAME}"
    NODE_RESOURCE_GROUP="$(az aks show -g "${CLUSTER_NAME}" -n "${CLUSTER_NAME}" \
      --subscription "${SUBSCRIPTION}" --query nodeResourceGroup -o tsv)"
    KUBECONFIG_PATH="${HOME}/.unbounded-forge/${CLUSTER_NAME}/kubeconfig"
    mkdir -p "$(dirname "${KUBECONFIG_PATH}")"
    az aks get-credentials -g "${CLUSTER_NAME}" -n "${CLUSTER_NAME}" \
      --subscription "${SUBSCRIPTION}" --file "${KUBECONFIG_PATH}" --overwrite-existing >/dev/null
  else
    log "Creating AKS cluster ${CLUSTER_NAME} with forge"
    local args=(cluster create --name "${CLUSTER_NAME}" --location "${LOCATION}" --subscription "${SUBSCRIPTION}")
    # Sizing flags are passed through only when the operator set them.
    [[ -n "${SYSTEM_POOL_NODE_COUNT}" ]] && args+=(--system-pool-node-count "${SYSTEM_POOL_NODE_COUNT}")
    [[ -n "${GATEWAY_POOL_NODE_COUNT}" ]] && args+=(--gateway-pool-node-count "${GATEWAY_POOL_NODE_COUNT}")
    [[ -n "${SYSTEM_POOL_NODE_SKU}" ]] && args+=(--system-pool-node-sku "${SYSTEM_POOL_NODE_SKU}")
    [[ -n "${GATEWAY_POOL_NODE_SKU}" ]] && args+=(--gateway-pool-node-sku "${GATEWAY_POOL_NODE_SKU}")

    # forge prints progress logs to stderr and the result JSON to stdout.
    local out
    out="$(AZURE_AUTH_CHAIN_ORDER=CLI "${REPO_ROOT}/bin/forge" "${args[@]}")"

    RESOURCE_GROUP="$(jq -r '.ResourceGroup' <<<"${out}")"
    NODE_RESOURCE_GROUP="$(jq -r '.NodePoolsResourceGroup' <<<"${out}")"
    KUBECONFIG_PATH="$(jq -r '.KubeconfigPath' <<<"${out}")"
  fi

  # jq -r prints the literal "null" for a missing field, so reject that too.
  [[ -n "${RESOURCE_GROUP}" && "${RESOURCE_GROUP}" != "null" ]] || die "could not determine cluster resource group"
  [[ -n "${NODE_RESOURCE_GROUP}" && "${NODE_RESOURCE_GROUP}" != "null" ]] || die "could not determine node resource group"
  [[ -f "${KUBECONFIG_PATH}" ]] || die "kubeconfig not found at ${KUBECONFIG_PATH}"

  # The origin storage account lives in the cluster RG unless --origin-rg was given.
  [[ -z "${ORIGIN_RG}" ]] && ORIGIN_RG="${RESOURCE_GROUP}"

  info "resource group: ${RESOURCE_GROUP}"
  info "node resource group: ${NODE_RESOURCE_GROUP}"
  info "kubeconfig: ${KUBECONFIG_PATH}"
}

# ---------------------------------------------------------------------------
# Detect cluster node/pod CIDRs from AKS (skipped if both overridden).
# Mirrors hack/scripts/aks-quickstart.sh:detect_cluster_cidrs.
#
# Overrides are validated even when detection is skipped. The pod CIDR falls
# back to DEFAULT_POD_CIDR when AKS reports none. The node CIDR is the first
# VNet subnet in the node resource group that contains every node InternalIP.
# ---------------------------------------------------------------------------
detect_cluster_cidrs() {
  if [[ -n "${CLUSTER_POD_CIDR}" && -n "${CLUSTER_NODE_CIDR}" ]]; then
    is_valid_cidr "${CLUSTER_NODE_CIDR}" || die "invalid --cluster-node-cidr: ${CLUSTER_NODE_CIDR}"
    is_valid_cidr "${CLUSTER_POD_CIDR}" || die "invalid --cluster-pod-cidr: ${CLUSTER_POD_CIDR}"
    info "using provided cluster CIDRs (node=${CLUSTER_NODE_CIDR}, pod=${CLUSTER_POD_CIDR})"
    return 0
  fi

  log "Detecting cluster CIDRs from AKS"

  # Prefer the resource group resolved by ensure_cluster; fall back to the
  # cluster name when this function is used before it has been set.
  local aks_rg="${RESOURCE_GROUP:-${CLUSTER_NAME}}"

  if [[ -z "${CLUSTER_POD_CIDR}" ]]; then
    CLUSTER_POD_CIDR="$(az aks show --subscription "${SUBSCRIPTION}" \
      --resource-group "${aks_rg}" --name "${CLUSTER_NAME}" \
      --query "networkProfile.podCidr" -o tsv)"
    [[ "${CLUSTER_POD_CIDR}" == "None" ]] && CLUSTER_POD_CIDR=""
    if [[ -z "${CLUSTER_POD_CIDR}" ]]; then
      CLUSTER_POD_CIDR="${DEFAULT_POD_CIDR}"
      info "no pod CIDR in AKS network profile (expected with BYO CNI); using default ${CLUSTER_POD_CIDR}"
    fi
  fi

  if [[ -z "${CLUSTER_NODE_CIDR}" ]]; then
    # Nodes register with an InternalIP even before the CNI makes them Ready.
    local node_ips=""
    local elapsed=0
    while (( elapsed < 300 )); do
      node_ips="$(KCTL get nodes \
        -o jsonpath='{range .items[?(@.spec.providerID)]}{range .status.addresses[?(@.type=="InternalIP")]}{.address}{"\n"}{end}{end}' \
        2>/dev/null | grep -v '^$' || true)"
      [[ -n "${node_ips}" ]] && break
      info "waiting for node IPs to appear..."
      sleep 10
      (( elapsed += 10 ))
    done
    [[ -n "${node_ips}" ]] || die "could not retrieve node internal IPs"

    # List the prefixes first, then scan them in this shell. Scanning inside a
    # `az | while ...` pipeline under $(...) made the whole assignment fail
    # (and `set -e` kill the script silently) whenever the last prefix did not
    # match, so the explicit error below was never reached.
    local prefixes prefix
    prefixes="$(az network vnet list \
      --subscription "${SUBSCRIPTION}" --resource-group "${NODE_RESOURCE_GROUP}" \
      --query "[].subnets[].addressPrefix" -o tsv)"
    while IFS= read -r prefix; do
      # Skip blanks and anything that is not a CIDR (e.g. a "None" placeholder).
      is_valid_cidr "${prefix}" || continue
      if subnet_contains_all "${prefix}" "${node_ips}"; then
        CLUSTER_NODE_CIDR="${prefix}"
        break
      fi
    done <<< "${prefixes}"
    [[ -n "${CLUSTER_NODE_CIDR}" ]] || die "could not find a VNet subnet containing all node IPs"
  fi

  is_valid_cidr "${CLUSTER_NODE_CIDR}" || die "detected invalid cluster node CIDR: ${CLUSTER_NODE_CIDR}"
  is_valid_cidr "${CLUSTER_POD_CIDR}" || die "invalid cluster pod CIDR: ${CLUSTER_POD_CIDR}"
  info "cluster node CIDR: ${CLUSTER_NODE_CIDR}"
  info "cluster pod CIDR: ${CLUSTER_POD_CIDR}"
}

# ---------------------------------------------------------------------------
# Create the Orca origin storage account + container; resolve its key.
# ---------------------------------------------------------------------------
# Reuses the storage account when `az storage account show` finds it in
# ORIGIN_RG, otherwise creates it. Fetches the first account key unless
# ORIGIN_KEY was already supplied, then creates the blob container.
ensure_origin() {
  log "Ensuring Orca origin storage account ${ORIGIN_ACCOUNT} (rg ${ORIGIN_RG})"

  if az storage account show -n "${ORIGIN_ACCOUNT}" -g "${ORIGIN_RG}" \
    --subscription "${SUBSCRIPTION}" >/dev/null 2>&1; then
    info "storage account ${ORIGIN_ACCOUNT} already exists"
  else
    az storage account create \
      --name "${ORIGIN_ACCOUNT}" --resource-group "${ORIGIN_RG}" \
      --location "${LOCATION}" --subscription "${SUBSCRIPTION}" \
      --sku Standard_LRS --kind StorageV2 --min-tls-version TLS1_2 \
      --allow-blob-public-access false --only-show-errors >/dev/null
    info "created storage account ${ORIGIN_ACCOUNT}"
  fi

  if [[ -z "${ORIGIN_KEY}" ]]; then
    ORIGIN_KEY="$(az storage account keys list \
      --account-name "${ORIGIN_ACCOUNT}" --resource-group "${ORIGIN_RG}" \
      --subscription "${SUBSCRIPTION}" --query "[0].value" -o tsv)"
  fi
  [[ -n "${ORIGIN_KEY}" ]] || die "could not resolve storage account key for ${ORIGIN_ACCOUNT}"

  log "Ensuring blob container ${ORIGIN_CONTAINER}"
  # Pass the key through AZURE_STORAGE_KEY (the environment variable az reads
  # for --account-key) rather than as an argument, so the secret does not
  # appear in the process list.
  AZURE_STORAGE_KEY="${ORIGIN_KEY}" az storage container create \
    --name "${ORIGIN_CONTAINER}" \
    --account-name "${ORIGIN_ACCOUNT}" \
    --only-show-errors >/dev/null
}

# ---------------------------------------------------------------------------
# Configure the GitHub Environment.
# ---------------------------------------------------------------------------
# Delegates to setup-deploy-environment.sh (next to this script), forwarding
# the values resolved so far. The endpoint flag is forwarded only when set.
configure_environment() {
  log "Configuring GitHub Environment ${ENV_NAME}"

  local env_args=()
  env_args+=(--env-name "${ENV_NAME}")
  env_args+=(--kubeconfig "${KUBECONFIG_PATH}")
  env_args+=(--site-name "${SITE_NAME}")
  env_args+=(--cluster-node-cidr "${CLUSTER_NODE_CIDR}")
  env_args+=(--cluster-pod-cidr "${CLUSTER_POD_CIDR}")
  env_args+=(--site-node-cidr "${SITE_NODE_CIDR}")
  env_args+=(--site-pod-cidr "${SITE_POD_CIDR}")
  env_args+=(--manage-cni-plugin "${MANAGE_CNI_PLUGIN}")
  env_args+=(--channel nightly)
  env_args+=(--orca-azure-account "${ORIGIN_ACCOUNT}")
  env_args+=(--orca-azure-container "${ORIGIN_CONTAINER}")
  env_args+=(--repo "${REPO}")
  env_args+=(--yes)
  if [[ -n "${ORCA_AZURE_ENDPOINT}" ]]; then
    env_args+=(--orca-azure-endpoint "${ORCA_AZURE_ENDPOINT}")
  fi

  "${SCRIPT_DIR}/setup-deploy-environment.sh" "${env_args[@]}"
}

# ---------------------------------------------------------------------------
# Create the unbounded-kube namespace + orca-credentials Secret on cluster.
# ---------------------------------------------------------------------------
ensure_secret() {
  log "Ensuring unbounded-kube namespace and orca-credentials Secret"

  # Create the namespace only when the lookup fails.
  if ! KCTL get namespace unbounded-kube >/dev/null 2>&1; then
    KCTL create namespace unbounded-kube
  fi

  # An existing Secret is never rewritten: create-credentials-secret.sh mints
  # new Garage S3 keys each time it runs, and replacing them after Garage has
  # already granted the bucket to the earlier key leaves Orca holding a key
  # Garage rejects (403). Rotation is done by deleting the Secret first.
  if ! KCTL -n unbounded-kube get secret orca-credentials >/dev/null 2>&1; then
    KUBECONFIG="${KUBECONFIG_PATH}" "${REPO_ROOT}/hack/orca/create-credentials-secret.sh" \
      --azure-account-key "${ORIGIN_KEY}"
    return
  fi

  info "orca-credentials Secret already exists; leaving its keys unchanged"
  return 0
}

# ---------------------------------------------------------------------------
# Trigger the nightly workflow (force_init) and optionally watch it.
#
# With --no-trigger we provision only and tell the operator how to fire the
# run by pushing the branch (used to test the workflow before it is on the
# default branch, where workflow_dispatch is not available).
# ---------------------------------------------------------------------------

# Print the databaseId of the newest workflow_dispatch run of nightly.yaml,
# or 0 when there is none or gh fails. Filtering by event keeps a scheduled or
# push-triggered run from being mistaken for the run dispatched below.
_latest_dispatch_run_id() {
  gh run list --repo "${REPO}" --workflow nightly.yaml --event workflow_dispatch \
    --limit 1 --json databaseId --jq '.[0].databaseId // 0' 2>/dev/null || echo 0
}

trigger_deploy() {
  if [[ "${TRIGGER}" != "true" ]]; then
    local branch
    branch="$(git -C "${REPO_ROOT}" rev-parse --abbrev-ref HEAD 2>/dev/null || true)"
    # No checkout, or a detached HEAD, gives nothing pushable by name; show a
    # placeholder instead of an empty or literal "HEAD" argument.
    if [[ -z "${branch}" || "${branch}" == "HEAD" ]]; then
      branch='<branch>'
    fi
    log "Skipping deploy trigger (--no-trigger)"
    info "Provisioning is complete. To run the workflow from this branch, push it:"
    info " git push origin ${branch}"
    info "The push-triggered run does the build, init deploy, Orca deploy, and smoke."
    info "Watch it: gh run watch \$(gh run list --repo ${REPO} --workflow nightly.yaml --limit 1 --json databaseId --jq '.[0].databaseId') --repo ${REPO}"
    return 0
  fi

  log "Triggering nightly workflow (force_init=true)"

  # Record the newest run id before dispatch so we can identify the new one.
  local before
  before="$(_latest_dispatch_run_id)"

  gh workflow run nightly.yaml --repo "${REPO}" -f force_init=true

  log "Waiting for the run to register..."
  # Poll for up to 60 seconds until a run id different from `before` appears.
  local run_id="" elapsed=0
  while (( elapsed < 60 )); do
    run_id="$(_latest_dispatch_run_id)"
    [[ -n "${run_id}" && "${run_id}" != "0" && "${run_id}" != "${before}" ]] && break
    sleep 3
    (( elapsed += 3 ))
  done

  if [[ -z "${run_id}" || "${run_id}" == "0" || "${run_id}" == "${before}" ]]; then
    warn "could not identify the new run; check: gh run list --repo ${REPO} --workflow nightly.yaml"
    return 0
  fi

  local run_url
  run_url="$(gh run view "${run_id}" --repo "${REPO}" --json url --jq '.url' 2>/dev/null || true)"
  info "run: ${run_url:-${run_id}}"

  if [[ "${WATCH}" == "true" ]]; then
    log "Watching run ${run_id} to completion"
    # --exit-status makes a failed run fail this script (via set -e).
    gh run watch "${run_id}" --repo "${REPO}" --exit-status
  fi
}

# ---------------------------------------------------------------------------
# Print a verification summary. Every check is best-effort: failures and
# kubectl's stderr are suppressed so the summary never fails the script.
# ---------------------------------------------------------------------------
verify() {
  log "Cluster state"
  KCTL -n unbounded-net rollout status deploy/unbounded-net-controller --timeout=60s 2>/dev/null || true
  KCTL -n unbounded-net rollout status ds/unbounded-net-node --timeout=60s 2>/dev/null || true
  KCTL -n unbounded-kube rollout status deploy/machina-controller --timeout=60s 2>/dev/null || true
  KCTL -n unbounded-kube get deploy orca garage 2>/dev/null || true
}

# ---------------------------------------------------------------------------
# Main.
# ---------------------------------------------------------------------------
main() {
  preflight
  confirm
  build_forge
  ensure_cluster
  detect_cluster_cidrs
  ensure_origin
  configure_environment
  ensure_secret
  trigger_deploy

  # Cluster state is only meaningful once a deploy has actually completed.
  if [[ "${TRIGGER}" == "true" && "${WATCH}" == "true" ]]; then
    verify
  fi

  # Report the environment that was actually configured (--env-name may
  # differ from the default).
  if [[ "${TRIGGER}" == "true" ]]; then
    log "Done. ${ENV_NAME} is provisioned and the first deploy was triggered."
    info "Subsequent runs deploy automatically every morning at 06:00 UTC."
  else
    log "Done. ${ENV_NAME} is provisioned (deploy not triggered)."
  fi
}

main "$@"
diff --git a/internal/orca/manifests/manifests_test.go b/internal/orca/manifests/manifests_test.go
index 3f929a72..45b60df6 100644
--- a/internal/orca/manifests/manifests_test.go
+++ b/internal/orca/manifests/manifests_test.go
@@ -124,27 +124,27 @@ func TestDevManifestsRender(t *testing.T) {
 	)
 }
 
-// TestStableGarageManifestsRender renders the integration-cluster
-// (unbounded-stable) Garage manifest under hack/orca/stable/. Unlike
+// TestIntegrationGarageManifestsRender renders the integration-cluster
+// Garage manifest under hack/orca/integration/. Unlike
 // the dev Garage (deploy/orca/dev/), this one is PVC-backed and serves
 // only as Orca's cachestore, so the required-kind set includes a
 // PersistentVolumeClaim. The Service must be ClusterIP (Orca reaches
 // Garage in-cluster via its Service DNS).
-func TestStableGarageManifestsRender(t *testing.T) {
+func TestIntegrationGarageManifestsRender(t *testing.T) {
 	t.Parallel()
 
 	root := repoRoot(t)
-	templatesDir := filepath.Join(root, "hack", "orca", "stable")
+	templatesDir := filepath.Join(root, "hack", "orca", "integration")
 
-	renderAndValidate(t, templatesDir, stableGarageData(),
+	renderAndValidate(t, templatesDir, integrationGarageData(),
 		expectKindsAtLeastOnce("Deployment", "Service", "ConfigMap", "PersistentVolumeClaim"),
 		expectAllServicesClusterIP(),
 	)
 }
 
-// stableGarageData supplies realistic template variables for the
-// hack/orca/stable Garage manifest.
-func stableGarageData() map[string]string {
+// integrationGarageData supplies realistic template variables for the
+// hack/orca/integration Garage manifest.
+func integrationGarageData() map[string]string {
 	return map[string]string{
 		"Namespace": "unbounded-kube",
 		"CachestoreRegion": "us-east-1",