From 0edf27cb3fbcf40719abb7a7fcf27c9fed6b22ba Mon Sep 17 00:00:00 2001 From: Ravi Shankar Date: Thu, 25 Jun 2026 20:49:24 -0700 Subject: [PATCH] test(e2e): e2e k8s environment tests using chainsaw Signed-off-by: Ravi Shankar --- .claude/CLAUDE.md | 16 + .github/workflows/e2e.yml | 100 ++++++ AGENTS.md | 16 + Makefile | 47 ++- docs/engines/k8s.md | 46 +++ tests/chainsaw/README.md | 149 +++++++++ tests/chainsaw/chainsaw-config.yaml | 14 + .../k8s/label-application/chainsaw-test.yaml | 156 ++++++++++ .../k8s/label-application/values.yaml | 29 ++ .../k8s/label-truncation/chainsaw-test.yaml | 153 ++++++++++ .../chainsaw/k8s/label-truncation/values.yaml | 29 ++ tests/chainsaw/kind-config.yaml | 8 + .../block-complement/chainsaw-test.yaml | 252 +++++++++++++++ .../slinky/block-complement/values.yaml | 41 +++ .../slinky/dra-provider/chainsaw-test.yaml | 226 ++++++++++++++ .../chainsaw/slinky/dra-provider/values.yaml | 25 ++ .../slinky/dynamic-nodes/chainsaw-test.yaml | 288 ++++++++++++++++++ .../chainsaw/slinky/dynamic-nodes/values.yaml | 43 +++ .../slinky/tree-topology/chainsaw-test.yaml | 123 ++++++++ .../chainsaw/slinky/tree-topology/values.yaml | 37 +++ 20 files changed, 1796 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/e2e.yml create mode 100644 tests/chainsaw/README.md create mode 100644 tests/chainsaw/chainsaw-config.yaml create mode 100644 tests/chainsaw/k8s/label-application/chainsaw-test.yaml create mode 100644 tests/chainsaw/k8s/label-application/values.yaml create mode 100644 tests/chainsaw/k8s/label-truncation/chainsaw-test.yaml create mode 100644 tests/chainsaw/k8s/label-truncation/values.yaml create mode 100644 tests/chainsaw/kind-config.yaml create mode 100644 tests/chainsaw/slinky/block-complement/chainsaw-test.yaml create mode 100644 tests/chainsaw/slinky/block-complement/values.yaml create mode 100644 tests/chainsaw/slinky/dra-provider/chainsaw-test.yaml create mode 100644 tests/chainsaw/slinky/dra-provider/values.yaml create mode 
100644 tests/chainsaw/slinky/dynamic-nodes/chainsaw-test.yaml create mode 100644 tests/chainsaw/slinky/dynamic-nodes/values.yaml create mode 100644 tests/chainsaw/slinky/tree-topology/chainsaw-test.yaml create mode 100644 tests/chainsaw/slinky/tree-topology/values.yaml diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 9b39401b..9bc1fb00 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -42,6 +42,7 @@ internal/ # Shared utilities not part of the public API charts/topograph/ # Helm chart (with node-data-broker subchart); tests/ holds the helm-unittest suites + snapshots docs/ # Public-facing docs — overview.md, architecture.md, api.md + providers/, engines/, reference/ subdirectories tests/models/ # YAML simulation fixtures +tests/chainsaw/ # Chainsaw E2E test suites (label-application, label-truncation, node-observer, slinky) config/ # Sample topograph-config.yaml scripts/ # Build scripts (deb, rpm, SSL, clean) localdev/ # Developer-local workspace — not tracked; personal scratch files @@ -94,6 +95,20 @@ make coverage # human-readable per-package summary Run `make qualify` before pushing. The individual targets are available if you want to run a single check during iteration. Run `make chart-test` when you change `charts/topograph/` or its subcharts; CI runs it on every workflow trigger. +### E2E tests (Chainsaw) + +Chainsaw conformance tests live in `tests/chainsaw/` and exercise the full Helm deploy → generate → assert cycle against a real cluster. + +```bash +make e2e-local # build image, create kind cluster, run all suites, delete cluster +make kind-load KIND_CLUSTER= # load image into an existing kind cluster (run before make e2e) +make e2e # run suites against current KUBECONFIG context +``` + +`make e2e` uses `E2E_IMAGE_TAG` (defaults to the short commit SHA) as the image tag. 
For a local kind cluster, run `make image-build && make kind-load KIND_CLUSTER=` before each `make e2e` — the tag changes with every commit, so both steps are needed after any new commit. Prerequisites: `chainsaw`, `kind`, `helm`, `kubectl`, `docker`. See `tests/chainsaw/README.md` for details. + +These tests are triggered manually via `.github/workflows/e2e.yml` (`workflow_dispatch`). Run them before merging changes to the Helm chart, Node Observer, or engine output. + ### Coverage policy From `codecov.yml`: @@ -109,6 +124,7 @@ Coverage checks run on pull requests. A drop below target with no matching uplif - `.github/workflows/docker.yml` — container image build (manual trigger) - `.github/workflows/docker-ib.yml` — InfiniBand-variant container (manual trigger) - `.github/workflows/helm-release.yaml` — Helm chart release (manual trigger) +- `.github/workflows/e2e.yml` — Chainsaw E2E suite against a kind cluster (manual trigger via `workflow_dispatch`) ### Deployment surfaces diff --git a/.github/workflows/e2e.yml b/.github/workflows/e2e.yml new file mode 100644 index 00000000..0eb89641 --- /dev/null +++ b/.github/workflows/e2e.yml @@ -0,0 +1,100 @@ +# Copyright 2026 NVIDIA CORPORATION +# SPDX-License-Identifier: Apache-2.0 + +name: E2E + +on: + workflow_dispatch: + inputs: + chainsaw_version: + description: "Chainsaw version to install (e.g. 
v0.2.12)" + required: false + default: "latest" + +env: + KIND_CLUSTER: topograph-e2e + IMAGE_REPO: ghcr.io/nvidia/topograph + CHAINSAW_VERSION: ${{ github.event.inputs.chainsaw_version || 'latest' }} + +jobs: + e2e: + name: Chainsaw E2E + runs-on: ubuntu-latest + permissions: + contents: read + + steps: + - uses: actions/checkout@v5 + + - name: Set up Go + uses: actions/setup-go@v6 + with: + go-version: '1.25.9' + + - name: Install kind + run: go install sigs.k8s.io/kind@latest + + - name: Install Chainsaw + run: | + if [ "$CHAINSAW_VERSION" = "latest" ]; then + TAG=$(curl -s https://api.github.com/repos/kyverno/chainsaw/releases/latest \ + | grep '"tag_name"' | cut -d'"' -f4) + else + TAG="$CHAINSAW_VERSION" + fi + echo "Installing Chainsaw $TAG" + BASE_URL="https://github.com/kyverno/chainsaw/releases/download/${TAG}" + curl -fsSL "${BASE_URL}/chainsaw_linux_amd64.tar.gz" -o chainsaw.tar.gz + curl -fsSL "${BASE_URL}/chainsaw_checksums.txt" -o chainsaw_checksums.txt + grep "chainsaw_linux_amd64.tar.gz" chainsaw_checksums.txt | sha256sum -c - + tar xz -f chainsaw.tar.gz chainsaw + sudo mv chainsaw /usr/local/bin/ + rm -f chainsaw.tar.gz chainsaw_checksums.txt + chainsaw version + + - name: Create kind cluster + run: | + kind create cluster \ + --name "$KIND_CLUSTER" \ + --config tests/chainsaw/kind-config.yaml \ + --wait 120s + + - name: Build Linux/amd64 image + run: make build-linux-amd64 + + - name: Build container image + env: + GOOS: linux + GOARCH: amd64 + run: | + # Use the short commit SHA as the image tag: always a valid Docker tag, + # works regardless of branch naming conventions. 
+ IMAGE_TAG=$(git rev-parse --short HEAD) + make image-build IMAGE_TAG="$IMAGE_TAG" + echo "IMAGE_TAG=$IMAGE_TAG" >> "$GITHUB_ENV" + + - name: Load image into kind + run: | + kind load docker-image "${IMAGE_REPO}:${IMAGE_TAG}" \ + --name "$KIND_CLUSTER" + + - name: Run E2E tests + env: + TOPOGRAPH_IMAGE_REPO: ${{ env.IMAGE_REPO }} + TOPOGRAPH_IMAGE_PULL_POLICY: Never + run: | + make e2e E2E_IMAGE_TAG="$IMAGE_TAG" + + - name: Collect diagnostic logs on failure + if: failure() + run: | + echo "=== kind nodes ===" + kubectl get nodes -o wide + echo "=== all pods ===" + kubectl get pods -A -o wide + echo "=== recent events ===" + kubectl get events -A --sort-by='.lastTimestamp' | tail -50 + + - name: Delete kind cluster + if: always() + run: kind delete cluster --name "$KIND_CLUSTER" diff --git a/AGENTS.md b/AGENTS.md index 43270e0b..c5cef641 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -42,6 +42,7 @@ internal/ # Shared utilities not part of the public API charts/topograph/ # Helm chart (with node-data-broker subchart); tests/ holds the helm-unittest suites + snapshots docs/ # Public-facing docs — overview.md, architecture.md, api.md + providers/, engines/, reference/ subdirectories tests/models/ # YAML simulation fixtures +tests/chainsaw/ # Chainsaw E2E test suites (label-application, label-truncation, node-observer, slinky) config/ # Sample topograph-config.yaml scripts/ # Build scripts (deb, rpm, SSL, clean) localdev/ # Developer-local workspace — not tracked; personal scratch files @@ -94,6 +95,20 @@ make coverage # human-readable per-package summary Run `make qualify` before pushing. The individual targets are available if you want to run a single check during iteration. Run `make chart-test` when you change `charts/topograph/` or its subcharts; CI runs it on every workflow trigger. +### E2E tests (Chainsaw) + +Chainsaw conformance tests live in `tests/chainsaw/` and exercise the full Helm deploy → generate → assert cycle against a real cluster. 
+ +```bash +make e2e-local # build image, create kind cluster, run all suites, delete cluster +make kind-load KIND_CLUSTER= # load image into an existing kind cluster (run before make e2e) +make e2e # run suites against current KUBECONFIG context +``` + +`make e2e` uses `E2E_IMAGE_TAG` (defaults to the short commit SHA) as the image tag. For a local kind cluster, run `make image-build && make kind-load KIND_CLUSTER=` before each `make e2e` — the tag changes with every commit, so both steps are needed after any new commit. Prerequisites: `chainsaw`, `kind`, `helm`, `kubectl`, `docker`. See `tests/chainsaw/README.md` for details. + +These tests are triggered manually via `.github/workflows/e2e.yml` (`workflow_dispatch`). Run them before merging changes to the Helm chart, Node Observer, or engine output. + ### Coverage policy From `codecov.yml`: @@ -109,6 +124,7 @@ Coverage checks run on pull requests. A drop below target with no matching uplif - `.github/workflows/docker.yml` — container image build (manual trigger) - `.github/workflows/docker-ib.yml` — InfiniBand-variant container (manual trigger) - `.github/workflows/helm-release.yaml` — Helm chart release (manual trigger) +- `.github/workflows/e2e.yml` — Chainsaw E2E suite against a kind cluster (manual trigger via `workflow_dispatch`) ### Deployment surfaces diff --git a/Makefile b/Makefile index a0cf09c1..0196ff1e 100644 --- a/Makefile +++ b/Makefile @@ -23,7 +23,7 @@ OUTPUT_DIR := ./bin IMAGE_REPO ?=ghcr.io/nvidia/topograph GIT_REF ?=$(shell git rev-parse --abbrev-ref HEAD) -IMAGE_TAG ?=$(GIT_REF) +IMAGE_TAG ?=$(shell git rev-parse --short HEAD) .PHONY: build build: @@ -102,7 +102,7 @@ coverage: test .PHONY: image-build image-build: - $(DOCKER_BIN) build --build-arg TARGETOS=$(GOOS) --build-arg TARGETARCH=$(GOARCH) -t $(IMAGE_REPO):$(IMAGE_TAG) -f ./Dockerfile . + $(DOCKER_BIN) build --build-arg TARGETOS=linux --build-arg TARGETARCH=$(GOARCH) -t $(IMAGE_REPO):$(IMAGE_TAG) -f ./Dockerfile . 
.PHONY: image-push image-push: image-build @@ -115,6 +115,49 @@ docker-buildx: $(DOCKER_BIN) buildx build --platform $(PLATFORMS) -t $(IMAGE_REPO):$(IMAGE_TAG) -f ./Dockerfile --push . - $(DOCKER_BIN) buildx rm topograph-builder +CHAINSAW_BIN ?= chainsaw +KIND_CLUSTER ?= topograph-e2e +E2E_IMAGE_TAG ?= $(IMAGE_TAG) + +# Check that chainsaw is installed; print install hint if not. +.PHONY: chainsaw-install +chainsaw-install: + @which $(CHAINSAW_BIN) >/dev/null 2>&1 || \ + (echo "chainsaw not found — install from https://kyverno.github.io/chainsaw/latest/quick-start/install/"; exit 1) + +# Load the locally-built image into an existing kind cluster with the correct +# E2E_IMAGE_TAG. Use this before running make e2e against a local kind cluster: +# make kind-load KIND_CLUSTER=topograph-test && make e2e +.PHONY: kind-load +kind-load: + kind load docker-image $(IMAGE_REPO):$(E2E_IMAGE_TAG) --name $(KIND_CLUSTER) + +# Run all Chainsaw E2E suites against the current KUBECONFIG context. +# For a pre-pushed registry image: override the Make variables IMAGE_REPO and E2E_IMAGE_TAG (the recipe sets TOPOGRAPH_IMAGE_REPO/TOPOGRAPH_IMAGE_TAG from them, so exporting those directly has no effect). +# For a local kind cluster: run "make kind-load KIND_CLUSTER=<name>" first. +.PHONY: e2e +e2e: chainsaw-install + TOPOGRAPH_IMAGE_REPO=$(IMAGE_REPO) \ + TOPOGRAPH_IMAGE_TAG=$(E2E_IMAGE_TAG) \ + $(CHAINSAW_BIN) test --test-dir tests/chainsaw + +# Build the image, create a 4-worker kind cluster, load the image, run all +# Chainsaw suites, and destroy the cluster. Requires kind and chainsaw.
+.PHONY: e2e-local +e2e-local: chainsaw-install image-build + kind create cluster --name $(KIND_CLUSTER) \ + --config tests/chainsaw/kind-config.yaml --wait 120s \ + || kind get clusters | grep -q "^$(KIND_CLUSTER)$$" + kind load docker-image $(IMAGE_REPO):$(E2E_IMAGE_TAG) --name $(KIND_CLUSTER) + kind export kubeconfig --name $(KIND_CLUSTER) && \ + TOPOGRAPH_IMAGE_REPO=$(IMAGE_REPO) \ + TOPOGRAPH_IMAGE_TAG=$(E2E_IMAGE_TAG) \ + TOPOGRAPH_IMAGE_PULL_POLICY=Never \ + $(CHAINSAW_BIN) test --test-dir tests/chainsaw; \ + E2E_STATUS=$$?; \ + kind delete cluster --name $(KIND_CLUSTER); \ + exit $$E2E_STATUS + .PHONY: ssl ssl: SSL_DIR=ssl ./scripts/configure-ssl.sh diff --git a/docs/engines/k8s.md b/docs/engines/k8s.md index dc8a1915..e509d8c5 100644 --- a/docs/engines/k8s.md +++ b/docs/engines/k8s.md @@ -344,6 +344,52 @@ tests: enabled: false ``` +### Conformance testing with Chainsaw + +`helm test` verifies that a deployed instance is healthy. To verify that the engine actually **applies correct topology labels** to nodes, use the Chainsaw E2E suite in `tests/chainsaw/`. + +[Chainsaw](https://kyverno.github.io/chainsaw/) is Kyverno's declarative E2E framework. Each suite drives `apply → wait → assert → cleanup` against a real cluster using the built-in **test provider** — no cloud credentials required. + +#### Test suites + +| Suite | What it checks | +|---|---| +| `k8s/label-application` | `leaf`, `spine`, and `accelerator` labels applied to nodes after generation | +| `k8s/label-truncation` | Switch names >63 chars replaced with an FNV64a hash (valid label value) | +| `slinky/tree-topology` | Slinky engine writes correct `topology.conf` (tree topology) into a ConfigMap | +| `slinky/dra-provider` | DRA provider discovers NVLink clique topology; Slinky engine writes correct `topology.conf` (block topology) into a ConfigMap | + +#### How suites map topology to nodes + +Each suite ships a `topology-model.yaml` in its directory.
The suite creates +fake K8s Node objects whose names match the node IDs in that file, loads the +file into a ConfigMap, and mounts it at `/etc/topograph/models/` inside the +pod. The `/v1/generate` request passes `modelFileName` pointing at the mounted +file. No node annotations are required. + +#### Running locally + +```bash +# Prerequisites: chainsaw, kind, helm, kubectl, docker + +# Full lifecycle — build, create cluster, run all suites, delete cluster: +make e2e-local + +# Against an existing local kind cluster (repeat after each commit): +make image-build # rebuild with the current commit SHA tag +make kind-load KIND_CLUSTER= # load into the cluster +make e2e + +# Single suite only: +chainsaw test --test-dir tests/chainsaw/k8s/label-application +``` + +See `tests/chainsaw/README.md` for full prerequisites and environment variable reference. + +#### Running in CI + +The `.github/workflows/e2e.yml` workflow runs on `workflow_dispatch`. Trigger it manually from the GitHub UI before merging changes to the Helm chart, Node Observer, or engine output code paths. + ### Chart README For installation, prerequisites, values reference, and configuration examples, see [`charts/topograph/README.md`](../../charts/topograph/README.md) — also surfaced via `helm show readme topograph/topograph`. diff --git a/tests/chainsaw/README.md b/tests/chainsaw/README.md new file mode 100644 index 00000000..bb3701bc --- /dev/null +++ b/tests/chainsaw/README.md @@ -0,0 +1,149 @@ +# Topograph Chainsaw E2E Tests + +End-to-end conformance tests for the Kubernetes and Slinky engines using +[Chainsaw](https://kyverno.github.io/chainsaw/) — Kyverno's declarative +`apply → wait → assert → cleanup` framework. + +## How the tests work + +All suites follow the same high-level cycle: prepare cluster state → install +Topograph → assert outputs → clean up. There are three preparation patterns. 
+ +### Pattern A — Test provider with fake nodes + +Used by: `k8s/label-application`, `k8s/label-truncation`, `slinky/block-complement`, +`slinky/dynamic-nodes` + +1. Creates a `topology-test-model` ConfigMap with the topology model embedded inline, + mounted at `/etc/topograph/models/` in the Topograph pod. +2. Creates fake K8s Node objects carrying `kubernetes.io/os=linux` so the Node Observer + fires on them. k8s-engine suites also carry `topograph.nvidia.com/instance` and + `topograph.nvidia.com/region` annotations so the engine can map instance IDs to nodes. +3. Slinky block-topology suites additionally create one fake slurmd pod per fake node + (status-patched to `Ready`) so the Slinky engine can build its + k8s-node→SLURM-hostname map for writing the ConfigMap and annotating nodes. +4. Installs the Topograph Helm chart with the Node Observer enabled. The observer + fires on the fake nodes on startup, auto-triggering `/v1/generate` — no manual + HTTP POST needed. +5. Asserts that the expected node labels (k8s engine) or ConfigMap content (Slinky + engine) appear. +6. Cleans up (uninstalls the chart, deletes the fake nodes and namespace). + +### Pattern B — DRA provider with fake nodes + +Used by: `slinky/dra-provider` + +1. Creates fake K8s nodes carrying `nvidia.com/gpu.clique`, + `topograph.nvidia.com/instance`, and `topograph.nvidia.com/region` + labels/annotations. The DRA provider reads NVLink clique topology directly from + the K8s API — no model ConfigMap is needed. +2. Creates one fake slurmd pod per fake node (status-patched to `Ready`) so the + Slinky engine can build its k8s-node→SLURM-hostname map. +3. Installs the Helm chart with the Node Observer enabled; the observer fires on the + fake nodes and auto-triggers generation. +4. Asserts the `slurm-topology` ConfigMap contains the correct block topology entries. +5. Cleans up. + +### Pattern C — Test provider with real cluster nodes + +Used by: `slinky/tree-topology` + +1. 
Creates a `topology-test-model` ConfigMap with the model embedded inline. +2. Installs the Helm chart with the Node Observer watching all `kubernetes.io/os=linux` + nodes. Kind worker nodes already carry this label, so generation is triggered + immediately on startup — no fake nodes are created. +3. Topology entries are derived entirely from the model's switch structure; no slurmd + pods are needed. +4. Asserts the `slurm-topology` ConfigMap contains the correct tree-format entries. +5. Cleans up. + +## Test suites + +| Suite | Topology source | What it checks | +|---|---|---| +| `k8s/label-application` | Test provider — inline model `s1→{s2,s3}`, nodes `node-01` (under s2) and `node-02` (under s3); two fake K8s nodes | `leaf`, `spine` labels applied correctly to fake nodes | +| `k8s/label-truncation` | Test provider — inline model `s1→AVERYLONGSWITCHNAMETHATEXCEEDSSIXTYCHARACTERSFORTESTINGPURPOSES01→node-01`; one fake K8s node | Switch names >63 chars are replaced with an FNV64a hash prefixed with `x` | +| `slinky/tree-topology` | Test provider — inline model `S1→{S2,S3}`, nodes `node-01` and `node-02`; fires on real kind worker nodes (no fake nodes) | Slinky engine writes correct `topology.conf` (tree format) into a ConfigMap | +| `slinky/dra-provider` | DRA provider — `nvidia.com/gpu.clique` labels on four fake nodes (clique-1: node-01/node-02, clique-2: node-03/node-04); fake slurmd pods | DRA provider discovers NVLink clique topology from node labels; Slinky engine writes correct `topology.conf` (block topology) into a ConfigMap | +| `slinky/block-complement` | Test provider — inline model: spine→{leaf-1,leaf-2,leaf-3}, three NVLink cliques with node-02 (clique-1) and node-05 (clique-3) absent; four fake K8s nodes and fake slurmd pods | Slinky engine pads the block tree with an empty `BlockName=block004` placeholder when BlockSizes=2,4,8 and only 3 of 4 base-block slots are filled; absent nodes are not emitted in their BlockName line | +| 
`slinky/dynamic-nodes` | Test provider — same three-clique model as `block-complement` (node-02/05 absent); four fake K8s nodes and fake slurmd pods; `useDynamicNodes: true`, `configUpdateMode: skeleton-only` | Slinky engine writes all `BlockName` lines without `Nodes=` (skeleton format) and `performReconciliation` annotates each K8s node with `topology.slinky.slurm.net/spec` pointing to its assigned block | + +## Prerequisites + +| Tool | Install | +|---|---| +| `chainsaw` | `brew install kyverno/tap/chainsaw` or see [docs](https://kyverno.github.io/chainsaw/latest/quick-start/install/) | +| `kind` | `brew install kind` | +| `helm` | `brew install helm` | +| `kubectl` | `brew install kubectl` | +| `docker` | [Docker Desktop](https://www.docker.com/products/docker-desktop/) | + +## Quick start — local kind cluster + +```bash +# Build image, create cluster, run all suites, delete cluster +make e2e-local +``` + +`make e2e-local` runs in sequence: +1. `make image-build` — builds the container image for `linux/` +2. `kind create cluster` — spins up a 4-worker kind cluster (`tests/chainsaw/kind-config.yaml`) +3. `kind load docker-image` — loads the local image into the cluster with `imagePullPolicy: Never` +4. `chainsaw test` — runs all suites +5. `kind delete cluster` — tears down the cluster + +## Running against an existing kind cluster + +If you already have a kind cluster and want to run the tests without tearing it +down, the three-step sequence is: + +```bash +make image-build # 1. build the image (tagged with the current commit SHA) +make kind-load KIND_CLUSTER= # 2. load that image into the cluster +make e2e # 3. run all suites +``` + +`IMAGE_TAG` defaults to `$(git rev-parse --short HEAD)`. Because it is tied to +the commit SHA, you must rebuild and reload whenever you commit new changes — +otherwise the cluster has a stale image or the tag does not exist at all. 
+ +To use a fixed tag instead of the SHA: + +```bash +make image-build IMAGE_TAG=my-tag +make kind-load KIND_CLUSTER=<name> E2E_IMAGE_TAG=my-tag +make e2e E2E_IMAGE_TAG=my-tag +``` + +## Running against a non-kind cluster + +For a cluster where the image is already in a reachable registry, pass the +repo and tag as Make variable overrides (not shell env vars — the Makefile +uses `IMAGE_REPO` and `E2E_IMAGE_TAG`, not `TOPOGRAPH_IMAGE_REPO`/`TOPOGRAPH_IMAGE_TAG`): + +```bash +make e2e IMAGE_REPO=my-registry/topograph E2E_IMAGE_TAG=my-tag +``` + +## Running a single suite + +```bash +chainsaw test --test-dir tests/chainsaw/k8s/label-application +``` + +To pass a specific image: + +```bash +TOPOGRAPH_IMAGE_REPO=ghcr.io/nvidia/topograph \ +TOPOGRAPH_IMAGE_TAG=my-tag \ +chainsaw test --test-dir tests/chainsaw/k8s/label-application +``` + +## Environment variables + +| Variable | Default | Purpose | +|---|---|---| +| `TOPOGRAPH_IMAGE_REPO` | `ghcr.io/nvidia/topograph` | Image repository | +| `TOPOGRAPH_IMAGE_TAG` | `` (chart `appVersion`) | Image tag passed directly to test scripts | +| `E2E_IMAGE_TAG` | short commit SHA (`git rev-parse --short HEAD`) | Tag used by `make e2e` / `make e2e-local` / `make kind-load` | +| `TOPOGRAPH_IMAGE_PULL_POLICY` | `IfNotPresent` | Set to `Never` for kind (done automatically by `make e2e-local`) | diff --git a/tests/chainsaw/chainsaw-config.yaml b/tests/chainsaw/chainsaw-config.yaml new file mode 100644 index 00000000..2a98e46a --- /dev/null +++ b/tests/chainsaw/chainsaw-config.yaml @@ -0,0 +1,14 @@ +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Configuration +metadata: + name: chainsaw +spec: + timeouts: + apply: 120s + assert: 90s + cleanup: 60s + delete: 60s + error: 30s + exec: 120s + fullName: true + parallel: 1 diff --git a/tests/chainsaw/k8s/label-application/chainsaw-test.yaml b/tests/chainsaw/k8s/label-application/chainsaw-test.yaml new file mode 100644 index 00000000..13ecb56e --- /dev/null +++
b/tests/chainsaw/k8s/label-application/chainsaw-test.yaml @@ -0,0 +1,156 @@ +# Copyright 2026 NVIDIA CORPORATION +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: k8s-label-application +spec: + description: > + Verify that the Kubernetes engine applies the correct topology labels to + cluster nodes. The test uses a static topology model (s1→s2/s3, node-01 + under s2 and node-02 under s3) embedded inline in a ConfigMap and mounted + into the pod. Fake Node objects matching those instance IDs are created so + the engine can label them. After triggering generation the test asserts + leaf/spine labels on node-01 (s2) and node-02 (s3). + concurrent: false + steps: + + - name: prepare + description: > + Create fake K8s Node objects (node-01, node-02) matching the model + instance IDs, then create the topology-model ConfigMap with inline content. + try: + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-01 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-02 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: topology-test-model + data: + topology-model.yaml: | + switches: + s1: + switches: + - s2 + - s3 + s2: + nodes: + - node-01 + s3: + nodes: + - node-02 + nodes: + node-01: {} + node-02: {} + + - name: install-topograph + description: > + Install Topograph together with the Node Observer. The observer fires + "Added" events for the fake nodes on startup, automatically triggering + topology generation via the modelFileName configured in values.yaml. 
+ try: + - script: + timeout: 120s + content: | + set -euo pipefail + REPO_ROOT=$(git rev-parse --show-toplevel) + IMAGE_REPO="${TOPOGRAPH_IMAGE_REPO:-ghcr.io/nvidia/topograph}" + IMAGE_TAG="${TOPOGRAPH_IMAGE_TAG:-}" + PULL_POLICY="${TOPOGRAPH_IMAGE_PULL_POLICY:-IfNotPresent}" + + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + + helm upgrade --install topograph "${REPO_ROOT}/charts/topograph" \ + --namespace "$NAMESPACE" --create-namespace \ + --values "$(pwd)/values.yaml" \ + --set "image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "image.tag=${IMAGE_TAG}"} \ + --set "image.pullPolicy=${PULL_POLICY}" \ + --set "node-observer.image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "node-observer.image.tag=${IMAGE_TAG}"} \ + --set "node-observer.image.pullPolicy=${PULL_POLICY}" \ + --wait --timeout 90s + catch: + - description: Pod status on install failure + script: + content: | + kubectl get pods -n "$NAMESPACE" -o wide + kubectl describe pods -n "$NAMESPACE" + + - name: assert + description: > + The Node Observer fires "Added" events for the fake nodes on startup, + triggering generation automatically. Wait for topology labels to appear: + node-01 → leaf=s2 spine=s1, node-02 → leaf=s3 spine=s1. 
+ try: + - assert: + timeout: 60s + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-01 + labels: + network.topology.nvidia.com/leaf: s2 + network.topology.nvidia.com/spine: s1 + - assert: + timeout: 60s + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-02 + labels: + network.topology.nvidia.com/leaf: s3 + network.topology.nvidia.com/spine: s1 + catch: + - description: Topograph and Node Observer logs on assertion failure + script: + content: | + echo "=== Topograph logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph --tail=50 || true + echo "=== Node Observer logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph-node-observer --tail=50 || true + echo "=== Node labels ===" + kubectl get nodes node-01 node-02 \ + -o custom-columns='NAME:.metadata.name,LEAF:.metadata.labels.network\.topology\.nvidia\.com/leaf,SPINE:.metadata.labels.network\.topology\.nvidia\.com/spine' || true + finally: + - script: + timeout: 120s + content: | + helm uninstall topograph -n "$NAMESPACE" --wait --timeout 60s || true + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete namespace "$NAMESPACE" --timeout=60s || true + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-01 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-02 diff --git a/tests/chainsaw/k8s/label-application/values.yaml b/tests/chainsaw/k8s/label-application/values.yaml new file mode 100644 index 00000000..eea13501 --- /dev/null +++ b/tests/chainsaw/k8s/label-application/values.yaml @@ -0,0 +1,29 @@ +global: + provider: + name: test + params: + modelFileName: /etc/topograph/models/topology-model.yaml + engine: + name: k8s + +config: + requestAggregationDelay: 1s + +node-observer: + topograph: + trigger: + nodeSelector: + kubernetes.io/os: linux + +node-data-broker: + enabled: false 
+ +# test specific values to mount the topology-model.yaml ConfigMap into the Topograph pod +volumes: +- name: topology-test-model + configMap: + name: topology-test-model + +volumeMounts: +- name: topology-test-model + mountPath: /etc/topograph/models diff --git a/tests/chainsaw/k8s/label-truncation/chainsaw-test.yaml b/tests/chainsaw/k8s/label-truncation/chainsaw-test.yaml new file mode 100644 index 00000000..acdaf061 --- /dev/null +++ b/tests/chainsaw/k8s/label-truncation/chainsaw-test.yaml @@ -0,0 +1,153 @@ +# Copyright 2026 NVIDIA CORPORATION +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: k8s-label-truncation +spec: + description: > + Verify that switch names exceeding 63 characters (the Kubernetes label-value + limit) are replaced with an FNV64a hash prefixed with "x". The test uses + a static topology model with a 65-char leaf switch name, embedded inline in + a ConfigMap and mounted into the pod. A fake Node object (node-01) is + created to receive the label. After generation the leaf label must be ≤63 + chars and start with "x". + concurrent: false + steps: + + - name: prepare + description: > + Create a fake K8s Node object (node-01) matching the model instance ID, + then create the topology-model ConfigMap with inline content. + try: + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-01 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: topology-test-model + data: + topology-model.yaml: | + switches: + s1: + switches: + - AVERYLONGSWITCHNAMETHATEXCEEDSSIXTYCHARACTERSFORTESTINGPURPOSES01 + AVERYLONGSWITCHNAMETHATEXCEEDSSIXTYCHARACTERSFORTESTINGPURPOSES01: + nodes: + - node-01 + nodes: + node-01: {} + + - name: install-topograph + description: > + Install Topograph together with the Node Observer. 
The observer fires + "Added" events for the fake node on startup, automatically triggering + topology generation via the modelFileName configured in values.yaml. + try: + - script: + timeout: 120s + content: | + set -euo pipefail + REPO_ROOT=$(git rev-parse --show-toplevel) + IMAGE_REPO="${TOPOGRAPH_IMAGE_REPO:-ghcr.io/nvidia/topograph}" + IMAGE_TAG="${TOPOGRAPH_IMAGE_TAG:-}" + PULL_POLICY="${TOPOGRAPH_IMAGE_PULL_POLICY:-IfNotPresent}" + + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + + helm upgrade --install topograph "${REPO_ROOT}/charts/topograph" \ + --namespace "$NAMESPACE" --create-namespace \ + --values "$(pwd)/values.yaml" \ + --set "image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "image.tag=${IMAGE_TAG}"} \ + --set "image.pullPolicy=${PULL_POLICY}" \ + --set "node-observer.image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "node-observer.image.tag=${IMAGE_TAG}"} \ + --set "node-observer.image.pullPolicy=${PULL_POLICY}" \ + --wait --timeout 90s + catch: + - description: Pod status on install failure + script: + content: | + kubectl get pods -n "$NAMESPACE" -o wide + kubectl describe pods -n "$NAMESPACE" + + - name: assert + description: > + The Node Observer fires "Added" events for the fake node on startup, + triggering generation automatically. Assert the leaf label on node-01 + is ≤63 chars and starts with "x" (FNV64a hash of the 65-char switch name). + try: + - script: + timeout: 90s + content: | + set -euo pipefail + + echo "Polling for leaf label on node-01..." 
+ LEAF="" + for i in $(seq 1 30); do + LEAF=$(kubectl get node node-01 \ + -o jsonpath='{.metadata.labels.network\.topology\.nvidia\.com/leaf}' 2>/dev/null || true) + [ -n "$LEAF" ] && break + sleep 2 + done + + if [ -z "$LEAF" ]; then + echo "FAIL: leaf label not set on node-01" + exit 1 + fi + + echo "leaf label value: '$LEAF' (length=${#LEAF})" + + if [ "${#LEAF}" -gt 63 ]; then + echo "FAIL: label length ${#LEAF} exceeds 63" + exit 1 + fi + + ORIGINAL="AVERYLONGSWITCHNAMETHATEXCEEDSSIXTYCHARACTERSFORTESTINGPURPOSES01" + if [ "$LEAF" = "$ORIGINAL" ]; then + echo "FAIL: label was not hashed (still equals original name)" + exit 1 + fi + + if [ "${LEAF:0:1}" != "x" ]; then + echo "FAIL: expected hashed label to start with 'x', got: $LEAF" + exit 1 + fi + + echo "OK: label correctly hashed to '$LEAF'" + catch: + - description: Topograph and Node Observer logs on assertion failure + script: + content: | + echo "=== Topograph logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph --tail=50 || true + echo "=== Node Observer logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph-node-observer --tail=50 || true + finally: + - script: + timeout: 120s + content: | + helm uninstall topograph -n "$NAMESPACE" --wait --timeout 60s || true + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete namespace "$NAMESPACE" --timeout=60s || true + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-01 diff --git a/tests/chainsaw/k8s/label-truncation/values.yaml b/tests/chainsaw/k8s/label-truncation/values.yaml new file mode 100644 index 00000000..eea13501 --- /dev/null +++ b/tests/chainsaw/k8s/label-truncation/values.yaml @@ -0,0 +1,29 @@ +global: + provider: + name: test + params: + modelFileName: /etc/topograph/models/topology-model.yaml + engine: + name: k8s + +config: + 
requestAggregationDelay: 1s + +node-observer: + topograph: + trigger: + nodeSelector: + kubernetes.io/os: linux + +node-data-broker: + enabled: false + +# test specific values to mount the topology-model.yaml ConfigMap into the Topograph pod +volumes: +- name: topology-test-model + configMap: + name: topology-test-model + +volumeMounts: +- name: topology-test-model + mountPath: /etc/topograph/models diff --git a/tests/chainsaw/kind-config.yaml b/tests/chainsaw/kind-config.yaml new file mode 100644 index 00000000..6f0e32df --- /dev/null +++ b/tests/chainsaw/kind-config.yaml @@ -0,0 +1,8 @@ +kind: Cluster +apiVersion: kind.x-k8s.io/v1alpha4 +nodes: +- role: control-plane +- role: worker +- role: worker +- role: worker +- role: worker diff --git a/tests/chainsaw/slinky/block-complement/chainsaw-test.yaml b/tests/chainsaw/slinky/block-complement/chainsaw-test.yaml new file mode 100644 index 00000000..028296fd --- /dev/null +++ b/tests/chainsaw/slinky/block-complement/chainsaw-test.yaml @@ -0,0 +1,252 @@ +# Copyright 2026 NVIDIA CORPORATION +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: slinky-block-complement +spec: + description: > + Verify block-complementing with missing nodes in two of three NVLink cliques. + The topology model defines three cliques under a spine→{leaf-1,leaf-2,leaf-3} + switch tree: clique-1 has only node-01 (node-02 absent), clique-2 has both + node-03 and node-04, and clique-3 has only node-06 (node-05 absent). Four + fake K8s nodes and one fake slurmd pod per node are created. With explicit + BlockSizes=2,4,8, each clique occupies one base block of capacity 2 regardless + of how many nodes are present. Three cliques fill 3 of the 4 base-block slots + required at the top level (4×2=8 nodes to satisfy the 8-node lastBS boundary), + so complementBlocks pads the output with one empty placeholder (block004). 
+ Nodes absent from their clique appear as unfilled host slots within their base + block — they are not emitted in the BlockName line. The Node Observer fires on + the fake nodes on startup and auto-triggers generation. + concurrent: false + steps: + + - name: prepare + description: > + Create the topology-model ConfigMap with three NVLink cliques where node-02 + (clique-1) and node-05 (clique-3) are absent, simulating unavailable nodes. + Create four fake K8s nodes (node-01, node-03, node-04, node-06) and one fake + slurmd pod per node. Each pod is placed on the corresponding fake node via + spec.nodeName and status-patched to Ready so the Slinky engine can build its + k8s-node→SLURM-hostname map. Fake nodes carry kubernetes.io/os=linux so the + Node Observer fires on them. + try: + - apply: + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: topology-test-model + data: + topology-model.yaml: | + switches: + spine: + switches: + - leaf-1 + - leaf-2 + - leaf-3 + leaf-1: + nodes: + - node-01 + leaf-2: + nodes: + - node-03 + - node-04 + leaf-3: + nodes: + - node-06 + nodes: + node-01: + attributes: + nvlink: clique-1 + node-03: + attributes: + nvlink: clique-2 + node-04: + attributes: + nvlink: clique-2 + node-06: + attributes: + nvlink: clique-3 + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-01 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-03 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-04 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-06 + labels: + kubernetes.io/os: linux + - script: + timeout: 30s + content: | + set -euo pipefail + # Create one fake slurmd pod per fake node, directly assigned via spec.nodeName. + # The Slinky engine requires ready slurmd pods to build its k8s-node to + # SLURM-hostname map. 
Without pods nodeMap is empty and no nodes are resolved. + for NODE in node-01 node-03 node-04 node-06; do + kubectl apply -n "$NAMESPACE" -f - < + Install Topograph with the Node Observer using the test provider and Slinky + engine configured with topology/block and explicit BlockSizes=2,4,8. The + namespace is injected via --set so the Slinky engine writes the + slurm-topology ConfigMap into the correct test namespace (where the fake + slurmd pods already exist). The observer fires on the fake nodes on startup + and auto-triggers generation. + try: + - script: + timeout: 120s + content: | + set -euo pipefail + REPO_ROOT=$(git rev-parse --show-toplevel) + IMAGE_REPO="${TOPOGRAPH_IMAGE_REPO:-ghcr.io/nvidia/topograph}" + IMAGE_TAG="${TOPOGRAPH_IMAGE_TAG:-}" + PULL_POLICY="${TOPOGRAPH_IMAGE_PULL_POLICY:-IfNotPresent}" + + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + + helm upgrade --install topograph "${REPO_ROOT}/charts/topograph" \ + --namespace "$NAMESPACE" --create-namespace \ + --values "$(pwd)/values.yaml" \ + --set "image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "image.tag=${IMAGE_TAG}"} \ + --set "image.pullPolicy=${PULL_POLICY}" \ + --set "node-observer.image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "node-observer.image.tag=${IMAGE_TAG}"} \ + --set "node-observer.image.pullPolicy=${PULL_POLICY}" \ + --set "global.engine.params.namespace=${NAMESPACE}" \ + --wait --timeout 90s + catch: + - description: Pod status on install failure + script: + content: | + kubectl get pods -n "$NAMESPACE" -o wide + kubectl describe pods -n "$NAMESPACE" + + - name: assert + description: > + The Node Observer fires on the fake nodes on startup, triggering generation + automatically. 
The test provider returns a graph with three NVLink clique + domains: clique-1 (node-01 only), clique-2 (node-03 and node-04), and + clique-3 (node-06 only). The Slinky engine with BlockSizes=2,4,8 calls + complementBlocks: each clique occupies one base block of capacity 2 + regardless of how many live nodes it has. Three cliques fill 3 of the 4 + required top-level slots, so block004 is added as an empty placeholder. + Nodes absent from a clique (node-02 in clique-1, node-05 in clique-3) are + not emitted in their BlockName line — the base block simply has fewer live + hosts. Assert the ConfigMap contains all three real blocks with their + respective live nodes and the empty complement placeholder. + try: + - assert: + timeout: 60s + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: slurm-topology + data: + topology.conf: | + # block001=clique-1 + BlockName=block001 Nodes=node-01 + # block002=clique-2 + BlockName=block002 Nodes=node-[03-04] + # block003=clique-3 + BlockName=block003 Nodes=node-06 + BlockName=block004 + BlockSizes=2,4,8 + catch: + - description: Topograph and Node Observer logs on failure + script: + content: | + echo "=== Topograph logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph --tail=80 || true + echo "=== Node Observer logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph-node-observer --tail=50 || true + echo "=== slurm-topology content ===" + kubectl get configmap slurm-topology -n "$NAMESPACE" -o yaml 2>/dev/null || true + finally: + - script: + timeout: 120s + content: | + helm uninstall topograph -n "$NAMESPACE" --wait --timeout 60s || true + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete pod \ + slurmd-node-01 slurmd-node-03 slurmd-node-04 slurmd-node-06 \ + -n "$NAMESPACE" --ignore-not-found --grace-period=0 --force 
2>/dev/null || true + kubectl delete namespace "$NAMESPACE" --timeout=60s || true + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-01 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-03 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-04 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-06 diff --git a/tests/chainsaw/slinky/block-complement/values.yaml b/tests/chainsaw/slinky/block-complement/values.yaml new file mode 100644 index 00000000..38aff418 --- /dev/null +++ b/tests/chainsaw/slinky/block-complement/values.yaml @@ -0,0 +1,41 @@ +global: + provider: + name: test + params: + modelFileName: /etc/topograph/models/topology-model.yaml + engine: + name: slinky + params: + plugin: topology/block + blockSizes: + - 2 + - 4 + - 8 + podSelector: + matchLabels: + app: slurmd + topologyConfigmapName: slurm-topology + topologyConfigPath: topology.conf + # namespace is injected at install time via --set global.engine.params.namespace=$NAMESPACE + +config: + requestAggregationDelay: 1s + +node-observer: + topograph: + trigger: + nodeSelector: + kubernetes.io/os: linux + +node-data-broker: + enabled: false + +# test specific values to mount the topology-model.yaml ConfigMap into the Topograph pod +volumes: +- name: topology-test-model + configMap: + name: topology-test-model + +volumeMounts: +- name: topology-test-model + mountPath: /etc/topograph/models diff --git a/tests/chainsaw/slinky/dra-provider/chainsaw-test.yaml b/tests/chainsaw/slinky/dra-provider/chainsaw-test.yaml new file mode 100644 index 00000000..a02a9a31 --- /dev/null +++ b/tests/chainsaw/slinky/dra-provider/chainsaw-test.yaml @@ -0,0 +1,226 @@ +# Copyright 2026 NVIDIA CORPORATION +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: slinky-dra-provider +spec: + description: > + Verify that the DRA provider discovers NVLink clique topology from + nvidia.com/gpu.clique node labels and that the Slinky 
engine writes a + correct block topology.conf into a ConfigMap. Four fake nodes are arranged + in two cliques (node-01/node-02 in clique-1, node-03/node-04 in clique-2). + Each node carries the topograph.nvidia.com/instance and + topograph.nvidia.com/region annotations that the Slinky engine uses to build + the instance map. A fake slurmd pod (status-patched to Ready) is placed on + each fake node so the Slinky engine can resolve the k8s-node→SLURM-hostname + mapping it requires. The Node Observer fires on startup and auto-triggers + generation. The Slinky engine translates the DRA domain map into + topology/block format: one BlockName entry per NVLink clique. + concurrent: false + steps: + + - name: prepare + description: > + Create four fake K8s Node objects arranged in two NVLink cliques, then + create one fake slurmd Pod per node (status-patched to Ready) so the + Slinky engine can build its k8s-node→SLURM-hostname mapping. + Each node carries: + - nvidia.com/gpu.clique: the NVLink clique ID (read by the DRA provider) + - topograph.nvidia.com/instance: the instance ID (= node name) + - topograph.nvidia.com/region: "local" (set by node-data-broker in production) + - kubernetes.io/os: linux (triggers the Node Observer) + try: + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-01 + labels: + kubernetes.io/os: linux + nvidia.com/gpu.clique: clique-1 + annotations: + topograph.nvidia.com/instance: node-01 + topograph.nvidia.com/region: local + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-02 + labels: + kubernetes.io/os: linux + nvidia.com/gpu.clique: clique-1 + annotations: + topograph.nvidia.com/instance: node-02 + topograph.nvidia.com/region: local + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-03 + labels: + kubernetes.io/os: linux + nvidia.com/gpu.clique: clique-2 + annotations: + topograph.nvidia.com/instance: node-03 + topograph.nvidia.com/region: local + - apply: + 
resource: + apiVersion: v1 + kind: Node + metadata: + name: node-04 + labels: + kubernetes.io/os: linux + nvidia.com/gpu.clique: clique-2 + annotations: + topograph.nvidia.com/instance: node-04 + topograph.nvidia.com/region: local + - script: + timeout: 30s + content: | + set -euo pipefail + # Create one fake slurmd pod per fake node, directly assigned via spec.nodeName. + # The Slinky engine requires ready slurmd pods to build its k8s-node to + # SLURM-hostname map. Without pods nodeMap is empty and no nodes are resolved. + for NODE in node-01 node-02 node-03 node-04; do + kubectl apply -n "$NAMESPACE" -f - < + Install Topograph together with the Node Observer using the DRA provider + and Slinky engine. No model ConfigMap is needed — the DRA provider reads + nvidia.com/gpu.clique labels and topograph.nvidia.com/* annotations + directly from the K8s API. The namespace is injected via --set so the + Slinky engine writes the slurm-topology ConfigMap into the correct test + namespace (where the fake slurmd pods already exist). The observer fires + on startup and auto-triggers generation. 
+ try: + - script: + timeout: 120s + content: | + set -euo pipefail + REPO_ROOT=$(git rev-parse --show-toplevel) + IMAGE_REPO="${TOPOGRAPH_IMAGE_REPO:-ghcr.io/nvidia/topograph}" + IMAGE_TAG="${TOPOGRAPH_IMAGE_TAG:-}" + PULL_POLICY="${TOPOGRAPH_IMAGE_PULL_POLICY:-IfNotPresent}" + + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + + helm upgrade --install topograph "${REPO_ROOT}/charts/topograph" \ + --namespace "$NAMESPACE" --create-namespace \ + --values "$(pwd)/values.yaml" \ + --set "image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "image.tag=${IMAGE_TAG}"} \ + --set "image.pullPolicy=${PULL_POLICY}" \ + --set "node-observer.image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "node-observer.image.tag=${IMAGE_TAG}"} \ + --set "node-observer.image.pullPolicy=${PULL_POLICY}" \ + --set "global.engine.params.namespace=${NAMESPACE}" \ + --wait --timeout 90s + catch: + - description: Pod status on install failure + script: + content: | + kubectl get pods -n "$NAMESPACE" -o wide + kubectl describe pods -n "$NAMESPACE" + + - name: assert + description: > + The Node Observer fires on the fake nodes on startup, triggering generation + automatically. The DRA provider groups nodes by nvidia.com/gpu.clique into + two domains. The Slinky engine uses the fake slurmd pods to resolve k8s + node names to SLURM hostnames, then translates the domain map into + topology/block format and writes the slurm-topology ConfigMap. Assert the + ConfigMap contains the correct BlockName entries for each NVLink clique and + the auto-calculated BlockSizes (2 cliques x 2 nodes each -> BlockSizes=2,4). 
+ try: + - assert: + timeout: 60s + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: slurm-topology + data: + topology.conf: | + # block001=clique-1 + BlockName=block001 Nodes=node-[01-02] + # block002=clique-2 + BlockName=block002 Nodes=node-[03-04] + BlockSizes=2,4 + catch: + - description: Topograph and Node Observer logs on failure + script: + content: | + echo "=== Topograph logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph --tail=80 || true + echo "=== Node Observer logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph-node-observer --tail=50 || true + echo "=== ConfigMaps ===" + kubectl get configmaps -n "$NAMESPACE" + echo "=== slurm-topology content ===" + kubectl get configmap slurm-topology -n "$NAMESPACE" -o yaml 2>/dev/null || true + finally: + - script: + timeout: 120s + content: | + helm uninstall topograph -n "$NAMESPACE" --wait --timeout 60s || true + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete pod slurmd-node-01 slurmd-node-02 slurmd-node-03 slurmd-node-04 \ + -n "$NAMESPACE" --ignore-not-found --grace-period=0 --force 2>/dev/null || true + kubectl delete namespace "$NAMESPACE" --timeout=60s || true + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-01 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-02 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-03 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-04 diff --git a/tests/chainsaw/slinky/dra-provider/values.yaml b/tests/chainsaw/slinky/dra-provider/values.yaml new file mode 100644 index 00000000..1cf1de5e --- /dev/null +++ b/tests/chainsaw/slinky/dra-provider/values.yaml @@ -0,0 +1,25 @@ +global: + provider: + name: dra + engine: + name: slinky + params: + plugin: topology/block + podSelector: + matchLabels: + app: slurmd + 
topologyConfigmapName: slurm-topology + topologyConfigPath: topology.conf + # namespace is injected at install time via --set global.engine.params.namespace=$NAMESPACE + +config: + requestAggregationDelay: 1s + +node-observer: + topograph: + trigger: + nodeSelector: + kubernetes.io/os: linux + +node-data-broker: + enabled: false diff --git a/tests/chainsaw/slinky/dynamic-nodes/chainsaw-test.yaml b/tests/chainsaw/slinky/dynamic-nodes/chainsaw-test.yaml new file mode 100644 index 00000000..830f5d8f --- /dev/null +++ b/tests/chainsaw/slinky/dynamic-nodes/chainsaw-test.yaml @@ -0,0 +1,288 @@ +# Copyright 2026 NVIDIA CORPORATION +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: slinky-dynamic-nodes +spec: + description: > + Verify the skeleton-only ConfigMap and per-node topology annotations produced + by the Slinky engine when useDynamicNodes=true and configUpdateMode=skeleton-only. + The topology model defines three NVLink cliques under a spine→{leaf-1,leaf-2,leaf-3} + switch tree: clique-1 has only node-01 (node-02 absent), clique-2 has both + node-03 and node-04, and clique-3 has only node-06 (node-05 absent). Four fake + K8s nodes and one fake slurmd pod per node are created. With BlockSizes=2,4,8 + and three cliques, complementBlocks adds block004 as an empty placeholder. With + skeleton-only mode ALL BlockName lines are written WITHOUT Nodes= so that Slinky + can reconstruct membership from per-node annotations. performReconciliation then + annotates each K8s node with topology.slinky.slurm.net/spec pointing to its + assigned block. + concurrent: false + steps: + + - name: prepare + description: > + Create the topology-model ConfigMap with three NVLink cliques where node-02 + (clique-1) and node-05 (clique-3) are absent, simulating unavailable nodes. + Create four fake K8s nodes (node-01, node-03, node-04, node-06) and one fake + slurmd pod per node. 
Each pod is placed on the corresponding fake node via + spec.nodeName and status-patched to Ready so the Slinky engine can build its + k8s-node→SLURM-hostname map for performReconciliation. Fake nodes carry + kubernetes.io/os=linux so the Node Observer fires on them. + try: + - apply: + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: topology-test-model + data: + topology-model.yaml: | + switches: + spine: + switches: + - leaf-1 + - leaf-2 + - leaf-3 + leaf-1: + nodes: + - node-01 + leaf-2: + nodes: + - node-03 + - node-04 + leaf-3: + nodes: + - node-06 + nodes: + node-01: + attributes: + nvlink: clique-1 + node-03: + attributes: + nvlink: clique-2 + node-04: + attributes: + nvlink: clique-2 + node-06: + attributes: + nvlink: clique-3 + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-01 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-03 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-04 + labels: + kubernetes.io/os: linux + - apply: + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-06 + labels: + kubernetes.io/os: linux + - script: + timeout: 30s + content: | + set -euo pipefail + # Create one fake slurmd pod per fake node, directly assigned via spec.nodeName. + # performReconciliation uses these pods to build the k8s-node→SLURM-hostname map + # before annotating each node with topology.slinky.slurm.net/spec. + for NODE in node-01 node-03 node-04 node-06; do + kubectl apply -n "$NAMESPACE" -f - < + Install Topograph with the Node Observer using the test provider and Slinky + engine configured with topology/block, BlockSizes=2,4,8, useDynamicNodes=true, + and configUpdateMode=skeleton-only. The namespace is injected via --set so the + Slinky engine writes the slurm-topology ConfigMap and node annotations into the + correct test namespace. 
The observer fires on the fake nodes on startup and + auto-triggers generation. + try: + - script: + timeout: 120s + content: | + set -euo pipefail + REPO_ROOT=$(git rev-parse --show-toplevel) + IMAGE_REPO="${TOPOGRAPH_IMAGE_REPO:-ghcr.io/nvidia/topograph}" + IMAGE_TAG="${TOPOGRAPH_IMAGE_TAG:-}" + PULL_POLICY="${TOPOGRAPH_IMAGE_PULL_POLICY:-IfNotPresent}" + + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + + helm upgrade --install topograph "${REPO_ROOT}/charts/topograph" \ + --namespace "$NAMESPACE" --create-namespace \ + --values "$(pwd)/values.yaml" \ + --set "image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "image.tag=${IMAGE_TAG}"} \ + --set "image.pullPolicy=${PULL_POLICY}" \ + --set "node-observer.image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "node-observer.image.tag=${IMAGE_TAG}"} \ + --set "node-observer.image.pullPolicy=${PULL_POLICY}" \ + --set "global.engine.params.namespace=${NAMESPACE}" \ + --wait --timeout 90s + catch: + - description: Pod status on install failure + script: + content: | + kubectl get pods -n "$NAMESPACE" -o wide + kubectl describe pods -n "$NAMESPACE" + + - name: assert + description: > + The Node Observer fires on the fake nodes on startup, triggering generation. + With configUpdateMode=skeleton-only the slurm-topology ConfigMap is written + with all BlockName lines omitting Nodes= — including the complement placeholder + block004. With useDynamicNodes=true performReconciliation then annotates each + K8s node with topology.slinky.slurm.net/spec pointing to its assigned block: + node-01→block001, node-03→block002, node-04→block002, node-06→block003. 
+ try: + - assert: + timeout: 60s + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: slurm-topology + data: + topology.conf: | + # block001=clique-1 + BlockName=block001 + # block002=clique-2 + BlockName=block002 + # block003=clique-3 + BlockName=block003 + BlockName=block004 + BlockSizes=2,4,8 + - assert: + timeout: 30s + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-01 + annotations: + topology.slinky.slurm.net/spec: "default:block001" + - assert: + timeout: 30s + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-03 + annotations: + topology.slinky.slurm.net/spec: "default:block002" + - assert: + timeout: 30s + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-04 + annotations: + topology.slinky.slurm.net/spec: "default:block002" + - assert: + timeout: 30s + resource: + apiVersion: v1 + kind: Node + metadata: + name: node-06 + annotations: + topology.slinky.slurm.net/spec: "default:block003" + catch: + - description: Topograph and Node Observer logs on failure + script: + content: | + echo "=== Topograph logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph --tail=80 || true + echo "=== Node Observer logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph-node-observer --tail=50 || true + echo "=== slurm-topology content ===" + kubectl get configmap slurm-topology -n "$NAMESPACE" -o yaml 2>/dev/null || true + echo "=== node annotations ===" + for NODE in node-01 node-03 node-04 node-06; do + echo "--- ${NODE} ---" + kubectl get node "${NODE}" -o jsonpath='{.metadata.annotations}' 2>/dev/null || true + echo + done + finally: + - script: + timeout: 120s + content: | + helm uninstall topograph -n "$NAMESPACE" --wait --timeout 60s || true + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete pod \ + slurmd-node-01 
slurmd-node-03 slurmd-node-04 slurmd-node-06 \ + -n "$NAMESPACE" --ignore-not-found --grace-period=0 --force 2>/dev/null || true + kubectl delete namespace "$NAMESPACE" --timeout=60s || true + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-01 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-03 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-04 + - delete: + ref: + apiVersion: v1 + kind: Node + name: node-06 diff --git a/tests/chainsaw/slinky/dynamic-nodes/values.yaml b/tests/chainsaw/slinky/dynamic-nodes/values.yaml new file mode 100644 index 00000000..206e9b93 --- /dev/null +++ b/tests/chainsaw/slinky/dynamic-nodes/values.yaml @@ -0,0 +1,43 @@ +global: + provider: + name: test + params: + modelFileName: /etc/topograph/models/topology-model.yaml + engine: + name: slinky + params: + plugin: topology/block + blockSizes: + - 2 + - 4 + - 8 + podSelector: + matchLabels: + app: slurmd + topologyConfigmapName: slurm-topology + topologyConfigPath: topology.conf + useDynamicNodes: true + configUpdateMode: skeleton-only + # namespace is injected at install time via --set global.engine.params.namespace=$NAMESPACE + +config: + requestAggregationDelay: 1s + +node-observer: + topograph: + trigger: + nodeSelector: + kubernetes.io/os: linux + +node-data-broker: + enabled: false + +# test specific values to mount the topology-model.yaml ConfigMap into the Topograph pod +volumes: +- name: topology-test-model + configMap: + name: topology-test-model + +volumeMounts: +- name: topology-test-model + mountPath: /etc/topograph/models diff --git a/tests/chainsaw/slinky/tree-topology/chainsaw-test.yaml b/tests/chainsaw/slinky/tree-topology/chainsaw-test.yaml new file mode 100644 index 00000000..32303935 --- /dev/null +++ b/tests/chainsaw/slinky/tree-topology/chainsaw-test.yaml @@ -0,0 +1,123 @@ +# Copyright 2026 NVIDIA CORPORATION +# SPDX-License-Identifier: Apache-2.0 + +apiVersion: chainsaw.kyverno.io/v1alpha1 +kind: Test +metadata: + name: 
slinky-tree-topology +spec: + description: > + Verify that the Slinky engine writes a correct topology.conf fragment into + a ConfigMap. The test uses a static topology model (S1→S2/S3, node-01 under + S2 and node-02 under S3) embedded inline in a ConfigMap and mounted into the + pod. The Node Observer fires on cluster nodes (which already carry the + kubernetes.io/os=linux label) and auto-triggers generation — no manual POST + is required. The Slinky engine derives topology.conf entirely from the graph + switch structure; no actual slurmd pods are needed. + concurrent: false + steps: + + - name: prepare-model-configmap + description: Create the topology-model ConfigMap with inline content. + try: + - apply: + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: topology-test-model + data: + topology-model.yaml: | + switches: + S1: + switches: + - S2 + - S3 + S2: + nodes: + - node-01 + S3: + nodes: + - node-02 + nodes: + node-01: {} + node-02: {} + + - name: install-topograph + description: > + Install Topograph together with the Node Observer. The namespace is + injected via --set so the Slinky engine writes the slurm-topology + ConfigMap into the correct test namespace. The observer fires on cluster + nodes on startup, auto-triggering generation. 
+ try: + - script: + timeout: 120s + content: | + set -euo pipefail + REPO_ROOT=$(git rev-parse --show-toplevel) + IMAGE_REPO="${TOPOGRAPH_IMAGE_REPO:-ghcr.io/nvidia/topograph}" + IMAGE_TAG="${TOPOGRAPH_IMAGE_TAG:-}" + PULL_POLICY="${TOPOGRAPH_IMAGE_PULL_POLICY:-IfNotPresent}" + + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + + helm upgrade --install topograph "${REPO_ROOT}/charts/topograph" \ + --namespace "$NAMESPACE" --create-namespace \ + --values "$(pwd)/values.yaml" \ + --set "image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "image.tag=${IMAGE_TAG}"} \ + --set "image.pullPolicy=${PULL_POLICY}" \ + --set "node-observer.image.repository=${IMAGE_REPO}" \ + ${IMAGE_TAG:+--set "node-observer.image.tag=${IMAGE_TAG}"} \ + --set "node-observer.image.pullPolicy=${PULL_POLICY}" \ + --set "global.engine.params.namespace=${NAMESPACE}" \ + --wait --timeout 90s + catch: + - description: Pod status on install failure + script: + content: | + kubectl get pods -n "$NAMESPACE" -o wide + kubectl describe pods -n "$NAMESPACE" + + - name: assert + description: > + The Node Observer fires on cluster nodes on startup, triggering generation + automatically. The Slinky engine derives topology.conf from the graph + switch structure (S1→S2/S3) and writes it into the slurm-topology + ConfigMap. Assert the ConfigMap contains the correct topology.conf entries. 
+ try: + - assert: + timeout: 60s + resource: + apiVersion: v1 + kind: ConfigMap + metadata: + name: slurm-topology + data: + topology.conf: | + SwitchName=S1 Switches=S[2-3] + SwitchName=S2 Nodes=node-01 + SwitchName=S3 Nodes=node-02 + catch: + - description: Topograph and Node Observer logs on failure + script: + content: | + echo "=== Topograph logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph --tail=50 || true + echo "=== Node Observer logs ===" + kubectl logs -n "$NAMESPACE" deploy/topograph-node-observer --tail=50 || true + echo "=== ConfigMaps ===" + kubectl get configmaps -n "$NAMESPACE" + finally: + - script: + timeout: 120s + content: | + helm uninstall topograph -n "$NAMESPACE" --wait --timeout 60s || true + kubectl delete clusterrole topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete clusterrolebinding topograph topograph-node-observer \ + --ignore-not-found 2>/dev/null || true + kubectl delete namespace "$NAMESPACE" --timeout=60s || true diff --git a/tests/chainsaw/slinky/tree-topology/values.yaml b/tests/chainsaw/slinky/tree-topology/values.yaml new file mode 100644 index 00000000..4342f815 --- /dev/null +++ b/tests/chainsaw/slinky/tree-topology/values.yaml @@ -0,0 +1,37 @@ +global: + provider: + name: test + params: + modelFileName: /etc/topograph/models/topology-model.yaml + engine: + name: slinky + params: + plugin: topology/tree + podSelector: + matchLabels: + app: slurmd + topologyConfigmapName: slurm-topology + topologyConfigPath: topology.conf + # namespace is injected at install time via --set global.engine.params.namespace=$NAMESPACE + +config: + requestAggregationDelay: 1s + +node-observer: + topograph: + trigger: + nodeSelector: + kubernetes.io/os: linux + +node-data-broker: + enabled: false + +# test specific values to mount the topology-model.yaml ConfigMap into the Topograph pod +volumes: +- name: topology-test-model + configMap: + name: topology-test-model + +volumeMounts: +- name: 
topology-test-model + mountPath: /etc/topograph/models