From fa2b9a0df018d63391ee0dfdb4e7e442a4a21829 Mon Sep 17 00:00:00 2001 From: Giulio Calzolari Date: Sat, 27 Jun 2026 16:29:46 +0200 Subject: [PATCH 1/4] feat(node-data-broker): run broker as main container with health probes Replace the init container plus curl sleeper with a single container running node-data-broker-initc. The binary applies node annotations, serves /healthz, and re-applies annotations on a configurable refreshInterval (default 5m). Add a startup probe so slow providers can finish before liveness kicks in, move initc.extraArgs to top-level extraArgs, and update infiniband docs and helm-unittest coverage. Signed-off-by: Giulio Calzolari --- .gitignore | 4 + .../node-data-broker/templates/_helpers.tpl | 7 - .../node-data-broker/templates/daemonset.yaml | 63 +-- .../charts/node-data-broker/values.yaml | 45 ++- .../render_snapshot_test.yaml.snap | 379 +++++++++++------- charts/topograph/tests/subcharts_test.yaml | 65 +-- cmd/node-data-broker-initc/main.go | 108 ++++- cmd/node-data-broker-initc/main_test.go | 109 ++++- docs/providers/infiniband.md | 16 +- 9 files changed, 562 insertions(+), 234 deletions(-) diff --git a/.gitignore b/.gitignore index 0181cd69..cee242bd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,9 @@ /coverage.out /bin +# Binaries built at the repo root by `go build ./cmd/...` +/topograph +/node-observer +/node-data-broker-initc /ssl /deb/topograph/DEBIAN/control /deb/topograph/usr/local/bin/ diff --git a/charts/topograph/charts/node-data-broker/templates/_helpers.tpl b/charts/topograph/charts/node-data-broker/templates/_helpers.tpl index 4c47bc69..8d603233 100644 --- a/charts/topograph/charts/node-data-broker/templates/_helpers.tpl +++ b/charts/topograph/charts/node-data-broker/templates/_helpers.tpl @@ -37,13 +37,6 @@ Container image reference. The tag defaults to the chart appVersion when unset. {{- printf "%s:%s" .Values.image.repository (.Values.image.tag | default .Chart.AppVersion) -}} {{- end }} -{{/* -Init container image reference. The tag defaults to the chart appVersion when unset. -*/}} -{{- define "node-data-broker.initImage" -}} -{{- printf "%s:%s" .Values.initc.image.repository (.Values.initc.image.tag | default .Chart.AppVersion) -}} -{{- end }} - {{/* Common labels */}} diff --git a/charts/topograph/charts/node-data-broker/templates/daemonset.yaml b/charts/topograph/charts/node-data-broker/templates/daemonset.yaml index f6bd312c..bf0853cf 100644 --- a/charts/topograph/charts/node-data-broker/templates/daemonset.yaml +++ b/charts/topograph/charts/node-data-broker/templates/daemonset.yaml @@ -25,20 +25,23 @@ spec: serviceAccountName: {{ include "node-data-broker.serviceAccountName" . }} securityContext: {{- toYaml .Values.podSecurityContext | nindent 8 }} - {{- if .Values.initc.enabled }} - initContainers: - - name: init-node-labels - image: {{ include "node-data-broker.initImage" . | quote }} - imagePullPolicy: {{ .Values.initc.image.pullPolicy }} + containers: + - name: {{ .Chart.Name }} + securityContext: + {{- toYaml .Values.securityContext | nindent 12 }} + image: {{ include "node-data-broker.image" . | quote }} + imagePullPolicy: {{ .Values.image.pullPolicy }} command: - /usr/local/bin/node-data-broker-initc args: - --provider={{ .Values.global.provider.name }} - -v={{ .Values.verbosity }} + - --port={{ .Values.port }} + - --refresh-interval={{ .Values.refreshInterval }} {{- if $useGpuCliqueLabel }} - --set=useGpuCliqueLabel=true {{- end }} - {{- range .Values.initc.extraArgs }} + {{- range .Values.extraArgs }} - --set={{ . }} {{- end }} env: @@ -46,31 +49,29 @@ spec: valueFrom: fieldRef: fieldPath: spec.nodeName - {{- if or $configMapMounts .Values.volumeMounts }} - volumeMounts: - {{- range $configMapMounts }} - - name: {{ include "node-data-broker.configMapMountVolumeName" (dict "name" .name) }} - mountPath: {{ required "node-data-broker.configMapMounts[].mountPath is required" .mountPath | quote }} - {{- with .subPath }} - subPath: {{ . | quote }} - {{- end }} - readOnly: true - {{- end }} - {{- with .Values.volumeMounts }} - {{- toYaml . | nindent 12 }} - {{- end }} - {{- end }} - {{- end }} - containers: - - name: {{ .Chart.Name }} - securityContext: - {{- toYaml .Values.securityContext | nindent 12 }} - image: {{ include "node-data-broker.image" . | quote }} - imagePullPolicy: {{ .Values.image.pullPolicy }} - {{- with .Values.command }} - command: - {{ toYaml . | nindent 12 }} - {{- end }} + ports: + - name: http + containerPort: {{ .Values.port }} + protocol: TCP + # The broker applies node annotations before it starts serving + # /healthz. The startup probe gates liveness/readiness so slow + # providers (e.g. infiniband ibnetdiscover) have up to + # failureThreshold * periodSeconds to finish before the container can + # be restarted. + startupProbe: + httpGet: + path: /healthz + port: http + failureThreshold: {{ .Values.startupProbe.failureThreshold }} + periodSeconds: {{ .Values.startupProbe.periodSeconds }} + livenessProbe: + httpGet: + path: /healthz + port: http + readinessProbe: + httpGet: + path: /healthz + port: http resources: {{- toYaml .Values.resources | nindent 12 }} {{- if or $configMapMounts .Values.volumeMounts }} diff --git a/charts/topograph/charts/node-data-broker/values.yaml b/charts/topograph/charts/node-data-broker/values.yaml index e83a6623..cf587d65 100644 --- a/charts/topograph/charts/node-data-broker/values.yaml +++ b/charts/topograph/charts/node-data-broker/values.yaml @@ -3,26 +3,35 @@ # Declare variables to be passed into your templates. image: - repository: curlimages/curl + repository: ghcr.io/nvidia/topograph pullPolicy: IfNotPresent - tag: 8.13.0 + # Overrides the image tag whose default is the chart appVersion. + tag: "" enabled: true -command: - - tail - - -f - - /dev/null - -initc: - enabled: true - image: - repository: ghcr.io/nvidia/topograph - pullPolicy: IfNotPresent - # Overrides the image tag whose default is the chart appVersion. - tag: "" - extraArgs: - # - key=val +# Port the node-data-broker serves its /healthz endpoint on after applying +# node annotations. +port: 8080 + +# How often to re-apply node annotations after the initial startup apply. +# Slow providers (e.g. infiniband ibnetdiscover) may need a longer interval. +# Set to 0 to disable periodic refresh. +refreshInterval: 5m + +# Extra key=value parameters passed to node-data-broker-initc via --set +# (e.g. gpu-operator-namespace / device-plugin-daemonset for infiniband-k8s). +extraArgs: [] +# - key=val + +# Startup probe for the node-data-broker container. The broker applies node +# annotations before it starts serving /healthz, so the startup probe gates the +# liveness/readiness probes and gives slow providers (e.g. infiniband +# ibnetdiscover) time to finish. The total startup budget is +# failureThreshold * periodSeconds (default 30 * 10s = 5m). +startupProbe: + failureThreshold: 30 + periodSeconds: 10 imagePullSecrets: [] nameOverride: "" @@ -67,8 +76,8 @@ resources: cpu: 100m memory: 128Mi -# Optional ConfigMaps rendered by the chart and mounted into both the -# node-data-broker container and init container. Set subPath for file mounts; +# Optional ConfigMaps rendered by the chart and mounted into the +# node-data-broker container. Set subPath for file mounts; # omit it to mount the whole ConfigMap as a directory. configMapMounts: [] # name is used as a Kubernetes resource and volume suffix; keep it DNS-label-safe. diff --git a/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap b/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap index 28e1196f..21de5f0d 100644 --- a/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap +++ b/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap @@ -26,25 +26,11 @@ renders default values.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: curlimages/curl:8.13.0 - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: {} - initContainers: - args: - --provider=test - -v=3 + - --port=8080 + - --refresh-interval=5m command: - /usr/local/bin/node-data-broker-initc env: @@ -54,7 +40,33 @@ renders default values.yaml: fieldPath: spec.nodeName image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 securityContext: {} serviceAccountName: chart-ci-node-data-broker 2: | @@ -492,25 +504,11 @@ renders values.k8s.gateway-api-example.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: curlimages/curl:8.13.0 - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: {} - initContainers: - args: - --provider=test - -v=3 + - --port=8080 + - --refresh-interval=5m command: - /usr/local/bin/node-data-broker-initc env: @@ -520,7 +518,33 @@ renders values.k8s.gateway-api-example.yaml: fieldPath: spec.nodeName image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 securityContext: {} serviceAccountName: chart-ci-node-data-broker 2: | @@ -980,25 +1004,11 @@ renders values.k8s.gcp-federated-workload-identity-example.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: ghcr.io/nvidia/topograph/ib:main - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: {} - initContainers: - args: - --provider=gcp - -v=3 + - --port=8080 + - --refresh-interval=5m command: - /usr/local/bin/node-data-broker-initc env: @@ -1006,9 +1016,35 @@ renders values.k8s.gcp-federated-workload-identity-example.yaml: valueFrom: fieldRef: fieldPath: spec.nodeName - image: ghcr.io/nvidia/topograph:v0.0.0 + image: ghcr.io/nvidia/topograph/ib:main imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 nodeSelector: brightcomputing.com/node-category: dgx securityContext: {} @@ -1471,25 +1507,11 @@ renders values.k8s.gcp-service-account-example.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: ghcr.io/nvidia/topograph/ib:main - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: {} - initContainers: - args: - --provider=gcp - -v=3 + - --port=8080 + - --refresh-interval=5m command: - /usr/local/bin/node-data-broker-initc env: @@ -1497,9 +1519,35 @@ renders values.k8s.gcp-service-account-example.yaml: valueFrom: fieldRef: fieldPath: spec.nodeName - image: ghcr.io/nvidia/topograph:v0.0.0 + image: ghcr.io/nvidia/topograph/ib:main imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 nodeSelector: brightcomputing.com/node-category: dgx securityContext: {} @@ -1950,29 +1998,11 @@ renders values.k8s.ib-example.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: ghcr.io/nvidia/topograph/ib:main - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: - privileged: true - volumeMounts: - - mountPath: /sys/class - name: sys-class-volume - initContainers: - args: - --provider=infiniband-k8s - -v=3 + - --port=8080 + - --refresh-interval=5m - --set=useGpuCliqueLabel=true command: - /usr/local/bin/node-data-broker-initc @@ -1981,9 +2011,36 @@ renders values.k8s.ib-example.yaml: valueFrom: fieldRef: fieldPath: spec.nodeName - image: ghcr.io/nvidia/topograph:v0.0.0 + image: ghcr.io/nvidia/topograph/ib:main imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: + privileged: true + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 volumeMounts: - mountPath: /sys/class name: sys-class-volume @@ -2440,25 +2497,11 @@ renders values.slinky.block-example.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: curlimages/curl:8.13.0 - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: {} - initContainers: - args: - --provider=aws - -v=3 + - --port=8080 + - --refresh-interval=5m command: - /usr/local/bin/node-data-broker-initc env: @@ -2468,7 +2511,33 @@ renders values.slinky.block-example.yaml: fieldPath: spec.nodeName image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 securityContext: {} serviceAccountName: chart-ci-node-data-broker 2: | @@ -2939,25 +3008,11 @@ renders values.slinky.partition-example.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: curlimages/curl:8.13.0 - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: {} - initContainers: - args: - --provider=aws - -v=3 + - --port=8080 + - --refresh-interval=5m command: - /usr/local/bin/node-data-broker-initc env: @@ -2967,7 +3022,33 @@ renders values.slinky.partition-example.yaml: fieldPath: spec.nodeName image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 securityContext: {} serviceAccountName: chart-ci-node-data-broker 2: | @@ -3455,25 +3536,11 @@ renders values.slinky.tree-example.yaml: helm.sh/chart: node-data-broker-0.0.0 spec: containers: - - command: - - tail - - -f - - /dev/null - image: curlimages/curl:8.13.0 - imagePullPolicy: IfNotPresent - name: node-data-broker - resources: - limits: - cpu: 100m - memory: 128Mi - requests: - cpu: 100m - memory: 128Mi - securityContext: {} - initContainers: - args: - --provider=aws - -v=3 + - --port=8080 + - --refresh-interval=5m command: - /usr/local/bin/node-data-broker-initc env: @@ -3483,7 +3550,33 @@ renders values.slinky.tree-example.yaml: fieldPath: spec.nodeName image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent - name: init-node-labels + livenessProbe: + httpGet: + path: /healthz + port: http + name: node-data-broker + ports: + - containerPort: 8080 + name: http + protocol: TCP + readinessProbe: + httpGet: + path: /healthz + port: http + resources: + limits: + cpu: 100m + memory: 128Mi + requests: + cpu: 100m + memory: 128Mi + securityContext: {} + startupProbe: + failureThreshold: 30 + httpGet: + path: /healthz + port: http + periodSeconds: 10 securityContext: {} serviceAccountName: chart-ci-node-data-broker 2: | diff --git a/charts/topograph/tests/subcharts_test.yaml b/charts/topograph/tests/subcharts_test.yaml index 3364c37c..79e3b1f2 100644 --- a/charts/topograph/tests/subcharts_test.yaml +++ b/charts/topograph/tests/subcharts_test.yaml @@ -64,15 +64,18 @@ tests: path: spec.template.spec.containers[0].image pattern: "^ghcr.io/nvidia/topograph:v[0-9]+\\.[0-9]+\\.[0-9]+$" - - it: node-data-broker renders the init container by default + - it: node-data-broker renders the broker as the main container by default templates: - charts/node-data-broker/templates/daemonset.yaml asserts: - isKind: of: DaemonSet - equal: - path: spec.template.spec.initContainers[0].name - value: init-node-labels + path: spec.template.spec.containers[0].name + value: node-data-broker + - equal: + path: spec.template.spec.containers[0].command[0] + value: /usr/local/bin/node-data-broker-initc - it: node-data-broker uses an existing service account when creation is disabled templates: @@ -87,7 +90,7 @@ tests: path: spec.template.spec.serviceAccountName value: existing-node-data-broker-sa - - it: node-data-broker init container passes the provider and NODE_NAME + - it: node-data-broker container passes the provider and NODE_NAME templates: - charts/node-data-broker/templates/daemonset.yaml set: @@ -96,17 +99,45 @@ tests: name: infiniband-k8s asserts: - contains: - path: spec.template.spec.initContainers[0].args + path: spec.template.spec.containers[0].args content: --provider=infiniband-k8s - contains: - path: spec.template.spec.initContainers[0].env + path: spec.template.spec.containers[0].env content: name: NODE_NAME valueFrom: fieldRef: fieldPath: spec.nodeName - - it: node-data-broker init container forwards the GPU clique label flag and extra args + - it: node-data-broker container passes the refresh interval + templates: + - charts/node-data-broker/templates/daemonset.yaml + set: + node-data-broker: + refreshInterval: 10m + asserts: + - contains: + path: spec.template.spec.containers[0].args + content: --refresh-interval=10m + + - it: node-data-broker container gates liveness behind a startup probe + templates: + - charts/node-data-broker/templates/daemonset.yaml + asserts: + - equal: + path: spec.template.spec.containers[0].startupProbe.httpGet.path + value: /healthz + - equal: + path: spec.template.spec.containers[0].startupProbe.failureThreshold + value: 30 + - equal: + path: spec.template.spec.containers[0].startupProbe.periodSeconds + value: 10 + - equal: + path: spec.template.spec.containers[0].livenessProbe.httpGet.path + value: /healthz + + - it: node-data-broker container forwards the GPU clique label flag and extra args templates: - charts/node-data-broker/templates/daemonset.yaml set: @@ -116,28 +147,16 @@ tests: params: useGpuCliqueLabel: true node-data-broker: - initc: - extraArgs: - - foo=bar + extraArgs: + - foo=bar asserts: - contains: - path: spec.template.spec.initContainers[0].args + path: spec.template.spec.containers[0].args content: --set=useGpuCliqueLabel=true - contains: - path: spec.template.spec.initContainers[0].args + path: spec.template.spec.containers[0].args content: --set=foo=bar - - it: node-data-broker initc.enabled=false omits the init container - templates: - - charts/node-data-broker/templates/daemonset.yaml - set: - node-data-broker: - initc: - enabled: false - asserts: - - notExists: - path: spec.template.spec.initContainers - - it: node-data-broker.enabled=false omits the DaemonSet entirely templates: - charts/node-data-broker/templates/daemonset.yaml diff --git a/cmd/node-data-broker-initc/main.go b/cmd/node-data-broker-initc/main.go index d2b1e217..c748b21c 100644 --- a/cmd/node-data-broker-initc/main.go +++ b/cmd/node-data-broker-initc/main.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,11 +18,16 @@ package main import ( "context" + "errors" "flag" "fmt" "maps" + "net/http" "os" + "os/signal" "strings" + "syscall" + "time" "github.com/spf13/pflag" corev1 "k8s.io/api/core/v1" @@ -40,13 +45,24 @@ import ( "github.com/NVIDIA/topograph/pkg/providers/oci" ) +const ( + defaultPort = 8080 + defaultRefreshInterval = 5 * time.Minute + readHeaderTimeout = 5 * time.Second + shutdownTimeout = 5 * time.Second +) + func main() { var provider string var ver bool var sets []string + var port int + var refreshInterval time.Duration pflag.StringVar(&provider, "provider", "", "API provider") pflag.BoolVar(&ver, "version", false, "show the version") pflag.StringArrayVar(&sets, "set", []string{}, "extra key=value parameters") + pflag.IntVar(&port, "port", defaultPort, "port for the health HTTP server") + pflag.DurationVar(&refreshInterval, "refresh-interval", defaultRefreshInterval, "interval between node annotation refreshes after startup (0 disables periodic refresh)") klog.InitFlags(nil) pflag.CommandLine.AddGoFlagSet(flag.CommandLine) @@ -58,14 +74,57 @@ func main() { os.Exit(0) } - if err := mainInternal(provider, sets); err != nil { + if err := mainInternal(provider, sets, port, refreshInterval); err != nil { klog.Error(err.Error()) os.Exit(1) } } -func mainInternal(provider string, sets []string) error { - klog.InfoS("Starting node-data-broker", "provider", provider, "extras", sets) +func mainInternal(provider string, sets []string, port int, refreshInterval time.Duration) error { + ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) + defer stop() + + if err := applyNodeAnnotations(ctx, provider, sets); err != nil { + return err + } + + if refreshInterval > 0 { + go refreshNodeAnnotations(ctx, provider, sets, refreshInterval) + } + + // Keep the DaemonSet pod Running by serving a health endpoint until the pod + // is terminated. Periodic annotation refresh runs in the background. + return serveHealth(ctx, port) +} + +type annotationApplier func(ctx context.Context) error + +// refreshNodeAnnotations re-applies node annotations on a fixed interval until +// the context is cancelled. Failures are logged but do not terminate the pod. +func refreshNodeAnnotations(ctx context.Context, provider string, sets []string, interval time.Duration) { + runRefreshLoop(ctx, interval, func(ctx context.Context) error { + return applyNodeAnnotations(ctx, provider, sets) + }) +} + +func runRefreshLoop(ctx context.Context, interval time.Duration, apply annotationApplier) { + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + return + case <-ticker.C: + if err := apply(ctx); err != nil { + klog.ErrorS(err, "periodic node annotation refresh failed") + } + } + } +} + +func applyNodeAnnotations(ctx context.Context, provider string, sets []string) error { + klog.InfoS("Applying node annotations", "provider", provider, "extras", sets) extras, err := getExtras(sets) if err != nil { @@ -82,7 +141,6 @@ func mainInternal(provider string, sets []string) error { return fmt.Errorf("failed to create clientset: %v", err) } - ctx := context.TODO() nodeName := os.Getenv("NODE_NAME") annotations, err := getAnnotations(ctx, clientset, config, provider, nodeName, extras) @@ -106,6 +164,44 @@ func mainInternal(provider string, sets []string) error { return nil } +// serveHealth runs a minimal HTTP server exposing /healthz so the DaemonSet +// pod stays Running after node annotations have been applied. It blocks until +// the context is cancelled (SIGTERM/SIGINT), then shuts down gracefully. +func serveHealth(ctx context.Context, port int) error { + srv := &http.Server{ + Addr: fmt.Sprintf(":%d", port), + Handler: healthHandler(), + ReadHeaderTimeout: readHeaderTimeout, + } + + errCh := make(chan error, 1) + go func() { + klog.Infof("Serving health endpoint on %s", srv.Addr) + if err := srv.ListenAndServe(); err != nil && !errors.Is(err, http.ErrServerClosed) { + errCh <- err + } + }() + + select { + case <-ctx.Done(): + klog.Info("Shutting down health endpoint") + shutdownCtx, cancel := context.WithTimeout(context.Background(), shutdownTimeout) + defer cancel() + return srv.Shutdown(shutdownCtx) + case err := <-errCh: + return err + } +} + +func healthHandler() http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("/healthz", func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("ok")) + }) + return mux +} + func getExtras(sets []string) (map[string]string, error) { extras := make(map[string]string) for _, kv := range sets { @@ -124,7 +220,7 @@ func getExtras(sets []string) (map[string]string, error) { return extras, nil } -func getAnnotations(ctx context.Context, client *kubernetes.Clientset, config *rest.Config, provider, nodeName string, extras map[string]string) (map[string]string, error) { +func getAnnotations(ctx context.Context, client kubernetes.Interface, config *rest.Config, provider, nodeName string, extras map[string]string) (map[string]string, error) { switch provider { case aws.NAME: return aws.GetNodeAnnotations(ctx) diff --git a/cmd/node-data-broker-initc/main_test.go b/cmd/node-data-broker-initc/main_test.go index 67b2bcf6..89c41fb0 100644 --- a/cmd/node-data-broker-initc/main_test.go +++ b/cmd/node-data-broker-initc/main_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024-2025, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2024-2026, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,7 +18,11 @@ package main import ( "context" + "io" + "net/http" + "net/http/httptest" "testing" + "time" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" @@ -141,3 +145,106 @@ func TestMergeNodeAnnotations(t *testing.T) { }) } } + +func TestHealthHandler(t *testing.T) { + srv := httptest.NewServer(healthHandler()) + defer srv.Close() + + resp, err := http.Get(srv.URL + "/healthz") + require.NoError(t, err) + defer func() { _ = resp.Body.Close() }() + + require.Equal(t, http.StatusOK, resp.StatusCode) + body, err := io.ReadAll(resp.Body) + require.NoError(t, err) + require.Equal(t, "ok", string(body)) +} + +func TestServeHealthShutsDownOnContextCancel(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + + done := make(chan error, 1) + go func() { + // port 0 lets the OS pick a free ephemeral port. + done <- serveHealth(ctx, 0) + }() + + time.Sleep(50 * time.Millisecond) + cancel() + + select { + case err := <-done: + require.NoError(t, err) + case <-time.After(5 * time.Second): + t.Fatal("serveHealth did not return after context cancellation") + } +} + +func TestRunRefreshLoopAppliesOnInterval(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var applyCount int + done := make(chan struct{}) + go func() { + runRefreshLoop(ctx, 20*time.Millisecond, func(context.Context) error { + applyCount++ + if applyCount >= 2 { + close(done) + } + return nil + }) + }() + + select { + case <-done: + require.GreaterOrEqual(t, applyCount, 2) + case <-time.After(2 * time.Second): + t.Fatal("refresh loop did not invoke apply at least twice") + } +} + +func TestRunRefreshLoopContinuesAfterApplyError(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + var applyCount int + done := make(chan struct{}) + go func() { + runRefreshLoop(ctx, 20*time.Millisecond, func(context.Context) error { + applyCount++ + if applyCount == 1 { + return context.Canceled + } + close(done) + return nil + }) + }() + + select { + case <-done: + require.GreaterOrEqual(t, applyCount, 2) + case <-time.After(2 * time.Second): + t.Fatal("refresh loop stopped after apply error") + } +} + +func TestRunRefreshLoopStopsOnContextCancel(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + + done := make(chan struct{}) + go func() { + runRefreshLoop(ctx, time.Hour, func(context.Context) error { + return nil + }) + close(done) + }() + + cancel() + + select { + case <-done: + case <-time.After(time.Second): + t.Fatal("refresh loop did not return after context cancellation") + } +} diff --git a/docs/providers/infiniband.md b/docs/providers/infiniband.md index 16c327ea..477f7d22 100644 --- a/docs/providers/infiniband.md +++ b/docs/providers/infiniband.md @@ -123,14 +123,20 @@ global: name: k8s ``` -When `useGpuCliqueLabel` is not set, the node-data-broker init container uses the GPU Operator device-plugin DaemonSet as before. To override the GPU Operator namespace or device plugin DaemonSet name (defaults: `gpu-operator` and `nvidia-device-plugin-daemonset`), set these via `node-data-broker.initc.extraArgs` in your Helm values — they are init container arguments, not provider request parameters: +When `useGpuCliqueLabel` is not set, the node-data-broker uses the GPU Operator device-plugin DaemonSet as before. To override the GPU Operator namespace or device plugin DaemonSet name (defaults: `gpu-operator` and `nvidia-device-plugin-daemonset`), set these via `node-data-broker.extraArgs` in your Helm values — they are node-data-broker arguments, not provider request parameters: ```yaml node-data-broker: - initc: - extraArgs: - - gpu-operator-namespace=my-namespace - - device-plugin-daemonset=my-daemonset + extraArgs: + - gpu-operator-namespace=my-namespace + - device-plugin-daemonset=my-daemonset +``` + +By default the node-data-broker re-applies node annotations every 5 minutes after the initial startup apply. Adjust or disable periodic refresh with `node-data-broker.refreshInterval` (set to `0` to apply only at pod start): + +```yaml +node-data-broker: + refreshInterval: 10m ``` If `ibnetdiscover` needs extra config files, the chart can render ConfigMaps and mount them into the node-data-broker pods: From 57e4ff7d7eb4cef24523132afbd935a7edad268f Mon Sep 17 00:00:00 2001 From: Giulio Calzolari Date: Sat, 27 Jun 2026 16:39:35 +0200 Subject: [PATCH 2/4] fix(node-data-broker): address Greptile review on PR #368 Reuse a single in-cluster clientset for startup and periodic annotation refresh instead of constructing one on every apply. Fix a race in refresh-loop tests where a third ticker tick could close the done channel twice; gate shutdown with sync.Once and cancel the loop immediately when the test condition is met. Signed-off-by: Giulio Calzolari --- cmd/node-data-broker-initc/main.go | 71 ++++++++++++++++--------- cmd/node-data-broker-initc/main_test.go | 21 +++++++- 2 files changed, 66 insertions(+), 26 deletions(-) diff --git a/cmd/node-data-broker-initc/main.go b/cmd/node-data-broker-initc/main.go index c748b21c..2db047c2 100644 --- a/cmd/node-data-broker-initc/main.go +++ b/cmd/node-data-broker-initc/main.go @@ -52,6 +52,14 @@ const ( shutdownTimeout = 5 * time.Second ) +type nodeBroker struct { + clientset kubernetes.Interface + config *rest.Config + provider string + sets []string + nodeName string +} + func main() { var provider string var ver bool @@ -84,12 +92,25 @@ func mainInternal(provider string, sets []string, port int, refreshInterval time ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM) defer stop() - if err := applyNodeAnnotations(ctx, provider, sets); err != nil { + clientset, config, err := newInClusterClientset() + if err != nil { + return err + } + + broker := &nodeBroker{ + clientset: clientset, + config: config, + provider: provider, + sets: sets, + nodeName: os.Getenv("NODE_NAME"), + } + + if err := broker.apply(ctx); err != nil { return err } if refreshInterval > 0 { - go refreshNodeAnnotations(ctx, provider, sets, refreshInterval) + go refreshNodeAnnotations(ctx, broker, refreshInterval) } // Keep the DaemonSet pod Running by serving a health endpoint until the pod @@ -97,13 +118,27 @@ func mainInternal(provider string, sets []string, port int, refreshInterval time return serveHealth(ctx, port) } +func newInClusterClientset() (kubernetes.Interface, *rest.Config, error) { + config, err := rest.InClusterConfig() + if err != nil { + return nil, nil, fmt.Errorf("failed to load in-cluster config: %v", err) + } + + clientset, err := kubernetes.NewForConfig(config) + if err != nil { + return nil, nil, fmt.Errorf("failed to create clientset: %v", err) + } + + return clientset, config, nil +} + type annotationApplier func(ctx context.Context) error // refreshNodeAnnotations re-applies node annotations on a fixed interval until // the context is cancelled. Failures are logged but do not terminate the pod. -func refreshNodeAnnotations(ctx context.Context, provider string, sets []string, interval time.Duration) { +func refreshNodeAnnotations(ctx context.Context, broker *nodeBroker, interval time.Duration) { runRefreshLoop(ctx, interval, func(ctx context.Context) error { - return applyNodeAnnotations(ctx, provider, sets) + return broker.apply(ctx) }) } @@ -123,40 +158,28 @@ func runRefreshLoop(ctx context.Context, interval time.Duration, apply annotatio } } -func applyNodeAnnotations(ctx context.Context, provider string, sets []string) error { - klog.InfoS("Applying node annotations", "provider", provider, "extras", sets) +func (b *nodeBroker) apply(ctx context.Context) error { + klog.InfoS("Applying node annotations", "provider", b.provider, "extras", b.sets) - extras, err := getExtras(sets) + extras, err := getExtras(b.sets) if err != nil { return err } - config, err := rest.InClusterConfig() - if err != nil { - return fmt.Errorf("failed to load in-cluster config: %v", err) - } - - clientset, err := kubernetes.NewForConfig(config) - if err != nil { - return fmt.Errorf("failed to create clientset: %v", err) - } - - nodeName := os.Getenv("NODE_NAME") - - annotations, err := getAnnotations(ctx, clientset, config, provider, nodeName, extras) + annotations, err := getAnnotations(ctx, b.clientset, b.config, b.provider, b.nodeName, extras) if err != nil { return err } - klog.Infof("adding annotations %v in node %s for provider %s", annotations, nodeName, provider) + klog.Infof("adding annotations %v in node %s for provider %s", annotations, b.nodeName, b.provider) - node, err := clientset.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + node, err := b.clientset.CoreV1().Nodes().Get(ctx, b.nodeName, metav1.GetOptions{}) if err != nil { - return fmt.Errorf("failed to get node %q: %v", nodeName, err) + return fmt.Errorf("failed to get node %q: %v", b.nodeName, err) } mergeNodeAnnotations(node, annotations) - _, err = clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) + _, err = b.clientset.CoreV1().Nodes().Update(ctx, node, metav1.UpdateOptions{}) if err != nil { return fmt.Errorf("failed to update node: %v", err) } diff --git a/cmd/node-data-broker-initc/main_test.go b/cmd/node-data-broker-initc/main_test.go index 89c41fb0..69388ad8 100644 --- a/cmd/node-data-broker-initc/main_test.go +++ b/cmd/node-data-broker-initc/main_test.go @@ -21,6 +21,7 @@ import ( "io" "net/http" "net/http/httptest" + "sync" "testing" "time" @@ -186,11 +187,19 @@ func TestRunRefreshLoopAppliesOnInterval(t *testing.T) { var applyCount int done := make(chan struct{}) + var finishOnce sync.Once + finish := func() { + finishOnce.Do(func() { + cancel() + close(done) + }) + } + go func() { runRefreshLoop(ctx, 20*time.Millisecond, func(context.Context) error { applyCount++ if applyCount >= 2 { - close(done) + finish() } return nil }) @@ -210,13 +219,21 @@ func TestRunRefreshLoopContinuesAfterApplyError(t *testing.T) { var applyCount int done := make(chan struct{}) + var finishOnce sync.Once + finish := func() { + finishOnce.Do(func() { + cancel() + close(done) + }) + } + go func() { runRefreshLoop(ctx, 20*time.Millisecond, func(context.Context) error { applyCount++ if applyCount == 1 { return context.Canceled } - close(done) + finish() return nil }) }() From d0214af0346e4803e2e183df9b8a7988217bdc01 Mon Sep 17 00:00:00 2001 From: Giulio Calzolari Date: Sun, 28 Jun 2026 15:47:23 +0200 Subject: [PATCH 3/4] build(docker): fold ibnetdiscover into main image via rdma-core Install rdma-core in the Alpine runtime image so node-data-broker and infiniband-k8s no longer need ghcr.io/nvidia/topograph/ib. Remove Dockerfile.ib, the docker-ib workflow, and /ib overrides from Helm examples and docs. Signed-off-by: Giulio Calzolari --- .claude/CLAUDE.md | 3 +- .github/workflows/docker-ib.yml | 70 ------------------- AGENTS.md | 3 +- Dockerfile | 2 + Dockerfile.ib | 8 --- charts/topograph/README.md | 2 +- .../render_snapshot_test.yaml.snap | 6 +- ...p-federated-workload-identity-example.yaml | 4 -- ...alues.k8s.gcp-service-account-example.yaml | 4 -- charts/topograph/values.k8s.ib-example.yaml | 4 -- .../values.slinky.ib.block-example.yaml | 11 --- docs/engines/k8s.md | 2 +- 12 files changed, 9 insertions(+), 110 deletions(-) delete mode 100644 .github/workflows/docker-ib.yml delete mode 100644 Dockerfile.ib diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md index 255dc414..b251d2c3 100644 --- a/.claude/CLAUDE.md +++ b/.claude/CLAUDE.md @@ -65,7 +65,7 @@ These structures propagate across every provider and engine. Changing them in a - **make** - **golangci-lint** — `brew install golangci-lint` or via `go install` - **helm 3.10+ or 4.x** — required for `make chart-test`; the `helm-unittest` plugin is installed automatically by the target (`brew install helm`). CI pins helm `v4.1.1` in `.github/workflows/chart-test.yaml`. -- **docker** — only for container image builds and the IB variant +- **docker** — for container image builds (the main image includes `rdma-core` / `ibnetdiscover` for InfiniBand deployments) ### Clone and build @@ -107,7 +107,6 @@ Coverage checks run on pull requests. A drop below target with no matching uplif - `.github/workflows/go.yml` — build, test, and lint on every push and PR - `.github/workflows/chart-test.yaml` — Helm chart lint + helm-unittest suites (`make chart-test`) on every push and PR - `.github/workflows/docker.yml` — container image build (manual trigger) -- `.github/workflows/docker-ib.yml` — InfiniBand-variant container (manual trigger) - `.github/workflows/helm-release.yaml` — Helm chart release (manual trigger) ### Deployment surfaces diff --git a/.github/workflows/docker-ib.yml b/.github/workflows/docker-ib.yml deleted file mode 100644 index 1fbc5e85..00000000 --- a/.github/workflows/docker-ib.yml +++ /dev/null @@ -1,70 +0,0 @@ -name: Docker-ib - -# This workflow uses actions that are not certified by GitHub. -# They are provided by a third-party and are governed by -# separate terms of service, privacy policy, and support -# documentation. - -on: workflow_dispatch - -env: - REGISTRY: ghcr.io - # github.repository as / - IMAGE_NAME: ${{ github.repository }}/ib - -jobs: - build: - runs-on: linux-amd64-cpu16 - - steps: - - name: Checkout repository - uses: actions/checkout@v5 - - # Login against a Docker registry except on PR - # https://github.com/docker/login-action - - name: Log into registry ${{ env.REGISTRY }} - uses: docker/login-action@v3 - with: - registry: ${{ env.REGISTRY }} - username: ${{ github.actor }} - password: ${{ secrets.GITHUB_TOKEN }} - - # Extract metadata (tags, labels) for Docker - # https://github.com/docker/metadata-action - - name: Extract Docker metadata - id: meta - uses: docker/metadata-action@v5 - with: - images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} - tags: | - type=semver,pattern={{version}} - type=semver,pattern={{major}}.{{minor}} - type=semver,pattern={{major}} - type=ref,event=branch - type=sha,priority=100,prefix=,suffix=,format=short - - # Set up QEMU for cross-platform builds - - name: Set up QEMU - uses: docker/setup-qemu-action@v3 - - # Set up Docker Buildx - - name: Set up Docker Buildx - uses: docker/setup-buildx-action@v3 - with: - buildkitd-config: /etc/buildkit/buildkitd.toml - - # Build and push Docker image with Buildx (don't push on PR) - # https://github.com/docker/build-push-action - - name: Build and push Docker image - id: build-and-push - uses: docker/build-push-action@v6 - with: - context: . - file: ./Dockerfile.ib - push: ${{ github.event_name != 'pull_request' }} - #build-args: | - # TARGETOS=linux - # TARGETARCH=amd64 - platforms: linux/amd64,linux/arm64 - tags: ${{ steps.meta.outputs.tags }} - labels: ${{ steps.meta.outputs.labels }} diff --git a/AGENTS.md b/AGENTS.md index f6ab9f99..ed16908d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -65,7 +65,7 @@ These structures propagate across every provider and engine. Changing them in a - **make** - **golangci-lint** — `brew install golangci-lint` or via `go install` - **helm 3.10+ or 4.x** — required for `make chart-test`; the `helm-unittest` plugin is installed automatically by the target (`brew install helm`). CI pins helm `v4.1.1` in `.github/workflows/chart-test.yaml`. -- **docker** — only for container image builds and the IB variant +- **docker** — for container image builds (the main image includes `rdma-core` / `ibnetdiscover` for InfiniBand deployments) ### Clone and build @@ -107,7 +107,6 @@ Coverage checks run on pull requests. A drop below target with no matching uplif - `.github/workflows/go.yml` — build, test, and lint on every push and PR - `.github/workflows/chart-test.yaml` — Helm chart lint + helm-unittest suites (`make chart-test`) on every push and PR - `.github/workflows/docker.yml` — container image build (manual trigger) -- `.github/workflows/docker-ib.yml` — InfiniBand-variant container (manual trigger) - `.github/workflows/helm-release.yaml` — Helm chart release (manual trigger) ### Deployment surfaces diff --git a/Dockerfile b/Dockerfile index 6abdc876..44a8444c 100644 --- a/Dockerfile +++ b/Dockerfile @@ -10,4 +10,6 @@ RUN make build-${TARGETOS}-${TARGETARCH} FROM alpine:3 +RUN apk add --no-cache rdma-core + COPY --from=builder /go/src/github.com/NVIDIA/topograph/bin/* /usr/local/bin/ diff --git a/Dockerfile.ib b/Dockerfile.ib deleted file mode 100644 index 298545e7..00000000 --- a/Dockerfile.ib +++ /dev/null @@ -1,8 +0,0 @@ -FROM ubuntu:24.04 - -# Install dependencies and ibnetdiscover -RUN apt-get update && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ - infiniband-diags ibutils \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* diff --git a/charts/topograph/README.md b/charts/topograph/README.md index 2474e0ec..292a15ed 100644 --- a/charts/topograph/README.md +++ b/charts/topograph/README.md @@ -93,7 +93,7 @@ Both test pods are removed automatically on success (`helm.sh/hook-delete-policy By default, the test pods reuse the main topograph image. Topograph's default image is Alpine-based and ships with busybox `wget`, which the test probes use — so `helm test` works without pulling any additional image, including in air-gapped environments where only mirrored images are reachable. -If you run a topograph image variant without busybox `wget` (for example, the IB variant built on `ubuntu`), override the test image to point at one that does, via `tests.image.repository` and `tests.image.tag`. You can also disable the tests entirely with `tests.enabled=false`. +If your mirrored image lacks busybox `wget`, override the test image to point at one that does, via `tests.image.repository` and `tests.image.tag`. You can also disable the tests entirely with `tests.enabled=false`. ## Subcharts diff --git a/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap b/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap index 21de5f0d..1e531c6c 100644 --- a/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap +++ b/charts/topograph/tests/__snapshot__/render_snapshot_test.yaml.snap @@ -1016,7 +1016,7 @@ renders values.k8s.gcp-federated-workload-identity-example.yaml: valueFrom: fieldRef: fieldPath: spec.nodeName - image: ghcr.io/nvidia/topograph/ib:main + image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent livenessProbe: httpGet: @@ -1519,7 +1519,7 @@ renders values.k8s.gcp-service-account-example.yaml: valueFrom: fieldRef: fieldPath: spec.nodeName - image: ghcr.io/nvidia/topograph/ib:main + image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent livenessProbe: httpGet: @@ -2011,7 +2011,7 @@ renders values.k8s.ib-example.yaml: valueFrom: fieldRef: fieldPath: spec.nodeName - image: ghcr.io/nvidia/topograph/ib:main + image: ghcr.io/nvidia/topograph:v0.0.0 imagePullPolicy: IfNotPresent livenessProbe: httpGet: diff --git a/charts/topograph/values.k8s.gcp-federated-workload-identity-example.yaml b/charts/topograph/values.k8s.gcp-federated-workload-identity-example.yaml index 3f0fcc7a..bbfa6c53 100644 --- a/charts/topograph/values.k8s.gcp-federated-workload-identity-example.yaml +++ b/charts/topograph/values.k8s.gcp-federated-workload-identity-example.yaml @@ -20,9 +20,5 @@ node-observer: brightcomputing.com/node-category: dgx node-data-broker: - image: - repository: ghcr.io/nvidia/topograph/ib - pullPolicy: IfNotPresent - tag: main nodeSelector: brightcomputing.com/node-category: dgx diff --git a/charts/topograph/values.k8s.gcp-service-account-example.yaml b/charts/topograph/values.k8s.gcp-service-account-example.yaml index abbd5cb3..286d99cb 100644 --- a/charts/topograph/values.k8s.gcp-service-account-example.yaml +++ b/charts/topograph/values.k8s.gcp-service-account-example.yaml @@ -18,9 +18,5 @@ node-observer: brightcomputing.com/node-category: dgx node-data-broker: - image: - repository: ghcr.io/nvidia/topograph/ib - pullPolicy: IfNotPresent - tag: main nodeSelector: brightcomputing.com/node-category: dgx diff --git a/charts/topograph/values.k8s.ib-example.yaml b/charts/topograph/values.k8s.ib-example.yaml index c9eff711..c1a5a975 100644 --- a/charts/topograph/values.k8s.ib-example.yaml +++ b/charts/topograph/values.k8s.ib-example.yaml @@ -13,10 +13,6 @@ node-observer: nvidia.com/gpu.present: "true" node-data-broker: - image: - repository: ghcr.io/nvidia/topograph/ib - pullPolicy: IfNotPresent - tag: main securityContext: privileged: true nodeSelector: diff --git a/charts/topograph/values.slinky.ib.block-example.yaml b/charts/topograph/values.slinky.ib.block-example.yaml index d9d4c1e5..9c507c16 100644 --- a/charts/topograph/values.slinky.ib.block-example.yaml +++ b/charts/topograph/values.slinky.ib.block-example.yaml @@ -70,17 +70,6 @@ node-observer: app.kubernetes.io/name: slurmd node-data-broker: - image: - # The IB image includes the runtime tooling needed by the infiniband-k8s - # provider, including ibnetdiscover. - repository: ghcr.io/nvidia/topograph/ib - pullPolicy: IfNotPresent - tag: main - initc: - # Annotates each node with Topograph instance/region metadata used by the - # engines. With useGpuCliqueLabel=true, it skips NVLink clique collection - # through the GPU Operator device-plugin DaemonSet. - enabled: true securityContext: # Required so ibnetdiscover in the broker pod can access host IB devices. privileged: true diff --git a/docs/engines/k8s.md b/docs/engines/k8s.md index dc8a1915..db047a7b 100644 --- a/docs/engines/k8s.md +++ b/docs/engines/k8s.md @@ -328,7 +328,7 @@ Phase: Succeeded Both pods clean themselves up on success (`helm.sh/hook-delete-policy: before-hook-creation,hook-succeeded`). On failure the pods persist so operators can inspect logs via `kubectl logs -n `; the next `helm test` invocation replaces the prior pods. -**Air-gapped environments.** The test pods reuse the main topograph image by default — they invoke `busybox wget` from the Alpine-based `ghcr.io/nvidia/topograph` image already pulled by the Deployment. No additional image pull is required by `helm test`, so the suite works in environments where only mirrored images are reachable. Operators running a topograph image variant without `busybox wget` (notably the ubuntu-based IB variant built from `Dockerfile.ib`) can either override the test image: +**Air-gapped environments.** The test pods reuse the main topograph image by default — they invoke `busybox wget` from the Alpine-based `ghcr.io/nvidia/topograph` image already pulled by the Deployment. No additional image pull is required by `helm test`, so the suite works in environments where only mirrored images are reachable. If your mirrored image lacks `busybox wget`, override the test image: ```yaml tests: From 0ea7c82bda7eee7d1464b0307b2c391bd74d317f Mon Sep 17 00:00:00 2001 From: Giulio Calzolari Date: Sun, 28 Jun 2026 15:49:25 +0200 Subject: [PATCH 4/4] docs(infiniband): drop stale IB image references Document that the default topograph image includes ibnetdiscover via rdma-core, fix the node-data-broker init-container wording, and remove the obsolete IB/ubuntu variant note from chart values comments. Signed-off-by: Giulio Calzolari --- charts/topograph/values.yaml | 5 ++--- docs/providers/infiniband.md | 3 ++- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/charts/topograph/values.yaml b/charts/topograph/values.yaml index 9f2a49ec..598decc2 100644 --- a/charts/topograph/values.yaml +++ b/charts/topograph/values.yaml @@ -203,9 +203,8 @@ tests: # By default, test pods reuse the main topograph image. Topograph's # default image is Alpine-based and ships with busybox wget, which the # test probes use -- no additional image pull is needed, including in - # air-gapped environments. If you run a topograph image variant without - # busybox wget (for example, the IB variant built on ubuntu), override - # either field to point at an image that ships wget. Leave both empty + # air-gapped environments. If your mirrored image lacks busybox wget, + # override either field to point at an image that ships wget. Leave both empty # to reuse the main image. image: repository: "" diff --git a/docs/providers/infiniband.md b/docs/providers/infiniband.md index 477f7d22..82009a58 100644 --- a/docs/providers/infiniband.md +++ b/docs/providers/infiniband.md @@ -79,6 +79,7 @@ For the Slurm engine, verify the generated `topology.conf` reflects the expected ### Prerequisites - Topograph deployed via Helm — the node-data-broker DaemonSet (a Topograph subchart, enabled by default) collects NVLink clique IDs from each node and stores them as Kubernetes node annotations (`topograph.nvidia.com/cluster-id`). If `useGpuCliqueLabel` is enabled, Topograph reads `nvidia.com/gpu.clique` directly instead and the node-data-broker skips NVLink clique collection. +- The default **`ghcr.io/nvidia/topograph`** image includes **`ibnetdiscover`** (Alpine `rdma-core`). No separate InfiniBand image is required. IB deployments typically run the broker **privileged** and mount host **`/sys/class`** so `ibnetdiscover` can reach IB devices — see [`values.k8s.ib-example.yaml`](../../charts/topograph/values.k8s.ib-example.yaml). - NVIDIA GPU Operator — standard on NVIDIA GPU Kubernetes clusters; manages the device plugin DaemonSet used to read NVLink clique IDs. Required only for NVLink domain discovery; on clusters without NVLink-connected GPUs this does not apply and the provider will still discover the IB switch tree. ### How It Works @@ -111,7 +112,7 @@ The following optional parameter can be passed in the topology request payload: | `nodeSelector` | `map[string]string` | — | Label selector to filter which nodes participate in topology discovery | | `useGpuCliqueLabel` | `bool` | `false` | Use `nvidia.com/gpu.clique` as the accelerator-domain ID source instead of the `topograph.nvidia.com/cluster-id` annotation. | -With Helm, configure `useGpuCliqueLabel` under `global.provider.params`. The chart also passes it to the node-data-broker init container so it skips NVLink clique collection instead of exec-ing into the GPU Operator device-plugin DaemonSet to run `nvidia-smi`: +With Helm, configure `useGpuCliqueLabel` under `global.provider.params`. The chart also passes it to the node-data-broker container so it skips NVLink clique collection instead of exec-ing into the GPU Operator device-plugin DaemonSet to run `nvidia-smi`: ```yaml global: