diff --git a/charts/topograph/templates/rbac.yaml b/charts/topograph/templates/rbac.yaml index e8fd00a5..d0860bd1 100644 --- a/charts/topograph/templates/rbac.yaml +++ b/charts/topograph/templates/rbac.yaml @@ -1,7 +1,8 @@ {{- if .Values.serviceAccount.create -}} {{- /* pods/exec is needed by providers that exec inside pods, and by Slinky only for -legacy partition discovery through `scontrol show partition` in a login pod. +legacy partition discovery through `scontrol show partition` in the controller +pod (falling back to a login pod when present). Slinky skips that path when dynamic nodes are enabled or when a topology entry already supplies nodes, a pod selector, or the default flat topology. */ -}} diff --git a/docs/engines/slinky.md b/docs/engines/slinky.md index e6bc62a8..8515a1ea 100644 --- a/docs/engines/slinky.md +++ b/docs/engines/slinky.md @@ -49,7 +49,7 @@ When per-partition topologies are configured, each entry may declare how its nod |---|---| | `nodes` | Explicit SLURM node list. Takes precedence over `podSelector`. | | `podSelector` | Kubernetes `LabelSelector` matching the slurmd pods in the partition. The engine lists pods in the engine's `namespace`, filters to `Ready` pods, and reads each pod's SLURM name from the `slurm.node.name` label (falling back to `pod.spec.hostname`). | -| _neither_ | The engine falls back to running `scontrol show partition ` inside a login pod (legacy behavior). | +| _neither_ | The engine falls back to running `scontrol show partition ` inside the controller pod, or a login pod when no controller pod is running (legacy behavior). The controller (`app.kubernetes.io/component: controller`) is always present; login pods are optional. | `nodes` and `podSelector` are mutually exclusive on the same entry; configuring both returns a validation error at engine load time. diff --git a/pkg/engines/slinky/engine.go b/pkg/engines/slinky/engine.go index 30f18f83..04b17ae8 100644 --- a/pkg/engines/slinky/engine.go +++ b/pkg/engines/slinky/engine.go @@ -50,6 +50,13 @@ const ( ConfigUpdateModeSkeletonOnly = "skeleton-only" dynamicShowPartitionNodes = "\tNodes=NONE" + + // Slinky component labels used to locate a pod that can run + // `scontrol show partition`. The controller is always present; login pods + // are optional. + slurmComponentLabelKey = "app.kubernetes.io/component" + slurmComponentController = "controller" + slurmComponentLogin = "login" ) type SlinkyEngine struct { @@ -443,27 +450,33 @@ func (eng *SlinkyEngine) getPartitionNodes(ctx context.Context, partition string return "", fmt.Errorf("getPartitionNodes expects a string parameter") } - labels := map[string]string{"app.kubernetes.io/component": "login"} - pods, err := k8s.GetPodsByLabels(ctx, eng.client, namespace, labels) - if err != nil { - return "", err - } - - for _, pod := range pods.Items { - if pod.Status.Phase != corev1.PodRunning { - continue - } - - cmd := []string{"scontrol", "show", "partition", partition} - buf, err := k8s.ExecInPod(ctx, eng.client, eng.config, pod.Name, pod.Namespace, cmd) + // A Slinky cluster always runs a controller (slurmctld) but login pods are + // optional, so prefer the controller for `scontrol show partition` and fall + // back to a login pod when one is present. + for _, component := range []string{slurmComponentController, slurmComponentLogin} { + labels := map[string]string{slurmComponentLabelKey: component} + pods, err := k8s.GetPodsByLabels(ctx, eng.client, namespace, labels) if err != nil { return "", err } - return buf.String(), nil + for _, pod := range pods.Items { + if pod.Status.Phase != corev1.PodRunning { + continue + } + + cmd := []string{"scontrol", "show", "partition", partition} + buf, err := k8s.ExecInPod(ctx, eng.client, eng.config, pod.Name, pod.Namespace, cmd) + if err != nil { + return "", err + } + + return buf.String(), nil + } } - return "", fmt.Errorf("no running pods with labels %v", labels) + return "", fmt.Errorf("no running %s or %s pods found for partition discovery in namespace %q", + slurmComponentController, slurmComponentLogin, namespace) } func (eng *SlinkyEngine) performReconciliation(ctx context.Context, nt *translate.NetworkTopology, topologies []*translate.TopologyUnit) *httperr.Error { diff --git a/pkg/engines/slinky/engine_test.go b/pkg/engines/slinky/engine_test.go index 9d56ec54..929093d4 100644 --- a/pkg/engines/slinky/engine_test.go +++ b/pkg/engines/slinky/engine_test.go @@ -502,6 +502,76 @@ func slurmTopologiesForDynamicTest(plugins []string) map[string]*Topology { return out } +func TestGetPartitionNodes(t *testing.T) { + const namespace = "slurm" + + // A controller pod that is not running, so getPartitionNodes never reaches + // ExecInPod and exercises the no-running-pods path deterministically. + pendingControllerClient := func() *fake.Clientset { + return fake.NewSimpleClientset(&corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "slurm-controller-0", + Namespace: namespace, + Labels: map[string]string{slurmComponentLabelKey: slurmComponentController}, + }, + Status: corev1.PodStatus{Phase: corev1.PodPending}, + }) + } + + testCases := []struct { + name string + useDynamicNodes bool + client *fake.Clientset + params []any + want string + errMsg string + }{ + { + name: "dynamic nodes short-circuits without listing pods", + useDynamicNodes: true, + client: fake.NewSimpleClientset(), + params: []any{namespace}, + want: dynamicShowPartitionNodes, + }, + { + name: "wrong parameter count", + client: fake.NewSimpleClientset(), + params: []any{namespace, "extra"}, + errMsg: "expects a namespace as a parameter", + }, + { + name: "non-string parameter", + client: fake.NewSimpleClientset(), + params: []any{42}, + errMsg: "expects a string parameter", + }, + { + name: "no running controller or login pods", + client: pendingControllerClient(), + params: []any{namespace}, + errMsg: "no running controller or login pods found for partition discovery", + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + eng := &SlinkyEngine{ + client: tc.client, + params: &Params{Namespace: namespace, UseDynamicNodes: tc.useDynamicNodes}, + } + + got, err := eng.getPartitionNodes(context.Background(), "gpu", tc.params) + if tc.errMsg != "" { + require.Error(t, err) + require.Contains(t, err.Error(), tc.errMsg) + return + } + require.NoError(t, err) + require.Equal(t, tc.want, got) + }) + } +} + func TestGenerateDynamicNodesOutput(t *testing.T) { slinkyPodSel := metav1.LabelSelector{MatchLabels: map[string]string{"app": "slinky"}}