Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion charts/topograph/templates/rbac.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
{{- if .Values.serviceAccount.create -}}
{{- /*
pods/exec is needed by providers that exec inside pods, and by Slinky only for
legacy partition discovery through `scontrol show partition` in a login pod.
legacy partition discovery through `scontrol show partition` in the controller
pod (falling back to a login pod when present).
Slinky skips that path when dynamic nodes are enabled or when a topology entry
already supplies nodes, a pod selector, or the default flat topology.
*/ -}}
Expand Down
2 changes: 1 addition & 1 deletion docs/engines/slinky.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ When per-partition topologies are configured, each entry may declare how its nod
|---|---|
| `nodes` | Explicit SLURM node list. Takes precedence over `podSelector`. |
| `podSelector` | Kubernetes `LabelSelector` matching the slurmd pods in the partition. The engine lists pods in the engine's `namespace`, filters to `Ready` pods, and reads each pod's SLURM name from the `slurm.node.name` label (falling back to `pod.spec.hostname`). |
| _neither_ | The engine falls back to running `scontrol show partition <name>` inside a login pod (legacy behavior). |
| _neither_ | The engine falls back to running `scontrol show partition <name>` inside the controller pod, or a login pod when no controller pod is running (legacy behavior). The controller (`app.kubernetes.io/component: controller`) is always present; login pods are optional. |

`nodes` and `podSelector` are mutually exclusive on the same entry; configuring both returns a validation error at engine load time.

Expand Down
43 changes: 28 additions & 15 deletions pkg/engines/slinky/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,13 @@ const (
ConfigUpdateModeSkeletonOnly = "skeleton-only"

dynamicShowPartitionNodes = "\tNodes=NONE"

// Slinky component labels used to locate a pod that can run
// `scontrol show partition`. The controller is always present; login pods
// are optional.
slurmComponentLabelKey = "app.kubernetes.io/component"
slurmComponentController = "controller"
slurmComponentLogin = "login"
)

type SlinkyEngine struct {
Expand Down Expand Up @@ -443,27 +450,33 @@ func (eng *SlinkyEngine) getPartitionNodes(ctx context.Context, partition string
return "", fmt.Errorf("getPartitionNodes expects a string parameter")
}

labels := map[string]string{"app.kubernetes.io/component": "login"}
pods, err := k8s.GetPodsByLabels(ctx, eng.client, namespace, labels)
if err != nil {
return "", err
}

for _, pod := range pods.Items {
if pod.Status.Phase != corev1.PodRunning {
continue
}

cmd := []string{"scontrol", "show", "partition", partition}
buf, err := k8s.ExecInPod(ctx, eng.client, eng.config, pod.Name, pod.Namespace, cmd)
// A Slinky cluster always runs a controller (slurmctld) but login pods are
// optional, so prefer the controller for `scontrol show partition` and fall
// back to a login pod when one is present.
for _, component := range []string{slurmComponentController, slurmComponentLogin} {
labels := map[string]string{slurmComponentLabelKey: component}
pods, err := k8s.GetPodsByLabels(ctx, eng.client, namespace, labels)
if err != nil {
return "", err
}
Comment on lines 459 to 461

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 API error on controller listing aborts the login fallback

If GetPodsByLabels returns a non-nil error for the controller component (e.g., an RBAC misconfiguration that allows listing login pods but not controller pods), the function returns immediately and never attempts the login fallback. The original code had symmetric behavior for login pods, so this isn't a new class of failure, but the addition of the controller leg means a new RBAC permission (list pods scoped to controller) is now exercised first. A cluster that previously worked with only login-pod discovery will silently break if the service account lacks permission to list controller pods, even though a login pod is available.


return buf.String(), nil
for _, pod := range pods.Items {
if pod.Status.Phase != corev1.PodRunning {
continue
}

cmd := []string{"scontrol", "show", "partition", partition}
buf, err := k8s.ExecInPod(ctx, eng.client, eng.config, pod.Name, pod.Namespace, cmd)
if err != nil {
return "", err
}

return buf.String(), nil
}
}

return "", fmt.Errorf("no running pods with labels %v", labels)
return "", fmt.Errorf("no running %s or %s pods found for partition discovery in namespace %q",
slurmComponentController, slurmComponentLogin, namespace)
}

func (eng *SlinkyEngine) performReconciliation(ctx context.Context, nt *translate.NetworkTopology, topologies []*translate.TopologyUnit) *httperr.Error {
Expand Down
70 changes: 70 additions & 0 deletions pkg/engines/slinky/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,76 @@ func slurmTopologiesForDynamicTest(plugins []string) map[string]*Topology {
return out
}

func TestGetPartitionNodes(t *testing.T) {
const namespace = "slurm"

// A controller pod that is not running, so getPartitionNodes never reaches
// ExecInPod and exercises the no-running-pods path deterministically.
pendingControllerClient := func() *fake.Clientset {
return fake.NewSimpleClientset(&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "slurm-controller-0",
Namespace: namespace,
Labels: map[string]string{slurmComponentLabelKey: slurmComponentController},
},
Status: corev1.PodStatus{Phase: corev1.PodPending},
})
}

testCases := []struct {
name string
useDynamicNodes bool
client *fake.Clientset
params []any
want string
errMsg string
}{
{
name: "dynamic nodes short-circuits without listing pods",
useDynamicNodes: true,
client: fake.NewSimpleClientset(),
params: []any{namespace},
want: dynamicShowPartitionNodes,
},
{
name: "wrong parameter count",
client: fake.NewSimpleClientset(),
params: []any{namespace, "extra"},
errMsg: "expects a namespace as a parameter",
},
{
name: "non-string parameter",
client: fake.NewSimpleClientset(),
params: []any{42},
errMsg: "expects a string parameter",
},
{
name: "no running controller or login pods",
client: pendingControllerClient(),
params: []any{namespace},
errMsg: "no running controller or login pods found for partition discovery",
},
}

for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
eng := &SlinkyEngine{
client: tc.client,
params: &Params{Namespace: namespace, UseDynamicNodes: tc.useDynamicNodes},
}

got, err := eng.getPartitionNodes(context.Background(), "gpu", tc.params)
if tc.errMsg != "" {
require.Error(t, err)
require.Contains(t, err.Error(), tc.errMsg)
return
}
require.NoError(t, err)
require.Equal(t, tc.want, got)
})
}
}

func TestGenerateDynamicNodesOutput(t *testing.T) {
slinkyPodSel := metav1.LabelSelector{MatchLabels: map[string]string{"app": "slinky"}}

Expand Down
Loading