diff --git a/distros/kubernetes/nvsentinel/charts/fault-remediation/templates/configmap.yaml b/distros/kubernetes/nvsentinel/charts/fault-remediation/templates/configmap.yaml index d294fc0e8..1ef6a9b7f 100644 --- a/distros/kubernetes/nvsentinel/charts/fault-remediation/templates/configmap.yaml +++ b/distros/kubernetes/nvsentinel/charts/fault-remediation/templates/configmap.yaml @@ -47,6 +47,30 @@ data: supersedingEquivalenceGroups = {{ $config.supersedingEquivalenceGroups | toJson }} {{- end }} {{- end }} + + {{- range $componentClass, $actions := .Values.maintenance.componentActions }} + {{- range $actionName, $config := $actions }} + [componentRemediationActions.{{ $componentClass | quote }}.{{ $actionName | quote }}] + apiGroup = {{ $config.apiGroup | quote }} + version = {{ $config.version | quote }} + kind = {{ $config.kind | quote }} + scope = {{ $config.scope | default "Cluster" | quote }} + {{- if $config.namespace }} + namespace = {{ $config.namespace | quote }} + {{- end }} + completeConditionType = {{ $config.completeConditionType | default "NodeReady" | quote }} + templateFileName = {{ $config.templateFileName | quote }} + {{- if $config.equivalenceGroup }} + equivalenceGroup = {{ $config.equivalenceGroup | quote }} + {{- end }} + {{- if $config.impactedEntityScope }} + impactedEntityScope = {{ $config.impactedEntityScope | quote }} + {{- end }} + {{- if $config.supersedingEquivalenceGroups }} + supersedingEquivalenceGroups = {{ $config.supersedingEquivalenceGroups | toJson }} + {{- end }} + {{- end }} + {{- end }} [updateRetry] maxRetries = {{ .Values.updateRetry.maxRetries }} diff --git a/distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml b/distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml index 6c5b17f88..32a660843 100755 --- a/distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml +++ b/distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml @@ -88,11 +88,40 @@ maintenance: templateFileName: "terminate-node.yaml" equivalenceGroup: "terminate" - # NOTE: Resource names for RBAC are generated by appending 's' to lowercase kind. - # This works for regular nouns but may fail for irregular plurals: - # RebootNode → rebootnodes (✓ correct) - # Policy → policys (✗ should be policies) - # Use CRD kinds that follow regular pluralization rules. + # NOTE: Resource names for RBAC are generated by appending 's' to lowercase kind. + # This works for regular nouns but may fail for irregular plurals: + # RebootNode → rebootnodes (✓ correct) + # Policy → policys (✗ should be policies) + # Use CRD kinds that follow regular pluralization rules. + + # Optional component-specific remediation definitions. + # Resolution order is: + # 1. maintenance.componentActions[componentClass][recommendedAction] + # 2. maintenance.actions[recommendedAction] + # Example: + # componentActions: + # GPU: + # COMPONENT_RESET: + # apiGroup: "janitor.dgxc.nvidia.com" + # version: "v1alpha1" + # kind: "GPUReset" + # scope: "Cluster" + # completeConditionType: "Complete" + # templateFileName: "gpureset-template.yaml" + # equivalenceGroup: "reset" + # impactedEntityScope: "GPU_UUID" + # supersedingEquivalenceGroups: ["restart"] + # LPU: + # COMPONENT_RESET: + # apiGroup: "janitor.dgxc.nvidia.com" + # version: "v1alpha1" + # kind: "LPURemediation" + # scope: "Cluster" + # completeConditionType: "Complete" + # templateFileName: "lpu-remediation.yaml" + # equivalenceGroup: "reset" + # supersedingEquivalenceGroups: ["restart"] + componentActions: {} # Template content for each remediation action # Key matches the templateFileName name from actions above diff --git a/distros/kubernetes/nvsentinel/values-full.yaml b/distros/kubernetes/nvsentinel/values-full.yaml index 3a4efa03d..acb4f399e 100644 --- a/distros/kubernetes/nvsentinel/values-full.yaml +++ b/distros/kubernetes/nvsentinel/values-full.yaml @@ -682,6 +682,12 @@ fault-remediation: # Policy → policys (✗ should be policies) # Use CRD kinds that follow regular pluralization rules. + # Optional component-specific remediation definitions. + # Resolution order is: + # 1. maintenance.componentActions[componentClass][recommendedAction] + # 2. maintenance.actions[recommendedAction] + componentActions: {} + # Template content for each remediation action # Key matches the templateFileName name from actions above templates: diff --git a/docs/configuration/fault-remediation.md b/docs/configuration/fault-remediation.md index cbeec6686..af2687014 100644 --- a/docs/configuration/fault-remediation.md +++ b/docs/configuration/fault-remediation.md @@ -52,18 +52,46 @@ Defines the Custom Resource that will be created to trigger remediation actions. fault-remediation: maintenance: actions: - "COMPONENT_RESET": + "RESTART_VM": apiGroup: "janitor.dgxc.nvidia.com" version: "v1alpha1" - kind: "GPUReset" + kind: "RebootNode" scope: "Cluster" - completeConditionType: "Complete" - templateFileName: "gpureset-template.yaml" - equivalenceGroup: "reset" - supersedingEquivalenceGroups: ["restart"] - impactedEntityScope: "GPU_UUID" + completeConditionType: "NodeReady" + templateFileName: "rebootnode-template.yaml" + equivalenceGroup: "restart" + + componentActions: + "GPU": + "COMPONENT_RESET": + apiGroup: "janitor.dgxc.nvidia.com" + version: "v1alpha1" + kind: "GPUReset" + scope: "Cluster" + completeConditionType: "Complete" + templateFileName: "gpureset-template.yaml" + equivalenceGroup: "reset" + supersedingEquivalenceGroups: ["restart"] + impactedEntityScope: "GPU_UUID" + "LPU": + "COMPONENT_RESET": + apiGroup: "janitor.dgxc.nvidia.com" + version: "v1alpha1" + kind: "LPURemediation" + scope: "Cluster" + completeConditionType: "Complete" + templateFileName: "lpu-remediation-template.yaml" + equivalenceGroup: "reset" + supersedingEquivalenceGroups: ["restart"] templates: + "rebootnode-template.yaml": | + apiVersion: {{.ApiGroup}}/{{.Version}} + kind: RebootNode + metadata: + name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }} + spec: + nodeName: {{ .HealthEvent.NodeName }} "gpureset-template.yaml": | apiVersion: {{.ApiGroup}}/{{.Version}} kind: GPUReset @@ -74,10 +102,36 @@ fault-remediation: selector: uuids: - {{ .ImpactedEntityScopeValue }} + "lpu-remediation-template.yaml": | + apiVersion: {{.ApiGroup}}/{{.Version}} + kind: LPURemediation + metadata: + name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }} + spec: + nodeName: {{ .HealthEvent.NodeName }} + device: {{ index .HealthEvent.Metadata "device" }} ``` +### Resolution Order + +Fault Remediation resolves the maintenance resource in this order: + +1. `maintenance.componentActions[healthEvent.componentClass][healthEvent.recommendedAction]` +2. `maintenance.actions[healthEvent.recommendedAction]` + +This allows node-level actions such as `RESTART_VM` and `RESTART_BM` to remain shared while still letting a +component-specific action like `COMPONENT_RESET` resolve to different CR kinds for GPU and LPU events. + ### Parameters +#### actions +Shared remediation mappings keyed only by recommended action. Use this for actions that should behave the same across +component types, such as rebooting or terminating a node. + +#### componentActions +Optional component-specific remediation mappings keyed by component class first and recommended action second. Use this +when the same recommended action should create a different maintenance CR depending on the failing component type. + #### apiGroup API group of the maintenance CRD installed by your maintenance operator. @@ -100,10 +154,15 @@ Kubernetes namespace where maintenance CRs will be created. Defines which remediation actions are considered equivalent for deduplication. Actions in the same group will deduplicate against each other regardless of CRD type if a previous CRD is in a non-terminal state. #### supersedingEquivalenceGroups -Defines additional equivalence groups that are considered equivalent for deduplication. For example, the COMPONENT_RESET action in the reset group should be deduplicated with the RESTART_VM action in the restart group. In other words, rebooting a node will have the same effect as resetting a GPU whereas the inverse is not true. +Defines additional equivalence groups that are considered equivalent for deduplication. For example, the GPU +`COMPONENT_RESET` action in the `reset` group can be deduplicated with the `RESTART_VM` action in the `restart` group. +In other words, rebooting a node will have the same effect as resetting a GPU whereas the inverse is not true. #### impactedEntityScope -For the COMPONENT_RESET action, the impacted entity scope should be defined so that there's a unique equivalence group for each entity. The unique equivalence group is constructed by appending the value for the given impacted entity to the equivalence group name. For example, each GPU needing reset will be in its own equivalence group named like reset-. +For component-scoped reset actions such as GPU `COMPONENT_RESET`, the impacted entity scope should be defined so that +there's a unique equivalence group for each entity. The unique equivalence group is constructed by appending the value +for the given impacted entity to the equivalence group name. For example, each GPU needing reset will be in its own +equivalence group named like `reset-`. #### templates Go template that generates the maintenance CR YAML. See Template Extension Point section below. @@ -119,11 +178,11 @@ The maintenance template is a Go template that generates the Kubernetes CR YAML - `.HealthEvent` (HealthEvent) - The entire content of the triggering health event - `.RecommendedAction` (int) - Numeric action code from health event (see [health_event.proto](https://github.com/NVIDIA/NVSentinel/blob/main/data-models/protobufs/health_event.proto)) - `.RecommendedActionName` (string) - Action name from the health event -- `.ImpactedEntityScopeValue` (string) - The GPU_UUID used in COMPONENT_RESET remediation actions -- `.ApiGroup` (string) - Value from `maintenance.apiGroup` -- `.Version` (string) - Value from `maintenance.version` -- `.Kind` (string) - Value from `maintenance.kind` -- `.Namespace` (string) - Value from `maintenance.namespace` +- `.ImpactedEntityScopeValue` (string) - The impacted entity value used by component-scoped reset actions such as GPU `COMPONENT_RESET` +- `.ApiGroup` (string) - Value from the resolved maintenance resource +- `.Version` (string) - Value from the resolved maintenance resource +- `.Kind` (string) - Value from the resolved maintenance resource +- `.Namespace` (string) - Value from the resolved maintenance resource ### Template Examples diff --git a/docs/fault-remediation.md b/docs/fault-remediation.md index 3c2d5131f..7918059d0 100644 --- a/docs/fault-remediation.md +++ b/docs/fault-remediation.md @@ -49,16 +49,29 @@ fault-remediation: completeConditionType: "NodeReady" templateFileName: "rebootnode-template.yaml" equivalenceGroup: "restart" - "COMPONENT_RESET": - apiGroup: "janitor.dgxc.nvidia.com" - version: "v1alpha1" - kind: "GPUReset" - scope: "Cluster" - completeConditionType: "Complete" - templateFileName: "gpureset-template.yaml" - equivalenceGroup: "reset" - impactedEntityScope: "GPU_UUID" - supersedingEquivalenceGroups: ["restart"] + + componentActions: + "GPU": + "COMPONENT_RESET": + apiGroup: "janitor.dgxc.nvidia.com" + version: "v1alpha1" + kind: "GPUReset" + scope: "Cluster" + completeConditionType: "Complete" + templateFileName: "gpureset-template.yaml" + equivalenceGroup: "reset" + impactedEntityScope: "GPU_UUID" + supersedingEquivalenceGroups: ["restart"] + "LPU": + "COMPONENT_RESET": + apiGroup: "janitor.dgxc.nvidia.com" + version: "v1alpha1" + kind: "LPURemediation" + scope: "Cluster" + completeConditionType: "Complete" + templateFileName: "lpu-remediation-template.yaml" + equivalenceGroup: "reset" + supersedingEquivalenceGroups: ["restart"] templates: "rebootnode-template.yaml": | @@ -78,6 +91,14 @@ fault-remediation: selector: uuids: - {{ .ImpactedEntityScopeValue }} + "lpu-remediation-template.yaml": | + apiVersion: {{.ApiGroup}}/{{.Version}} + kind: LPURemediation + metadata: + name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }} + spec: + nodeName: {{ .HealthEvent.NodeName }} + device: {{ index .HealthEvent.Metadata "device" }} logCollector: enabled: false # Enable log collection before remediation @@ -85,6 +106,11 @@ fault-remediation: timeout: "10m" ``` +Resolution order is: + +- `maintenance.componentActions[componentClass][recommendedAction]` +- fallback to `maintenance.actions[recommendedAction]` + ### Configuration Options - **Dry Run**: Test CRD creation without creating maintenance requests diff --git a/fault-remediation/pkg/annotation/annotation.go b/fault-remediation/pkg/annotation/annotation.go index f9c8e5280..2163bc49f 100644 --- a/fault-remediation/pkg/annotation/annotation.go +++ b/fault-remediation/pkg/annotation/annotation.go @@ -95,7 +95,7 @@ func (m *NodeAnnotationManager) GetRemediationState( // UpdateRemediationState updates the node annotation with new remediation state func (m *NodeAnnotationManager) UpdateRemediationState(ctx context.Context, nodeName string, - group string, crName string, actionName string) error { + group string, crName string, actionName string, componentClass string) error { err := retry.RetryOnConflict(conflictBackoff, func() error { // Get current state state, node, err := m.GetRemediationState(ctx, nodeName) @@ -106,9 +106,10 @@ func (m *NodeAnnotationManager) UpdateRemediationState(ctx context.Context, node // Update state for the group state.EquivalenceGroups[group] = EquivalenceGroupState{ - MaintenanceCR: crName, - CreatedAt: time.Now().UTC(), - ActionName: actionName, + MaintenanceCR: crName, + CreatedAt: time.Now().UTC(), + ActionName: actionName, + ComponentClass: componentClass, } // Marshal to JSON diff --git a/fault-remediation/pkg/annotation/annotation_interface.go b/fault-remediation/pkg/annotation/annotation_interface.go index d590fbdfe..9aa0dbbd3 100644 --- a/fault-remediation/pkg/annotation/annotation_interface.go +++ b/fault-remediation/pkg/annotation/annotation_interface.go @@ -29,7 +29,8 @@ const ( // NodeAnnotationManagerInterface defines the interface for managing node annotations type NodeAnnotationManagerInterface interface { GetRemediationState(ctx context.Context, nodeName string) (*RemediationStateAnnotation, *corev1.Node, error) - UpdateRemediationState(ctx context.Context, nodeName string, group string, crName string, actionName string) error + UpdateRemediationState(ctx context.Context, nodeName string, group string, crName string, actionName string, + componentClass string) error ClearRemediationState(ctx context.Context, nodeName string) error RemoveGroupsFromState(ctx context.Context, nodeName string, groups []string) error } @@ -44,7 +45,9 @@ type EquivalenceGroupState struct { MaintenanceCR string `json:"maintenanceCR"` CreatedAt time.Time `json:"createdAt"` - // Action that created the CR (e.g., "RESTART_BM") - // Required to look up the corresponding MaintenanceResource from the TomlConfig + // Action corresponding with the HealthEvent that created the CR (e.g., "RESTART_BM") ActionName string `json:"actionName"` + + // Component class corresponding with the HealthEvent that created the CR (e.g. "GPU") + ComponentClass string `json:"componentClass,omitempty"` } diff --git a/fault-remediation/pkg/annotation/annotation_test.go b/fault-remediation/pkg/annotation/annotation_test.go index 194f56bd7..1de7bcdae 100644 --- a/fault-remediation/pkg/annotation/annotation_test.go +++ b/fault-remediation/pkg/annotation/annotation_test.go @@ -18,14 +18,15 @@ import ( "context" "fmt" "sync" + "testing" + "time" + "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client/fake" - "testing" - "time" ) func TestGetRemediationState(t *testing.T) { @@ -74,7 +75,8 @@ func TestGetRemediationState(t *testing.T) { "gpu-timeout": { "maintenanceCR": "gpu-maintenance-abc123", "createdAt": "%s", - "actionName": "testAction1" + "actionName": "testAction1", + "componentClass": "GPU" }, "node-reboot": { "maintenanceCR": "node-reboot-cr-789", @@ -89,9 +91,10 @@ func TestGetRemediationState(t *testing.T) { expectedState: RemediationStateAnnotation{ EquivalenceGroups: map[string]EquivalenceGroupState{ "gpu-timeout": { - MaintenanceCR: "gpu-maintenance-abc123", - CreatedAt: now, - ActionName: "testAction1", + MaintenanceCR: "gpu-maintenance-abc123", + CreatedAt: now, + ActionName: "testAction1", + ComponentClass: "GPU", }, "node-reboot": { MaintenanceCR: "node-reboot-cr-789", @@ -128,6 +131,7 @@ func TestGetRemediationState(t *testing.T) { for expectedKey, expectedValue := range tt.expectedState.EquivalenceGroups { assert.Equal(t, expectedValue.MaintenanceCR, resultState.EquivalenceGroups[expectedKey].MaintenanceCR) assert.Equal(t, expectedValue.ActionName, resultState.EquivalenceGroups[expectedKey].ActionName) + assert.Equal(t, expectedValue.ComponentClass, resultState.EquivalenceGroups[expectedKey].ComponentClass) assert.Equal(t, expectedValue.CreatedAt.Unix(), resultState.EquivalenceGroups[expectedKey].CreatedAt.Unix()) } } @@ -150,7 +154,7 @@ func TestUpdateRemediationState(t *testing.T) { annotationManager := NodeAnnotationManager{ client: client, } - err := annotationManager.UpdateRemediationState(context.TODO(), nodeName, group, crName, actionName) + err := annotationManager.UpdateRemediationState(context.TODO(), nodeName, group, crName, actionName, "GPU") assert.NoError(t, err) state, _, err := annotationManager.GetRemediationState(context.TODO(), nodeName) @@ -159,6 +163,7 @@ func TestUpdateRemediationState(t *testing.T) { assert.Contains(t, state.EquivalenceGroups, group) assert.Equal(t, crName, state.EquivalenceGroups[group].MaintenanceCR) assert.Equal(t, actionName, state.EquivalenceGroups[group].ActionName) + assert.Equal(t, "GPU", state.EquivalenceGroups[group].ComponentClass) } func TestClearRemediationState(t *testing.T) { @@ -290,9 +295,9 @@ func TestConcurrentUpdateAndRemoveGroupsFromState(t *testing.T) { const iterations = 100 for i := 0; i < iterations; i++ { // Reset state each iteration - err := annotationManager.UpdateRemediationState(context.TODO(), nodeName, "existing-group-1", "old-cr-1", "RESTART_BM") + err := annotationManager.UpdateRemediationState(context.TODO(), nodeName, "existing-group-1", "old-cr-1", "RESTART_BM", "") require.NoError(t, err) - err = annotationManager.UpdateRemediationState(context.TODO(), nodeName, "existing-group-2", "old-cr-2", "COMPONENT_RESET") + err = annotationManager.UpdateRemediationState(context.TODO(), nodeName, "existing-group-2", "old-cr-2", "COMPONENT_RESET", "") require.NoError(t, err) var wg sync.WaitGroup @@ -305,7 +310,7 @@ func TestConcurrentUpdateAndRemoveGroupsFromState(t *testing.T) { go func() { defer wg.Done() - _ = annotationManager.UpdateRemediationState(context.TODO(), nodeName, "new-group", "new-cr", "COMPONENT_RESET") + _ = annotationManager.UpdateRemediationState(context.TODO(), nodeName, "new-group", "new-cr", "COMPONENT_RESET", "") }() wg.Wait() diff --git a/fault-remediation/pkg/common/equivalence_groups.go b/fault-remediation/pkg/common/equivalence_groups.go index b6c8f9028..15404fe24 100644 --- a/fault-remediation/pkg/common/equivalence_groups.go +++ b/fault-remediation/pkg/common/equivalence_groups.go @@ -59,11 +59,11 @@ In the second example for COMPONENT_RESET, we will consider the HealthEvent as b and restart EquivalenceGroups. Additionally, the ImpactedEntityScope will be passed to the corresponding maintenance custom resource template. */ -func GetGroupConfigForEvent(remediationActions map[string]config.MaintenanceResource, +func GetGroupConfigForEvent(remediationConfig *config.TomlConfig, healthEvent *protos.HealthEvent) (*EquivalenceGroupConfig, error) { actionName := healthEvent.RecommendedAction.String() - actionConfig, exists := remediationActions[actionName] + actionConfig, _, exists := remediationConfig.ResolveMaintenanceResource(healthEvent.ComponentClass, actionName) if !exists { slog.Warn("Action not found in remediation configuration", "action", actionName, diff --git a/fault-remediation/pkg/common/equivalence_groups_test.go b/fault-remediation/pkg/common/equivalence_groups_test.go index c9c0ea5b7..262b75d3a 100644 --- a/fault-remediation/pkg/common/equivalence_groups_test.go +++ b/fault-remediation/pkg/common/equivalence_groups_test.go @@ -25,26 +25,42 @@ import ( ) func TestGetGroupConfigForEvent(t *testing.T) { - remediationActions := map[string]config.MaintenanceResource{ - "RESTART_BM": { - ApiGroup: "janitor.dgxc.nvidia.com", - Version: "v1alpha1", - Kind: "RebootNode", - TemplateFileName: "rebootnode-template.yaml", - CompleteConditionType: "NodeReady", - EquivalenceGroup: "restart", - ImpactedEntityScope: "", - SupersedingEquivalenceGroups: nil, + remediationConfig := &config.TomlConfig{ + RemediationActions: map[string]config.MaintenanceResource{ + "RESTART_BM": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "RebootNode", + TemplateFileName: "rebootnode-template.yaml", + CompleteConditionType: "NodeReady", + EquivalenceGroup: "restart", + ImpactedEntityScope: "", + SupersedingEquivalenceGroups: nil, + }, + "COMPONENT_RESET": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "RebootNode", + TemplateFileName: "rebootnode-template.yaml", + CompleteConditionType: "NodeReady", + EquivalenceGroup: "reset", + ImpactedEntityScope: "GPU_UUID", + SupersedingEquivalenceGroups: []string{"restart"}, + }, }, - "COMPONENT_RESET": { - ApiGroup: "janitor.dgxc.nvidia.com", - Version: "v1alpha1", - Kind: "RebootNode", - TemplateFileName: "rebootnode-template.yaml", - CompleteConditionType: "NodeReady", - EquivalenceGroup: "reset", - ImpactedEntityScope: "GPU_UUID", - SupersedingEquivalenceGroups: []string{"restart"}, + ComponentRemediationActions: config.ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + TemplateFileName: "gpureset-template.yaml", + CompleteConditionType: "Complete", + EquivalenceGroup: "gpu-reset", + ImpactedEntityScope: "GPU_UUID", + SupersedingEquivalenceGroups: []string{"restart"}, + }, + }, }, } @@ -98,10 +114,29 @@ func TestGetGroupConfigForEvent(t *testing.T) { expectError: true, expectedGroupConfig: nil, }, + { + name: "Component override is used for equivalence group resolution", + healthEvent: &protos.HealthEvent{ + ComponentClass: "GPU", + RecommendedAction: protos.RecommendedAction_COMPONENT_RESET, + EntitiesImpacted: []*protos.Entity{ + { + EntityType: "GPU_UUID", + EntityValue: "GPU-456", + }, + }, + }, + expectError: false, + expectedGroupConfig: &EquivalenceGroupConfig{ + EffectiveEquivalenceGroup: "gpu-reset-GPU-456", + ImpactedEntityScopeValue: "GPU-456", + SupersedingEquivalenceGroups: []string{"restart"}, + }, + }, } for _, tt := range tests { t.Run(tt.name, func(t *testing.T) { - groupConfig, err := GetGroupConfigForEvent(remediationActions, tt.healthEvent) + groupConfig, err := GetGroupConfigForEvent(remediationConfig, tt.healthEvent) if tt.expectError { assert.Error(t, err) } else { diff --git a/fault-remediation/pkg/config/config.go b/fault-remediation/pkg/config/config.go index b2084697f..7f4e16ea3 100644 --- a/fault-remediation/pkg/config/config.go +++ b/fault-remediation/pkg/config/config.go @@ -21,6 +21,7 @@ import ( "fmt" "os" "path/filepath" + "sort" "github.com/nvidia/nvsentinel/data-models/pkg/model" "github.com/nvidia/nvsentinel/data-models/pkg/protos" @@ -71,19 +72,27 @@ type UpdateRetry struct { RetryDelaySeconds int `toml:"retryDelaySeconds"` } +type ComponentRemediationActions map[string]map[string]MaintenanceResource + // TomlConfig holds the complete TOML configuration for fault remediation type TomlConfig struct { // Template mount configuration Template Template `toml:"template"` - // Multi-template configuration - map from RecommendedAction string to MaintenanceResource - RemediationActions map[string]MaintenanceResource `toml:"remediationActions"` - // Templates contains the actual template content keyed by filename Templates map[string]string `toml:"templates"` // Common configuration UpdateRetry UpdateRetry `toml:"updateRetry"` + + // RemediationActions and ComponentRemediationActions define a static mapping + // from healthEvent attributes to MaintenanceResource + + // ComponentClass, RecommendedAction -> MaintenanceResource + ComponentRemediationActions ComponentRemediationActions `toml:"componentRemediationActions"` + + // RecommendedAction -> MaintenanceResource + RemediationActions map[string]MaintenanceResource `toml:"remediationActions"` } // Validate checks the configuration for consistency and completeness @@ -98,6 +107,14 @@ func (c *TomlConfig) Validate() error { } } + for componentClass, actions := range c.ComponentRemediationActions { + for actionName, resource := range actions { + if err := c.validateComponentRemediationAction(componentClass, actionName, resource); err != nil { + return err + } + } + } + return nil } @@ -110,7 +127,7 @@ func (c *TomlConfig) validateTemplate() error { } func (c *TomlConfig) validateRemediationAction(actionName string, resource MaintenanceResource) error { - if err := c.validateEquivalenceGroup(actionName, resource); err != nil { + if err := c.validateEquivalenceGroup(actionName, resource, c.allMaintenanceResources()); err != nil { return err } @@ -129,6 +146,34 @@ func (c *TomlConfig) validateRemediationAction(actionName string, resource Maint return nil } +func (c *TomlConfig) validateComponentRemediationAction( + componentClass, actionName string, + resource MaintenanceResource, +) error { + if componentClass == "" { + return fmt.Errorf("componentRemediationActions must have a non-empty componentClass") + } + + if err := c.validateEquivalenceGroup(actionName, resource, c.allMaintenanceResources()); err != nil { + return fmt.Errorf("componentClass '%s' action '%s': %w", componentClass, actionName, err) + } + + if resource.TemplateFileName == "" { + return fmt.Errorf("componentClass '%s' action '%s' must have a non-empty templateFileName", + componentClass, actionName) + } + + if err := c.validateTemplateFileExists(actionName, resource.TemplateFileName); err != nil { + return fmt.Errorf("componentClass '%s': %w", componentClass, err) + } + + if err := validateScope(actionName, resource.Scope, resource.Namespace); err != nil { + return fmt.Errorf("componentClass '%s': %w", componentClass, err) + } + + return nil +} + /* EquivalenceGroup requirements: - All MaintenanceResources must have an EquivalenceGroup defined. @@ -141,13 +186,15 @@ not a use-case for it. Additionally, the ImpactedEntityScope must be included in EntityTypeToResourceNames (meaning that partial draining is enabled for that entity). */ -func (c *TomlConfig) validateEquivalenceGroup(actionName string, resource MaintenanceResource) error { +func (c *TomlConfig) validateEquivalenceGroup(actionName string, resource MaintenanceResource, + allResources []MaintenanceResource, +) error { if len(resource.EquivalenceGroup) == 0 { return fmt.Errorf("action '%s' must have a non-empty EquivalenceGroup", actionName) } for _, group := range resource.SupersedingEquivalenceGroups { - if err := validateOneSupersedingGroup(actionName, group, resource, c.RemediationActions); err != nil { + if err := validateOneSupersedingGroup(actionName, group, resource, allResources); err != nil { return err } } @@ -156,7 +203,7 @@ func (c *TomlConfig) validateEquivalenceGroup(actionName string, resource Mainte } func validateOneSupersedingGroup(actionName, group string, resource MaintenanceResource, - remediationActions map[string]MaintenanceResource, + allResources []MaintenanceResource, ) error { if group == resource.EquivalenceGroup { return fmt.Errorf("action '%s': SupersedingEquivalenceGroup cannot include the EquivalenceGroup itself: %s", @@ -165,7 +212,7 @@ func validateOneSupersedingGroup(actionName, group string, resource MaintenanceR foundGroup := false - for _, maintenanceResource := range remediationActions { + for _, maintenanceResource := range allResources { if group != maintenanceResource.EquivalenceGroup { continue } @@ -186,6 +233,114 @@ func validateOneSupersedingGroup(actionName, group string, resource MaintenanceR return nil } +func (c *TomlConfig) allMaintenanceResources() []MaintenanceResource { + allResources := make([]MaintenanceResource, 0, + len(c.RemediationActions)+len(c.ComponentRemediationActions)) + + for _, resource := range c.RemediationActions { + allResources = append(allResources, resource) + } + + for _, actions := range c.ComponentRemediationActions { + for _, resource := range actions { + allResources = append(allResources, resource) + } + } + + return allResources +} + +// ResolvedActionKey returns the canonical key used for resolved remediation actions. +// Shared actions use the action name directly, while component-specific overrides are +// namespaced by componentClass. +func ResolvedActionKey(componentClass, actionName string) string { + if componentClass == "" { + return actionName + } + + return componentClass + "/" + actionName +} + +// ResolveMaintenanceResource returns the component-specific remediation action when present, +// otherwise it falls back to the shared action-level configuration. +func (c *TomlConfig) ResolveMaintenanceResource(componentClass, actionName string) (MaintenanceResource, string, bool) { + if componentClass != "" { + if componentActions, exists := c.ComponentRemediationActions[componentClass]; exists { + if resource, ok := componentActions[actionName]; ok { + return resource, ResolvedActionKey(componentClass, actionName), true + } + } + } + + resource, exists := c.RemediationActions[actionName] + if !exists { + return MaintenanceResource{}, "", false + } + + return resource, actionName, true +} + +func (c *TomlConfig) SharedActionNames() []string { + actions := make([]string, 0, len(c.RemediationActions)) + for action := range c.RemediationActions { + actions = append(actions, action) + } + + sort.Strings(actions) + + return actions +} + +func (c *TomlConfig) ComponentActionNames(componentClass string) []string { + componentActions, exists := c.ComponentRemediationActions[componentClass] + if !exists { + return nil + } + + actions := make([]string, 0, len(componentActions)) + for action := range componentActions { + actions = append(actions, action) + } + + sort.Strings(actions) + + return actions +} + +// ResolveStoredMaintenanceResource resolves the maintenance resource for a +// remediation entry previously stored in the node annotation. +// +// Older remediation annotations may not have ComponentClass populated. +// When componentClass is unpopulated, we will resolve the action when it is still unambiguous in the current config. +// If ambiguous, it will be treated as stale. +func (c *TomlConfig) ResolveStoredMaintenanceResource(componentClass, actionName string) (MaintenanceResource, bool) { + if componentClass != "" { + if componentActions, exists := c.ComponentRemediationActions[componentClass]; exists { + if resource, ok := componentActions[actionName]; ok { + return resource, true + } + } + + if resource, exists := c.RemediationActions[actionName]; exists { + return resource, true + } + + return MaintenanceResource{}, false + } + + for _, componentActions := range c.ComponentRemediationActions { + if _, ok := componentActions[actionName]; ok { + return MaintenanceResource{}, false + } + } + + if resource, exists := c.RemediationActions[actionName]; exists { + return resource, true + } + + return MaintenanceResource{}, false +} + func validateResourceImpactedEntityScope(actionName string, resource MaintenanceResource) error { if len(resource.ImpactedEntityScope) == 0 { return nil diff --git a/fault-remediation/pkg/config/config_test.go b/fault-remediation/pkg/config/config_test.go index 78663f366..9b9604235 100644 --- a/fault-remediation/pkg/config/config_test.go +++ b/fault-remediation/pkg/config/config_test.go @@ -19,6 +19,8 @@ import ( "path/filepath" "strings" "testing" + + "github.com/stretchr/testify/assert" ) func TestTomlConfig_Validate(t *testing.T) { @@ -47,7 +49,7 @@ func TestTomlConfig_Validate(t *testing.T) { { name: "empty template mountPath should be rejected", config: TomlConfig{ - Template: Template{MountPath: ""}, + Template: Template{MountPath: ""}, RemediationActions: map[string]MaintenanceResource{}, }, expectError: true, @@ -75,6 +77,31 @@ func TestTomlConfig_Validate(t *testing.T) { }, expectError: false, }, + { + name: "valid config with component override", + config: TomlConfig{ + Template: Template{MountPath: tempDir}, + RemediationActions: map[string]MaintenanceResource{ + "RESTART_VM": { + TemplateFileName: "template-a.yaml", + Scope: "Cluster", + EquivalenceGroup: "restart", + }, + }, + ComponentRemediationActions: ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": { + TemplateFileName: "template-b.yaml", + Scope: "Cluster", + EquivalenceGroup: "reset", + SupersedingEquivalenceGroups: []string{"restart"}, + ImpactedEntityScope: "GPU_UUID", + }, + }, + }, + }, + expectError: false, + }, { name: "missing template file reference", config: TomlConfig{ @@ -275,6 +302,24 @@ func TestTomlConfig_Validate(t *testing.T) { expectError: true, errorSubstr: "cannot have an ImpactedEntityScope defined", }, + { + name: "component override must have a non-empty componentClass", + config: TomlConfig{ + Template: Template{MountPath: tempDir}, + RemediationActions: map[string]MaintenanceResource{}, + ComponentRemediationActions: ComponentRemediationActions{ + "": { + "COMPONENT_RESET": { + TemplateFileName: "template-a.yaml", + Scope: "Cluster", + EquivalenceGroup: "reset", + }, + }, + }, + }, + expectError: true, + errorSubstr: "must have a non-empty componentClass", + }, } for _, tt := range tests { @@ -298,3 +343,159 @@ func TestTomlConfig_Validate(t *testing.T) { }) } } + +func TestTomlConfig_ResolveMaintenanceResource(t *testing.T) { + config := TomlConfig{ + RemediationActions: map[string]MaintenanceResource{ + "COMPONENT_RESET": { + Kind: "RebootNode", + EquivalenceGroup: "restart", + }, + "RESTART_VM": { + Kind: "RebootNode", + EquivalenceGroup: "restart", + }, + }, + ComponentRemediationActions: ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": { + Kind: "GPUReset", + EquivalenceGroup: "reset", + ImpactedEntityScope: "GPU_UUID", + }, + }, + "LPU": { + "COMPONENT_RESET": { + Kind: "LPURemediation", + EquivalenceGroup: "reset", + }, + }, + }, + } + + tests := []struct { + name string + componentClass string + actionName string + wantKind string + wantKey string + wantFound bool + }{ + { + name: "component override wins", + componentClass: "GPU", + actionName: "COMPONENT_RESET", + wantKind: "GPUReset", + wantKey: "GPU/COMPONENT_RESET", + wantFound: true, + }, + { + name: "falls back to shared remediation action", + componentClass: "GPU", + actionName: "RESTART_VM", + wantKind: "RebootNode", + wantKey: "RESTART_VM", + wantFound: true, + }, + { + name: "unknown component also falls back to shared action", + componentClass: "DPU", + actionName: "COMPONENT_RESET", + wantKind: "RebootNode", + wantKey: "COMPONENT_RESET", + wantFound: true, + }, + { + name: "missing action is not resolved", + componentClass: "GPU", + actionName: "REPLACE_VM", + wantKey: "", + wantFound: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + gotResource, gotKey, gotFound := config.ResolveMaintenanceResource(tt.componentClass, tt.actionName) + + if gotFound != tt.wantFound { + t.Fatalf("ResolveMaintenanceResource() found = %v, want %v", gotFound, tt.wantFound) + } + + if gotKey != tt.wantKey { + t.Fatalf("ResolveMaintenanceResource() key = %q, want %q", gotKey, tt.wantKey) + } + + if gotResource.Kind != tt.wantKind { + t.Fatalf("ResolveMaintenanceResource() kind = %q, want %q", gotResource.Kind, tt.wantKind) + } + }) + } +} + +func TestResolvedActionKeyAndActionNameHelpers(t *testing.T) { + config := TomlConfig{ + RemediationActions: map[string]MaintenanceResource{ + "RESTART_VM": {}, + "COMPONENT_RESET": {}, + }, + ComponentRemediationActions: ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": {}, + "RESTART_BM": {}, + }, + }, + } + + assert.Equal(t, "COMPONENT_RESET", ResolvedActionKey("", "COMPONENT_RESET")) + assert.Equal(t, "GPU/COMPONENT_RESET", ResolvedActionKey("GPU", "COMPONENT_RESET")) + assert.Equal(t, []string{"COMPONENT_RESET", "RESTART_VM"}, config.SharedActionNames()) + assert.Equal(t, []string{"COMPONENT_RESET", "RESTART_BM"}, config.ComponentActionNames("GPU")) + assert.Nil(t, config.ComponentActionNames("LPU")) +} + +func TestTomlConfig_ResolveStoredMaintenanceResource(t *testing.T) { + cfg := TomlConfig{ + RemediationActions: map[string]MaintenanceResource{ + "COMPONENT_RESET": {Kind: "RebootNode"}, + }, + ComponentRemediationActions: ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": {Kind: "GPUReset"}, + }, + "LPU": { + "COMPONENT_RESET": {Kind: "LPURemediation"}, + }, + }, + } + + t.Run("component-specific resource resolves exactly from stored component class", func(t *testing.T) { + resource, found := cfg.ResolveStoredMaintenanceResource("GPU", "COMPONENT_RESET") + assert.True(t, found) + assert.Equal(t, MaintenanceResource{Kind: "GPUReset"}, resource) + }) + + t.Run("stored component class falls back to shared action when no override exists", func(t *testing.T) { + resource, found := cfg.ResolveStoredMaintenanceResource("NIC", "COMPONENT_RESET") + assert.True(t, found) + assert.Equal(t, MaintenanceResource{Kind: "RebootNode"}, resource) + }) + + t.Run("empty component class returns no resource when action is ambiguous", func(t *testing.T) { + resource, found := cfg.ResolveStoredMaintenanceResource("", "COMPONENT_RESET") + assert.False(t, found) + assert.Equal(t, MaintenanceResource{}, resource) + }) + + t.Run("empty component class can still resolve an unambiguous shared action", func(t *testing.T) { + cfgWithoutOverrides := TomlConfig{ + RemediationActions: map[string]MaintenanceResource{ + "COMPONENT_RESET": {Kind: "RebootNode"}, + }, + } + + resource, found := cfgWithoutOverrides.ResolveStoredMaintenanceResource("", "COMPONENT_RESET") + assert.True(t, found) + assert.Equal(t, MaintenanceResource{Kind: "RebootNode"}, resource) + }) +} diff --git a/fault-remediation/pkg/crstatus/checker.go b/fault-remediation/pkg/crstatus/checker.go index 5983e1a0c..020c0c861 100644 --- a/fault-remediation/pkg/crstatus/checker.go +++ b/fault-remediation/pkg/crstatus/checker.go @@ -18,6 +18,7 @@ import ( "context" "log/slog" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime/schema" "sigs.k8s.io/controller-runtime/pkg/client" @@ -26,49 +27,61 @@ import ( ) type CRStatusChecker struct { - client client.Client - remediationActions map[string]config.MaintenanceResource - dryRun bool + client client.Client + remediationConfig *config.TomlConfig + dryRun bool } func NewCRStatusChecker( client client.Client, - remediationActions map[string]config.MaintenanceResource, + remediationConfig *config.TomlConfig, dryRun bool, ) *CRStatusChecker { return &CRStatusChecker{ - client: client, - remediationActions: remediationActions, - dryRun: dryRun, + client: client, + remediationConfig: remediationConfig, + dryRun: dryRun, } } -// ShouldSkipCRCreation returns true if the CR exists and is not in a terminal state otherwise returns false. -func (c *CRStatusChecker) ShouldSkipCRCreation(ctx context.Context, actionName string, crName string) bool { - resource, exists := c.remediationActions[actionName] - if !exists { - slog.Error("No remediation configuration found for action", "action", actionName) +// ShouldSkipCRCreation returns true if a matching CR exists and is not in a terminal state otherwise returns false. +func (c *CRStatusChecker) ShouldSkipCRCreation(ctx context.Context, componentClass, actionName, crName string) bool { + if c.dryRun { + slog.Info("DRY-RUN: CR doesn't exist (dry-run mode)", "crName", crName, "action", actionName) return false } - if c.dryRun { - slog.Info("DRY-RUN: CR doesn't exist (dry-run mode)", "crName", crName, "action", actionName) + resource, found := c.remediationConfig.ResolveStoredMaintenanceResource(componentClass, actionName) + if !found { + slog.Error("No remediation configuration found for action", + "action", actionName, + "componentClass", componentClass) + return false } + obj := &unstructured.Unstructured{} gvk := schema.GroupVersionKind{ Group: resource.ApiGroup, Version: resource.Version, Kind: resource.Kind, } - - obj := &unstructured.Unstructured{} obj.SetGroupVersionKind(gvk) key := client.ObjectKey{Name: crName, Namespace: resource.Namespace} if err := c.client.Get(ctx, key, obj); err != nil { - slog.Warn("Failed to get CR, allowing create", "crName", crName, "gvk", gvk.String(), "error", err) + if apierrors.IsNotFound(err) { + return false + } + + slog.Warn("Failed to get CR, assuming not in progress", + "crName", crName, + "action", actionName, + "componentClass", componentClass, + "gvk", gvk.String(), + "error", err) + return false } diff --git a/fault-remediation/pkg/crstatus/crstatus_interface.go b/fault-remediation/pkg/crstatus/crstatus_interface.go index 55a2beab6..4b74100fc 100644 --- a/fault-remediation/pkg/crstatus/crstatus_interface.go +++ b/fault-remediation/pkg/crstatus/crstatus_interface.go @@ -21,5 +21,5 @@ import ( ) type CRStatusCheckerInterface interface { - ShouldSkipCRCreation(context.Context, string, string) bool + ShouldSkipCRCreation(context.Context, string, string, string) bool } diff --git a/fault-remediation/pkg/crstatus/crstatus_test.go b/fault-remediation/pkg/crstatus/crstatus_test.go index 7a93d9f06..e3c2f32a6 100644 --- a/fault-remediation/pkg/crstatus/crstatus_test.go +++ b/fault-remediation/pkg/crstatus/crstatus_test.go @@ -15,10 +15,14 @@ package crstatus import ( + "context" "testing" "github.com/stretchr/testify/assert" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/nvidia/nvsentinel/fault-remediation/pkg/config" ) @@ -27,8 +31,10 @@ func TestCheckCondition(t *testing.T) { testResource := config.MaintenanceResource{ CompleteConditionType: "Completed", } - cfg := map[string]config.MaintenanceResource{ - "test": testResource, + cfg := &config.TomlConfig{ + RemediationActions: map[string]config.MaintenanceResource{ + "test": testResource, + }, } checker := NewCRStatusChecker(nil, cfg, false) @@ -120,3 +126,139 @@ func TestCheckCondition(t *testing.T) { }) } } + +func TestShouldSkipCRCreation_DoesNotGuessAcrossOtherComponentOverrides(t *testing.T) { + scheme := runtime.NewScheme() + + existingCR := &unstructured.Unstructured{} + existingCR.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + }) + existingCR.SetName("maintenance-node-1-event-1") + existingCR.Object["status"] = map[string]any{ + "conditions": []any{ + map[string]any{ + "type": "Complete", + "status": "Unknown", + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingCR).Build() + cfg := &config.TomlConfig{ + ComponentRemediationActions: config.ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + CompleteConditionType: "Complete", + }, + }, + }, + } + + checker := NewCRStatusChecker(client, cfg, false) + + shouldSkip := checker.ShouldSkipCRCreation(context.Background(), "LPU", "COMPONENT_RESET", "maintenance-node-1-event-1") + + assert.False(t, shouldSkip, "expected checker not to guess across other component overrides") +} + +func TestShouldSkipCRCreation_FallsBackToSharedAction(t *testing.T) { + scheme := runtime.NewScheme() + + existingCR := &unstructured.Unstructured{} + existingCR.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "RebootNode", + }) + existingCR.SetName("maintenance-node-2-event-1") + existingCR.Object["status"] = map[string]any{ + "conditions": []any{ + map[string]any{ + "type": "NodeReady", + "status": "Unknown", + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingCR).Build() + cfg := &config.TomlConfig{ + RemediationActions: map[string]config.MaintenanceResource{ + "COMPONENT_RESET": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "RebootNode", + CompleteConditionType: "NodeReady", + }, + }, + ComponentRemediationActions: config.ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + CompleteConditionType: "Complete", + }, + }, + }, + } + + checker := NewCRStatusChecker(client, cfg, false) + + shouldSkip := checker.ShouldSkipCRCreation(context.Background(), "LPU", "COMPONENT_RESET", "maintenance-node-2-event-1") + + assert.True(t, shouldSkip, "expected checker to fall back to the shared action mapping") +} + +func TestShouldSkipCRCreation_EmptyStoredComponentClassTreatsAmbiguousActionAsStale(t *testing.T) { + scheme := runtime.NewScheme() + + existingCR := &unstructured.Unstructured{} + existingCR.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + }) + existingCR.SetName("maintenance-node-3-event-1") + existingCR.Object["status"] = map[string]any{ + "conditions": []any{ + map[string]any{ + "type": "Complete", + "status": "Unknown", + }, + }, + } + + client := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingCR).Build() + cfg := &config.TomlConfig{ + ComponentRemediationActions: config.ComponentRemediationActions{ + "GPU": { + "COMPONENT_RESET": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + CompleteConditionType: "Complete", + }, + }, + "LPU": { + "COMPONENT_RESET": { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "LPURemediation", + CompleteConditionType: "Complete", + }, + }, + }, + } + + checker := NewCRStatusChecker(client, cfg, false) + + shouldSkip := checker.ShouldSkipCRCreation(context.Background(), "", "COMPONENT_RESET", "maintenance-node-3-event-1") + + assert.False(t, shouldSkip, "expected empty stored component class to be treated as stale when action is ambiguous") +} diff --git a/fault-remediation/pkg/reconciler/reconciler.go b/fault-remediation/pkg/reconciler/reconciler.go index a5d37f63b..248a81bd7 100644 --- a/fault-remediation/pkg/reconciler/reconciler.go +++ b/fault-remediation/pkg/reconciler/reconciler.go @@ -283,8 +283,7 @@ func (r *FaultRemediationReconciler) handleRemediationEvent( healthEvent := healthEventWithStatus.HealthEvent nodeName := healthEvent.NodeName - groupConfig, err := common.GetGroupConfigForEvent(r.Config.RemediationClient.GetConfig().RemediationActions, - healthEvent) + groupConfig, err := common.GetGroupConfigForEvent(r.Config.RemediationClient.GetConfig(), healthEvent) if err != nil { // If we got an error, groupConfig will be nil which will result in shouldSkipEvent setting state label to // remediation-failed @@ -493,7 +492,8 @@ func (r *FaultRemediationReconciler) checkExistingCRStatus(ctx context.Context, var groupsToRemove []string for groupName, groupState := range groupStates { - shouldSkip := statusChecker.ShouldSkipCRCreation(ctx, groupState.ActionName, groupState.MaintenanceCR) + shouldSkip := statusChecker.ShouldSkipCRCreation(ctx, groupState.ComponentClass, + groupState.ActionName, groupState.MaintenanceCR) if shouldSkip { slog.Info("CR exists and is in progress, skipping event", "node", nodeName, "crName", groupState.MaintenanceCR) return false, groupState.MaintenanceCR, nil diff --git a/fault-remediation/pkg/reconciler/reconciler_e2e_test.go b/fault-remediation/pkg/reconciler/reconciler_e2e_test.go index 4c5cdce2f..a6dbccce3 100644 --- a/fault-remediation/pkg/reconciler/reconciler_e2e_test.go +++ b/fault-remediation/pkg/reconciler/reconciler_e2e_test.go @@ -403,7 +403,7 @@ func TestCRBasedDeduplication_Integration(t *testing.T) { }, }, } - groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, healthEventDoc.HealthEvent) + groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), healthEventDoc.HealthEvent) assert.NoError(t, err) // TODO: ignoring error otherwise need to properly walk state transitions @@ -468,7 +468,7 @@ func TestCRBasedDeduplication_Integration(t *testing.T) { }, }, } - groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, event1.HealthEvent) + groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event1.HealthEvent) assert.NoError(t, err) // TODO: ignoring error otherwise need to properly walk state transitions _, _ = stateManager.UpdateNVSentinelStateNodeLabel(ctx, nodeName, statemanager.RemediatingLabelValue, false) @@ -528,7 +528,7 @@ func TestCRBasedDeduplication_Integration(t *testing.T) { }, }, } - groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, event1.HealthEvent) + groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event1.HealthEvent) assert.NoError(t, err) // TODO: ignoring error otherwise need to properly walk state transitions // TODO: also why does this return an error but also put the change through @@ -567,7 +567,7 @@ func TestCRBasedDeduplication_Integration(t *testing.T) { }, }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, event2.HealthEvent) + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event2.HealthEvent) assert.NoError(t, err) //TODO: is this a bug? if you enter remediation-succeeded it won't let you get back to remediating @@ -622,7 +622,7 @@ func TestCRBasedDeduplication_Integration(t *testing.T) { }, }, } - groupConfig1, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig1, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event1.HealthEvent) assert.NoError(t, err) @@ -641,7 +641,7 @@ func TestCRBasedDeduplication_Integration(t *testing.T) { NodeName: nodeName, RecommendedAction: protos.RecommendedAction_RESTART_BM, } - groupConfig2, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig2, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event2Health) assert.NoError(t, err) @@ -707,7 +707,7 @@ func TestEventSequenceWithAnnotations_Integration(t *testing.T) { }, }, } - groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event1.HealthEvent) assert.NoError(t, err) _, _ = stateManager.UpdateNVSentinelStateNodeLabel(ctx, nodeName, statemanager.DrainSucceededLabelValue, false) @@ -727,7 +727,7 @@ func TestEventSequenceWithAnnotations_Integration(t *testing.T) { NodeName: nodeName, RecommendedAction: protos.RecommendedAction_RESTART_VM, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event2) assert.NoError(t, err) shouldCreate, existingCR, err := r.checkExistingCRStatus(ctx, event2, groupConfig) @@ -747,7 +747,7 @@ func TestEventSequenceWithAnnotations_Integration(t *testing.T) { NodeName: nodeName, RecommendedAction: protos.RecommendedAction_RESTART_BM, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event3) assert.NoError(t, err) shouldCreate, _, err = r.checkExistingCRStatus(ctx, event3, groupConfig) @@ -761,7 +761,7 @@ func TestEventSequenceWithAnnotations_Integration(t *testing.T) { NodeName: nodeName, RecommendedAction: protos.RecommendedAction_RESTART_BM, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event4) assert.NoError(t, err) shouldCreate, _, err = r.checkExistingCRStatus(ctx, event4, groupConfig) @@ -784,7 +784,7 @@ func TestEventSequenceWithAnnotations_Integration(t *testing.T) { }, }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event5.HealthEvent) assert.NoError(t, err) crName2, err := r.performRemediation(ctx, event5, groupConfig) @@ -849,7 +849,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, }, } - groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event1.HealthEvent) assert.NoError(t, err) _, _ = stateManager.UpdateNVSentinelStateNodeLabel(ctx, nodeName, statemanager.DrainSucceededLabelValue, false) @@ -877,7 +877,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event2.HealthEvent) assert.NoError(t, err) shouldSkip := r.shouldSkipEvent(ctx, event2, groupConfig) @@ -906,7 +906,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event3) assert.NoError(t, err) shouldCreate, _, err = r.checkExistingCRStatus(ctx, event3, groupConfig) @@ -926,7 +926,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event4) assert.NoError(t, err) shouldCreate, _, err = r.checkExistingCRStatus(ctx, event4, groupConfig) @@ -955,7 +955,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event5.HealthEvent) assert.NoError(t, err) crName2, err := r.performRemediation(ctx, event5, groupConfig) @@ -979,7 +979,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event6) assert.NoError(t, err) shouldCreate, existingCR, err = r.checkExistingCRStatus(ctx, event6, groupConfig) @@ -1009,7 +1009,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event7.HealthEvent) assert.NoError(t, err) shouldCreate, existingCR, err = r.checkExistingCRStatus(ctx, event7.HealthEvent, groupConfig) @@ -1049,7 +1049,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event8) assert.NoError(t, err) shouldCreate, existingCR, err = r.checkExistingCRStatus(ctx, event8, groupConfig) @@ -1078,7 +1078,7 @@ func TestEventSequenceWithSupersedingGroup(t *testing.T) { RecommendedAction: protos.RecommendedAction_COMPONENT_RESET, }, } - groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err = common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), event9.HealthEvent) assert.Error(t, err) assert.Nil(t, groupConfig) @@ -1296,7 +1296,7 @@ func TestFullReconcilerWithMockedMongoDB_E2E(t *testing.T) { RecommendedAction: protos.RecommendedAction_UNKNOWN, }, } - groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig().RemediationActions, + groupConfig, err := common.GetGroupConfigForEvent(cfg.RemediationClient.GetConfig(), healthEvent.HealthEvent) assert.NoError(t, err) diff --git a/fault-remediation/pkg/reconciler/reconciler_test.go b/fault-remediation/pkg/reconciler/reconciler_test.go index 44db88132..2cab1ae5b 100644 --- a/fault-remediation/pkg/reconciler/reconciler_test.go +++ b/fault-remediation/pkg/reconciler/reconciler_test.go @@ -24,7 +24,11 @@ import ( "github.com/google/uuid" "github.com/stretchr/testify/assert" corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client/fake" "github.com/nvidia/nvsentinel/commons/pkg/statemanager" "github.com/nvidia/nvsentinel/data-models/pkg/model" @@ -45,6 +49,8 @@ type MockK8sClient struct { runLogCollectorJobFn func(ctx context.Context, nodeName string) (ctrl.Result, error) annotationManagerOverride annotation.NodeAnnotationManagerInterface mockStatusChecker *mockStatusChecker + statusCheckerOverride crstatus.CRStatusCheckerInterface + configOverride *config.TomlConfig } func (m *MockK8sClient) CreateMaintenanceResource(ctx context.Context, healthEventData *events.HealthEventData, groupConfig *common.EquivalenceGroupConfig) (string, error) { @@ -60,6 +66,10 @@ func (m *MockK8sClient) GetAnnotationManager() annotation.NodeAnnotationManagerI } func (m *MockK8sClient) GetStatusChecker() crstatus.CRStatusCheckerInterface { + if m.statusCheckerOverride != nil { + return m.statusCheckerOverride + } + return m.mockStatusChecker } @@ -68,7 +78,7 @@ type mockStatusChecker struct { callCount int } -func (statusChecker *mockStatusChecker) ShouldSkipCRCreation(context.Context, string, string) bool { +func (statusChecker *mockStatusChecker) ShouldSkipCRCreation(context.Context, string, string, string) bool { shouldSkip := statusChecker.shouldSkip[statusChecker.callCount] if statusChecker.callCount < len(statusChecker.shouldSkip)-1 { statusChecker.callCount++ @@ -77,6 +87,10 @@ func (statusChecker *mockStatusChecker) ShouldSkipCRCreation(context.Context, st } func (m *MockK8sClient) GetConfig() *config.TomlConfig { + if m.configOverride != nil { + return m.configOverride + } + return &config.TomlConfig{ RemediationActions: map[string]config.MaintenanceResource{ protos.RecommendedAction_RESTART_BM.String(): { @@ -124,11 +138,13 @@ func (w *MockCRStatusCheckerWrapper) IsSuccessful(ctx context.Context, crName st } type MockNodeAnnotationManager struct { - existingCRs map[string]string + existingCRs map[string]string + existingGroupStates map[string]annotation.EquivalenceGroupState + removedGroups []string } func (m *MockNodeAnnotationManager) GetRemediationState(ctx context.Context, nodeName string) (*annotation.RemediationStateAnnotation, *corev1.Node, error) { - if m.existingCRs == nil { + if m.existingCRs == nil && m.existingGroupStates == nil { return &annotation.RemediationStateAnnotation{ EquivalenceGroups: make(map[string]annotation.EquivalenceGroupState), }, nil, nil @@ -137,6 +153,9 @@ func (m *MockNodeAnnotationManager) GetRemediationState(ctx context.Context, nod annotationState := &annotation.RemediationStateAnnotation{ EquivalenceGroups: make(map[string]annotation.EquivalenceGroupState), } + for groupName, state := range m.existingGroupStates { + annotationState.EquivalenceGroups[groupName] = state + } for groupName, crName := range m.existingCRs { annotationState.EquivalenceGroups[groupName] = annotation.EquivalenceGroupState{ MaintenanceCR: crName, @@ -147,7 +166,7 @@ func (m *MockNodeAnnotationManager) GetRemediationState(ctx context.Context, nod } func (m *MockNodeAnnotationManager) UpdateRemediationState(ctx context.Context, nodeName string, - group string, crName string, actionName string) error { + group string, crName string, actionName string, componentClass string) error { return nil } @@ -156,6 +175,18 @@ func (m *MockNodeAnnotationManager) ClearRemediationState(ctx context.Context, n } func (m *MockNodeAnnotationManager) RemoveGroupsFromState(ctx context.Context, nodeName string, groups []string) error { + m.removedGroups = append(m.removedGroups, groups...) + + for _, group := range groups { + if m.existingGroupStates != nil { + delete(m.existingGroupStates, group) + } + + if m.existingCRs != nil { + delete(m.existingCRs, group) + } + } + return nil } @@ -964,6 +995,148 @@ func TestCRBasedDeduplication(t *testing.T) { } } +func TestCheckExistingCRStatus_DeduplicatesStoredInProgressCROnDifferentComponentClass(t *testing.T) { + ctx := context.Background() + + existingCR := &unstructured.Unstructured{} + existingCR.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + }) + existingCR.SetName("maintenance-gpu-123") + existingCR.Object["status"] = map[string]any{ + "conditions": []any{ + map[string]any{ + "type": "Complete", + "status": "Unknown", + }, + }, + } + + scheme := runtime.NewScheme() + statusClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingCR).Build() + + remediationConfig := &config.TomlConfig{ + ComponentRemediationActions: config.ComponentRemediationActions{ + "GPU": { + protos.RecommendedAction_COMPONENT_RESET.String(): { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + CompleteConditionType: "Complete", + }, + }, + "LPU": { + protos.RecommendedAction_COMPONENT_RESET.String(): { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "LPURemediation", + CompleteConditionType: "Complete", + }, + }, + }, + } + + mockAnnotationManager := &MockNodeAnnotationManager{ + existingGroupStates: map[string]annotation.EquivalenceGroupState{ + "restart": { + MaintenanceCR: "maintenance-gpu-123", + CreatedAt: time.Now(), + ActionName: protos.RecommendedAction_COMPONENT_RESET.String(), + ComponentClass: "GPU", + }, + }, + } + + mockK8sClient := &MockK8sClient{ + annotationManagerOverride: mockAnnotationManager, + statusCheckerOverride: crstatus.NewCRStatusChecker(statusClient, remediationConfig, false), + configOverride: remediationConfig, + } + + r := NewFaultRemediationReconciler(nil, nil, nil, ReconcilerConfig{ + RemediationClient: mockK8sClient, + }, false) + + healthEvent := &protos.HealthEvent{ + NodeName: "test-node", + ComponentClass: "GPU", + RecommendedAction: protos.RecommendedAction_COMPONENT_RESET, + } + + shouldCreateCR, existingCRName, err := r.checkExistingCRStatus(ctx, healthEvent, getGroupConfig("restart", nil)) + assert.NoError(t, err) + assert.False(t, shouldCreateCR, + "dedup should skip creating a new CR when the stored equivalence-group entry already points to an in-progress CR") + assert.Equal(t, "maintenance-gpu-123", existingCRName) +} + +func TestCheckExistingCRStatus_IgnoresLegacyAnnotationEntryWithoutActionName(t *testing.T) { + ctx := context.Background() + + existingCR := &unstructured.Unstructured{} + existingCR.SetGroupVersionKind(schema.GroupVersionKind{ + Group: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "RebootNode", + }) + existingCR.SetName("maintenance-legacy-123") + existingCR.Object["status"] = map[string]any{ + "conditions": []any{ + map[string]any{ + "type": "NodeReady", + "status": "Unknown", + }, + }, + } + + scheme := runtime.NewScheme() + statusClient := fake.NewClientBuilder().WithScheme(scheme).WithObjects(existingCR).Build() + + remediationConfig := &config.TomlConfig{ + RemediationActions: map[string]config.MaintenanceResource{ + protos.RecommendedAction_RESTART_BM.String(): { + ApiGroup: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "RebootNode", + CompleteConditionType: "NodeReady", + }, + }, + } + + mockAnnotationManager := &MockNodeAnnotationManager{ + existingGroupStates: map[string]annotation.EquivalenceGroupState{ + "restart": { + MaintenanceCR: "maintenance-legacy-123", + CreatedAt: time.Now(), + }, + }, + } + + mockK8sClient := &MockK8sClient{ + annotationManagerOverride: mockAnnotationManager, + statusCheckerOverride: crstatus.NewCRStatusChecker(statusClient, remediationConfig, false), + configOverride: remediationConfig, + } + + r := NewFaultRemediationReconciler(nil, nil, nil, ReconcilerConfig{ + RemediationClient: mockK8sClient, + }, false) + + healthEvent := &protos.HealthEvent{ + NodeName: "test-node", + RecommendedAction: protos.RecommendedAction_RESTART_BM, + } + + shouldCreateCR, existingCRName, err := r.checkExistingCRStatus(ctx, healthEvent, getGroupConfig("restart", nil)) + assert.NoError(t, err) + assert.True(t, shouldCreateCR, "legacy entries without an actionName should be ignored") + assert.Empty(t, existingCRName) + assert.Equal(t, []string{"restart"}, mockAnnotationManager.removedGroups) + assert.Empty(t, mockAnnotationManager.existingGroupStates) +} + // TestLogCollectorOnlyCalledWhenShouldCreateCR verifies that log collector is only called // when shouldCreateCR is true (Issue #441 - prevent duplicate log-collector jobs) // This tests the logic that log collector runs AFTER checkExistingCRStatus, not before diff --git a/fault-remediation/pkg/remediation/remediation.go b/fault-remediation/pkg/remediation/remediation.go index 073b86af5..c41e97503 100755 --- a/fault-remediation/pkg/remediation/remediation.go +++ b/fault-remediation/pkg/remediation/remediation.go @@ -70,6 +70,7 @@ type FaultRemediationClient struct { statusChecker *crstatus.CRStatusChecker } +// nolint:cyclop func NewRemediationClient( client client.Client, dryRun bool, @@ -86,16 +87,18 @@ func NewRemediationClient( // Load templates for multi-template actions for actionName, maintenanceResource := range remediationConfig.RemediationActions { - if maintenanceResource.TemplateFileName == "" { - return nil, fmt.Errorf("remediation action %s is missing template file configuration", actionName) + if err := loadTemplateForAction(templates, templateMountPath, actionName, maintenanceResource); err != nil { + return nil, err } + } - tmpl, err := loadAndParseTemplate(templateMountPath, maintenanceResource.TemplateFileName, actionName) - if err != nil { - return nil, fmt.Errorf("failed to load template for action %s: %w", actionName, err) + for componentClass, actions := range remediationConfig.ComponentRemediationActions { + for actionName, maintenanceResource := range actions { + actionKey := config.ResolvedActionKey(componentClass, actionName) + if err := loadTemplateForAction(templates, templateMountPath, actionKey, maintenanceResource); err != nil { + return nil, err + } } - - templates[actionName] = tmpl } // Validate namespace configuration for namespaced resources @@ -105,6 +108,16 @@ func NewRemediationClient( } } + for componentClass, actions := range remediationConfig.ComponentRemediationActions { + for actionName, maintenanceResource := range actions { + if maintenanceResource.Scope == "Namespaced" && maintenanceResource.Namespace == "" { + return nil, + fmt.Errorf("remediation action %s for componentClass %s is namespaced but missing namespace configuration", + actionName, componentClass) + } + } + } + ctrlRuntimeRemediationClient := &FaultRemediationClient{ client: client, templates: templates, @@ -123,7 +136,7 @@ func NewRemediationClient( ctrlRuntimeRemediationClient.statusChecker = crstatus.NewCRStatusChecker( client, - remediationConfig.RemediationActions, + &remediationConfig, dryRun, ) @@ -155,6 +168,23 @@ func loadAndParseTemplate(mountPath, fileName, templateName string) (*template.T return tmpl, nil } +func loadTemplateForAction(templates map[string]*template.Template, templateMountPath, actionKey string, + maintenanceResource config.MaintenanceResource, +) error { + if maintenanceResource.TemplateFileName == "" { + return fmt.Errorf("remediation action %s is missing template file configuration", actionKey) + } + + tmpl, err := loadAndParseTemplate(templateMountPath, maintenanceResource.TemplateFileName, actionKey) + if err != nil { + return fmt.Errorf("failed to load template for action %s: %w", actionKey, err) + } + + templates[actionKey] = tmpl + + return nil +} + func (c *FaultRemediationClient) GetAnnotationManager() annotation.NodeAnnotationManagerInterface { return c.annotationManager } @@ -168,7 +198,7 @@ func (c *FaultRemediationClient) CreateMaintenanceResource(ctx context.Context, healthEvent := healthEventData.HealthEvent healthEventID := healthEventData.ID - // Generate CR name + // Generate the maintenance CR name from the node and health event ID. crName := fmt.Sprintf("maintenance-%s-%s", healthEvent.NodeName, healthEventID) // Skip custom resource creation if dry-run is enabled @@ -180,7 +210,7 @@ func (c *FaultRemediationClient) CreateMaintenanceResource(ctx context.Context, recommendedActionName := healthEvent.RecommendedAction.String() maintenanceResource, selectedTemplate, actionKey, err := - c.selectRemediationActionAndTemplate(recommendedActionName, healthEvent.NodeName) + c.selectRemediationActionAndTemplate(healthEvent.ComponentClass, recommendedActionName, healthEvent.NodeName) if err != nil { return "", fmt.Errorf("error selecting remediation action and template: %w", err) } @@ -203,7 +233,7 @@ func (c *FaultRemediationClient) CreateMaintenanceResource(ctx context.Context, } if err := c.updateRemediationAnnotationIfNeeded(ctx, healthEvent.NodeName, groupConfig.EffectiveEquivalenceGroup, - actualCRName, recommendedActionName); err != nil { + actualCRName, recommendedActionName, healthEvent.ComponentClass); err != nil { return "", err } @@ -275,14 +305,14 @@ func (c *FaultRemediationClient) createMaintenanceCR(ctx context.Context, select // updateRemediationAnnotationIfNeeded updates node remediation state when equivalence group // and annotation manager are set. func (c *FaultRemediationClient) updateRemediationAnnotationIfNeeded(ctx context.Context, nodeName string, - effectiveEquivalenceGroup string, actualCRName, recommendedActionName string, + effectiveEquivalenceGroup string, actualCRName, recommendedActionName, componentClass string, ) error { if effectiveEquivalenceGroup == "" || c.annotationManager == nil { return nil } err := c.annotationManager.UpdateRemediationState(ctx, nodeName, effectiveEquivalenceGroup, - actualCRName, recommendedActionName) + actualCRName, recommendedActionName, componentClass) if err != nil { slog.Warn("Failed to update node annotation", "node", nodeName, "error", err) return fmt.Errorf("failed to update node annotation: %w", err) @@ -326,36 +356,34 @@ func setNodeOwnerRef(maintenance *unstructured.Unstructured, node *corev1.Node) } func (c *FaultRemediationClient) selectRemediationActionAndTemplate( + componentClass string, recommendedActionName string, nodeName string, ) (config.MaintenanceResource, *template.Template, string, error) { - resource, exists := c.remediationConfig.RemediationActions[recommendedActionName] + resource, actionKey, exists := c.remediationConfig.ResolveMaintenanceResource(componentClass, recommendedActionName) if !exists { slog.Error("No remediation configuration found for action", "action", recommendedActionName, + "componentClass", componentClass, "node", nodeName, - "availableActions", func() []string { - actions := make([]string, 0, len(c.remediationConfig.RemediationActions)) - for action := range c.remediationConfig.RemediationActions { - actions = append(actions, action) - } - - return actions - }()) + "availableSharedActions", c.remediationConfig.SharedActionNames(), + "availableComponentActions", c.remediationConfig.ComponentActionNames(componentClass)) return config.MaintenanceResource{}, nil, "", fmt.Errorf("no remediation config for action") } - tmpl := c.templates[recommendedActionName] + tmpl := c.templates[actionKey] if tmpl == nil { slog.Error("No template available for remediation action", "action", recommendedActionName, + "componentClass", componentClass, + "actionKey", actionKey, "node", nodeName) return config.MaintenanceResource{}, nil, "", fmt.Errorf("no template available for remediation action") } - return resource, tmpl, recommendedActionName, nil + return resource, tmpl, actionKey, nil } // getNodeForOwnerReference retrieves the node for setting owner reference on the CR. diff --git a/fault-remediation/pkg/remediation/remediation_test.go b/fault-remediation/pkg/remediation/remediation_test.go index 4246cb378..ec882eb0c 100644 --- a/fault-remediation/pkg/remediation/remediation_test.go +++ b/fault-remediation/pkg/remediation/remediation_test.go @@ -111,6 +111,56 @@ func TestNewRemediationClient(t *testing.T) { } } +func TestNewRemediationClient_LoadsComponentSpecificTemplates(t *testing.T) { + tempDir := t.TempDir() + + rebootTemplate := `apiVersion: janitor.dgxc.nvidia.com/v1alpha1 +kind: RebootNode +metadata: + name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }} +spec: + nodeName: {{ .HealthEvent.NodeName }}` + gpuTemplate := `apiVersion: janitor.dgxc.nvidia.com/v1alpha1 +kind: GPUReset +metadata: + name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }} +spec: + nodeName: {{ .HealthEvent.NodeName }}` + + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "rebootnode-template.yaml"), []byte(rebootTemplate), 0644)) + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "gpureset-template.yaml"), []byte(gpuTemplate), 0644)) + + testConfig := config.TomlConfig{ + Template: config.Template{MountPath: tempDir}, + RemediationActions: map[string]config.MaintenanceResource{ + protos.RecommendedAction_COMPONENT_RESET.String(): { + Version: "v1alpha1", + ApiGroup: "janitor.dgxc.nvidia.com", + Kind: "RebootNode", + CompleteConditionType: "NodeReady", + TemplateFileName: "rebootnode-template.yaml", + }, + }, + ComponentRemediationActions: config.ComponentRemediationActions{ + "GPU": { + protos.RecommendedAction_COMPONENT_RESET.String(): { + Version: "v1alpha1", + ApiGroup: "janitor.dgxc.nvidia.com", + Kind: "GPUReset", + CompleteConditionType: "Complete", + TemplateFileName: "gpureset-template.yaml", + }, + }, + }, + } + + result, err := NewRemediationClient(fake.NewClientBuilder().Build(), false, testConfig) + require.NoError(t, err) + require.NotNil(t, result) + assert.Contains(t, result.templates, protos.RecommendedAction_COMPONENT_RESET.String()) + assert.Contains(t, result.templates, "GPU/"+protos.RecommendedAction_COMPONENT_RESET.String()) +} + func TestNewRemediationClient_MissingTemplateFile_E2E(t *testing.T) { // Create a temporary directory for test files @@ -311,7 +361,7 @@ spec: }, }, } - groupConfig, err := common.GetGroupConfigForEvent(remediationConfig.RemediationActions, + groupConfig, err := common.GetGroupConfigForEvent(&remediationConfig, healthEventDoc.HealthEvent) assert.NoError(t, err) @@ -361,6 +411,101 @@ spec: } } +func TestCreateMaintenanceResource_UsesComponentSpecificOverride(t *testing.T) { + tempDir := t.TempDir() + + rebootTemplate := `apiVersion: janitor.dgxc.nvidia.com/v1alpha1 +kind: RebootNode +metadata: + name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }} +spec: + nodeName: {{ .HealthEvent.NodeName }} + force: false` + gpuTemplate := `apiVersion: janitor.dgxc.nvidia.com/v1alpha1 +kind: GPUReset +metadata: + name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }} +spec: + nodeName: {{ .HealthEvent.NodeName }} + selector: + uuids: + - {{ .ImpactedEntityScopeValue }}` + + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "rebootnode-template.yaml"), []byte(rebootTemplate), 0644)) + require.NoError(t, os.WriteFile(filepath.Join(tempDir, "gpureset-template.yaml"), []byte(gpuTemplate), 0644)) + + node := &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "test-node-override"}, + } + fakeClient := fake.NewClientBuilder().WithObjects(node).Build() + + cfg := config.TomlConfig{ + Template: config.Template{MountPath: tempDir}, + RemediationActions: map[string]config.MaintenanceResource{ + protos.RecommendedAction_COMPONENT_RESET.String(): { + Namespace: "dgxc-janitor", + Version: "v1alpha1", + ApiGroup: "janitor.dgxc.nvidia.com", + Kind: "RebootNode", + CompleteConditionType: "NodeReady", + TemplateFileName: "rebootnode-template.yaml", + EquivalenceGroup: "restart", + }, + }, + ComponentRemediationActions: config.ComponentRemediationActions{ + "GPU": { + protos.RecommendedAction_COMPONENT_RESET.String(): { + Namespace: "dgxc-janitor", + Version: "v1alpha1", + ApiGroup: "janitor.dgxc.nvidia.com", + Kind: "GPUReset", + CompleteConditionType: "Complete", + TemplateFileName: "gpureset-template.yaml", + EquivalenceGroup: "reset", + ImpactedEntityScope: "GPU_UUID", + }, + }, + }, + } + + remediationClient, err := NewRemediationClient(fakeClient, false, cfg) + require.NoError(t, err) + + healthEventDoc := &events.HealthEventData{ + ID: uuid.New().String(), + HealthEventWithStatus: model.HealthEventWithStatus{ + HealthEvent: &protos.HealthEvent{ + NodeName: "test-node-override", + ComponentClass: "GPU", + RecommendedAction: protos.RecommendedAction_COMPONENT_RESET, + EntitiesImpacted: []*protos.Entity{ + { + EntityType: "GPU_UUID", + EntityValue: "GPU-123", + }, + }, + }, + }, + } + groupConfig, err := common.GetGroupConfigForEvent(&cfg, healthEventDoc.HealthEvent) + require.NoError(t, err) + + crName, err := remediationClient.CreateMaintenanceResource(context.Background(), healthEventDoc, groupConfig) + require.NoError(t, err) + require.NotEmpty(t, crName) + + gvk := schema.GroupVersionKind{ + Group: "janitor.dgxc.nvidia.com", + Version: "v1alpha1", + Kind: "GPUReset", + } + obj := &unstructured.Unstructured{} + obj.SetGroupVersionKind(gvk) + err = fakeClient.Get(context.Background(), types.NamespacedName{Name: crName}, obj) + require.NoError(t, err) + assert.Equal(t, "GPUReset", obj.GetKind()) +} + func TestRunLogCollectorJob(t *testing.T) { eventId := "12345" jobNamespace := "test"