Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,30 @@ data:
supersedingEquivalenceGroups = {{ $config.supersedingEquivalenceGroups | toJson }}
{{- end }}
{{- end }}

{{- range $componentClass, $actions := .Values.maintenance.componentActions }}
{{- range $actionName, $config := $actions }}
[componentRemediationActions.{{ $componentClass | quote }}.{{ $actionName | quote }}]
apiGroup = {{ $config.apiGroup | quote }}
version = {{ $config.version | quote }}
kind = {{ $config.kind | quote }}
scope = {{ $config.scope | default "Cluster" | quote }}
{{- if $config.namespace }}
namespace = {{ $config.namespace | quote }}
{{- end }}
completeConditionType = {{ $config.completeConditionType | default "NodeReady" | quote }}
templateFileName = {{ $config.templateFileName | quote }}
{{- if $config.equivalenceGroup }}
equivalenceGroup = {{ $config.equivalenceGroup | quote }}
{{- end }}
{{- if $config.impactedEntityScope }}
impactedEntityScope = {{ $config.impactedEntityScope | quote }}
{{- end }}
{{- if $config.supersedingEquivalenceGroups }}
supersedingEquivalenceGroups = {{ $config.supersedingEquivalenceGroups | toJson }}
{{- end }}
{{- end }}
{{- end }}

[updateRetry]
maxRetries = {{ .Values.updateRetry.maxRetries }}
Expand Down
39 changes: 34 additions & 5 deletions distros/kubernetes/nvsentinel/charts/fault-remediation/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -88,11 +88,40 @@ maintenance:
templateFileName: "terminate-node.yaml"
equivalenceGroup: "terminate"

# NOTE: Resource names for RBAC are generated by appending 's' to lowercase kind.
# This works for regular nouns but may fail for irregular plurals:
# RebootNode → rebootnodes (✓ correct)
# Policy → policys (✗ should be policies)
# Use CRD kinds that follow regular pluralization rules.
# NOTE: Resource names for RBAC are generated by appending 's' to lowercase kind.
# This works for regular nouns but may fail for irregular plurals:
# RebootNode → rebootnodes (✓ correct)
# Policy → policys (✗ should be policies)
# Use CRD kinds that follow regular pluralization rules.

# Optional component-specific remediation definitions.
# Resolution order is:
# 1. maintenance.componentActions[componentClass][recommendedAction]
# 2. maintenance.actions[recommendedAction]
# Example:
# componentActions:
# GPU:
# COMPONENT_RESET:
# apiGroup: "janitor.dgxc.nvidia.com"
# version: "v1alpha1"
# kind: "GPUReset"
# scope: "Cluster"
# completeConditionType: "Complete"
# templateFileName: "gpureset-template.yaml"
# equivalenceGroup: "reset"
# impactedEntityScope: "GPU_UUID"
# supersedingEquivalenceGroups: ["restart"]
# LPU:
# COMPONENT_RESET:
# apiGroup: "janitor.dgxc.nvidia.com"
# version: "v1alpha1"
# kind: "LPURemediation"
# scope: "Cluster"
# completeConditionType: "Complete"
# templateFileName: "lpu-remediation-template.yaml"
# equivalenceGroup: "reset"
# supersedingEquivalenceGroups: ["restart"]
componentActions: {}

# Template content for each remediation action
# Key matches the templateFileName name from actions above
Expand Down
6 changes: 6 additions & 0 deletions distros/kubernetes/nvsentinel/values-full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,12 @@ fault-remediation:
# Policy → policys (✗ should be policies)
# Use CRD kinds that follow regular pluralization rules.

# Optional component-specific remediation definitions.
# Resolution order is:
# 1. maintenance.componentActions[componentClass][recommendedAction]
# 2. maintenance.actions[recommendedAction]
componentActions: {}

# Template content for each remediation action
# Key matches the templateFileName name from actions above
templates:
Expand Down
87 changes: 73 additions & 14 deletions docs/configuration/fault-remediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,18 +52,46 @@ Defines the Custom Resource that will be created to trigger remediation actions.
fault-remediation:
maintenance:
actions:
"COMPONENT_RESET":
"RESTART_VM":
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "GPUReset"
kind: "RebootNode"
scope: "Cluster"
completeConditionType: "Complete"
templateFileName: "gpureset-template.yaml"
equivalenceGroup: "reset"
supersedingEquivalenceGroups: ["restart"]
impactedEntityScope: "GPU_UUID"
completeConditionType: "NodeReady"
templateFileName: "rebootnode-template.yaml"
equivalenceGroup: "restart"

componentActions:
"GPU":
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe I'm missing some context here: what prevents a user from adding Node here and doing node-level remediation through this new pathway?

"COMPONENT_RESET":
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "GPUReset"
scope: "Cluster"
completeConditionType: "Complete"
templateFileName: "gpureset-template.yaml"
equivalenceGroup: "reset"
supersedingEquivalenceGroups: ["restart"]
impactedEntityScope: "GPU_UUID"
"LPU":
"COMPONENT_RESET":
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "LPURemediation"
scope: "Cluster"
completeConditionType: "Complete"
templateFileName: "lpu-remediation-template.yaml"
equivalenceGroup: "reset"
supersedingEquivalenceGroups: ["restart"]

templates:
"rebootnode-template.yaml": |
apiVersion: {{.ApiGroup}}/{{.Version}}
kind: RebootNode
metadata:
name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }}
spec:
nodeName: {{ .HealthEvent.NodeName }}
"gpureset-template.yaml": |
apiVersion: {{.ApiGroup}}/{{.Version}}
kind: GPUReset
Expand All @@ -74,10 +102,36 @@ fault-remediation:
selector:
uuids:
- {{ .ImpactedEntityScopeValue }}
"lpu-remediation-template.yaml": |
apiVersion: {{.ApiGroup}}/{{.Version}}
kind: LPURemediation
metadata:
name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }}
spec:
nodeName: {{ .HealthEvent.NodeName }}
device: {{ index .HealthEvent.Metadata "device" }}
```

### Resolution Order

Fault Remediation resolves the maintenance resource in this order:

1. `maintenance.componentActions[healthEvent.componentClass][healthEvent.recommendedAction]`
2. `maintenance.actions[healthEvent.recommendedAction]`

This allows node-level actions such as `RESTART_VM` and `RESTART_BM` to remain shared while still letting a
component-specific action like `COMPONENT_RESET` resolve to different CR kinds for GPU and LPU events.

### Parameters

#### actions
Shared remediation mappings keyed only by recommended action. Use this for actions that should behave the same across
component types, such as rebooting or terminating a node.

#### componentActions
Optional component-specific remediation mappings keyed by component class first and recommended action second. Use this
when the same recommended action should create a different maintenance CR depending on the failing component type.

#### apiGroup
API group of the maintenance CRD installed by your maintenance operator.

Expand All @@ -100,10 +154,15 @@ Kubernetes namespace where maintenance CRs will be created.
Defines which remediation actions are considered equivalent for deduplication. Actions in the same group will deduplicate against each other regardless of CRD type if a previous CRD is in a non-terminal state.

#### supersedingEquivalenceGroups
Defines additional equivalence groups that are considered equivalent for deduplication. For example, the COMPONENT_RESET action in the reset group should be deduplicated with the RESTART_VM action in the restart group. In other words, rebooting a node will have the same effect as resetting a GPU whereas the inverse is not true.
Defines additional equivalence groups that are considered equivalent for deduplication. For example, the GPU
`COMPONENT_RESET` action in the `reset` group can be deduplicated with the `RESTART_VM` action in the `restart` group.
In other words, rebooting a node will have the same effect as resetting a GPU whereas the inverse is not true.

#### impactedEntityScope
For the COMPONENT_RESET action, the impacted entity scope should be defined so that there's a unique equivalence group for each entity. The unique equivalence group is constructed by appending the value for the given impacted entity to the equivalence group name. For example, each GPU needing reset will be in its own equivalence group named like reset-<GPU_UUID>.
For component-scoped reset actions such as GPU `COMPONENT_RESET`, the impacted entity scope should be defined so that
there's a unique equivalence group for each entity. The unique equivalence group is constructed by appending the value
for the given impacted entity to the equivalence group name. For example, each GPU needing reset will be in its own
equivalence group named like `reset-<GPU_UUID>`.

#### templates
Go template that generates the maintenance CR YAML. See Template Extension Point section below.
Expand All @@ -119,11 +178,11 @@ The maintenance template is a Go template that generates the Kubernetes CR YAML
- `.HealthEvent` (HealthEvent) - The entire content of the triggering health event
- `.RecommendedAction` (int) - Numeric action code from health event (see [health_event.proto](https://github.com/NVIDIA/NVSentinel/blob/main/data-models/protobufs/health_event.proto))
- `.RecommendedActionName` (string) - Action name from the health event
- `.ImpactedEntityScopeValue` (string) - The GPU_UUID used in COMPONENT_RESET remediation actions
- `.ApiGroup` (string) - Value from `maintenance.apiGroup`
- `.Version` (string) - Value from `maintenance.version`
- `.Kind` (string) - Value from `maintenance.kind`
- `.Namespace` (string) - Value from `maintenance.namespace`
- `.ImpactedEntityScopeValue` (string) - The impacted entity value used by component-scoped reset actions such as GPU `COMPONENT_RESET`
- `.ApiGroup` (string) - Value from the resolved maintenance resource
- `.Version` (string) - Value from the resolved maintenance resource
- `.Kind` (string) - Value from the resolved maintenance resource
- `.Namespace` (string) - Value from the resolved maintenance resource

### Template Examples

Expand Down
46 changes: 36 additions & 10 deletions docs/fault-remediation.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,16 +49,29 @@ fault-remediation:
completeConditionType: "NodeReady"
templateFileName: "rebootnode-template.yaml"
equivalenceGroup: "restart"
"COMPONENT_RESET":
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "GPUReset"
scope: "Cluster"
completeConditionType: "Complete"
templateFileName: "gpureset-template.yaml"
equivalenceGroup: "reset"
impactedEntityScope: "GPU_UUID"
supersedingEquivalenceGroups: ["restart"]

componentActions:
"GPU":
"COMPONENT_RESET":
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "GPUReset"
scope: "Cluster"
completeConditionType: "Complete"
templateFileName: "gpureset-template.yaml"
equivalenceGroup: "reset"
impactedEntityScope: "GPU_UUID"
supersedingEquivalenceGroups: ["restart"]
"LPU":
"COMPONENT_RESET":
apiGroup: "janitor.dgxc.nvidia.com"
version: "v1alpha1"
kind: "LPURemediation"
scope: "Cluster"
completeConditionType: "Complete"
templateFileName: "lpu-remediation-template.yaml"
equivalenceGroup: "reset"
supersedingEquivalenceGroups: ["restart"]

templates:
"rebootnode-template.yaml": |
Expand All @@ -78,13 +91,26 @@ fault-remediation:
selector:
uuids:
- {{ .ImpactedEntityScopeValue }}
"lpu-remediation-template.yaml": |
apiVersion: {{.ApiGroup}}/{{.Version}}
kind: LPURemediation
metadata:
name: maintenance-{{ .HealthEvent.NodeName }}-{{ .HealthEventID }}
spec:
nodeName: {{ .HealthEvent.NodeName }}
device: {{ index .HealthEvent.Metadata "device" }}
Comment on lines +94 to +101
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Quote templated device value in the example for YAML safety.

At Line 101, prefer quoting the rendered value to avoid YAML parsing edge cases for device identifiers:
device: "{{ index .HealthEvent.Metadata "device" }}".

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@docs/fault-remediation.md` around lines 94 - 101, In the LPURemediation
template under the "lpu-remediation-template.yaml" snippet, the unquoted device
value pulled via index from HealthEvent.Metadata can produce YAML parsing
issues; update the spec.nodeName/device line to render the device value wrapped
in double quotes (i.e., quote the result of index .HealthEvent.Metadata
"device") so the generated LPURemediation manifest (kind: LPURemediation) safely
handles device identifiers.


logCollector:
enabled: false # Enable log collection before remediation
uploadURL: "http://nvsentinel-incluster-file-server.nvsentinel.svc.cluster.local/upload"
timeout: "10m"
```

Resolution order is:

- `maintenance.componentActions[componentClass][recommendedAction]`
- fallback to `maintenance.actions[recommendedAction]`

### Configuration Options

- **Dry Run**: Test CRD creation without creating maintenance requests
Expand Down
9 changes: 5 additions & 4 deletions fault-remediation/pkg/annotation/annotation.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func (m *NodeAnnotationManager) GetRemediationState(

// UpdateRemediationState updates the node annotation with new remediation state
func (m *NodeAnnotationManager) UpdateRemediationState(ctx context.Context, nodeName string,
group string, crName string, actionName string) error {
group string, crName string, actionName string, componentClass string) error {
err := retry.RetryOnConflict(conflictBackoff, func() error {
// Get current state
state, node, err := m.GetRemediationState(ctx, nodeName)
Expand All @@ -106,9 +106,10 @@ func (m *NodeAnnotationManager) UpdateRemediationState(ctx context.Context, node

// Update state for the group
state.EquivalenceGroups[group] = EquivalenceGroupState{
MaintenanceCR: crName,
CreatedAt: time.Now().UTC(),
ActionName: actionName,
MaintenanceCR: crName,
CreatedAt: time.Now().UTC(),
ActionName: actionName,
ComponentClass: componentClass,
}

// Marshal to JSON
Expand Down
9 changes: 6 additions & 3 deletions fault-remediation/pkg/annotation/annotation_interface.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,8 @@ const (
// NodeAnnotationManagerInterface defines the interface for managing node annotations
type NodeAnnotationManagerInterface interface {
GetRemediationState(ctx context.Context, nodeName string) (*RemediationStateAnnotation, *corev1.Node, error)
UpdateRemediationState(ctx context.Context, nodeName string, group string, crName string, actionName string) error
UpdateRemediationState(ctx context.Context, nodeName string, group string, crName string, actionName string,
componentClass string) error
ClearRemediationState(ctx context.Context, nodeName string) error
RemoveGroupsFromState(ctx context.Context, nodeName string, groups []string) error
}
Expand All @@ -44,7 +45,9 @@ type EquivalenceGroupState struct {
MaintenanceCR string `json:"maintenanceCR"`
CreatedAt time.Time `json:"createdAt"`

// Action that created the CR (e.g., "RESTART_BM")
// Required to look up the corresponding MaintenanceResource from the TomlConfig
// Action corresponding with the HealthEvent that created the CR (e.g., "RESTART_BM")
ActionName string `json:"actionName"`

// Component class corresponding with the HealthEvent that created the CR (e.g., "GPU")
ComponentClass string `json:"componentClass,omitempty"`
}
Loading
Loading