diff --git a/cmd/vsphere/main.go b/cmd/vsphere/main.go index 38bfa24f2..e3f4c1220 100644 --- a/cmd/vsphere/main.go +++ b/cmd/vsphere/main.go @@ -8,12 +8,15 @@ import ( "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apiserver/pkg/util/feature" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" "k8s.io/component-base/featuregate" "k8s.io/klog/v2" ipamv1beta1 "sigs.k8s.io/cluster-api/api/ipam/v1beta1" //nolint:staticcheck ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/client/config" "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/healthz" @@ -25,6 +28,7 @@ import ( machinev1 "github.com/openshift/api/machine/v1beta1" "github.com/openshift/library-go/pkg/config/leaderelection" "github.com/openshift/library-go/pkg/features" + "github.com/openshift/machine-api-operator/pkg/controller/infrastructurevalidation" capimachine "github.com/openshift/machine-api-operator/pkg/controller/machine" "github.com/openshift/machine-api-operator/pkg/controller/vsphere" machine "github.com/openshift/machine-api-operator/pkg/controller/vsphere" @@ -122,7 +126,29 @@ func main() { LeaseDuration: metav1.Duration{Duration: *leaderElectLeaseDuration}, }) + // Setup Scheme for all resources - must be done before creating manager + // so that cache configuration can reference these types + scheme := runtime.NewScheme() + + // Register core Kubernetes types (Secret, ConfigMap, etc.) + if err := clientgoscheme.AddToScheme(scheme); err != nil { + klog.Fatalf("unable to add client-go scheme: %v", err) + } + + if err := configv1.Install(scheme); err != nil { + klog.Fatalf("unable to add configv1 to scheme: %v", err) + } + + if err := machinev1.Install(scheme); err != nil { + klog.Fatalf("unable to add machinev1 to scheme: %v", err) + } + + if err := ipamv1beta1.AddToScheme(scheme); err != nil { + klog.Fatalf("unable to add ipamv1beta1 to scheme: %v", err) + } + opts := manager.Options{ + Scheme: scheme, Metrics: server.Options{ BindAddress: *metricsAddress, }, @@ -145,6 +171,15 @@ func main() { klog.Infof("Watching machine-api objects only in namespace %q for reconciliation.", *watchNamespace) } + // Infrastructure and ClusterOperator are cluster-scoped resources that need to be watched + // regardless of namespace restrictions. + // Machines also need to be watched cluster-wide for infrastructure validation. + opts.Cache.ByObject = map[client.Object]cache.ByObject{ + &configv1.Infrastructure{}: {}, + &configv1.ClusterOperator{}: {}, + &machinev1.Machine{}: {}, // Watch machines cluster-wide for infrastructure validation + } + // Sets feature gates from flags klog.Infof("Initializing feature gates: %s", strings.Join(defaultMutableGate.KnownFeatures(), ", ")) warnings, err := gateOpts.ApplyTo(defaultMutableGate) @@ -182,21 +217,8 @@ func main() { OpenshiftConfigNamespace: vsphere.OpenshiftConfigManagedNamespace, }) - if err := configv1.Install(mgr.GetScheme()); err != nil { - klog.Fatal(err) - } - - if err := machinev1.Install(mgr.GetScheme()); err != nil { - klog.Fatal(err) - } - - if err := machinev1.Install(mgr.GetScheme()); err != nil { - klog.Fatal(err) - } - - if err := ipamv1beta1.AddToScheme(mgr.GetScheme()); err != nil { - klog.Fatalf("unable to add ipamv1beta1 to scheme: %v", err) - } + // Scheme was already installed before manager creation (see above) + // This allows cache configuration to reference these types if err := capimachine.AddWithActuator(mgr, machineActuator, defaultMutableGate); err != nil { klog.Fatal(err) @@ -211,6 +233,11 @@ func main() { os.Exit(1) } + // Setup Infrastructure validation controller + if err := infrastructurevalidation.Add(mgr, opts); err != nil { + klog.Fatalf("Failed to add infrastructure validation controller: %v", err) + } + if err := mgr.AddReadyzCheck("ping", healthz.Ping); err != nil { klog.Fatal(err) } diff --git a/install/0000_30_machine-api-operator_09_rbac.yaml b/install/0000_30_machine-api-operator_09_rbac.yaml index e70deb082..1218da97c 100644 --- a/install/0000_30_machine-api-operator_09_rbac.yaml +++ b/install/0000_30_machine-api-operator_09_rbac.yaml @@ -288,6 +288,17 @@ rules: - list - watch + - apiGroups: + - config.openshift.io + resources: + - clusteroperators + - clusteroperators/status + verbs: + - get + - list + - watch + - update + --- apiVersion: rbac.authorization.k8s.io/v1 kind: Role diff --git a/pkg/controller/infrastructurevalidation/infrastructurevalidation_controller.go b/pkg/controller/infrastructurevalidation/infrastructurevalidation_controller.go new file mode 100644 index 000000000..8f6e045b6 --- /dev/null +++ b/pkg/controller/infrastructurevalidation/infrastructurevalidation_controller.go @@ -0,0 +1,285 @@ +package infrastructurevalidation + +import ( + "context" + "fmt" + "strings" + + configv1 "github.com/openshift/api/config/v1" + machinev1 "github.com/openshift/api/machine/v1beta1" + "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers" + "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" + "k8s.io/utils/clock" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/controller-runtime/pkg/source" +) + +const ( + controllerName = "infrastructurevalidation-controller" + clusterOperatorName = "machine-api" + infrastructureName = "cluster" + conditionType = "InfrastructureFailureDomainsValid" + reasonAllReferencesValid = "AllReferencesValid" + reasonOrphanedReferences = "OrphanedMachineReferences" + degradedReasonOrphanedReferences = "OrphanedFailureDomainReferences" +) + +// Ensure ReconcileInfrastructureValidation implements reconcile.Reconciler +var _ reconcile.Reconciler = &ReconcileInfrastructureValidation{} + +// ReconcileInfrastructureValidation validates Infrastructure failure domain configurations +type ReconcileInfrastructureValidation struct { + client client.Client + apiReader client.Reader // Bypasses cache for cluster-scoped resources + recorder record.EventRecorder +} + +// Add creates a new Infrastructure validation controller and adds it to the Manager. +func Add(mgr manager.Manager, opts manager.Options) error { + reconciler := newReconciler(mgr) + return add(mgr, reconciler) +} + +// newReconciler returns a new reconcile.Reconciler +func newReconciler(mgr manager.Manager) *ReconcileInfrastructureValidation { + return &ReconcileInfrastructureValidation{ + client: mgr.GetClient(), + apiReader: mgr.GetAPIReader(), // Use APIReader to bypass cache for Infrastructure/ClusterOperator + recorder: mgr.GetEventRecorderFor(controllerName), + } +} + +// add adds a new Controller to mgr with r as the reconcile.Reconciler +func add(mgr manager.Manager, r *ReconcileInfrastructureValidation) error { + // Create a new controller + c, err := controller.New(controllerName, mgr, controller.Options{Reconciler: r}) + if err != nil { + return err + } + + // Watch Infrastructure/cluster + err = c.Watch(source.Kind(mgr.GetCache(), &configv1.Infrastructure{}, + &handler.TypedEnqueueRequestForObject[*configv1.Infrastructure]{})) + if err != nil { + return fmt.Errorf("failed to watch Infrastructure: %w", err) + } + + // Watch Machines and trigger Infrastructure reconcile when they change + err = c.Watch(source.Kind(mgr.GetCache(), &machinev1.Machine{}, + handler.TypedEnqueueRequestsFromMapFunc[*machinev1.Machine](infrastructureRequestFromMachine))) + if err != nil { + return fmt.Errorf("failed to watch Machines: %w", err) + } + + klog.Infof("Infrastructure validation controller started") + return nil +} + +// infrastructureRequestFromMachine maps any Machine event to a reconcile request +// for the Infrastructure/cluster singleton. +func infrastructureRequestFromMachine(ctx context.Context, machine *machinev1.Machine) []reconcile.Request { + // Always reconcile the cluster Infrastructure singleton when any machine changes + return []reconcile.Request{ + { + NamespacedName: client.ObjectKey{Name: infrastructureName}, + }, + } +} + +// Reconcile validates that failure domains referenced by machines are configured +// in the Infrastructure object. +func (r *ReconcileInfrastructureValidation) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) { + klog.Infof("Reconciling Infrastructure validation for %v", request.NamespacedName) + + // Fetch Infrastructure using APIReader to bypass cache + // This ensures we always get the latest state, even with namespace-scoped cache + infra := &configv1.Infrastructure{} + err := r.apiReader.Get(ctx, request.NamespacedName, infra) + if err != nil { + if errors.IsNotFound(err) { + // Infrastructure doesn't exist, nothing to validate + klog.V(3).Infof("Infrastructure %s not found, skipping validation", request.NamespacedName) + return reconcile.Result{}, nil + } + klog.Errorf("Failed to get Infrastructure: %v", err) + return reconcile.Result{}, fmt.Errorf("failed to get Infrastructure: %w", err) + } + + klog.V(4).Infof("Processing Infrastructure generation=%d, resourceVersion=%s", + infra.Generation, infra.ResourceVersion) + + // Skip validation if not VSphere platform + if !isVSpherePlatform(infra) { + klog.V(4).Infof("Platform is not VSphere, skipping failure domain validation") + // Clear any existing validation conditions for non-VSphere platforms + return r.clearValidationCondition(ctx) + } + + // Run VSphere validation + validator := &VSphereFailureDomainValidator{client: r.client} + result, err := validator.Validate(ctx, infra) + if err != nil { + klog.Errorf("Failed to validate failure domains: %v", err) + return reconcile.Result{}, fmt.Errorf("failed to validate failure domains: %w", err) + } + + // Update ClusterOperator conditions + if err := r.updateClusterOperatorConditions(ctx, result); err != nil { + klog.Errorf("Failed to update ClusterOperator conditions: %v", err) + return reconcile.Result{}, fmt.Errorf("failed to update ClusterOperator conditions: %w", err) + } + + // Note: We don't emit events on Infrastructure because it's cluster-scoped + // and would create events in the default namespace, causing RBAC issues. + // The ClusterOperator conditions provide the canonical status. + + return reconcile.Result{}, nil +} + +// isVSpherePlatform checks if the Infrastructure is configured for VSphere. +func isVSpherePlatform(infra *configv1.Infrastructure) bool { + return infra.Status.PlatformStatus != nil && + infra.Status.PlatformStatus.Type == configv1.VSpherePlatformType +} + +// updateClusterOperatorConditions updates the ClusterOperator with validation results. +func (r *ReconcileInfrastructureValidation) updateClusterOperatorConditions( + ctx context.Context, + result *FailureDomainValidationResult, +) error { + // Use APIReader to bypass cache and get latest ClusterOperator state + co := &configv1.ClusterOperator{} + if err := r.apiReader.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err != nil { + if errors.IsNotFound(err) { + klog.Warningf("ClusterOperator %s not found, cannot update condition", clusterOperatorName) + return nil + } + return fmt.Errorf("failed to get ClusterOperator: %w", err) + } + + // Create validation condition + validationCondition := configv1.ClusterOperatorStatusCondition{ + Type: conditionType, + } + + // Create degraded condition + var degradedCondition configv1.ClusterOperatorStatusCondition + + if result.Valid { + // Validation passed + validationCondition.Status = configv1.ConditionTrue + validationCondition.Reason = reasonAllReferencesValid + validationCondition.Message = "All machine failure domain references are valid" + + // Only clear Degraded if it was set by us (matching our specific reason) + existingDegraded := v1helpers.FindStatusCondition(co.Status.Conditions, configv1.OperatorDegraded) + if existingDegraded != nil && existingDegraded.Reason == degradedReasonOrphanedReferences { + degradedCondition.Type = configv1.OperatorDegraded + degradedCondition.Status = configv1.ConditionFalse + degradedCondition.Reason = reasonAllReferencesValid + degradedCondition.Message = "All machine failure domain references are valid" + v1helpers.SetStatusCondition(&co.Status.Conditions, degradedCondition, clock.RealClock{}) + klog.V(3).Info("Cleared Degraded condition (orphaned failure domain references resolved)") + } + } else { + // Validation failed - machines reference removed failure domains + validationCondition.Status = configv1.ConditionFalse + validationCondition.Reason = reasonOrphanedReferences + validationCondition.Message = formatOrphanedMessage(result.OrphanedReferences) + + // Set operator to Degraded + degradedCondition.Type = configv1.OperatorDegraded + degradedCondition.Status = configv1.ConditionTrue + degradedCondition.Reason = degradedReasonOrphanedReferences + degradedCondition.Message = fmt.Sprintf("Machines reference removed failure domains. %s", + formatOrphanedMessage(result.OrphanedReferences)) + v1helpers.SetStatusCondition(&co.Status.Conditions, degradedCondition, clock.RealClock{}) + klog.Warningf("Set ClusterOperator to Degraded: %s", degradedCondition.Message) + } + + // Set validation condition + v1helpers.SetStatusCondition(&co.Status.Conditions, validationCondition, clock.RealClock{}) + + // Update ClusterOperator status + if err := r.client.Status().Update(ctx, co); err != nil { + return fmt.Errorf("failed to update ClusterOperator status: %w", err) + } + + klog.V(3).Infof("Updated ClusterOperator conditions: %s=%s, Degraded=%s", + conditionType, validationCondition.Status, degradedCondition.Status) + return nil +} + +// clearValidationCondition removes the validation condition and our Degraded condition for non-VSphere platforms. +func (r *ReconcileInfrastructureValidation) clearValidationCondition(ctx context.Context) (reconcile.Result, error) { + // Use APIReader to bypass cache and get latest ClusterOperator state + co := &configv1.ClusterOperator{} + if err := r.apiReader.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, co); err != nil { + if errors.IsNotFound(err) { + return reconcile.Result{}, nil + } + return reconcile.Result{}, fmt.Errorf("failed to get ClusterOperator: %w", err) + } + + // Remove the validation condition and our specific Degraded condition if they exist + changed := false + newConditions := []configv1.ClusterOperatorStatusCondition{} + for _, cond := range co.Status.Conditions { + // Remove our validation condition + if cond.Type == conditionType { + changed = true + continue + } + // Remove Degraded condition if it was set by us + if cond.Type == configv1.OperatorDegraded && cond.Reason == degradedReasonOrphanedReferences { + changed = true + klog.V(3).Info("Cleared Degraded condition for non-VSphere platform") + continue + } + newConditions = append(newConditions, cond) + } + + if changed { + co.Status.Conditions = newConditions + if err := r.client.Status().Update(ctx, co); err != nil { + return reconcile.Result{}, fmt.Errorf("failed to update ClusterOperator status: %w", err) + } + klog.V(3).Infof("Cleared validation conditions for non-VSphere platform") + } + + return reconcile.Result{}, nil +} + +// formatOrphanedMessage formats the orphaned references into a human-readable message. +func formatOrphanedMessage(refs []OrphanedReference) string { + if len(refs) == 0 { + return "" + } + + // Group by failure domain for cleaner message + domainToMachines := make(map[string][]string) + for _, ref := range refs { + key := ref.FailureDomainName + machine := fmt.Sprintf("%s/%s", ref.MachineNamespace, ref.MachineName) + domainToMachines[key] = append(domainToMachines[key], machine) + } + + var parts []string + for domain, machines := range domainToMachines { + if len(machines) == 1 { + parts = append(parts, fmt.Sprintf("domain %q: %s", domain, machines[0])) + } else { + parts = append(parts, fmt.Sprintf("domain %q: %s and %d more", + domain, machines[0], len(machines)-1)) + } + } + + return fmt.Sprintf("Machines reference removed failure domains: %s", + strings.Join(parts, "; ")) +} diff --git a/pkg/controller/infrastructurevalidation/infrastructurevalidation_controller_test.go b/pkg/controller/infrastructurevalidation/infrastructurevalidation_controller_test.go new file mode 100644 index 000000000..81b4777b4 --- /dev/null +++ b/pkg/controller/infrastructurevalidation/infrastructurevalidation_controller_test.go @@ -0,0 +1,490 @@ +package infrastructurevalidation + +import ( + "context" + "testing" + + configv1 "github.com/openshift/api/config/v1" + machinev1 "github.com/openshift/api/machine/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/tools/record" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + "sigs.k8s.io/controller-runtime/pkg/reconcile" +) + +func init() { + if err := machinev1.Install(scheme.Scheme); err != nil { + klog.Fatal(err) + } + if err := configv1.Install(scheme.Scheme); err != nil { + klog.Fatal(err) + } +} + +// createClusterOperator creates a test ClusterOperator +func createClusterOperator() *configv1.ClusterOperator { + return &configv1.ClusterOperator{ + ObjectMeta: metav1.ObjectMeta{ + Name: clusterOperatorName, + }, + Status: configv1.ClusterOperatorStatus{ + Conditions: []configv1.ClusterOperatorStatusCondition{}, + }, + } +} + +func TestReconcileVSphereValidReferences(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + {Name: "fd-west", Region: "us-west", Zone: "us-west-1a"}, + }) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") + machine2 := createVSphereMachine("machine-2", testNamespace, "us-west", "us-west-1a") + co := createClusterOperator() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1, machine2, co). + WithStatusSubresource(co). + Build() + + recorder := record.NewFakeRecorder(10) + + r := &ReconcileInfrastructureValidation{ + client: fakeClient, + apiReader: fakeClient, // In tests, use the same fake client as apiReader + recorder: recorder, + } + + request := reconcile.Request{ + NamespacedName: client.ObjectKey{Name: infrastructureName}, + } + + result, err := r.Reconcile(ctx, request) + if err != nil { + t.Fatalf("Reconcile failed: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("Expected no requeue") + } + + // Verify ClusterOperator condition was set correctly + updatedCO := &configv1.ClusterOperator{} + if err := fakeClient.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, updatedCO); err != nil { + t.Fatalf("Failed to get ClusterOperator: %v", err) + } + + foundValidation := false + for _, cond := range updatedCO.Status.Conditions { + if cond.Type == conditionType { + foundValidation = true + if cond.Status != configv1.ConditionTrue { + t.Errorf("Expected condition status to be True, got %v", cond.Status) + } + if cond.Reason != reasonAllReferencesValid { + t.Errorf("Expected reason %q, got %q", reasonAllReferencesValid, cond.Reason) + } + } + if cond.Type == configv1.OperatorDegraded { + // Degraded should not be set by us when validation passes + if cond.Reason == degradedReasonOrphanedReferences { + t.Errorf("Expected Degraded condition not to be set by us when validation passes") + } + } + } + + if !foundValidation { + t.Error("Expected validation condition to be set") + } +} + +func TestReconcileVSphereOrphanedReferences(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") // valid + machine2 := createVSphereMachine("machine-2", testNamespace, "us-west", "us-west-1a") // orphaned + co := createClusterOperator() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1, machine2, co). + WithStatusSubresource(co). + Build() + + recorder := record.NewFakeRecorder(10) + + r := &ReconcileInfrastructureValidation{ + client: fakeClient, + apiReader: fakeClient, // In tests, use the same fake client as apiReader + recorder: recorder, + } + + request := reconcile.Request{ + NamespacedName: client.ObjectKey{Name: infrastructureName}, + } + + result, err := r.Reconcile(ctx, request) + if err != nil { + t.Fatalf("Reconcile failed: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("Expected no requeue") + } + + // Verify ClusterOperator conditions were set correctly + updatedCO := &configv1.ClusterOperator{} + if err := fakeClient.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, updatedCO); err != nil { + t.Fatalf("Failed to get ClusterOperator: %v", err) + } + + foundValidation := false + foundDegraded := false + for _, cond := range updatedCO.Status.Conditions { + if cond.Type == conditionType { + foundValidation = true + if cond.Status != configv1.ConditionFalse { + t.Errorf("Expected condition status to be False, got %v", cond.Status) + } + if cond.Reason != reasonOrphanedReferences { + t.Errorf("Expected reason %q, got %q", reasonOrphanedReferences, cond.Reason) + } + // Message should contain the orphaned machine info + if cond.Message == "" { + t.Error("Expected non-empty message") + } + } + if cond.Type == configv1.OperatorDegraded { + foundDegraded = true + if cond.Status != configv1.ConditionTrue { + t.Errorf("Expected Degraded status to be True, got %v", cond.Status) + } + if cond.Reason != degradedReasonOrphanedReferences { + t.Errorf("Expected Degraded reason %q, got %q", degradedReasonOrphanedReferences, cond.Reason) + } + if cond.Message == "" { + t.Error("Expected non-empty Degraded message") + } + } + } + + if !foundValidation { + t.Error("Expected validation condition to be set") + } + if !foundDegraded { + t.Error("Expected Degraded condition to be set when orphaned references exist") + } + + // Note: We no longer emit events on Infrastructure (cluster-scoped resource) + // to avoid RBAC issues with events in the default namespace. + // The ClusterOperator conditions provide the canonical status. +} + +func TestReconcileDegradedConditionCleared(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") + + // Create ClusterOperator with pre-existing Degraded condition set by us + co := createClusterOperator() + co.Status.Conditions = []configv1.ClusterOperatorStatusCondition{ + { + Type: configv1.OperatorDegraded, + Status: configv1.ConditionTrue, + Reason: degradedReasonOrphanedReferences, + Message: "Previously degraded", + }, + } + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1, co). + WithStatusSubresource(co). + Build() + + recorder := record.NewFakeRecorder(10) + + r := &ReconcileInfrastructureValidation{ + client: fakeClient, + apiReader: fakeClient, // In tests, use the same fake client as apiReader + recorder: recorder, + } + + request := reconcile.Request{ + NamespacedName: client.ObjectKey{Name: infrastructureName}, + } + + result, err := r.Reconcile(ctx, request) + if err != nil { + t.Fatalf("Reconcile failed: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("Expected no requeue") + } + + // Verify Degraded condition was cleared + updatedCO := &configv1.ClusterOperator{} + if err := fakeClient.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, updatedCO); err != nil { + t.Fatalf("Failed to get ClusterOperator: %v", err) + } + + for _, cond := range updatedCO.Status.Conditions { + if cond.Type == configv1.OperatorDegraded { + if cond.Status == configv1.ConditionTrue && cond.Reason == degradedReasonOrphanedReferences { + t.Error("Expected Degraded condition to be cleared when validation passes") + } + } + } +} + +func TestReconcileNonVSpherePlatform(t *testing.T) { + ctx := context.Background() + + // AWS platform instead of VSphere + infra := &configv1.Infrastructure{ + ObjectMeta: metav1.ObjectMeta{ + Name: infrastructureName, + }, + Spec: configv1.InfrastructureSpec{ + PlatformSpec: configv1.PlatformSpec{ + Type: configv1.AWSPlatformType, + }, + }, + Status: configv1.InfrastructureStatus{ + PlatformStatus: &configv1.PlatformStatus{ + Type: configv1.AWSPlatformType, + }, + }, + } + co := createClusterOperator() + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, co). + Build() + + recorder := record.NewFakeRecorder(10) + + r := &ReconcileInfrastructureValidation{ + client: fakeClient, + apiReader: fakeClient, // In tests, use the same fake client as apiReader + recorder: recorder, + } + + request := reconcile.Request{ + NamespacedName: client.ObjectKey{Name: infrastructureName}, + } + + result, err := r.Reconcile(ctx, request) + if err != nil { + t.Fatalf("Reconcile failed: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("Expected no requeue") + } + + // Verify validation condition was NOT set (or was removed) + updatedCO := &configv1.ClusterOperator{} + if err := fakeClient.Get(ctx, client.ObjectKey{Name: clusterOperatorName}, updatedCO); err != nil { + t.Fatalf("Failed to get ClusterOperator: %v", err) + } + + for _, cond := range updatedCO.Status.Conditions { + if cond.Type == conditionType { + t.Error("Expected validation condition to be removed for non-VSphere platform") + } + } +} + +func TestReconcileInfrastructureNotFound(t *testing.T) { + ctx := context.Background() + + // No Infrastructure object + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + Build() + + recorder := record.NewFakeRecorder(10) + + r := &ReconcileInfrastructureValidation{ + client: fakeClient, + apiReader: fakeClient, // In tests, use the same fake client as apiReader + recorder: recorder, + } + + request := reconcile.Request{ + NamespacedName: client.ObjectKey{Name: infrastructureName}, + } + + result, err := r.Reconcile(ctx, request) + if err != nil { + t.Fatalf("Reconcile failed: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("Expected no requeue when Infrastructure not found") + } +} + +func TestReconcileClusterOperatorNotFound(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") + + // No ClusterOperator object + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1). + Build() + + recorder := record.NewFakeRecorder(10) + + r := &ReconcileInfrastructureValidation{ + client: fakeClient, + apiReader: fakeClient, // In tests, use the same fake client as apiReader + recorder: recorder, + } + + request := reconcile.Request{ + NamespacedName: client.ObjectKey{Name: infrastructureName}, + } + + // Should not fail when ClusterOperator doesn't exist + result, err := r.Reconcile(ctx, request) + if err != nil { + t.Fatalf("Reconcile should not fail when ClusterOperator not found: %v", err) + } + + if result.RequeueAfter > 0 { + t.Error("Expected no requeue") + } +} + +func TestIsVSpherePlatform(t *testing.T) { + tests := []struct { + name string + infra *configv1.Infrastructure + expected bool + }{ + { + name: "VSphere platform", + infra: createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }), + expected: true, + }, + { + name: "AWS platform", + infra: &configv1.Infrastructure{ + Status: configv1.InfrastructureStatus{ + PlatformStatus: &configv1.PlatformStatus{ + Type: configv1.AWSPlatformType, + }, + }, + }, + expected: false, + }, + { + name: "No platform status", + infra: &configv1.Infrastructure{ + Status: configv1.InfrastructureStatus{}, + }, + expected: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := isVSpherePlatform(tt.infra) + if result != tt.expected { + t.Errorf("Expected %v, got %v", tt.expected, result) + } + }) + } +} + +func TestFormatOrphanedMessage(t *testing.T) { + tests := []struct { + name string + refs []OrphanedReference + expected string + }{ + { + name: "Empty references", + refs: []OrphanedReference{}, + expected: "", + }, + { + name: "Single reference", + refs: []OrphanedReference{ + { + FailureDomainName: "fd-west", + MachineName: "machine-1", + MachineNamespace: "openshift-machine-api", + }, + }, + expected: "Machines reference removed failure domains: domain \"fd-west\": openshift-machine-api/machine-1", + }, + { + name: "Multiple references same domain", + refs: []OrphanedReference{ + { + FailureDomainName: "fd-west", + MachineName: "machine-1", + MachineNamespace: "openshift-machine-api", + }, + { + FailureDomainName: "fd-west", + MachineName: "machine-2", + MachineNamespace: "openshift-machine-api", + }, + }, + expected: "Machines reference removed failure domains: domain \"fd-west\": openshift-machine-api/machine-1 and 1 more", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := formatOrphanedMessage(tt.refs) + if result != tt.expected { + t.Errorf("Expected %q, got %q", tt.expected, result) + } + }) + } +} + +func TestInfrastructureRequestFromMachine(t *testing.T) { + ctx := context.Background() + machine := createVSphereMachine("test-machine", testNamespace, "us-east", "us-east-1a") + + requests := infrastructureRequestFromMachine(ctx, machine) + + if len(requests) != 1 { + t.Errorf("Expected 1 request, got %d", len(requests)) + } + + if requests[0].Name != infrastructureName { + t.Errorf("Expected Infrastructure name %q, got %q", infrastructureName, requests[0].Name) + } + + if requests[0].Namespace != "" { + t.Errorf("Expected empty namespace, got %q", requests[0].Namespace) + } +} diff --git a/pkg/controller/infrastructurevalidation/vsphere_validator.go b/pkg/controller/infrastructurevalidation/vsphere_validator.go new file mode 100644 index 000000000..03bd03c55 --- /dev/null +++ b/pkg/controller/infrastructurevalidation/vsphere_validator.go @@ -0,0 +1,137 @@ +package infrastructurevalidation + +import ( + "context" + "fmt" + + configv1 "github.com/openshift/api/config/v1" + machinev1 "github.com/openshift/api/machine/v1beta1" + "k8s.io/klog/v2" + runtimeclient "sigs.k8s.io/controller-runtime/pkg/client" +) + +const ( + // Machine label keys for region and zone + machineRegionLabelKey = "machine.openshift.io/region" + machineZoneLabelKey = "machine.openshift.io/zone" +) + +// FailureDomainValidationResult contains the results of failure domain validation. +type FailureDomainValidationResult struct { + Valid bool + OrphanedReferences []OrphanedReference + ConfiguredDomains []string +} + +// OrphanedReference represents a machine that references a removed failure domain. +type OrphanedReference struct { + FailureDomainName string // Region+Zone combination + MachineName string + MachineNamespace string +} + +// failureDomainKey represents a unique region+zone combination +type failureDomainKey struct { + Region string + Zone string +} + +// VSphereFailureDomainValidator validates VSphere failure domain references. +type VSphereFailureDomainValidator struct { + client runtimeclient.Client +} + +// Validate checks if any VSphere machines reference failure domains that have been removed +// from the Infrastructure configuration. +func (v *VSphereFailureDomainValidator) Validate( + ctx context.Context, + infra *configv1.Infrastructure, +) (*FailureDomainValidationResult, error) { + result := &FailureDomainValidationResult{ + Valid: true, + OrphanedReferences: []OrphanedReference{}, + ConfiguredDomains: []string{}, + } + + // Build map of configured failure domains by region+zone + configuredFDs := make(map[failureDomainKey]string) + if infra.Spec.PlatformSpec.VSphere != nil { + for _, fd := range infra.Spec.PlatformSpec.VSphere.FailureDomains { + key := failureDomainKey{ + Region: fd.Region, + Zone: fd.Zone, + } + configuredFDs[key] = fd.Name + result.ConfiguredDomains = append(result.ConfiguredDomains, fd.Name) + } + } + + klog.V(4).Infof("Configured VSphere failure domains: %v", result.ConfiguredDomains) + + // List all machines + machineList := &machinev1.MachineList{} + if err := v.client.List(ctx, machineList); err != nil { + return nil, fmt.Errorf("failed to list machines: %w", err) + } + + klog.V(4).Infof("Checking %d machines for failure domain references", len(machineList.Items)) + + // Check each machine's failure domain reference via region/zone labels + for i := range machineList.Items { + machine := &machineList.Items[i] + + // Extract region and zone from machine labels + region := machine.Labels[machineRegionLabelKey] + zone := machine.Labels[machineZoneLabelKey] + + // Skip machines without region/zone labels (not using failure domains) + if region == "" && zone == "" { + klog.V(5).Infof("Skipping machine %s/%s: no region/zone labels", + machine.Namespace, machine.Name) + continue + } + + // If machine has partial labels, that's suspicious but we'll check it + machineKey := failureDomainKey{ + Region: region, + Zone: zone, + } + + // Check if this region+zone combination matches a configured failure domain + fdName, exists := configuredFDs[machineKey] + if !exists { + // Machine references a region+zone that doesn't match any configured failure domain + fdReference := fmt.Sprintf("region=%s, zone=%s", region, zone) + if region == "" { + fdReference = fmt.Sprintf("zone=%s (missing region)", zone) + } else if zone == "" { + fdReference = fmt.Sprintf("region=%s (missing zone)", region) + } + + klog.Warningf("Machine %s/%s references removed or invalid failure domain: %s", + machine.Namespace, machine.Name, fdReference) + result.Valid = false + result.OrphanedReferences = append(result.OrphanedReferences, OrphanedReference{ + FailureDomainName: fdReference, + MachineName: machine.Name, + MachineNamespace: machine.Namespace, + }) + } else { + klog.V(5).Infof("Machine %s/%s references valid failure domain %s (%s)", + machine.Namespace, machine.Name, fdName, machineKey) + } + } + + if !result.Valid { + klog.Warningf("Found %d machines with orphaned failure domain references", len(result.OrphanedReferences)) + } else { + klog.V(3).Infof("All machine failure domain references are valid") + } + + return result, nil +} + +// String returns a string representation of the failureDomainKey +func (k failureDomainKey) String() string { + return fmt.Sprintf("region=%s, zone=%s", k.Region, k.Zone) +} diff --git a/pkg/controller/infrastructurevalidation/vsphere_validator_test.go b/pkg/controller/infrastructurevalidation/vsphere_validator_test.go new file mode 100644 index 000000000..1bb773163 --- /dev/null +++ b/pkg/controller/infrastructurevalidation/vsphere_validator_test.go @@ -0,0 +1,353 @@ +package infrastructurevalidation + +import ( + "context" + "testing" + + configv1 "github.com/openshift/api/config/v1" + machinev1 "github.com/openshift/api/machine/v1beta1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/klog/v2" + "sigs.k8s.io/controller-runtime/pkg/client/fake" +) + +func init() { + if err := machinev1.Install(scheme.Scheme); err != nil { + klog.Fatal(err) + } + if err := configv1.Install(scheme.Scheme); err != nil { + klog.Fatal(err) + } +} + +const ( + testNamespace = "openshift-machine-api" +) + +// createVSphereMachine creates a test machine with region/zone labels +func createVSphereMachine(name, namespace, region, zone string) *machinev1.Machine { + labels := make(map[string]string) + if region != "" { + labels[machineRegionLabelKey] = region + } + if zone != "" { + labels[machineZoneLabelKey] = zone + } + + return &machinev1.Machine{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + Labels: labels, + }, + Spec: machinev1.MachineSpec{}, + } +} + +// createInfrastructure creates a test Infrastructure with VSphere failure domains +func createInfrastructure(failureDomains []failureDomainDef) *configv1.Infrastructure { + fds := []configv1.VSpherePlatformFailureDomainSpec{} + for _, fd := range failureDomains { + fds = append(fds, configv1.VSpherePlatformFailureDomainSpec{ + Name: fd.Name, + Region: fd.Region, + Zone: fd.Zone, + }) + } + + return &configv1.Infrastructure{ + ObjectMeta: metav1.ObjectMeta{ + Name: infrastructureName, + }, + Spec: configv1.InfrastructureSpec{ + PlatformSpec: configv1.PlatformSpec{ + Type: configv1.VSpherePlatformType, + VSphere: &configv1.VSpherePlatformSpec{ + FailureDomains: fds, + }, + }, + }, + Status: configv1.InfrastructureStatus{ + PlatformStatus: &configv1.PlatformStatus{ + Type: configv1.VSpherePlatformType, + }, + }, + } +} + +type failureDomainDef struct { + Name string + Region string + Zone string +} + +func TestVSphereValidatorAllReferencesValid(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + {Name: "fd-west", Region: "us-west", Zone: "us-west-1a"}, + }) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") + machine2 := createVSphereMachine("machine-2", testNamespace, "us-west", "us-west-1a") + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1, machine2). + Build() + + validator := &VSphereFailureDomainValidator{client: fakeClient} + result, err := validator.Validate(ctx, infra) + + if err != nil { + t.Fatalf("Validation failed: %v", err) + } + + if !result.Valid { + t.Errorf("Expected validation to pass, but got: %v", result.OrphanedReferences) + } + + if len(result.OrphanedReferences) != 0 { + t.Errorf("Expected no orphaned references, got %d", len(result.OrphanedReferences)) + } +} + +func TestVSphereValidatorOrphanedReferences(t *testing.T) { + ctx := context.Background() + + // Infrastructure only has fd-east + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") // valid + machine2 := createVSphereMachine("machine-2", testNamespace, "us-west", "us-west-1a") // orphaned + machine3 := createVSphereMachine("machine-3", testNamespace, "eu-west", "eu-west-1a") // orphaned + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1, machine2, machine3). + Build() + + validator := &VSphereFailureDomainValidator{client: fakeClient} + result, err := validator.Validate(ctx, infra) + + if err != nil { + t.Fatalf("Validation failed: %v", err) + } + + if result.Valid { + t.Error("Expected validation to fail, but it passed") + } + + if len(result.OrphanedReferences) != 2 { + t.Errorf("Expected 2 orphaned references, got %d", len(result.OrphanedReferences)) + } + + // Check that the right machines are orphaned + orphanedMachines := make(map[string]string) + for _, ref := range result.OrphanedReferences { + orphanedMachines[ref.MachineName] = ref.FailureDomainName + } + + if fd, ok := orphanedMachines["machine-2"]; !ok || fd != "region=us-west, zone=us-west-1a" { + t.Errorf("Expected machine-2 to be orphaned with region=us-west, zone=us-west-1a, got %v", orphanedMachines) + } + + if fd, ok := orphanedMachines["machine-3"]; !ok || fd != "region=eu-west, zone=eu-west-1a" { + t.Errorf("Expected machine-3 to be orphaned with region=eu-west, zone=eu-west-1a, got %v", orphanedMachines) + } +} + +func TestVSphereValidatorNoFailureDomains(t *testing.T) { + ctx := context.Background() + + // Infrastructure with no failure domains + infra := createInfrastructure([]failureDomainDef{}) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1). + Build() + + validator := &VSphereFailureDomainValidator{client: fakeClient} + result, err := validator.Validate(ctx, infra) + + if err != nil { + t.Fatalf("Validation failed: %v", err) + } + + if result.Valid { + t.Error("Expected validation to fail when machine references non-existent domain") + } + + if len(result.OrphanedReferences) != 1 { + t.Errorf("Expected 1 orphaned reference, got %d", len(result.OrphanedReferences)) + } +} + +func TestVSphereValidatorMachineWithoutRegionZone(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }) + // Machine without region/zone labels + machine1 := createVSphereMachine("machine-1", testNamespace, "", "") + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1). + Build() + + validator := &VSphereFailureDomainValidator{client: fakeClient} + result, err := validator.Validate(ctx, infra) + + if err != nil { + t.Fatalf("Validation failed: %v", err) + } + + if !result.Valid { + t.Error("Expected validation to pass for machine without region/zone labels") + } + + if len(result.OrphanedReferences) != 0 { + t.Errorf("Expected no orphaned references, got %d", len(result.OrphanedReferences)) + } +} + +func TestVSphereValidatorMachineWithPartialLabels(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }) + + tests := []struct { + name string + region string + zone string + expectOrphaned bool + expectedReference string + }{ + { + name: "Region only", + region: "us-east", + zone: "", + expectOrphaned: true, + expectedReference: "region=us-east (missing zone)", + }, + { + name: "Zone only", + region: "", + zone: "us-east-1a", + expectOrphaned: true, + expectedReference: "zone=us-east-1a (missing region)", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + machine := createVSphereMachine("test-machine", testNamespace, tt.region, tt.zone) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine). + Build() + + validator := &VSphereFailureDomainValidator{client: fakeClient} + result, err := validator.Validate(ctx, infra) + + if err != nil { + t.Fatalf("Validation failed: %v", err) + } + + if tt.expectOrphaned { + if result.Valid { + t.Error("Expected validation to fail for machine with partial labels") + } + if len(result.OrphanedReferences) != 1 { + t.Errorf("Expected 1 orphaned reference, got %d", len(result.OrphanedReferences)) + } + if len(result.OrphanedReferences) > 0 && result.OrphanedReferences[0].FailureDomainName != tt.expectedReference { + t.Errorf("Expected reference %q, got %q", tt.expectedReference, result.OrphanedReferences[0].FailureDomainName) + } + } + }) + } +} + +func TestVSphereValidatorNoMachines(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + {Name: "fd-west", Region: "us-west", Zone: "us-west-1a"}, + }) + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra). + Build() + + validator := &VSphereFailureDomainValidator{client: fakeClient} + result, err := validator.Validate(ctx, infra) + + if err != nil { + t.Fatalf("Validation failed: %v", err) + } + + if !result.Valid { + t.Error("Expected validation to pass when no machines exist") + } + + if len(result.OrphanedReferences) != 0 { + t.Errorf("Expected no orphaned references, got %d", len(result.OrphanedReferences)) + } +} + +func TestVSphereValidatorMultipleMachinesSameFailureDomain(t *testing.T) { + ctx := context.Background() + + infra := createInfrastructure([]failureDomainDef{ + {Name: "fd-east", Region: "us-east", Zone: "us-east-1a"}, + }) + machine1 := createVSphereMachine("machine-1", testNamespace, "us-east", "us-east-1a") + machine2 := createVSphereMachine("machine-2", testNamespace, "us-east", "us-east-1a") + machine3 := createVSphereMachine("machine-3", testNamespace, "us-east", "us-east-1a") + + fakeClient := fake.NewClientBuilder(). + WithScheme(scheme.Scheme). + WithObjects(infra, machine1, machine2, machine3). + Build() + + validator := &VSphereFailureDomainValidator{client: fakeClient} + result, err := validator.Validate(ctx, infra) + + if err != nil { + t.Fatalf("Validation failed: %v", err) + } + + if !result.Valid { + t.Error("Expected validation to pass for multiple machines in same failure domain") + } + + if len(result.OrphanedReferences) != 0 { + t.Errorf("Expected no orphaned references, got %d", len(result.OrphanedReferences)) + } +} + +func TestFailureDomainKeyString(t *testing.T) { + key := failureDomainKey{ + Region: "us-east", + Zone: "us-east-1a", + } + + expected := "region=us-east, zone=us-east-1a" + result := key.String() + + if result != expected { + t.Errorf("Expected %q, got %q", expected, result) + } +} diff --git a/pkg/operator/sync.go b/pkg/operator/sync.go index 986f86d91..2308d2e9b 100644 --- a/pkg/operator/sync.go +++ b/pkg/operator/sync.go @@ -25,6 +25,7 @@ import ( configv1 "github.com/openshift/api/config/v1" machinev1beta1 "github.com/openshift/api/machine/v1beta1" utiltls "github.com/openshift/controller-runtime-common/pkg/tls" + "github.com/openshift/library-go/pkg/config/clusteroperator/v1helpers" libgocrypto "github.com/openshift/library-go/pkg/crypto" "github.com/openshift/library-go/pkg/operator/events" "github.com/openshift/library-go/pkg/operator/resource/resourceapply" @@ -88,6 +89,18 @@ func (optr *Operator) syncAll(config *OperatorConfig) (reconcile.Result, error) } errors := []error{} + + // Check infrastructure validation for VSphere + if config.PlatformType == configv1.VSpherePlatformType { + co, err := optr.getClusterOperator() + if err == nil { + // Check if InfrastructureFailureDomainsValid condition is False + infraValidCondition := v1helpers.FindStatusCondition(co.Status.Conditions, "InfrastructureFailureDomainsValid") + if infraValidCondition != nil && infraValidCondition.Status == configv1.ConditionFalse { + errors = append(errors, fmt.Errorf("infrastructure validation failed: %s", infraValidCondition.Message)) + } + } + } // Sync webhook configuration if err := optr.syncWebhookConfiguration(config); err != nil { errors = append(errors, fmt.Errorf("error syncing machine API webhook configurations: %w", err))