diff --git a/internal/controller/installepinio_controller.go b/internal/controller/installepinio_controller.go index 9f751fa..ed73476 100644 --- a/internal/controller/installepinio_controller.go +++ b/internal/controller/installepinio_controller.go @@ -20,6 +20,7 @@ import ( "context" "errors" "fmt" + "sort" "strings" "time" @@ -262,10 +263,7 @@ func (r *InstallEpinioReconciler) Reconcile(ctx context.Context, req ctrl.Reques if jobTerminalFailed(job.Status) { failed := job.Status.Failed log.Info("Install Job reported failure", "job", jobName, "namespace", ctrlNs, "failed", failed) - failMsg := fmt.Sprintf("Install Job failed (%d failed)", failed) - if failed == 0 { - failMsg = "Install Job failed" - } + failMsg := r.jobFailureMessage(ctx, ctrlNs, jobName) setCondition(&inst, metav1.Condition{ Type: conditionDegraded, Status: metav1.ConditionTrue, @@ -412,6 +410,17 @@ func (r *InstallEpinioReconciler) reconcileDelete(ctx context.Context, inst *epi return ctrl.Result{}, err } + // Check if this is the last InstallEpinio in the namespace before removing the finalizer. + var remaining epiniov1alpha1.InstallEpinioList + if err := r.List(ctx, &remaining, client.InNamespace(ctrlNs)); err != nil { + log.Error(err, "failed to list remaining InstallEpinio resources") + return ctrl.Result{}, err + } + // The current resource is still present (finalizer not yet removed), so count > 1 means others exist. + lastCR := len(remaining.Items) <= 1 + + // Remove finalizer first so the CR can be fully deleted even if the + // subsequent namespace deletion terminates the operator pod. controllerutil.RemoveFinalizer(inst, installCleanupFinalizer) log.Info("Removing cleanup finalizer") if err := r.Update(ctx, inst); err != nil { @@ -419,6 +428,17 @@ func (r *InstallEpinioReconciler) reconcileDelete(ctx context.Context, inst *epi return ctrl.Result{}, err } + // Delete the operator namespace after the finalizer is gone so that + // namespace termination is not blocked by the CR. + if lastCR { + ns := &corev1.Namespace{} + ns.Name = ctrlNs + if err := r.deleteIfExists(ctx, ns, "Namespace"); err != nil { + return ctrl.Result{}, err + } + log.Info("Deleted operator namespace", "namespace", ctrlNs) + } + return ctrl.Result{}, nil } @@ -526,6 +546,59 @@ func installJobName(namespace, name string) string { return jobNamePrefix + namespace + "-" + name } +// jobFailureMessage inspects the pods belonging to a failed Job and extracts +// a human-readable error from init or main container termination details. +func (r *InstallEpinioReconciler) jobFailureMessage(ctx context.Context, namespace, jobName string) string { + fallback := fmt.Sprintf("Install Job %q failed", jobName) + + var podList corev1.PodList + if err := r.List(ctx, &podList, + client.InNamespace(namespace), + client.MatchingLabels{"job-name": jobName}, + ); err != nil || len(podList.Items) == 0 { + return fallback + } + + pods := append([]corev1.Pod(nil), podList.Items...) + sort.Slice(pods, func(i, j int) bool { + return pods[i].CreationTimestamp.After(pods[j].CreationTimestamp.Time) + }) + + for i := range pods { + if msg := installFailureMessageFromPod(&pods[i]); msg != "" { + return msg + } + } + + return fallback +} + +func installFailureMessageFromPod(pod *corev1.Pod) string { + if msg := installFailureMessageFromContainerStatuses(pod.Status.InitContainerStatuses); msg != "" { + return msg + } + return installFailureMessageFromContainerStatuses(pod.Status.ContainerStatuses) +} + +func installFailureMessageFromContainerStatuses(statuses []corev1.ContainerStatus) string { + for _, cs := range statuses { + if cs.State.Terminated == nil { + continue + } + term := cs.State.Terminated + if term.Message != "" { + return fmt.Sprintf("Install failed: %s", term.Message) + } + if term.Reason != "" { + return fmt.Sprintf("Install failed: %s (exit code %d)", term.Reason, term.ExitCode) + } + if term.ExitCode != 0 { + return fmt.Sprintf("Install failed with exit code %d", term.ExitCode) + } + } + return "" +} + func jobTerminalFailed(status batchv1.JobStatus) bool { for i := range status.Conditions { if status.Conditions[i].Type == batchv1.JobFailed && status.Conditions[i].Status == corev1.ConditionTrue { diff --git a/internal/controller/installepinio_controller_test.go b/internal/controller/installepinio_controller_test.go index ec9d913..a1d185c 100644 --- a/internal/controller/installepinio_controller_test.go +++ b/internal/controller/installepinio_controller_test.go @@ -18,6 +18,7 @@ package controller import ( "context" + "fmt" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -33,28 +34,32 @@ import ( epiniov1alpha1 "github.com/epinio/install-operator/api/v1alpha1" ) +var testNsCounter int + var _ = Describe("InstallEpinio Controller", func() { Context("When reconciling a resource", func() { const resourceName = "test-resource" ctx := context.Background() - typeNamespacedName := types.NamespacedName{ - Name: resourceName, - Namespace: "system", - } + var testNs string + var typeNamespacedName types.NamespacedName installepinio := &epiniov1alpha1.InstallEpinio{} var reconciler *InstallEpinioReconciler BeforeEach(func() { - Expect(k8sClient.Create(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "system"}})).To(SatisfyAny( - Succeed(), - WithTransform(errors.IsAlreadyExists, BeTrue()), - )) + testNsCounter++ + testNs = fmt.Sprintf("test-system-%d", testNsCounter) + typeNamespacedName = types.NamespacedName{ + Name: resourceName, + Namespace: testNs, + } + + Expect(k8sClient.Create(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testNs}})).To(Succeed()) reconciler = &InstallEpinioReconciler{ Client: k8sClient, Scheme: k8sClient.Scheme(), - ControllerNamespace: "system", + ControllerNamespace: testNs, InstallerServiceAccount: "installer-sa", InstallerHelmImage: "example.com/helm:test", } @@ -65,7 +70,7 @@ var _ = Describe("InstallEpinio Controller", func() { resource := &epiniov1alpha1.InstallEpinio{ ObjectMeta: metav1.ObjectMeta{ Name: resourceName, - Namespace: "system", + Namespace: testNs, }, Spec: epiniov1alpha1.InstallEpinioSpec{ Domain: "demo.example.test", @@ -110,7 +115,7 @@ var _ = Describe("InstallEpinio Controller", func() { Expect(err).NotTo(HaveOccurred()) job := &batchv1.Job{} - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "epinio-installer-system-test-resource", Namespace: "system"}, job)).To(Succeed()) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("epinio-installer-%s-test-resource", testNs), Namespace: testNs}, job)).To(Succeed()) Expect(job.Spec.Template.Spec.Containers).To(HaveLen(1)) Expect(job.Spec.Template.Spec.Containers[0].Env).To(ContainElement(corev1.EnvVar{Name: "EPINIO_VERSION", Value: "1.0.0"})) Expect(job.Spec.Template.Spec.ServiceAccountName).To(Equal("installer-sa")) @@ -119,7 +124,7 @@ var _ = Describe("InstallEpinio Controller", func() { Expect(job.OwnerReferences[0].Name).To(Equal(resourceName)) configMap := &corev1.ConfigMap{} - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "epinio-install-script-system-test-resource", Namespace: "system"}, configMap)).To(Succeed()) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("epinio-install-script-%s-test-resource", testNs), Namespace: testNs}, configMap)).To(Succeed()) Expect(configMap.OwnerReferences).To(HaveLen(1)) Expect(configMap.OwnerReferences[0].Name).To(Equal(resourceName)) @@ -136,7 +141,7 @@ var _ = Describe("InstallEpinio Controller", func() { Expect(err).NotTo(HaveOccurred()) job := &batchv1.Job{} - Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "epinio-installer-system-test-resource", Namespace: "system"}, job)).To(Succeed()) + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("epinio-installer-%s-test-resource", testNs), Namespace: testNs}, job)).To(Succeed()) job.Status.Succeeded = 1 Expect(k8sClient.Status().Update(ctx, job)).To(Succeed()) diff --git a/internal/controller/installepinio_controller_unit_test.go b/internal/controller/installepinio_controller_unit_test.go index cb5771c..cdb2e34 100644 --- a/internal/controller/installepinio_controller_unit_test.go +++ b/internal/controller/installepinio_controller_unit_test.go @@ -19,6 +19,83 @@ import ( epiniov1alpha1 "github.com/epinio/install-operator/api/v1alpha1" ) +func TestInstallFailureMessageFromContainerStatuses(t *testing.T) { + t.Parallel() + + if got := installFailureMessageFromContainerStatuses(nil); got != "" { + t.Fatalf("empty statuses: got %q", got) + } + msg := installFailureMessageFromContainerStatuses([]corev1.ContainerStatus{ + {Name: "a", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "helm oops"}}}, + }) + if msg != "Install failed: helm oops" { + t.Fatalf("message: got %q", msg) + } +} + +func TestInstallFailureMessageFromPodInitBeforeMain(t *testing.T) { + t.Parallel() + + pod := &corev1.Pod{ + Status: corev1.PodStatus{ + InitContainerStatuses: []corev1.ContainerStatus{ + {Name: "init", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "init pull failed"}}}, + }, + ContainerStatuses: []corev1.ContainerStatus{ + {Name: "main", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "main never ran"}}}, + }, + }, + } + if got := installFailureMessageFromPod(pod); got != "Install failed: init pull failed" { + t.Fatalf("expected init message first, got %q", got) + } +} + +func TestJobFailureMessagePrefersNewestPod(t *testing.T) { + t.Parallel() + + ctx := context.Background() + scheme := runtime.NewScheme() + if err := clientgoscheme.AddToScheme(scheme); err != nil { + t.Fatalf("scheme: %v", err) + } + + oldTime := metav1.NewTime(time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC)) + newTime := metav1.NewTime(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC)) + + objs := []client.Object{ + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "old", Namespace: "system", Labels: map[string]string{"job-name": "thejob"}, + CreationTimestamp: oldTime, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + {State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "from older pod"}}}, + }, + }, + }, + &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "new", Namespace: "system", Labels: map[string]string{"job-name": "thejob"}, + CreationTimestamp: newTime, + }, + Status: corev1.PodStatus{ + ContainerStatuses: []corev1.ContainerStatus{ + {State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "from newer pod"}}}, + }, + }, + }, + } + + r := &InstallEpinioReconciler{ + Client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build(), + } + if got := r.jobFailureMessage(ctx, "system", "thejob"); got != "Install failed: from newer pod" { + t.Fatalf("jobFailureMessage: got %q", got) + } +} + func TestJobTerminalHelpers(t *testing.T) { t.Parallel()