Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 77 additions & 4 deletions internal/controller/installepinio_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"context"
"errors"
"fmt"
"sort"
"strings"
"time"

Expand Down Expand Up @@ -262,10 +263,7 @@ func (r *InstallEpinioReconciler) Reconcile(ctx context.Context, req ctrl.Reques
if jobTerminalFailed(job.Status) {
failed := job.Status.Failed
log.Info("Install Job reported failure", "job", jobName, "namespace", ctrlNs, "failed", failed)
failMsg := fmt.Sprintf("Install Job failed (%d failed)", failed)
if failed == 0 {
failMsg = "Install Job failed"
}
failMsg := r.jobFailureMessage(ctx, ctrlNs, jobName)
setCondition(&inst, metav1.Condition{
Type: conditionDegraded,
Status: metav1.ConditionTrue,
Expand Down Expand Up @@ -412,13 +410,35 @@ func (r *InstallEpinioReconciler) reconcileDelete(ctx context.Context, inst *epi
return ctrl.Result{}, err
}

// Check if this is the last InstallEpinio in the namespace before removing the finalizer.
var remaining epiniov1alpha1.InstallEpinioList
if err := r.List(ctx, &remaining, client.InNamespace(ctrlNs)); err != nil {
log.Error(err, "failed to list remaining InstallEpinio resources")
return ctrl.Result{}, err
}
// The current resource is still present (finalizer not yet removed), so count > 1 means others exist.
lastCR := len(remaining.Items) <= 1

// Remove finalizer first so the CR can be fully deleted even if the
// subsequent namespace deletion terminates the operator pod.
controllerutil.RemoveFinalizer(inst, installCleanupFinalizer)
log.Info("Removing cleanup finalizer")
if err := r.Update(ctx, inst); err != nil {
log.Error(err, "failed to remove cleanup finalizer")
return ctrl.Result{}, err
}

// Delete the operator namespace after the finalizer is gone so that
// namespace termination is not blocked by the CR.
if lastCR {
ns := &corev1.Namespace{}
ns.Name = ctrlNs
if err := r.deleteIfExists(ctx, ns, "Namespace"); err != nil {
return ctrl.Result{}, err
}
log.Info("Deleted operator namespace", "namespace", ctrlNs)
}

return ctrl.Result{}, nil
}

Expand Down Expand Up @@ -526,6 +546,59 @@ func installJobName(namespace, name string) string {
return jobNamePrefix + namespace + "-" + name
}

// jobFailureMessage inspects the pods belonging to a failed Job and extracts
// a human-readable error from init or main container termination details.
func (r *InstallEpinioReconciler) jobFailureMessage(ctx context.Context, namespace, jobName string) string {
fallback := fmt.Sprintf("Install Job %q failed", jobName)

var podList corev1.PodList
if err := r.List(ctx, &podList,
client.InNamespace(namespace),
client.MatchingLabels{"job-name": jobName},
); err != nil || len(podList.Items) == 0 {
return fallback
}

pods := append([]corev1.Pod(nil), podList.Items...)
sort.Slice(pods, func(i, j int) bool {
return pods[i].CreationTimestamp.After(pods[j].CreationTimestamp.Time)
})

for i := range pods {
if msg := installFailureMessageFromPod(&pods[i]); msg != "" {
return msg
}
}

return fallback
}

func installFailureMessageFromPod(pod *corev1.Pod) string {
if msg := installFailureMessageFromContainerStatuses(pod.Status.InitContainerStatuses); msg != "" {
return msg
}
return installFailureMessageFromContainerStatuses(pod.Status.ContainerStatuses)
}

func installFailureMessageFromContainerStatuses(statuses []corev1.ContainerStatus) string {
for _, cs := range statuses {
if cs.State.Terminated == nil {
continue
}
term := cs.State.Terminated
if term.Message != "" {
return fmt.Sprintf("Install failed: %s", term.Message)
}
if term.Reason != "" {
return fmt.Sprintf("Install failed: %s (exit code %d)", term.Reason, term.ExitCode)
}
if term.ExitCode != 0 {
return fmt.Sprintf("Install failed with exit code %d", term.ExitCode)
}
}
return ""
}

func jobTerminalFailed(status batchv1.JobStatus) bool {
for i := range status.Conditions {
if status.Conditions[i].Type == batchv1.JobFailed && status.Conditions[i].Status == corev1.ConditionTrue {
Expand Down
31 changes: 18 additions & 13 deletions internal/controller/installepinio_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package controller

import (
"context"
"fmt"

. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
Expand All @@ -33,28 +34,32 @@ import (
epiniov1alpha1 "github.com/epinio/install-operator/api/v1alpha1"
)

var testNsCounter int

var _ = Describe("InstallEpinio Controller", func() {
Context("When reconciling a resource", func() {
const resourceName = "test-resource"

ctx := context.Background()

typeNamespacedName := types.NamespacedName{
Name: resourceName,
Namespace: "system",
}
var testNs string
var typeNamespacedName types.NamespacedName
installepinio := &epiniov1alpha1.InstallEpinio{}
var reconciler *InstallEpinioReconciler

BeforeEach(func() {
Expect(k8sClient.Create(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "system"}})).To(SatisfyAny(
Succeed(),
WithTransform(errors.IsAlreadyExists, BeTrue()),
))
testNsCounter++
testNs = fmt.Sprintf("test-system-%d", testNsCounter)
typeNamespacedName = types.NamespacedName{
Name: resourceName,
Namespace: testNs,
}

Expect(k8sClient.Create(ctx, &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: testNs}})).To(Succeed())
reconciler = &InstallEpinioReconciler{
Client: k8sClient,
Scheme: k8sClient.Scheme(),
ControllerNamespace: "system",
ControllerNamespace: testNs,
InstallerServiceAccount: "installer-sa",
InstallerHelmImage: "example.com/helm:test",
}
Expand All @@ -65,7 +70,7 @@ var _ = Describe("InstallEpinio Controller", func() {
resource := &epiniov1alpha1.InstallEpinio{
ObjectMeta: metav1.ObjectMeta{
Name: resourceName,
Namespace: "system",
Namespace: testNs,
},
Spec: epiniov1alpha1.InstallEpinioSpec{
Domain: "demo.example.test",
Expand Down Expand Up @@ -110,7 +115,7 @@ var _ = Describe("InstallEpinio Controller", func() {
Expect(err).NotTo(HaveOccurred())

job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "epinio-installer-system-test-resource", Namespace: "system"}, job)).To(Succeed())
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("epinio-installer-%s-test-resource", testNs), Namespace: testNs}, job)).To(Succeed())
Expect(job.Spec.Template.Spec.Containers).To(HaveLen(1))
Expect(job.Spec.Template.Spec.Containers[0].Env).To(ContainElement(corev1.EnvVar{Name: "EPINIO_VERSION", Value: "1.0.0"}))
Expect(job.Spec.Template.Spec.ServiceAccountName).To(Equal("installer-sa"))
Expand All @@ -119,7 +124,7 @@ var _ = Describe("InstallEpinio Controller", func() {
Expect(job.OwnerReferences[0].Name).To(Equal(resourceName))

configMap := &corev1.ConfigMap{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "epinio-install-script-system-test-resource", Namespace: "system"}, configMap)).To(Succeed())
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("epinio-install-script-%s-test-resource", testNs), Namespace: testNs}, configMap)).To(Succeed())
Expect(configMap.OwnerReferences).To(HaveLen(1))
Expect(configMap.OwnerReferences[0].Name).To(Equal(resourceName))

Expand All @@ -136,7 +141,7 @@ var _ = Describe("InstallEpinio Controller", func() {
Expect(err).NotTo(HaveOccurred())

job := &batchv1.Job{}
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "epinio-installer-system-test-resource", Namespace: "system"}, job)).To(Succeed())
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: fmt.Sprintf("epinio-installer-%s-test-resource", testNs), Namespace: testNs}, job)).To(Succeed())
job.Status.Succeeded = 1
Expect(k8sClient.Status().Update(ctx, job)).To(Succeed())

Expand Down
77 changes: 77 additions & 0 deletions internal/controller/installepinio_controller_unit_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,83 @@ import (
epiniov1alpha1 "github.com/epinio/install-operator/api/v1alpha1"
)

func TestInstallFailureMessageFromContainerStatuses(t *testing.T) {
t.Parallel()

if got := installFailureMessageFromContainerStatuses(nil); got != "" {
t.Fatalf("empty statuses: got %q", got)
}
msg := installFailureMessageFromContainerStatuses([]corev1.ContainerStatus{
{Name: "a", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "helm oops"}}},
})
if msg != "Install failed: helm oops" {
t.Fatalf("message: got %q", msg)
}
}

func TestInstallFailureMessageFromPodInitBeforeMain(t *testing.T) {
t.Parallel()

pod := &corev1.Pod{
Status: corev1.PodStatus{
InitContainerStatuses: []corev1.ContainerStatus{
{Name: "init", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "init pull failed"}}},
},
ContainerStatuses: []corev1.ContainerStatus{
{Name: "main", State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "main never ran"}}},
},
},
}
if got := installFailureMessageFromPod(pod); got != "Install failed: init pull failed" {
t.Fatalf("expected init message first, got %q", got)
}
}

func TestJobFailureMessagePrefersNewestPod(t *testing.T) {
t.Parallel()

ctx := context.Background()
scheme := runtime.NewScheme()
if err := clientgoscheme.AddToScheme(scheme); err != nil {
t.Fatalf("scheme: %v", err)
}

oldTime := metav1.NewTime(time.Date(2020, 1, 1, 0, 0, 0, 0, time.UTC))
newTime := metav1.NewTime(time.Date(2021, 1, 1, 0, 0, 0, 0, time.UTC))

objs := []client.Object{
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "old", Namespace: "system", Labels: map[string]string{"job-name": "thejob"},
CreationTimestamp: oldTime,
},
Status: corev1.PodStatus{
ContainerStatuses: []corev1.ContainerStatus{
{State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "from older pod"}}},
},
},
},
&corev1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "new", Namespace: "system", Labels: map[string]string{"job-name": "thejob"},
CreationTimestamp: newTime,
},
Status: corev1.PodStatus{
ContainerStatuses: []corev1.ContainerStatus{
{State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{Message: "from newer pod"}}},
},
},
},
}

r := &InstallEpinioReconciler{
Client: fake.NewClientBuilder().WithScheme(scheme).WithObjects(objs...).Build(),
}
if got := r.jobFailureMessage(ctx, "system", "thejob"); got != "Install failed: from newer pod" {
t.Fatalf("jobFailureMessage: got %q", got)
}
}

func TestJobTerminalHelpers(t *testing.T) {
t.Parallel()

Expand Down
Loading