From 32f388fdb6963cf8f8a867a3a763c41a8dcf686c Mon Sep 17 00:00:00 2001 From: Mate Saary Date: Fri, 19 Jun 2026 02:00:40 +0100 Subject: [PATCH] consoleerrorbudgetburn: add probe history and LB security group checks --- pkg/aws/aws.go | 58 +- pkg/aws/mock/aws.go | 52 +- .../consoleerrorbudgetburn.go | 238 +++++++- .../consoleerrorbudgetburn_test.go | 539 +++++++++++++++++- pkg/ocm/mock/ocmmock.go | 6 +- 5 files changed, 863 insertions(+), 30 deletions(-) diff --git a/pkg/aws/aws.go b/pkg/aws/aws.go index e99b3249..e7f32da9 100644 --- a/pkg/aws/aws.go +++ b/pkg/aws/aws.go @@ -103,10 +103,13 @@ type Client interface { FindHostedZone(ctx context.Context, dnsName string, private bool) (string, error) HasResourceRecordSet(ctx context.Context, hostedZoneID, recordName, recordType string) (bool, error) GetVpcDhcpConfiguration(ctx context.Context, infraID string) ([]string, error) - FindNLBByDNSName(ctx context.Context, dnsName string) (arn string, name string, err error) + FindNLBByDNSName(ctx context.Context, dnsName string) (arn string, name string, securityGroups []string, err error) FindCLBByDNSName(ctx context.Context, dnsName string) (name string, err error) GetNLBTargetHealth(ctx context.Context, lbARN string) ([]NLBTargetHealth, error) GetCLBInstanceHealth(ctx context.Context, lbName string) ([]CLBInstanceHealth, error) + GetCLBSecurityGroupIDs(ctx context.Context, lbName string) ([]string, error) + GetSecurityGroupRules(ctx context.Context, sgIDs []string) ([]ec2v2types.SecurityGroup, error) + GetInstanceSecurityGroupIDs(ctx context.Context, instanceIDs []string) (map[string][]string, error) } type SdkClient struct { @@ -728,18 +731,18 @@ func (c *SdkClient) GetVpcDhcpConfiguration(ctx context.Context, infraID string) return servers, nil } -func (c *SdkClient) FindNLBByDNSName(ctx context.Context, dnsName string) (string, string, error) { +func (c *SdkClient) FindNLBByDNSName(ctx context.Context, dnsName string) (string, string, []string, error) { var marker *string for { out, err := c.Elbv2Client.DescribeLoadBalancers(ctx, &elbv2.DescribeLoadBalancersInput{ Marker: marker, }) if err != nil { - return "", "", fmt.Errorf("failed to describe NLBs: %w", err) + return "", "", nil, fmt.Errorf("failed to describe NLBs: %w", err) } for _, lb := range out.LoadBalancers { if lb.Type == elbv2types.LoadBalancerTypeEnumNetwork && awsv2.ToString(lb.DNSName) == dnsName { - return awsv2.ToString(lb.LoadBalancerArn), awsv2.ToString(lb.LoadBalancerName), nil + return awsv2.ToString(lb.LoadBalancerArn), awsv2.ToString(lb.LoadBalancerName), lb.SecurityGroups, nil } } if out.NextMarker == nil { @@ -747,7 +750,7 @@ func (c *SdkClient) FindNLBByDNSName(ctx context.Context, dnsName string) (strin } marker = out.NextMarker } - return "", "", nil + return "", "", nil, nil } func (c *SdkClient) FindCLBByDNSName(ctx context.Context, dnsName string) (string, error) { @@ -829,6 +832,51 @@ func (c *SdkClient) GetCLBInstanceHealth(ctx context.Context, lbName string) ([] return results, nil } +func (c *SdkClient) GetCLBSecurityGroupIDs(ctx context.Context, lbName string) ([]string, error) { + out, err := c.ElbClient.DescribeLoadBalancers(ctx, &elbv1.DescribeLoadBalancersInput{ + LoadBalancerNames: []string{lbName}, + }) + if err != nil { + return nil, fmt.Errorf("failed to describe CLB %s: %w", lbName, err) + } + if len(out.LoadBalancerDescriptions) == 0 { + return nil, fmt.Errorf("CLB %s not found", lbName) + } + return out.LoadBalancerDescriptions[0].SecurityGroups, nil +} + +// GetSecurityGroupRules returns full security group objects including their inbound rules +func (c *SdkClient) GetSecurityGroupRules(ctx context.Context, sgIDs []string) ([]ec2v2types.SecurityGroup, error) { + out, err := c.Ec2Client.DescribeSecurityGroups(ctx, &ec2v2.DescribeSecurityGroupsInput{ + GroupIds: sgIDs, + }) + if err != nil { + return nil, fmt.Errorf("failed to describe security groups: %w", err) + } + return out.SecurityGroups, nil +} + +func (c *SdkClient) GetInstanceSecurityGroupIDs(ctx context.Context, instanceIDs []string) (map[string][]string, error) { + out, err := c.Ec2Client.DescribeInstances(ctx, &ec2v2.DescribeInstancesInput{ + InstanceIds: instanceIDs, + }) + if err != nil { + return nil, fmt.Errorf("failed to describe instances: %w", err) + } + result := make(map[string][]string) + for _, res := range out.Reservations { + for _, inst := range res.Instances { + id := awsv2.ToString(inst.InstanceId) + sgs := make([]string, 0, len(inst.SecurityGroups)) + for _, sg := range inst.SecurityGroups { + sgs = append(sgs, awsv2.ToString(sg.GroupId)) + } + result[id] = sgs + } + } + return result, nil +} + func populateStopTime(instances []ec2v2types.Instance) (map[string]time.Time, error) { idToStopTime := make(map[string]time.Time) for _, instance := range instances { diff --git a/pkg/aws/mock/aws.go b/pkg/aws/mock/aws.go index 2398ede6..ddbf0e30 100644 --- a/pkg/aws/mock/aws.go +++ b/pkg/aws/mock/aws.go @@ -570,13 +570,14 @@ func (mr *MockClientMockRecorder) FindHostedZone(ctx, dnsName, private any) *gom } // FindNLBByDNSName mocks base method. -func (m *MockClient) FindNLBByDNSName(ctx context.Context, dnsName string) (string, string, error) { +func (m *MockClient) FindNLBByDNSName(ctx context.Context, dnsName string) (string, string, []string, error) { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "FindNLBByDNSName", ctx, dnsName) ret0, _ := ret[0].(string) ret1, _ := ret[1].(string) - ret2, _ := ret[2].(error) - return ret0, ret1, ret2 + ret2, _ := ret[2].([]string) + ret3, _ := ret[3].(error) + return ret0, ret1, ret2, ret3 } // FindNLBByDNSName indicates an expected call of FindNLBByDNSName. @@ -614,6 +615,36 @@ func (mr *MockClientMockRecorder) GetCLBInstanceHealth(ctx, lbName any) *gomock. return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetCLBInstanceHealth", reflect.TypeOf((*MockClient)(nil).GetCLBInstanceHealth), ctx, lbName) } +// GetCLBSecurityGroupIDs mocks base method. +func (m *MockClient) GetCLBSecurityGroupIDs(ctx context.Context, lbName string) ([]string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetCLBSecurityGroupIDs", ctx, lbName) + ret0, _ := ret[0].([]string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetCLBSecurityGroupIDs indicates an expected call of GetCLBSecurityGroupIDs. +func (mr *MockClientMockRecorder) GetCLBSecurityGroupIDs(ctx, lbName any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetCLBSecurityGroupIDs", reflect.TypeOf((*MockClient)(nil).GetCLBSecurityGroupIDs), ctx, lbName) +} + +// GetInstanceSecurityGroupIDs mocks base method. +func (m *MockClient) GetInstanceSecurityGroupIDs(ctx context.Context, instanceIDs []string) (map[string][]string, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetInstanceSecurityGroupIDs", ctx, instanceIDs) + ret0, _ := ret[0].(map[string][]string) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetInstanceSecurityGroupIDs indicates an expected call of GetInstanceSecurityGroupIDs. +func (mr *MockClientMockRecorder) GetInstanceSecurityGroupIDs(ctx, instanceIDs any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetInstanceSecurityGroupIDs", reflect.TypeOf((*MockClient)(nil).GetInstanceSecurityGroupIDs), ctx, instanceIDs) +} + // GetNLBTargetHealth mocks base method. func (m *MockClient) GetNLBTargetHealth(ctx context.Context, lbARN string) ([]aws0.NLBTargetHealth, error) { m.ctrl.T.Helper() @@ -659,6 +690,21 @@ func (mr *MockClientMockRecorder) GetSecurityGroupID(infraID any) *gomock.Call { return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetSecurityGroupID", reflect.TypeOf((*MockClient)(nil).GetSecurityGroupID), infraID) } +// GetSecurityGroupRules mocks base method. +func (m *MockClient) GetSecurityGroupRules(ctx context.Context, sgIDs []string) ([]types0.SecurityGroup, error) { + m.ctrl.T.Helper() + ret := m.ctrl.Call(m, "GetSecurityGroupRules", ctx, sgIDs) + ret0, _ := ret[0].([]types0.SecurityGroup) + ret1, _ := ret[1].(error) + return ret0, ret1 +} + +// GetSecurityGroupRules indicates an expected call of GetSecurityGroupRules. +func (mr *MockClientMockRecorder) GetSecurityGroupRules(ctx, sgIDs any) *gomock.Call { + mr.mock.ctrl.T.Helper() + return mr.mock.ctrl.RecordCallWithMethodType(mr.mock, "GetSecurityGroupRules", reflect.TypeOf((*MockClient)(nil).GetSecurityGroupRules), ctx, sgIDs) +} + // GetSubnetID mocks base method. func (m *MockClient) GetSubnetID(infraID string) ([]string, error) { m.ctrl.T.Helper() diff --git a/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn.go b/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn.go index 8e93ea75..b4a3467e 100644 --- a/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn.go +++ b/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn.go @@ -8,11 +8,13 @@ import ( "fmt" "math" "net" + "net/url" "regexp" "strconv" "strings" "time" + ec2v2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" operatorv1 "github.com/openshift/api/operator/v1" "github.com/openshift/configuration-anomaly-detection/pkg/aws" @@ -42,6 +44,10 @@ type blackboxProber interface { runProbe(ctx context.Context, k8sClient k8sclient.Client, restConfig *rest.Config, consoleURL string) (string, error) } +type probeHistoryChecker interface { + queryProbeHistory(ctx context.Context, k8sClient k8sclient.Client, restConfig *rest.Config, consoleURL string) (string, error) +} + type egressVerifier interface { run(r *investigation.Resources) (networkverifier.VerifierResult, string, error) } @@ -53,9 +59,10 @@ func (d *defaultEgressVerifier) run(r *investigation.Resources) (networkverifier } type Investigation struct { - consoleChecker consoleServiceChecker - blackboxProber blackboxProber - egressVerifier egressVerifier + consoleChecker consoleServiceChecker + blackboxProber blackboxProber + probeHistoryCheck probeHistoryChecker + egressVerifier egressVerifier } func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.InvestigationResult, error) { @@ -65,6 +72,9 @@ func (i *Investigation) Run(rb investigation.ResourceBuilder) (investigation.Inv if i.blackboxProber == nil { i.blackboxProber = &defaultBlackboxProber{} } + if i.probeHistoryCheck == nil { + i.probeHistoryCheck = &defaultProbeHistoryChecker{} + } if i.egressVerifier == nil { i.egressVerifier = &defaultEgressVerifier{} } @@ -243,6 +253,53 @@ func cidrContains(allowedCIDR string, machineIP net.IP, machineNet *net.IPNet) b return allowedOnes <= machineOnes && allowedNet.Contains(machineIP) } +// sgAllowsTCPInbound checks whether any of the given SGs allow TCP +// inbound traffic on [fromPort, toPort] +func sgAllowsTCPInbound(securityGroups []ec2v2types.SecurityGroup, fromPort, toPort int32, machineCIDR string) bool { + var machineIP net.IP + var machineNet *net.IPNet + if machineCIDR != "" { + machineIP, machineNet, _ = net.ParseCIDR(machineCIDR) + } + + for _, sg := range securityGroups { + for _, perm := range sg.IpPermissions { + proto := "" + if perm.IpProtocol != nil { + proto = *perm.IpProtocol + } + if proto != "tcp" && proto != "-1" { + continue + } + // "-1" means all traffic (all ports). + if proto != "-1" { + if perm.FromPort == nil || perm.ToPort == nil { + continue + } + if *perm.FromPort > fromPort || *perm.ToPort < toPort { + continue + } + } + if machineCIDR == "" { + return true + } + for _, ipRange := range perm.IpRanges { + cidr := "" + if ipRange.CidrIp != nil { + cidr = *ipRange.CidrIp + } + if cidr == "0.0.0.0/0" { + return true + } + if machineNet != nil && cidrContains(cidr, machineIP, machineNet) { + return true + } + } + } + } + return false +} + // newAllowedSourceRangesSL returns the service log for an allowedSourceRanges misconfiguration. // Content matches the managed-notifications AllowedSourceRanges.json template. func newAllowedSourceRangesSL(machineCIDR string) *ocm.ServiceLog { @@ -445,7 +502,7 @@ func (i *Investigation) checkLoadBalancerHealth(ctx context.Context, r *investig // checkNLBHealth finds an NLB by DNS name and reports target health. func (i *Investigation) checkNLBHealth(ctx context.Context, r *investigation.Resources, hostname string, notes *notewriter.NoteWriter) { - arn, name, err := r.AwsClient.FindNLBByDNSName(ctx, hostname) + arn, name, nlbSecurityGroups, err := r.AwsClient.FindNLBByDNSName(ctx, hostname) if err != nil { notes.AppendWarning("LB Health: failed to look up NLB — %v", err) return @@ -466,6 +523,50 @@ func (i *Investigation) checkNLBHealth(ctx context.Context, r *investigation.Res } reportNLBTargetHealth(name, targets, notes) + + if len(nlbSecurityGroups) > 0 { + notes.AppendWarning("LB Security: NLB %s has security groups attached (unusual): %v", name, nlbSecurityGroups) + } + + // Infra node SG check; verify nodePort traffic is allowed. + instanceIDs := make([]string, 0, len(targets)) + for _, t := range targets { + if t.TargetID != "" { + instanceIDs = append(instanceIDs, t.TargetID) + } + } + if len(instanceIDs) == 0 { + return + } + instanceSGs, err := r.AwsClient.GetInstanceSecurityGroupIDs(ctx, instanceIDs) + if err != nil { + notes.AppendWarning("LB Security: failed to get infra node security groups — %v", err) + return + } + sgSet := make(map[string]struct{}) + for _, sgs := range instanceSGs { + for _, sg := range sgs { + sgSet[sg] = struct{}{} + } + } + sgIDs := make([]string, 0, len(sgSet)) + for sg := range sgSet { + sgIDs = append(sgIDs, sg) + } + if len(sgIDs) == 0 { + notes.AppendWarning("LB Security: no security groups found on NLB target instances") + return + } + sgs, err := r.AwsClient.GetSecurityGroupRules(ctx, sgIDs) + if err != nil { + notes.AppendWarning("LB Security: failed to describe infra node security groups — %v", err) + return + } + if sgAllowsTCPInbound(sgs, 30000, 32767, "") { + notes.AppendSuccess("LB Security: infra node security groups allow NodePort traffic (30000-32767)") + } else { + notes.AppendWarning("LB Security: infra node security groups do not allow TCP traffic on NodePort range (30000-32767)") + } } // reportNLBTargetHealth formats and appends NLB target health to the notewriter. @@ -517,6 +618,28 @@ func (i *Investigation) checkCLBHealth(ctx context.Context, r *investigation.Res } reportCLBInstanceHealth(name, instances, notes) + + // Security group check: verify TCP 443 inbound is allowed. + sgIDs, err := r.AwsClient.GetCLBSecurityGroupIDs(ctx, name) + if err != nil { + notes.AppendWarning("LB Security: failed to get CLB security groups — %v", err) + return + } + if len(sgIDs) == 0 { + notes.AppendWarning("LB Security: CLB %s has no security groups attached", name) + return + } + sgs, err := r.AwsClient.GetSecurityGroupRules(ctx, sgIDs) + if err != nil { + notes.AppendWarning("LB Security: failed to describe CLB security groups — %v", err) + return + } + machineCIDR := r.Cluster.Network().MachineCIDR() + if sgAllowsTCPInbound(sgs, 443, 443, machineCIDR) { + notes.AppendSuccess("LB Security: CLB %s security group allows TCP 443 inbound", name) + } else { + notes.AppendWarning("LB Security: CLB %s security group does not allow TCP 443 inbound from 0.0.0.0/0 or machine CIDR %s", name, machineCIDR) + } } // reportCLBInstanceHealth formats and appends CLB instance health to the notewriter. @@ -1118,6 +1241,113 @@ func (i *Investigation) checkBlackboxProbe(ctx context.Context, r *investigation } else { notes.AppendWarning("Blackbox Probe: probe FAILED for %s — failure mode: %s — %s", consoleURL, pr.failureMode, pr.failureDetail) } + + // Query Prometheus for historical probe_success data. + // Applies to: Classic only — on HCP, probe_success is forwarded to RHOBS + // and is not available in the in-cluster Prometheus. + if !r.IsHCP { + i.checkProbeHistory(ctx, r, consoleURL, notes) + } else { + notes.AppendSuccess("Probe History: skipped on HCP — probe_success is forwarded to RHOBS, not available in in-cluster Prometheus") + } +} + +// defaultProbeHistoryChecker implements probeHistoryChecker by exec'ing curl +// into a cluster-monitoring-operator pod to query the in-cluster Prometheus. +type defaultProbeHistoryChecker struct{} + +func (d *defaultProbeHistoryChecker) queryProbeHistory(ctx context.Context, k8sClient k8sclient.Client, restConfig *rest.Config, consoleURL string) (string, error) { + podList := &corev1.PodList{} + if err := k8sClient.List( + ctx, podList, + client.InNamespace("openshift-monitoring"), + client.MatchingLabels{"app.kubernetes.io/name": "cluster-monitoring-operator"}, + ); err != nil { + return "", fmt.Errorf("failed to list cluster-monitoring-operator pods: %w", err) + } + + var cmoPod *corev1.Pod + for idx := range podList.Items { + if podList.Items[idx].Status.Phase == corev1.PodRunning { + cmoPod = &podList.Items[idx] + break + } + } + if cmoPod == nil { + return "", fmt.Errorf("no running cluster-monitoring-operator pod found in openshift-monitoring") + } + + // Build the Prometheus range query URL. + hostname := strings.TrimPrefix(consoleURL, "https://") + hostname = strings.TrimPrefix(hostname, "http://") + now := time.Now() + params := url.Values{} + params.Set("query", fmt.Sprintf(`probe_success{probe_url=~".*%s.*"}`, hostname)) + params.Set("start", fmt.Sprintf("%d", now.Add(-2*time.Hour).Unix())) + params.Set("end", fmt.Sprintf("%d", now.Unix())) + params.Set("step", "60") + queryURL := fmt.Sprintf("https://prometheus-k8s.openshift-monitoring.svc.cluster.local:9091/api/v1/query_range?%s", params.Encode()) + + curlCmd := fmt.Sprintf(`curl -sk -H "Authorization: Bearer $(cat /var/run/secrets/kubernetes.io/serviceaccount/token)" '%s'`, queryURL) + output, err := k8sclient.ExecInPod(ctx, restConfig, cmoPod, "cluster-monitoring-operator", []string{ + "sh", "-c", curlCmd, + }) + if err != nil { + return output, fmt.Errorf("exec failed: %w", err) + } + return strings.TrimSpace(output), nil +} + +// prometheusRangeResponse mirrors the Prometheus HTTP API range query response structure. +type prometheusRangeResponse struct { + Data struct { + Result []struct { + Values [][]interface{} `json:"values"` + } `json:"result"` + } `json:"data"` +} + +// checkProbeHistory queries Prometheus for historical probe_success data and classifies +// the failure pattern as persistent, intermittent, or recovered. +// +// Informational-only check. +func (i *Investigation) checkProbeHistory(ctx context.Context, r *investigation.Resources, consoleURL string, notes *notewriter.NoteWriter) { + output, err := i.probeHistoryCheck.queryProbeHistory(ctx, r.K8sClient, &r.RestConfig.Config, consoleURL) + if err != nil { + notes.AppendWarning("Probe History: unable to query Prometheus — %v", err) + return + } + + var resp prometheusRangeResponse + if err := json.Unmarshal([]byte(output), &resp); err != nil { + notes.AppendWarning("Probe History: failed to parse Prometheus response — %v", err) + return + } + + if len(resp.Data.Result) == 0 || len(resp.Data.Result[0].Values) == 0 { + notes.AppendWarning("Probe History: no probe_success data found in Prometheus for the past 2 hours") + return + } + + values := resp.Data.Result[0].Values + total := len(values) + failures := 0 + for _, v := range values { + if len(v) >= 2 { + if str, ok := v[1].(string); ok && str == "0" { + failures++ + } + } + } + + switch failures { + case 0: + notes.AppendSuccess("Probe History: probe has been succeeding for all %d data points in the past 2 hours — failure may be intermittent or already resolved", total) + case total: + notes.AppendWarning("Probe History: probe has been consistently failing — all %d data points in the past 2 hours show failure", total) + default: + notes.AppendWarning("Probe History: probe is intermittently failing — %d/%d data points failed in the past 2 hours", failures, total) + } } func (i *Investigation) Name() string { diff --git a/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn_test.go b/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn_test.go index e144c96e..9749949a 100644 --- a/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn_test.go +++ b/pkg/investigations/consoleerrorbudgetburn/consoleerrorbudgetburn_test.go @@ -9,6 +9,7 @@ import ( "testing" "time" + ec2v2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" cmv1 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" operatorv1 "github.com/openshift/api/operator/v1" "github.com/openshift/configuration-anomaly-detection/pkg/aws" @@ -138,6 +139,27 @@ func healthyBlackboxProber() *mockBlackboxProber { return &mockBlackboxProber{output: successfulProbeResponse} } +// mockProbeHistoryChecker implements probeHistoryChecker for testing. +type mockProbeHistoryChecker struct { + output string + err error +} + +func (m *mockProbeHistoryChecker) queryProbeHistory(_ context.Context, _ k8sclient.Client, _ *rest.Config, _ string) (string, error) { + return m.output, m.err +} + +func noopProbeHistoryChecker() *mockProbeHistoryChecker { + return &mockProbeHistoryChecker{output: allSucceedingHistoryResponse} +} + +const ( + allSucceedingHistoryResponse = `{"data":{"result":[{"values":[[1700000000,"1"],[1700000060,"1"],[1700000120,"1"]]}]}}` + persistentFailureHistoryResponse = `{"data":{"result":[{"values":[[1700000000,"0"],[1700000060,"0"],[1700000120,"0"]]}]}}` + intermittentFailureHistoryResponse = `{"data":{"result":[{"values":[[1700000000,"1"],[1700000060,"0"],[1700000120,"1"],[1700000180,"0"]]}]}}` + emptyHistoryResponse = `{"data":{"result":[]}}` +) + const successfulProbeResponse = `Logs for the probe: ts=2024-01-01T00:00:00.000Z caller=handler.go:119 module=http_2xx target=https://console-openshift-console.apps.test.example.com level=debug msg="Beginning probe" ts=2024-01-01T00:00:00.001Z caller=handler.go:119 module=http_2xx target=https://console-openshift-console.apps.test.example.com level=debug msg="Resolving target address" ip_protocol=ip4 @@ -641,6 +663,92 @@ func TestCheckUpstreamDNS_GetError(t *testing.T) { } } +// checkProbeHistory unit tests + +func TestCheckProbeHistory_Persistent(t *testing.T) { + notes := newTestNotes() + inv := &Investigation{probeHistoryCheck: &mockProbeHistoryChecker{output: persistentFailureHistoryResponse}} + r := &investigation.Resources{RestConfig: testRestConfig(), K8sClient: newFakeClient()} + + inv.checkProbeHistory(context.Background(), r, "https://console.test.example.com", notes) + + output := notes.String() + if !strings.Contains(output, "consistently failing") { + t.Errorf("expected persistent failure message, got: %s", output) + } + if !strings.Contains(output, "3 data points") { + t.Errorf("expected '3 data points' in notes, got: %s", output) + } +} + +func TestCheckProbeHistory_Intermittent(t *testing.T) { + notes := newTestNotes() + inv := &Investigation{probeHistoryCheck: &mockProbeHistoryChecker{output: intermittentFailureHistoryResponse}} + r := &investigation.Resources{RestConfig: testRestConfig(), K8sClient: newFakeClient()} + + inv.checkProbeHistory(context.Background(), r, "https://console.test.example.com", notes) + + output := notes.String() + if !strings.Contains(output, "intermittently failing") { + t.Errorf("expected intermittent failure message, got: %s", output) + } + if !strings.Contains(output, "2/4") { + t.Errorf("expected '2/4' failure count in notes, got: %s", output) + } +} + +func TestCheckProbeHistory_AllSucceeding(t *testing.T) { + notes := newTestNotes() + inv := &Investigation{probeHistoryCheck: &mockProbeHistoryChecker{output: allSucceedingHistoryResponse}} + r := &investigation.Resources{RestConfig: testRestConfig(), K8sClient: newFakeClient()} + + inv.checkProbeHistory(context.Background(), r, "https://console.test.example.com", notes) + + output := notes.String() + if !strings.Contains(output, "succeeding for all") { + t.Errorf("expected all-succeeding message, got: %s", output) + } + if !strings.Contains(output, "3 data points") { + t.Errorf("expected '3 data points' in notes, got: %s", output) + } +} + +func TestCheckProbeHistory_NoData(t *testing.T) { + notes := newTestNotes() + inv := &Investigation{probeHistoryCheck: &mockProbeHistoryChecker{output: emptyHistoryResponse}} + r := &investigation.Resources{RestConfig: testRestConfig(), K8sClient: newFakeClient()} + + inv.checkProbeHistory(context.Background(), r, "https://console.test.example.com", notes) + + if !strings.Contains(notes.String(), "no probe_success data found") { + t.Errorf("expected no-data warning, got: %s", notes.String()) + } +} + +func TestCheckProbeHistory_QueryFails(t *testing.T) { + notes := newTestNotes() + inv := &Investigation{probeHistoryCheck: &mockProbeHistoryChecker{err: fmt.Errorf("exec timeout")}} + r := &investigation.Resources{RestConfig: testRestConfig(), K8sClient: newFakeClient()} + + inv.checkProbeHistory(context.Background(), r, "https://console.test.example.com", notes) + + if !strings.Contains(notes.String(), "unable to query Prometheus") { + t.Errorf("expected exec error warning, got: %s", notes.String()) + } +} + +func TestCheckProbeHistory_BadJSON(t *testing.T) { + notes := newTestNotes() + inv := &Investigation{probeHistoryCheck: &mockProbeHistoryChecker{output: "not json"}} + r := &investigation.Resources{RestConfig: testRestConfig(), K8sClient: newFakeClient()} + + inv.checkProbeHistory(context.Background(), r, "https://console.test.example.com", notes) + + if !strings.Contains(notes.String(), "failed to parse Prometheus response") { + t.Errorf("expected parse error warning, got: %s", notes.String()) + } +} + // checkPodHealth unit tests (Router) var ( @@ -1110,7 +1218,7 @@ func TestRun_HCP_SkipsAllowedSourceRanges(t *testing.T) { }, } - inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber()} + inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber(), probeHistoryCheck: noopProbeHistoryChecker()} result, err := inv.Run(rb) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -1150,7 +1258,7 @@ func TestRun_AllowedSourceRangesMisconfigured(t *testing.T) { }, } - inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber()} + inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber(), probeHistoryCheck: noopProbeHistoryChecker()} result, err := inv.Run(rb) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -1202,7 +1310,7 @@ func TestRun_AllowedSourceRangesOK(t *testing.T) { }, } - inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber()} + inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber(), probeHistoryCheck: noopProbeHistoryChecker()} result, err := inv.Run(rb) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -1276,7 +1384,7 @@ func TestRun_NonAWSCluster_SkipsAWSChecks(t *testing.T) { }, } - inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber()} + inv := &Investigation{consoleChecker: healthyConsoleChecker(), blackboxProber: healthyBlackboxProber(), probeHistoryCheck: noopProbeHistoryChecker()} result, err := inv.Run(rb) if err != nil { t.Fatalf("unexpected error: %v", err) @@ -1313,6 +1421,9 @@ func TestRun_AWSCluster_AWSChecksRun(t *testing.T) { {InstanceID: "i-001", State: "InService"}, {InstanceID: "i-002", State: "InService"}, }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-elb").Return([]string{"sg-clb"}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), []string{"sg-clb"}). + Return([]ec2v2types.SecurityGroup{testSGAllowTCP443("sg-clb")}, nil) ic := newDefaultIngressController([]operatorv1.CIDR{"10.0.0.0/8"}) dns := newDefaultDNS([]operatorv1.Upstream{{Type: operatorv1.SystemResolveConfType}}) @@ -1338,9 +1449,10 @@ func TestRun_AWSCluster_AWSChecksRun(t *testing.T) { } inv := &Investigation{ - consoleChecker: healthyConsoleChecker(), - blackboxProber: healthyBlackboxProber(), - egressVerifier: successEgressVerifier(), + consoleChecker: healthyConsoleChecker(), + blackboxProber: healthyBlackboxProber(), + probeHistoryCheck: noopProbeHistoryChecker(), + egressVerifier: successEgressVerifier(), } result, err := inv.Run(rb) if err != nil { @@ -1464,7 +1576,7 @@ func TestCheckBlackboxProbe_Success(t *testing.T) { cluster := newTestCluster("test-cluster", "10.0.0.0/16", "https://console.test.example.com") k8sClient := newFakeClient() notes := newTestNotes() - inv := &Investigation{blackboxProber: healthyBlackboxProber()} + inv := &Investigation{blackboxProber: healthyBlackboxProber(), probeHistoryCheck: noopProbeHistoryChecker()} r := &investigation.Resources{ Cluster: cluster, @@ -1487,7 +1599,7 @@ func TestCheckBlackboxProbe_DNSFailure(t *testing.T) { cluster := newTestCluster("test-cluster", "10.0.0.0/16", "https://console.test.example.com") k8sClient := newFakeClient() notes := newTestNotes() - inv := &Investigation{blackboxProber: &mockBlackboxProber{output: dnsFailureProbeResponse}} + inv := &Investigation{blackboxProber: &mockBlackboxProber{output: dnsFailureProbeResponse}, probeHistoryCheck: noopProbeHistoryChecker()} r := &investigation.Resources{ Cluster: cluster, @@ -1936,6 +2048,39 @@ func newNLBIngressController(allowedSourceRanges []operatorv1.CIDR) *operatorv1. return ic } +// testSGAllowTCP443 creates a SecurityGroup that allows TCP 443 from 0.0.0.0/0. +func testSGAllowTCP443(sgID string) ec2v2types.SecurityGroup { + return ec2v2types.SecurityGroup{ + GroupId: &sgID, + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("tcp"), + FromPort: int32Ptr(443), + ToPort: int32Ptr(443), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("0.0.0.0/0")}}, + }, + }, + } +} + +// testSGAllowNodePorts creates a SecurityGroup that allows TCP 30000-32767 from 0.0.0.0/0. +func testSGAllowNodePorts(sgID string) ec2v2types.SecurityGroup { + return ec2v2types.SecurityGroup{ + GroupId: &sgID, + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("tcp"), + FromPort: int32Ptr(30000), + ToPort: int32Ptr(32767), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("0.0.0.0/0")}}, + }, + }, + } +} + +func strPtr(s string) *string { return &s } +func int32Ptr(i int32) *int32 { return &i } + // determineLBType unit tests func TestDetermineLBType_NLB(t *testing.T) { @@ -2004,12 +2149,16 @@ func TestCheckLBHealth_NLB_AllHealthy(t *testing.T) { mockAws := awsmock.NewMockClient(ctrl) mockAws.EXPECT().FindNLBByDNSName(gomock.Any(), "test-nlb-abc123.elb.us-east-1.amazonaws.com"). - Return("arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/net/test-nlb/abc", "test-nlb", nil) + Return("arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/net/test-nlb/abc", "test-nlb", []string{}, nil) mockAws.EXPECT().GetNLBTargetHealth(gomock.Any(), "arn:aws:elasticloadbalancing:us-east-1:123:loadbalancer/net/test-nlb/abc"). Return([]aws.NLBTargetHealth{ {TargetID: "i-001", Port: 443, State: "healthy"}, {TargetID: "i-002", Port: 443, State: "healthy"}, }, nil) + mockAws.EXPECT().GetInstanceSecurityGroupIDs(gomock.Any(), gomock.Any()). + Return(map[string][]string{"i-001": {"sg-aaa"}, "i-002": {"sg-aaa"}}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), gomock.Any()). + Return([]ec2v2types.SecurityGroup{testSGAllowNodePorts("sg-aaa")}, nil) ic := newNLBIngressController(nil) svc := newRouterService("test-nlb-abc123.elb.us-east-1.amazonaws.com") @@ -2033,12 +2182,16 @@ func TestCheckLBHealth_NLB_UnhealthyTarget(t *testing.T) { mockAws := awsmock.NewMockClient(ctrl) mockAws.EXPECT().FindNLBByDNSName(gomock.Any(), "test-nlb-abc123.elb.us-east-1.amazonaws.com"). - Return("arn:nlb", "test-nlb", nil) + Return("arn:nlb", "test-nlb", []string{}, nil) mockAws.EXPECT().GetNLBTargetHealth(gomock.Any(), "arn:nlb"). Return([]aws.NLBTargetHealth{ {TargetID: "i-001", Port: 443, State: "healthy"}, {TargetID: "i-002", Port: 443, State: "unhealthy", Reason: "Target.FailedHealthChecks"}, }, nil) + mockAws.EXPECT().GetInstanceSecurityGroupIDs(gomock.Any(), gomock.Any()). + Return(map[string][]string{"i-001": {"sg-aaa"}, "i-002": {"sg-aaa"}}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), gomock.Any()). + Return([]ec2v2types.SecurityGroup{testSGAllowNodePorts("sg-aaa")}, nil) ic := newNLBIngressController(nil) svc := newRouterService("test-nlb-abc123.elb.us-east-1.amazonaws.com") @@ -2071,6 +2224,10 @@ func TestCheckLBHealth_CLB_AllInService(t *testing.T) { {InstanceID: "i-001", State: "InService"}, {InstanceID: "i-002", State: "InService"}, }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-clb"). + Return([]string{"sg-clb"}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), []string{"sg-clb"}). + Return([]ec2v2types.SecurityGroup{testSGAllowTCP443("sg-clb")}, nil) ic := newDefaultIngressController(nil) // no ProviderParameters → Classic svc := newRouterService("test-clb-123456.us-east-1.elb.amazonaws.com") @@ -2100,6 +2257,10 @@ func TestCheckLBHealth_CLB_OutOfService(t *testing.T) { {InstanceID: "i-001", State: "InService"}, {InstanceID: "i-002", State: "OutOfService", Description: "Instance has failed health checks"}, }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-clb"). + Return([]string{"sg-clb"}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), []string{"sg-clb"}). + Return([]ec2v2types.SecurityGroup{testSGAllowTCP443("sg-clb")}, nil) ic := newDefaultIngressController(nil) svc := newRouterService("test-clb-123456.us-east-1.elb.amazonaws.com") @@ -2207,7 +2368,7 @@ func TestCheckLBHealth_NLB_NoTargets(t *testing.T) { mockAws := awsmock.NewMockClient(ctrl) mockAws.EXPECT().FindNLBByDNSName(gomock.Any(), "test-nlb.elb.us-east-1.amazonaws.com"). - Return("arn:nlb", "test-nlb", nil) + Return("arn:nlb", "test-nlb", []string{}, nil) mockAws.EXPECT().GetNLBTargetHealth(gomock.Any(), "arn:nlb"). Return([]aws.NLBTargetHealth{}, nil) @@ -2239,6 +2400,10 @@ func TestCheckLBHealth_CLB_DefaultType(t *testing.T) { Return([]aws.CLBInstanceHealth{ {InstanceID: "i-001", State: "InService"}, }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-clb"). + Return([]string{"sg-clb"}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), []string{"sg-clb"}). + Return([]ec2v2types.SecurityGroup{testSGAllowTCP443("sg-clb")}, nil) ic := newDefaultIngressController(nil) // no ProviderParameters svc := newRouterService("test-clb.us-east-1.elb.amazonaws.com") @@ -2259,6 +2424,346 @@ func TestCheckLBHealth_CLB_DefaultType(t *testing.T) { } } +// Securitygroup check tests + +func TestCheckCLBHealth_SGAllowsTCP443(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindCLBByDNSName(gomock.Any(), "test-clb.elb.amazonaws.com").Return("test-clb", nil) + mockAws.EXPECT().GetCLBInstanceHealth(gomock.Any(), "test-clb").Return([]aws.CLBInstanceHealth{ + {InstanceID: "i-001", State: "InService"}, + }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-clb").Return([]string{"sg-123"}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), []string{"sg-123"}). + Return([]ec2v2types.SecurityGroup{testSGAllowTCP443("sg-123")}, nil) + + ic := newDefaultIngressController(nil) + svc := newRouterService("test-clb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + output := notes.String() + if !strings.Contains(output, "security group allows TCP 443 inbound") { + t.Errorf("expected SG allows TCP 443 message, got: %s", output) + } +} + +func TestCheckCLBHealth_SGMissingTCP443(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindCLBByDNSName(gomock.Any(), "test-clb.elb.amazonaws.com").Return("test-clb", nil) + mockAws.EXPECT().GetCLBInstanceHealth(gomock.Any(), "test-clb").Return([]aws.CLBInstanceHealth{ + {InstanceID: "i-001", State: "InService"}, + }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-clb").Return([]string{"sg-123"}, nil) + // SG with UDP rule only; no TCP 443. + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), []string{"sg-123"}). + Return([]ec2v2types.SecurityGroup{{ + GroupId: strPtr("sg-123"), + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("udp"), + FromPort: int32Ptr(443), + ToPort: int32Ptr(443), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("0.0.0.0/0")}}, + }, + }, + }}, nil) + + ic := newDefaultIngressController(nil) + svc := newRouterService("test-clb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + output := notes.String() + if !strings.Contains(output, "does not allow TCP 443 inbound") { + t.Errorf("expected SG missing TCP 443 warning, got: %s", output) + } +} + +func TestCheckCLBHealth_NoSecurityGroups(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindCLBByDNSName(gomock.Any(), "test-clb.elb.amazonaws.com").Return("test-clb", nil) + mockAws.EXPECT().GetCLBInstanceHealth(gomock.Any(), "test-clb").Return([]aws.CLBInstanceHealth{ + {InstanceID: "i-001", State: "InService"}, + }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-clb").Return([]string{}, nil) + + ic := newDefaultIngressController(nil) + svc := newRouterService("test-clb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + if !strings.Contains(notes.String(), "no security groups attached") { + t.Errorf("expected no SGs warning, got: %s", notes.String()) + } +} + +func TestCheckCLBHealth_SGLookupError(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindCLBByDNSName(gomock.Any(), "test-clb.elb.amazonaws.com").Return("test-clb", nil) + mockAws.EXPECT().GetCLBInstanceHealth(gomock.Any(), "test-clb").Return([]aws.CLBInstanceHealth{ + {InstanceID: "i-001", State: "InService"}, + }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-clb").Return(nil, fmt.Errorf("access denied")) + + ic := newDefaultIngressController(nil) + svc := newRouterService("test-clb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + if !strings.Contains(notes.String(), "failed to get CLB security groups") { + t.Errorf("expected SG lookup error warning, got: %s", notes.String()) + } +} + +func TestCheckNLBHealth_UnexpectedSGs(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindNLBByDNSName(gomock.Any(), "test-nlb.elb.amazonaws.com"). + Return("arn:nlb", "test-nlb", []string{"sg-unexpected"}, nil) + mockAws.EXPECT().GetNLBTargetHealth(gomock.Any(), "arn:nlb"). + Return([]aws.NLBTargetHealth{ + {TargetID: "i-001", Port: 443, State: "healthy"}, + }, nil) + mockAws.EXPECT().GetInstanceSecurityGroupIDs(gomock.Any(), gomock.Any()). + Return(map[string][]string{"i-001": {"sg-node"}}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), gomock.Any()). + Return([]ec2v2types.SecurityGroup{testSGAllowNodePorts("sg-node")}, nil) + + ic := newNLBIngressController(nil) + svc := newRouterService("test-nlb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + if !strings.Contains(notes.String(), "has security groups attached (unusual)") { + t.Errorf("expected unexpected SGs warning, got: %s", notes.String()) + } +} + +func TestCheckNLBHealth_NodeSGAllowsNodePorts(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindNLBByDNSName(gomock.Any(), "test-nlb.elb.amazonaws.com"). + Return("arn:nlb", "test-nlb", []string{}, nil) + mockAws.EXPECT().GetNLBTargetHealth(gomock.Any(), "arn:nlb"). + Return([]aws.NLBTargetHealth{ + {TargetID: "i-001", Port: 31174, State: "healthy"}, + }, nil) + mockAws.EXPECT().GetInstanceSecurityGroupIDs(gomock.Any(), []string{"i-001"}). + Return(map[string][]string{"i-001": {"sg-node"}}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), gomock.Any()). + Return([]ec2v2types.SecurityGroup{testSGAllowNodePorts("sg-node")}, nil) + + ic := newNLBIngressController(nil) + svc := newRouterService("test-nlb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + if !strings.Contains(notes.String(), "infra node security groups allow NodePort traffic") { + t.Errorf("expected NodePort allowed message, got: %s", notes.String()) + } +} + +func TestCheckNLBHealth_NodeSGMissingNodePorts(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindNLBByDNSName(gomock.Any(), "test-nlb.elb.amazonaws.com"). + Return("arn:nlb", "test-nlb", []string{}, nil) + mockAws.EXPECT().GetNLBTargetHealth(gomock.Any(), "arn:nlb"). + Return([]aws.NLBTargetHealth{ + {TargetID: "i-001", Port: 31174, State: "healthy"}, + }, nil) + mockAws.EXPECT().GetInstanceSecurityGroupIDs(gomock.Any(), []string{"i-001"}). + Return(map[string][]string{"i-001": {"sg-node"}}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), gomock.Any()). + Return([]ec2v2types.SecurityGroup{{ + GroupId: strPtr("sg-node"), + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("tcp"), + FromPort: int32Ptr(22), + ToPort: int32Ptr(22), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("0.0.0.0/0")}}, + }, + }, + }}, nil) + + ic := newNLBIngressController(nil) + svc := newRouterService("test-nlb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + if !strings.Contains(notes.String(), "do not allow TCP traffic on NodePort range") { + t.Errorf("expected NodePort blocked warning, got: %s", notes.String()) + } +} + +func TestCheckNLBHealth_InstanceSGLookupError(t *testing.T) { + ctrl := gomock.NewController(t) + defer ctrl.Finish() + mockAws := awsmock.NewMockClient(ctrl) + + mockAws.EXPECT().FindNLBByDNSName(gomock.Any(), "test-nlb.elb.amazonaws.com"). + Return("arn:nlb", "test-nlb", []string{}, nil) + mockAws.EXPECT().GetNLBTargetHealth(gomock.Any(), "arn:nlb"). + Return([]aws.NLBTargetHealth{ + {TargetID: "i-001", Port: 443, State: "healthy"}, + }, nil) + mockAws.EXPECT().GetInstanceSecurityGroupIDs(gomock.Any(), []string{"i-001"}). + Return(nil, fmt.Errorf("access denied")) + + ic := newNLBIngressController(nil) + svc := newRouterService("test-nlb.elb.amazonaws.com") + k8sClient := newFakeClient(ic, svc) + cluster := newAWSTestClusterWithDNS("test", "10.0.0.0/16", "", "p", "e.com", false) + notes := newTestNotes() + inv := &Investigation{} + + r := &investigation.Resources{Cluster: cluster, K8sClient: k8sClient, AwsClient: mockAws} + inv.checkLoadBalancerHealth(context.Background(), r, notes) + + if !strings.Contains(notes.String(), "failed to get infra node security groups") { + t.Errorf("expected SG lookup error warning, got: %s", notes.String()) + } +} + +// sgAllowsTCPInbound unit tests + +func TestSgAllowsTCPInbound_Allows0000(t *testing.T) { + sgs := []ec2v2types.SecurityGroup{testSGAllowTCP443("sg-1")} + if !sgAllowsTCPInbound(sgs, 443, 443, "10.0.0.0/16") { + t.Error("expected true for 0.0.0.0/0 rule") + } +} + +func TestSgAllowsTCPInbound_AllowsMachineCIDR(t *testing.T) { + sg := ec2v2types.SecurityGroup{ + GroupId: strPtr("sg-1"), + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("tcp"), + FromPort: int32Ptr(443), + ToPort: int32Ptr(443), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("10.0.0.0/8")}}, + }, + }, + } + if !sgAllowsTCPInbound([]ec2v2types.SecurityGroup{sg}, 443, 443, "10.0.0.0/16") { + t.Error("expected true — 10.0.0.0/8 covers 10.0.0.0/16") + } +} + +func TestSgAllowsTCPInbound_PortMismatch(t *testing.T) { + sg := ec2v2types.SecurityGroup{ + GroupId: strPtr("sg-1"), + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("tcp"), + FromPort: int32Ptr(80), + ToPort: int32Ptr(80), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("0.0.0.0/0")}}, + }, + }, + } + if sgAllowsTCPInbound([]ec2v2types.SecurityGroup{sg}, 443, 443, "10.0.0.0/16") { + t.Error("expected false — rule is for port 80, not 443") + } +} + +func TestSgAllowsTCPInbound_NoMatchingRules(t *testing.T) { + sg := ec2v2types.SecurityGroup{ + GroupId: strPtr("sg-1"), + IpPermissions: []ec2v2types.IpPermission{}, + } + if sgAllowsTCPInbound([]ec2v2types.SecurityGroup{sg}, 443, 443, "10.0.0.0/16") { + t.Error("expected false — no rules at all") + } +} + +func TestSgAllowsTCPInbound_AllProtocols(t *testing.T) { + sg := ec2v2types.SecurityGroup{ + GroupId: strPtr("sg-1"), + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("-1"), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("0.0.0.0/0")}}, + }, + }, + } + if !sgAllowsTCPInbound([]ec2v2types.SecurityGroup{sg}, 443, 443, "10.0.0.0/16") { + t.Error("expected true — protocol -1 allows all traffic") + } +} + +func TestSgAllowsTCPInbound_EmptyCIDR(t *testing.T) { + sg := ec2v2types.SecurityGroup{ + GroupId: strPtr("sg-1"), + IpPermissions: []ec2v2types.IpPermission{ + { + IpProtocol: strPtr("tcp"), + FromPort: int32Ptr(30000), + ToPort: int32Ptr(32767), + IpRanges: []ec2v2types.IpRange{{CidrIp: strPtr("10.0.0.0/8")}}, + }, + }, + } + if !sgAllowsTCPInbound([]ec2v2types.SecurityGroup{sg}, 30000, 32767, "") { + t.Error("expected true — empty CIDR means any source is OK") + } +} + // mockEgressVerifier is a test mock for the egressVerifier interface. type mockEgressVerifier struct { result networkverifier.VerifierResult @@ -2380,6 +2885,9 @@ func TestRun_PrivateLink_SkipsEgress(t *testing.T) { {InstanceID: "i-001", State: "InService"}, {InstanceID: "i-002", State: "InService"}, }, nil) + mockAws.EXPECT().GetCLBSecurityGroupIDs(gomock.Any(), "test-elb").Return([]string{"sg-clb"}, nil) + mockAws.EXPECT().GetSecurityGroupRules(gomock.Any(), []string{"sg-clb"}). + Return([]ec2v2types.SecurityGroup{testSGAllowTCP443("sg-clb")}, nil) // No egress verifier calls expected — PrivateLink clusters skip egress. @@ -2407,9 +2915,10 @@ func TestRun_PrivateLink_SkipsEgress(t *testing.T) { egressMock := successEgressVerifier() inv := &Investigation{ - consoleChecker: healthyConsoleChecker(), - blackboxProber: healthyBlackboxProber(), - egressVerifier: egressMock, + consoleChecker: healthyConsoleChecker(), + blackboxProber: healthyBlackboxProber(), + probeHistoryCheck: noopProbeHistoryChecker(), + egressVerifier: egressMock, } result, err := inv.Run(rb) if err != nil { diff --git a/pkg/ocm/mock/ocmmock.go b/pkg/ocm/mock/ocmmock.go index 212367dc..c8930692 100644 --- a/pkg/ocm/mock/ocmmock.go +++ b/pkg/ocm/mock/ocmmock.go @@ -12,7 +12,7 @@ package ocmmock import ( reflect "reflect" - ocm_sdk_go "github.com/openshift-online/ocm-sdk-go" + sdk "github.com/openshift-online/ocm-sdk-go" v1 "github.com/openshift-online/ocm-sdk-go/accountsmgmt/v1" v10 "github.com/openshift-online/ocm-sdk-go/clustersmgmt/v1" v11 "github.com/openshift-online/ocm-sdk-go/servicelogs/v1" @@ -120,10 +120,10 @@ func (mr *MockClientMockRecorder) GetClusterMachinePools(internalClusterID any) } // GetConnection mocks base method. -func (m *MockClient) GetConnection() *ocm_sdk_go.Connection { +func (m *MockClient) GetConnection() *sdk.Connection { m.ctrl.T.Helper() ret := m.ctrl.Call(m, "GetConnection") - ret0, _ := ret[0].(*ocm_sdk_go.Connection) + ret0, _ := ret[0].(*sdk.Connection) return ret0 }