diff --git a/libpod/container_internal.go b/libpod/container_internal.go index 042a6a4c153..c159876dc6d 100644 --- a/libpod/container_internal.go +++ b/libpod/container_internal.go @@ -230,7 +230,9 @@ func (c *Container) shouldRestart() bool { if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart { isUnhealthy, err := c.isUnhealthy() if err != nil { + // If the state cannot be determined, assume unhealthy. logrus.Errorf("Checking if container is unhealthy: %v", err) + return true } else if isUnhealthy { return true } diff --git a/libpod/define/errors.go b/libpod/define/errors.go index 47b50a6a64c..0572cc118e7 100644 --- a/libpod/define/errors.go +++ b/libpod/define/errors.go @@ -226,4 +226,8 @@ var ( // ErrHealthCheckTimeout indicates that a HealthCheck timed out. ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout") + + // ErrHealthCheckLogCorrupted indicates that the healthcheck log + // cannot be parsed. + ErrHealthCheckLogCorrupted = errors.New("healthcheck log corrupted") ) diff --git a/libpod/healthcheck.go b/libpod/healthcheck.go index ba429547673..820151408f9 100644 --- a/libpod/healthcheck.go +++ b/libpod/healthcheck.go @@ -17,6 +17,7 @@ import ( "github.com/sirupsen/logrus" "go.podman.io/podman/v6/libpod/define" "go.podman.io/podman/v6/libpod/shutdown" + "go.podman.io/storage/pkg/ioutils" "golang.org/x/sys/unix" ) @@ -351,7 +352,11 @@ func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.He func (c *Container) updateHealthStatus(status string) error { healthCheck, err := c.readHealthCheckLog() if err != nil { - return err + // If the healthcheck log is corrupted, overwrite it with a new result. + if !errors.Is(err, define.ErrHealthCheckLogCorrupted) { + return err + } + logrus.Warnf("Failed to read healthcheck log for %s: %v", c.ID(), err) } healthCheck.Status = status return c.writeHealthCheckLog(healthCheck) @@ -364,7 +369,8 @@ func (c *Container) isUnhealthy() (bool, error) { } healthCheck, err := c.readHealthCheckLog() if err != nil { - return false, err + // If the state cannot be determined from the log, treat it as unhealthy. + return true, err } return healthCheck.Status == define.HealthCheckUnhealthy, nil } @@ -374,7 +380,11 @@ func (c *Container) isUnhealthy() (bool, error) { func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, hcResult define.HealthCheckStatus, inStartPeriod bool) (define.HealthCheckResults, error) { healthCheck, err := c.readHealthCheckLog() if err != nil { - return define.HealthCheckResults{}, err + // If the log is corrupted, use an empty result and eventually overwrite it. + if !errors.Is(err, define.ErrHealthCheckLogCorrupted) { + return healthCheck, err + } + logrus.Warnf("Failed to read healthcheck log for %s: %v", c.ID(), err) } if hcl.ExitCode == 0 { // set status to healthy, reset failing state to 0 @@ -402,12 +412,12 @@ func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, hcResult def return healthCheck, c.writeHealthCheckLog(healthCheck) } -func (c *Container) witeToFileHealthCheckResults(path string, result define.HealthCheckResults) error { +func (c *Container) writeToFileHealthCheckResults(path string, result define.HealthCheckResults) error { newResults, err := json.Marshal(result) if err != nil { return fmt.Errorf("unable to marshall healthchecks for writing: %w", err) } - return os.WriteFile(path, newResults, 0o700) + return ioutils.AtomicWriteFile(path, newResults, 0o700) } func (c *Container) getHealthCheckLogDestination() string { @@ -422,7 +432,7 @@ func (c *Container) getHealthCheckLogDestination() string { } func (c *Container) writeHealthCheckLog(result define.HealthCheckResults) error { - return c.witeToFileHealthCheckResults(c.getHealthCheckLogDestination(), result) + return c.writeToFileHealthCheckResults(c.getHealthCheckLogDestination(), result) } // readHealthCheckLog read HealthCheck logs from the path or events_logger @@ -432,27 +442,32 @@ func (c *Container) readHealthCheckLog() (define.HealthCheckResults, error) { } // readFromFileHealthCheckLog returns HealthCheck results by reading the container's -// health check log file. If the health check log file does not exist, then -// an empty healthcheck struct is returned +// health check log file. If the health check log file does not exist, then +// an empty healthcheck struct is returned. If the health check log file +// cannot be parsed, the error define.ErrHealthCheckLogCorrupted struct +// is returned together with an empty define.HealthCheckResults. // The caller should lock the container before this function is called. func (c *Container) readFromFileHealthCheckLog(path string) (define.HealthCheckResults, error) { var healthCheck define.HealthCheckResults b, err := os.ReadFile(path) if err != nil { if errors.Is(err, fs.ErrNotExist) { - // If the file does not exists just return empty healthcheck and no error. + // If the file does not exist just return empty healthcheck and no error. return healthCheck, nil } return healthCheck, fmt.Errorf("failed to read health check log file: %w", err) } + // If the file is corrupted, return an empty healthcheck and wrapped ErrHealthCheckLogCorrupted. if err := json.Unmarshal(b, &healthCheck); err != nil { - return healthCheck, fmt.Errorf("failed to unmarshal existing healthcheck results in %s: %w", path, err) + return healthCheck, fmt.Errorf("%w: %w", define.ErrHealthCheckLogCorrupted, err) } return healthCheck, nil } // HealthCheckStatus returns the current state of a container with a healthcheck. // Returns an empty string if no health check is defined for the container. +// If the healthcheck log is corrupted, the error define.ErrHealthCheckLogCorrupted +// is returned. func (c *Container) HealthCheckStatus() (string, error) { if !c.batched { c.lock.Lock() @@ -474,7 +489,7 @@ func (c *Container) healthCheckStatus() (string, error) { results, err := c.readHealthCheckLog() if err != nil { - return "", fmt.Errorf("unable to get healthcheck log for %s: %w", c.ID(), err) + return define.HealthCheckUnhealthy, fmt.Errorf("unable to get healthcheck log for %s: %w", c.ID(), err) } return results.Status, nil diff --git a/pkg/domain/filters/containers.go b/pkg/domain/filters/containers.go index 273ece6604d..57bb518d26c 100644 --- a/pkg/domain/filters/containers.go +++ b/pkg/domain/filters/containers.go @@ -10,6 +10,7 @@ import ( "strings" "time" + "github.com/sirupsen/logrus" "go.podman.io/common/pkg/filters" "go.podman.io/common/pkg/util" "go.podman.io/podman/v6/libpod" @@ -180,6 +181,7 @@ func GenerateContainerFilterFuncs(filter string, filterValues []string, r *libpo return func(c *libpod.Container) bool { hcStatus, err := c.HealthCheckStatus() if err != nil { + logrus.Warnf("Healthcheck status for %v can't be determined, status won't be matched: %v", c.ID(), err) return false } return slices.Contains(filterValues, hcStatus) diff --git a/pkg/ps/ps.go b/pkg/ps/ps.go index 3babb648951..32558562f20 100644 --- a/pkg/ps/ps.go +++ b/pkg/ps/ps.go @@ -229,7 +229,11 @@ func ListContainerBatch(rt *libpod.Runtime, ctr *libpod.Container, opts entities healthStatus, err = c.HealthCheckStatus() if err != nil { - return err + // Do not treat a corrupted healthcheck log as a hard error. + if !errors.Is(err, define.ErrHealthCheckLogCorrupted) { + return err + } + logrus.Warnf("Container %v health check log is corrupted: %v", c.ID(), err) } restartCount, err = c.RestartCount() diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats index b2b3c90ab81..f215010a0ff 100644 --- a/test/system/220-healthcheck.bats +++ b/test/system/220-healthcheck.bats @@ -399,6 +399,46 @@ function _check_health_log { } +@test "podman healthcheck - corrupted log file is handled gracefully" { + local TMP_DIR_HEALTHCHECK="$PODMAN_TMPDIR/healthcheck" + mkdir $TMP_DIR_HEALTHCHECK + local ctrname="c-h-$(safename)" + local msg="healthmsg-$(random_string)" + _create_container_with_health_log_settings $ctrname $msg "{{.Config.HealthLogDestination}}" "--health-log-destination $TMP_DIR_HEALTHCHECK" "$TMP_DIR_HEALTHCHECK" "HealthLogDestination" + cid="$output" + + # First make sure there is an uncorrupted log. + run_podman healthcheck run $ctrname + is "$output" "" "output from 'podman healthcheck run'" + + healthcheck_log_path="${TMP_DIR_HEALTHCHECK}/${cid}-healthcheck.log" + count=$(grep -co "$msg" $healthcheck_log_path) + assert "$count" -ge 1 "Number of matching health log messages" + + # Corrupt the log file with invalid JSON. + echo "{invalid json{" > $healthcheck_log_path + + # podman ps must not fail but should warn and report unhealthy. + run_podman 0+w ps --format '{{.Names}} {{.Status}}' + assert "$output" =~ "$ctrname" "podman ps must still list the container" + assert "$output" =~ "unhealthy" "corrupted log should be reported as unhealthy" + assert "$output" =~ "healthcheck log corrupted" "expected warning about corrupted log from ps" + + # Verify that healthcheck run recovers by overwriting the corrupted + # log file with a new result while issuing a warning. + run_podman 0+w healthcheck run $ctrname + assert "$output" =~ "healthcheck log corrupted" "expected warning about corrupted log" + + count=$(grep -co "$msg" $healthcheck_log_path) + assert "$count" -ge 1 "Number of matching health log messages after recovery" + + # Run again to verify the warning is gone after the log was rewritten. + run_podman healthcheck run $ctrname + is "$output" "" "no warnings after log recovery" + + run_podman rm -t 0 -f $ctrname +} + @test "podman healthcheck --health-log-destination journal" { skip_if_remote "We cannot read journalctl over remote."