Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions libpod/container_internal.go
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,9 @@ func (c *Container) shouldRestart() bool {
if c.config.HealthCheckOnFailureAction == define.HealthCheckOnFailureActionRestart {
isUnhealthy, err := c.isUnhealthy()
if err != nil {
// If the state cannot be determined, assume unhealthy.
logrus.Errorf("Checking if container is unhealthy: %v", err)
return true
} else if isUnhealthy {
return true
}
Expand Down
4 changes: 4 additions & 0 deletions libpod/define/errors.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,4 +226,8 @@ var (

// ErrHealthCheckTimeout indicates that a HealthCheck timed out.
ErrHealthCheckTimeout = errors.New("healthcheck command exceeded timeout")

// ErrHealthCheckLogCorrupted indicates that the healthcheck log
// cannot be parsed.
ErrHealthCheckLogCorrupted = errors.New("healthcheck log corrupted")
)
37 changes: 26 additions & 11 deletions libpod/healthcheck.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/sirupsen/logrus"
"go.podman.io/podman/v6/libpod/define"
"go.podman.io/podman/v6/libpod/shutdown"
"go.podman.io/storage/pkg/ioutils"
"golang.org/x/sys/unix"
)

Expand Down Expand Up @@ -351,7 +352,11 @@ func newHealthCheckLog(start, end time.Time, exitCode int, log string) define.He
func (c *Container) updateHealthStatus(status string) error {
healthCheck, err := c.readHealthCheckLog()
if err != nil {
return err
// If the healthcheck log is corrupted, overwrite it with a new result.
if !errors.Is(err, define.ErrHealthCheckLogCorrupted) {
return err
}
logrus.Warnf("Failed to read healthcheck log for %s: %v", c.ID(), err)
}
healthCheck.Status = status
return c.writeHealthCheckLog(healthCheck)
Expand All @@ -364,7 +369,8 @@ func (c *Container) isUnhealthy() (bool, error) {
}
healthCheck, err := c.readHealthCheckLog()
if err != nil {
return false, err
// If the state cannot be determined from the log, treat it as unhealthy.
return true, err
}
return healthCheck.Status == define.HealthCheckUnhealthy, nil
}
Expand All @@ -374,7 +380,11 @@ func (c *Container) isUnhealthy() (bool, error) {
func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, hcResult define.HealthCheckStatus, inStartPeriod bool) (define.HealthCheckResults, error) {
healthCheck, err := c.readHealthCheckLog()
if err != nil {
return define.HealthCheckResults{}, err
// If the log is corrupted, use an empty result and eventually overwrite it.
if !errors.Is(err, define.ErrHealthCheckLogCorrupted) {
return healthCheck, err
}
logrus.Warnf("Failed to read healthcheck log for %s: %v", c.ID(), err)
}
if hcl.ExitCode == 0 {
// set status to healthy, reset failing state to 0
Expand Down Expand Up @@ -402,12 +412,12 @@ func (c *Container) updateHealthCheckLog(hcl define.HealthCheckLog, hcResult def
return healthCheck, c.writeHealthCheckLog(healthCheck)
}

func (c *Container) witeToFileHealthCheckResults(path string, result define.HealthCheckResults) error {
func (c *Container) writeToFileHealthCheckResults(path string, result define.HealthCheckResults) error {
newResults, err := json.Marshal(result)
if err != nil {
return fmt.Errorf("unable to marshall healthchecks for writing: %w", err)
}
return os.WriteFile(path, newResults, 0o700)
return ioutils.AtomicWriteFile(path, newResults, 0o700)

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are permissions correct?

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should it be 0o644 or 0o600?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think 0o600 is correct.

@baude, do you remember why 0o700 was used?

}

func (c *Container) getHealthCheckLogDestination() string {
Expand All @@ -422,7 +432,7 @@ func (c *Container) getHealthCheckLogDestination() string {
}

func (c *Container) writeHealthCheckLog(result define.HealthCheckResults) error {
return c.witeToFileHealthCheckResults(c.getHealthCheckLogDestination(), result)
return c.writeToFileHealthCheckResults(c.getHealthCheckLogDestination(), result)
}

// readHealthCheckLog read HealthCheck logs from the path or events_logger
Expand All @@ -432,27 +442,32 @@ func (c *Container) readHealthCheckLog() (define.HealthCheckResults, error) {
}

// readFromFileHealthCheckLog returns HealthCheck results by reading the container's
// health check log file. If the health check log file does not exist, then
// an empty healthcheck struct is returned
// health check log file. If the health check log file does not exist, then
// an empty healthcheck struct is returned. If the health check log file
// cannot be parsed, the error define.ErrHealthCheckLogCorrupted struct
// is returned together with an empty define.HealthCheckResults.
// The caller should lock the container before this function is called.
func (c *Container) readFromFileHealthCheckLog(path string) (define.HealthCheckResults, error) {
var healthCheck define.HealthCheckResults
b, err := os.ReadFile(path)
if err != nil {
if errors.Is(err, fs.ErrNotExist) {
// If the file does not exists just return empty healthcheck and no error.
// If the file does not exist just return empty healthcheck and no error.
return healthCheck, nil
}
return healthCheck, fmt.Errorf("failed to read health check log file: %w", err)
}
// If the file is corrupted, return an empty healthcheck and wrapped ErrHealthCheckLogCorrupted.
if err := json.Unmarshal(b, &healthCheck); err != nil {
return healthCheck, fmt.Errorf("failed to unmarshal existing healthcheck results in %s: %w", path, err)
return healthCheck, fmt.Errorf("%w: %w", define.ErrHealthCheckLogCorrupted, err)
}
return healthCheck, nil
}

// HealthCheckStatus returns the current state of a container with a healthcheck.
// Returns an empty string if no health check is defined for the container.
// If the healthcheck log is corrupted, the error define.ErrHealthCheckLogCorrupted
// is returned.
func (c *Container) HealthCheckStatus() (string, error) {
if !c.batched {
c.lock.Lock()
Expand All @@ -474,7 +489,7 @@ func (c *Container) healthCheckStatus() (string, error) {

results, err := c.readHealthCheckLog()
if err != nil {
return "", fmt.Errorf("unable to get healthcheck log for %s: %w", c.ID(), err)
return define.HealthCheckUnhealthy, fmt.Errorf("unable to get healthcheck log for %s: %w", c.ID(), err)
}

return results.Status, nil
Expand Down
2 changes: 2 additions & 0 deletions pkg/domain/filters/containers.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"strings"
"time"

"github.com/sirupsen/logrus"
"go.podman.io/common/pkg/filters"
"go.podman.io/common/pkg/util"
"go.podman.io/podman/v6/libpod"
Expand Down Expand Up @@ -180,6 +181,7 @@ func GenerateContainerFilterFuncs(filter string, filterValues []string, r *libpo
return func(c *libpod.Container) bool {
hcStatus, err := c.HealthCheckStatus()
if err != nil {
logrus.Warnf("Healthcheck status for %v can't be determined, status won't be matched: %v", c.ID(), err)
return false
}
return slices.Contains(filterValues, hcStatus)
Expand Down
6 changes: 5 additions & 1 deletion pkg/ps/ps.go
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,11 @@ func ListContainerBatch(rt *libpod.Runtime, ctr *libpod.Container, opts entities

healthStatus, err = c.HealthCheckStatus()
if err != nil {
return err
// Do not treat a corrupted healthcheck log as a hard error.
if !errors.Is(err, define.ErrHealthCheckLogCorrupted) {
return err
}
logrus.Warnf("Container %v health check log is corrupted: %v", c.ID(), err)
}

restartCount, err = c.RestartCount()
Expand Down
40 changes: 40 additions & 0 deletions test/system/220-healthcheck.bats
Original file line number Diff line number Diff line change
Expand Up @@ -399,6 +399,46 @@ function _check_health_log {
}


@test "podman healthcheck - corrupted log file is handled gracefully" {
local TMP_DIR_HEALTHCHECK="$PODMAN_TMPDIR/healthcheck"
mkdir $TMP_DIR_HEALTHCHECK
local ctrname="c-h-$(safename)"
local msg="healthmsg-$(random_string)"
_create_container_with_health_log_settings $ctrname $msg "{{.Config.HealthLogDestination}}" "--health-log-destination $TMP_DIR_HEALTHCHECK" "$TMP_DIR_HEALTHCHECK" "HealthLogDestination"
cid="$output"

# First make sure there is an uncorrupted log.
run_podman healthcheck run $ctrname
is "$output" "" "output from 'podman healthcheck run'"

healthcheck_log_path="${TMP_DIR_HEALTHCHECK}/${cid}-healthcheck.log"
count=$(grep -co "$msg" $healthcheck_log_path)
assert "$count" -ge 1 "Number of matching health log messages"

# Corrupt the log file with invalid JSON.
echo "{invalid json{" > $healthcheck_log_path

# podman ps must not fail but should warn and report unhealthy.
run_podman 0+w ps --format '{{.Names}} {{.Status}}'
assert "$output" =~ "$ctrname" "podman ps must still list the container"
assert "$output" =~ "unhealthy" "corrupted log should be reported as unhealthy"
assert "$output" =~ "healthcheck log corrupted" "expected warning about corrupted log from ps"

# Verify that healthcheck run recovers by overwriting the corrupted
# log file with a new result while issuing a warning.
run_podman 0+w healthcheck run $ctrname
assert "$output" =~ "healthcheck log corrupted" "expected warning about corrupted log"

count=$(grep -co "$msg" $healthcheck_log_path)
assert "$count" -ge 1 "Number of matching health log messages after recovery"

# Run again to verify the warning is gone after the log was rewritten.
run_podman healthcheck run $ctrname
is "$output" "" "no warnings after log recovery"

run_podman rm -t 0 -f $ctrname
}

@test "podman healthcheck --health-log-destination journal" {
skip_if_remote "We cannot read journalctl over remote."

Expand Down