From 779dc3a19134ebe028b2b86184228c8515fcd18b Mon Sep 17 00:00:00 2001
From: Samuel Archambault
Date: Tue, 9 Sep 2025 13:41:55 -0400
Subject: [PATCH 1/2] Add goroutine-based healthcheck timers for non-systemd
 builds

Implement container healthchecks for builds without systemd by running a
per-container timer goroutine that executes the healthcheck at the
configured interval while the container is running.

Signed-off-by: Samuel Archambault
---
 libpod/healthcheck_nosystemd_linux.go | 162 +++++++++++++++++++++++++-
 1 file changed, 157 insertions(+), 5 deletions(-)

diff --git a/libpod/healthcheck_nosystemd_linux.go b/libpod/healthcheck_nosystemd_linux.go
index c338caf1cd..eb9f178395 100644
--- a/libpod/healthcheck_nosystemd_linux.go
+++ b/libpod/healthcheck_nosystemd_linux.go
@@ -4,20 +4,172 @@ package libpod
 
 import (
 	"context"
+	"fmt"
+	"time"
+
+	"github.com/containers/podman/v5/libpod/define"
+	"github.com/sirupsen/logrus"
 )
 
-// createTimer systemd timers for healthchecks of a container
+// healthcheckTimer manages the background goroutine for healthchecks
+type healthcheckTimer struct {
+	container *Container
+	interval  time.Duration
+	ctx       context.Context
+	cancel    context.CancelFunc
+	done      chan struct{}
+}
+
+// activeTimers tracks healthcheck timer goroutines by container ID (process-local and not synchronized; ideally this would live on the Runtime)
+var activeTimers = make(map[string]*healthcheckTimer)
+
+// disableHealthCheckSystemd returns true if healthcheck should be disabled
+// For non-systemd builds, we only disable if interval is 0
+func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
+	if isStartup {
+		if c.config.StartupHealthCheckConfig != nil && c.config.StartupHealthCheckConfig.Interval == 0 {
+			return true
+		}
+	}
+	if c.config.HealthCheckConfig != nil && c.config.HealthCheckConfig.Interval == 0 {
+		return true
+	}
+	return false
+}
+
+// createTimer creates a goroutine-based timer for healthchecks of a container
 func (c *Container) createTimer(interval string, isStartup bool) error {
+	if c.disableHealthCheckSystemd(isStartup) {
+		return nil
+	}
+
+	// Parse the interval duration
+	duration, err := time.ParseDuration(interval)
+	if err != nil {
+		return err
+	}
+
+	// Stop any existing timer
+	if c.state.HCUnitName != "" {
+		c.stopHealthCheckTimer()
+	}
+
+	// Create context for cancellation
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// Create timer struct
+	timer := &healthcheckTimer{
+		container: c,
+		interval:  duration,
+		ctx:       ctx,
+		cancel:    cancel,
+		done:      make(chan struct{}),
+	}
+
+	// Store timer reference globally and in container state
+	activeTimers[c.ID()] = timer
+	c.state.HCUnitName = "goroutine-timer"
+
+	if err := c.save(); err != nil {
+		cancel()
+		delete(activeTimers, c.ID())
+		return fmt.Errorf("saving container %s healthcheck timer: %w", c.ID(), err)
+	}
+
+	// Start the background goroutine
+	go timer.run()
+
+	logrus.Debugf("Created goroutine-based healthcheck timer for container %s with interval %s", c.ID(), interval)
 	return nil
 }
 
-// startTimer starts a systemd timer for the healthchecks
+// startTimer starts the goroutine-based timer for healthchecks
 func (c *Container) startTimer(isStartup bool) error {
+	// Timer is already started in createTimer, nothing to do
 	return nil
 }
 
-// removeTransientFiles removes the systemd timer and unit files
-// for the container
+// removeTransientFiles stops the goroutine-based timer
 func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
-	return nil
+	return c.stopHealthCheckTimer()
+}
+
+// stopHealthCheckTimer stops the background healthcheck goroutine
+func (c *Container) stopHealthCheckTimer() error {
+	timer, exists := activeTimers[c.ID()]
+	if !exists {
+		logrus.Debugf("No active healthcheck timer found for container %s", c.ID())
+		return nil
+	}
+
+	logrus.Debugf("Stopping healthcheck timer for container %s", c.ID())
+
+	// Cancel the context to stop the goroutine
+	timer.cancel()
+
+	// Wait for the goroutine to finish (with timeout)
+	select {
+	case <-timer.done:
+		logrus.Debugf("Healthcheck timer for container %s stopped gracefully", c.ID())
+	case <-time.After(5 * time.Second):
+		logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
+	}
+
+	// Remove from active timers
+	delete(activeTimers, c.ID())
+
+	// Clear the unit name
+	c.state.HCUnitName = ""
+	return c.save()
+}
+
+// run executes the healthcheck in a loop with the specified interval
+func (t *healthcheckTimer) run() {
+	defer close(t.done)
+
+	ticker := time.NewTicker(t.interval)
+	defer ticker.Stop()
+
+	logrus.Debugf("Starting healthcheck timer for container %s with interval %s", t.container.ID(), t.interval)
+
+	for {
+		select {
+		case <-t.ctx.Done():
+			logrus.Debugf("Healthcheck timer for container %s stopped", t.container.ID())
+			return
+		case <-ticker.C:
+			// Run the healthcheck
+			if err := t.runHealthCheck(); err != nil {
+				logrus.Errorf("Healthcheck failed for container %s: %v", t.container.ID(), err)
+			}
+		}
+	}
+}
+
+// runHealthCheck executes a single healthcheck
+func (t *healthcheckTimer) runHealthCheck() error {
+	// Check if container is still running (without holding lock to avoid deadlock)
+	state, err := t.container.State()
+	if err != nil {
+		return err
+	}
+
+	if state != define.ContainerStateRunning {
+		logrus.Debugf("Container %s is not running (state: %v), skipping healthcheck", t.container.ID(), state)
+		return nil
+	}
+
+	// Get healthcheck config (without holding lock)
+	healthConfig := t.container.HealthCheckConfig()
+	if healthConfig == nil {
+		logrus.Debugf("No healthcheck config found for container %s, skipping healthcheck", t.container.ID())
+		return nil
+	}
+
+	// Run the healthcheck - let runHealthCheck handle its own locking internally
+	ctx, cancel := context.WithTimeout(context.Background(), healthConfig.Timeout)
+	defer cancel()
+
+	_, _, err = t.container.runHealthCheck(ctx, false)
+	return err
+}
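Reviewer note (not part of the patch): activeTimers is a plain map that
createTimer, startTimer, stopHealthCheckTimer, and (in the next patch)
ReattachHealthCheckTimers all touch. If any two of those paths can run
concurrently for different containers, the map needs its own lock; the
per-container lock does not cover it. Below is a minimal sketch of a
mutex-guarded registry, assuming the healthcheckTimer type from the patch
above; the registry name and its methods are illustrative, not existing
podman API.

package libpod

import "sync"

// timerRegistry is a mutex-guarded alternative to the package-level
// activeTimers map. Holding one lock keeps lookup-then-delete
// sequences atomic.
type timerRegistry struct {
	mu     sync.Mutex
	timers map[string]*healthcheckTimer
}

func newTimerRegistry() *timerRegistry {
	return &timerRegistry{timers: make(map[string]*healthcheckTimer)}
}

// set registers the timer for a container ID, replacing any previous entry.
func (r *timerRegistry) set(id string, t *healthcheckTimer) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.timers[id] = t
}

// pop removes and returns the timer for a container ID, if present.
func (r *timerRegistry) pop(id string) (*healthcheckTimer, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()
	t, ok := r.timers[id]
	if ok {
		delete(r.timers, id)
	}
	return t, ok
}

With a registry like this, stopHealthCheckTimer would call pop once instead
of a separate lookup and delete.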
From 5fe62815bfa742ea2bc2e42791b927c9f1f6163e Mon Sep 17 00:00:00 2001
From: Samuel Archambault
Date: Wed, 10 Sep 2025 07:32:55 -0400
Subject: [PATCH 2/2] Reattach healthcheck timers on runtime start and stop
 them via a stop file

Recreate healthcheck timers for running containers when the runtime starts,
and record a per-container stop file so that another process (for example,
container cleanup) can signal a running timer goroutine to exit.

Signed-off-by: Samuel Archambault
---
 libpod/container.go                   |   2 +
 libpod/healthcheck_linux.go           |   8 ++
 libpod/healthcheck_nosystemd_linux.go | 125 ++++++++++++++++++++------
 libpod/healthcheck_unsupported.go     |   6 ++
 libpod/runtime.go                     |  10 +++
 5 files changed, 124 insertions(+), 27 deletions(-)

diff --git a/libpod/container.go b/libpod/container.go
index e7882efa40..01a93eac11 100644
--- a/libpod/container.go
+++ b/libpod/container.go
@@ -164,6 +164,8 @@ type ContainerState struct {
 	PID int `json:"pid,omitempty"`
 	// ConmonPID is the PID of the container's conmon
 	ConmonPID int `json:"conmonPid,omitempty"`
+	// HealthCheckStopFile is the path to a file that signals the healthcheck timer to stop (nosystemd only)
+	HealthCheckStopFile string `json:"healthCheckStopFile,omitempty"`
 	// ExecSessions contains all exec sessions that are associated with this
 	// container.
 	ExecSessions map[string]*ExecSession `json:"newExecSessions,omitempty"`
diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go
index 2723c1eb21..4452f0bd79 100644
--- a/libpod/healthcheck_linux.go
+++ b/libpod/healthcheck_linux.go
@@ -18,6 +18,14 @@ import (
 	systemdCommon "go.podman.io/common/pkg/systemd"
 )
 
+// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
+// This is a no-op for systemd builds since systemd manages healthcheck timers independently
+func ReattachHealthCheckTimers(containers []*Container) {
+	// Systemd healthchecks are managed by systemd and don't need reattachment
+	// The timers persist across podman restarts because they're systemd units
+	logrus.Debugf("Skipping healthcheck reattachment for systemd build - timers are managed by systemd")
+}
+
 // createTimer systemd timers for healthchecks of a container
 func (c *Container) createTimer(interval string, isStartup bool) error {
 	if c.disableHealthCheckSystemd(isStartup) {
diff --git a/libpod/healthcheck_nosystemd_linux.go b/libpod/healthcheck_nosystemd_linux.go
index eb9f178395..deef60acfe 100644
--- a/libpod/healthcheck_nosystemd_linux.go
+++ b/libpod/healthcheck_nosystemd_linux.go
@@ -5,6 +5,8 @@ package libpod
 import (
 	"context"
 	"fmt"
+	"os"
+	"path/filepath"
 	"time"
 
 	"github.com/containers/podman/v5/libpod/define"
@@ -23,6 +25,42 @@ type healthcheckTimer struct {
 // activeTimers tracks healthcheck timer goroutines by container ID (process-local and not synchronized; ideally this would live on the Runtime)
 var activeTimers = make(map[string]*healthcheckTimer)
 
+// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
+// This implementation is for nosystemd builds where healthchecks are managed by goroutines
+func ReattachHealthCheckTimers(containers []*Container) {
+	for _, ctr := range containers {
+		// Only reattach for running containers with healthcheck configs
+		if ctr.state.State != define.ContainerStateRunning {
+			continue
+		}
+
+		// Check if container has healthcheck config
+		if ctr.config.HealthCheckConfig == nil {
+			continue
+		}
+
+		// Check if timer is already running
+		if _, exists := activeTimers[ctr.ID()]; exists {
+			continue
+		}
+
+		// Check if this is a startup healthcheck that hasn't passed yet
+		if ctr.config.StartupHealthCheckConfig != nil && !ctr.state.StartupHCPassed {
+			// Reattach startup healthcheck
+			interval := ctr.config.StartupHealthCheckConfig.StartInterval.String()
+			if err := ctr.createTimer(interval, true); err != nil {
+				logrus.Errorf("Failed to reattach startup healthcheck timer for container %s: %v", ctr.ID(), err)
+			}
+		} else {
+			// Reattach regular healthcheck
+			interval := ctr.config.HealthCheckConfig.Interval.String()
+			if err := ctr.createTimer(interval, false); err != nil {
+				logrus.Errorf("Failed to reattach healthcheck timer for container %s: %v", ctr.ID(), err)
+			}
+		}
+	}
+}
+
 // disableHealthCheckSystemd returns true if healthcheck should be disabled
 // For non-systemd builds, we only disable if interval is 0
 func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
@@ -49,9 +87,19 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 		return err
 	}
 
-	// Stop any existing timer
+	// Stop any existing timer only if there's actually an active timer in memory
 	if c.state.HCUnitName != "" {
-		c.stopHealthCheckTimer()
+		// Check if there's an active timer in memory before stopping
+		if _, exists := activeTimers[c.ID()]; exists {
+			c.stopHealthCheckTimer()
+		} else {
+			// No active timer in memory, just clear the state without creating stop file
+			c.state.HCUnitName = ""
+			c.state.HealthCheckStopFile = ""
+			if err := c.save(); err != nil {
+				return fmt.Errorf("clearing container %s healthcheck state: %w", c.ID(), err)
+			}
+		}
 	}
 
 	// Create context for cancellation
@@ -69,6 +117,9 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 	// Store timer reference globally and in container state
 	activeTimers[c.ID()] = timer
 	c.state.HCUnitName = "goroutine-timer"
+	// Create a stop file for cross-process cleanup
+	stopFile := filepath.Join(c.runtime.config.Engine.TmpDir, fmt.Sprintf("healthcheck-stop-%s", c.ID()))
+	c.state.HealthCheckStopFile = stopFile
 
 	if err := c.save(); err != nil {
 		cancel()
@@ -79,13 +130,25 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 	// Start the background goroutine
 	go timer.run()
 
-	logrus.Debugf("Created goroutine-based healthcheck timer for container %s with interval %s", c.ID(), interval)
 	return nil
 }
 
 // startTimer starts the goroutine-based timer for healthchecks
 func (c *Container) startTimer(isStartup bool) error {
-	// Timer is already started in createTimer, nothing to do
+	// Check if timer already exists
+	if _, exists := activeTimers[c.ID()]; exists {
+		return nil
+	}
+
+	// Create timer if it doesn't exist
+	if c.config.HealthCheckConfig != nil {
+		interval := c.config.HealthCheckConfig.Interval.String()
+		if c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed {
+			interval = c.config.StartupHealthCheckConfig.StartInterval.String()
+		}
+		return c.createTimer(interval, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed)
+	}
+
 	return nil
 }
 
@@ -96,30 +159,32 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
 
 // stopHealthCheckTimer stops the background healthcheck goroutine
 func (c *Container) stopHealthCheckTimer() error {
+	// First try to stop using the in-memory map (same process)
 	timer, exists := activeTimers[c.ID()]
-	if !exists {
-		logrus.Debugf("No active healthcheck timer found for container %s", c.ID())
-		return nil
-	}
+	if exists {
+		// Cancel the context to stop the goroutine
+		timer.cancel()
 
-	logrus.Debugf("Stopping healthcheck timer for container %s", c.ID())
-
-	// Cancel the context to stop the goroutine
-	timer.cancel()
+		// Wait for the goroutine to finish (with timeout)
+		select {
+		case <-timer.done:
+			// Timer stopped gracefully
+		case <-time.After(5 * time.Second):
+			logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
+		}
 
-	// Wait for the goroutine to finish (with timeout)
-	select {
-	case <-timer.done:
-		logrus.Debugf("Healthcheck timer for container %s stopped gracefully", c.ID())
-	case <-time.After(5 * time.Second):
-		logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
+		// Remove from active timers
+		delete(activeTimers, c.ID())
+	} else if c.state.HealthCheckStopFile != "" {
+		// Called from different process (cleanup), create stop file
+		if err := os.WriteFile(c.state.HealthCheckStopFile, []byte("stop"), 0644); err != nil {
+			logrus.Errorf("Failed to create healthcheck stop file for container %s: %v", c.ID(), err)
+		}
 	}
 
-	// Remove from active timers
-	delete(activeTimers, c.ID())
-
-	// Clear the unit name
+	// Clear the unit name and stop file
 	c.state.HCUnitName = ""
+	c.state.HealthCheckStopFile = ""
 	return c.save()
 }
 
@@ -130,14 +195,22 @@ func (t *healthcheckTimer) run() {
 	ticker := time.NewTicker(t.interval)
 	defer ticker.Stop()
 
-	logrus.Debugf("Starting healthcheck timer for container %s with interval %s", t.container.ID(), t.interval)
-
 	for {
 		select {
 		case <-t.ctx.Done():
-			logrus.Debugf("Healthcheck timer for container %s stopped", t.container.ID())
 			return
 		case <-ticker.C:
+			// Check for stop file (cross-process cleanup)
+			if t.container.state.HealthCheckStopFile != "" {
+				if _, err := os.Stat(t.container.state.HealthCheckStopFile); err == nil {
+					// Clean up the stop file
+					if err := os.Remove(t.container.state.HealthCheckStopFile); err != nil {
+						logrus.Warnf("Failed to remove stop file for container %s: %v", t.container.ID(), err)
+					}
+					return
+				}
+			}
+
 			// Run the healthcheck
 			if err := t.runHealthCheck(); err != nil {
 				logrus.Errorf("Healthcheck failed for container %s: %v", t.container.ID(), err)
@@ -155,14 +228,12 @@ func (t *healthcheckTimer) runHealthCheck() error {
 	}
 
 	if state != define.ContainerStateRunning {
-		logrus.Debugf("Container %s is not running (state: %v), skipping healthcheck", t.container.ID(), state)
 		return nil
 	}
 
 	// Get healthcheck config (without holding lock)
 	healthConfig := t.container.HealthCheckConfig()
 	if healthConfig == nil {
-		logrus.Debugf("No healthcheck config found for container %s, skipping healthcheck", t.container.ID())
 		return nil
 	}
 
diff --git a/libpod/healthcheck_unsupported.go b/libpod/healthcheck_unsupported.go
index 8d733698b8..da2e0b95fe 100644
--- a/libpod/healthcheck_unsupported.go
+++ b/libpod/healthcheck_unsupported.go
@@ -6,6 +6,12 @@ import (
 	"context"
 )
 
+// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
+// This is a no-op for unsupported platforms since healthchecks are not supported
+func ReattachHealthCheckTimers(containers []*Container) {
+	// Healthchecks are not supported on this platform
+}
+
 // createTimer systemd timers for healthchecks of a container
 func (c *Container) createTimer(interval string, isStartup bool) error {
 	return nil
diff --git a/libpod/runtime.go b/libpod/runtime.go
index d6d5364874..5113702013 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -647,6 +647,16 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (retErr error) {
 
 	runtime.startWorker()
 
+	// Reattach healthcheck timers for running containers after podman restart
+	// This is only needed for the nosystemd build where healthchecks are managed by goroutines
+	// Systemd healthchecks are managed by systemd and don't need reattachment
+	ctrs, err := runtime.state.AllContainers(true)
+	if err != nil {
+		logrus.Errorf("Failed to get containers for healthcheck reattachment: %v", err)
+	} else {
+		ReattachHealthCheckTimers(ctrs)
+	}
+
 	return nil
 }
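Reviewer note (not part of the patch): the loop in healthcheckTimer.run polls
for the stop file on each tick and exits once it appears. The standalone
sketch below reproduces that handshake with no libpod types, so it can be run
directly with `go run` to observe the behavior; all names in it are
illustrative.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"time"
)

// runUntilStopped ticks at the given interval and returns once stopFile
// exists, mirroring the polling in healthcheckTimer.run above.
func runUntilStopped(stopFile string, interval time.Duration, work func()) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for range ticker.C {
		if _, err := os.Stat(stopFile); err == nil {
			// Clean up the stop file, as the patch does, then exit.
			os.Remove(stopFile)
			return
		}
		work()
	}
}

func main() {
	stop := filepath.Join(os.TempDir(), "healthcheck-stop-demo")
	os.Remove(stop)

	// Simulate "another process" signaling stop after a few ticks.
	go func() {
		time.Sleep(350 * time.Millisecond)
		os.WriteFile(stop, []byte("stop"), 0o644)
	}()

	runUntilStopped(stop, 100*time.Millisecond, func() {
		fmt.Println("healthcheck tick")
	})
	fmt.Println("timer stopped via stop file")
}

The same caveat applies as in the patch: a full tick must elapse before the
stop file is noticed, so shutdown latency is bounded by the healthcheck
interval.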