From 779dc3a19134ebe028b2b86184228c8515fcd18b Mon Sep 17 00:00:00 2001
From: Samuel Archambault
Date: Tue, 9 Sep 2025 13:41:55 -0400
Subject: [PATCH 1/2] Add goroutine-based healthcheck timers for non-systemd
 builds

Implement container healthchecks for builds without systemd by running a
per-container timer goroutine that executes the healthcheck at the
configured interval while the container is running.

Signed-off-by: Samuel Archambault
---
 libpod/healthcheck_nosystemd_linux.go | 162 +++++++++++++++++++++++++-
 1 file changed, 157 insertions(+), 5 deletions(-)

diff --git a/libpod/healthcheck_nosystemd_linux.go b/libpod/healthcheck_nosystemd_linux.go
index c338caf1cd..eb9f178395 100644
--- a/libpod/healthcheck_nosystemd_linux.go
+++ b/libpod/healthcheck_nosystemd_linux.go
@@ -4,20 +4,172 @@ package libpod
 
 import (
 	"context"
+	"fmt"
+	"time"
+
+	"github.com/containers/podman/v5/libpod/define"
+	"github.com/sirupsen/logrus"
 )
 
-// createTimer systemd timers for healthchecks of a container
+// healthcheckTimer manages the background goroutine for healthchecks
+type healthcheckTimer struct {
+	container *Container
+	interval  time.Duration
+	ctx       context.Context
+	cancel    context.CancelFunc
+	done      chan struct{}
+}
+
+// activeTimers tracks healthcheck timer goroutines by container ID (process-local and not synchronized; ideally this would live on the Runtime)
+var activeTimers = make(map[string]*healthcheckTimer)
+
+// disableHealthCheckSystemd returns true if healthcheck should be disabled
+// For non-systemd builds, we only disable if interval is 0
+func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
+	if isStartup {
+		if c.config.StartupHealthCheckConfig != nil && c.config.StartupHealthCheckConfig.Interval == 0 {
+			return true
+		}
+	}
+	if c.config.HealthCheckConfig != nil && c.config.HealthCheckConfig.Interval == 0 {
+		return true
+	}
+	return false
+}
+
+// createTimer creates a goroutine-based timer for healthchecks of a container
 func (c *Container) createTimer(interval string, isStartup bool) error {
+	if c.disableHealthCheckSystemd(isStartup) {
+		return nil
+	}
+
+	// Parse the interval duration
+	duration, err := time.ParseDuration(interval)
+	if err != nil {
+		return err
+	}
+
+	// Stop any existing timer
+	if c.state.HCUnitName != "" {
+		c.stopHealthCheckTimer()
+	}
+
+	// Create context for cancellation
+	ctx, cancel := context.WithCancel(context.Background())
+
+	// Create timer struct
+	timer := &healthcheckTimer{
+		container: c,
+		interval:  duration,
+		ctx:       ctx,
+		cancel:    cancel,
+		done:      make(chan struct{}),
+	}
+
+	// Store timer reference globally and in container state
+	activeTimers[c.ID()] = timer
+	c.state.HCUnitName = "goroutine-timer"
+
+	if err := c.save(); err != nil {
+		cancel()
+		delete(activeTimers, c.ID())
+		return fmt.Errorf("saving container %s healthcheck timer: %w", c.ID(), err)
+	}
+
+	// Start the background goroutine
+	go timer.run()
+
+	logrus.Debugf("Created goroutine-based healthcheck timer for container %s with interval %s", c.ID(), interval)
 	return nil
 }
 
-// startTimer starts a systemd timer for the healthchecks
+// startTimer starts the goroutine-based timer for healthchecks
 func (c *Container) startTimer(isStartup bool) error {
+	// Timer is already started in createTimer, nothing to do
 	return nil
 }
 
-// removeTransientFiles removes the systemd timer and unit files
-// for the container
+// removeTransientFiles stops the goroutine-based timer
 func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
-	return nil
+	return c.stopHealthCheckTimer()
+}
+
+// stopHealthCheckTimer stops the background healthcheck goroutine
+func (c *Container) stopHealthCheckTimer() error {
+	timer, exists := activeTimers[c.ID()]
+	if !exists {
+		logrus.Debugf("No active healthcheck timer found for container %s", c.ID())
+		return nil
+	}
+
+	logrus.Debugf("Stopping healthcheck timer for container %s", c.ID())
+
+	// Cancel the context to stop the goroutine
+	timer.cancel()
+
+	// Wait for the goroutine to finish (with timeout)
+	select {
+	case <-timer.done:
+		logrus.Debugf("Healthcheck timer for container %s stopped gracefully", c.ID())
+	case <-time.After(5 * time.Second):
+		logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
+	}
+
+	// Remove from active timers
+	delete(activeTimers, c.ID())
+
+	// Clear the unit name
+	c.state.HCUnitName = ""
+	return c.save()
+}
+
+// run executes the healthcheck in a loop with the specified interval
+func (t *healthcheckTimer) run() {
+	defer close(t.done)
+
+	ticker := time.NewTicker(t.interval)
+	defer ticker.Stop()
+
+	logrus.Debugf("Starting healthcheck timer for container %s with interval %s", t.container.ID(), t.interval)
+
+	for {
+		select {
+		case <-t.ctx.Done():
+			logrus.Debugf("Healthcheck timer for container %s stopped", t.container.ID())
+			return
+		case <-ticker.C:
+			// Run the healthcheck
+			if err := t.runHealthCheck(); err != nil {
+				logrus.Errorf("Healthcheck failed for container %s: %v", t.container.ID(), err)
+			}
+		}
+	}
+}
+
+// runHealthCheck executes a single healthcheck
+func (t *healthcheckTimer) runHealthCheck() error {
+	// Check if container is still running (without holding lock to avoid deadlock)
+	state, err := t.container.State()
+	if err != nil {
+		return err
+	}
+
+	if state != define.ContainerStateRunning {
+		logrus.Debugf("Container %s is not running (state: %v), skipping healthcheck", t.container.ID(), state)
+		return nil
+	}
+
+	// Get healthcheck config (without holding lock)
+	healthConfig := t.container.HealthCheckConfig()
+	if healthConfig == nil {
+		logrus.Debugf("No healthcheck config found for container %s, skipping healthcheck", t.container.ID())
+		return nil
+	}
+
+	// Run the healthcheck - let runHealthCheck handle its own locking internally
+	ctx, cancel := context.WithTimeout(context.Background(), healthConfig.Timeout)
+	defer cancel()
+
+	_, _, err = t.container.runHealthCheck(ctx, false)
+	return err
+}
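Reviewer note (not part of the patch): activeTimers is a plain map that
createTimer, startTimer, stopHealthCheckTimer, and (in the next patch)
ReattachHealthCheckTimers all touch. If any two of those paths can run
concurrently for different containers, the map needs its own lock; the
per-container lock does not cover it. Below is a minimal sketch of a
mutex-guarded registry, assuming the healthcheckTimer type from the patch
above; the registry name and its methods are illustrative, not existing
podman API.

package libpod

import "sync"

// timerRegistry is a mutex-guarded alternative to the package-level
// activeTimers map. Holding one lock keeps lookup-then-delete
// sequences atomic.
type timerRegistry struct {
	mu     sync.Mutex
	timers map[string]*healthcheckTimer
}

func newTimerRegistry() *timerRegistry {
	return &timerRegistry{timers: make(map[string]*healthcheckTimer)}
}

// set registers the timer for a container ID, replacing any previous entry.
func (r *timerRegistry) set(id string, t *healthcheckTimer) {
	r.mu.Lock()
	defer r.mu.Unlock()
	r.timers[id] = t
}

// pop removes and returns the timer for a container ID, if present.
func (r *timerRegistry) pop(id string) (*healthcheckTimer, bool) {
	r.mu.Lock()
	defer r.mu.Unlock()
	t, ok := r.timers[id]
	if ok {
		delete(r.timers, id)
	}
	return t, ok
}

With a registry like this, stopHealthCheckTimer would call pop once instead
of a separate lookup and delete.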
From 5fe62815bfa742ea2bc2e42791b927c9f1f6163e Mon Sep 17 00:00:00 2001
From: Samuel Archambault
Date: Wed, 10 Sep 2025 07:32:55 -0400
Subject: [PATCH 2/2] Reattach healthcheck timers on runtime start and stop
 them via a stop file

Recreate healthcheck timers for running containers when the runtime starts,
and record a per-container stop file so that another process (for example,
container cleanup) can signal a running timer goroutine to exit.

Signed-off-by: Samuel Archambault
---
 libpod/container.go                   |   2 +
 libpod/healthcheck_linux.go           |   8 ++
 libpod/healthcheck_nosystemd_linux.go | 125 ++++++++++++++++++++------
 libpod/healthcheck_unsupported.go     |   6 ++
 libpod/runtime.go                     |  10 +++
 5 files changed, 124 insertions(+), 27 deletions(-)

diff --git a/libpod/container.go b/libpod/container.go
index e7882efa40..01a93eac11 100644
--- a/libpod/container.go
+++ b/libpod/container.go
@@ -164,6 +164,8 @@ type ContainerState struct {
 	PID int `json:"pid,omitempty"`
 	// ConmonPID is the PID of the container's conmon
 	ConmonPID int `json:"conmonPid,omitempty"`
+	// HealthCheckStopFile is the path to a file that signals the healthcheck timer to stop (nosystemd only)
+	HealthCheckStopFile string `json:"healthCheckStopFile,omitempty"`
 	// ExecSessions contains all exec sessions that are associated with this
 	// container.
 	ExecSessions map[string]*ExecSession `json:"newExecSessions,omitempty"`
diff --git a/libpod/healthcheck_linux.go b/libpod/healthcheck_linux.go
index 2723c1eb21..4452f0bd79 100644
--- a/libpod/healthcheck_linux.go
+++ b/libpod/healthcheck_linux.go
@@ -18,6 +18,14 @@ import (
 	systemdCommon "go.podman.io/common/pkg/systemd"
 )
 
+// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
+// This is a no-op for systemd builds since systemd manages healthcheck timers independently
+func ReattachHealthCheckTimers(containers []*Container) {
+	// Systemd healthchecks are managed by systemd and don't need reattachment
+	// The timers persist across podman restarts because they're systemd units
+	logrus.Debugf("Skipping healthcheck reattachment for systemd build - timers are managed by systemd")
+}
+
 // createTimer systemd timers for healthchecks of a container
 func (c *Container) createTimer(interval string, isStartup bool) error {
 	if c.disableHealthCheckSystemd(isStartup) {
diff --git a/libpod/healthcheck_nosystemd_linux.go b/libpod/healthcheck_nosystemd_linux.go
index eb9f178395..deef60acfe 100644
--- a/libpod/healthcheck_nosystemd_linux.go
+++ b/libpod/healthcheck_nosystemd_linux.go
@@ -5,6 +5,8 @@ package libpod
 import (
 	"context"
 	"fmt"
+	"os"
+	"path/filepath"
 	"time"
 
 	"github.com/containers/podman/v5/libpod/define"
@@ -23,6 +25,42 @@ type healthcheckTimer struct {
 // activeTimers tracks healthcheck timer goroutines by container ID (process-local and not synchronized; ideally this would live on the Runtime)
 var activeTimers = make(map[string]*healthcheckTimer)
 
+// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
+// This implementation is for nosystemd builds where healthchecks are managed by goroutines
+func ReattachHealthCheckTimers(containers []*Container) {
+	for _, ctr := range containers {
+		// Only reattach for running containers with healthcheck configs
+		if ctr.state.State != define.ContainerStateRunning {
+			continue
+		}
+
+		// Check if container has healthcheck config
+		if ctr.config.HealthCheckConfig == nil {
+			continue
+		}
+
+		// Check if timer is already running
+		if _, exists := activeTimers[ctr.ID()]; exists {
+			continue
+		}
+
+		// Check if this is a startup healthcheck that hasn't passed yet
+		if ctr.config.StartupHealthCheckConfig != nil && !ctr.state.StartupHCPassed {
+			// Reattach startup healthcheck
+			interval := ctr.config.StartupHealthCheckConfig.StartInterval.String()
+			if err := ctr.createTimer(interval, true); err != nil {
+				logrus.Errorf("Failed to reattach startup healthcheck timer for container %s: %v", ctr.ID(), err)
+			}
+		} else {
+			// Reattach regular healthcheck
+			interval := ctr.config.HealthCheckConfig.Interval.String()
+			if err := ctr.createTimer(interval, false); err != nil {
+				logrus.Errorf("Failed to reattach healthcheck timer for container %s: %v", ctr.ID(), err)
+			}
+		}
+	}
+}
+
 // disableHealthCheckSystemd returns true if healthcheck should be disabled
 // For non-systemd builds, we only disable if interval is 0
 func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
@@ -49,9 +87,19 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 		return err
 	}
 
-	// Stop any existing timer
+	// Stop any existing timer only if there's actually an active timer in memory
 	if c.state.HCUnitName != "" {
-		c.stopHealthCheckTimer()
+		// Check if there's an active timer in memory before stopping
+		if _, exists := activeTimers[c.ID()]; exists {
+			c.stopHealthCheckTimer()
+		} else {
+			// No active timer in memory, just clear the state without creating stop file
+			c.state.HCUnitName = ""
+			c.state.HealthCheckStopFile = ""
+			if err := c.save(); err != nil {
+				return fmt.Errorf("clearing container %s healthcheck state: %w", c.ID(), err)
+			}
+		}
 	}
 
 	// Create context for cancellation
@@ -69,6 +117,9 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 	// Store timer reference globally and in container state
 	activeTimers[c.ID()] = timer
 	c.state.HCUnitName = "goroutine-timer"
+	// Create a stop file for cross-process cleanup
+	stopFile := filepath.Join(c.runtime.config.Engine.TmpDir, fmt.Sprintf("healthcheck-stop-%s", c.ID()))
+	c.state.HealthCheckStopFile = stopFile
 
 	if err := c.save(); err != nil {
 		cancel()
@@ -79,13 +130,25 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
 	// Start the background goroutine
 	go timer.run()
 
-	logrus.Debugf("Created goroutine-based healthcheck timer for container %s with interval %s", c.ID(), interval)
 	return nil
 }
 
 // startTimer starts the goroutine-based timer for healthchecks
 func (c *Container) startTimer(isStartup bool) error {
-	// Timer is already started in createTimer, nothing to do
+	// Check if timer already exists
+	if _, exists := activeTimers[c.ID()]; exists {
+		return nil
+	}
+
+	// Create timer if it doesn't exist
+	if c.config.HealthCheckConfig != nil {
+		interval := c.config.HealthCheckConfig.Interval.String()
+		if c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed {
+			interval = c.config.StartupHealthCheckConfig.StartInterval.String()
+		}
+		return c.createTimer(interval, c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed)
+	}
+
 	return nil
 }
 
@@ -96,30 +159,32 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
 
 // stopHealthCheckTimer stops the background healthcheck goroutine
 func (c *Container) stopHealthCheckTimer() error {
+	// First try to stop using the in-memory map (same process)
 	timer, exists := activeTimers[c.ID()]
-	if !exists {
-		logrus.Debugf("No active healthcheck timer found for container %s", c.ID())
-		return nil
-	}
+	if exists {
+		// Cancel the context to stop the goroutine
+		timer.cancel()
 
-	logrus.Debugf("Stopping healthcheck timer for container %s", c.ID())
-
-	// Cancel the context to stop the goroutine
-	timer.cancel()
+		// Wait for the goroutine to finish (with timeout)
+		select {
+		case <-timer.done:
+			// Timer stopped gracefully
+		case <-time.After(5 * time.Second):
+			logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
+		}
 
-	// Wait for the goroutine to finish (with timeout)
-	select {
-	case <-timer.done:
-		logrus.Debugf("Healthcheck timer for container %s stopped gracefully", c.ID())
-	case <-time.After(5 * time.Second):
-		logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
+		// Remove from active timers
+		delete(activeTimers, c.ID())
+	} else if c.state.HealthCheckStopFile != "" {
+		// Called from different process (cleanup), create stop file
+		if err := os.WriteFile(c.state.HealthCheckStopFile, []byte("stop"), 0644); err != nil {
+			logrus.Errorf("Failed to create healthcheck stop file for container %s: %v", c.ID(), err)
+		}
 	}
 
-	// Remove from active timers
-	delete(activeTimers, c.ID())
-
-	// Clear the unit name
+	// Clear the unit name and stop file
 	c.state.HCUnitName = ""
+	c.state.HealthCheckStopFile = ""
 	return c.save()
 }
 
@@ -130,14 +195,22 @@ func (t *healthcheckTimer) run() {
 	ticker := time.NewTicker(t.interval)
 	defer ticker.Stop()
 
-	logrus.Debugf("Starting healthcheck timer for container %s with interval %s", t.container.ID(), t.interval)
-
 	for {
 		select {
 		case <-t.ctx.Done():
-			logrus.Debugf("Healthcheck timer for container %s stopped", t.container.ID())
 			return
 		case <-ticker.C:
+			// Check for stop file (cross-process cleanup)
+			if t.container.state.HealthCheckStopFile != "" {
+				if _, err := os.Stat(t.container.state.HealthCheckStopFile); err == nil {
+					// Clean up the stop file
+					if err := os.Remove(t.container.state.HealthCheckStopFile); err != nil {
+						logrus.Warnf("Failed to remove stop file for container %s: %v", t.container.ID(), err)
+					}
+					return
+				}
+			}
+
 			// Run the healthcheck
 			if err := t.runHealthCheck(); err != nil {
 				logrus.Errorf("Healthcheck failed for container %s: %v", t.container.ID(), err)
@@ -155,14 +228,12 @@ func (t *healthcheckTimer) runHealthCheck() error {
 	}
 
 	if state != define.ContainerStateRunning {
-		logrus.Debugf("Container %s is not running (state: %v), skipping healthcheck", t.container.ID(), state)
 		return nil
 	}
 
 	// Get healthcheck config (without holding lock)
 	healthConfig := t.container.HealthCheckConfig()
 	if healthConfig == nil {
-		logrus.Debugf("No healthcheck config found for container %s, skipping healthcheck", t.container.ID())
 		return nil
 	}
 
diff --git a/libpod/healthcheck_unsupported.go b/libpod/healthcheck_unsupported.go
index 8d733698b8..da2e0b95fe 100644
--- a/libpod/healthcheck_unsupported.go
+++ b/libpod/healthcheck_unsupported.go
@@ -6,6 +6,12 @@ import (
 	"context"
 )
 
+// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
+// This is a no-op for unsupported platforms since healthchecks are not supported
+func ReattachHealthCheckTimers(containers []*Container) {
+	// Healthchecks are not supported on this platform
+}
+
 // createTimer systemd timers for healthchecks of a container
 func (c *Container) createTimer(interval string, isStartup bool) error {
 	return nil
diff --git a/libpod/runtime.go b/libpod/runtime.go
index d6d5364874..5113702013 100644
--- a/libpod/runtime.go
+++ b/libpod/runtime.go
@@ -647,6 +647,16 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (retErr error) {
 
 	runtime.startWorker()
 
+	// Reattach healthcheck timers for running containers after podman restart
+	// This is only needed for the nosystemd build where healthchecks are managed by goroutines
+	// Systemd healthchecks are managed by systemd and don't need reattachment
+	ctrs, err := runtime.state.AllContainers(true)
+	if err != nil {
+		logrus.Errorf("Failed to get containers for healthcheck reattachment: %v", err)
+	} else {
+		ReattachHealthCheckTimers(ctrs)
+	}
+
 	return nil
 }
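Reviewer note (not part of the patch): the loop in healthcheckTimer.run polls
for the stop file on each tick and exits once it appears. The standalone
sketch below reproduces that handshake with no libpod types, so it can be run
directly with `go run` to observe the behavior; all names in it are
illustrative.

package main

import (
	"fmt"
	"os"
	"path/filepath"
	"time"
)

// runUntilStopped ticks at the given interval and returns once stopFile
// exists, mirroring the polling in healthcheckTimer.run above.
func runUntilStopped(stopFile string, interval time.Duration, work func()) {
	ticker := time.NewTicker(interval)
	defer ticker.Stop()
	for range ticker.C {
		if _, err := os.Stat(stopFile); err == nil {
			// Clean up the stop file, as the patch does, then exit.
			os.Remove(stopFile)
			return
		}
		work()
	}
}

func main() {
	stop := filepath.Join(os.TempDir(), "healthcheck-stop-demo")
	os.Remove(stop)

	// Simulate "another process" signaling stop after a few ticks.
	go func() {
		time.Sleep(350 * time.Millisecond)
		os.WriteFile(stop, []byte("stop"), 0o644)
	}()

	runUntilStopped(stop, 100*time.Millisecond, func() {
		fmt.Println("healthcheck tick")
	})
	fmt.Println("timer stopped via stop file")
}

The same caveat applies as in the patch: a full tick must elapse before the
stop file is noticed, so shutdown latency is bounded by the healthcheck
interval.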