2 changes: 2 additions & 0 deletions libpod/container.go
@@ -164,6 +164,8 @@ type ContainerState struct {
PID int `json:"pid,omitempty"`
// ConmonPID is the PID of the container's conmon
ConmonPID int `json:"conmonPid,omitempty"`
// HealthCheckStopFile is the path to a file that signals the healthcheck timer to stop (nosystemd only)
HealthCheckStopFile string `json:"healthCheckStopFile,omitempty"`
// ExecSessions contains all exec sessions that are associated with this
// container.
ExecSessions map[string]*ExecSession `json:"newExecSessions,omitempty"`
8 changes: 8 additions & 0 deletions libpod/healthcheck_linux.go
@@ -18,6 +18,14 @@ import (
systemdCommon "go.podman.io/common/pkg/systemd"
)

// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
// This is a no-op for systemd builds since systemd manages healthcheck timers independently
func ReattachHealthCheckTimers(containers []*Container) {
// Systemd healthchecks are managed by systemd and don't need reattachment
// The timers persist across podman restarts because they're systemd units
logrus.Debugf("Skipping healthcheck reattachment for systemd build - timers are managed by systemd")
}

// createTimer systemd timers for healthchecks of a container
func (c *Container) createTimer(interval string, isStartup bool) error {
if c.disableHealthCheckSystemd(isStartup) {
233 changes: 228 additions & 5 deletions libpod/healthcheck_nosystemd_linux.go
@@ -4,20 +4,243 @@ package libpod

import (
"context"
"fmt"
"os"
"path/filepath"
"time"

"github.com/containers/podman/v5/libpod/define"
"github.com/sirupsen/logrus"
)

// createTimer systemd timers for healthchecks of a container
// healthcheckTimer manages the background goroutine for healthchecks
type healthcheckTimer struct {
container *Container
Review comment (Member): This data will become outdated. I'd prefer to store only the ID and get the container on demand. This also prevents a memory leak if the activeTimers map grows without bounds.

interval time.Duration
ctx context.Context
cancel context.CancelFunc
done chan struct{}
}
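
A minimal sketch of the reviewer's suggestion above: keep only the container ID in the timer and resolve the *Container on demand. The runtime field and the lookupContainer helper are assumptions for illustration and not part of this PR; Runtime.GetContainer is assumed to be the existing libpod lookup by full ID.

type healthcheckTimer struct {
	containerID string
	runtime     *Runtime
	interval    time.Duration
	ctx         context.Context
	cancel      context.CancelFunc
	done        chan struct{}
}

// lookupContainer fetches a fresh *Container for each healthcheck run, so the
// timer never holds a stale pointer and activeTimers does not pin Container
// objects in memory.
func (t *healthcheckTimer) lookupContainer() (*Container, error) {
	return t.runtime.GetContainer(t.containerID)
}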

// Global map to track active timers (in a real implementation, this would be part of the runtime)
var activeTimers = make(map[string]*healthcheckTimer)
Review comment (Member): Accessing this map is a critical section, so you must use a mutex.
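
One way to make the map access safe, as the reviewer asks: guard every read and write with a mutex. This sketch assumes "sync" is added to the imports; the helper names are illustrative.

var (
	activeTimersLock sync.Mutex
	activeTimers     = make(map[string]*healthcheckTimer)
)

// getActiveTimer returns the registered timer for a container ID, if any.
func getActiveTimer(id string) (*healthcheckTimer, bool) {
	activeTimersLock.Lock()
	defer activeTimersLock.Unlock()
	t, ok := activeTimers[id]
	return t, ok
}

// setActiveTimer registers a timer for a container ID.
func setActiveTimer(id string, t *healthcheckTimer) {
	activeTimersLock.Lock()
	defer activeTimersLock.Unlock()
	activeTimers[id] = t
}

// deleteActiveTimer drops a container's timer registration.
func deleteActiveTimer(id string) {
	activeTimersLock.Lock()
	defer activeTimersLock.Unlock()
	delete(activeTimers, id)
}

Callers in createTimer, startTimer, and stopHealthCheckTimer would then go through these helpers instead of touching the map directly.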


// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
// This implementation is for nosystemd builds where healthchecks are managed by goroutines
func ReattachHealthCheckTimers(containers []*Container) {
for _, ctr := range containers {
// Only reattach for running containers with healthcheck configs
if ctr.state.State != define.ContainerStateRunning {
continue
}

// Check if container has healthcheck config
if ctr.config.HealthCheckConfig == nil {
continue
}

// Check if timer is already running
if _, exists := activeTimers[ctr.ID()]; exists {
continue
}

// Check if this is a startup healthcheck that hasn't passed yet
if ctr.config.StartupHealthCheckConfig != nil && !ctr.state.StartupHCPassed {
// Reattach startup healthcheck
interval := ctr.config.StartupHealthCheckConfig.StartInterval.String()
if err := ctr.createTimer(interval, true); err != nil {
logrus.Errorf("Failed to reattach startup healthcheck timer for container %s: %v", ctr.ID(), err)
}
} else if ctr.state.StartupHCPassed || ctr.config.StartupHealthCheckConfig == nil {
// Reattach regular healthcheck
interval := ctr.config.HealthCheckConfig.Interval.String()
if err := ctr.createTimer(interval, false); err != nil {
logrus.Errorf("Failed to reattach healthcheck timer for container %s: %v", ctr.ID(), err)
}
}
}
}

// disableHealthCheckSystemd returns true if healthcheck should be disabled
// For non-systemd builds, we only disable if interval is 0
func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
if isStartup {
if c.config.StartupHealthCheckConfig != nil && c.config.StartupHealthCheckConfig.Interval == 0 {
return true
}
}
if c.config.HealthCheckConfig != nil && c.config.HealthCheckConfig.Interval == 0 {
return true
}
return false
}

// createTimer creates a goroutine-based timer for healthchecks of a container
func (c *Container) createTimer(interval string, isStartup bool) error {
if c.disableHealthCheckSystemd(isStartup) {
return nil
}

// Parse the interval duration
duration, err := time.ParseDuration(interval)
if err != nil {
return err
}

// Stop any existing timer only if there's actually an active timer in memory
if c.state.HCUnitName != "" {
// Check if there's an active timer in memory before stopping
if _, exists := activeTimers[c.ID()]; exists {
c.stopHealthCheckTimer()
} else {
// No active timer in memory, just clear the state without creating stop file
c.state.HCUnitName = ""
c.state.HealthCheckStopFile = ""
if err := c.save(); err != nil {
return fmt.Errorf("clearing container %s healthcheck state: %w", c.ID(), err)
}
}
}

// Create context for cancellation
ctx, cancel := context.WithCancel(context.Background())

// Create timer struct
timer := &healthcheckTimer{
container: c,
interval: duration,
ctx: ctx,
cancel: cancel,
done: make(chan struct{}),
}

// Store timer reference globally and in container state
activeTimers[c.ID()] = timer
c.state.HCUnitName = "goroutine-timer"
// Create a stop file for cross-process cleanup
stopFile := filepath.Join(c.runtime.config.Engine.TmpDir, fmt.Sprintf("healthcheck-stop-%s", c.ID()))
c.state.HealthCheckStopFile = stopFile

if err := c.save(); err != nil {
cancel()
delete(activeTimers, c.ID())
return fmt.Errorf("saving container %s healthcheck timer: %w", c.ID(), err)
}

// Start the background goroutine
go timer.run()

return nil
}

// startTimer starts a systemd timer for the healthchecks
// startTimer starts the goroutine-based timer for healthchecks
func (c *Container) startTimer(isStartup bool) error {
// Check if timer already exists
if _, exists := activeTimers[c.ID()]; exists {
return nil
}

// Create timer if it doesn't exist
if c.config.HealthCheckConfig != nil {
interval := c.config.HealthCheckConfig.Interval.String()
if c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed {
interval = c.config.StartupHealthCheckConfig.StartInterval.String()
}
return c.createTimer(interval, c.config.StartupHealthCheckConfig != nil)
}

return nil
}

// removeTransientFiles removes the systemd timer and unit files
// for the container
// removeTransientFiles stops the goroutine-based timer
func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, unitName string) error {
return nil
return c.stopHealthCheckTimer()
}

// stopHealthCheckTimer stops the background healthcheck goroutine
func (c *Container) stopHealthCheckTimer() error {
// First try to stop using the in-memory map (same process)
timer, exists := activeTimers[c.ID()]
if exists {
// Cancel the context to stop the goroutine
timer.cancel()

// Wait for the goroutine to finish (with timeout)
select {
case <-timer.done:
// Timer stopped gracefully
case <-time.After(5 * time.Second):
logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
}

// Remove from active timers
delete(activeTimers, c.ID())
} else if c.state.HealthCheckStopFile != "" {
// Called from different process (cleanup), create stop file
if err := os.WriteFile(c.state.HealthCheckStopFile, []byte("stop"), 0644); err != nil {
logrus.Errorf("Failed to create healthcheck stop file for container %s: %v", c.ID(), err)
}
}

// Clear the unit name and stop file
c.state.HCUnitName = ""
c.state.HealthCheckStopFile = ""
return c.save()
}

// run executes the healthcheck in a loop with the specified interval
func (t *healthcheckTimer) run() {
defer close(t.done)

ticker := time.NewTicker(t.interval)
defer ticker.Stop()

for {
select {
case <-t.ctx.Done():
return
case <-ticker.C:
// Check for stop file (cross-process cleanup)
if t.container.state.HealthCheckStopFile != "" {
if _, err := os.Stat(t.container.state.HealthCheckStopFile); err == nil {
// Clean up the stop file
if err := os.Remove(t.container.state.HealthCheckStopFile); err != nil {
logrus.Warnf("Failed to remove stop file for container %s: %v", t.container.ID(), err)
}
return
}
}

// Run the healthcheck
if err := t.runHealthCheck(); err != nil {
logrus.Errorf("Healthcheck failed for container %s: %v", t.container.ID(), err)
}
}
}
}

// runHealthCheck executes a single healthcheck
func (t *healthcheckTimer) runHealthCheck() error {
// Check if container is still running (without holding lock to avoid deadlock)
state, err := t.container.State()
if err != nil {
return err
}

if state != define.ContainerStateRunning {
return nil
}

// Get healthcheck config (without holding lock)
healthConfig := t.container.HealthCheckConfig()
if healthConfig == nil {
return nil
}

// Run the healthcheck - let runHealthCheck handle its own locking internally
ctx, cancel := context.WithTimeout(context.Background(), healthConfig.Timeout)
defer cancel()

_, _, err = t.container.runHealthCheck(ctx, false)
Review comment (Member): This will bypass the startup health check. Instead, you should call Runtime.HealthCheck or replicate its structure.

return err
}
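
A sketch of the alternative the reviewer describes: delegate to the runtime so the startup healthcheck, health log updates, and state transitions follow the same path as the systemd build. It assumes Runtime.HealthCheck keeps its current libpod shape (context plus container name, returning a status and error); adjust if the signature differs.

func (t *healthcheckTimer) runHealthCheck() error {
	state, err := t.container.State()
	if err != nil {
		return err
	}
	if state != define.ContainerStateRunning {
		return nil
	}
	// Runtime.HealthCheck chooses between the startup and regular healthcheck,
	// records the result in the health log, and handles its own locking.
	status, err := t.container.runtime.HealthCheck(t.ctx, t.container.ID())
	if err != nil {
		return err
	}
	logrus.Debugf("Healthcheck for container %s returned %v", t.container.ID(), status)
	return nil
}
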
6 changes: 6 additions & 0 deletions libpod/healthcheck_unsupported.go
@@ -6,6 +6,12 @@ import (
"context"
)

// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
// This is a no-op for unsupported platforms since healthchecks are not supported
func ReattachHealthCheckTimers(containers []*Container) {
// Healthchecks are not supported on this platform
}

// createTimer systemd timers for healthchecks of a container
func (c *Container) createTimer(interval string, isStartup bool) error {
return nil
10 changes: 10 additions & 0 deletions libpod/runtime.go
@@ -647,6 +647,16 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (retErr error) {

runtime.startWorker()

// Reattach healthcheck timers for running containers after podman restart
// This is only needed for the nosystemd build where healthchecks are managed by goroutines
// Systemd healthchecks are managed by systemd and don't need reattachment
ctrs, err := runtime.state.AllContainers(true)
if err != nil {
logrus.Errorf("Failed to get containers for healthcheck reattachment: %v", err)
} else {
ReattachHealthCheckTimers(ctrs)
}

return nil
}
