Skip to content

Commit 5fe6281

Browse files
author
Samuel Archambault
committed
Reattach timers and stop healthchecks with stop file
Signed-off-by: Samuel Archambault <[email protected]>
1 parent 779dc3a commit 5fe6281

File tree

5 files changed

+124
-27
lines changed

5 files changed

+124
-27
lines changed

libpod/container.go

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -164,6 +164,8 @@ type ContainerState struct {
164164
PID int `json:"pid,omitempty"`
165165
// ConmonPID is the PID of the container's conmon
166166
ConmonPID int `json:"conmonPid,omitempty"`
167+
// HealthCheckStopFile is the path to a file that signals the healthcheck timer to stop (nosystemd only)
168+
HealthCheckStopFile string `json:"healthCheckStopFile,omitempty"`
167169
// ExecSessions contains all exec sessions that are associated with this
168170
// container.
169171
ExecSessions map[string]*ExecSession `json:"newExecSessions,omitempty"`

libpod/healthcheck_linux.go

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,6 +18,14 @@ import (
1818
systemdCommon "go.podman.io/common/pkg/systemd"
1919
)
2020

21+
// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
22+
// This is a no-op for systemd builds since systemd manages healthcheck timers independently
23+
func ReattachHealthCheckTimers(containers []*Container) {
24+
// Systemd healthchecks are managed by systemd and don't need reattachment
25+
// The timers persist across podman restarts because they're systemd units
26+
logrus.Debugf("Skipping healthcheck reattachment for systemd build - timers are managed by systemd")
27+
}
28+
2129
// createTimer systemd timers for healthchecks of a container
2230
func (c *Container) createTimer(interval string, isStartup bool) error {
2331
if c.disableHealthCheckSystemd(isStartup) {

libpod/healthcheck_nosystemd_linux.go

Lines changed: 98 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,8 @@ package libpod
55
import (
66
"context"
77
"fmt"
8+
"os"
9+
"path/filepath"
810
"time"
911

1012
"github.com/containers/podman/v5/libpod/define"
@@ -23,6 +25,42 @@ type healthcheckTimer struct {
2325
// Global map to track active timers (in a real implementation, this would be part of the runtime)
2426
var activeTimers = make(map[string]*healthcheckTimer)
2527

28+
// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
29+
// This implementation is for nosystemd builds where healthchecks are managed by goroutines
30+
func ReattachHealthCheckTimers(containers []*Container) {
31+
for _, ctr := range containers {
32+
// Only reattach for running containers with healthcheck configs
33+
if ctr.state.State != define.ContainerStateRunning {
34+
continue
35+
}
36+
37+
// Check if container has healthcheck config
38+
if ctr.config.HealthCheckConfig == nil {
39+
continue
40+
}
41+
42+
// Check if timer is already running
43+
if _, exists := activeTimers[ctr.ID()]; exists {
44+
continue
45+
}
46+
47+
// Check if this is a startup healthcheck that hasn't passed yet
48+
if ctr.config.StartupHealthCheckConfig != nil && !ctr.state.StartupHCPassed {
49+
// Reattach startup healthcheck
50+
interval := ctr.config.StartupHealthCheckConfig.StartInterval.String()
51+
if err := ctr.createTimer(interval, true); err != nil {
52+
logrus.Errorf("Failed to reattach startup healthcheck timer for container %s: %v", ctr.ID(), err)
53+
}
54+
} else if ctr.state.StartupHCPassed || ctr.config.StartupHealthCheckConfig == nil {
55+
// Reattach regular healthcheck
56+
interval := ctr.config.HealthCheckConfig.Interval.String()
57+
if err := ctr.createTimer(interval, false); err != nil {
58+
logrus.Errorf("Failed to reattach healthcheck timer for container %s: %v", ctr.ID(), err)
59+
}
60+
}
61+
}
62+
}
63+
2664
// disableHealthCheckSystemd returns true if healthcheck should be disabled
2765
// For non-systemd builds, we only disable if interval is 0
2866
func (c *Container) disableHealthCheckSystemd(isStartup bool) bool {
@@ -49,9 +87,19 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
4987
return err
5088
}
5189

52-
// Stop any existing timer
90+
// Stop any existing timer only if there's actually an active timer in memory
5391
if c.state.HCUnitName != "" {
54-
c.stopHealthCheckTimer()
92+
// Check if there's an active timer in memory before stopping
93+
if _, exists := activeTimers[c.ID()]; exists {
94+
c.stopHealthCheckTimer()
95+
} else {
96+
// No active timer in memory, just clear the state without creating stop file
97+
c.state.HCUnitName = ""
98+
c.state.HealthCheckStopFile = ""
99+
if err := c.save(); err != nil {
100+
return fmt.Errorf("clearing container %s healthcheck state: %w", c.ID(), err)
101+
}
102+
}
55103
}
56104

57105
// Create context for cancellation
@@ -69,6 +117,9 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
69117
// Store timer reference globally and in container state
70118
activeTimers[c.ID()] = timer
71119
c.state.HCUnitName = "goroutine-timer"
120+
// Create a stop file for cross-process cleanup
121+
stopFile := filepath.Join(c.runtime.config.Engine.TmpDir, fmt.Sprintf("healthcheck-stop-%s", c.ID()))
122+
c.state.HealthCheckStopFile = stopFile
72123

73124
if err := c.save(); err != nil {
74125
cancel()
@@ -79,13 +130,25 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
79130
// Start the background goroutine
80131
go timer.run()
81132

82-
logrus.Debugf("Created goroutine-based healthcheck timer for container %s with interval %s", c.ID(), interval)
83133
return nil
84134
}
85135

86136
// startTimer starts the goroutine-based timer for healthchecks
87137
func (c *Container) startTimer(isStartup bool) error {
88-
// Timer is already started in createTimer, nothing to do
138+
// Check if timer already exists
139+
if _, exists := activeTimers[c.ID()]; exists {
140+
return nil
141+
}
142+
143+
// Create timer if it doesn't exist
144+
if c.config.HealthCheckConfig != nil {
145+
interval := c.config.HealthCheckConfig.Interval.String()
146+
if c.config.StartupHealthCheckConfig != nil && !c.state.StartupHCPassed {
147+
interval = c.config.StartupHealthCheckConfig.StartInterval.String()
148+
}
149+
return c.createTimer(interval, c.config.StartupHealthCheckConfig != nil)
150+
}
151+
89152
return nil
90153
}
91154

@@ -96,30 +159,32 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
96159

97160
// stopHealthCheckTimer stops the background healthcheck goroutine
98161
func (c *Container) stopHealthCheckTimer() error {
162+
// First try to stop using the in-memory map (same process)
99163
timer, exists := activeTimers[c.ID()]
100-
if !exists {
101-
logrus.Debugf("No active healthcheck timer found for container %s", c.ID())
102-
return nil
103-
}
164+
if exists {
165+
// Cancel the context to stop the goroutine
166+
timer.cancel()
104167

105-
logrus.Debugf("Stopping healthcheck timer for container %s", c.ID())
106-
107-
// Cancel the context to stop the goroutine
108-
timer.cancel()
168+
// Wait for the goroutine to finish (with timeout)
169+
select {
170+
case <-timer.done:
171+
// Timer stopped gracefully
172+
case <-time.After(5 * time.Second):
173+
logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
174+
}
109175

110-
// Wait for the goroutine to finish (with timeout)
111-
select {
112-
case <-timer.done:
113-
logrus.Debugf("Healthcheck timer for container %s stopped gracefully", c.ID())
114-
case <-time.After(5 * time.Second):
115-
logrus.Warnf("Healthcheck timer for container %s did not stop within timeout", c.ID())
176+
// Remove from active timers
177+
delete(activeTimers, c.ID())
178+
} else if c.state.HealthCheckStopFile != "" {
179+
// Called from different process (cleanup), create stop file
180+
if err := os.WriteFile(c.state.HealthCheckStopFile, []byte("stop"), 0644); err != nil {
181+
logrus.Errorf("Failed to create healthcheck stop file for container %s: %v", c.ID(), err)
182+
}
116183
}
117184

118-
// Remove from active timers
119-
delete(activeTimers, c.ID())
120-
121-
// Clear the unit name
185+
// Clear the unit name and stop file
122186
c.state.HCUnitName = ""
187+
c.state.HealthCheckStopFile = ""
123188
return c.save()
124189
}
125190

@@ -130,14 +195,22 @@ func (t *healthcheckTimer) run() {
130195
ticker := time.NewTicker(t.interval)
131196
defer ticker.Stop()
132197

133-
logrus.Debugf("Starting healthcheck timer for container %s with interval %s", t.container.ID(), t.interval)
134-
135198
for {
136199
select {
137200
case <-t.ctx.Done():
138-
logrus.Debugf("Healthcheck timer for container %s stopped", t.container.ID())
139201
return
140202
case <-ticker.C:
203+
// Check for stop file (cross-process cleanup)
204+
if t.container.state.HealthCheckStopFile != "" {
205+
if _, err := os.Stat(t.container.state.HealthCheckStopFile); err == nil {
206+
// Clean up the stop file
207+
if err := os.Remove(t.container.state.HealthCheckStopFile); err != nil {
208+
logrus.Warnf("Failed to remove stop file for container %s: %v", t.container.ID(), err)
209+
}
210+
return
211+
}
212+
}
213+
141214
// Run the healthcheck
142215
if err := t.runHealthCheck(); err != nil {
143216
logrus.Errorf("Healthcheck failed for container %s: %v", t.container.ID(), err)
@@ -155,14 +228,12 @@ func (t *healthcheckTimer) runHealthCheck() error {
155228
}
156229

157230
if state != define.ContainerStateRunning {
158-
logrus.Debugf("Container %s is not running (state: %v), skipping healthcheck", t.container.ID(), state)
159231
return nil
160232
}
161233

162234
// Get healthcheck config (without holding lock)
163235
healthConfig := t.container.HealthCheckConfig()
164236
if healthConfig == nil {
165-
logrus.Debugf("No healthcheck config found for container %s, skipping healthcheck", t.container.ID())
166237
return nil
167238
}
168239

libpod/healthcheck_unsupported.go

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,12 @@ import (
66
"context"
77
)
88

9+
// ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
10+
// This is a no-op for unsupported platforms since healthchecks are not supported
11+
func ReattachHealthCheckTimers(containers []*Container) {
12+
// Healthchecks are not supported on this platform
13+
}
14+
915
// createTimer systemd timers for healthchecks of a container
1016
func (c *Container) createTimer(interval string, isStartup bool) error {
1117
return nil

libpod/runtime.go

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -647,6 +647,16 @@ func makeRuntime(ctx context.Context, runtime *Runtime) (retErr error) {
647647

648648
runtime.startWorker()
649649

650+
// Reattach healthcheck timers for running containers after podman restart
651+
// This is only needed for the nosystemd build where healthchecks are managed by goroutines
652+
// Systemd healthchecks are managed by systemd and don't need reattachment
653+
ctrs, err := runtime.state.AllContainers(true)
654+
if err != nil {
655+
logrus.Errorf("Failed to get containers for healthcheck reattachment: %v", err)
656+
} else {
657+
ReattachHealthCheckTimers(ctrs)
658+
}
659+
650660
return nil
651661
}
652662

0 commit comments

Comments (0)