@@ -5,6 +5,8 @@ package libpod
5
5
import (
6
6
"context"
7
7
"fmt"
8
+ "os"
9
+ "path/filepath"
8
10
"time"
9
11
10
12
"github.com/containers/podman/v5/libpod/define"
@@ -23,6 +25,42 @@ type healthcheckTimer struct {
23
25
// Global map to track active timers (in a real implementation, this would be part of the runtime)
24
26
var activeTimers = make (map [string ]* healthcheckTimer )
25
27
28
+ // ReattachHealthCheckTimers reattaches healthcheck timers for running containers after podman restart
29
+ // This implementation is for nosystemd builds where healthchecks are managed by goroutines
30
+ func ReattachHealthCheckTimers (containers []* Container ) {
31
+ for _ , ctr := range containers {
32
+ // Only reattach for running containers with healthcheck configs
33
+ if ctr .state .State != define .ContainerStateRunning {
34
+ continue
35
+ }
36
+
37
+ // Check if container has healthcheck config
38
+ if ctr .config .HealthCheckConfig == nil {
39
+ continue
40
+ }
41
+
42
+ // Check if timer is already running
43
+ if _ , exists := activeTimers [ctr .ID ()]; exists {
44
+ continue
45
+ }
46
+
47
+ // Check if this is a startup healthcheck that hasn't passed yet
48
+ if ctr .config .StartupHealthCheckConfig != nil && ! ctr .state .StartupHCPassed {
49
+ // Reattach startup healthcheck
50
+ interval := ctr .config .StartupHealthCheckConfig .StartInterval .String ()
51
+ if err := ctr .createTimer (interval , true ); err != nil {
52
+ logrus .Errorf ("Failed to reattach startup healthcheck timer for container %s: %v" , ctr .ID (), err )
53
+ }
54
+ } else if ctr .state .StartupHCPassed || ctr .config .StartupHealthCheckConfig == nil {
55
+ // Reattach regular healthcheck
56
+ interval := ctr .config .HealthCheckConfig .Interval .String ()
57
+ if err := ctr .createTimer (interval , false ); err != nil {
58
+ logrus .Errorf ("Failed to reattach healthcheck timer for container %s: %v" , ctr .ID (), err )
59
+ }
60
+ }
61
+ }
62
+ }
63
+
26
64
// disableHealthCheckSystemd returns true if healthcheck should be disabled
27
65
// For non-systemd builds, we only disable if interval is 0
28
66
func (c * Container ) disableHealthCheckSystemd (isStartup bool ) bool {
@@ -49,9 +87,19 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
49
87
return err
50
88
}
51
89
52
- // Stop any existing timer
90
+ // Stop any existing timer only if there's actually an active timer in memory
53
91
if c .state .HCUnitName != "" {
54
- c .stopHealthCheckTimer ()
92
+ // Check if there's an active timer in memory before stopping
93
+ if _ , exists := activeTimers [c .ID ()]; exists {
94
+ c .stopHealthCheckTimer ()
95
+ } else {
96
+ // No active timer in memory, just clear the state without creating stop file
97
+ c .state .HCUnitName = ""
98
+ c .state .HealthCheckStopFile = ""
99
+ if err := c .save (); err != nil {
100
+ return fmt .Errorf ("clearing container %s healthcheck state: %w" , c .ID (), err )
101
+ }
102
+ }
55
103
}
56
104
57
105
// Create context for cancellation
@@ -69,6 +117,9 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
69
117
// Store timer reference globally and in container state
70
118
activeTimers [c .ID ()] = timer
71
119
c .state .HCUnitName = "goroutine-timer"
120
+ // Create a stop file for cross-process cleanup
121
+ stopFile := filepath .Join (c .runtime .config .Engine .TmpDir , fmt .Sprintf ("healthcheck-stop-%s" , c .ID ()))
122
+ c .state .HealthCheckStopFile = stopFile
72
123
73
124
if err := c .save (); err != nil {
74
125
cancel ()
@@ -79,13 +130,25 @@ func (c *Container) createTimer(interval string, isStartup bool) error {
79
130
// Start the background goroutine
80
131
go timer .run ()
81
132
82
- logrus .Debugf ("Created goroutine-based healthcheck timer for container %s with interval %s" , c .ID (), interval )
83
133
return nil
84
134
}
85
135
86
136
// startTimer starts the goroutine-based timer for healthchecks
87
137
func (c * Container ) startTimer (isStartup bool ) error {
88
- // Timer is already started in createTimer, nothing to do
138
+ // Check if timer already exists
139
+ if _ , exists := activeTimers [c .ID ()]; exists {
140
+ return nil
141
+ }
142
+
143
+ // Create timer if it doesn't exist
144
+ if c .config .HealthCheckConfig != nil {
145
+ interval := c .config .HealthCheckConfig .Interval .String ()
146
+ if c .config .StartupHealthCheckConfig != nil && ! c .state .StartupHCPassed {
147
+ interval = c .config .StartupHealthCheckConfig .StartInterval .String ()
148
+ }
149
+ return c .createTimer (interval , c .config .StartupHealthCheckConfig != nil )
150
+ }
151
+
89
152
return nil
90
153
}
91
154
@@ -96,30 +159,32 @@ func (c *Container) removeTransientFiles(ctx context.Context, isStartup bool, un
96
159
97
160
// stopHealthCheckTimer stops the background healthcheck goroutine
98
161
func (c * Container ) stopHealthCheckTimer () error {
162
+ // First try to stop using the in-memory map (same process)
99
163
timer , exists := activeTimers [c .ID ()]
100
- if ! exists {
101
- logrus .Debugf ("No active healthcheck timer found for container %s" , c .ID ())
102
- return nil
103
- }
164
+ if exists {
165
+ // Cancel the context to stop the goroutine
166
+ timer .cancel ()
104
167
105
- logrus .Debugf ("Stopping healthcheck timer for container %s" , c .ID ())
106
-
107
- // Cancel the context to stop the goroutine
108
- timer .cancel ()
168
+ // Wait for the goroutine to finish (with timeout)
169
+ select {
170
+ case <- timer .done :
171
+ // Timer stopped gracefully
172
+ case <- time .After (5 * time .Second ):
173
+ logrus .Warnf ("Healthcheck timer for container %s did not stop within timeout" , c .ID ())
174
+ }
109
175
110
- // Wait for the goroutine to finish (with timeout)
111
- select {
112
- case <- timer .done :
113
- logrus .Debugf ("Healthcheck timer for container %s stopped gracefully" , c .ID ())
114
- case <- time .After (5 * time .Second ):
115
- logrus .Warnf ("Healthcheck timer for container %s did not stop within timeout" , c .ID ())
176
+ // Remove from active timers
177
+ delete (activeTimers , c .ID ())
178
+ } else if c .state .HealthCheckStopFile != "" {
179
+ // Called from different process (cleanup), create stop file
180
+ if err := os .WriteFile (c .state .HealthCheckStopFile , []byte ("stop" ), 0644 ); err != nil {
181
+ logrus .Errorf ("Failed to create healthcheck stop file for container %s: %v" , c .ID (), err )
182
+ }
116
183
}
117
184
118
- // Remove from active timers
119
- delete (activeTimers , c .ID ())
120
-
121
- // Clear the unit name
185
+ // Clear the unit name and stop file
122
186
c .state .HCUnitName = ""
187
+ c .state .HealthCheckStopFile = ""
123
188
return c .save ()
124
189
}
125
190
@@ -130,14 +195,22 @@ func (t *healthcheckTimer) run() {
130
195
ticker := time .NewTicker (t .interval )
131
196
defer ticker .Stop ()
132
197
133
- logrus .Debugf ("Starting healthcheck timer for container %s with interval %s" , t .container .ID (), t .interval )
134
-
135
198
for {
136
199
select {
137
200
case <- t .ctx .Done ():
138
- logrus .Debugf ("Healthcheck timer for container %s stopped" , t .container .ID ())
139
201
return
140
202
case <- ticker .C :
203
+ // Check for stop file (cross-process cleanup)
204
+ if t .container .state .HealthCheckStopFile != "" {
205
+ if _ , err := os .Stat (t .container .state .HealthCheckStopFile ); err == nil {
206
+ // Clean up the stop file
207
+ if err := os .Remove (t .container .state .HealthCheckStopFile ); err != nil {
208
+ logrus .Warnf ("Failed to remove stop file for container %s: %v" , t .container .ID (), err )
209
+ }
210
+ return
211
+ }
212
+ }
213
+
141
214
// Run the healthcheck
142
215
if err := t .runHealthCheck (); err != nil {
143
216
logrus .Errorf ("Healthcheck failed for container %s: %v" , t .container .ID (), err )
@@ -155,14 +228,12 @@ func (t *healthcheckTimer) runHealthCheck() error {
155
228
}
156
229
157
230
if state != define .ContainerStateRunning {
158
- logrus .Debugf ("Container %s is not running (state: %v), skipping healthcheck" , t .container .ID (), state )
159
231
return nil
160
232
}
161
233
162
234
// Get healthcheck config (without holding lock)
163
235
healthConfig := t .container .HealthCheckConfig ()
164
236
if healthConfig == nil {
165
- logrus .Debugf ("No healthcheck config found for container %s, skipping healthcheck" , t .container .ID ())
166
237
return nil
167
238
}
168
239
0 commit comments