@@ -33,10 +33,14 @@ const (
33
33
// this is in addition to the Application errors that are already ignored.
34
34
envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
35
35
allHealthChecks = "xids"
36
+
37
+ nvmlEventTypeGpuRecoveryAction = 0x0000000000008000 // from https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html
38
+
39
+ nvmlEventTypeGpuUnavailableError = 0x0000000000004000
36
40
)
37
41
38
42
// checkHealth performs health checks on a set of devices, writing any unhealthy devices to the 'unhealthy' channel.
39
- func (r * nvmlResourceManager ) checkHealth (stop <- chan interface {}, devices Devices , unhealthy chan <- * Device ) error {
43
+ func (r * nvmlResourceManager ) checkHealth (stop <- chan interface {}, devices Devices , unhealthy chan <- * DeviceEvent ) error {
40
44
disableHealthChecks := strings .ToLower (os .Getenv (envDisableHealthChecks ))
41
45
if disableHealthChecks == "all" {
42
46
disableHealthChecks = allHealthChecks
@@ -92,12 +96,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
92
96
deviceIDToGiMap := make (map [string ]uint32 )
93
97
deviceIDToCiMap := make (map [string ]uint32 )
94
98
95
- eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError )
99
+ eventMask := uint64 (nvml .EventTypeXidCriticalError | nvml .EventTypeDoubleBitEccError | nvml .EventTypeSingleBitEccError | nvmlEventTypeGpuUnavailableError | nvmlEventTypeGpuRecoveryAction )
96
100
for _ , d := range devices {
97
101
uuid , gi , ci , err := r .getDevicePlacement (d )
98
102
if err != nil {
99
103
klog .Warningf ("Could not determine device placement for %v: %v; Marking it unhealthy." , d .ID , err )
100
- unhealthy <- d
104
+ unhealthy <- & DeviceEvent {
105
+ Device : d ,
106
+ Event : DeviceUnHalthy ,
107
+ }
101
108
continue
102
109
}
103
110
deviceIDToGiMap [d .ID ] = gi
@@ -107,14 +114,20 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
107
114
gpu , ret := r .nvml .DeviceGetHandleByUUID (uuid )
108
115
if ret != nvml .SUCCESS {
109
116
klog .Infof ("unable to get device handle from UUID: %v; marking it as unhealthy" , ret )
110
- unhealthy <- d
117
+ unhealthy <- & DeviceEvent {
118
+ Device : d ,
119
+ Event : DeviceUnHalthy ,
120
+ }
111
121
continue
112
122
}
113
123
114
124
supportedEvents , ret := gpu .GetSupportedEventTypes ()
115
125
if ret != nvml .SUCCESS {
116
126
klog .Infof ("unable to determine the supported events for %v: %v; marking it as unhealthy" , d .ID , ret )
117
- unhealthy <- d
127
+ unhealthy <- & DeviceEvent {
128
+ Device : d ,
129
+ Event : DeviceUnHalthy ,
130
+ }
118
131
continue
119
132
}
120
133
@@ -124,7 +137,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
124
137
}
125
138
if ret != nvml .SUCCESS {
126
139
klog .Infof ("Marking device %v as unhealthy: %v" , d .ID , ret )
127
- unhealthy <- d
140
+ unhealthy <- & DeviceEvent {
141
+ Device : d ,
142
+ Event : DeviceUnHalthy ,
143
+ }
128
144
}
129
145
}
130
146
@@ -142,7 +158,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
142
158
if ret != nvml .SUCCESS {
143
159
klog .Infof ("Error waiting for event: %v; Marking all devices as unhealthy" , ret )
144
160
for _ , d := range devices {
145
- unhealthy <- d
161
+ unhealthy <- & DeviceEvent {
162
+ Device : d ,
163
+ Event : DeviceUnHalthy ,
164
+ }
146
165
}
147
166
continue
148
167
}
@@ -163,7 +182,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
163
182
// If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
164
183
klog .Infof ("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy." , e , ret )
165
184
for _ , d := range devices {
166
- unhealthy <- d
185
+ unhealthy <- & DeviceEvent {
186
+ Device : d ,
187
+ Event : DeviceUnHalthy ,
188
+ }
167
189
}
168
190
continue
169
191
}
@@ -173,6 +195,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
173
195
klog .Infof ("Ignoring event for unexpected device: %v" , eventUUID )
174
196
continue
175
197
}
198
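+ // nvmlEventTypeGpuRecoveryAction is a special case: the device is reported as healthy. NOTE(review): no `continue` is visible after this block; confirm the event does not also fall through to the unhealthy path below.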
199
+ if e .EventType == nvmlEventTypeGpuRecoveryAction {
200
+ klog .Infof ("Gpu recovery event: %+v" , e )
201
+ unhealthy <- & DeviceEvent {
202
+ Device : d ,
203
+ Event : DeviceHealthy ,
204
+ }
205
+
206
+ }
176
207
177
208
if d .IsMigDevice () && e .GpuInstanceId != 0xFFFFFFFF && e .ComputeInstanceId != 0xFFFFFFFF {
178
209
gi := deviceIDToGiMap [d .ID ]
@@ -184,7 +215,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
184
215
}
185
216
186
217
klog .Infof ("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy." , e .EventData , d .ID )
187
- unhealthy <- d
218
+ unhealthy <- & DeviceEvent {
219
+ Device : d ,
220
+ Event : DeviceUnHalthy ,
221
+ }
188
222
}
189
223
}
190
224
0 commit comments