Skip to content

Commit 6f604f5

Browse files
author
Pei PeiDong
committed
feat: add mark health device function
1 parent f666bc3 commit 6f604f5

File tree

6 files changed

+72
-16
lines changed

6 files changed

+72
-16
lines changed

internal/plugin/server.go

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -272,11 +272,21 @@ func (plugin *nvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.D
272272
return nil
273273
case d := <-plugin.health:
274274
// NOTE: a device leaves the Unhealthy state only when a DeviceHealthy event is received on the health channel.
275-
d.Health = pluginapi.Unhealthy
276-
klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID)
277-
if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
278-
return nil
275+
if d.Event == rm.DeviceUnHalthy {
276+
d.Device.Health = pluginapi.Unhealthy
277+
klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.Device.ID)
278+
if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
279+
return nil
280+
}
279281
}
282+
if d.Event == rm.DeviceHealthy {
283+
d.Device.Health = pluginapi.Healthy
284+
klog.Infof("'%s' device marked healthy: %s", plugin.rm.Resource(), d.Device.ID)
285+
if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
286+
return nil
287+
}
288+
}
289+
280290
}
281291
}
282292
}

internal/rm/devices.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,18 @@ type Device struct {
3636
Replicas int
3737
}
3838

39+
// DeviceEventType identifies the kind of health transition carried by a DeviceEvent.
type DeviceEventType int
40+
41+
// Health event kinds reported by the health checks.
const (
42+
// DeviceUnHalthy (sic: the spelling is kept because the identifier is
// referenced by this name elsewhere) reports that the device should be
// marked unhealthy. It is the zero value of DeviceEventType.
DeviceUnHalthy DeviceEventType = iota
43+
// DeviceHealthy reports that the device should be marked healthy again.
DeviceHealthy
44+
)
45+
46+
// DeviceEvent pairs a device with the health event that was observed for it.
type DeviceEvent struct {
47+
// Device is the device whose health changed.
Device *Device
48+
// Event is the kind of health transition being reported.
Event DeviceEventType
49+
}
50+
3951
// deviceInfo defines the information required to construct a Device
4052
type deviceInfo interface {
4153
GetUUID() (string, error)

internal/rm/health.go

Lines changed: 43 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,14 @@ const (
3333
// this is in addition to the Application errors that are already ignored.
3434
envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
3535
allHealthChecks = "xids"
36+
37+
nvmlEventTypeGpuRecoveryAction = 0x0000000000008000 // from https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html?
38+
39+
nvmlEventTypeGpuUnavailableError = 0x0000000000004000
3640
)
3741

3842
// checkHealth performs health checks on a set of devices, writing a health event to the 'unhealthy' channel for any device that becomes unhealthy or recovers
39-
func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error {
43+
func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *DeviceEvent) error {
4044
disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
4145
if disableHealthChecks == "all" {
4246
disableHealthChecks = allHealthChecks
@@ -92,12 +96,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
9296
deviceIDToGiMap := make(map[string]uint32)
9397
deviceIDToCiMap := make(map[string]uint32)
9498

95-
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
99+
eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError | nvmlEventTypeGpuUnavailableError | nvmlEventTypeGpuRecoveryAction)
96100
for _, d := range devices {
97101
uuid, gi, ci, err := r.getDevicePlacement(d)
98102
if err != nil {
99103
klog.Warningf("Could not determine device placement for %v: %v; Marking it unhealthy.", d.ID, err)
100-
unhealthy <- d
104+
unhealthy <- &DeviceEvent{
105+
Device: d,
106+
Event: DeviceUnHalthy,
107+
}
101108
continue
102109
}
103110
deviceIDToGiMap[d.ID] = gi
@@ -107,14 +114,20 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
107114
gpu, ret := r.nvml.DeviceGetHandleByUUID(uuid)
108115
if ret != nvml.SUCCESS {
109116
klog.Infof("unable to get device handle from UUID: %v; marking it as unhealthy", ret)
110-
unhealthy <- d
117+
unhealthy <- &DeviceEvent{
118+
Device: d,
119+
Event: DeviceUnHalthy,
120+
}
111121
continue
112122
}
113123

114124
supportedEvents, ret := gpu.GetSupportedEventTypes()
115125
if ret != nvml.SUCCESS {
116126
klog.Infof("unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret)
117-
unhealthy <- d
127+
unhealthy <- &DeviceEvent{
128+
Device: d,
129+
Event: DeviceUnHalthy,
130+
}
118131
continue
119132
}
120133

@@ -124,7 +137,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
124137
}
125138
if ret != nvml.SUCCESS {
126139
klog.Infof("Marking device %v as unhealthy: %v", d.ID, ret)
127-
unhealthy <- d
140+
unhealthy <- &DeviceEvent{
141+
Device: d,
142+
Event: DeviceUnHalthy,
143+
}
128144
}
129145
}
130146

@@ -142,7 +158,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
142158
if ret != nvml.SUCCESS {
143159
klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret)
144160
for _, d := range devices {
145-
unhealthy <- d
161+
unhealthy <- &DeviceEvent{
162+
Device: d,
163+
Event: DeviceUnHalthy,
164+
}
146165
}
147166
continue
148167
}
@@ -163,7 +182,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
163182
// If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
164183
klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", e, ret)
165184
for _, d := range devices {
166-
unhealthy <- d
185+
unhealthy <- &DeviceEvent{
186+
Device: d,
187+
Event: DeviceUnHalthy,
188+
}
167189
}
168190
continue
169191
}
@@ -173,6 +195,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
173195
klog.Infof("Ignoring event for unexpected device: %v", eventUUID)
174196
continue
175197
}
198+
// nvmlEventTypeGpuRecoveryAction is a special case, where we mark the device as healthy.
199+
if e.EventType == nvmlEventTypeGpuRecoveryAction {
200+
klog.Infof("Gpu recovery event: %+v", e)
201+
unhealthy <- &DeviceEvent{
202+
Device: d,
203+
Event: DeviceHealthy,
204+
}
205+
206+
}
176207

177208
if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF {
178209
gi := deviceIDToGiMap[d.ID]
@@ -184,7 +215,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
184215
}
185216

186217
klog.Infof("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy.", e.EventData, d.ID)
187-
unhealthy <- d
218+
unhealthy <- &DeviceEvent{
219+
Device: d,
220+
Event: DeviceUnHalthy,
221+
}
188222
}
189223
}
190224

internal/rm/nvml_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,7 @@ func (r *nvmlResourceManager) GetDevicePaths(ids []string) []string {
9191
}
9292

9393
// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
94-
func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error {
94+
func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *DeviceEvent) error {
9595
return r.checkHealth(stop, r.devices, unhealthy)
9696
}
9797

internal/rm/rm.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ type ResourceManager interface {
4242
Devices() Devices
4343
GetDevicePaths([]string) []string
4444
GetPreferredAllocation(available, required []string, size int) ([]string, error)
45-
CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error
45+
CheckHealth(stop <-chan interface{}, unhealthy chan<- *DeviceEvent) error
4646
ValidateRequest(AnnotatedIDs) error
4747
}
4848

internal/rm/tegra_manager.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,6 @@ func (r *tegraResourceManager) GetDevicePaths(ids []string) []string {
7171
}
7272

7373
// CheckHealth is disabled for the tegraResourceManager
74-
func (r *tegraResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error {
74+
func (r *tegraResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *DeviceEvent) error {
7575
return nil
7676
}

0 commit comments

Comments
 (0)