diff --git a/internal/plugin/server.go b/internal/plugin/server.go
index 6b55589b3..b0b59f749 100644
--- a/internal/plugin/server.go
+++ b/internal/plugin/server.go
@@ -59,7 +59,7 @@ type nvidiaDevicePlugin struct {
 
 	socket string
 	server *grpc.Server
-	health chan *rm.Device
+	health chan *rm.DeviceEvent
 	stop   chan interface{}
 
 	imexChannels imex.Channels
@@ -105,7 +105,7 @@ func getPluginSocketPath(resource spec.ResourceName) string {
 
 func (plugin *nvidiaDevicePlugin) initialize() {
 	plugin.server = grpc.NewServer([]grpc.ServerOption{}...)
-	plugin.health = make(chan *rm.Device)
+	plugin.health = make(chan *rm.DeviceEvent, 10)
 	plugin.stop = make(chan interface{})
 }
 
@@ -272,11 +272,17 @@ func (plugin *nvidiaDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.D
 			return nil
 		case d := <-plugin.health:
-			// FIXME: there is no way to recover from the Unhealthy state.
-			d.Health = pluginapi.Unhealthy
-			klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.ID)
+			// Apply the reported health transition, then publish the updated device list.
+			switch d.Event {
+			case rm.DeviceUnhealthy:
+				d.Device.Health = pluginapi.Unhealthy
+				klog.Infof("'%s' device marked unhealthy: %s", plugin.rm.Resource(), d.Device.ID)
+			case rm.DeviceHealthy:
+				d.Device.Health = pluginapi.Healthy
+				klog.Infof("'%s' device marked healthy: %s", plugin.rm.Resource(), d.Device.ID)
+			}
 			if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: plugin.apiDevices()}); err != nil {
 				return nil
 			}
 		}
 	}
 }
diff --git a/internal/rm/devices.go b/internal/rm/devices.go
index f3b77c5fb..7dbac9ecc 100644
--- a/internal/rm/devices.go
+++ b/internal/rm/devices.go
@@ -36,6 +36,27 @@ type Device struct {
 	Replicas int
 }
 
+// DeviceEventType identifies the health transition reported for a device.
+type DeviceEventType int
+
+const (
+	// DeviceUnhealthy reports that a device must be marked unhealthy.
+	DeviceUnhealthy DeviceEventType = iota
+	// DeviceHealthy reports that a device may be marked healthy again.
+	DeviceHealthy
+
+	// DeviceUnHalthy is the original, misspelled name of DeviceUnhealthy.
+	//
+	// Deprecated: use DeviceUnhealthy.
+	DeviceUnHalthy = DeviceUnhealthy
+)
+
+// DeviceEvent pairs a device with the health transition reported for it.
+type DeviceEvent struct {
+	Device *Device
+	Event  DeviceEventType
+}
+
 // deviceInfo defines the information the required to construct a Device
 type deviceInfo interface {
 	GetUUID() (string, error)
diff --git a/internal/rm/health.go b/internal/rm/health.go
index a1989cd88..cc73fcd49 100644
--- a/internal/rm/health.go
+++ b/internal/rm/health.go
@@ -33,10 +33,15 @@ const (
 	// this is in addition to the Application errors that are already ignored.
 	envDisableHealthChecks = "DP_DISABLE_HEALTHCHECKS"
 	allHealthChecks        = "xids"
+
+	// NVML event types, defined locally with the values documented at
+	// https://docs.nvidia.com/deploy/nvml-api/group__nvmlEventType.html
+	nvmlEventTypeGpuUnavailableError = 0x0000000000004000
+	nvmlEventTypeGpuRecoveryAction   = 0x0000000000008000
 )
 
-// CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
-func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *Device) error {
+// checkHealth performs health checks on a set of devices, writing a DeviceEvent to the 'unhealthy' channel for every health transition
+func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devices, unhealthy chan<- *DeviceEvent) error {
 	disableHealthChecks := strings.ToLower(os.Getenv(envDisableHealthChecks))
 	if disableHealthChecks == "all" {
 		disableHealthChecks = allHealthChecks
@@ -92,12 +97,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 	deviceIDToGiMap := make(map[string]uint32)
 	deviceIDToCiMap := make(map[string]uint32)
 
-	eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError)
+	eventMask := uint64(nvml.EventTypeXidCriticalError | nvml.EventTypeDoubleBitEccError | nvml.EventTypeSingleBitEccError | nvmlEventTypeGpuUnavailableError | nvmlEventTypeGpuRecoveryAction)
 	for _, d := range devices {
 		uuid, gi, ci, err := r.getDevicePlacement(d)
 		if err != nil {
 			klog.Warningf("Could not determine device placement for %v: %v; Marking it unhealthy.", d.ID, err)
-			unhealthy <- d
+			unhealthy <- &DeviceEvent{
+				Device: d,
+				Event:  DeviceUnhealthy,
+			}
 			continue
 		}
 		deviceIDToGiMap[d.ID] = gi
@@ -107,14 +115,20 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 		gpu, ret := r.nvml.DeviceGetHandleByUUID(uuid)
 		if ret != nvml.SUCCESS {
 			klog.Infof("unable to get device handle from UUID: %v; marking it as unhealthy", ret)
-			unhealthy <- d
+			unhealthy <- &DeviceEvent{
+				Device: d,
+				Event:  DeviceUnhealthy,
+			}
 			continue
 		}
 
 		supportedEvents, ret := gpu.GetSupportedEventTypes()
 		if ret != nvml.SUCCESS {
 			klog.Infof("unable to determine the supported events for %v: %v; marking it as unhealthy", d.ID, ret)
-			unhealthy <- d
+			unhealthy <- &DeviceEvent{
+				Device: d,
+				Event:  DeviceUnhealthy,
+			}
 			continue
 		}
 
@@ -124,7 +138,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 		}
 		if ret != nvml.SUCCESS {
 			klog.Infof("Marking device %v as unhealthy: %v", d.ID, ret)
-			unhealthy <- d
+			unhealthy <- &DeviceEvent{
+				Device: d,
+				Event:  DeviceUnhealthy,
+			}
 		}
 	}
 
@@ -142,16 +159,15 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 		if ret != nvml.SUCCESS {
 			klog.Infof("Error waiting for event: %v; Marking all devices as unhealthy", ret)
 			for _, d := range devices {
-				unhealthy <- d
+				unhealthy <- &DeviceEvent{
+					Device: d,
+					Event:  DeviceUnhealthy,
+				}
 			}
 			continue
 		}
 
-		if e.EventType != nvml.EventTypeXidCriticalError {
-			klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e)
-			continue
-		}
-
-		if skippedXids[e.EventData] {
+		// EventData only identifies an Xid for Xid events, so the skip list must not filter other event types.
+		if e.EventType == nvml.EventTypeXidCriticalError && skippedXids[e.EventData] {
 			klog.Infof("Skipping event %+v", e)
 			continue
@@ -163,7 +179,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 			// If we cannot reliably determine the device UUID, we mark all devices as unhealthy.
 			klog.Infof("Failed to determine uuid for event %v: %v; Marking all devices as unhealthy.", e, ret)
 			for _, d := range devices {
-				unhealthy <- d
+				unhealthy <- &DeviceEvent{
+					Device: d,
+					Event:  DeviceUnhealthy,
+				}
 			}
 			continue
 		}
@@ -173,6 +192,32 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 			klog.Infof("Ignoring event for unexpected device: %v", eventUUID)
 			continue
 		}
+
+		// A recovery event is the only transition that marks a device healthy again.
+		// NOTE(review): confirm that nvmlEventTypeGpuRecoveryAction signals a completed recovery
+		// and not merely a change in the recommended recovery action.
+		if e.EventType == nvmlEventTypeGpuRecoveryAction {
+			klog.Infof("Gpu recovery event: %+v", e)
+			unhealthy <- &DeviceEvent{
+				Device: d,
+				Event:  DeviceHealthy,
+			}
+			continue
+		}
+
+		if e.EventType == nvmlEventTypeGpuUnavailableError {
+			klog.Infof("Gpu unavailable event: %+v", e)
+			unhealthy <- &DeviceEvent{
+				Device: d,
+				Event:  DeviceUnhealthy,
+			}
+			continue
+		}
+
+		if e.EventType != nvml.EventTypeXidCriticalError {
+			klog.Infof("Skipping non-nvmlEventTypeXidCriticalError event: %+v", e)
+			continue
+		}
 
 		if d.IsMigDevice() && e.GpuInstanceId != 0xFFFFFFFF && e.ComputeInstanceId != 0xFFFFFFFF {
 			gi := deviceIDToGiMap[d.ID]
@@ -184,7 +229,10 @@ func (r *nvmlResourceManager) checkHealth(stop <-chan interface{}, devices Devic
 		}
 
 		klog.Infof("XidCriticalError: Xid=%d on Device=%s; marking device as unhealthy.", e.EventData, d.ID)
-		unhealthy <- d
+		unhealthy <- &DeviceEvent{
+			Device: d,
+			Event:  DeviceUnhealthy,
+		}
 	}
 }
 
diff --git a/internal/rm/nvml_manager.go b/internal/rm/nvml_manager.go
index fac923429..071b9dc8d 100644
--- a/internal/rm/nvml_manager.go
+++ b/internal/rm/nvml_manager.go
@@ -91,7 +91,7 @@ func (r *nvmlResourceManager) GetDevicePaths(ids []string) []string {
 }
 
 // CheckHealth performs health checks on a set of devices, writing to the 'unhealthy' channel with any unhealthy devices
-func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error {
+func (r *nvmlResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *DeviceEvent) error {
 	return r.checkHealth(stop, r.devices, unhealthy)
 }
 
diff --git a/internal/rm/rm.go b/internal/rm/rm.go
index 372165df0..e0954dd96 100644
--- a/internal/rm/rm.go
+++ b/internal/rm/rm.go
@@ -42,7 +42,7 @@ type ResourceManager interface {
 	Devices() Devices
 	GetDevicePaths([]string) []string
 	GetPreferredAllocation(available, required []string, size int) ([]string, error)
-	CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error
+	CheckHealth(stop <-chan interface{}, unhealthy chan<- *DeviceEvent) error
 	ValidateRequest(AnnotatedIDs) error
 }
 
diff --git a/internal/rm/tegra_manager.go b/internal/rm/tegra_manager.go
index 65ca2022f..a756a3fe5 100644
--- a/internal/rm/tegra_manager.go
+++ b/internal/rm/tegra_manager.go
@@ -71,6 +71,6 @@ func (r *tegraResourceManager) GetDevicePaths(ids []string) []string {
 }
 
 // CheckHealth is disabled for the tegraResourceManager
-func (r *tegraResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *Device) error {
+func (r *tegraResourceManager) CheckHealth(stop <-chan interface{}, unhealthy chan<- *DeviceEvent) error {
 	return nil
 }