From ca908c2d9977385f7acf041f845050ee95184a75 Mon Sep 17 00:00:00 2001 From: Julien Blache Date: Tue, 18 Oct 2022 23:58:51 -0700 Subject: [PATCH 1/3] MIG GPU discovery --- .../mesos/isolators/gpu/allocator.cpp | 141 +++++++++++-- .../mesos/isolators/gpu/allocator.hpp | 6 + .../mesos/isolators/gpu/nvml.cpp | 193 ++++++++++++++++++ .../mesos/isolators/gpu/nvml.hpp | 5 + 4 files changed, 329 insertions(+), 16 deletions(-) diff --git a/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp b/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp index 77522ff8e63..f4da3b49b71 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp +++ b/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp @@ -85,7 +85,12 @@ static Try> enumerateGpus( if (flags.nvidia_gpu_devices.isSome()) { indices = flags.nvidia_gpu_devices.get(); } else { - for (size_t i = 0; i < resources.gpus().getOrElse(0); ++i) { + Try available = nvml::deviceGetCount(); + if (available.isError()) { + return Error("Failed to nvml::deviceGetCount: " + available.error()); + } + + for (unsigned int i = 0; i < available.get(); ++i) { indices.push_back(i); } } @@ -103,17 +108,90 @@ static Try> enumerateGpus( return Error("Failed to nvml::deviceGetMinorNumber: " + minor.error()); } - Gpu gpu; - gpu.major = NVIDIA_MAJOR_DEVICE; - gpu.minor = minor.get(); + Try ismig = nvml::deviceGetMigMode(handle.get()); + if (ismig.isError()) { + return Error("Failed to nvml::deviceGetMigMode: " + ismig.error()); + } + + if (!ismig.get()) { + Gpu gpu; + gpu.major = NVIDIA_MAJOR_DEVICE; + gpu.minor = minor.get(); + + gpus.insert(gpu); + + continue; + } + + Try migcount = nvml::deviceGetMigDeviceCount(handle.get()); + if (migcount.isError()) { + return Error("Failed to nvml::deviceGetMigDeviceCount: " + migcount.error()); + } - gpus.insert(gpu); + for (unsigned int migindex = 0; migindex < migcount.get(); migindex++) { + Try mighandle = nvml::deviceGetMigDeviceHandleByIndex(handle.get(), migindex); + if 
(mighandle.isError()) { + return Error("Failed to nvml::deviceGetMigDeviceHandleByIndex: " + mighandle.error()); + } + + Try gi_minor = nvml::deviceGetGpuInstanceMinor(mighandle.get()); + if (gi_minor.isError()) { + return Error("Failed to nvml::deviceGetGpuInstanceMinor: " + gi_minor.error()); + } + + Try ci_minor = nvml::deviceGetComputeInstanceMinor(mighandle.get()); + if (ci_minor.isError()) { + return Error("Failed to nvml::deviceGetComputeInstanceMinor: " + ci_minor.error()); + } + + Gpu gpu; + gpu.major = NVIDIA_MAJOR_DEVICE; + gpu.minor = minor.get(); + gpu.ismig = true; + gpu.gi_minor = gi_minor.get(); + gpu.ci_minor = ci_minor.get(); + + gpus.insert(gpu); + } } return gpus; } +static Try countGpuInstancesForDevices( + const vector& devices) +{ + unsigned int count = 0; + + foreach (unsigned int device, devices) { + Try handle = nvml::deviceGetHandleByIndex(device); + if (handle.isError()) { + return Error("Failed to nvml::deviceGetHandleByIndex: " + handle.error()); + } + + Try ismig = nvml::deviceGetMigMode(handle.get()); + if (ismig.isError()) { + return Error("Failed to nvml::deviceGetMigMode: " + ismig.error()); + } + + if (!ismig.get()) { + count++; + continue; + } + + Try migcount = nvml::deviceGetMigDeviceCount(handle.get()); + if (migcount.isError()) { + return Error("Failed to nvml::deviceGetMigDeviceCount: " + migcount.error()); + } + + count += migcount.get(); + } + + return count; +} + + // To determine the proper number of GPU resources to return, we // need to check both --resources and --nvidia_gpu_devices. 
// There are two cases to consider: @@ -174,11 +252,6 @@ static Try enumerateGpuResources(const Flags& flags) return Error("Failed to nvml::initialize: " + initialized.error()); } - Try available = nvml::deviceGetCount(); - if (available.isError()) { - return Error("Failed to nvml::deviceGetCount: " + available.error()); - } - // The `Resources` wrapper does not allow us to distinguish between // a user specifying "gpus:0" in the --resources flag and not // specifying "gpus" at all. To help with this we short circuit @@ -225,9 +298,11 @@ static Try enumerateGpuResources(const Flags& flags) return Error("'--nvidia_gpu_devices' contains duplicates"); } - if (flags.nvidia_gpu_devices->size() != resources.gpus().get()) { - return Error("'--resources' and '--nvidia_gpu_devices' specify" - " different numbers of GPU devices"); + Try available = countGpuInstancesForDevices(unique); + if (available.isError()) { + return Error("Failed to count all GPU instances for devices" + " specified by --nvidia_gpu_devices: " + + available.error()); } if (resources.gpus().get() > available.get()) { @@ -238,6 +313,22 @@ static Try enumerateGpuResources(const Flags& flags) return resources; } + Try available = nvml::deviceGetCount(); + if (available.isError()) { + return Error("Failed to nvml::deviceGetCount: " + available.error()); + } + + vector indices; + for (unsigned int i = 0; i < available.get(); ++i) { + indices.push_back(i); + } + + available = countGpuInstancesForDevices(indices); + if (available.isError()) { + return Error("Failed to count all GPU instances: " + + available.error()); + } + return Resources::parse( "gpus", stringify(available.get()), @@ -378,7 +469,15 @@ Future NvidiaGpuAllocator::deallocate(const set& gpus) bool operator<(const Gpu& left, const Gpu& right) { if (left.major == right.major) { - return left.minor < right.minor; + // Either or both aren't MIG, comparing major/minor is enough + if (!left.ismig || !right.ismig || (left.minor != right.minor)) { + 
return left.minor < right.minor; + } + + if (left.gi_minor == right.gi_minor) { + return left.ci_minor < right.ci_minor; + } + return left.gi_minor < right.gi_minor; } return left.major < right.major; } @@ -404,7 +503,14 @@ bool operator>=(const Gpu& left, const Gpu& right) bool operator==(const Gpu& left, const Gpu& right) { - return left.major == right.major && left.minor == right.minor; + if (left.ismig != right.ismig) + return false; + + if (!left.ismig) + return left.major == right.major && left.minor == right.minor; + + return left.major == right.major && left.minor == right.minor + && left.gi_minor == right.gi_minor && left.ci_minor == right.ci_minor; } @@ -416,7 +522,10 @@ bool operator!=(const Gpu& left, const Gpu& right) ostream& operator<<(ostream& stream, const Gpu& gpu) { - return stream << gpu.major << '.' << gpu.minor; + if (gpu.ismig) + return stream << gpu.major << '.' << gpu.minor << ':' << gpu.gi_minor << '.' << gpu.ci_minor; + else + return stream << gpu.major << '.' << gpu.minor; } } // namespace slave { diff --git a/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp b/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp index b2eabfebef9..626830e966a 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp +++ b/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp @@ -41,8 +41,14 @@ namespace slave { // abstraction in terms of it. 
struct Gpu { + // GPU device unsigned int major; unsigned int minor; + + // MIG support + bool ismig; + unsigned int gi_minor; + unsigned int ci_minor; }; diff --git a/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp b/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp index 2fddf4dda19..5199f5384a7 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp +++ b/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp @@ -18,7 +18,9 @@ #include +#include #include +#include #include #include @@ -61,17 +63,38 @@ struct NvidiaManagementLibrary nvmlReturn_t (*_deviceGetCount)(unsigned int*), nvmlReturn_t (*_deviceGetHandleByIndex)(unsigned int, nvmlDevice_t*), nvmlReturn_t (*_deviceGetMinorNumber)(nvmlDevice_t, unsigned int*), + nvmlReturn_t (*_deviceGetIndex)(nvmlDevice_t, unsigned int*), + nvmlReturn_t (*_deviceGetMigMode)(nvmlDevice_t, unsigned int*, unsigned int*), + nvmlReturn_t (*_deviceGetMaxMigDeviceCount)(nvmlDevice_t, unsigned int*), + nvmlReturn_t (*_deviceGetMigDeviceHandleByIndex)(nvmlDevice_t, unsigned int, nvmlDevice_t*), + nvmlReturn_t (*_deviceGetDeviceHandleFromMigDeviceHandle)(nvmlDevice_t, nvmlDevice_t*), + nvmlReturn_t (*_deviceGetGpuInstanceId)(nvmlDevice_t, unsigned int*), + nvmlReturn_t (*_deviceGetComputeInstanceId)(nvmlDevice_t, unsigned int*), const char* (*_errorString)(nvmlReturn_t)) : systemGetDriverVersion(_systemGetDriverVersion), deviceGetCount(_deviceGetCount), deviceGetHandleByIndex(_deviceGetHandleByIndex), deviceGetMinorNumber(_deviceGetMinorNumber), + deviceGetIndex(_deviceGetIndex), + deviceGetMigMode(_deviceGetMigMode), + deviceGetMaxMigDeviceCount(_deviceGetMaxMigDeviceCount), + deviceGetMigDeviceHandleByIndex(_deviceGetMigDeviceHandleByIndex), + deviceGetDeviceHandleFromMigDeviceHandle(_deviceGetDeviceHandleFromMigDeviceHandle), + deviceGetGpuInstanceId(_deviceGetGpuInstanceId), + deviceGetComputeInstanceId(_deviceGetComputeInstanceId), errorString(_errorString) {} nvmlReturn_t (*systemGetDriverVersion)(char *, 
unsigned int); nvmlReturn_t (*deviceGetCount)(unsigned int*); nvmlReturn_t (*deviceGetHandleByIndex)(unsigned int, nvmlDevice_t*); nvmlReturn_t (*deviceGetMinorNumber)(nvmlDevice_t, unsigned int*); + nvmlReturn_t (*deviceGetIndex)(nvmlDevice_t, unsigned int*); + nvmlReturn_t (*deviceGetMigMode)(nvmlDevice_t, unsigned int*, unsigned int*); + nvmlReturn_t (*deviceGetMaxMigDeviceCount)(nvmlDevice_t, unsigned int*); + nvmlReturn_t (*deviceGetMigDeviceHandleByIndex)(nvmlDevice_t, unsigned int, nvmlDevice_t*); + nvmlReturn_t (*deviceGetDeviceHandleFromMigDeviceHandle)(nvmlDevice_t, nvmlDevice_t *); + nvmlReturn_t (*deviceGetGpuInstanceId)(nvmlDevice_t, unsigned int*); + nvmlReturn_t (*deviceGetComputeInstanceId)(nvmlDevice_t, unsigned int*); const char* (*errorString)(nvmlReturn_t); }; @@ -113,6 +136,13 @@ Try initialize() { "nvmlDeviceGetCount", nullptr }, { "nvmlDeviceGetHandleByIndex", nullptr }, { "nvmlDeviceGetMinorNumber", nullptr }, + { "nvmlDeviceGetIndex", nullptr }, + { "nvmlDeviceGetMigMode", nullptr }, + { "nvmlDeviceGetMaxMigDeviceCount", nullptr }, + { "nvmlDeviceGetMigDeviceHandleByIndex", nullptr }, + { "nvmlDeviceGetDeviceHandleFromMigDeviceHandle", nullptr }, + { "nvmlDeviceGetGpuInstanceId", nullptr }, + { "nvmlDeviceGetComputeInstanceId", nullptr }, { "nvmlErrorString", nullptr }, }; @@ -148,6 +178,20 @@ Try initialize() symbols.at("nvmlDeviceGetHandleByIndex"), (nvmlReturn_t (*)(nvmlDevice_t, unsigned int*)) symbols.at("nvmlDeviceGetMinorNumber"), + (nvmlReturn_t (*)(nvmlDevice_t, unsigned int*)) + symbols.at("nvmlDeviceGetIndex"), + (nvmlReturn_t (*)(nvmlDevice_t, unsigned int*, unsigned int*)) + symbols.at("nvmlDeviceGetMigMode"), + (nvmlReturn_t (*)(nvmlDevice_t, unsigned int*)) + symbols.at("nvmlDeviceGetMaxMigDeviceCount"), + (nvmlReturn_t (*)(nvmlDevice_t, unsigned int, nvmlDevice_t*)) + symbols.at("nvmlDeviceGetMigDeviceHandleByIndex"), + (nvmlReturn_t (*)(nvmlDevice_t, nvmlDevice_t*)) + 
symbols.at("nvmlDeviceGetDeviceHandleFromMigDeviceHandle"), + (nvmlReturn_t (*)(nvmlDevice_t, unsigned int*)) + symbols.at("nvmlDeviceGetGpuInstanceId"), + (nvmlReturn_t (*)(nvmlDevice_t, unsigned int*)) + symbols.at("nvmlDeviceGetComputeInstanceId"), (const char* (*)(nvmlReturn_t)) symbols.at("nvmlErrorString")); @@ -257,4 +301,153 @@ Try deviceGetMinorNumber(nvmlDevice_t handle) return minor; } + +Try deviceGetMigMode(nvmlDevice_t handle) +{ + if (nvml == nullptr) { + return Error("NVML has not been initialized"); + } + + unsigned int current; + unsigned int pending; + nvmlReturn_t result = nvml->deviceGetMigMode(handle, &current, &pending); + if (result == NVML_ERROR_NOT_SUPPORTED) { + return false; + } else if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + return current == NVML_DEVICE_MIG_ENABLE; +} + + +Try deviceGetMigDeviceCount(nvmlDevice_t handle) +{ + if (nvml == nullptr) { + return Error("NVML has not been initialized"); + } + + unsigned int maxmig; + nvmlReturn_t result = nvml->deviceGetMaxMigDeviceCount(handle, &maxmig); + if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + if (maxmig == 0) + return 0; + + for (unsigned int migidx = 0; migidx < maxmig; migidx++) { + nvmlDevice_t mighandle; + nvmlReturn_t result = nvml->deviceGetMigDeviceHandleByIndex(handle, migidx, &mighandle); + if (result == NVML_ERROR_NOT_FOUND) { + return migidx; + } else if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + } + return 0; +} + + +Try deviceGetMigDeviceHandleByIndex(nvmlDevice_t handle, unsigned int migindex) +{ + if (nvml == nullptr) { + return Error("NVML has not been initialized"); + } + + nvmlDevice_t mighandle; + nvmlReturn_t result = nvml->deviceGetMigDeviceHandleByIndex(handle, migindex, &mighandle); + if (result == NVML_ERROR_NOT_FOUND) { + return Error("MIG device not found"); + } + if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + return mighandle; +}
+ + +typedef enum { + INSTANCE_GPU, + INSTANCE_COMPUTE, +} InstanceType; + +static Try getMigInstanceMinorByType(nvmlDevice_t mighandle, InstanceType itype) +{ + nvmlDevice_t handle; + nvmlReturn_t result = nvml->deviceGetDeviceHandleFromMigDeviceHandle(mighandle, &handle); + if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + + unsigned int devidx; + result = nvml->deviceGetIndex(handle, &devidx); + if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + + unsigned int giid; + result = nvml->deviceGetGpuInstanceId(mighandle, &giid); + if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + + std::ostringstream procpath; + + switch (itype) { + case INSTANCE_GPU: + procpath << "/proc/driver/nvidia/capabilities/gpu" << devidx << "/mig/gi" << giid << "/access"; + break; + + case INSTANCE_COMPUTE: + unsigned int ciid; + result = nvml->deviceGetComputeInstanceId(mighandle, &ciid); + if (result != NVML_SUCCESS) { + return Error(nvml->errorString(result)); + } + + procpath << "/proc/driver/nvidia/capabilities/gpu" << devidx << "/mig/gi" << giid << "/ci" << ciid << "/access"; + break; + } + + string procline; + std::ifstream procfile(procpath.str().c_str()); + std::getline(procfile, procline); + + size_t pos = procline.find(' '); + if (pos == std::string::npos) { + return Error("Could not parse " + procpath.str() + ": malformed line"); + } + + procline = procline.substr(pos + 1); + + unsigned int minor; + + try { + minor = std::stoi(procline); + } catch (...) 
{ + return Error("Could not parse minor from " + procpath.str()); + } + + return minor; +} + + +Try deviceGetGpuInstanceMinor(nvmlDevice_t handle) +{ + if (nvml == nullptr) { + return Error("NVML has not been initialized"); + } + + return getMigInstanceMinorByType(handle, INSTANCE_GPU); +} + + +Try deviceGetComputeInstanceMinor(nvmlDevice_t handle) +{ + if (nvml == nullptr) { + return Error("NVML has not been initialized"); + } + + return getMigInstanceMinorByType(handle, INSTANCE_COMPUTE); +} + } // namespace nvml { diff --git a/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp b/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp index 12b01aa8c75..118e7495be7 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp +++ b/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp @@ -56,6 +56,11 @@ Try systemGetDriverVersion(); Try deviceGetCount(); Try deviceGetHandleByIndex(unsigned int index); Try deviceGetMinorNumber(nvmlDevice_t handle); +Try deviceGetMigMode(nvmlDevice_t handle); +Try deviceGetMigDeviceCount(nvmlDevice_t handle); +Try deviceGetMigDeviceHandleByIndex(nvmlDevice_t handle, unsigned int migindex); +Try deviceGetGpuInstanceMinor(nvmlDevice_t handle); +Try deviceGetComputeInstanceMinor(nvmlDevice_t handle); } // namespace nvml { From c77bc4b69274e338b77d37dff2f66bd5bba966cd Mon Sep 17 00:00:00 2001 From: Julien Blache Date: Wed, 19 Oct 2022 12:39:50 -0700 Subject: [PATCH 2/3] Major number for nvidia-caps devices --- .../mesos/isolators/gpu/allocator.cpp | 6 +++++ .../mesos/isolators/gpu/allocator.hpp | 1 + .../mesos/isolators/gpu/nvml.cpp | 24 +++++++++++++++++++ .../mesos/isolators/gpu/nvml.hpp | 1 + 4 files changed, 32 insertions(+) diff --git a/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp b/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp index f4da3b49b71..d8d4f272298 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp +++ b/src/slave/containerizer/mesos/isolators/gpu/allocator.cpp @@ -95,6 
+95,11 @@ static Try> enumerateGpus( } } + Try caps_major = nvml::systemGetCapsMajor(); + if (caps_major.isError()) { + return Error("Failed to get nvidia caps major: " + caps_major.error()); + } + set gpus; foreach (unsigned int index, indices) { @@ -148,6 +153,7 @@ static Try> enumerateGpus( gpu.major = NVIDIA_MAJOR_DEVICE; gpu.minor = minor.get(); gpu.ismig = true; + gpu.caps_major = caps_major.get(); gpu.gi_minor = gi_minor.get(); gpu.ci_minor = ci_minor.get(); diff --git a/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp b/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp index 626830e966a..9aded52e628 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp +++ b/src/slave/containerizer/mesos/isolators/gpu/allocator.hpp @@ -47,6 +47,7 @@ struct Gpu // MIG support bool ismig; + unsigned int caps_major; unsigned int gi_minor; unsigned int ci_minor; }; diff --git a/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp b/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp index 5199f5384a7..6c2ee831695 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp +++ b/src/slave/containerizer/mesos/isolators/gpu/nvml.cpp @@ -237,6 +237,30 @@ bool isAvailable() } +Try systemGetCapsMajor() +{ + std::ifstream procfile("/proc/devices"); + + while (procfile) { + string procline; + std::getline(procfile, procline); + + if (procline.find(" nvidia-caps") != std::string::npos) { + unsigned int major; + + try { + major = std::stoi(procline); + } catch (...) 
{ + return Error("Could not parse nvidia-caps major from /proc/devices"); + } + + return major; + } + } + return Error("nvidia-caps not found in /proc/devices"); +} + + Try systemGetDriverVersion() { if (nvml == nullptr) { diff --git a/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp b/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp index 118e7495be7..9a4ed0090ca 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp +++ b/src/slave/containerizer/mesos/isolators/gpu/nvml.hpp @@ -53,6 +53,7 @@ Try initialize(); // NVML wrapper functions. May be called after initializing // the library. Try systemGetDriverVersion(); +Try systemGetCapsMajor(); Try deviceGetCount(); Try deviceGetHandleByIndex(unsigned int index); Try deviceGetMinorNumber(nvmlDevice_t handle); From 7147fe97b86c140f50f5c3ecc94faf1d839c63b3 Mon Sep 17 00:00:00 2001 From: Julien Blache Date: Wed, 19 Oct 2022 17:05:35 -0700 Subject: [PATCH 3/3] MIG support in GPU isolator --- .../mesos/isolators/gpu/isolator.cpp | 219 +++++++++++++----- 1 file changed, 164 insertions(+), 55 deletions(-) diff --git a/src/slave/containerizer/mesos/isolators/gpu/isolator.cpp b/src/slave/containerizer/mesos/isolators/gpu/isolator.cpp index 99119f938e2..a222be492f1 100644 --- a/src/slave/containerizer/mesos/isolators/gpu/isolator.cpp +++ b/src/slave/containerizer/mesos/isolators/gpu/isolator.cpp @@ -91,6 +91,91 @@ namespace mesos { namespace internal { namespace slave { +namespace { + +Try allowDevice( + const std::string& hierarchy, + const std::string& cgroup, + unsigned int major, + unsigned int minor) +{ + cgroups::devices::Entry entry; + entry.selector.type = Entry::Selector::Type::CHARACTER; + entry.selector.major = major; + entry.selector.minor = minor; + entry.access.read = true; + entry.access.write = true; + entry.access.mknod = true; + + Try allow = cgroups::devices::allow( + hierarchy, cgroup, entry); + + if (allow.isError()) { + return Error("Failed to allow device '" + stringify(entry) + + "': " 
+ allow.error()); + } + + return Nothing(); +} + + +Try denyDevice( + const std::string& hierarchy, + const std::string& cgroup, + unsigned int major, + unsigned int minor) +{ + cgroups::devices::Entry entry; + entry.selector.type = Entry::Selector::Type::CHARACTER; + entry.selector.major = major; + entry.selector.minor = minor; + entry.access.read = true; + entry.access.write = true; + entry.access.mknod = true; + + Try deny = cgroups::devices::deny( + hierarchy, cgroup, entry); + + if (deny.isError()) { + return Error("Failed to deny device '" + stringify(entry) + + "': " + deny.error()); + } + + return Nothing(); +} + + +Try addDeviceToContainer( + const string& device, + const string& devicesDir, + const string& rootfsDir, + ContainerLaunchInfo& launchInfo) +{ + const string devicePath = path::join( + devicesDir, strings::remove(device, "/dev/", strings::PREFIX), device); + + Try mknod = + fs::chroot::copyDeviceNode(device, devicePath); + if (mknod.isError()) { + return Error("Failed to copy device: " + mknod.error()); + } + + // Since we are adding the GPU devices to the container, make + // them read/write to guarantee that they are accessible inside + // the container. + Try chmod = os::chmod(devicePath, 0666); + if (chmod.isError()) { + return Error("Failed to set permissions: " + chmod.error()); + } + + *launchInfo.add_mounts() = protobuf::slave::createContainerMount( + devicePath, path::join(rootfsDir, device), MS_BIND); + + return Nothing(); +} + +} // namespace { + NvidiaGpuIsolatorProcess::NvidiaGpuIsolatorProcess( const Flags& _flags, const string& _hierarchy, @@ -297,9 +382,24 @@ Future NvidiaGpuIsolatorProcess::recover( foreach (const Gpu& gpu, available) { if (entry.selector.major == gpu.major && entry.selector.minor == gpu.minor) { - containerGpus.insert(gpu); - break; - } + if (gpu.ismig) { + // The GPU device itself; only a match with a GPU that + // isn't a MIG instance, as MIG instances need access to + // the GPU device and the MIG devices. 
+ continue; + } + + containerGpus.insert(gpu); + break; + } + + // Match up MIG devices + if ((entry.selector.major == gpu.caps_major) + && ((entry.selector.minor == gpu.gi_minor) + || (entry.selector.minor == gpu.ci_minor))) { + containerGpus.insert(gpu); + break; + } } } @@ -443,39 +543,23 @@ Future> NvidiaGpuIsolatorProcess::_prepare( } foreach (const string& device, nvidia.get()) { - // The directory `/dev/nvidia-caps` was introduced in CUDA 11.0, just - // ignore it since we only care about the Nvidia GPU device files. - // - // TODO(qianzhang): Figure out how to handle the directory - // `/dev/nvidia-caps` more properly. + // Ignore /dev/nvidia-caps, we'll handle that directory later on if (device == "/dev/nvidia-caps") { continue; } - const string devicePath = path::join( - devicesDir, strings::remove(device, "/dev/", strings::PREFIX), device); - - Try mknod = - fs::chroot::copyDeviceNode(device, devicePath); - if (mknod.isError()) { - return Failure( - "Failed to copy device '" + device + "': " + mknod.error()); + Try added = addDeviceToContainer(device, devicesDir, containerConfig.rootfs(), launchInfo); + if (added.isError()) { + return Failure("Could not add device '" + device + "' to container: " + added.error()); } + } - // Since we are adding the GPU devices to the container, make - // them read/write to guarantee that they are accessible inside - // the container. 
- Try chmod = os::chmod(devicePath, 0666); - if (chmod.isError()) { - return Failure( - "Failed to set permissions on device '" + device + "': " + - chmod.error()); + Try> caps = os::glob("/dev/nvidia-caps/*"); + foreach (const string& device, caps.get()) { + Try added = addDeviceToContainer(device, devicesDir, containerConfig.rootfs(), launchInfo); + if (added.isError()) { + return Failure("Could not add device '" + device + "' to container: " + added.error()); } - - *launchInfo.add_mounts() = protobuf::slave::createContainerMount( - devicePath, - path::join(containerConfig.rootfs(), device), - MS_BIND); } return launchInfo; @@ -520,31 +604,55 @@ Future NvidiaGpuIsolatorProcess::update( } else if (requested < info->allocated.size()) { size_t fewer = info->allocated.size() - requested; + set> deallocated_devs; set deallocated; for (size_t i = 0; i < fewer; i++) { const auto gpu = info->allocated.begin(); - cgroups::devices::Entry entry; - entry.selector.type = Entry::Selector::Type::CHARACTER; - entry.selector.major = gpu->major; - entry.selector.minor = gpu->minor; - entry.access.read = true; - entry.access.write = true; - entry.access.mknod = true; - - Try deny = cgroups::devices::deny( - hierarchy, info->cgroup, entry); - - if (deny.isError()) { - return Failure("Failed to deny cgroups access to GPU device" - " '" + stringify(entry) + "': " + deny.error()); + // We can't blindly deny the main GPU device, as it is needed + // by other MIG devices on that same GPU. 
+ deallocated_devs.insert(std::make_pair(gpu->major, gpu->minor)); + + if (gpu->ismig) { + // MIG GPU instance + Try deny = denyDevice(hierarchy, info->cgroup, gpu->caps_major, gpu->gi_minor); + if (deny.isError()) { + return Failure("Failed to deny cgroups access to MIG GI device: " + deny.error()); + } + + // MIG Compute instance + deny = denyDevice(hierarchy, info->cgroup, gpu->caps_major, gpu->ci_minor); + if (deny.isError()) { + return Failure("Failed to deny cgroups access to MIG CI device: " + deny.error()); + } } deallocated.insert(*gpu); info->allocated.erase(gpu); } + set> allocated_devs; + foreach (Gpu gpu, info->allocated) { + allocated_devs.insert(std::make_pair(gpu.major, gpu.minor)); + } + + // Any GPU device present in the difference of the two sets can now + // be denied, as it is not needed by any of the remaining allocated + // GPUs. + set> safe_deny; + std::set_difference(deallocated_devs.begin(), deallocated_devs.end(), + allocated_devs.begin(), allocated_devs.end(), + std::inserter(safe_deny, safe_deny.begin())); + + foreach (auto dev, safe_deny) { + // Main GPU device node + Try deny = denyDevice(hierarchy, info->cgroup, dev.first, dev.second); + if (deny.isError()) { + return Failure("Failed to deny cgroups access to GPU device: " + deny.error()); + } + } + return allocator.deallocate(deallocated); } @@ -563,20 +671,21 @@ Future NvidiaGpuIsolatorProcess::_update( Info* info = CHECK_NOTNULL(infos.at(containerId)); foreach (const Gpu& gpu, allocation) { - cgroups::devices::Entry entry; - entry.selector.type = Entry::Selector::Type::CHARACTER; - entry.selector.major = gpu.major; - entry.selector.minor = gpu.minor; - entry.access.read = true; - entry.access.write = true; - entry.access.mknod = true; + Try allow = allowDevice(hierarchy, info->cgroup, gpu.major, gpu.minor); + if (allow.isError()) { + return Failure("Failed to grant cgroups access to GPU device: " + allow.error()); + } - Try allow = cgroups::devices::allow( - hierarchy, 
info->cgroup, entry); + if (gpu.ismig) { + allow = allowDevice(hierarchy, info->cgroup, gpu.caps_major, gpu.gi_minor); + if (allow.isError()) { + return Failure("Failed to grant cgroups access to MIG GI device: " + allow.error()); + } - if (allow.isError()) { - return Failure("Failed to grant cgroups access to GPU device" - " '" + stringify(entry) + "': " + allow.error()); + allow = allowDevice(hierarchy, info->cgroup, gpu.caps_major, gpu.ci_minor); + if (allow.isError()) { + return Failure("Failed to grant cgroups access to MIG CI device: " + allow.error()); + } } }