Skip to content

Commit 2ffbe4b

Browse files
committed
gpu plugin: add by-path mount options
default behaviour stays the same. Signed-off-by: Tuomas Katila <[email protected]>
1 parent d2e3c4b commit 2ffbe4b

File tree

3 files changed

+143
-2
lines changed

3 files changed

+143
-2
lines changed

cmd/gpu_plugin/README.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Table of Contents
1919
* [CDI support](#cdi-support)
2020
* [KMD and UMD](#kmd-and-umd)
2121
* [Health management](#health-management)
22+
* [By-path mounting](#by-path-mounting)
2223
* [Issues with media workloads on multi-GPU setups](#issues-with-media-workloads-on-multi-gpu-setups)
2324
* [Workaround for QSV and VA-API](#workaround-for-qsv-and-va-api)
2425

@@ -60,6 +61,7 @@ For workloads on different KMDs, see [KMD and UMD](#kmd-and-umd).
6061
| -allow-ids | string | "" | A list of PCI Device IDs that are allowed to be registered as resources. Default is empty (=all registered). Cannot be used together with `deny-ids`. |
6162
| -deny-ids | string | "" | A list of PCI Device IDs that are denied to be registered as resources. Default is empty (=all registered). Cannot be used together with `allow-ids`. |
6263
| -allocation-policy | string | none | 3 possible values: balanced, packed, none. For shared-dev-num > 1: _balanced_ mode spreads workloads among GPU devices, _packed_ mode fills one GPU fully before moving to next, and _none_ selects first available device from kubelet. Default is _none_. |
64+
| -bypath | string | single | 3 possible values: single, none, all. Default is single. Changes how the by-path symlinks are handled by the plugin. More [info](#by-path-mounting). |
6365

6466
The plugin also accepts a number of other arguments (common to all plugins) related to logging.
6567
Please use the -h option to see the complete list of logging related options.
@@ -258,6 +260,25 @@ Kubernetes Device Plugin API allows passing device's healthiness to Kubelet. By
258260

259261
Temperature limit can be provided via the command line argument, default is 100C.
260262

263+
### By-path mounting
264+
265+
The DRM devices for the Intel GPUs register `by-path` symlinks under `/dev/dri/by-path`. For each GPU character device, there is a corresponding symlink in the by-path directory:
266+
```
267+
$ ls -l /dev/dri/by-path/
268+
lrwxrwxrwx 1 root root 8 oct x 13:09 pci-0000:00:02.0-card -> ../card1
269+
lrwxrwxrwx 1 root root 13 oct x 13:09 pci-0000:00:02.0-render -> ../renderD128
270+
```
271+
272+
The Intel GPU UMD uses these symlinks to detect hardware properties in some cases. Mounting the by-path symlinks as __symlinks__ with the Device plugin API (DP API) is not possible. When the symlinks are mounted via the DP API, they are mounted as the actual devices, and the information carried by the symlink name (the PCI address) is lost.
273+
274+
To support all possible use cases, the GPU plugin allows changing the by-path mounting method. The options are:
275+
* `single` - Symlinks are individually mounted per device. Default.
276+
* Mostly works, but is known to have issues with some PyTorch workloads. See [issue](https://github.com/intel/intel-device-plugins-for-kubernetes/issues/2158).
277+
* `none` - No symlinks are mounted.
278+
* Aligned with docker use where devices are included with privileged mode.
279+
* `all` - All symlinks are mounted even if only one device is allocated to the container.
280+
* Optimal for scale-up workloads where all the GPUs are used by the workload.
281+
261282
### Issues with media workloads on multi-GPU setups
262283

263284
OneVPL media API, 3D and compute APIs provide device discovery

cmd/gpu_plugin/gpu_plugin.go

Lines changed: 30 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,6 +59,10 @@ const (
5959
monitorSuffix = "_monitoring"
6060
monitorID = "all"
6161

62+
bypathOptionNone = "none"
63+
bypathOptionAll = "all"
64+
bypathOptionSingle = "single"
65+
6266
levelzeroAffinityMaskEnvVar = "ZE_AFFINITY_MASK"
6367

6468
// Period of device scans.
@@ -69,6 +73,7 @@ type cliOptions struct {
6973
preferredAllocationPolicy string
7074
allowIDs string
7175
denyIDs string
76+
bypathMount string
7277
sharedDevNum int
7378
globalTempLimit int
7479
memoryTempLimit int
@@ -289,6 +294,16 @@ func (dp *devicePlugin) bypathMountsForPci(pciAddress, bypathDir string) []plugi
289294
return mounts
290295
}
291296

297+
func (dp *devicePlugin) bypathMountForAll() []pluginapi.Mount {
298+
return []pluginapi.Mount{
299+
{
300+
ContainerPath: dp.bypathDir,
301+
HostPath: dp.bypathDir,
302+
ReadOnly: true,
303+
},
304+
}
305+
}
306+
292307
type devicePlugin struct {
293308
gpuDeviceReg *regexp.Regexp
294309
controlDeviceReg *regexp.Regexp
@@ -660,8 +675,20 @@ func (dp *devicePlugin) createMountsAndCDIDevices(cardPath, name string, devSpec
660675
mounts := []pluginapi.Mount{}
661676

662677
if dp.bypathFound {
663-
if pciAddr, pciErr := dp.pciAddressForCard(cardPath, name); pciErr == nil {
664-
mounts = dp.bypathMountsForPci(pciAddr, dp.bypathDir)
678+
switch dp.options.bypathMount {
679+
case bypathOptionAll:
680+
klog.V(4).Info("Using by-path mount option: all")
681+
mounts = dp.bypathMountForAll()
682+
case bypathOptionNone:
683+
klog.V(4).Info("Using by-path mount option: none")
684+
// no mounts
685+
case bypathOptionSingle:
686+
fallthrough
687+
default:
688+
klog.V(4).Info("Using by-path mount option: single/default")
689+
if pciAddr, pciErr := dp.pciAddressForCard(cardPath, name); pciErr == nil {
690+
mounts = dp.bypathMountsForPci(pciAddr, dp.bypathDir)
691+
}
665692
}
666693
}
667694

@@ -784,6 +811,7 @@ func main() {
784811
flag.StringVar(&prefix, "prefix", "", "Prefix for devfs & sysfs paths")
785812
flag.BoolVar(&opts.enableMonitoring, "enable-monitoring", false, "whether to enable '*_monitoring' (= all GPUs) resource")
786813
flag.BoolVar(&opts.healthManagement, "health-management", false, "enable GPU health management")
814+
flag.StringVar(&opts.bypathMount, "bypath", bypathOptionSingle, "bypath mounting options: single, none, all. Default: single")
787815
flag.BoolVar(&opts.wslScan, "wsl", false, "scan for / use WSL devices")
788816
flag.IntVar(&opts.sharedDevNum, "shared-dev-num", 1, "number of containers sharing the same GPU device")
789817
flag.IntVar(&opts.globalTempLimit, "temp-limit", 100, "Global temperature limit at which device is marked unhealthy")

cmd/gpu_plugin/gpu_plugin_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ import (
2121
"path/filepath"
2222
"reflect"
2323
"sort"
24+
"strings"
2425
"testing"
2526

2627
"github.com/pkg/errors"
@@ -1155,6 +1156,97 @@ func TestCDIDeviceInclusion(t *testing.T) {
11551156
}
11561157
}
11571158

1159+
func TestByPathOptions(t *testing.T) {
1160+
root, err := os.MkdirTemp("", "test_bypathoptions")
1161+
if err != nil {
1162+
t.Fatalf("Can't create temporary directory: %+v", err)
1163+
}
1164+
// dirs/files need to be removed for the next test
1165+
defer os.RemoveAll(root)
1166+
1167+
sysfs := path.Join(root, "sys")
1168+
devfs := path.Join(root, "dev")
1169+
1170+
sysfslinks := []symlinkItem{
1171+
{"/0042:01:02.0", "/class/drm/card0"},
1172+
{"/0042:01:05.0", "/class/drm/card1"},
1173+
{"driver/i915", "/class/drm/card0/device/driver"},
1174+
{"driver/xe", "/class/drm/card1/device/driver"},
1175+
}
1176+
1177+
devfslinks := []symlinkItem{
1178+
{"/dri/card0", "/dri/by-path/pci-0042:01:02.0-card"},
1179+
{"/dri/renderD128", "/dri/by-path/pci-0042:01:02.0-render"},
1180+
{"/dri/card1", "/dri/by-path/pci-0042:01:05.0-card"},
1181+
{"/dri/renderD129", "/dri/by-path/pci-0042:01:05.0-render"},
1182+
}
1183+
1184+
sysfsDirs := []string{
1185+
"class/drm/card0/device/drm/card0",
1186+
"class/drm/card0/device/drm/renderD128",
1187+
"class/drm/card1/device/drm/card1",
1188+
"class/drm/card1/device/drm/renderD129",
1189+
}
1190+
1191+
sysfsFiles := map[string][]byte{
1192+
"class/drm/card0/device/device": []byte("0x9a49"),
1193+
"class/drm/card0/device/vendor": []byte("0x8086"),
1194+
"class/drm/card1/device/device": []byte("0x9a48"),
1195+
"class/drm/card1/device/vendor": []byte("0x8086"),
1196+
}
1197+
1198+
devfsfiles := map[string][]byte{
1199+
"/dri/card0": []byte("1"),
1200+
"/dri/renderD128": []byte("1"),
1201+
"/dri/card1": []byte("1"),
1202+
"/dri/renderD129": []byte("1"),
1203+
}
1204+
1205+
createSymlinks(t, sysfs, sysfslinks)
1206+
createFiles(t, devfs, devfsfiles)
1207+
createFiles(t, sysfs, sysfsFiles)
1208+
createDirs(t, sysfs, sysfsDirs)
1209+
createSymlinks(t, devfs, devfslinks)
1210+
1211+
plugin := newDevicePlugin(sysfs+"/class/drm", devfs+"/dri", cliOptions{sharedDevNum: 1, bypathMount: bypathOptionAll})
1212+
plugin.bypathFound = true
1213+
1214+
devSpecs := []v1beta1.DeviceSpec{}
1215+
1216+
sysfsPath := filepath.Join(sysfs, "class", "drm", "card0")
1217+
1218+
mounts, _ := plugin.createMountsAndCDIDevices(sysfsPath, "card0", devSpecs)
1219+
1220+
if len(mounts) != 1 {
1221+
t.Error("Invalid count for mounts for by-path option 'all'")
1222+
}
1223+
if !strings.HasSuffix(mounts[0].ContainerPath, "/by-path") {
1224+
t.Error("Invalid container path mount for by-path option 'all'")
1225+
}
1226+
1227+
plugin.options.bypathMount = bypathOptionNone
1228+
1229+
mounts, _ = plugin.createMountsAndCDIDevices(sysfsPath, "card0", devSpecs)
1230+
1231+
if len(mounts) != 0 {
1232+
t.Error("Invalid count for mounts for by-path option 'none'")
1233+
}
1234+
1235+
plugin.options.bypathMount = bypathOptionSingle
1236+
1237+
mounts, _ = plugin.createMountsAndCDIDevices(sysfsPath, "card0", devSpecs)
1238+
1239+
if len(mounts) != 2 {
1240+
t.Error("Invalid count for mounts for by-path option 'single'")
1241+
}
1242+
if !strings.HasSuffix(mounts[0].ContainerPath, "by-path/pci-0042:01:02.0-card") {
1243+
t.Error("Invalid container path mount for by-path option 'single'")
1244+
}
1245+
if !strings.HasSuffix(mounts[1].ContainerPath, "by-path/pci-0042:01:02.0-render") {
1246+
t.Error("Invalid container path mount for by-path option 'single'")
1247+
}
1248+
}
1249+
11581250
func TestParsePCIDeviceIDs(t *testing.T) {
11591251
tests := []struct {
11601252
name string

0 commit comments

Comments
 (0)