From 2502b7b49eca14d293fa3fee3b647aa9452a85f9 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis <cdesiniotis@nvidia.com> Date: Sun, 9 Mar 2025 11:33:22 -0700 Subject: [PATCH 1/4] Revert "Always add 'config' emptyDir volume to GFD and device-plugin daemonsets" This reverts commit 22941b5416db1d4fd26fd9c4bb2aa561f94729fd. Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com> --- .../gpu-feature-discovery/0500_daemonset.yaml | 10 ---------- assets/state-device-plugin/0500_daemonset.yaml | 10 ---------- controllers/object_controls.go | 17 +++++++++++++++-- 3 files changed, 15 insertions(+), 22 deletions(-) diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml index 7efc1b342..75532744f 100644 --- a/assets/gpu-feature-discovery/0500_daemonset.yaml +++ b/assets/gpu-feature-discovery/0500_daemonset.yaml @@ -104,9 +104,6 @@ spec: value: "" - name: PROCESS_TO_SIGNAL value: "" - volumeMounts: - - name: config - mountPath: /config containers: - image: "FILLED BY THE OPERATOR" name: gpu-feature-discovery @@ -130,8 +127,6 @@ spec: - name: host-sys mountPath: /sys readOnly: true - - name: config - mountPath: /config securityContext: privileged: true - image: "FILLED BY THE OPERATOR" @@ -162,9 +157,6 @@ spec: value: "1" # SIGHUP - name: PROCESS_TO_SIGNAL value: "gpu-feature-discovery" - volumeMounts: - - name: config - mountPath: /config volumes: - name: output-dir hostPath: @@ -183,5 +175,3 @@ spec: hostPath: path: /run/nvidia/driver type: DirectoryOrCreate - - name: config - emptyDir: {} diff --git a/assets/state-device-plugin/0500_daemonset.yaml b/assets/state-device-plugin/0500_daemonset.yaml index e6a68bd16..76de43420 100644 --- a/assets/state-device-plugin/0500_daemonset.yaml +++ b/assets/state-device-plugin/0500_daemonset.yaml @@ -61,9 +61,6 @@ spec: value: "" - name: PROCESS_TO_SIGNAL value: "" - volumeMounts: - - name: config - mountPath: /config containers: - image: "FILLED BY THE OPERATOR" name: nvidia-device-plugin @@ -110,8 +107,6 @@ spec: mountPath: /dev/shm - name: mps-root mountPath: /mps - - name: config - mountPath: /config - image: "FILLED BY THE OPERATOR" name: config-manager command: ["config-manager"] @@ -140,9 +135,6 @@ spec: value: "1" # SIGHUP - name: PROCESS_TO_SIGNAL value: "nvidia-device-plugin" - volumeMounts: - - name: config - mountPath: /config volumes: - name: nvidia-device-plugin-entrypoint configMap: @@ -173,5 +165,3 @@ spec: - name: mps-shm hostPath: path: /run/nvidia/mps/shm - - name: config - emptyDir: {} diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 8103b9009..5696f44b9 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -2456,7 +2456,10 @@ func isCustomPluginConfigSet(pluginConfig *gpuv1.DevicePluginConfig) bool { // adds shared volume mounts required for custom plugin config provided via a ConfigMap func addSharedMountsForPluginConfig(container *corev1.Container, config *gpuv1.DevicePluginConfig) { + emptyDirMount := corev1.VolumeMount{Name: "config", MountPath: "/config"} configVolMount := corev1.VolumeMount{Name: config.Name, MountPath: "/available-configs"} + + container.VolumeMounts = append(container.VolumeMounts, emptyDirMount) container.VolumeMounts = append(container.VolumeMounts, configVolMount) } @@ -2492,7 +2495,7 @@ func handleDevicePluginConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy continue } setContainerEnv(&obj.Spec.Template.Spec.Containers[i], "CONFIG_FILE", "/config/config.yaml") - // add configmap volume mount + // setup sharedvolume(emptydir) for main container addSharedMountsForPluginConfig(&obj.Spec.Template.Spec.Containers[i], config.DevicePlugin.Config) } @@ -2503,8 +2506,9 @@ func handleDevicePluginConfig(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicy shareProcessNamespace := true obj.Spec.Template.Spec.ShareProcessNamespace = &shareProcessNamespace } - // add configmap volume + // setup volumes from configmap and shared emptyDir obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createConfigMapVolume(config.DevicePlugin.Config.Name, nil)) + obj.Spec.Template.Spec.Volumes = append(obj.Spec.Template.Spec.Volumes, createEmptyDirVolume("config")) // apply env/volume changes to initContainer err := transformConfigManagerInitContainer(obj, config) @@ -3129,6 +3133,15 @@ func createConfigMapVolume(configMapName string, itemsToInclude []corev1.KeyToPa return corev1.Volume{Name: configMapName, VolumeSource: volumeSource} } +func createEmptyDirVolume(volumeName string) corev1.Volume { + return corev1.Volume{ + Name: volumeName, + VolumeSource: corev1.VolumeSource{ + EmptyDir: &corev1.EmptyDirVolumeSource{}, + }, + } +} + func transformDriverContainer(obj *appsv1.DaemonSet, config *gpuv1.ClusterPolicySpec, n ClusterPolicyController) error { driverIndex := 0 driverCtrFound := false From b6747d619e55c821cd477a08dccbae2c74fbaca7 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis <cdesiniotis@nvidia.com> Date: Sun, 9 Mar 2025 11:35:55 -0700 Subject: [PATCH 2/4] Revert "Add init container to GFD for handling imex nodes config mount" This reverts commit a076f8911841f7762ab5373e6be32a435871a701. Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com> --- .../gpu-feature-discovery/0500_configmap.yaml | 22 ++++++ ...500_daemonset.yaml => 0600_daemonset.yaml} | 67 ++++++------------- controllers/object_controls.go | 8 --- 3 files changed, 44 insertions(+), 53 deletions(-) create mode 100644 assets/gpu-feature-discovery/0500_configmap.yaml rename assets/gpu-feature-discovery/{0500_daemonset.yaml => 0600_daemonset.yaml} (73%) diff --git a/assets/gpu-feature-discovery/0500_configmap.yaml b/assets/gpu-feature-discovery/0500_configmap.yaml new file mode 100644 index 000000000..5f6c54496 --- /dev/null +++ b/assets/gpu-feature-discovery/0500_configmap.yaml @@ -0,0 +1,22 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: gpu-feature-discovery-entrypoint + namespace: "FILLED BY THE OPERATOR" + labels: + app: gpu-feature-discovery +data: + entrypoint.sh: |- + #!/bin/bash + + until [[ -f /run/nvidia/validations/driver-ready ]] + do + echo "waiting for the driver validations to be ready..." + sleep 5 + done + + set -o allexport + cat /run/nvidia/validations/driver-ready + . /run/nvidia/validations/driver-ready + + exec gpu-feature-discovery diff --git a/assets/gpu-feature-discovery/0500_daemonset.yaml b/assets/gpu-feature-discovery/0600_daemonset.yaml similarity index 73% rename from assets/gpu-feature-discovery/0500_daemonset.yaml rename to assets/gpu-feature-discovery/0600_daemonset.yaml index 75532744f..1c8198200 100644 --- a/assets/gpu-feature-discovery/0500_daemonset.yaml +++ b/assets/gpu-feature-discovery/0600_daemonset.yaml @@ -33,51 +33,9 @@ spec: securityContext: privileged: true volumeMounts: - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - mountPropagation: HostToContainer - - name: gpu-feature-discovery-imex-init - image: "FILLED BY THE OPERATOR" - command: ["/bin/bash", "-c"] - args: - - | - until [[ -f /run/nvidia/validations/driver-ready ]] - do - echo "waiting for the driver validations to be ready..." - sleep 5 - done - set -o allexport - cat /run/nvidia/validations/driver-ready - . /run/nvidia/validations/driver-ready - - IMEX_NODES_CONFIG_FILE=/etc/nvidia-imex/nodes_config.cfg - if [[ -f /config/${IMEX_NODES_CONFIG_FILE} ]]; then - echo "Removing cached IMEX nodes config" - rm -f /config/${IMEX_NODES_CONFIG_FILE} - fi - if [[ ! -f ${DRIVER_ROOT_CTR_PATH}/${IMEX_NODES_CONFIG_FILE} ]]; then - echo "No IMEX nodes config path detected; Skipping" - exit 0 - fi - echo "Copying IMEX nodes config" - mkdir -p $(dirname /config/${IMEX_NODES_CONFIG_FILE}) - cp ${DRIVER_ROOT_CTR_PATH}/${IMEX_NODES_CONFIG_FILE} /config/${IMEX_NODES_CONFIG_FILE} - securityContext: - privileged: true - volumeMounts: - - name: config - mountPath: /config - - name: run-nvidia-validations - mountPath: /run/nvidia/validations + - name: run-nvidia + mountPath: /run/nvidia mountPropagation: HostToContainer - - name: host-root - mountPath: /host/etc - subPath: etc - readOnly: true - - name: driver-install-dir - mountPath: /driver-root/etc - subPath: etc - readOnly: true - name: config-manager-init image: "FILLED BY THE OPERATOR" command: ["config-manager"] @@ -107,7 +65,9 @@ spec: containers: - image: "FILLED BY THE OPERATOR" name: gpu-feature-discovery - command: ["gpu-feature-discovery"] + command: [ "/bin/bash", "-c" ] + args: + - /bin/entrypoint.sh env: - name: GFD_SLEEP_INTERVAL value: 60s @@ -122,11 +82,24 @@ spec: fieldRef: fieldPath: spec.nodeName volumeMounts: + - name: gpu-feature-discovery-entrypoint + readOnly: true + mountPath: /bin/entrypoint.sh + subPath: entrypoint.sh - name: output-dir mountPath: "/etc/kubernetes/node-feature-discovery/features.d" - name: host-sys mountPath: /sys readOnly: true + - name: run-nvidia-validations + mountPath: /run/nvidia/validations + - name: driver-install-dir + mountPath: /driver-root + mountPropagation: HostToContainer + - name: host-root + mountPath: /host + readOnly: true + mountPropagation: HostToContainer securityContext: privileged: true - image: "FILLED BY THE OPERATOR" @@ -158,6 +131,10 @@ spec: - name: PROCESS_TO_SIGNAL value: "gpu-feature-discovery" volumes: + - name: gpu-feature-discovery-entrypoint + configMap: + name: gpu-feature-discovery-entrypoint + defaultMode: 448 - name: output-dir hostPath: path: "/etc/kubernetes/node-feature-discovery/features.d" diff --git a/controllers/object_controls.go b/controllers/object_controls.go index 5696f44b9..21f980145 100644 --- a/controllers/object_controls.go +++ b/controllers/object_controls.go @@ -895,14 +895,6 @@ func TransformGPUDiscoveryPlugin(obj *appsv1.DaemonSet, config *gpuv1.ClusterPol } obj.Spec.Template.Spec.Containers[0].Image = img - // update image for IMEX init container - for i, initCtr := range obj.Spec.Template.Spec.InitContainers { - if initCtr.Name == "gpu-feature-discovery-imex-init" { - obj.Spec.Template.Spec.InitContainers[i].Image = img - break - } - } - // update image pull policy obj.Spec.Template.Spec.Containers[0].ImagePullPolicy = gpuv1.ImagePullPolicy(config.GPUFeatureDiscovery.ImagePullPolicy) From 0a6f1f2251f4643c615f8d45ad2fbf81fba1ceb9 Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis <cdesiniotis@nvidia.com> Date: Sun, 9 Mar 2025 11:42:25 -0700 Subject: [PATCH 3/4] Revert "Make the IMEX nodes config file available to GFD" This reverts commit 5525636ac933798698b69955496fc293c2a71f83. Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com> --- .../gpu-feature-discovery/0500_configmap.yaml | 22 ------------ ...600_daemonset.yaml => 0500_daemonset.yaml} | 34 +++---------------- 2 files changed, 4 insertions(+), 52 deletions(-) delete mode 100644 assets/gpu-feature-discovery/0500_configmap.yaml rename assets/gpu-feature-discovery/{0600_daemonset.yaml => 0500_daemonset.yaml} (78%) diff --git a/assets/gpu-feature-discovery/0500_configmap.yaml b/assets/gpu-feature-discovery/0500_configmap.yaml deleted file mode 100644 index 5f6c54496..000000000 --- a/assets/gpu-feature-discovery/0500_configmap.yaml +++ /dev/null @@ -1,22 +0,0 @@ -apiVersion: v1 -kind: ConfigMap -metadata: - name: gpu-feature-discovery-entrypoint - namespace: "FILLED BY THE OPERATOR" - labels: - app: gpu-feature-discovery -data: - entrypoint.sh: |- - #!/bin/bash - - until [[ -f /run/nvidia/validations/driver-ready ]] - do - echo "waiting for the driver validations to be ready..." - sleep 5 - done - - set -o allexport - cat /run/nvidia/validations/driver-ready - . /run/nvidia/validations/driver-ready - - exec gpu-feature-discovery diff --git a/assets/gpu-feature-discovery/0600_daemonset.yaml b/assets/gpu-feature-discovery/0500_daemonset.yaml similarity index 78% rename from assets/gpu-feature-discovery/0600_daemonset.yaml rename to assets/gpu-feature-discovery/0500_daemonset.yaml index 1c8198200..8cf620184 100644 --- a/assets/gpu-feature-discovery/0600_daemonset.yaml +++ b/assets/gpu-feature-discovery/0500_daemonset.yaml @@ -65,9 +65,7 @@ spec: containers: - image: "FILLED BY THE OPERATOR" name: gpu-feature-discovery - command: [ "/bin/bash", "-c" ] - args: - - /bin/entrypoint.sh + command: ["gpu-feature-discovery"] env: - name: GFD_SLEEP_INTERVAL value: 60s @@ -82,24 +80,11 @@ spec: fieldRef: fieldPath: spec.nodeName volumeMounts: - - name: gpu-feature-discovery-entrypoint - readOnly: true - mountPath: /bin/entrypoint.sh - subPath: entrypoint.sh - name: output-dir mountPath: "/etc/kubernetes/node-feature-discovery/features.d" - name: host-sys mountPath: /sys readOnly: true - - name: run-nvidia-validations - mountPath: /run/nvidia/validations - - name: driver-install-dir - mountPath: /driver-root - mountPropagation: HostToContainer - - name: host-root - mountPath: /host - readOnly: true - mountPropagation: HostToContainer securityContext: privileged: true - image: "FILLED BY THE OPERATOR" @@ -131,24 +116,13 @@ spec: - name: PROCESS_TO_SIGNAL value: "gpu-feature-discovery" volumes: - - name: gpu-feature-discovery-entrypoint - configMap: - name: gpu-feature-discovery-entrypoint - defaultMode: 448 - name: output-dir hostPath: path: "/etc/kubernetes/node-feature-discovery/features.d" - name: host-sys hostPath: path: /sys - - name: run-nvidia-validations - hostPath: - path: "/run/nvidia/validations" - type: DirectoryOrCreate - - name: host-root - hostPath: - path: / - - name: driver-install-dir + - name: run-nvidia hostPath: - path: /run/nvidia/driver - type: DirectoryOrCreate + path: "/run/nvidia" + type: Directory From 8c4b19db957c2968f88e0932103b1cdf58416b6c Mon Sep 17 00:00:00 2001 From: Christopher Desiniotis <cdesiniotis@nvidia.com> Date: Tue, 11 Mar 2025 10:37:50 -0700 Subject: [PATCH 4/4] Revert "Always add 'config' emptyDir volume to MPS daemonset" This reverts commit b82b91be4f7ace51aa6e96cacc773f3b626ae9e8. Signed-off-by: Christopher Desiniotis <cdesiniotis@nvidia.com> --- assets/state-mps-control-daemon/0400_daemonset.yaml | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/assets/state-mps-control-daemon/0400_daemonset.yaml b/assets/state-mps-control-daemon/0400_daemonset.yaml index 097ce8ca9..3be58af20 100644 --- a/assets/state-mps-control-daemon/0400_daemonset.yaml +++ b/assets/state-mps-control-daemon/0400_daemonset.yaml @@ -72,9 +72,6 @@ spec: value: "" - name: PROCESS_TO_SIGNAL value: "" - volumeMounts: - - name: config - mountPath: /config containers: - image: "FILLED BY OPERATOR" name: mps-control-daemon-ctr @@ -96,8 +93,6 @@ spec: mountPath: /dev/shm - name: mps-root mountPath: /mps - - name: config - mountPath: /config - image: "FILLED BY THE OPERATOR" name: config-manager command: ["config-manager"] @@ -126,9 +121,6 @@ spec: value: "1" # SIGHUP - name: PROCESS_TO_SIGNAL value: "/usr/bin/mps-control-daemon" - volumeMounts: - - name: config - mountPath: /config volumes: - name: run-nvidia hostPath: @@ -141,5 +133,3 @@ spec: - name: mps-shm hostPath: path: /run/nvidia/mps/shm - - name: config - emptyDir: {}