Skip to content

Commit b399cba

Browse files
committed
DRAAdminAccess: add example
Signed-off-by: Rita Zhang <[email protected]>
1 parent fd1da0b commit b399cba

File tree

8 files changed

+461
-9
lines changed

8 files changed

+461
-9
lines changed

README.md

Lines changed: 65 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -356,10 +356,72 @@ You can use the IDs of the GPUs as well as the GPU sharing settings set in
356356
these environment variables to verify that they were handed out in a way
357357
consistent with the semantics shown in the figure above.
358358

359+
### Demo DRA Admin Access Feature
360+
This example driver includes support for the [DRA AdminAccess feature](https://kubernetes.io/docs/concepts/scheduling-eviction/dynamic-resource-allocation/#admin-access), which allows administrators to gain privileged access to devices already in use by other users. In this example, only workloads with admin access resource claims can view sensitive host hardware information through environment variables.
361+
362+
When admin access is granted, the following host information is made available through environment variables:
363+
364+
- `DRA_ADMIN_ACCESS=true` - Indicates admin access is enabled
365+
- `HOST_CPU_INFO` - CPU model and core count information from /proc/cpuinfo
366+
- `HOST_MEMORY_INFO` - Memory capacity and availability from /proc/meminfo
367+
- `HOST_KERNEL_INFO` - Kernel version information from /proc/version
368+
- `HOST_SYSTEM_INFO` - Operating system and architecture (GOOS, GOARCH)
369+
- `HOST_NETWORK_INFO` - Available network interfaces from /proc/net/dev
370+
- `HOST_STORAGE_INFO` - Root filesystem and mount information from /proc/mounts
371+
372+
#### CDI Integration
373+
374+
When admin access is detected, the CDI (Container Device Interface) handler injects the host hardware environment variables into the container specification.
375+
376+
#### Usage Example
377+
378+
See `demo/gpu-test7.yaml` for a complete example. Key points:
379+
380+
1. **Namespace**: The namespace must carry the admin access label before a ResourceClaimTemplate or ResourceClaim with `adminAccess: true` can be created in it. The example below uses the `resource.k8s.io/admin-access` label; from Kubernetes v1.34 onward, the label is `resource.kubernetes.io/admin-access`.
381+
```yaml
382+
apiVersion: v1
383+
kind: Namespace
384+
metadata:
385+
name: gpu-test7
386+
labels:
387+
resource.k8s.io/admin-access: "true"
388+
```
389+
390+
2. **Resource Claim Template**: The request must set `adminAccess: true`. Optionally, include `allocationMode: All` if the workload needs information for all hardware devices.
391+
```yaml
392+
spec:
393+
spec:
394+
devices:
395+
requests:
396+
- name: admin-gpu
397+
exactly:
398+
deviceClassName: gpu.example.com
399+
allocationMode: All
400+
adminAccess: true
401+
```
402+
403+
3. **Container**: The container receives host hardware information via environment variables:
404+
```bash
405+
echo "DRA Admin Access: $DRA_ADMIN_ACCESS"
406+
echo "Host CPU Info: $HOST_CPU_INFO"
407+
echo "Host Memory Info: $HOST_MEMORY_INFO"
408+
# ... additional host info variables
409+
```
410+
411+
#### Testing
412+
413+
To run this demo:
414+
```bash
415+
./demo/test-admin-access.sh
416+
```
417+
Note: Workloads with admin access resource claims can access devices that are already in use by other workloads, and only those workloads can view sensitive host hardware information through environment variables.
418+
419+
### Clean Up
420+
359421
Once you have verified everything is running correctly, delete all of the
360422
example apps:
361423
```bash
362-
kubectl delete --wait=false --filename=demo/gpu-test{1,2,3,4,5}.yaml
424+
kubectl delete --wait=false --filename=demo/gpu-test{1,2,3,4,5,7}.yaml
363425
```
364426

365427
And wait for them to terminate:
@@ -374,6 +436,8 @@ gpu-test3 pod0 1/1 Terminating 0 31m
374436
gpu-test3 pod1 1/1 Terminating 0 31m
375437
gpu-test4 pod0 1/1 Terminating 0 31m
376438
gpu-test5 pod0 4/4 Terminating 0 31m
439+
gpu-test7 pod0 1/1 Terminating 0 31m
440+
gpu-test7 pod1 1/1 Terminating 0 31m
377441
...
378442
```
379443

cmd/dra-example-kubeletplugin/cdi.go

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,18 +36,20 @@ const (
3636
)
3737

3838
type CDIHandler struct {
39-
cache *cdiapi.Cache
39+
cache *cdiapi.Cache
40+
hostHardwareInfo *HostHardwareInfo
4041
}
4142

42-
func NewCDIHandler(config *Config) (*CDIHandler, error) {
43+
func NewCDIHandler(config *Config, hostHardwareInfo *HostHardwareInfo) (*CDIHandler, error) {
4344
cache, err := cdiapi.NewCache(
4445
cdiapi.WithSpecDirs(config.flags.cdiRoot),
4546
)
4647
if err != nil {
4748
return nil, fmt.Errorf("unable to create a new CDI cache: %w", err)
4849
}
4950
handler := &CDIHandler{
50-
cache: cache,
51+
cache: cache,
52+
hostHardwareInfo: hostHardwareInfo,
5153
}
5254

5355
return handler, nil
@@ -96,9 +98,24 @@ func (cdi *CDIHandler) CreateClaimSpecFile(claimUID string, devices PreparedDevi
9698
ContainerEdits: &cdispec.ContainerEdits{
9799
Env: []string{
98100
fmt.Sprintf("GPU_DEVICE_%s_RESOURCE_CLAIM=%s", device.DeviceName[4:], claimUID),
101+
fmt.Sprintf("DRA_ADMIN_ACCESS=%t", device.AdminAccess),
99102
},
100103
},
101104
}
105+
106+
// If this device has admin access, inject host hardware information
107+
if device.AdminAccess {
108+
hostEnvVars := []string{
109+
fmt.Sprintf("HOST_CPU_INFO=%s", cdi.hostHardwareInfo.CPUInfo),
110+
fmt.Sprintf("HOST_MEMORY_INFO=%s", cdi.hostHardwareInfo.MemInfo),
111+
fmt.Sprintf("HOST_KERNEL_INFO=%s", cdi.hostHardwareInfo.KernelInfo),
112+
fmt.Sprintf("HOST_SYSTEM_INFO=%s", cdi.hostHardwareInfo.SystemInfo),
113+
fmt.Sprintf("HOST_NETWORK_INFO=%s", cdi.hostHardwareInfo.NetworkInfo),
114+
fmt.Sprintf("HOST_STORAGE_INFO=%s", cdi.hostHardwareInfo.StorageInfo),
115+
}
116+
claimEdits.ContainerEdits.Env = append(claimEdits.ContainerEdits.Env, hostEnvVars...)
117+
}
118+
102119
claimEdits.Append(device.ContainerEdits)
103120

104121
cdiDevice := cdispec.Device{

cmd/dra-example-kubeletplugin/discovery.go

Lines changed: 181 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,9 +17,12 @@
1717
package main
1818

1919
import (
20+
"bufio"
2021
"fmt"
2122
"math/rand"
2223
"os"
24+
"runtime"
25+
"strings"
2326

2427
resourceapi "k8s.io/api/resource/v1beta1"
2528
"k8s.io/apimachinery/pkg/api/resource"
@@ -84,3 +87,181 @@ func hash(s string) int64 {
8487
}
8588
return h
8689
}
90+
91+
// HostHardwareInfo carries the host details that are handed to containers
// whose devices were prepared with admin access. Every field is a
// human-readable summary string; when a source could not be read, the field
// holds an "... unavailable" message instead (see GetHostHardwareInfo).
type HostHardwareInfo struct {
	// CPUInfo is the processor count and model name taken from /proc/cpuinfo.
	CPUInfo string
	// MemInfo is the total and available memory taken from /proc/meminfo.
	MemInfo string
	// KernelInfo is the trimmed content of /proc/version.
	KernelInfo string
	// SystemInfo is the GOOS and GOARCH of the running binary.
	SystemInfo string
	// NetworkInfo lists the non-loopback interfaces found in /proc/net/dev.
	NetworkInfo string
	// StorageInfo describes the root filesystem and mount count from /proc/mounts.
	StorageInfo string
}
100+
101+
// GetHostHardwareInfo gathers hardware information from the host system.
102+
func GetHostHardwareInfo() (*HostHardwareInfo, error) {
103+
info := &HostHardwareInfo{}
104+
105+
// Get CPU information
106+
if cpuInfo, err := readFile("/proc/cpuinfo"); err == nil {
107+
info.CPUInfo = extractCPUModel(cpuInfo)
108+
} else {
109+
info.CPUInfo = fmt.Sprintf("CPU Info unavailable: %v", err)
110+
}
111+
112+
// Get memory information
113+
if memInfo, err := readFile("/proc/meminfo"); err == nil {
114+
info.MemInfo = extractMemInfo(memInfo)
115+
} else {
116+
info.MemInfo = fmt.Sprintf("Memory Info unavailable: %v", err)
117+
}
118+
119+
// Get kernel information
120+
if kernelInfo, err := readFile("/proc/version"); err == nil {
121+
info.KernelInfo = strings.TrimSpace(kernelInfo)
122+
} else {
123+
info.KernelInfo = fmt.Sprintf("Kernel Info unavailable: %v", err)
124+
}
125+
126+
// Get system information (architecture, OS)
127+
info.SystemInfo = fmt.Sprintf("GOOS: %s, GOARCH: %s", runtime.GOOS, runtime.GOARCH)
128+
129+
// Get network information
130+
if netInfo, err := getNetworkInfo(); err == nil {
131+
info.NetworkInfo = netInfo
132+
} else {
133+
info.NetworkInfo = fmt.Sprintf("Network Info unavailable: %v", err)
134+
}
135+
136+
// Get storage information
137+
if storageInfo, err := getStorageInfo(); err == nil {
138+
info.StorageInfo = storageInfo
139+
} else {
140+
info.StorageInfo = fmt.Sprintf("Storage Info unavailable: %v", err)
141+
}
142+
143+
return info, nil
144+
}
145+
146+
// readFile returns the whole content of the file at path as a string.
// When the file cannot be read it returns the empty string together with the
// error reported by os.ReadFile.
func readFile(path string) (string, error) {
	contents, err := os.ReadFile(path)
	if err == nil {
		return string(contents), nil
	}
	return "", err
}
153+
154+
// extractCPUModel condenses the text of /proc/cpuinfo into one line.
//
// It counts the lines that begin with "processor" and remembers the value of
// the last line that begins with "model name". The result is
// "<count> x <model>" when a non-empty model name was found and
// "<count> CPU cores" otherwise.
func extractCPUModel(cpuInfo string) string {
	var (
		cpuCount  int
		modelName string
	)

	scanner := bufio.NewScanner(strings.NewReader(cpuInfo))
	for scanner.Scan() {
		line := scanner.Text()
		switch {
		case strings.HasPrefix(line, "processor"):
			cpuCount++
		case strings.HasPrefix(line, "model name"):
			// Split at the first colon only: the model string itself may
			// contain colons, and everything after the key belongs to it.
			if _, value, found := strings.Cut(line, ":"); found {
				modelName = strings.TrimSpace(value)
			}
		}
	}

	if modelName != "" {
		return fmt.Sprintf("%d x %s", cpuCount, modelName)
	}
	return fmt.Sprintf("%d CPU cores", cpuCount)
}
177+
178+
// extractMemInfo condenses the text of /proc/meminfo into one line.
//
// It looks for the lines that begin with "MemTotal:" and "MemAvailable:" and
// takes the second whitespace-separated field of each as the amount, with
// "kB" appended. The result is:
//
//   - "Total: <n>kB, Available: <n>kB" when both values were found,
//   - "Total: <n>kB" or "Available: <n>kB" when only one was found (the
//     MemAvailable line is missing on kernels older than Linux 3.14), and
//   - "Memory information parsing failed" when neither was found.
func extractMemInfo(memInfo string) string {
	var totalMem, availableMem string

	scanner := bufio.NewScanner(strings.NewReader(memInfo))
	for scanner.Scan() {
		line := scanner.Text()
		switch {
		case strings.HasPrefix(line, "MemTotal:"):
			if fields := strings.Fields(line); len(fields) >= 2 {
				totalMem = fields[1] + "kB"
			}
		case strings.HasPrefix(line, "MemAvailable:"):
			if fields := strings.Fields(line); len(fields) >= 2 {
				availableMem = fields[1] + "kB"
			}
		}
	}

	switch {
	case totalMem != "" && availableMem != "":
		return fmt.Sprintf("Total: %s, Available: %s", totalMem, availableMem)
	case totalMem != "":
		// Keep what was parsed rather than discarding it as a failure.
		return fmt.Sprintf("Total: %s", totalMem)
	case availableMem != "":
		return fmt.Sprintf("Available: %s", availableMem)
	default:
		return "Memory information parsing failed"
	}
}
203+
204+
func getNetworkInfo() (string, error) {
205+
// Read network interfaces
206+
interfaces, err := readFile("/proc/net/dev")
207+
if err != nil {
208+
return "", err
209+
}
210+
211+
scanner := bufio.NewScanner(strings.NewReader(interfaces))
212+
var interfaceNames []string
213+
214+
for scanner.Scan() {
215+
line := strings.TrimSpace(scanner.Text())
216+
if strings.Contains(line, ":") && !strings.Contains(line, "Inter-|") && !strings.Contains(line, "face |") {
217+
parts := strings.Split(line, ":")
218+
if len(parts) > 0 {
219+
ifName := strings.TrimSpace(parts[0])
220+
if ifName != "lo" { // Skip loopback
221+
interfaceNames = append(interfaceNames, ifName)
222+
}
223+
}
224+
}
225+
}
226+
227+
return fmt.Sprintf("Network Interfaces: %s", strings.Join(interfaceNames, ", ")), nil
228+
}
229+
230+
func getStorageInfo() (string, error) {
231+
// Read mounted filesystems
232+
mounts, err := readFile("/proc/mounts")
233+
if err != nil {
234+
return "", err
235+
}
236+
237+
scanner := bufio.NewScanner(strings.NewReader(mounts))
238+
var rootFS string
239+
var mountCount int
240+
241+
for scanner.Scan() {
242+
line := scanner.Text()
243+
fields := strings.Fields(line)
244+
if len(fields) >= 3 {
245+
mountPoint := fields[1]
246+
fsType := fields[2]
247+
248+
if mountPoint == "/" {
249+
rootFS = fmt.Sprintf("Root FS: %s (%s)", fields[0], fsType)
250+
}
251+
252+
// Count real filesystems (not virtual ones)
253+
if !strings.HasPrefix(fields[0], "/proc") &&
254+
!strings.HasPrefix(fields[0], "/sys") &&
255+
!strings.HasPrefix(fields[0], "/dev/pts") &&
256+
fsType != "tmpfs" && fsType != "devtmpfs" &&
257+
fsType != "cgroup" && fsType != "cgroup2" {
258+
mountCount++
259+
}
260+
}
261+
}
262+
263+
if rootFS != "" {
264+
return fmt.Sprintf("%s, Total mounts: %d", rootFS, mountCount), nil
265+
}
266+
return fmt.Sprintf("Storage mounts: %d", mountCount), nil
267+
}

cmd/dra-example-kubeletplugin/driver.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,9 @@ func (d *driver) Shutdown(logger klog.Logger) error {
101101

102102
func (d *driver) PrepareResourceClaims(ctx context.Context, claims []*resourceapi.ResourceClaim) (map[types.UID]kubeletplugin.PrepareResult, error) {
103103
klog.Infof("PrepareResourceClaims is called: number of claims: %d", len(claims))
104+
for i, claim := range claims {
105+
klog.Infof("Claim %d: UID=%s, Namespace=%s, Name=%s", i, claim.UID, claim.Namespace, claim.Name)
106+
}
104107
result := make(map[types.UID]kubeletplugin.PrepareResult)
105108

106109
for _, claim := range claims {
@@ -113,6 +116,7 @@ func (d *driver) PrepareResourceClaims(ctx context.Context, claims []*resourceap
113116
func (d *driver) prepareResourceClaim(_ context.Context, claim *resourceapi.ResourceClaim) kubeletplugin.PrepareResult {
114117
preparedPBs, err := d.state.Prepare(claim)
115118
if err != nil {
119+
klog.Errorf("Error preparing devices for claim %v: %v", claim.UID, err)
116120
return kubeletplugin.PrepareResult{
117121
Err: fmt.Errorf("error preparing devices for claim %v: %w", claim.UID, err),
118122
}

0 commit comments

Comments
 (0)