Skip to content

Commit 0c0f3e3

Browse files
authored
feat(docker): enable CDI GPU sandboxes (#1036)
1 parent 182cbc6 commit 0c0f3e3

7 files changed

Lines changed: 155 additions & 18 deletions

File tree

architecture/sandbox.md

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -460,6 +460,16 @@ Kernel-level error behavior (e.g., Landlock ABI unavailable) depends on `Landloc
460460

461461
**Baseline path filtering**: System-injected baseline paths (e.g., `/app`) are pre-filtered by `enrich_proto_baseline_paths()` / `enrich_sandbox_baseline_paths()` using `Path::exists()` before they reach Landlock. If a baseline `read_write` path is already present in `read_only`, enrichment skips the promotion so explicit policy intent is preserved. User-specified paths are not pre-filtered -- they are evaluated at Landlock apply time so misconfigurations surface as warnings or errors.
462462

463+
**GPU baseline paths**: The supervisor currently infers GPU baseline paths from
464+
device nodes and NVIDIA runtime paths visible inside the sandbox container. The
465+
Docker compute driver can request CDI GPU injection, but this implementation
466+
does not pass CDI metadata into the supervisor. Future device-specific CDI
467+
selection may need follow-up work so the supervisor can enrich Landlock using
468+
the requested CDI device's actual device nodes and mounted library paths. That
469+
design must work for remote Docker daemons, where Docker-reported CDI spec
470+
directories are paths on the daemon host and may not be readable by the gateway
471+
process or the sandbox supervisor.
472+
463473
### Seccomp syscall filtering
464474

465475
**File:** `crates/openshell-sandbox/src/sandbox/linux/seccomp.rs`

crates/openshell-driver-docker/src/lib.rs

Lines changed: 31 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,16 @@
88
use bollard::Docker;
99
use bollard::errors::Error as BollardError;
1010
use bollard::models::{
11-
ContainerCreateBody, ContainerSummary, ContainerSummaryStateEnum, HostConfig, Mount,
12-
MountTypeEnum, RestartPolicy, RestartPolicyNameEnum,
11+
ContainerCreateBody, ContainerSummary, ContainerSummaryStateEnum, DeviceRequest, HostConfig,
12+
Mount, MountTypeEnum, RestartPolicy, RestartPolicyNameEnum,
1313
};
1414
use bollard::query_parameters::{
1515
CreateContainerOptionsBuilder, CreateImageOptions, DownloadFromContainerOptionsBuilder,
1616
ListContainersOptionsBuilder, RemoveContainerOptionsBuilder, StopContainerOptionsBuilder,
1717
};
1818
use bytes::Bytes;
1919
use futures::{Stream, StreamExt};
20-
use openshell_core::config::DEFAULT_STOP_TIMEOUT_SECS;
20+
use openshell_core::config::{CDI_GPU_DEVICE_ALL, DEFAULT_STOP_TIMEOUT_SECS};
2121
use openshell_core::proto::compute::v1::{
2222
CreateSandboxRequest, CreateSandboxResponse, DeleteSandboxRequest, DeleteSandboxResponse,
2323
DriverCondition, DriverSandbox, DriverSandboxStatus, DriverSandboxTemplate,
@@ -159,6 +159,7 @@ struct DockerDriverRuntimeConfig {
159159
supervisor_bin: PathBuf,
160160
guest_tls: Option<DockerGuestTlsPaths>,
161161
daemon_version: String,
162+
supports_gpu: bool,
162163
}
163164

164165
#[derive(Clone)]
@@ -195,6 +196,12 @@ impl DockerComputeDriver {
195196
let version = docker.version().await.map_err(|err| {
196197
Error::execution(format!("failed to query Docker daemon version: {err}"))
197198
})?;
199+
let supports_gpu = docker
200+
.info()
201+
.await
202+
.ok()
203+
.and_then(|info| info.cdi_spec_dirs)
204+
.is_some_and(|dirs| !dirs.is_empty());
198205
let daemon_arch = normalize_docker_arch(version.arch.as_deref().unwrap_or_default());
199206
let supervisor_bin = resolve_supervisor_bin(&docker, docker_config, &daemon_arch).await?;
200207
let guest_tls = docker_guest_tls_paths(config, docker_config)?;
@@ -212,6 +219,7 @@ impl DockerComputeDriver {
212219
supervisor_bin,
213220
guest_tls,
214221
daemon_version: version.version.unwrap_or_else(|| "unknown".to_string()),
222+
supports_gpu,
215223
},
216224
events: broadcast::channel(WATCH_BUFFER).0,
217225
supervisor_readiness,
@@ -230,12 +238,15 @@ impl DockerComputeDriver {
230238
driver_name: "docker".to_string(),
231239
driver_version: self.config.daemon_version.clone(),
232240
default_image: self.config.default_image.clone(),
233-
supports_gpu: false,
241+
supports_gpu: self.config.supports_gpu,
234242
gpu_count: 0,
235243
}
236244
}
237245

238-
fn validate_sandbox(sandbox: &DriverSandbox) -> Result<(), Status> {
246+
fn validate_sandbox(
247+
sandbox: &DriverSandbox,
248+
config: &DockerDriverRuntimeConfig,
249+
) -> Result<(), Status> {
239250
let spec = sandbox
240251
.spec
241252
.as_ref()
@@ -250,9 +261,9 @@ impl DockerComputeDriver {
250261
"docker sandboxes require a template image",
251262
));
252263
}
253-
if spec.gpu {
264+
if spec.gpu && !config.supports_gpu {
254265
return Err(Status::failed_precondition(
255-
"docker compute driver does not support gpu sandboxes",
266+
"docker GPU sandboxes require Docker CDI support. Enable CDI on the Docker daemon, then restart the OpenShell gateway/server so GPU capability is detected.",
256267
));
257268
}
258269
if !template.agent_socket_path.trim().is_empty() {
@@ -300,7 +311,7 @@ impl DockerComputeDriver {
300311
}
301312

302313
async fn create_sandbox_inner(&self, sandbox: &DriverSandbox) -> Result<(), Status> {
303-
Self::validate_sandbox(sandbox)?;
314+
Self::validate_sandbox(sandbox, &self.config)?;
304315

305316
if self
306317
.find_managed_container_summary(&sandbox.id, &sandbox.name)
@@ -674,7 +685,7 @@ impl ComputeDriver for DockerComputeDriver {
674685
.into_inner()
675686
.sandbox
676687
.ok_or_else(|| Status::invalid_argument("sandbox is required"))?;
677-
Self::validate_sandbox(&sandbox)?;
688+
Self::validate_sandbox(&sandbox, &self.config)?;
678689
Ok(Response::new(ValidateSandboxCreateResponse {}))
679690
}
680691

@@ -876,6 +887,16 @@ fn build_environment(sandbox: &DriverSandbox, config: &DockerDriverRuntimeConfig
876887
.collect()
877888
}
878889

890+
fn docker_gpu_device_requests(gpu: bool) -> Option<Vec<DeviceRequest>> {
891+
gpu.then(|| {
892+
vec![DeviceRequest {
893+
driver: Some("cdi".to_string()),
894+
device_ids: Some(vec![CDI_GPU_DEVICE_ALL.to_string()]),
895+
..Default::default()
896+
}]
897+
})
898+
}
899+
879900
fn build_container_create_body(
880901
sandbox: &DriverSandbox,
881902
config: &DockerDriverRuntimeConfig,
@@ -917,6 +938,7 @@ fn build_container_create_body(
917938
host_config: Some(HostConfig {
918939
nano_cpus: resource_limits.nano_cpus,
919940
memory: resource_limits.memory_bytes,
941+
device_requests: docker_gpu_device_requests(spec.gpu),
920942
mounts: Some(build_mounts(config)),
921943
restart_policy: Some(RestartPolicy {
922944
name: Some(RestartPolicyNameEnum::UNLESS_STOPPED),

crates/openshell-driver-docker/src/tests.rs

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ fn runtime_config() -> DockerDriverRuntimeConfig {
5252
key: PathBuf::from("/tmp/tls.key"),
5353
}),
5454
daemon_version: "28.0.0".to_string(),
55+
supports_gpu: false,
5556
}
5657
}
5758

@@ -170,6 +171,48 @@ fn build_container_create_body_clears_inherited_cmd() {
170171
.and_then(|labels| labels.get(SANDBOX_NAMESPACE_LABEL_KEY)),
171172
Some(&"default".to_string())
172173
);
174+
assert!(
175+
create_body
176+
.host_config
177+
.as_ref()
178+
.and_then(|host_config| host_config.device_requests.as_ref())
179+
.is_none(),
180+
"non-GPU containers should not request Docker devices"
181+
);
182+
}
183+
184+
#[test]
fn validate_sandbox_rejects_gpu_when_cdi_unavailable() {
    // Ask for a GPU while the shared test config leaves `supports_gpu` unset.
    let mut gpu_sandbox = test_sandbox();
    gpu_sandbox.spec.as_mut().unwrap().gpu = true;
    let runtime = runtime_config();

    let status = DockerComputeDriver::validate_sandbox(&gpu_sandbox, &runtime).unwrap_err();

    // The rejection must be a precondition failure that points the user at CDI.
    assert_eq!(status.code(), tonic::Code::FailedPrecondition);
    assert!(status.message().contains("Docker CDI"));
}
195+
196+
#[test]
fn build_container_create_body_maps_gpu_to_all_cdi_device() {
    // Start from the shared test config, but with GPU support switched on.
    let config = DockerDriverRuntimeConfig {
        supports_gpu: true,
        ..runtime_config()
    };
    let mut sandbox = test_sandbox();
    let spec = sandbox.spec.as_mut().unwrap();
    spec.gpu = true;

    let create_body = build_container_create_body(&sandbox, &config).unwrap();

    let host_config = create_body.host_config.as_ref();
    let device_requests = host_config.and_then(|host| host.device_requests.as_ref());
    let request = device_requests
        .and_then(|requests| requests.first())
        .expect("GPU request should add a Docker device request");

    // The request must go through the CDI driver and name the "all GPUs" device.
    assert_eq!(request.driver.as_deref(), Some("cdi"));
    let device_ids = request.device_ids.as_ref().unwrap();
    assert_eq!(device_ids, &vec![CDI_GPU_DEVICE_ALL.to_string()]);
}
174217

175218
#[test]

docs/sandboxes/manage-sandboxes.mdx

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ To request GPU resources, add `--gpu`:
3939
openshell sandbox create --gpu -- claude
4040
```
4141

42+
For Docker-backed sandboxes, GPU injection uses Docker CDI. If you enable Docker
43+
CDI after the gateway starts, restart the gateway so OpenShell can detect the
44+
updated Docker daemon capability.
45+
4246
### Custom Containers
4347

4448
Use `--from` to create a sandbox from a pre-built community package, a local directory, or a container image:

e2e/rust/e2e-docker.sh

Lines changed: 26 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
33
# SPDX-License-Identifier: Apache-2.0
44

5-
# Run the Rust e2e smoke test against a standalone gateway running the
5+
# Run a Rust e2e test against a standalone gateway running the
66
# bundled Docker compute driver.
77
#
88
# Unlike the Kubernetes driver (which deploys a k3s cluster) or the VM
@@ -20,10 +20,12 @@
2020
# 4. Starts openshell-gateway with --drivers=docker, binding to a
2121
# random free host port.
2222
# 5. Installs the client cert into the CLI gateway config dir and
23-
# runs the Rust smoke test.
23+
# runs the selected Rust e2e test.
2424
# 6. Tears the gateway process down on exit.
2525
#
26-
# Usage: mise run e2e:docker
26+
# Usage:
27+
# mise run e2e:docker
28+
# mise run e2e:docker:gpu
2729

2830
set -euo pipefail
2931

@@ -35,6 +37,8 @@ STATE_DIR=""
3537
GATEWAY_CONFIG_DIR=""
3638
GATEWAY_PID=""
3739
GATEWAY_LOG="${WORKDIR}/gateway.log"
40+
E2E_TEST="${OPENSHELL_E2E_DOCKER_TEST:-smoke}"
41+
GPU_MODE="${OPENSHELL_E2E_DOCKER_GPU:-0}"
3842
# Unique sandbox namespace for this test run. Set just before the gateway
3943
# is started so cleanup can filter Docker containers strictly to ones
4044
# this run created, even when other OpenShell sandboxes are present on
@@ -119,6 +123,17 @@ if ! command -v openssl >/dev/null 2>&1; then
119123
echo "ERROR: openssl is required to generate ephemeral PKI" >&2
120124
exit 2
121125
fi
126+
# GPU lane preflight: refuse to start unless the Docker daemon reports at
# least one CDI spec directory. A failing `docker info` is treated the same
# as an empty answer.
if [ "${GPU_MODE}" = "1" ]; then
  DOCKER_CDI_SPEC_DIRS="$(docker info --format '{{json .CDISpecDirs}}' 2>/dev/null || true)"
  case "${DOCKER_CDI_SPEC_DIRS}" in
    "" | "null" | "[]" | "<no value>")
      echo "ERROR: e2e:docker:gpu requires Docker CDI support." >&2
      echo "       Generate CDI specs and restart Docker, then verify docker info reports CDISpecDirs." >&2
      exit 2
      ;;
  esac
fi
122137

123138
normalize_arch() {
124139
case "$1" in
@@ -186,8 +201,10 @@ chmod +x "${SUPERVISOR_BIN}"
186201
# in the image. Use the community sandbox base image (also what real
187202
# deployments default to). Callers can override with
188203
# OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE if they have a smaller local image
189-
# with the required 'sandbox' user.
190-
SANDBOX_IMAGE="${OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE:-ghcr.io/nvidia/openshell-community/sandboxes/base:latest}"
204+
# with the required 'sandbox' user. CDI injects the NVIDIA userspace
205+
# stack at runtime, so the GPU lane uses the same base image.
206+
DEFAULT_SANDBOX_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base:latest"
207+
SANDBOX_IMAGE="${OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE:-${DEFAULT_SANDBOX_IMAGE}}"
191208
if ! docker image inspect "${SANDBOX_IMAGE}" >/dev/null 2>&1; then
192209
echo "Pulling ${SANDBOX_IMAGE}..."
193210
docker pull "${SANDBOX_IMAGE}"
@@ -313,8 +330,8 @@ if [ "${elapsed}" -ge "${timeout}" ]; then
313330
exit 1
314331
fi
315332

316-
# ── Run the smoke test ───────────────────────────────────────────────
317-
echo "Running e2e smoke test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${CLI_GATEWAY_ENDPOINT})..."
318-
cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test smoke -- --nocapture
333+
# ── Run the selected test ────────────────────────────────────────────
334+
echo "Running e2e ${E2E_TEST} test (gateway: ${OPENSHELL_GATEWAY}, endpoint: ${CLI_GATEWAY_ENDPOINT})..."
335+
cargo test --manifest-path e2e/rust/Cargo.toml --features e2e --test "${E2E_TEST}" -- --nocapture
319336

320-
echo "Smoke test passed."
337+
echo "${E2E_TEST} test passed."

e2e/rust/tests/docker_gpu.rs

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
// SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
// SPDX-License-Identifier: Apache-2.0
3+
4+
#![cfg(feature = "e2e")]
5+
6+
//! Docker GPU e2e test.
7+
//!
8+
//! Requires a Docker-backed gateway started with Docker CDI support. The
9+
//! `e2e:docker:gpu` mise task starts that gateway with the default sandbox image
10+
//! unless OPENSHELL_E2E_DOCKER_SANDBOX_IMAGE is set.
11+
12+
use openshell_e2e::harness::output::strip_ansi;
13+
use openshell_e2e::harness::sandbox::SandboxGuard;
14+
15+
#[tokio::test]
16+
async fn docker_gpu_sandbox_runs_nvidia_smi() {
17+
let mut guard = SandboxGuard::create(&[
18+
"--gpu",
19+
"--",
20+
"sh",
21+
"-lc",
22+
"gpu_name=$(nvidia-smi --query-gpu=name --format=csv,noheader | head -n 1); \
23+
test -n \"$gpu_name\"; \
24+
printf 'gpu-ok:%s\n' \"$gpu_name\"",
25+
])
26+
.await
27+
.expect("GPU sandbox create should succeed");
28+
29+
let output = strip_ansi(&guard.create_output);
30+
assert!(
31+
output.contains("gpu-ok:"),
32+
"expected GPU smoke marker in sandbox output:\n{output}"
33+
);
34+
35+
guard.cleanup().await;
36+
}

tasks/test.toml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,3 +60,8 @@ run = "e2e/rust/e2e-vm.sh"
6060
["e2e:docker"]
6161
description = "Run smoke e2e against a standalone gateway with the Docker compute driver"
6262
run = "e2e/rust/e2e-docker.sh"
63+
64+
["e2e:docker:gpu"]
65+
description = "Run GPU e2e against a standalone gateway with the Docker compute driver"
66+
env = { OPENSHELL_E2E_DOCKER_GPU = "1", OPENSHELL_E2E_DOCKER_TEST = "docker_gpu" }
67+
run = "e2e/rust/e2e-docker.sh"

0 commit comments

Comments
 (0)