Skip to content

Commit d73860f

Browse files
authored
feat(driver-kubernetes): sideload supervisor binary via init container (#1154)
* feat: Update kubernetes driver to side-load supervisor binary
* Update helm chart with supervisor image config
* chore(driver-kubernetes): fix rustfmt formatting
1 parent 4803889 commit d73860f

6 files changed

Lines changed: 188 additions & 72 deletions

File tree

crates/openshell-driver-kubernetes/src/config.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,13 @@ pub struct KubernetesComputeConfig {
66
pub namespace: String,
77
pub default_image: String,
88
pub image_pull_policy: String,
9+
/// Image that provides the `openshell-sandbox` supervisor binary.
10+
/// An init container copies the binary from this image into a shared
11+
/// emptyDir volume before the sandbox container starts.
12+
pub supervisor_image: String,
13+
/// Kubernetes `imagePullPolicy` for the supervisor init container.
14+
/// Empty string delegates to the Kubernetes default.
15+
pub supervisor_image_pull_policy: String,
916
pub grpc_endpoint: String,
1017
pub ssh_socket_path: String,
1118
pub ssh_handshake_secret: String,

crates/openshell-driver-kubernetes/src/driver.rs

Lines changed: 118 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,8 @@ impl KubernetesComputeDriver {
312312
sandbox.spec.as_ref(),
313313
&self.config.default_image,
314314
&self.config.image_pull_policy,
315+
&self.config.supervisor_image,
316+
&self.config.supervisor_image_pull_policy,
315317
&sandbox.id,
316318
&sandbox.name,
317319
&self.config.grpc_endpoint,
@@ -657,27 +659,21 @@ fn map_kube_event_to_platform(
657659
}
658660

659661
/// Path where the supervisor binary is mounted inside the agent container.
660-
/// The supervisor is always side-loaded from the k3s node filesystem via a
661-
/// read-only hostPath volume — it is never baked into sandbox images.
662662
const SUPERVISOR_MOUNT_PATH: &str = "/opt/openshell/bin";
663663

664664
/// Name of the volume used to side-load the supervisor binary.
665665
const SUPERVISOR_VOLUME_NAME: &str = "openshell-supervisor-bin";
666666

667-
/// Path on the k3s node filesystem where the supervisor binary lives.
668-
/// This is baked into the cluster image at build time and can be updated
669-
/// via `docker cp` during local development.
670-
const SUPERVISOR_HOST_PATH: &str = "/opt/openshell/bin";
667+
/// Name of the init container that installs the supervisor binary.
668+
const SUPERVISOR_INIT_CONTAINER_NAME: &str = "openshell-supervisor-install";
671669

672-
/// Build the hostPath volume definition that exposes the supervisor binary
673-
/// from the k3s node filesystem.
670+
/// Build the emptyDir volume that holds the supervisor binary.
671+
///
672+
/// The init container writes the binary here; the agent container reads it.
674673
fn supervisor_volume() -> serde_json::Value {
675674
serde_json::json!({
676675
"name": SUPERVISOR_VOLUME_NAME,
677-
"hostPath": {
678-
"path": SUPERVISOR_HOST_PATH,
679-
"type": "DirectoryOrCreate"
680-
}
676+
"emptyDir": {}
681677
})
682678
}
683679

@@ -690,26 +686,61 @@ fn supervisor_volume_mount() -> serde_json::Value {
690686
})
691687
}
692688

693-
/// Apply supervisor side-load transforms to an already-built pod template JSON.
689+
/// Build the init container that copies the supervisor binary into the emptyDir.
694690
///
695-
/// This injects the hostPath volume, volume mount, command override, and
696-
/// `runAsUser: 0` into the pod template, targeting the `agent` container
697-
/// (or the first container if no `agent` is found).
691+
/// The supervisor image is expected to have `openshell-sandbox` on its PATH
692+
/// (e.g. at `/usr/local/bin/openshell-sandbox`). The init container resolves
693+
/// the binary via `command -v` and copies it into the shared emptyDir volume
694+
/// so the agent container can execute it from a fixed, writable path.
695+
fn supervisor_init_container(
696+
supervisor_image: &str,
697+
supervisor_image_pull_policy: &str,
698+
) -> serde_json::Value {
699+
let copy_cmd = format!(
700+
"set -e && \
701+
mkdir -p {SUPERVISOR_MOUNT_PATH} && \
702+
SUPERVISOR=$(command -v openshell-sandbox) && \
703+
cp \"$SUPERVISOR\" {SUPERVISOR_MOUNT_PATH}/openshell-sandbox && \
704+
chmod +x {SUPERVISOR_MOUNT_PATH}/openshell-sandbox"
705+
);
706+
let mut spec = serde_json::json!({
707+
"name": SUPERVISOR_INIT_CONTAINER_NAME,
708+
"image": supervisor_image,
709+
"command": ["sh", "-c", copy_cmd],
710+
"securityContext": {"runAsUser": 0},
711+
"volumeMounts": [{
712+
"name": SUPERVISOR_VOLUME_NAME,
713+
"mountPath": SUPERVISOR_MOUNT_PATH,
714+
"readOnly": false
715+
}]
716+
});
717+
if !supervisor_image_pull_policy.is_empty() {
718+
spec["imagePullPolicy"] = serde_json::json!(supervisor_image_pull_policy);
719+
}
720+
spec
721+
}
722+
723+
/// Apply supervisor side-load transforms to an already-built pod template JSON.
698724
///
699-
/// The supervisor binary is always side-loaded from the k3s node filesystem
700-
/// via a read-only hostPath volume. No init container is needed.
725+
/// Injects an emptyDir volume, an init container that copies the supervisor
726+
/// binary from the supervisor image into that volume, and a read-only volume
727+
/// mount + command override on the agent container.
701728
///
702729
/// The `runAsUser: 0` override ensures the supervisor binary runs as root
703730
/// regardless of the image's `USER` directive. The supervisor needs root for
704731
/// network namespace creation, proxy setup, and Landlock/seccomp configuration.
705732
/// It drops to the appropriate non-root user for child processes via the
706733
/// policy's `run_as_user`/`run_as_group`.
707-
fn apply_supervisor_sideload(pod_template: &mut serde_json::Value) {
734+
fn apply_supervisor_sideload(
735+
pod_template: &mut serde_json::Value,
736+
supervisor_image: &str,
737+
supervisor_image_pull_policy: &str,
738+
) {
708739
let Some(spec) = pod_template.get_mut("spec").and_then(|v| v.as_object_mut()) else {
709740
return;
710741
};
711742

712-
// 1. Add the hostPath volume to spec.volumes
743+
// 1. Add the emptyDir volume to spec.volumes
713744
let volumes = spec
714745
.entry("volumes")
715746
.or_insert_with(|| serde_json::json!([]))
@@ -718,7 +749,19 @@ fn apply_supervisor_sideload(pod_template: &mut serde_json::Value) {
718749
volumes.push(supervisor_volume());
719750
}
720751

721-
// 2. Find the agent container and add volume mount + command override
752+
// 2. Add the init container that copies the binary into the emptyDir
753+
let init_containers = spec
754+
.entry("initContainers")
755+
.or_insert_with(|| serde_json::json!([]))
756+
.as_array_mut();
757+
if let Some(init_containers) = init_containers {
758+
init_containers.push(supervisor_init_container(
759+
supervisor_image,
760+
supervisor_image_pull_policy,
761+
));
762+
}
763+
764+
// 3. Find the agent container and add volume mount + command override
722765
let Some(containers) = spec.get_mut("containers").and_then(|v| v.as_array_mut()) else {
723766
return;
724767
};
@@ -881,6 +924,8 @@ fn sandbox_to_k8s_spec(
881924
spec: Option<&SandboxSpec>,
882925
default_image: &str,
883926
image_pull_policy: &str,
927+
supervisor_image: &str,
928+
supervisor_image_pull_policy: &str,
884929
sandbox_id: &str,
885930
sandbox_name: &str,
886931
grpc_endpoint: &str,
@@ -921,6 +966,8 @@ fn sandbox_to_k8s_spec(
921966
spec.gpu,
922967
default_image,
923968
image_pull_policy,
969+
supervisor_image,
970+
supervisor_image_pull_policy,
924971
sandbox_id,
925972
sandbox_name,
926973
grpc_endpoint,
@@ -967,6 +1014,8 @@ fn sandbox_to_k8s_spec(
9671014
spec.as_ref().is_some_and(|s| s.gpu),
9681015
default_image,
9691016
image_pull_policy,
1017+
supervisor_image,
1018+
supervisor_image_pull_policy,
9701019
sandbox_id,
9711020
sandbox_name,
9721021
grpc_endpoint,
@@ -992,6 +1041,8 @@ fn sandbox_template_to_k8s(
9921041
gpu: bool,
9931042
default_image: &str,
9941043
image_pull_policy: &str,
1044+
supervisor_image: &str,
1045+
supervisor_image_pull_policy: &str,
9951046
sandbox_id: &str,
9961047
sandbox_name: &str,
9971048
grpc_endpoint: &str,
@@ -1003,9 +1054,6 @@ fn sandbox_template_to_k8s(
10031054
host_gateway_ip: &str,
10041055
inject_workspace: bool,
10051056
) -> serde_json::Value {
1006-
// The supervisor binary is always side-loaded from the node filesystem
1007-
// via a hostPath volume, regardless of which sandbox image is used.
1008-
10091057
let mut metadata = serde_json::Map::new();
10101058
if !template.labels.is_empty() {
10111059
metadata.insert("labels".to_string(), serde_json::json!(template.labels));
@@ -1122,8 +1170,9 @@ fn sandbox_template_to_k8s(
11221170

11231171
let mut result = serde_json::Value::Object(template_value);
11241172

1125-
// Always side-load the supervisor binary from the node filesystem
1126-
apply_supervisor_sideload(&mut result);
1173+
// Side-load the supervisor binary via an init container that copies it
1174+
// from the supervisor image into a shared emptyDir volume.
1175+
apply_supervisor_sideload(&mut result, supervisor_image, supervisor_image_pull_policy);
11271176

11281177
// Inject workspace persistence (init container + PVC volume mount) so
11291178
// that /sandbox data survives pod rescheduling. Skipped when the user
@@ -1437,7 +1486,7 @@ mod tests {
14371486
}
14381487
});
14391488

1440-
apply_supervisor_sideload(&mut pod_template);
1489+
apply_supervisor_sideload(&mut pod_template, "custom-image:latest", "IfNotPresent");
14411490

14421491
let sc = &pod_template["spec"]["containers"][0]["securityContext"];
14431492
assert_eq!(sc["runAsUser"], 0, "runAsUser must be 0 for supervisor");
@@ -1461,7 +1510,7 @@ mod tests {
14611510
}
14621511
});
14631512

1464-
apply_supervisor_sideload(&mut pod_template);
1513+
apply_supervisor_sideload(&mut pod_template, "supervisor-image:latest", "IfNotPresent");
14651514

14661515
let sc = &pod_template["spec"]["containers"][0]["securityContext"];
14671516
assert_eq!(
@@ -1471,7 +1520,7 @@ mod tests {
14711520
}
14721521

14731522
#[test]
1474-
fn supervisor_sideload_injects_hostpath_volume_and_mount() {
1523+
fn supervisor_sideload_injects_emptydir_volume_init_container_and_mount() {
14751524
let mut pod_template = serde_json::json!({
14761525
"spec": {
14771526
"containers": [{
@@ -1481,24 +1530,29 @@ mod tests {
14811530
}
14821531
});
14831532

1484-
apply_supervisor_sideload(&mut pod_template);
1485-
1486-
// No init containers should be present (hostPath, not emptyDir+init)
1487-
assert!(
1488-
pod_template["spec"]["initContainers"].is_null(),
1489-
"hostPath sideload should not create init containers"
1490-
);
1533+
apply_supervisor_sideload(&mut pod_template, "supervisor-image:latest", "IfNotPresent");
14911534

1492-
// Volume should be a hostPath volume
1535+
// Volume should be an emptyDir
14931536
let volumes = pod_template["spec"]["volumes"]
14941537
.as_array()
14951538
.expect("volumes should exist");
14961539
assert_eq!(volumes.len(), 1);
14971540
assert_eq!(volumes[0]["name"], SUPERVISOR_VOLUME_NAME);
1498-
assert_eq!(volumes[0]["hostPath"]["path"], SUPERVISOR_HOST_PATH);
1499-
assert_eq!(volumes[0]["hostPath"]["type"], "DirectoryOrCreate");
1541+
assert!(
1542+
volumes[0]["emptyDir"].is_object(),
1543+
"volume should be emptyDir, not hostPath"
1544+
);
1545+
1546+
// Init container should use the supervisor image, not the sandbox image
1547+
let init_containers = pod_template["spec"]["initContainers"]
1548+
.as_array()
1549+
.expect("initContainers should exist");
1550+
assert_eq!(init_containers.len(), 1);
1551+
assert_eq!(init_containers[0]["name"], SUPERVISOR_INIT_CONTAINER_NAME);
1552+
assert_eq!(init_containers[0]["image"], "supervisor-image:latest");
1553+
assert_eq!(init_containers[0]["imagePullPolicy"], "IfNotPresent");
15001554

1501-
// Agent container command should be overridden
1555+
// Agent container command should be overridden to the emptyDir path
15021556
let command = pod_template["spec"]["containers"][0]["command"]
15031557
.as_array()
15041558
.expect("command should be set");
@@ -1507,7 +1561,7 @@ mod tests {
15071561
format!("{SUPERVISOR_MOUNT_PATH}/openshell-sandbox")
15081562
);
15091563

1510-
// Volume mount should be read-only
1564+
// Agent volume mount should be read-only
15111565
let mounts = pod_template["spec"]["containers"][0]["volumeMounts"]
15121566
.as_array()
15131567
.expect("volumeMounts should exist");
@@ -1572,6 +1626,8 @@ mod tests {
15721626
true,
15731627
"openshell/sandbox:latest",
15741628
"",
1629+
"openshell/supervisor:latest",
1630+
"",
15751631
"sandbox-id",
15761632
"sandbox-name",
15771633
"https://gateway.example.com",
@@ -1614,6 +1670,8 @@ mod tests {
16141670
true,
16151671
"openshell/sandbox:latest",
16161672
"",
1673+
"openshell/supervisor:latest",
1674+
"",
16171675
"sandbox-id",
16181676
"sandbox-name",
16191677
"https://gateway.example.com",
@@ -1652,6 +1710,8 @@ mod tests {
16521710
false,
16531711
"openshell/sandbox:latest",
16541712
"",
1713+
"openshell/supervisor:latest",
1714+
"",
16551715
"sandbox-id",
16561716
"sandbox-name",
16571717
"https://gateway.example.com",
@@ -1686,6 +1746,8 @@ mod tests {
16861746
true,
16871747
"openshell/sandbox:latest",
16881748
"",
1749+
"openshell/supervisor:latest",
1750+
"",
16891751
"sandbox-id",
16901752
"sandbox-name",
16911753
"https://gateway.example.com",
@@ -1713,6 +1775,8 @@ mod tests {
17131775
false,
17141776
"openshell/sandbox:latest",
17151777
"",
1778+
"openshell/supervisor:latest",
1779+
"",
17161780
"sandbox-id",
17171781
"sandbox-name",
17181782
"https://gateway.example.com",
@@ -1744,6 +1808,8 @@ mod tests {
17441808
false,
17451809
"openshell/sandbox:latest",
17461810
"",
1811+
"openshell/supervisor:latest",
1812+
"",
17471813
"sandbox-id",
17481814
"sandbox-name",
17491815
"https://gateway.example.com",
@@ -1770,6 +1836,8 @@ mod tests {
17701836
false,
17711837
"openshell/sandbox:latest",
17721838
"",
1839+
"openshell/supervisor:latest",
1840+
"",
17731841
"sandbox-id",
17741842
"sandbox-name",
17751843
"https://gateway.example.com",
@@ -1913,6 +1981,8 @@ mod tests {
19131981
false,
19141982
"openshell/sandbox:latest",
19151983
"",
1984+
"openshell/supervisor:latest",
1985+
"",
19161986
"sandbox-id",
19171987
"sandbox-name",
19181988
"https://gateway.example.com",
@@ -1925,12 +1995,14 @@ mod tests {
19251995
false, // user provided custom VCTs
19261996
);
19271997

1928-
// No init container should be present
1998+
// Only the supervisor init container should be present — no workspace init container
1999+
let init_containers = pod_template["spec"]["initContainers"]
2000+
.as_array()
2001+
.expect("supervisor init container should always be present");
19292002
assert!(
1930-
pod_template["spec"]["initContainers"].is_null()
1931-
|| pod_template["spec"]["initContainers"]
1932-
.as_array()
1933-
.is_none_or(Vec::is_empty),
2003+
!init_containers
2004+
.iter()
2005+
.any(|c| c["name"] == WORKSPACE_INIT_CONTAINER_NAME),
19342006
"workspace init container must NOT be present when inject_workspace is false"
19352007
);
19362008

crates/openshell-driver-kubernetes/src/main.rs

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,12 @@ struct Args {
5757

5858
#[arg(long, env = "OPENSHELL_HOST_GATEWAY_IP")]
5959
host_gateway_ip: Option<String>,
60+
61+
#[arg(long, env = "OPENSHELL_SUPERVISOR_IMAGE")]
62+
supervisor_image: Option<String>,
63+
64+
#[arg(long, env = "OPENSHELL_SUPERVISOR_IMAGE_PULL_POLICY")]
65+
supervisor_image_pull_policy: Option<String>,
6066
}
6167

6268
#[tokio::main]
@@ -72,6 +78,10 @@ async fn main() -> Result<()> {
7278
namespace: args.sandbox_namespace,
7379
default_image: args.sandbox_image.unwrap_or_default(),
7480
image_pull_policy: args.sandbox_image_pull_policy.unwrap_or_default(),
81+
supervisor_image: args
82+
.supervisor_image
83+
.unwrap_or_else(|| openshell_core::config::DEFAULT_SUPERVISOR_IMAGE.to_string()),
84+
supervisor_image_pull_policy: args.supervisor_image_pull_policy.unwrap_or_default(),
7585
grpc_endpoint: args.grpc_endpoint.unwrap_or_default(),
7686
ssh_socket_path: args.sandbox_ssh_socket_path,
7787
ssh_handshake_secret: args.ssh_handshake_secret,

0 commit comments

Comments
 (0)