diff --git a/.github/workflows/branch-e2e.yml b/.github/workflows/branch-e2e.yml index 23d7d1cf6..ebe783406 100644 --- a/.github/workflows/branch-e2e.yml +++ b/.github/workflows/branch-e2e.yml @@ -115,12 +115,22 @@ jobs: kubernetes-e2e: needs: [pr_metadata, build-gateway, build-supervisor] if: needs.pr_metadata.outputs.should_run == 'true' && needs.pr_metadata.outputs.run_core_e2e == 'true' + strategy: + fail-fast: false + matrix: + include: + - agent_sandbox_api: v1beta1 + agent_sandbox_version: v0.5.0 + - agent_sandbox_api: v1alpha1 + agent_sandbox_version: v0.4.6 permissions: contents: read packages: read uses: ./.github/workflows/e2e-kubernetes-test.yml with: image-tag: ${{ github.sha }} + job-name: Kubernetes E2E (Rust smoke, Agent Sandbox ${{ matrix.agent_sandbox_api }}) + agent-sandbox-version: ${{ matrix.agent_sandbox_version }} kubernetes-ha-e2e: needs: [pr_metadata, build-gateway, build-supervisor] diff --git a/.github/workflows/e2e-kubernetes-test.yml b/.github/workflows/e2e-kubernetes-test.yml index 5ff375922..f8c042b2a 100644 --- a/.github/workflows/e2e-kubernetes-test.yml +++ b/.github/workflows/e2e-kubernetes-test.yml @@ -32,6 +32,11 @@ on: required: false type: string default: "" + agent-sandbox-version: + description: "Agent Sandbox release to install before OpenShell" + required: false + type: string + default: "v0.5.0" mise-version: description: "mise version to install on the bare Kubernetes e2e runner" required: false @@ -114,6 +119,7 @@ jobs: - name: Run Kubernetes E2E (Rust smoke) env: + AGENT_SANDBOX_VERSION: ${{ inputs.agent-sandbox-version }} OPENSHELL_E2E_KUBE_CONTEXT: kind-${{ env.KIND_CLUSTER_NAME }} OPENSHELL_E2E_KUBE_EXTRA_VALUES: ${{ inputs.extra-helm-values }} OPENSHELL_E2E_KUBE_EXTERNAL_POSTGRES_SECRET: ${{ inputs.external-postgres-secret }} diff --git a/architecture/gateway.md b/architecture/gateway.md index 979422d7e..2e0256249 100644 --- a/architecture/gateway.md +++ b/architecture/gateway.md @@ -66,10 +66,13 @@ token through `IssueSandboxToken`. The gateway validates that projected token with Kubernetes `TokenReview`, requires the configured sandbox service account, checks the returned pod binding against the live pod UID, and verifies the pod's controlling `Sandbox` ownerReference against the live Sandbox CR UID and -sandbox-id label before minting the gateway JWT. Supervisors renew gateway JWTs -in memory before expiry only while the sandbox record still exists. Older tokens -are not server-revoked; shared deployments bound replay exposure with short -`gateway_jwt.ttl_secs` lifetimes. The config default is +sandbox-id label before minting the gateway JWT. The bootstrap path accepts +both `agents.x-k8s.io/v1beta1` ownerReferences from newer Agent Sandbox +controllers and `agents.x-k8s.io/v1alpha1` ownerReferences from existing +deployments. Supervisors renew gateway JWTs in memory before expiry only while +the sandbox record still exists. Older tokens are not server-revoked; shared +deployments bound replay exposure with short `gateway_jwt.ttl_secs` lifetimes. +The config default is `gateway_jwt.ttl_secs = 0` for local single-player Docker, Podman, and VM gateways; those tokens carry `exp = 0` and do not expire. Kubernetes and other shared deployments should set a positive TTL. diff --git a/crates/openshell-driver-kubernetes/README.md b/crates/openshell-driver-kubernetes/README.md index 3cdb9fa57..831e4edf2 100644 --- a/crates/openshell-driver-kubernetes/README.md +++ b/crates/openshell-driver-kubernetes/README.md @@ -15,9 +15,13 @@ credential injection, policy polling, logs, and the gateway relay. ## Sandbox Resource -The driver works with the `agents.x-k8s.io/v1alpha1` `Sandbox` custom resource. -Driver events map Kubernetes object state and platform events into the shared -compute-driver protobuf surface used by the gateway. +The driver works with the `agents.x-k8s.io` `Sandbox` custom resource. It +detects the served Sandbox API at runtime, caches the selected API version for +the gateway process, and uses `v1beta1` when available before falling back to +`v1alpha1`. Restart the gateway after an in-place Agent Sandbox upgrade so the +driver can detect served API versions again. Driver events map Kubernetes object +state and platform events into the shared compute-driver protobuf surface used +by the gateway. Kubernetes API calls use explicit timeouts so gRPC handlers do not block indefinitely when the API server is slow or unavailable. diff --git a/crates/openshell-driver-kubernetes/src/driver.rs b/crates/openshell-driver-kubernetes/src/driver.rs index a32034f3a..909568302 100644 --- a/crates/openshell-driver-kubernetes/src/driver.rs +++ b/crates/openshell-driver-kubernetes/src/driver.rs @@ -34,8 +34,9 @@ use openshell_core::proto_struct::{struct_to_json_object, value_to_json}; use serde::Deserialize; use std::collections::BTreeMap; use std::pin::Pin; +use std::sync::Arc; use std::time::Duration; -use tokio::sync::mpsc; +use tokio::sync::{OnceCell, mpsc}; use tokio_stream::wrappers::ReceiverStream; use tracing::{debug, info, warn}; @@ -80,12 +81,19 @@ impl From for openshell_core::ComputeDriverError { const KUBE_API_TIMEOUT: Duration = Duration::from_secs(30); const SANDBOX_GROUP: &str = "agents.x-k8s.io"; -const SANDBOX_VERSION: &str = "v1alpha1"; +const SANDBOX_VERSION_V1BETA1: &str = "v1beta1"; +const SANDBOX_VERSION_V1ALPHA1: &str = "v1alpha1"; +const SANDBOX_VERSIONS: &[&str] = &[SANDBOX_VERSION_V1BETA1, SANDBOX_VERSION_V1ALPHA1]; pub const SANDBOX_KIND: &str = "Sandbox"; const GPU_RESOURCE_NAME: &str = "nvidia.com/gpu"; const SPIFFE_WORKLOAD_API_VOLUME_NAME: &str = "spiffe-workload-api"; +struct AgentSandboxApi { + api: Api, + resource: ApiResource, +} + // This POC treats the selected Struct as a driver-local typed schema. Once the // Kubernetes shape stabilizes, these serde structs may move to driver-local // protobuf definitions, but the typed decode should stay inside this driver. @@ -190,6 +198,7 @@ const WORKSPACE_SENTINEL: &str = ".workspace-initialized"; pub struct KubernetesComputeDriver { client: Client, watch_client: Client, + sandbox_api_version: Arc>, config: KubernetesComputeConfig, } @@ -232,6 +241,7 @@ impl KubernetesComputeDriver { Ok(Self { client, watch_client, + sandbox_api_version: Arc::new(OnceCell::new()), config, }) } @@ -256,16 +266,68 @@ impl KubernetesComputeDriver { &self.config.ssh_socket_path } - fn watch_api(&self) -> Api { - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, SANDBOX_VERSION, SANDBOX_KIND); + fn agent_sandbox_api(&self, client: Client, sandbox_api_version: &str) -> AgentSandboxApi { + let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, sandbox_api_version, SANDBOX_KIND); let resource = ApiResource::from_gvk(&gvk); - Api::namespaced_with(self.watch_client.clone(), &self.config.namespace, &resource) + let api = Api::namespaced_with(client, &self.config.namespace, &resource); + AgentSandboxApi { api, resource } } - fn api(&self) -> Api { - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, SANDBOX_VERSION, SANDBOX_KIND); - let resource = ApiResource::from_gvk(&gvk); - Api::namespaced_with(self.client.clone(), &self.config.namespace, &resource) + async fn supported_agent_sandbox_api(&self, client: Client) -> Result { + let sandbox_api_version = self.supported_sandbox_api_version(client.clone()).await?; + Ok(self.agent_sandbox_api(client, sandbox_api_version)) + } + + async fn supported_sandbox_api_version(&self, client: Client) -> Result<&'static str, String> { + self.sandbox_api_version + .get_or_try_init( + || async move { self.detect_supported_sandbox_api_version(client).await }, + ) + .await + .copied() + } + + async fn detect_supported_sandbox_api_version( + &self, + client: Client, + ) -> Result<&'static str, String> { + for sandbox_api_version in SANDBOX_VERSIONS { + let agent_sandbox_api = self.agent_sandbox_api(client.clone(), sandbox_api_version); + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.list(&ListParams::default().limit(1)), + ) + .await + { + Ok(Ok(_)) => { + debug!( + namespace = %self.config.namespace, + sandbox_api_version = %sandbox_api_version, + "Selected Agent Sandbox API version" + ); + return Ok(sandbox_api_version); + } + Ok(Err(err)) if should_try_next_sandbox_api_version(&err) => { + debug!( + namespace = %self.config.namespace, + sandbox_api_version = %sandbox_api_version, + error = %err, + "Sandbox API version is not available; trying next supported version" + ); + } + Ok(Err(err)) => return Err(err.to_string()), + Err(_elapsed) => { + return Err(format!( + "timed out after {}s waiting for Kubernetes API", + KUBE_API_TIMEOUT.as_secs() + )); + } + } + } + Err(format!( + "no supported Agent Sandbox API version is available; tried {}", + SANDBOX_VERSIONS.join(", ") + )) } async fn has_gpu_capacity(&self) -> Result { @@ -306,8 +368,10 @@ impl KubernetesComputeDriver { "Fetching sandbox from Kubernetes" ); - let api = self.api(); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) + .await?; + match tokio::time::timeout(KUBE_API_TIMEOUT, agent_sandbox_api.api.get(name)).await { Ok(Ok(obj)) => sandbox_from_object(&self.config.namespace, obj).map(Some), Ok(Err(KubeError::Api(err))) if err.code == 404 => { debug!(sandbox_name = %name, "Sandbox not found in Kubernetes"); @@ -341,8 +405,15 @@ impl KubernetesComputeDriver { "Listing sandboxes from Kubernetes" ); - let api = self.api(); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.list(&ListParams::default())).await { + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) + .await?; + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.list(&ListParams::default()), + ) + .await + { Ok(Ok(list)) => { let mut sandboxes = list .items @@ -396,9 +467,11 @@ impl KubernetesComputeDriver { "Creating sandbox in Kubernetes" ); - let gvk = GroupVersionKind::gvk(SANDBOX_GROUP, SANDBOX_VERSION, SANDBOX_KIND); - let resource = ApiResource::from_gvk(&gvk); - let mut obj = DynamicObject::new(name, &resource); + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) + .await + .map_err(KubernetesDriverError::Message)?; + let mut obj = DynamicObject::new(name, &agent_sandbox_api.resource); obj.metadata = ObjectMeta { name: Some(name.to_string()), namespace: Some(self.config.namespace.clone()), @@ -430,9 +503,11 @@ impl KubernetesComputeDriver { .provider_spiffe_workload_api_socket_path, }; obj.data = sandbox_to_k8s_spec(sandbox.spec.as_ref(), ¶ms); - let api = self.api(); - - match tokio::time::timeout(KUBE_API_TIMEOUT, api.create(&PostParams::default(), &obj)).await + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.create(&PostParams::default(), &obj), + ) + .await { Ok(Ok(_result)) => { info!( @@ -473,9 +548,14 @@ impl KubernetesComputeDriver { "Deleting sandbox from Kubernetes" ); - let api = self.api(); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.delete(name, &DeleteParams::default())) - .await + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) + .await?; + match tokio::time::timeout( + KUBE_API_TIMEOUT, + agent_sandbox_api.api.delete(name, &DeleteParams::default()), + ) + .await { Ok(Ok(_response)) => { info!(sandbox_name = %name, "Sandbox deleted from Kubernetes"); @@ -508,8 +588,10 @@ impl KubernetesComputeDriver { } pub async fn sandbox_exists(&self, name: &str) -> Result { - let api = self.api(); - match tokio::time::timeout(KUBE_API_TIMEOUT, api.get(name)).await { + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.client.clone()) + .await?; + match tokio::time::timeout(KUBE_API_TIMEOUT, agent_sandbox_api.api.get(name)).await { Ok(Ok(_)) => Ok(true), Ok(Err(KubeError::Api(err))) if err.code == 404 => Ok(false), Ok(Err(err)) => Err(err.to_string()), @@ -524,9 +606,12 @@ impl KubernetesComputeDriver { #[allow(clippy::unused_async)] pub async fn watch_sandboxes(&self) -> Result { let namespace = self.config.namespace.clone(); - let sandbox_api = self.watch_api(); + let agent_sandbox_api = self + .supported_agent_sandbox_api(self.watch_client.clone()) + .await?; let event_api: Api = Api::namespaced(self.watch_client.clone(), &namespace); - let mut sandbox_stream = watcher::watcher(sandbox_api, watcher::Config::default()).boxed(); + let mut sandbox_stream = + watcher::watcher(agent_sandbox_api.api, watcher::Config::default()).boxed(); let mut event_stream = watcher::watcher(event_api, watcher::Config::default()).boxed(); let (tx, rx) = mpsc::channel(256); @@ -650,6 +735,14 @@ impl KubernetesComputeDriver { } } +fn should_try_next_sandbox_api_version(err: &KubeError) -> bool { + // Kubernetes returns a structured 404 for some missing API resources and a + // raw "404 page not found" body for others. Both mean the probed + // group/version is unavailable and the next supported Sandbox API version + // should be tried. + matches!(err, KubeError::Api(api) if api.code == 404) +} + fn validate_gpu_request( gpu_requirements: Option<&GpuResourceRequirements>, ) -> Result<(), tonic::Status> { @@ -2013,6 +2106,34 @@ mod tests { } } + fn kube_api_error(code: u16, message: &str) -> KubeError { + KubeError::Api(kube::core::ErrorResponse { + status: if code == 404 { + "404 Not Found".to_string() + } else { + "Failure".to_string() + }, + message: message.to_string(), + reason: "Failed to parse error data".to_string(), + code, + }) + } + + #[test] + fn sandbox_api_version_probe_retries_on_structured_and_raw_404() { + let structured = kube_api_error(404, "could not find the requested resource"); + assert!(should_try_next_sandbox_api_version(&structured)); + + let raw = kube_api_error(404, "404 page not found\n"); + assert!(should_try_next_sandbox_api_version(&raw)); + } + + #[test] + fn sandbox_api_version_probe_keeps_non_404_errors() { + let err = kube_api_error(403, "sandboxes.agents.x-k8s.io is forbidden"); + assert!(!should_try_next_sandbox_api_version(&err)); + } + #[test] fn driver_config_rejects_invalid_shape() { let template = SandboxTemplate { diff --git a/crates/openshell-server/src/auth/k8s_sa.rs b/crates/openshell-server/src/auth/k8s_sa.rs index 59f6651bb..eed0e5f08 100644 --- a/crates/openshell-server/src/auth/k8s_sa.rs +++ b/crates/openshell-server/src/auth/k8s_sa.rs @@ -23,6 +23,7 @@ use k8s_openapi::api::{ core::v1::Pod, }; use k8s_openapi::apimachinery::pkg::apis::meta::v1::ObjectMeta; +use kube::Error as KubeError; use kube::api::{Api, ApiResource, PostParams}; use kube::core::{DynamicObject, gvk::GroupVersionKind}; use std::sync::Arc; @@ -40,8 +41,10 @@ pub const ISSUE_SANDBOX_TOKEN_PATH: &str = "/openshell.v1.OpenShell/IssueSandbox /// include `patch pods` (see plan ยง11.8). pub const SANDBOX_ID_ANNOTATION: &str = "openshell.io/sandbox-id"; const SANDBOX_API_GROUP: &str = "agents.x-k8s.io"; -const SANDBOX_API_VERSION: &str = "v1alpha1"; -const SANDBOX_API_VERSION_FULL: &str = "agents.x-k8s.io/v1alpha1"; +const SANDBOX_API_VERSION_V1BETA1: &str = "v1beta1"; +const SANDBOX_API_VERSION_V1ALPHA1: &str = "v1alpha1"; +const SANDBOX_API_VERSION_FULL_V1BETA1: &str = "agents.x-k8s.io/v1beta1"; +const SANDBOX_API_VERSION_FULL_V1ALPHA1: &str = "agents.x-k8s.io/v1alpha1"; const SANDBOX_KIND: &str = "Sandbox"; const SANDBOX_ID_LABEL: &str = "openshell.ai/sandbox-id"; const POD_NAME_EXTRA: &str = "authentication.kubernetes.io/pod-name"; @@ -140,6 +143,7 @@ struct TokenReviewIdentity { #[derive(Debug, Clone, PartialEq, Eq)] struct SandboxOwnerReference { + api_version: String, name: String, uid: String, } @@ -149,7 +153,8 @@ struct SandboxOwnerReference { pub struct LiveK8sResolver { token_reviews_api: Api, pods_api: Api, - sandboxes_api: Api, + sandboxes_api_v1beta1: Api, + sandboxes_api_v1alpha1: Api, expected_audience: String, sandbox_namespace: String, expected_service_account: String, @@ -164,20 +169,51 @@ impl LiveK8sResolver { ) -> Self { let token_reviews_api: Api = Api::all(client.clone()); let pods_api: Api = Api::namespaced(client.clone(), namespace); - let sandbox_gvk = - GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION, SANDBOX_KIND); - let sandbox_resource = ApiResource::from_gvk(&sandbox_gvk); - let sandboxes_api: Api = - Api::namespaced_with(client, namespace, &sandbox_resource); + let sandbox_gvk_v1beta1 = + GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION_V1BETA1, SANDBOX_KIND); + let sandbox_resource_v1beta1 = ApiResource::from_gvk(&sandbox_gvk_v1beta1); + let sandbox_gvk_v1alpha1 = GroupVersionKind::gvk( + SANDBOX_API_GROUP, + SANDBOX_API_VERSION_V1ALPHA1, + SANDBOX_KIND, + ); + let sandbox_resource_v1alpha1 = ApiResource::from_gvk(&sandbox_gvk_v1alpha1); + let sandboxes_api_v1beta1: Api = + Api::namespaced_with(client.clone(), namespace, &sandbox_resource_v1beta1); + let sandboxes_api_v1alpha1: Api = + Api::namespaced_with(client, namespace, &sandbox_resource_v1alpha1); Self { token_reviews_api, pods_api, - sandboxes_api, + sandboxes_api_v1beta1, + sandboxes_api_v1alpha1, expected_audience, sandbox_namespace: namespace.to_string(), expected_service_account, } } + + async fn get_sandbox_cr_for_owner( + &self, + owner: &SandboxOwnerReference, + ) -> Result, KubeError> { + let apis = if owner.api_version == SANDBOX_API_VERSION_FULL_V1ALPHA1 { + [&self.sandboxes_api_v1alpha1, &self.sandboxes_api_v1beta1] + } else { + [&self.sandboxes_api_v1beta1, &self.sandboxes_api_v1alpha1] + }; + + for api in apis { + match api.get_opt(&owner.name).await { + Ok(Some(sandbox_cr)) => return Ok(Some(sandbox_cr)), + Ok(None) => {} + Err(err) if should_try_next_sandbox_api_version(&err) => {} + Err(err) => return Err(err), + } + } + + Ok(None) + } } #[async_trait] @@ -258,10 +294,11 @@ impl K8sIdentityResolver for LiveK8sResolver { let sandbox_id = pod_sandbox_id(&pod)?; let owner = sandbox_owner_reference(&pod)?; - let sandbox_cr = self.sandboxes_api.get_opt(&owner.name).await.map_err(|e| { + let sandbox_cr = self.get_sandbox_cr_for_owner(&owner).await.map_err(|e| { warn!( pod = %identity.pod_name, sandbox_owner = %owner.name, + sandbox_owner_api_version = %owner.api_version, error = %e, "failed to fetch owning Sandbox CR for pod identity validation" ); @@ -271,6 +308,7 @@ impl K8sIdentityResolver for LiveK8sResolver { warn!( pod = %identity.pod_name, sandbox_owner = %owner.name, + sandbox_owner_api_version = %owner.api_version, "pod ownerReference points to a Sandbox CR that does not exist" ); return Err(Status::permission_denied("sandbox owner not found")); @@ -370,10 +408,25 @@ fn pod_sandbox_id(pod: &Pod) -> Result { #[allow(clippy::result_large_err)] fn sandbox_owner_reference(pod: &Pod) -> Result { let owner_refs = pod.metadata.owner_references.as_deref().unwrap_or_default(); - let mut sandbox_refs = owner_refs.iter().filter(|owner| { - owner.api_version == SANDBOX_API_VERSION_FULL && owner.kind == SANDBOX_KIND - }); + let mut sandbox_refs = owner_refs + .iter() + .filter(|owner| is_supported_sandbox_owner_reference(owner)); let Some(owner) = sandbox_refs.next() else { + let unsupported_sandbox_api_versions = owner_refs + .iter() + .filter(|owner| owner.kind == SANDBOX_KIND) + .map(|owner| owner.api_version.as_str()) + .collect::>(); + if !unsupported_sandbox_api_versions.is_empty() { + warn!( + api_versions = ?unsupported_sandbox_api_versions, + supported_api_versions = ?[ + SANDBOX_API_VERSION_FULL_V1BETA1, + SANDBOX_API_VERSION_FULL_V1ALPHA1, + ], + "pod Sandbox ownerReference uses unsupported apiVersion" + ); + } return Err(Status::permission_denied( "pod is not controlled by an OpenShell Sandbox", )); @@ -394,11 +447,30 @@ fn sandbox_owner_reference(pod: &Pod) -> Result { )); } Ok(SandboxOwnerReference { + api_version: owner.api_version.clone(), name: owner.name.clone(), uid: owner.uid.clone(), }) } +fn is_supported_sandbox_owner_reference( + owner: &k8s_openapi::apimachinery::pkg::apis::meta::v1::OwnerReference, +) -> bool { + owner.kind == SANDBOX_KIND + && matches!( + owner.api_version.as_str(), + SANDBOX_API_VERSION_FULL_V1BETA1 | SANDBOX_API_VERSION_FULL_V1ALPHA1 + ) +} + +fn should_try_next_sandbox_api_version(err: &KubeError) -> bool { + // Kubernetes returns a structured 404 for some missing API resources and a + // raw "404 page not found" body for others. Both mean the probed + // group/version is unavailable and the next supported Sandbox API version + // should be tried. + matches!(err, KubeError::Api(api) if api.code == 404) +} + #[allow(clippy::result_large_err)] fn validate_sandbox_owner_reference( owner: &SandboxOwnerReference, @@ -486,6 +558,34 @@ mod tests { h } + fn kube_api_error(code: u16, message: &str) -> KubeError { + KubeError::Api(kube::core::ErrorResponse { + status: if code == 404 { + "404 Not Found".to_string() + } else { + "Failure".to_string() + }, + message: message.to_string(), + reason: "Failed to parse error data".to_string(), + code, + }) + } + + #[test] + fn sandbox_api_version_probe_retries_on_structured_and_raw_404() { + let structured = kube_api_error(404, "could not find the requested resource"); + assert!(should_try_next_sandbox_api_version(&structured)); + + let raw = kube_api_error(404, "404 page not found\n"); + assert!(should_try_next_sandbox_api_version(&raw)); + } + + #[test] + fn sandbox_api_version_probe_keeps_non_404_errors() { + let err = kube_api_error(403, "sandboxes.agents.x-k8s.io is forbidden"); + assert!(!should_try_next_sandbox_api_version(&err)); + } + fn token_review_status( authenticated: bool, audiences: Vec<&str>, @@ -515,8 +615,12 @@ mod tests { } fn sandbox_owner(name: &str, uid: &str) -> OwnerReference { + sandbox_owner_with_api_version(SANDBOX_API_VERSION_FULL_V1BETA1, name, uid) + } + + fn sandbox_owner_with_api_version(api_version: &str, name: &str, uid: &str) -> OwnerReference { OwnerReference { - api_version: SANDBOX_API_VERSION_FULL.to_string(), + api_version: api_version.to_string(), block_owner_deletion: None, controller: Some(true), kind: SANDBOX_KIND.to_string(), @@ -549,7 +653,7 @@ mod tests { fn sandbox_cr(name: &str, uid: &str, sandbox_id: &str) -> DynamicObject { let sandbox_gvk = - GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION, SANDBOX_KIND); + GroupVersionKind::gvk(SANDBOX_API_GROUP, SANDBOX_API_VERSION_V1BETA1, SANDBOX_KIND); let sandbox_resource = ApiResource::from_gvk(&sandbox_gvk); let mut cr = DynamicObject::new(name, &sandbox_resource); cr.metadata.uid = Some(uid.to_string()); @@ -681,6 +785,27 @@ mod tests { assert_eq!( owner, SandboxOwnerReference { + api_version: SANDBOX_API_VERSION_FULL_V1BETA1.to_string(), + name: "sandbox-a".to_string(), + uid: "cr-uid-a".to_string(), + } + ); + } + + #[test] + fn sandbox_owner_reference_accepts_v1alpha1_owner() { + let pod = pod_with_owner_refs(vec![sandbox_owner_with_api_version( + SANDBOX_API_VERSION_FULL_V1ALPHA1, + "sandbox-a", + "cr-uid-a", + )]); + + let owner = sandbox_owner_reference(&pod).expect("expected v1alpha1 Sandbox owner"); + + assert_eq!( + owner, + SandboxOwnerReference { + api_version: SANDBOX_API_VERSION_FULL_V1ALPHA1.to_string(), name: "sandbox-a".to_string(), uid: "cr-uid-a".to_string(), } @@ -696,6 +821,20 @@ mod tests { assert_eq!(err.code(), tonic::Code::PermissionDenied); } + #[test] + fn sandbox_owner_reference_rejects_unsupported_sandbox_api_version() { + let pod = pod_with_owner_refs(vec![sandbox_owner_with_api_version( + "agents.x-k8s.io/v1", + "sandbox-a", + "cr-uid-a", + )]); + + let err = + sandbox_owner_reference(&pod).expect_err("unsupported apiVersion must fail closed"); + + assert_eq!(err.code(), tonic::Code::PermissionDenied); + } + #[test] fn sandbox_owner_reference_requires_controlling_owner() { let mut owner = sandbox_owner("sandbox-a", "cr-uid-a"); @@ -722,6 +861,7 @@ mod tests { #[test] fn validate_sandbox_owner_reference_requires_matching_cr_uid_and_label() { let owner = SandboxOwnerReference { + api_version: SANDBOX_API_VERSION_FULL_V1BETA1.to_string(), name: "sandbox-a".to_string(), uid: "cr-uid-a".to_string(), }; diff --git a/docs/kubernetes/setup.mdx b/docs/kubernetes/setup.mdx index 0a25eceeb..5ab786519 100644 --- a/docs/kubernetes/setup.mdx +++ b/docs/kubernetes/setup.mdx @@ -50,6 +50,10 @@ kubectl -n agent-sandbox-system get pods The controller pod should reach `Running` status within a few seconds. For cluster-specific setup instructions, including KinD and GKE walkthroughs, refer to the [Agent Sandbox getting started guide](https://agent-sandbox.sigs.k8s.io/docs/getting_started/). +### Upgrade Agent Sandbox + +OpenShell detects the served Agent Sandbox `Sandbox` API when the Kubernetes gateway first needs it and caches that choice for the gateway process. If you upgrade Agent Sandbox in place, restart the OpenShell gateway after the Agent Sandbox controller and CRD rollout completes so the gateway can detect the served API versions again. Existing sandboxes keep running during the upgrade, and the restarted gateway can continue managing them. + ## Install OpenShell diff --git a/docs/reference/sandbox-compute-drivers.mdx b/docs/reference/sandbox-compute-drivers.mdx index c81afd66d..ba13dcbd2 100644 --- a/docs/reference/sandbox-compute-drivers.mdx +++ b/docs/reference/sandbox-compute-drivers.mdx @@ -283,7 +283,9 @@ For maintainer-level implementation details, refer to the [Kubernetes driver REA | `workspace_default_storage_size` | `server.workspaceDefaultStorageSize` | Set the default workspace PVC size for new sandboxes. | | `sa_token_ttl_secs` | `server.sandboxJwt.k8sSaTokenTtlSecs` | Set the projected ServiceAccount token TTL used for the bootstrap token exchange. | -The Kubernetes driver creates namespaced `agents.x-k8s.io/v1alpha1` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. The Agent Sandbox controller turns those resources into sandbox pods and related storage. +The Kubernetes driver creates namespaced `agents.x-k8s.io` `Sandbox` resources from the Kubernetes SIG Apps [agent-sandbox](https://github.com/kubernetes-sigs/agent-sandbox) project. It detects the served Sandbox API at runtime, caches the selected API version for the gateway process, and uses `v1beta1` when available before falling back to `v1alpha1`, so supported Agent Sandbox installations work without version-specific operator configuration. The Agent Sandbox controller turns those resources into sandbox pods and related storage. + +If Agent Sandbox is upgraded in place, restart the OpenShell gateway after the controller and CRD rollout completes so the gateway can detect the served API versions again. `Sandbox.spec.volumeClaimTemplates` is immutable after creation. To change storage configuration, delete the sandbox and create a new one with the updated spec. diff --git a/e2e/with-kube-gateway.sh b/e2e/with-kube-gateway.sh index bea1c01d3..47b8730dc 100755 --- a/e2e/with-kube-gateway.sh +++ b/e2e/with-kube-gateway.sh @@ -51,9 +51,10 @@ ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" # shellcheck source=e2e/support/gateway-common.sh source "${ROOT}/e2e/support/gateway-common.sh" -# Upstream agent-sandbox release. Bump in lockstep with the supported Sandbox -# field set in crates/openshell-driver-kubernetes (see sandbox_to_k8s_spec). -AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.4.6}" +# Upstream agent-sandbox release. The Kubernetes driver supports the v1beta1 +# Sandbox API introduced in v0.5.0 and falls back to v1alpha1 for v0.4.6 +# clusters. Override this env var to exercise the v1alpha1 controller release. +AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.5.0}" e2e_preserve_mise_dirs e2e_align_docker_host_with_cli_context @@ -88,6 +89,28 @@ kctl() { kubectl --context "${KUBE_CONTEXT}" "$@" } +wait_for_agent_sandbox_crd() { + local deadline + local established + + deadline=$(( $(date +%s) + 120 )) + while [ "$(date +%s)" -lt "${deadline}" ]; do + if kctl get crd/sandboxes.agents.x-k8s.io >/dev/null 2>&1; then + established="$(kctl get crd/sandboxes.agents.x-k8s.io \ + -o 'jsonpath={.status.conditions[?(@.type=="Established")].status}' \ + 2>/dev/null || true)" + if [ "${established}" = "True" ]; then + return 0 + fi + fi + sleep 2 + done + + echo "Timed out waiting for agent-sandbox Sandbox CRD to become Established" >&2 + kctl get crd/sandboxes.agents.x-k8s.io -o yaml >&2 || true + return 1 +} + helmctl() { helm --kube-context "${KUBE_CONTEXT}" "$@" } @@ -533,7 +556,7 @@ fi echo "Installing agent-sandbox CRDs and controller (${AGENT_SANDBOX_VERSION})..." _agent_sandbox_base="https://github.com/kubernetes-sigs/agent-sandbox/releases/download/${AGENT_SANDBOX_VERSION}" kctl apply -f "${_agent_sandbox_base}/manifest.yaml" -kctl wait --for=condition=Established crd/sandboxes.agents.x-k8s.io --timeout=120s +wait_for_agent_sandbox_crd kctl -n agent-sandbox-system rollout status deployment/agent-sandbox-controller --timeout=300s helm_extra_args=() diff --git a/tasks/scripts/helm-k3s-local.sh b/tasks/scripts/helm-k3s-local.sh index 59d8939a6..f9ac186f5 100755 --- a/tasks/scripts/helm-k3s-local.sh +++ b/tasks/scripts/helm-k3s-local.sh @@ -33,9 +33,10 @@ DEFAULT_SANDBOX_PRELOAD_IMAGE="ghcr.io/nvidia/openshell-community/sandboxes/base PRELOAD_SANDBOX_IMAGE="${HELM_K3S_PRELOAD_SANDBOX_IMAGE-${DEFAULT_SANDBOX_PRELOAD_IMAGE}}" # Upstream agent-sandbox release pinned for both CRDs/controller and extensions. -# Refresh this tag in lockstep with the supported field set in -# crates/openshell-driver-kubernetes (see sandbox_to_k8s_spec). -AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.4.6}" +# The Kubernetes driver supports the v1beta1 Sandbox API introduced in v0.5.0 +# and falls back to v1alpha1 for v0.4.6 clusters. Override this env var to +# exercise the v1alpha1 controller release. +AGENT_SANDBOX_VERSION="${AGENT_SANDBOX_VERSION:-v0.5.0}" default_kubeconfig="${ROOT}/kubeconfig" if [[ -n "${HELM_K3S_KUBECONFIG:-}" ]]; then diff --git a/tasks/test.toml b/tasks/test.toml index 444ea15e1..1d0f97856 100644 --- a/tasks/test.toml +++ b/tasks/test.toml @@ -94,6 +94,18 @@ run = "e2e/rust/e2e-podman.sh" description = "Run Rust CLI e2e tests against an OpenShell gateway deployed on Kubernetes via Helm (set OPENSHELL_E2E_KUBE_CONTEXT to reuse a cluster; otherwise creates a local k3d cluster when k3d is installed; set OPENSHELL_E2E_KUBE_TEST= to scope to one test)" run = "e2e/rust/e2e-kubernetes.sh" +["e2e:kubernetes:v1alpha1"] +description = "Run Kubernetes e2e against Agent Sandbox v1alpha1" +env = { AGENT_SANDBOX_VERSION = "v0.4.6" } +run = "e2e/rust/e2e-kubernetes.sh" + +["e2e:kubernetes:agent-sandbox-versions"] +description = "Run Kubernetes e2e against Agent Sandbox v1beta1 and v1alpha1" +run = [ + "e2e/rust/e2e-kubernetes.sh", + "AGENT_SANDBOX_VERSION=v0.4.6 e2e/rust/e2e-kubernetes.sh", +] + ["e2e:kubernetes:db"] description = "Run Kubernetes e2e with all database backend scenarios (SQLite and external PostgreSQL with existingSecret)" env = { OPENSHELL_E2E_KUBE_DB_SCENARIOS = "1" }