Skip to content

Commit 78b685e

Browse files
authored
feat: add configurable timeout for image transfer to gateway containerd (#914)
The transfer of very large sandbox images to containerd can time out, depending on the size of the image and the speed of the local host. Made-with: Cursor
1 parent cbcc4b7 commit 78b685e

3 files changed

Lines changed: 26 additions & 2 deletions

File tree

crates/openshell-bootstrap/src/build.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,10 @@ pub async fn build_and_push_image(
4646
on_log(format!(
4747
"Pushing image {tag} into gateway \"{gateway_name}\""
4848
));
49-
let local_docker = Docker::connect_with_local_defaults()
49+
// Use the long-timeout Docker client so `docker save` of multi-GB images
50+
// doesn't trip the 120s bollard default mid-stream. Override with
51+
// OPENSHELL_DOCKER_TIMEOUT_SECS=<secs>.
52+
let local_docker = crate::docker::connect_local_for_large_transfers()
5053
.into_diagnostic()
5154
.wrap_err("failed to connect to local Docker daemon")?;
5255
let container = container_name(gateway_name);

crates/openshell-bootstrap/src/docker.rs

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,24 @@ use std::collections::HashMap;
2323

2424
const REGISTRY_NAMESPACE_DEFAULT: &str = "openshell";
2525

26+
/// Default total HTTP timeout for Docker API calls that stream large payloads
27+
/// (e.g. `docker save` used by `sandbox create --from`). Bollard's own
28+
/// `connect_with_local_defaults()` default timeout is 120s, which is far too short for
29+
/// multi-GB image exports — a 7 GB image on a laptop SSD takes ~4–5 minutes.
30+
/// One hour is a safe upper bound; override with `OPENSHELL_DOCKER_TIMEOUT_SECS`.
31+
pub(crate) const DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS: u64 = 3600;
32+
33+
/// Build a local-Docker client suitable for large streaming transfers.
34+
/// Respects `OPENSHELL_DOCKER_TIMEOUT_SECS` (in seconds); falls back to
35+
/// [`DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS`] when unset or unparseable.
36+
pub fn connect_local_for_large_transfers() -> std::result::Result<Docker, BollardError> {
37+
let secs: u64 = std::env::var("OPENSHELL_DOCKER_TIMEOUT_SECS")
38+
.ok()
39+
.and_then(|s| s.parse().ok())
40+
.unwrap_or(DEFAULT_LARGE_TRANSFER_TIMEOUT_SECS);
41+
Ok(Docker::connect_with_local_defaults()?.with_timeout(std::time::Duration::from_secs(secs)))
42+
}
43+
2644
/// Resolve the raw GPU device-ID list, replacing the `"auto"` sentinel with a
2745
/// concrete device ID based on whether CDI is enabled on the daemon.
2846
///

crates/openshell-bootstrap/src/lib.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -521,7 +521,10 @@ where
521521
.collect();
522522
if !images.is_empty() {
523523
log("[status] Deploying components".to_string());
524-
let local_docker = Docker::connect_with_local_defaults().into_diagnostic()?;
524+
// Long-timeout client: `docker save` of multi-GB component
525+
// images streams past bollard's 120s default. See
526+
// docker::connect_local_for_large_transfers().
527+
let local_docker = docker::connect_local_for_large_transfers().into_diagnostic()?;
525528
let container = container_name(&name);
526529
let on_log_ref = Arc::clone(&on_log);
527530
let mut push_log = move |msg: String| {

0 commit comments

Comments
 (0)