Skip to content

Commit b4878be

Browse files
committed
fix(server): prevent exec relays from hanging on idle connections
Add HTTP/2 keepalive on supervisor multiplex connections so half-dead sessions cannot leave in-flight exec relays parked indefinitely. Configure SSH keepalive on exec relay clients so long silent commands are not timed out on stdout idle alone; wedged or orphaned relays fail after missed keepalives instead. After a command reports exit status, bound how long the gateway waits for the trailing channel close. Return UNAVAILABLE when a relay closes before reporting exit status rather than defaulting to exit code 1. Signed-off-by: Gal Zaidman <gzaidman@nvidia.com>
1 parent 62b03f0 commit b4878be

3 files changed

Lines changed: 80 additions & 8 deletions

File tree

architecture/gateway.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -365,6 +365,15 @@ The same relay pattern backs interactive SSH, command execution, file sync, and
365365
local service forwarding. The gateway tracks live sessions in memory and
366366
persists session records so tokens can expire or be revoked.
367367

368+
Relay liveness has two backstops so a reset supervisor session cannot leave a
369+
request parked forever. The gateway runs server-side HTTP/2 keepalive on
370+
supervisor connections, and each exec relay's SSH client uses SSH keepalive: an
371+
exec channel may be legitimately silent for a long time (e.g. an agent whose
372+
stdout is redirected to a file), so the exec is never ended on output-idle
373+
alone — instead an unanswered keepalive on a wedged or orphaned relay closes the
374+
channel and returns the exec with an error. Once a command reports its exit
375+
status, the gateway also bounds how long it waits for the trailing channel close.
376+
368377
`ForwardTcp` is the client-facing byte stream for SSH and service forwarding.
369378
The first frame is a `TcpForwardInit` that carries the sandbox ID, an
370379
authorization token from `CreateSshSession`, and an explicit target:

crates/openshell-server/src/grpc/sandbox.rs

Lines changed: 60 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1484,6 +1484,36 @@ fn shell_escape(value: &str) -> Result<String, String> {
14841484
/// Maximum total length of the assembled shell command string.
14851485
const MAX_COMMAND_STRING_LEN: usize = 256 * 1024; // 256 KiB
14861486

1487+
/// SSH keepalive for silent exec relays; stdout idle is not a timeout signal.
1488+
const EXEC_KEEPALIVE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(15);
1489+
1490+
/// Allow this many missed keepalive responses before russh fails the relay.
1491+
const EXEC_KEEPALIVE_MAX: usize = 4;
1492+
1493+
/// Max wait for a trailing `Close` after `ExitStatus`.
1494+
const EXEC_POST_EXIT_CLOSE_TIMEOUT: std::time::Duration = std::time::Duration::from_millis(500);
1495+
1496+
/// russh client config for exec relays.
1497+
fn exec_ssh_client_config() -> russh::client::Config {
1498+
russh::client::Config {
1499+
keepalive_interval: Some(EXEC_KEEPALIVE_INTERVAL),
1500+
keepalive_max: EXEC_KEEPALIVE_MAX,
1501+
..Default::default()
1502+
}
1503+
}
1504+
1505+
/// Treat channel EOF before an exit status as relay failure, not exit code 1.
1506+
fn exec_loop_result(exit_code: Option<i32>) -> Result<i32, Status> {
1507+
exit_code.map_or_else(
1508+
|| {
1509+
Err(Status::unavailable(
1510+
"exec relay closed before the command reported an exit status",
1511+
))
1512+
},
1513+
Ok,
1514+
)
1515+
}
1516+
14871517
fn build_remote_exec_command(req: &ExecSandboxRequest) -> Result<String, String> {
14881518
let mut parts = Vec::new();
14891519
let mut env_entries = req.environment.iter().collect::<Vec<_>>();
@@ -1690,7 +1720,7 @@ async fn run_interactive_exec_with_russh(
16901720
.await
16911721
.map_err(|e| Status::internal(format!("failed to connect to ssh proxy: {e}")))?;
16921722

1693-
let config = Arc::new(russh::client::Config::default());
1723+
let config = Arc::new(exec_ssh_client_config());
16941724
let mut client = russh::client::connect_stream(config, stream, SandboxSshClientHandler)
16951725
.await
16961726
.map_err(|e| Status::internal(format!("failed to establish ssh transport: {e}")))?;
@@ -1746,7 +1776,19 @@ async fn run_interactive_exec_with_russh(
17461776
});
17471777

17481778
let mut exit_code: Option<i32> = None;
1749-
while let Some(msg) = read_half.wait().await {
1779+
loop {
1780+
// Bound the post-ExitStatus wait against a lost Close.
1781+
let msg = if exit_code.is_some() {
1782+
match tokio::time::timeout(EXEC_POST_EXIT_CLOSE_TIMEOUT, read_half.wait()).await {
1783+
Ok(Some(msg)) => msg,
1784+
Ok(None) | Err(_) => break,
1785+
}
1786+
} else {
1787+
match read_half.wait().await {
1788+
Some(msg) => msg,
1789+
None => break,
1790+
}
1791+
};
17501792
match msg {
17511793
ChannelMsg::Data { data } => {
17521794
let event = Ok(ExecSandboxEvent {
@@ -1787,7 +1829,7 @@ async fn run_interactive_exec_with_russh(
17871829
.disconnect(russh::Disconnect::ByApplication, "exec complete", "en")
17881830
.await;
17891831

1790-
Ok(exit_code.unwrap_or(1))
1832+
exec_loop_result(exit_code)
17911833
}
17921834

17931835
/// Create a localhost SSH proxy that bridges to a relay `DuplexStream`.
@@ -1849,7 +1891,7 @@ async fn run_exec_with_russh(
18491891
.await
18501892
.map_err(|e| Status::internal(format!("failed to connect to ssh proxy: {e}")))?;
18511893

1852-
let config = Arc::new(russh::client::Config::default());
1894+
let config = Arc::new(exec_ssh_client_config());
18531895
let mut client = russh::client::connect_stream(config, stream, SandboxSshClientHandler)
18541896
.await
18551897
.map_err(|e| Status::internal(format!("failed to establish ssh transport: {e}")))?;
@@ -1897,7 +1939,19 @@ async fn run_exec_with_russh(
18971939
.map_err(|e| Status::internal(format!("failed to close ssh stdin: {e}")))?;
18981940

18991941
let mut exit_code: Option<i32> = None;
1900-
while let Some(msg) = channel.wait().await {
1942+
loop {
1943+
// Bound the post-ExitStatus wait against a lost Close.
1944+
let msg = if exit_code.is_some() {
1945+
match tokio::time::timeout(EXEC_POST_EXIT_CLOSE_TIMEOUT, channel.wait()).await {
1946+
Ok(Some(msg)) => msg,
1947+
Ok(None) | Err(_) => break,
1948+
}
1949+
} else {
1950+
match channel.wait().await {
1951+
Some(msg) => msg,
1952+
None => break,
1953+
}
1954+
};
19011955
match msg {
19021956
ChannelMsg::Data { data } => {
19031957
let _ = tx
@@ -1935,7 +1989,7 @@ async fn run_exec_with_russh(
19351989
.disconnect(russh::Disconnect::ByApplication, "exec complete", "en")
19361990
.await;
19371991

1938-
Ok(exit_code.unwrap_or(1))
1992+
exec_loop_result(exit_code)
19391993
}
19401994

19411995
// ---------------------------------------------------------------------------

crates/openshell-server/src/multiplex.rs

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use http_body::Body;
1212
use http_body_util::BodyExt;
1313
use hyper::body::Incoming;
1414
use hyper_util::{
15-
rt::{TokioExecutor, TokioIo},
15+
rt::{TokioExecutor, TokioIo, TokioTimer},
1616
server::conn::auto::Builder,
1717
service::TowerToHyperService,
1818
};
@@ -185,7 +185,16 @@ impl MultiplexService {
185185
let service = MultiplexedService::new(grpc_service, http_service);
186186

187187
let mut builder = Builder::new(TokioExecutor::new());
188-
builder.http2().adaptive_window(true);
188+
// Server-side HTTP/2 keepalive: supervisors hold long-lived sessions, and without
189+
// it the gateway never PINGs them, so idle/half-dead connections linger and orphan
190+
// in-flight relay execs. The timer is required — hyper panics on the keepalive
191+
// interval without one.
192+
builder
193+
.http2()
194+
.timer(TokioTimer::new())
195+
.adaptive_window(true)
196+
.keep_alive_interval(Some(Duration::from_secs(20)))
197+
.keep_alive_timeout(Duration::from_secs(10));
189198

190199
builder
191200
.serve_connection_with_upgrades(TokioIo::new(stream), service)

0 commit comments

Comments
 (0)