From 84619b1b98340b8270893e0f8df0fe8c5ab03951 Mon Sep 17 00:00:00 2001 From: morotti Date: Fri, 17 Oct 2025 17:23:14 +0100 Subject: [PATCH] fix: fix too short timeout causing cascading failures the 2 second timeout on liveness probes is way too short. it is causing cascading failures when the container is busy and cannot reply immediately. this is especially bad if you have CPU limits configured on the ray pods. in addition to that, TCP takes 2-3 seconds to detect a lost packet and retry. you should NEVER have a timeout below 5 seconds in any production software. Signed-off-by: morotti --- ray-operator/config/samples/ray-cluster.auth.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ray-operator/config/samples/ray-cluster.auth.yaml b/ray-operator/config/samples/ray-cluster.auth.yaml index bd6f97fb2f1..019ce8e1d46 100644 --- a/ray-operator/config/samples/ray-cluster.auth.yaml +++ b/ray-operator/config/samples/ray-cluster.auth.yaml @@ -79,7 +79,7 @@ spec: command: - bash - -c - - wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success + - wget -T 10 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success failureThreshold: 10 initialDelaySeconds: 10 periodSeconds: 5 @@ -90,7 +90,7 @@ spec: command: - bash - -c - - wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success + - wget -T 10 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success failureThreshold: 120 initialDelaySeconds: 30 periodSeconds: 5