From 84619b1b98340b8270893e0f8df0fe8c5ab03951 Mon Sep 17 00:00:00 2001
From: morotti <r.morotti@gmail.com>
Date: Fri, 17 Oct 2025 17:23:14 +0100
Subject: [PATCH] fix: fix too short timeout causing cascading failures

The 2-second timeout on the liveness probes is way too short. It causes cascading failures when the container is busy and cannot reply immediately. This is especially bad if you have CPU limits configured on the Ray pods.

In addition, TCP takes 2-3 seconds to detect a lost packet and retry. You should NEVER have a timeout below 5 seconds in any production software.

Signed-off-by: morotti <r.morotti@gmail.com>
---
 ray-operator/config/samples/ray-cluster.auth.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ray-operator/config/samples/ray-cluster.auth.yaml b/ray-operator/config/samples/ray-cluster.auth.yaml
index bd6f97fb2f1..019ce8e1d46 100644
--- a/ray-operator/config/samples/ray-cluster.auth.yaml
+++ b/ray-operator/config/samples/ray-cluster.auth.yaml
@@ -79,7 +79,7 @@ spec:
               command:
               - bash
               - -c
-              - wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success
+              - wget -T 10 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success
             failureThreshold: 10
             initialDelaySeconds: 10
             periodSeconds: 5
@@ -90,7 +90,7 @@ spec:
               command:
               - bash
               - -c
-              - wget -T 2 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success
+              - wget -T 10 -q -O- http://localhost:52365/api/local_raylet_healthz | grep success && wget -T 10 -q -O- http://localhost:8443/api/gcs_healthz | grep success
             failureThreshold: 120
             initialDelaySeconds: 30
             periodSeconds: 5