diff --git a/src/manager.rs b/src/manager.rs index acb4b92..fcfa2ee 100644 --- a/src/manager.rs +++ b/src/manager.rs @@ -253,10 +253,10 @@ impl Manager { let result = tokio::time::timeout(timeout, client.quorum(request)).await; match result { - Ok(response) => { - return response; + Ok(Ok(response)) => { + return Ok(response); } - Err(e) => { + Ok(Err(e)) => { info_with_replica!( self.replica_id, "lighthouse quorum failed. error: {}", @@ -271,6 +271,31 @@ impl Manager { ))); } + // In general, quorum failure will return immediately, + // but not waiting for the timeout. + tokio::time::sleep(timeout).await; + + // Reset the client since the lighthouse server might have failed + // If this also fails, consider increasing `connect_timeout`. + let _ = self.create_lighthouse_client().await; + + retry_count += 1; + } + Err(e) => { + info_with_replica!( + self.replica_id, + "lighthouse quorum timeout. error: {}", + e.to_string() + ); + + if retry_count == self.quorum_retries { + return Err(Status::internal(format!( + "lighthouse quorum failed after {} retries. error: {}", + retry_count, + e.to_string(), + ))); + } + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; // Reset the client since the lighthouse server might have failed