@@ -255,7 +255,7 @@ def _get_metrics(self) -> str:
255255 )
256256 sleep (self .refresh_interval_seconds )
257257
258- def _poll_condition_met (self , metric_value_condition : Callable [[Any ], bool ]):
258+ def _poll_until_condition_met (self , metric_value_condition : Callable [[Any ], bool ]):
259259 """Poll metrics until the condition is met for the metric."""
260260 while True :
261261 metrics = self ._get_metrics ()
@@ -325,7 +325,7 @@ def signal_handler(signum, frame):
325325 signal .signal (signal .SIGINT , signal_handler )
326326 signal .signal (signal .SIGTERM , signal_handler )
327327
328- self ._poll_condition_met (metric_value_condition )
328+ self ._poll_until_condition_met (metric_value_condition )
329329
330330 finally :
331331 # Ensure subprocess is always terminated
@@ -402,6 +402,20 @@ def get_context_list_from_args(
402402 ]
403403
404404
def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """List the names of the pods that belong to the given service.

    Runs `kubectl get pods -o name` with the namespace arguments built by
    `get_namespace_args(namespace, cluster)`, keeps the entries that start with
    `pod/<service.pod_name>`, and returns them without the `pod/` resource prefix.

    Note: `index` is accepted but is not used by this function.
    """
    command = ["get", "pods", "-o", "name", *get_namespace_args(namespace, cluster)]
    listing = run_kubectl_command(command, capture_output=True).stdout

    matching_names = []
    # kubectl prints one "pod/<name>" entry per line.
    for entry in listing.splitlines():
        if entry.startswith(f"pod/{service.pod_name}"):
            matching_names.append(entry.split("/")[1])
    return matching_names
417+
418+
405419class ServiceRestarter (ABC ):
406420 """Abstract class for restarting service instances."""
407421
@@ -418,18 +432,7 @@ def _restart_pod(
418432 namespace : str , service : Service , index : int , cluster : Optional [str ] = None
419433 ) -> None :
420434 """Restart pod by deleting it"""
421- # Get the list of pods (one string per line).
422- kubectl_args = [
423- "get" ,
424- "pods" ,
425- "-o" ,
426- "name" ,
427- ]
428- kubectl_args .extend (get_namespace_args (namespace , cluster ))
429- pods = run_kubectl_command (kubectl_args , capture_output = True ).stdout .splitlines ()
430-
431- # Filter the list of pods to only include the ones that match the service and extract the pod name.
432- pods = [pod .split ("/" )[1 ] for pod in pods if pod .startswith (f"pod/{ service .pod_name } " )]
435+ pods = _get_pod_names (namespace , service , index , cluster )
433436
434437 if not pods :
435438 print_error (f"Could not find pods for service { service .pod_name } ." )
@@ -516,6 +519,62 @@ def restart_service(self, instance_index: int) -> bool:
516519 return self .check_between_restarts (instance_index )
517520
518521
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Restart instances one by one, gating each step on pod readiness and a metric.

    The check run between restarts waits for the service's pods to become ready,
    then blocks on a `MetricConditionGater` until `metric_value_condition` holds,
    and finally asks the user whether to continue (skipped after the last instance).
    """

    # NOTE(review): "Metrtic" in the class name looks like a typo for "Metric"; the
    # name is kept unchanged so existing references keep resolving.

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: Callable[[Any], bool],
    ):
        """Build the between-restarts check and hand it to the base class.

        Args:
            namespace_and_instruction_args: Per-instance namespace/instruction data.
            service: The service whose pods are being restarted.
            metric_name: Name of the metric to gate on.
            metrics_port: Port passed to `MetricConditionGater`.
            metric_value_condition: Predicate the metric value must satisfy.
        """

        def _check_between_restarts(instance_index: int) -> bool:
            # NOTE(review): assumes NamespaceAndInstructionArgs exposes
            # get_namespace(index) and get_cluster(index) -- confirm against its
            # definition. The pod lookup cannot work without a namespace.
            pods_ready = WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                service,
                namespace=namespace_and_instruction_args.get_namespace(instance_index),
                index=instance_index,
                cluster=namespace_and_instruction_args.get_cluster(instance_index),
            )
            if not pods_ready:
                return False

            metric_condition_gater = MetricConditionGater(
                metric_name, service.pod_name, metrics_port
            )
            metric_condition_gater.gate(metric_value_condition)

            # Nothing left to restart after the last instance, so do not prompt.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
        *,
        namespace: str,
        index: int = 0,
        cluster: Optional[str] = None,
    ) -> bool:
        """Wait until every pod of the service reports the `ready` condition.

        Args:
            service: The service whose pods are awaited.
            wait_timeout: Per-pod timeout, in seconds, given to `kubectl wait`.
            num_retry: Number of attempts before giving up.
            refresh_delay_sec: Seconds to sleep between attempts.
            namespace: Namespace the pods live in.
            index: Instance index forwarded to `_get_pod_names`.
            cluster: Optional cluster forwarded to `get_namespace_args`.

        Returns:
            True once all pods found in a single attempt are ready; False if no
            attempt succeeded.
        """
        for attempt in range(1, num_retry + 1):
            pods = _get_pod_names(namespace, service, index, cluster)
            if pods:
                for pod in pods:
                    print_colored(
                        f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                    )
                    kubectl_args = [
                        "wait",
                        "--for=condition=ready",
                        f"pod/{pod}",
                        "--timeout",
                        f"{wait_timeout}s",
                    ]
                    kubectl_args.extend(get_namespace_args(namespace, cluster))
                    # Capture output so the failure message can include stderr.
                    result = run_kubectl_command(kubectl_args, capture_output=True)

                    if result.returncode != 0:
                        print_colored(
                            f"Timed out waiting for pod {pod} to be ready: "
                            f"{(result.stderr or '').strip()}, "
                            f"retrying... (attempt {attempt}/{num_retry})",
                            Colors.YELLOW,
                        )
                        break
                else:
                    # Reached only when no pod failed its wait.
                    return True
            else:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, "
                    f"retrying... (attempt {attempt}/{num_retry})",
                    Colors.YELLOW,
                )

            # No point sleeping after the final attempt.
            if attempt < num_retry:
                sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return False
576+
577+
519578class NoOpServiceRestarter (ServiceRestarter ):
520579 """No-op service restarter."""
521580
0 commit comments