@@ -222,10 +222,10 @@ class MetricConditionGater:
222222 def __init__ (
223223 self ,
224224 metric_name : str ,
225+ namespace : str ,
226+ cluster : Optional [str ],
225227 pod : str ,
226228 metrics_port : int ,
227- namespace : str ,
228- cluster : Optional [str ] = None ,
229229 refresh_interval_seconds : int = 3 ,
230230 ):
231231 self .metric_name = metric_name
@@ -396,6 +396,20 @@ def get_context_list_from_args(
396396 ]
397397
398398
def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """List the names of the pods whose name starts with ``service.pod_name``.

    Runs ``kubectl get pods -o name`` together with the arguments produced by
    ``get_namespace_args(namespace, cluster)`` and filters the output lines
    (one ``pod/<name>`` entry per line) by prefix.

    Args:
        namespace: Namespace forwarded to ``get_namespace_args``.
        service: Service whose ``pod_name`` is used as the name prefix.
        index: Not used by this function.
        cluster: Optional cluster forwarded to ``get_namespace_args``.

    Returns:
        The matching pod names with the leading ``pod/`` removed; an empty
        list when no line matches.
    """
    command = ["get", "pods", "-o", "name", *get_namespace_args(namespace, cluster)]
    listing = run_kubectl_command(command, capture_output=True).stdout

    names = []
    for entry in listing.splitlines():
        # Entries look like "pod/<name>"; keep only the ones for this service.
        if entry.startswith(f"pod/{service.pod_name}"):
            names.append(entry.split("/")[1])
    return names
411+
412+
399413class ServiceRestarter (ABC ):
400414 """Abstract class for restarting service instances."""
401415
@@ -412,18 +426,7 @@ def _restart_pod(
412426 namespace : str , service : Service , index : int , cluster : Optional [str ] = None
413427 ) -> None :
414428 """Restart pod by deleting it"""
415- # Get the list of pods (one string per line).
416- kubectl_args = [
417- "get" ,
418- "pods" ,
419- "-o" ,
420- "name" ,
421- ]
422- kubectl_args .extend (get_namespace_args (namespace , cluster ))
423- pods = run_kubectl_command (kubectl_args , capture_output = True ).stdout .splitlines ()
424-
425- # Filter the list of pods to only include the ones that match the service and extract the pod name.
426- pods = [pod .split ("/" )[1 ] for pod in pods if pod .startswith (f"pod/{ service .pod_name } " )]
429+ pods = _get_pod_names (namespace , service , index , cluster )
427430
428431 if not pods :
429432 print_error (f"Could not find pods for service { service .pod_name } ." )
@@ -512,6 +515,66 @@ def restart_service(self, instance_index: int) -> bool:
512515 return self .check_between_restarts (instance_index )
513516
514517
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Restarter whose between-restart check waits for pod readiness and a metric condition.

    The check installed by this class does the following for the instance that
    was just handled:

    1. waits until every pod of ``service`` reports the ``ready`` condition,
    2. blocks on ``MetricConditionGater.gate`` until ``metric_value_condition``
       is satisfied for ``metric_name``,
    3. unless this was the last instance, asks the operator whether to
       continue with the next pod.

    NOTE: the class name contains a typo ("Metrtic"); it is kept as-is so
    existing references keep working.
    """

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: Callable[[Any], bool],
    ):
        """Build the between-restart check and hand it to the base class.

        Args:
            namespace_and_instruction_args: Supplies the namespace and cluster
                for each instance index, and the number of instances.
            service: Service whose pods are being restarted.
            metric_name: Name of the metric handed to ``MetricConditionGater``.
            metrics_port: Port handed to ``MetricConditionGater``.
            metric_value_condition: Predicate passed to
                ``MetricConditionGater.gate``.
        """

        def _check_between_restarts(instance_index: int) -> bool:
            namespace = namespace_and_instruction_args.get_namespace(instance_index)
            cluster = namespace_and_instruction_args.get_cluster(instance_index)

            # Readiness must be checked in the same namespace/cluster as the
            # instance that was just restarted.
            if not WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                service,
                namespace=namespace,
                cluster=cluster,
                index=instance_index,
            ):
                return False

            metric_condition_gater = MetricConditionGater(
                metric_name,
                namespace,
                cluster,
                service.pod_name,
                metrics_port,
            )
            metric_condition_gater.gate(metric_value_condition)

            # Nothing left to restart after the last instance, so do not prompt.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
        *,
        namespace: str,
        cluster: Optional[str] = None,
        index: int = 0,
    ) -> bool:
        """Wait until all pods of ``service`` report the ``ready`` condition.

        Each attempt lists the service's pods and runs ``kubectl wait`` on
        every one of them. An attempt succeeds only when every pod becomes
        ready; otherwise the whole attempt is repeated after
        ``refresh_delay_sec`` seconds.

        Args:
            service: Service whose pods are waited on.
            wait_timeout: Per-pod ``kubectl wait`` timeout, in seconds.
            num_retry: Maximum number of attempts.
            refresh_delay_sec: Pause between attempts, in seconds.
            namespace: Namespace used to list and wait on the pods.
            cluster: Optional cluster used to list and wait on the pods.
            index: Instance index forwarded to ``_get_pod_names``.

        Returns:
            True when all pods became ready within ``num_retry`` attempts,
            False otherwise.
        """
        for attempt in range(1, num_retry + 1):
            pods = _get_pod_names(namespace, service, index, cluster)
            if not pods:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {attempt}/{num_retry})",
                    Colors.YELLOW,
                )
            else:
                for pod in pods:
                    print_colored(
                        f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                    )
                    wait_args = [
                        "wait",
                        "--for=condition=ready",
                        f"pod/{pod}",
                        f"--timeout={wait_timeout}s",
                    ]
                    wait_args.extend(get_namespace_args(namespace, cluster))
                    # Capture output so stderr is available for the warning below.
                    # NOTE(review): assumes run_kubectl_command does not raise on a
                    # non-zero exit status - confirm against its implementation.
                    result = run_kubectl_command(wait_args, capture_output=True)

                    if result.returncode != 0:
                        print_colored(
                            f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {attempt}/{num_retry})",
                            Colors.YELLOW,
                        )
                        break
                else:
                    # The loop finished without a break: every pod is ready.
                    return True

            # Only pause when another attempt will follow.
            if attempt < num_retry:
                sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return False
576+
577+
515578class NoOpServiceRestarter (ServiceRestarter ):
516579 """No-op service restarter."""
517580
0 commit comments