@@ -222,10 +222,10 @@ class MetricConditionGater:
222222 def __init__ (
223223 self ,
224224 metric_name : str ,
225+ namespace : str ,
226+ cluster : Optional [str ],
225227 pod : str ,
226228 metrics_port : int ,
227- namespace : str ,
228- cluster : Optional [str ] = None ,
229229 refresh_interval_seconds : int = 3 ,
230230 ):
231231 self .metric_name = metric_name
@@ -396,6 +396,20 @@ def get_context_list_from_args(
396396 ]
397397
398398
def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """Return the names of the pods that belong to ``service``.

    Runs ``kubectl get pods -o name`` with the arguments produced by
    ``get_namespace_args(namespace, cluster)`` and keeps every output line
    that starts with ``pod/<service.pod_name>``. For each kept line the text
    between the first and second ``/`` is returned, i.e. the name without the
    ``pod/`` resource prefix.

    Args:
        namespace: Forwarded to ``get_namespace_args``.
        service: Only ``service.pod_name`` is read; it is used as a name prefix.
        index: Accepted for signature parity with the callers; not used here.
        cluster: Forwarded to ``get_namespace_args``.

    Returns:
        The matching pod names, in the order kubectl printed them. Empty when
        nothing matches.
    """
    list_pods_cmd = ["get", "pods", "-o", "name", *get_namespace_args(namespace, cluster)]
    completed = run_kubectl_command(list_pods_cmd, capture_output=True)

    matching_names = []
    for resource_line in completed.stdout.splitlines():
        # Prefix match: a service maps to pods named "<pod_name>...".
        if resource_line.startswith(f"pod/{service.pod_name}"):
            matching_names.append(resource_line.split("/")[1])
    return matching_names
411+
412+
399413class ServiceRestarter (ABC ):
400414 """Abstract class for restarting service instances."""
401415
@@ -412,18 +426,7 @@ def _restart_pod(
412426 namespace : str , service : Service , index : int , cluster : Optional [str ] = None
413427 ) -> None :
414428 """Restart pod by deleting it"""
415- # Get the list of pods (one string per line).
416- kubectl_args = [
417- "get" ,
418- "pods" ,
419- "-o" ,
420- "name" ,
421- ]
422- kubectl_args .extend (get_namespace_args (namespace , cluster ))
423- pods = run_kubectl_command (kubectl_args , capture_output = True ).stdout .splitlines ()
424-
425- # Filter the list of pods to only include the ones that match the service and extract the pod name.
426- pods = [pod .split ("/" )[1 ] for pod in pods if pod .startswith (f"pod/{ service .pod_name } " )]
429+ pods = _get_pod_names (namespace , service , index , cluster )
427430
428431 if not pods :
429432 print_error (f"Could not find pods for service { service .pod_name } ." )
@@ -509,6 +512,66 @@ def restart_service(self, instance_index: int) -> bool:
509512 return self .check_between_restarts (instance_index )
510513
511514
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Restart instances one by one, gating each step on readiness and a metric.

    The check installed between restarts does three things for the instance
    that was just restarted:

    1. waits for the service's pods to report the ``ready`` condition,
    2. blocks on ``MetricConditionGater.gate(metric_value_condition)``,
    3. unless it was the last instance, asks the operator whether to continue.
    """

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: Callable[[Any], bool],
    ):
        """Build the between-restarts check and hand it to the base class.

        Args:
            namespace_and_instruction_args: Source of the per-instance
                namespace and cluster, and of the number of instances.
            service: Service whose pods are restarted; ``service.pod_name``
                is used to locate them.
            metric_name: Metric passed to ``MetricConditionGater``.
            metrics_port: Port passed to ``MetricConditionGater``.
            metric_value_condition: Predicate passed to the gater's ``gate``.
        """

        def _check_between_restarts(instance_index: int) -> bool:
            namespace = namespace_and_instruction_args.get_namespace(instance_index)
            cluster = namespace_and_instruction_args.get_cluster(instance_index)

            # Pods must be looked up in the namespace/cluster of the instance
            # that was just restarted, otherwise kubectl queries the default.
            if not WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                service,
                namespace=namespace,
                cluster=cluster,
                instance_index=instance_index,
            ):
                return False

            metric_condition_gater = MetricConditionGater(
                metric_name,
                namespace,
                cluster,
                service.pod_name,
                metrics_port,
            )
            metric_condition_gater.gate(metric_value_condition)

            # Nothing left to restart after the last instance: skip the prompt.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
        namespace: Optional[str] = None,
        cluster: Optional[str] = None,
        instance_index: int = 0,
    ) -> bool:
        """Wait until every pod of ``service`` reports the ``ready`` condition.

        Each attempt lists the service's pods and runs ``kubectl wait`` on each
        of them. An attempt succeeds only if at least one pod was found and
        every wait returned exit code 0.

        Args:
            service: Service whose pods are awaited.
            wait_timeout: Per-pod ``kubectl wait`` timeout, in seconds.
            num_retry: Number of attempts before giving up.
            refresh_delay_sec: Seconds slept after each failed attempt.
            namespace: Namespace the pods live in. Required; the default of
                ``None`` exists only to keep the positional parameters stable.
            cluster: Optional cluster, forwarded to ``get_namespace_args``.
            instance_index: Forwarded to ``_get_pod_names``.

        Returns:
            True once all pods are ready, False if no attempt succeeded.

        Raises:
            ValueError: If ``namespace`` is not provided.
        """
        if namespace is None:
            raise ValueError("namespace is required to wait for pods to be ready")

        for attempt in range(1, num_retry + 1):
            pods = _get_pod_names(namespace, service, instance_index, cluster)
            if not pods:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {attempt}/{num_retry})",
                    Colors.YELLOW,
                )
            else:
                for pod in pods:
                    print_colored(
                        f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                    )
                    wait_args = [
                        "wait",
                        "--for=condition=ready",
                        f"pod/{pod}",
                        f"--timeout={wait_timeout}s",
                    ]
                    wait_args.extend(get_namespace_args(namespace, cluster))
                    # Capture output so stderr is available for the warning
                    # below. NOTE(review): assumes run_kubectl_command returns
                    # the non-zero result instead of raising - confirm.
                    result = run_kubectl_command(wait_args, capture_output=True)

                    if result.returncode != 0:
                        print_colored(
                            f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {attempt}/{num_retry})",
                            Colors.YELLOW,
                        )
                        break
                else:
                    # Reached only when no wait failed (the loop never broke).
                    return True
            sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return False
573+
574+
512575class NoOpServiceRestarter (ServiceRestarter ):
513576 """No-op service restarter."""
514577
0 commit comments