@@ -222,10 +222,10 @@ class MetricConditionGater:
222222 def __init__ (
223223 self ,
224224 metric_name : str ,
225+ namespace : str ,
226+ cluster : Optional [str ],
225227 pod : str ,
226228 metrics_port : int ,
227- namespace : str ,
228- cluster : Optional [str ] = None ,
229229 refresh_interval_seconds : int = 3 ,
230230 ):
231231 self .metric_name = metric_name
@@ -263,7 +263,7 @@ def _get_metrics(self) -> str:
263263 )
264264 sleep (self .refresh_interval_seconds )
265265
266- def _poll_condition_met (self , metric_value_condition : Callable [[Any ], bool ]):
266+ def _poll_until_condition_met (self , metric_value_condition : Callable [[Any ], bool ]):
267267 """Poll metrics until the condition is met for the metric."""
268268 while True :
269269 metrics = self ._get_metrics ()
@@ -401,6 +401,20 @@ def get_context_list_from_args(
401401 ]
402402
403403
def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """Return the names of the pods whose name starts with ``service.pod_name``.

    Runs ``kubectl get pods -o name`` with the arguments produced by
    ``get_namespace_args(namespace, cluster)`` and keeps only the entries that
    start with ``pod/<service.pod_name>``. The ``pod/`` prefix is dropped from
    the returned names.

    Note: ``index`` is accepted but not used by this function.
    """
    command = ["get", "pods", "-o", "name", *get_namespace_args(namespace, cluster)]
    listing = run_kubectl_command(command, capture_output=True)

    # `kubectl get -o name` prints one "<kind>/<name>" entry per line.
    matching_names = []
    for entry in listing.stdout.splitlines():
        if entry.startswith(f"pod/{service.pod_name}"):
            matching_names.append(entry.split("/")[1])
    return matching_names
416+
417+
404418class ServiceRestarter (ABC ):
405419 """Abstract class for restarting service instances."""
406420
@@ -417,18 +431,7 @@ def _restart_pod(
417431 namespace : str , service : Service , index : int , cluster : Optional [str ] = None
418432 ) -> None :
419433 """Restart pod by deleting it"""
420- # Get the list of pods (one string per line).
421- kubectl_args = [
422- "get" ,
423- "pods" ,
424- "-o" ,
425- "name" ,
426- ]
427- kubectl_args .extend (get_namespace_args (namespace , cluster ))
428- pods = run_kubectl_command (kubectl_args , capture_output = True ).stdout .splitlines ()
429-
430- # Filter the list of pods to only include the ones that match the service and extract the pod name.
431- pods = [pod .split ("/" )[1 ] for pod in pods if pod .startswith (f"pod/{ service .pod_name } " )]
434+ pods = _get_pod_names (namespace , service , index , cluster )
432435
433436 if not pods :
434437 print_error (f"Could not find pods for service { service .pod_name } ." )
@@ -514,6 +517,66 @@ def restart_service(self, instance_index: int) -> bool:
514517 return self .check_between_restarts (instance_index )
515518
516519
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Restarter that gates each restart on pod readiness and on a metric condition.

    Between restarts it waits for the service's pods to become ready, then
    blocks on a ``MetricConditionGater`` until ``metric_value_condition`` holds,
    and finally (except after the last instance) asks the operator whether to
    continue.

    NOTE(review): the class name misspells "Metric"; it is kept as-is because
    callers may reference it by this name.
    """

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: Callable[[Any], bool],
    ):
        """Build the between-restarts check and hand it to the base class.

        Args:
            namespace_and_instruction_args: Supplies the namespace and cluster for
                each instance index, and the number of instances.
            service: Service whose pods are being restarted.
            metric_name: Name of the metric passed to ``MetricConditionGater``.
            metrics_port: Port passed to ``MetricConditionGater``.
            metric_value_condition: Predicate passed to the gater's ``gate`` call.
        """

        def _check_between_restarts(instance_index: int) -> bool:
            namespace = namespace_and_instruction_args.get_namespace(instance_index)
            cluster = namespace_and_instruction_args.get_cluster(instance_index)

            # The pods must be looked up in the namespace/cluster of the instance
            # that was just restarted, so pass them through explicitly.
            if not WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                service,
                namespace=namespace,
                cluster=cluster,
                index=instance_index,
            ):
                return False

            # NOTE(review): `service.pod_name` is passed as the gater's `pod`
            # argument, as in the original code — confirm that is the intended value.
            metric_condition_gater = MetricConditionGater(
                metric_name,
                namespace,
                cluster,
                service.pod_name,
                metrics_port,
            )
            metric_condition_gater.gate(metric_value_condition)

            # Nothing follows the last instance, so there is nothing to confirm.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
        *,
        namespace: Optional[str] = None,
        cluster: Optional[str] = None,
        index: int = 0,
    ) -> bool:
        """Wait until every pod of ``service`` reports the ``ready`` condition.

        Each attempt lists the service's pods and runs ``kubectl wait`` on each of
        them. An attempt fails if no pods are found or if any ``kubectl wait``
        exits with a non-zero return code; failed attempts are followed by a
        ``refresh_delay_sec`` pause.

        Args:
            service: Service whose pods are awaited.
            wait_timeout: Per-pod ``kubectl wait`` timeout, in seconds.
            num_retry: Maximum number of attempts.
            refresh_delay_sec: Pause after a failed attempt, in seconds.
            namespace: Namespace to look the pods up in (required).
            cluster: Optional cluster passed to ``get_namespace_args``.
            index: Instance index forwarded to ``_get_pod_names``.

        Returns:
            True once all pods passed ``kubectl wait`` within a single attempt,
            False if every attempt failed.

        Raises:
            ValueError: If ``namespace`` is not provided.
        """
        if namespace is None:
            raise ValueError("namespace is required to wait for pods to be ready")

        for i in range(num_retry):
            pods = _get_pod_names(namespace, service, index, cluster)
            if not pods:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {i + 1}/{num_retry})",
                    Colors.YELLOW,
                )
            else:
                for pod in pods:
                    print_colored(
                        f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                    )
                    wait_args = [
                        "wait",
                        "--for=condition=ready",
                        f"pod/{pod}",
                        "--timeout",
                        f"{wait_timeout}s",
                    ]
                    wait_args.extend(get_namespace_args(namespace, cluster))
                    # Capture the output so that stderr is available for the
                    # warning below (assumes run_kubectl_command returns a
                    # subprocess-style result — TODO confirm).
                    result = run_kubectl_command(wait_args, capture_output=True)

                    if result.returncode != 0:
                        print_colored(
                            f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {i + 1}/{num_retry})",
                            Colors.YELLOW,
                        )
                        break
                else:
                    # Reached only when no `break` happened: every pod is ready.
                    return True
            sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return False
578+
579+
517580class NoOpServiceRestarter (ServiceRestarter ):
518581 """No-op service restarter."""
519582
0 commit comments