@@ -280,6 +280,7 @@ def _poll_until_condition_met(
280280 self , metric_value_condition : "MetricConditionGater.MetricCondition"
281281 ):
282282 """Poll metrics until the condition is met for the metric."""
283+ condition_desc = f"({ metric_value_condition .condition_description if metric_value_condition .condition_description is not None else "" } ) "
283284 while True :
284285 metrics = self ._get_metrics_raw_string ()
285286 assert metrics is not None , f"Failed to get metrics from for pod { self .pod } "
@@ -300,12 +301,6 @@ def _poll_until_condition_met(
300301 f"Metric '{ self .metric_name } ' not found in pod { self .pod } . Assuming the node is not ready."
301302 )
302303 elif metric_value_condition .value_condition (val ):
303- condition_desc = (
304- f"({ metric_value_condition .condition_description } ) "
305- if metric_value_condition .condition_description is not None
306- else ""
307- )
308-
309304 print_colored (
310305 f"Metric { self .metric_name } condition { condition_desc } met (value={ val } )."
311306 )
@@ -435,6 +430,20 @@ def get_context_list_from_args(
435430 ]
436431
437432
433+ def _get_pod_names (
434+ namespace : str , service : Service , index : int , cluster : Optional [str ] = None
435+ ) -> list [str ]:
436+ kubectl_args = [
437+ "get" ,
438+ "pods" ,
439+ "-o" ,
440+ "name" ,
441+ ]
442+ kubectl_args .extend (get_namespace_args (namespace , cluster ))
443+ pods = run_kubectl_command (kubectl_args , capture_output = True ).stdout .splitlines ()
444+ return [pod .split ("/" )[1 ] for pod in pods if pod .startswith (f"pod/{ service .pod_name } " )]
445+
446+
438447class ServiceRestarter (ABC ):
439448 """Abstract class for restarting service instances."""
440449
@@ -451,18 +460,7 @@ def _restart_pod(
451460 namespace : str , service : Service , index : int , cluster : Optional [str ] = None
452461 ) -> None :
453462 """Restart pod by deleting it"""
454- # Get the list of pods (one string per line).
455- kubectl_args = [
456- "get" ,
457- "pods" ,
458- "-o" ,
459- "name" ,
460- ]
461- kubectl_args .extend (get_namespace_args (namespace , cluster ))
462- pods = run_kubectl_command (kubectl_args , capture_output = True ).stdout .splitlines ()
463-
464- # Filter the list of pods to only include the ones that match the service and extract the pod name.
465- pods = [pod .split ("/" )[1 ] for pod in pods if pod .startswith (f"pod/{ service .pod_name } " )]
463+ pods = _get_pod_names (namespace , service , index , cluster )
466464
467465 if not pods :
468466 print_error (f"Could not find pods for service { service .pod_name } ." )
@@ -536,12 +534,12 @@ def __init__(
536534
537535 def restart_service (self , instance_index : int ) -> bool :
538536 """Restart the instance one by one, running the use code in between each restart."""
539- self ._restart_pod (
540- self .namespace_and_instruction_args .get_namespace (instance_index ),
541- self .service ,
542- instance_index ,
543- self .namespace_and_instruction_args .get_cluster (instance_index ),
544- )
537+ # self._restart_pod(
538+ # self.namespace_and_instruction_args.get_namespace(instance_index),
539+ # self.service,
540+ # instance_index,
541+ # self.namespace_and_instruction_args.get_cluster(instance_index),
542+ # )
545543 instructions = self .namespace_and_instruction_args .get_instruction (instance_index )
546544 print_colored (
547545 f"Restarted pod { instance_index } .\n { instructions if instructions is not None else '' } " ,
@@ -550,6 +548,86 @@ def restart_service(self, instance_index: int) -> bool:
550548 return self .check_between_restarts (instance_index )
551549
552550
551+ class WaitOnMetrticOneByOneRestarter (ChecksBetweenRestarts ):
552+ def __init__ (
553+ self ,
554+ namespace_and_instruction_args : NamespaceAndInstructionArgs ,
555+ service : Service ,
556+ metric_name : str ,
557+ metrics_port : int ,
558+ metric_value_condition : "MetricConditionGater.MetricCondition" ,
559+ ):
560+ def _check_between_restarts (instance_index : int ) -> bool :
561+ # This is to prevent the case where we get the pod name of the old pod we just deleted.
562+ # TODO(guy.f): Verify this is not the name of the old pod some other way.
563+ sleep (2 )
564+ pod_names = WaitOnMetrticOneByOneRestarter ._wait_for_pods_to_be_ready (
565+ namespace_and_instruction_args .get_namespace (instance_index ),
566+ namespace_and_instruction_args .get_cluster (instance_index ),
567+ service ,
568+ )
569+ if pod_names is None :
570+ return False
571+
572+ for pod_name in pod_names :
573+ metric_condition_gater = MetricConditionGater (
574+ metric_name ,
575+ namespace_and_instruction_args .get_namespace (instance_index ),
576+ namespace_and_instruction_args .get_cluster (instance_index ),
577+ pod_name ,
578+ metrics_port ,
579+ )
580+ metric_condition_gater .gate (metric_value_condition )
581+ if instance_index == namespace_and_instruction_args .size () - 1 :
582+ return True
583+ return wait_until_y_or_n (f"Do you want to restart the next pod?" )
584+
585+ super ().__init__ (namespace_and_instruction_args , service , _check_between_restarts )
586+
587+ @staticmethod
588+ def _wait_for_pods_to_be_ready (
589+ namespace : str ,
590+ cluster : Optional [str ],
591+ service : Service ,
592+ wait_timeout : int = 180 ,
593+ num_retry : int = 3 ,
594+ refresh_delay_sec : int = 3 ,
595+ ) -> Optional [list [str ]]:
596+ for i in range (num_retry ):
597+ pods = _get_pod_names (namespace , service , 0 , cluster )
598+ if pods :
599+ for pod in pods :
600+ print_colored (
601+ f"Waiting for pod { pod } to be ready... (timeout: { wait_timeout } s)"
602+ )
603+ kubectl_args = [
604+ "wait" ,
605+ "--for=condition=ready" ,
606+ f"pod/{ pod } " ,
607+ "--timeout" ,
608+ f"{ wait_timeout } s" ,
609+ ]
610+ kubectl_args .extend (get_namespace_args (namespace , cluster ))
611+ result = run_kubectl_command (kubectl_args , capture_output = False )
612+
613+ if result .returncode != 0 :
614+ print_colored (
615+ f"Timed out waiting for pod { pod } to be ready: { result .stderr } , retrying... (attempt { i + 1 } /{ num_retry } )" ,
616+ Colors .YELLOW ,
617+ )
618+ break
619+ return pods
620+ else :
621+ print_colored (
622+ f"Could not get pod names for service { service .pod_name } , retrying... (attempt { i + 1 } /{ num_retry } )" ,
623+ Colors .YELLOW ,
624+ )
625+ sleep (refresh_delay_sec )
626+
627+ print_error (f"Pods for service { service .pod_name } are not ready after { num_retry } attempts" )
628+ return None
629+
630+
553631class NoOpServiceRestarter (ServiceRestarter ):
554632 """No-op service restarter."""
555633
0 commit comments