Skip to content

Commit 4cb5cb6

Browse files
scripts: Add code for waiting until the restarted job is ready and the metric condition is met
1 parent b7d1580 commit 4cb5cb6

File tree

2 files changed

+118
-20
lines changed

2 files changed

+118
-20
lines changed

scripts/prod/update_config_and_restart_nodes.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
from update_config_and_restart_nodes_lib import (
99
ApolloArgsParserBuilder,
1010
Colors,
11+
MetricConditionGater,
1112
NamespaceAndInstructionArgs,
1213
Service,
1314
ServiceRestarter,
15+
WaitOnMetrticOneByOneRestarter,
1416
print_colored,
1517
print_error,
1618
update_config_and_restart_nodes,
@@ -155,10 +157,19 @@ def main():
155157
None,
156158
)
157159

158-
restarter = ServiceRestarter.from_restart_strategy(
159-
args.restart_strategy,
160+
# Create the appropriate restarter based on the restart strategy
161+
# restarter = ServiceRestarter.from_restart_strategy(
162+
# args.restart_strategy,
163+
# namespace_and_instruction_args,
164+
# args.service,
165+
# )
166+
167+
restarter = WaitOnMetrticOneByOneRestarter(
160168
namespace_and_instruction_args,
161169
args.service,
170+
"mempool_p2p_propagator_local_msgs_processed",
171+
8082,
172+
MetricConditionGater.MetricCondition(lambda val: val > 4167980, "Greater than 4167980"),
162173
)
163174

164175
update_config_and_restart_nodes(

scripts/prod/update_config_and_restart_nodes_lib.py

Lines changed: 105 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -278,12 +278,16 @@ def _poll_until_condition_met(
278278
self, metric_value_condition: "MetricConditionGater.MetricCondition"
279279
):
280280
"""Poll metrics until the condition is met for the metric."""
condition_description = (
    f"({metric_value_condition.condition_description}) "
    if metric_value_condition.condition_description is not None
    else ""
)

287291
while True:
288292
metrics = self._get_metrics_raw_string()
289293
assert metrics is not None, f"Failed to get metrics from for pod {self.pod}"
@@ -433,6 +437,20 @@ def get_context_list_from_args(
433437
]
434438

435439

440+
def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """Return the names of the pods belonging to *service* in *namespace*.

    Runs ``kubectl get pods -o name`` (one ``pod/<name>`` string per line),
    keeps only the entries whose name starts with the service's pod-name
    prefix, and strips the leading ``pod/`` resource prefix.

    NOTE(review): ``index`` is currently unused; it is kept because existing
    callers pass it positionally — confirm before removing.
    """
    # Ask kubectl for pod resource names only ("pod/<name>" per line).
    kubectl_args = [
        "get",
        "pods",
        "-o",
        "name",
    ]
    kubectl_args.extend(get_namespace_args(namespace, cluster))
    pods = run_kubectl_command(kubectl_args, capture_output=True).stdout.splitlines()
    # Filter to this service's pods and drop the "pod/" prefix.
    return [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")]
452+
453+
436454
class ServiceRestarter(ABC):
437455
"""Abstract class for restarting service instances."""
438456

@@ -449,18 +467,7 @@ def _restart_pod(
449467
namespace: str, service: Service, index: int, cluster: Optional[str] = None
450468
) -> None:
451469
"""Restart pod by deleting it"""
452-
# Get the list of pods (one string per line).
453-
kubectl_args = [
454-
"get",
455-
"pods",
456-
"-o",
457-
"name",
458-
]
459-
kubectl_args.extend(get_namespace_args(namespace, cluster))
460-
pods = run_kubectl_command(kubectl_args, capture_output=True).stdout.splitlines()
461-
462-
# Filter the list of pods to only include the ones that match the service and extract the pod name.
463-
pods = [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")]
470+
pods = _get_pod_names(namespace, service, index, cluster)
464471

465472
if not pods:
466473
print_error(f"Could not find pods for service {service.pod_name}.")
@@ -534,12 +541,12 @@ def __init__(
534541

535542
def restart_service(self, instance_index: int) -> bool:
536543
"""Restart the instance one by one, running the use code in between each restart."""
537-
self._restart_pod(
538-
self.namespace_and_instruction_args.get_namespace(instance_index),
539-
self.service,
540-
instance_index,
541-
self.namespace_and_instruction_args.get_cluster(instance_index),
542-
)
544+
# self._restart_pod(
545+
# self.namespace_and_instruction_args.get_namespace(instance_index),
546+
# self.service,
547+
# instance_index,
548+
# self.namespace_and_instruction_args.get_cluster(instance_index),
549+
# )
543550
instructions = self.namespace_and_instruction_args.get_instruction(instance_index)
544551
print_colored(
545552
f"Restarted pod {instance_index}.\n{instructions if instructions is not None else ''} ",
@@ -548,6 +555,86 @@ def restart_service(self, instance_index: int) -> bool:
548555
return self.check_between_restarts(instance_index)
549556

550557

558+
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Restart service instances one by one, gated on a metric condition.

    After each restart, waits for the service's pods to become ready and then
    polls ``metric_name`` on each pod (via :class:`MetricConditionGater`)
    until ``metric_value_condition`` is satisfied, before asking the operator
    whether to proceed to the next instance.

    TODO(review): the class name misspells "Metric" ("Metrtic"); rename
    together with all call sites.
    """

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: "MetricConditionGater.MetricCondition",
    ):
        def _check_between_restarts(instance_index: int) -> bool:
            # This is to prevent the case where we get the pod name of the old
            # pod we just deleted.
            # TODO(guy.f): Verify this is not the name of the old pod some other way.
            sleep(2)
            pod_names = WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                namespace_and_instruction_args.get_namespace(instance_index),
                namespace_and_instruction_args.get_cluster(instance_index),
                service,
            )
            if pod_names is None:
                return False

            # Block until the metric condition holds on every ready pod.
            for pod_name in pod_names:
                metric_condition_gater = MetricConditionGater(
                    metric_name,
                    namespace_and_instruction_args.get_namespace(instance_index),
                    namespace_and_instruction_args.get_cluster(instance_index),
                    pod_name,
                    metrics_port,
                )
                metric_condition_gater.gate(metric_value_condition)
            # Last instance: nothing left to restart, no need to ask.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            # Not an f-string on purpose — the prompt has no placeholders.
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        namespace: str,
        cluster: Optional[str],
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
    ) -> Optional[list[str]]:
        """Wait until all of the service's pods report the Ready condition.

        Makes up to ``num_retry`` attempts, sleeping ``refresh_delay_sec``
        seconds between them. Each attempt lists the pods and runs
        ``kubectl wait --for=condition=ready`` (``wait_timeout`` seconds) on
        every pod. Returns the pod names once all are ready, or ``None`` if
        every attempt fails (no pods found, or a readiness timeout).
        """
        for i in range(num_retry):
            pods = _get_pod_names(namespace, service, 0, cluster)
            if not pods:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {i + 1}/{num_retry})",
                    Colors.YELLOW,
                )
                sleep(refresh_delay_sec)
                continue

            for pod in pods:
                print_colored(
                    f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                )
                kubectl_args = [
                    "wait",
                    "--for=condition=ready",
                    f"pod/{pod}",
                    "--timeout",
                    f"{wait_timeout}s",
                ]
                kubectl_args.extend(get_namespace_args(namespace, cluster))
                # Output is streamed to the user (capture_output=False), so
                # result.stderr is not captured — don't reference it in messages.
                result = run_kubectl_command(kubectl_args, capture_output=False)

                if result.returncode != 0:
                    print_colored(
                        f"Timed out waiting for pod {pod} to be ready, retrying... (attempt {i + 1}/{num_retry})",
                        Colors.YELLOW,
                    )
                    # BUG FIX: previously this break fell through to
                    # "return pods", returning pods that never became ready.
                    break
            else:
                # No break: every pod reached the Ready condition.
                return pods
            sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return None
636+
637+
551638
class NoOpServiceRestarter(ServiceRestarter):
552639
"""No-op service restarter."""
553640

0 commit comments

Comments
 (0)