scripts: Add a new restarter that gates on metrics (#9925)

guyf-starkware · web-flow · commit 3c013fcf4c43 · 2025-11-04T12:46:06.000Z
diff --git a/scripts/prod/metrics_lib.py b/scripts/prod/metrics_lib.py
@@ -9,43 +9,42 @@
 import socket
 import urllib.error
 import urllib.request
+from common_lib import Colors, get_namespace_args, print_colored, print_error
 from prometheus_client.parser import text_string_to_metric_families
 
-from scripts.prod.common_lib import Colors, get_namespace_args, print_colored, print_error
-
 
 class MetricConditionGater:
     """Gates progress on a metric satisfying a condition.
 
     This class was meant to be used with counter/gauge metrics. It may not work properly with histogram metrics.
     """
 
-    class MetricCondition:
+    class Metric:  # A metric name bundled with the condition its value must satisfy.
         def __init__(
             self,
+            name: str,  # Compared against the metric family name when polling.
             value_condition: Callable[[Any], bool],
             condition_description: Optional[str] = None,
         ):
+            self.name = name
             self.value_condition = value_condition
             self.condition_description = condition_description
 
     def __init__(
         self,
-        metric_name: str,
+        metric: "MetricConditionGater.Metric",  # Name and condition of the metric to gate on.
         namespace: str,
         cluster: Optional[str],
         pod: str,
         metrics_port: int,
-        metric_value_condition: "MetricConditionGater.MetricCondition",
         refresh_interval_seconds: int = 3,
     ):
-        self.metric_name = metric_name
+        self.metric = metric
         self.local_port = self._get_free_port()
         self.namespace = namespace
         self.cluster = cluster
         self.pod = pod
         self.metrics_port = metrics_port
-        self.metric_value_condition = metric_value_condition
         self.refresh_interval_seconds = refresh_interval_seconds
 
     @staticmethod
@@ -77,8 +76,8 @@ def _get_metrics_raw_string(self) -> str:
     def _poll_until_condition_met(self):
         """Poll metrics until the condition is met for the metric."""
         condition_description = (
-            f"({self.metric_value_condition.condition_description}) "
-            if self.metric_value_condition.condition_description is not None
+            f"({self.metric.condition_description}) "
+            if self.metric.condition_description is not None
             else ""
         )
 
@@ -89,26 +88,26 @@ def _poll_until_condition_met(self):
             metric_families = text_string_to_metric_families(metrics)
             val = None
             for metric_family in metric_families:
-                if metric_family.name == self.metric_name:
+                if metric_family.name == self.metric.name:
                     if len(metric_family.samples) > 1:
                         print_error(
-                            f"Multiple samples found for metric {self.metric_name}. Using the first one.",
+                            f"Multiple samples found for metric {self.metric.name}. Using the first one.",
                         )
                     val = metric_family.samples[0].value
                     break
 
             if val is None:
                 print_colored(
-                    f"Metric '{self.metric_name}' not found in pod {self.pod}. Assuming the node is not ready."
+                    f"Metric '{self.metric.name}' not found in pod {self.pod}. Assuming the node is not ready."
                 )
-            elif self.metric_value_condition.value_condition(val):
+            elif self.metric.value_condition(val):
                 print_colored(
-                    f"Metric {self.metric_name} condition {condition_description}met (value={val})."
+                    f"Metric {self.metric.name} condition {condition_description}met (value={val})."
                 )
                 return
             else:
                 print_colored(
-                    f"Metric {self.metric_name} condition {condition_description}not met (value={val}). Continuing to wait."
+                    f"Metric {self.metric.name} condition {condition_description}not met (value={val}). Continuing to wait."
                 )
 
             sleep(self.refresh_interval_seconds)
diff --git a/scripts/prod/restarter_lib.py b/scripts/prod/restarter_lib.py
@@ -2,6 +2,7 @@
 
 import sys
 from abc import ABC, abstractmethod
+from time import sleep
 from typing import Callable, Optional
 
 from common_lib import (
@@ -15,6 +16,7 @@
     run_kubectl_command,
     wait_until_y_or_n,
 )
+from metrics_lib import MetricConditionGater
 
 
 def _get_pod_names(
@@ -142,3 +144,138 @@ def restart_service(self, instance_index: int) -> bool:
         """No-op."""
         print_colored("\nSkipping pod restart.")
         return True
+
+
+class WaitOnMetricRestarter(ChecksBetweenRestarts):
+    """Restarter that gates on a list of metric conditions after restarting pods.
+
+    With RestartStrategy.ONE_BY_ONE the metrics are checked after every restart and the user is
+    prompted before moving on. With RestartStrategy.ALL_AT_ONCE all instances are restarted first
+    and the metrics of every instance are checked only after the last restart.
+    """
+
+    def __init__(
+        self,
+        namespace_and_instruction_args: NamespaceAndInstructionArgs,
+        service: Service,
+        metrics: list["MetricConditionGater.Metric"],
+        metrics_port: int,
+        restart_strategy: RestartStrategy,
+    ):
+        """
+        metrics: conditions that every pod must satisfy; they are gated on one after the other.
+        metrics_port: port on the pod that serves the metrics.
+        restart_strategy: ONE_BY_ONE or ALL_AT_ONCE; any other value prints an error and exits.
+        """
+        self.metrics = metrics
+        self.metrics_port = metrics_port
+        if restart_strategy == RestartStrategy.ONE_BY_ONE:
+            check_function = self._check_between_each_restart
+        elif restart_strategy == RestartStrategy.ALL_AT_ONCE:
+            check_function = self._check_all_only_after_last_restart
+        else:
+            print_error(f"Invalid restart strategy: {restart_strategy} for WaitOnMetricRestarter.")
+            sys.exit(1)
+
+        super().__init__(namespace_and_instruction_args, service, check_function)
+
+    def _check_between_each_restart(self, instance_index: int) -> bool:
+        """Gate on the restarted instance, then ask whether to restart the next one."""
+        if not self._wait_for_pod_to_satisfy_condition(instance_index):
+            # The pods never became ready, so the metrics were not checked. Do not continue.
+            return False
+        if instance_index == self.namespace_and_instruction_args.size() - 1:
+            # Last instance, no need to prompt the user about the next restart.
+            return True
+        return wait_until_y_or_n("Do you want to restart the next pod?")
+
+    def _check_all_only_after_last_restart(self, instance_index: int) -> bool:
+        """Let every restart through, then gate on all instances after the last one."""
+        num_instances = self.namespace_and_instruction_args.size()
+        # Restart all nodes without waiting for confirmation.
+        if instance_index < num_instances - 1:
+            return True
+
+        # After the last node has been restarted, wait for all pods to satisfy the condition.
+        for index in range(num_instances):
+            if not self._wait_for_pod_to_satisfy_condition(index):
+                return False
+        return True
+
+    def _wait_for_pod_to_satisfy_condition(self, instance_index: int) -> bool:
+        """Wait for the instance's pods to be ready and then for all metric conditions to be met.
+
+        Returns False if the pods did not become ready, True once every metric was gated on.
+        """
+        # The sleep is to prevent the case where we get the pod name of the old pod we just deleted
+        # instead of the new one.
+        # TODO(guy.f): Verify this is not the name of the old pod some other way.
+        sleep(2)
+        namespace = self.namespace_and_instruction_args.get_namespace(instance_index)
+        cluster = self.namespace_and_instruction_args.get_cluster(instance_index)
+        pod_names = WaitOnMetricRestarter._wait_for_pods_to_be_ready(
+            namespace, cluster, self.service
+        )
+        if pod_names is None:
+            return False
+
+        for pod_name in pod_names:
+            for metric in self.metrics:
+                metric_condition_gater = MetricConditionGater(
+                    metric, namespace, cluster, pod_name, self.metrics_port
+                )
+                metric_condition_gater.gate()
+        return True
+
+    @staticmethod
+    def _wait_for_pods_to_be_ready(
+        namespace: str,
+        cluster: Optional[str],
+        service: Service,
+        wait_timeout: int = 180,
+        num_retry: int = 3,
+        refresh_delay_sec: int = 3,
+    ) -> Optional[list[str]]:
+        """
+        Wait for pods to be in ready mode as reported by Kubernetes.
+
+        Returns the pod names once all of them are ready, or None if that did not happen within
+        num_retry attempts. The pod names are fetched again on every attempt.
+        """
+
+        for i in range(num_retry):
+            pods = _get_pod_names(namespace, service, 0, cluster)
+            if pods:
+                for pod in pods:
+                    print_colored(
+                        f"Waiting for pod {pod} to be ready... (timeout set to {wait_timeout}s)"
+                    )
+                    kubectl_args = [
+                        "wait",
+                        "--for=condition=ready",
+                        f"pod/{pod}",
+                        "--timeout",
+                        f"{wait_timeout}s",
+                    ]
+                    kubectl_args.extend(get_namespace_args(namespace, cluster))
+                    result = run_kubectl_command(kubectl_args, capture_output=False)
+
+                    if result.returncode != 0:
+                        # NOTE(review): stderr is likely None with capture_output=False; confirm.
+                        print_colored(
+                            f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {i + 1}/{num_retry})",
+                            Colors.YELLOW,
+                        )
+                        break
+                else:
+                    # Only reached when the loop did not break, i.e. every pod is ready.
+                    return pods
+            else:
+                print_colored(
+                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {i + 1}/{num_retry})",
+                    Colors.YELLOW,
+                )
+            sleep(refresh_delay_sec)
+
+        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
+        return None