@@ -2,6 +2,7 @@
 
 import sys
 from abc import ABC, abstractmethod
+from time import sleep
 from typing import Callable, Optional
 
 from common_lib import (
@@ -15,6 +16,7 @@
     run_kubectl_command,
     wait_until_y_or_n,
 )
+from metrics_lib import MetricConditionGater
 
 
 def _get_pod_names(
@@ -142,3 +144,113 @@ def restart_service(self, instance_index: int) -> bool: |
         """No-op."""
         print_colored("\nSkipping pod restart.")
         return True
+
+
+class WaitOnMetricRestarter(ChecksBetweenRestarts):
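+    """Restarter that gates each pod restart on metric conditions.
+
+    Depending on the restart strategy, it waits for the restarted pod (or for
+    all pods after the last restart) to become Ready and for every configured
+    metric condition to be satisfied before continuing.
+    """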
+    def __init__(
+        self,
+        namespace_and_instruction_args: NamespaceAndInstructionArgs,
+        service: Service,
+        metrics: list["MetricConditionGater.Metric"],
+        metrics_port: int,
+        restart_strategy: RestartStrategy,
+    ):
+        self.metrics = metrics
+        self.metrics_port = metrics_port
+        if restart_strategy == RestartStrategy.ONE_BY_ONE:
+            check_function = self._check_between_each_restart
+        elif restart_strategy == RestartStrategy.ALL_AT_ONCE:
+            check_function = self._check_all_only_after_last_restart
+        else:
+            print_error(f"Invalid restart strategy: {restart_strategy} for WaitOnMetricRestarter.")
+            sys.exit(1)
+
+        super().__init__(namespace_and_instruction_args, service, check_function)
+
+    def _check_between_each_restart(self, instance_index: int) -> bool:
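+        """Wait on the restarted pod's metrics, then ask before restarting the next pod."""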
+        if not self._wait_for_pod_to_satisfy_condition(instance_index):
+            return False
+        if instance_index == self.namespace_and_instruction_args.size() - 1:
+            # Last instance, no need to prompt the user about the next restart.
+            return True
+        return wait_until_y_or_n("Do you want to restart the next pod?")
+
+    def _check_all_only_after_last_restart(self, instance_index: int) -> bool:
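+        """Restart all pods without prompting; wait for metric conditions only after the last one."""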
+        # Restart all nodes without waiting for confirmation.
+        if instance_index < self.namespace_and_instruction_args.size() - 1:
+            return True
+
+        # After the last node has been restarted, wait for all pods to satisfy the condition.
+        for idx in range(self.namespace_and_instruction_args.size()):
+            if not self._wait_for_pod_to_satisfy_condition(idx):
+                return False
+        return True
+
+    def _wait_for_pod_to_satisfy_condition(self, instance_index: int) -> bool:
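+        """Wait for the instance's pods to be Ready and for every metric condition to be met."""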
+        # The sleep is to prevent the case where we get the pod name of the old pod we just deleted
+        # instead of the new one.
+        # TODO(guy.f): Verify this is not the name of the old pod some other way.
+        sleep(2)
+        pod_names = WaitOnMetricRestarter._wait_for_pods_to_be_ready(
+            self.namespace_and_instruction_args.get_namespace(instance_index),
+            self.namespace_and_instruction_args.get_cluster(instance_index),
+            self.service,
+        )
+        if pod_names is None:
+            return False
+
+        # Gate on every configured metric condition for each of the new pods.
+        for pod_name in pod_names:
+            for metric in self.metrics:
+                metric_condition_gater = MetricConditionGater(
+                    metric,
+                    self.namespace_and_instruction_args.get_namespace(instance_index),
+                    self.namespace_and_instruction_args.get_cluster(instance_index),
+                    pod_name,
+                    self.metrics_port,
+                )
+                metric_condition_gater.gate()
+        return True
+
+    @staticmethod
+    def _wait_for_pods_to_be_ready(
+        namespace: str,
+        cluster: Optional[str],
+        service: Service,
+        wait_timeout: int = 180,
+        num_retry: int = 3,
+        refresh_delay_sec: int = 3,
+    ) -> Optional[list[str]]:
+        """
+        Wait for the service's pods to reach the Ready condition as reported by Kubernetes.
+
+        Returns the pod names once all of them are Ready, or None after num_retry failed attempts.
+        """
+
+        for i in range(num_retry):
+            pods = _get_pod_names(namespace, service, 0, cluster)
+            if pods:
+                for pod in pods:
+                    print_colored(
+                        f"Waiting for pod {pod} to be ready... (timeout set to {wait_timeout}s)"
+                    )
+                    kubectl_args = [
+                        "wait",
+                        "--for=condition=ready",
+                        f"pod/{pod}",
+                        "--timeout",
+                        f"{wait_timeout}s",
+                    ]
+                    kubectl_args.extend(get_namespace_args(namespace, cluster))
+                    result = run_kubectl_command(kubectl_args, capture_output=False)
+
+                    if result.returncode != 0:
+                        print_colored(
+                            f"Timed out waiting for pod {pod} to be ready, retrying... (attempt {i + 1}/{num_retry})",
+                            Colors.YELLOW,
+                        )
+                        break
+                else:
+                    # Every pod reached the Ready condition.
+                    return pods
+            else:
+                print_colored(
+                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {i + 1}/{num_retry})",
+                    Colors.YELLOW,
+                )
+            sleep(refresh_delay_sec)
+
+        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
+        return None