Skip to content

Commit 0f5612a

Browse files
scripts: Add code for waiting until the restarted job is ready and the metric condition is met
1 parent 3d2a34a commit 0f5612a

File tree

1 file changed

+73
-14
lines changed

1 file changed

+73
-14
lines changed

scripts/prod/update_config_and_restart_nodes_lib.py

Lines changed: 73 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -255,7 +255,7 @@ def _get_metrics(self) -> str:
255255
)
256256
sleep(self.refresh_interval_seconds)
257257

258-
def _poll_condition_met(self, metric_value_condition: Callable[[Any], bool]):
258+
def _poll_until_condition_met(self, metric_value_condition: Callable[[Any], bool]):
259259
"""Poll metrics until the condition is met for the metric."""
260260
while True:
261261
metrics = self._get_metrics()
@@ -325,7 +325,7 @@ def signal_handler(signum, frame):
325325
signal.signal(signal.SIGINT, signal_handler)
326326
signal.signal(signal.SIGTERM, signal_handler)
327327

328-
self._poll_condition_met(metric_value_condition)
328+
self._poll_until_condition_met(metric_value_condition)
329329

330330
finally:
331331
# Ensure subprocess is always terminated
@@ -402,6 +402,20 @@ def get_context_list_from_args(
402402
]
403403

404404

405+
def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """Return the names of the pods whose name starts with the service's pod name.

    Runs ``kubectl get pods -o name`` (plus the arguments produced by
    ``get_namespace_args(namespace, cluster)``) and keeps every output line
    that starts with ``pod/<service.pod_name>``, stripping the ``pod/`` part.

    Args:
        namespace: Forwarded to ``get_namespace_args``.
        service: Service whose ``pod_name`` is used as the name prefix filter.
        index: Accepted for signature parity with the callers; not used here.
        cluster: Forwarded to ``get_namespace_args``.

    Returns:
        The matching pod names (without the ``pod/`` prefix); empty if none match.
    """
    command = ["get", "pods", "-o", "name", *get_namespace_args(namespace, cluster)]
    listing = run_kubectl_command(command, capture_output=True).stdout

    # `kubectl get -o name` prints one "<kind>/<name>" entry per line.
    wanted_prefix = f"pod/{service.pod_name}"
    matching_names = []
    for entry in listing.splitlines():
        if entry.startswith(wanted_prefix):
            matching_names.append(entry.split("/")[1])
    return matching_names
417+
418+
405419
class ServiceRestarter(ABC):
406420
"""Abstract class for restarting service instances."""
407421

@@ -418,18 +432,7 @@ def _restart_pod(
418432
namespace: str, service: Service, index: int, cluster: Optional[str] = None
419433
) -> None:
420434
"""Restart pod by deleting it"""
421-
# Get the list of pods (one string per line).
422-
kubectl_args = [
423-
"get",
424-
"pods",
425-
"-o",
426-
"name",
427-
]
428-
kubectl_args.extend(get_namespace_args(namespace, cluster))
429-
pods = run_kubectl_command(kubectl_args, capture_output=True).stdout.splitlines()
430-
431-
# Filter the list of pods to only include the ones that match the service and extract the pod name.
432-
pods = [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")]
435+
pods = _get_pod_names(namespace, service, index, cluster)
433436

434437
if not pods:
435438
print_error(f"Could not find pods for service {service.pod_name}.")
@@ -516,6 +519,62 @@ def restart_service(self, instance_index: int) -> bool:
516519
return self.check_between_restarts(instance_index)
517520

518521

522+
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Restarter whose between-restarts check waits for pod readiness and a metric condition.

    The check passed to ``ChecksBetweenRestarts`` does three things, in order:
    waits for the service's pods to report ready, runs
    ``MetricConditionGater.gate`` with the supplied condition, and (except after
    the last instance) asks the operator whether to continue.

    NOTE: the class name keeps its original spelling ("Metrtic") so existing
    references to it keep working.
    """

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: Callable[[Any], bool],
    ):
        """Build the between-restarts check and hand it to the base class.

        Args:
            namespace_and_instruction_args: Per-instance namespace/instruction data;
                its ``size()`` is used to detect the last instance.
            service: Service being restarted; ``service.pod_name`` selects its pods.
            metric_name: Metric name passed to ``MetricConditionGater``.
            metrics_port: Metrics port passed to ``MetricConditionGater``.
            metric_value_condition: Predicate passed to ``MetricConditionGater.gate``.
        """

        def _check_between_restarts(instance_index: int) -> bool:
            # NOTE(review): assumes NamespaceAndInstructionArgs exposes
            # get_namespace(index) / get_cluster(index) -- only size() is visible
            # here; confirm against its definition.
            pods_ready = WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                service,
                namespace=namespace_and_instruction_args.get_namespace(instance_index),
                cluster=namespace_and_instruction_args.get_cluster(instance_index),
                instance_index=instance_index,
            )
            if not pods_ready:
                return False

            metric_condition_gater = MetricConditionGater(
                metric_name, service.pod_name, metrics_port
            )
            metric_condition_gater.gate(metric_value_condition)

            # Nothing left to restart after the last instance, so do not prompt.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
        *,
        namespace: Optional[str] = None,
        cluster: Optional[str] = None,
        instance_index: int = 0,
    ) -> bool:
        """Wait until every pod of ``service`` reports the ``ready`` condition.

        Each attempt lists the service's pods and runs ``kubectl wait`` on each
        one. If any pod fails its wait, or no pods are found, the attempt is
        abandoned and retried after ``refresh_delay_sec`` seconds.

        Args:
            service: Service whose pods are waited on.
            wait_timeout: Per-pod ``kubectl wait`` timeout, in seconds.
            num_retry: Maximum number of attempts.
            refresh_delay_sec: Delay between attempts, in seconds.
            namespace: Namespace to query; required.
            cluster: Optional cluster, forwarded to ``get_namespace_args``.
            instance_index: Instance index forwarded to ``_get_pod_names``.

        Returns:
            True once all pods were ready within one attempt, False if every
            attempt failed.

        Raises:
            ValueError: If ``namespace`` is not provided.
        """
        if namespace is None:
            raise ValueError("namespace is required to wait for pods to be ready")

        namespace_args = get_namespace_args(namespace, cluster)

        for i in range(num_retry):
            pods = _get_pod_names(namespace, service, instance_index, cluster)
            if pods:
                for pod in pods:
                    print_colored(
                        f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                    )
                    result = run_kubectl_command(
                        [
                            "wait",
                            "--for=condition=ready",
                            f"pod/{pod}",
                            f"--timeout={wait_timeout}s",
                            *namespace_args,
                        ],
                        capture_output=False,
                    )

                    if result.returncode != 0:
                        print_colored(
                            f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {i + 1}/{num_retry})",
                            Colors.YELLOW,
                        )
                        break
                else:
                    # The loop finished without `break`: every pod became ready.
                    return True
            else:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {i + 1}/{num_retry})",
                    Colors.YELLOW,
                )
            sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return False
576+
577+
519578
class NoOpServiceRestarter(ServiceRestarter):
520579
"""No-op service restarter."""
521580

0 commit comments

Comments
 (0)