Skip to content

Commit a92f22d

Browse files
scripts: Add code for waiting until the restarted job is ready and the metric condition is met
1 parent 11a1376 commit a92f22d

File tree

1 file changed

+77
-14
lines changed

1 file changed

+77
-14
lines changed

scripts/prod/update_config_and_restart_nodes_lib.py

Lines changed: 77 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -222,10 +222,10 @@ class MetricConditionGater:
222222
def __init__(
223223
self,
224224
metric_name: str,
225+
namespace: str,
226+
cluster: Optional[str],
225227
pod: str,
226228
metrics_port: int,
227-
namespace: str,
228-
cluster: Optional[str] = None,
229229
refresh_interval_seconds: int = 3,
230230
):
231231
self.metric_name = metric_name
@@ -396,6 +396,20 @@ def get_context_list_from_args(
396396
]
397397

398398

399+
def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """Return the names of the pods that belong to ``service``.

    Runs ``kubectl get pods -o name`` with the arguments produced by
    ``get_namespace_args(namespace, cluster)`` appended, keeps the output lines that
    start with ``pod/<service.pod_name>``, and returns the segment that follows the
    first ``/`` of each kept line.

    ``index`` is accepted but is not used by this function.
    """
    command = ["get", "pods", "-o", "name", *get_namespace_args(namespace, cluster)]
    listing = run_kubectl_command(command, capture_output=True).stdout
    wanted_prefix = f"pod/{service.pod_name}"

    matching_names = []
    for line in listing.splitlines():
        if line.startswith(wanted_prefix):
            # Each line has the form "<kind>/<name>"; keep only the name part.
            matching_names.append(line.split("/")[1])
    return matching_names
411+
412+
399413
class ServiceRestarter(ABC):
400414
"""Abstract class for restarting service instances."""
401415

@@ -412,18 +426,7 @@ def _restart_pod(
412426
namespace: str, service: Service, index: int, cluster: Optional[str] = None
413427
) -> None:
414428
"""Restart pod by deleting it"""
415-
# Get the list of pods (one string per line).
416-
kubectl_args = [
417-
"get",
418-
"pods",
419-
"-o",
420-
"name",
421-
]
422-
kubectl_args.extend(get_namespace_args(namespace, cluster))
423-
pods = run_kubectl_command(kubectl_args, capture_output=True).stdout.splitlines()
424-
425-
# Filter the list of pods to only include the ones that match the service and extract the pod name.
426-
pods = [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")]
429+
pods = _get_pod_names(namespace, service, index, cluster)
427430

428431
if not pods:
429432
print_error(f"Could not find pods for service {service.pod_name}.")
@@ -509,6 +512,66 @@ def restart_service(self, instance_index: int) -> bool:
509512
return self.check_between_restarts(instance_index)
510513

511514

515+
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Restarter whose between-restart check waits for pod readiness and a metric gate.

    The check function handed to ``ChecksBetweenRestarts`` does the following for the
    instance that was just restarted:

    1. Waits, via ``kubectl wait``, for every pod of ``service`` in that instance's
       namespace/cluster to report the ``ready`` condition.
    2. Calls ``MetricConditionGater.gate(metric_value_condition)`` for ``metric_name``
       (presumably blocking until the condition holds -- confirm against the gater).
    3. Returns ``True`` for the last instance; otherwise returns the answer of a
       yes/no prompt asking whether to restart the next pod.

    NOTE(review): "Metrtic" in the class name looks like a typo for "Metric"; it is kept
    unchanged here because callers reference the class by this name.
    """

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: Callable[[Any], bool],
    ):
        """Build the between-restart check and pass it to the base class.

        Args:
            namespace_and_instruction_args: Source of the per-instance namespace and
                cluster, and of the total number of instances (``size()``).
            service: Service whose pods are restarted; ``service.pod_name`` is used as
                the pod-name prefix and as the pod given to the metric gater.
            metric_name: Name of the metric passed to ``MetricConditionGater``.
            metrics_port: Metrics port passed to ``MetricConditionGater``.
            metric_value_condition: Predicate passed to ``MetricConditionGater.gate``.
        """

        def _check_between_restarts(instance_index: int) -> bool:
            namespace = namespace_and_instruction_args.get_namespace(instance_index)
            cluster = namespace_and_instruction_args.get_cluster(instance_index)

            # The pod lookup and `kubectl wait` must target the same namespace/cluster
            # as the instance that was just restarted.
            if not WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                service,
                namespace=namespace,
                index=instance_index,
                cluster=cluster,
            ):
                return False

            metric_condition_gater = MetricConditionGater(
                metric_name,
                namespace,
                cluster,
                service.pod_name,
                metrics_port,
            )
            metric_condition_gater.gate(metric_value_condition)

            # Nothing is left to restart after the last instance, so do not prompt.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
        *,
        namespace: str,
        index: int,
        cluster: Optional[str] = None,
    ) -> bool:
        """Wait until every pod of ``service`` reports the ``ready`` condition.

        Each attempt lists the service's pods with ``_get_pod_names`` and runs
        ``kubectl wait --for=condition=ready pod/<name> --timeout <wait_timeout>s`` for
        each of them. An attempt succeeds only if every wait exits with code 0.

        Args:
            service: Service whose pods are waited on.
            wait_timeout: Per-pod timeout, in seconds, given to ``kubectl wait``.
            num_retry: Maximum number of attempts.
            refresh_delay_sec: Seconds to sleep between attempts.
            namespace: Namespace passed to ``_get_pod_names`` and ``get_namespace_args``.
            index: Instance index forwarded to ``_get_pod_names``.
            cluster: Optional cluster passed alongside ``namespace``.

        Returns:
            ``True`` once all pods were ready within a single attempt, ``False`` if no
            attempt succeeded.
        """
        namespace_args = get_namespace_args(namespace, cluster)

        for i in range(num_retry):
            pods = _get_pod_names(namespace, service, index, cluster)
            if pods:
                for pod in pods:
                    print_colored(
                        f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                    )
                    result = run_kubectl_command(
                        [
                            "wait",
                            "--for=condition=ready",
                            f"pod/{pod}",
                            "--timeout",
                            f"{wait_timeout}s",
                            *namespace_args,
                        ],
                        capture_output=False,
                    )

                    if result.returncode != 0:
                        # NOTE(review): with capture_output=False, result.stderr is
                        # presumably None (kubectl writes to the terminal directly) --
                        # confirm against run_kubectl_command.
                        print_colored(
                            f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {i + 1}/{num_retry})",
                            Colors.YELLOW,
                        )
                        break
                else:
                    # The inner loop finished without `break`: every pod is ready.
                    return True
            else:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {i + 1}/{num_retry})",
                    Colors.YELLOW,
                )

            # Only pause when another attempt will follow.
            if i < num_retry - 1:
                sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return False
573+
574+
512575
class NoOpServiceRestarter(ServiceRestarter):
513576
"""No-op service restarter."""
514577

0 commit comments

Comments
 (0)