Skip to content

Commit 30e33e3

Browse files
scripts: Add code for waiting until the restarted job is ready and the metric condition is met
1 parent f257625 commit 30e33e3

File tree

2 files changed

+115
-26
lines changed

2 files changed

+115
-26
lines changed

scripts/prod/update_config_and_restart_nodes.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
from update_config_and_restart_nodes_lib import (
99
ApolloArgsParserBuilder,
1010
Colors,
11+
MetricConditionGater,
1112
NamespaceAndInstructionArgs,
1213
Service,
1314
ServiceRestarter,
15+
WaitOnMetrticOneByOneRestarter,
1416
print_colored,
1517
print_error,
1618
update_config_and_restart_nodes,
@@ -155,10 +157,19 @@ def main():
155157
None,
156158
)
157159

158-
restarter = ServiceRestarter.from_restart_strategy(
159-
args.restart_strategy,
160+
# Create the appropriate restarter based on the restart strategy
161+
# restarter = ServiceRestarter.from_restart_strategy(
162+
# args.restart_strategy,
163+
# namespace_and_instruction_args,
164+
# args.service,
165+
# )
166+
167+
restarter = WaitOnMetrticOneByOneRestarter(
160168
namespace_and_instruction_args,
161169
args.service,
170+
"mempool_p2p_propagator_local_msgs_processed",
171+
8082,
172+
MetricConditionGater.MetricCondition(lambda val: val > 4167980, "Greater than 4167980"),
162173
)
163174

164175
update_config_and_restart_nodes(

scripts/prod/update_config_and_restart_nodes_lib.py

Lines changed: 102 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -280,6 +280,7 @@ def _poll_until_condition_met(
280280
self, metric_value_condition: "MetricConditionGater.MetricCondition"
281281
):
282282
"""Poll metrics until the condition is met for the metric."""
283+
condition_desc = f"({metric_value_condition.condition_description if metric_value_condition.condition_description is not None else ""}) "
283284
while True:
284285
metrics = self._get_metrics_raw_string()
285286
assert metrics is not None, f"Failed to get metrics from for pod {self.pod}"
@@ -300,12 +301,6 @@ def _poll_until_condition_met(
300301
f"Metric '{self.metric_name}' not found in pod {self.pod}. Assuming the node is not ready."
301302
)
302303
elif metric_value_condition.value_condition(val):
303-
condition_desc = (
304-
f"({metric_value_condition.condition_description}) "
305-
if metric_value_condition.condition_description is not None
306-
else ""
307-
)
308-
309304
print_colored(
310305
f"Metric {self.metric_name} condition {condition_desc}met (value={val})."
311306
)
@@ -435,6 +430,20 @@ def get_context_list_from_args(
435430
]
436431

437432

433+
def _get_pod_names(
434+
namespace: str, service: Service, index: int, cluster: Optional[str] = None
435+
) -> list[str]:
436+
kubectl_args = [
437+
"get",
438+
"pods",
439+
"-o",
440+
"name",
441+
]
442+
kubectl_args.extend(get_namespace_args(namespace, cluster))
443+
pods = run_kubectl_command(kubectl_args, capture_output=True).stdout.splitlines()
444+
return [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")]
445+
446+
438447
class ServiceRestarter(ABC):
439448
"""Abstract class for restarting service instances."""
440449

@@ -451,18 +460,7 @@ def _restart_pod(
451460
namespace: str, service: Service, index: int, cluster: Optional[str] = None
452461
) -> None:
453462
"""Restart pod by deleting it"""
454-
# Get the list of pods (one string per line).
455-
kubectl_args = [
456-
"get",
457-
"pods",
458-
"-o",
459-
"name",
460-
]
461-
kubectl_args.extend(get_namespace_args(namespace, cluster))
462-
pods = run_kubectl_command(kubectl_args, capture_output=True).stdout.splitlines()
463-
464-
# Filter the list of pods to only include the ones that match the service and extract the pod name.
465-
pods = [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")]
463+
pods = _get_pod_names(namespace, service, index, cluster)
466464

467465
if not pods:
468466
print_error(f"Could not find pods for service {service.pod_name}.")
@@ -536,12 +534,12 @@ def __init__(
536534

537535
def restart_service(self, instance_index: int) -> bool:
538536
"""Restart the instance one by one, running the use code in between each restart."""
539-
self._restart_pod(
540-
self.namespace_and_instruction_args.get_namespace(instance_index),
541-
self.service,
542-
instance_index,
543-
self.namespace_and_instruction_args.get_cluster(instance_index),
544-
)
537+
# self._restart_pod(
538+
# self.namespace_and_instruction_args.get_namespace(instance_index),
539+
# self.service,
540+
# instance_index,
541+
# self.namespace_and_instruction_args.get_cluster(instance_index),
542+
# )
545543
instructions = self.namespace_and_instruction_args.get_instruction(instance_index)
546544
print_colored(
547545
f"Restarted pod {instance_index}.\n{instructions if instructions is not None else ''} ",
@@ -550,6 +548,86 @@ def restart_service(self, instance_index: int) -> bool:
550548
return self.check_between_restarts(instance_index)
551549

552550

551+
class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
552+
def __init__(
553+
self,
554+
namespace_and_instruction_args: NamespaceAndInstructionArgs,
555+
service: Service,
556+
metric_name: str,
557+
metrics_port: int,
558+
metric_value_condition: "MetricConditionGater.MetricCondition",
559+
):
560+
def _check_between_restarts(instance_index: int) -> bool:
561+
# This is to prevent the case where we get the pod name of the old pod we just deleted.
562+
# TODO(guy.f): Verify this is not the name of the old pod some other way.
563+
sleep(2)
564+
pod_names = WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
565+
namespace_and_instruction_args.get_namespace(instance_index),
566+
namespace_and_instruction_args.get_cluster(instance_index),
567+
service,
568+
)
569+
if pod_names is None:
570+
return False
571+
572+
for pod_name in pod_names:
573+
metric_condition_gater = MetricConditionGater(
574+
metric_name,
575+
namespace_and_instruction_args.get_namespace(instance_index),
576+
namespace_and_instruction_args.get_cluster(instance_index),
577+
pod_name,
578+
metrics_port,
579+
)
580+
metric_condition_gater.gate(metric_value_condition)
581+
if instance_index == namespace_and_instruction_args.size() - 1:
582+
return True
583+
return wait_until_y_or_n(f"Do you want to restart the next pod?")
584+
585+
super().__init__(namespace_and_instruction_args, service, _check_between_restarts)
586+
587+
@staticmethod
588+
def _wait_for_pods_to_be_ready(
589+
namespace: str,
590+
cluster: Optional[str],
591+
service: Service,
592+
wait_timeout: int = 180,
593+
num_retry: int = 3,
594+
refresh_delay_sec: int = 3,
595+
) -> Optional[list[str]]:
596+
for i in range(num_retry):
597+
pods = _get_pod_names(namespace, service, 0, cluster)
598+
if pods:
599+
for pod in pods:
600+
print_colored(
601+
f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
602+
)
603+
kubectl_args = [
604+
"wait",
605+
"--for=condition=ready",
606+
f"pod/{pod}",
607+
"--timeout",
608+
f"{wait_timeout}s",
609+
]
610+
kubectl_args.extend(get_namespace_args(namespace, cluster))
611+
result = run_kubectl_command(kubectl_args, capture_output=False)
612+
613+
if result.returncode != 0:
614+
print_colored(
615+
f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {i + 1}/{num_retry})",
616+
Colors.YELLOW,
617+
)
618+
break
619+
return pods
620+
else:
621+
print_colored(
622+
f"Could not get pod names for service {service.pod_name}, retrying... (attempt {i + 1}/{num_retry})",
623+
Colors.YELLOW,
624+
)
625+
sleep(refresh_delay_sec)
626+
627+
print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
628+
return None
629+
630+
553631
class NoOpServiceRestarter(ServiceRestarter):
554632
"""No-op service restarter."""
555633

0 commit comments

Comments
 (0)