Skip to content
Closed
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 96 additions & 12 deletions scripts/prod/update_config_and_restart_nodes_lib.py
Original file line number Diff line number Diff line change
Expand Up @@ -433,6 +433,20 @@ def get_context_list_from_args(
]


def _get_pod_names(
    namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> list[str]:
    """Return the names of the pods whose name starts with ``service.pod_name``.

    Runs ``kubectl get pods -o name`` with the namespace/cluster arguments
    produced by ``get_namespace_args`` and keeps only the entries that begin
    with ``pod/<service.pod_name>``. The leading ``pod/`` resource prefix is
    removed from each returned name.

    Args:
        namespace: Namespace forwarded to ``get_namespace_args``.
        service: Service whose ``pod_name`` is used as the name prefix filter.
        index: Not used by this function.
        cluster: Optional cluster forwarded to ``get_namespace_args``.

    Returns:
        The matching pod names, in the order kubectl printed them. Empty when
        nothing matches.
    """
    command = ["get", "pods", "-o", "name", *get_namespace_args(namespace, cluster)]
    # kubectl prints one "<resource>/<name>" entry per line.
    listing = run_kubectl_command(command, capture_output=True).stdout
    wanted_prefix = f"pod/{service.pod_name}"
    return [
        entry.split("/")[1]
        for entry in listing.splitlines()
        if entry.startswith(wanted_prefix)
    ]


class ServiceRestarter(ABC):
"""Abstract class for restarting service instances."""

Expand All @@ -449,18 +463,7 @@ def _restart_pod(
namespace: str, service: Service, index: int, cluster: Optional[str] = None
) -> None:
"""Restart pod by deleting it"""
# Get the list of pods (one string per line).
kubectl_args = [
"get",
"pods",
"-o",
"name",
]
kubectl_args.extend(get_namespace_args(namespace, cluster))
pods = run_kubectl_command(kubectl_args, capture_output=True).stdout.splitlines()

# Filter the list of pods to only include the ones that match the service and extract the pod name.
pods = [pod.split("/")[1] for pod in pods if pod.startswith(f"pod/{service.pod_name}")]
pods = _get_pod_names(namespace, service, index, cluster)

if not pods:
print_error(f"Could not find pods for service {service.pod_name}.")
Expand Down Expand Up @@ -548,6 +551,87 @@ def restart_service(self, instance_index: int) -> bool:
return self.check_between_restarts(instance_index)


class WaitOnMetrticOneByOneRestarter(ChecksBetweenRestarts):
    """Gate consecutive restarts on pod readiness and a metric condition.

    The constructor builds a check callback and hands it to
    ``ChecksBetweenRestarts``. For a given instance index the callback:

    1. waits for the service's pods to become ready (``kubectl wait``),
    2. runs a ``MetricConditionGater`` against every pod found,
    3. returns ``True`` for the last instance, otherwise returns the
       operator's answer to a y/n prompt.
    """

    def __init__(
        self,
        namespace_and_instruction_args: NamespaceAndInstructionArgs,
        service: Service,
        metric_name: str,
        metrics_port: int,
        metric_value_condition: "MetricConditionGater.MetricCondition",
    ):
        """Create the restarter.

        Args:
            namespace_and_instruction_args: Source of the per-instance
                namespace and cluster, and of the number of instances.
            service: Service whose pods are waited on and gated.
            metric_name: Metric name passed to ``MetricConditionGater``.
            metrics_port: Port passed to ``MetricConditionGater``.
            metric_value_condition: Condition passed to
                ``MetricConditionGater``.
        """

        def _check_between_restarts(instance_index: int) -> bool:
            # This is to prevent the case where we get the pod name of the old pod we just deleted.
            # TODO(guy.f): Verify this is not the name of the old pod some other way.
            sleep(2)

            # Looked up once and reused for the readiness wait and every gater.
            namespace = namespace_and_instruction_args.get_namespace(instance_index)
            cluster = namespace_and_instruction_args.get_cluster(instance_index)

            pod_names = WaitOnMetrticOneByOneRestarter._wait_for_pods_to_be_ready(
                namespace,
                cluster,
                service,
            )
            if pod_names is None:
                return False

            for pod_name in pod_names:
                metric_condition_gater = MetricConditionGater(
                    metric_name,
                    namespace,
                    cluster,
                    pod_name,
                    metrics_port,
                    metric_value_condition,
                )
                # Presumably blocks until the metric condition holds - confirm
                # against MetricConditionGater.
                metric_condition_gater.gate()

            # Nothing follows the last instance, so there is nothing to ask.
            if instance_index == namespace_and_instruction_args.size() - 1:
                return True
            return wait_until_y_or_n("Do you want to restart the next pod?")

        super().__init__(namespace_and_instruction_args, service, _check_between_restarts)

    @staticmethod
    def _wait_for_pods_to_be_ready(
        namespace: str,
        cluster: Optional[str],
        service: Service,
        wait_timeout: int = 180,
        num_retry: int = 3,
        refresh_delay_sec: int = 3,
    ) -> Optional[list[str]]:
        """Wait until every pod of ``service`` reports the ``ready`` condition.

        Each attempt re-reads the pod list and runs
        ``kubectl wait --for=condition=ready`` on every pod. The attempt
        succeeds only if all waits exit with return code 0; an empty pod list
        or any failed wait triggers a retry after ``refresh_delay_sec``.

        Args:
            namespace: Namespace to query.
            cluster: Optional cluster to query.
            service: Service whose pods are waited on.
            wait_timeout: Per-pod ``kubectl wait`` timeout, in seconds.
            num_retry: Maximum number of attempts.
            refresh_delay_sec: Delay after a failed attempt, in seconds.

        Returns:
            The pod names once all of them are ready, or ``None`` if no
            attempt succeeded.
        """
        for attempt in range(num_retry):
            pods = _get_pod_names(namespace, service, 0, cluster)
            if not pods:
                print_colored(
                    f"Could not get pod names for service {service.pod_name}, retrying... (attempt {attempt + 1}/{num_retry})",
                    Colors.YELLOW,
                )
            else:
                for pod in pods:
                    print_colored(
                        f"Waiting for pod {pod} to be ready... (timeout: {wait_timeout}s)"
                    )
                    kubectl_args = [
                        "wait",
                        "--for=condition=ready",
                        f"pod/{pod}",
                        "--timeout",
                        f"{wait_timeout}s",
                    ]
                    kubectl_args.extend(get_namespace_args(namespace, cluster))
                    result = run_kubectl_command(kubectl_args, capture_output=False)

                    if result.returncode != 0:
                        # NOTE(review): with capture_output=False, result.stderr is
                        # likely None if run_kubectl_command wraps subprocess.run -
                        # confirm and consider dropping it from the message.
                        print_colored(
                            f"Timed out waiting for pod {pod} to be ready: {result.stderr}, retrying... (attempt {attempt + 1}/{num_retry})",
                            Colors.YELLOW,
                        )
                        break
                else:
                    # Reached only when no wait failed. A `break` above skips
                    # this branch so a timed-out pod is never reported as ready.
                    return pods
            sleep(refresh_delay_sec)

        print_error(f"Pods for service {service.pod_name} are not ready after {num_retry} attempts")
        return None


class NoOpServiceRestarter(ServiceRestarter):
"""No-op service restarter."""

Expand Down