-
Notifications
You must be signed in to change notification settings - Fork 0
[DPE-9325]: Network HA tests #23
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: 9/edge
Are you sure you want to change the base?
Changes from all commits
5fb720b
93f8b41
a0e62d4
c7caead
af42d57
8889bd5
7157121
90750a1
6a80e46
301e627
2be061c
e2ea39f
07353c3
a8a2f18
87c443e
2cd5c8b
1f73be7
b51956d
1615306
7aa4505
c63f21e
9093a80
d8e2754
fc9c9d3
3bc8774
b812847
913f85f
073f087
f6a8489
9d36b0f
12e63fb
935d794
be9de60
9b27441
7a70828
3f2177e
31c217a
b5a9bea
ef5415a
888ffbd
5f2bc81
79b45f0
b1b4f06
577171e
c74a27a
2f00f1f
827e58d
199492b
72e4b4f
38923b1
c436a4a
6ea5fa2
bfc60e9
21e1837
c48daeb
25d25b8
e8db36c
a6d02bd
cde911e
1bf286e
efac5a8
c165c68
230b4e5
3dbb471
8b50dff
7c553be
40fb300
50db852
d60a25b
76a9b52
bc3b51b
47f5c12
9a6f877
1fdc7e9
3218f37
e46b5f3
7c5afc2
5244669
9a0a081
60504e2
ea71353
ab3e4c5
99562f0
4258bd5
d00c206
ee7f331
0c4eb4e
994e852
53a6285
7e616ed
a9f33da
d5c3a01
bf491e8
27a6e23
178f560
ed477cf
d4aa771
3beea80
eeddaad
f2e80b0
68b89a4
250e39b
147240f
cb3e0ec
95abc33
7d51cb4
c1fa74e
dfbde41
ec578b7
a14839d
487ec64
b300ccd
40789bf
46cb2a9
01e8a73
c825223
f9c37f8
674b96f
610d333
733dbd1
b1258b4
cbe8f66
b30e1e9
e2ba6ef
ba0ccc0
9949da3
d4cfb59
ac4348b
3fed063
e2a4964
7288292
b521089
e6267cb
f103091
a8f8912
d796c8c
7ea8175
64bb344
320d17d
ce045e4
71008c5
3fef2bf
f500b43
49f8826
fde927f
f56dd74
2dee78c
d327afd
5fe97f0
f15a45a
c8e40d6
f87db82
10940e4
d6a0bce
75a90d3
867e699
0bf408e
c02cf29
184119c
2be86dd
3412ba9
08fc054
e798fc1
a0c6017
d00a189
abe43b9
d801de9
4dc6340
4e399a8
335100b
b229e7d
be5bd08
d62ea9b
cb1138d
a931e7c
d0aeff6
abc1996
dacaaba
a883967
2a78390
1ca51a3
3fbf5c9
e61cf2c
b9b961a
95c6a04
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -19,7 +19,7 @@ | |
| ValkeyServicesFailedToStartError, | ||
| ValkeyWorkloadCommandError, | ||
| ) | ||
| from common.locks import ScaleDownLock, StartLock | ||
| from common.locks import RestartLock, ScaleDownLock, StartLock | ||
| from literals import ( | ||
| CLIENT_PORT, | ||
| DATA_STORAGE, | ||
|
|
@@ -41,6 +41,29 @@ | |
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class RestartWorkloadEvent(ops.EventBase): | ||
| """Event for restarting the workload when certain events happen, e.g. IP change.""" | ||
|
|
||
| def __init__( | ||
| self, handle: ops.Handle, restart_valkey: bool = True, restart_sentinel: bool = True | ||
| ): | ||
| super().__init__(handle) | ||
| self.restart_valkey = restart_valkey | ||
| self.restart_sentinel = restart_sentinel | ||
|
|
||
| def snapshot(self) -> dict[str, str]: | ||
| """Save the state of the event.""" | ||
| return { | ||
| "restart_valkey": str(self.restart_valkey), | ||
| "restart_sentinel": str(self.restart_sentinel), | ||
| } | ||
|
|
||
| def restore(self, snapshot: dict[str, str]) -> None: | ||
| """Restore the state of the event.""" | ||
| self.restart_valkey = snapshot.get("restart_valkey", "True") == "True" | ||
| self.restart_sentinel = snapshot.get("restart_sentinel", "True") == "True" | ||
|
|
||
|
|
||
| class UnitFullyStarted(ops.EventBase): | ||
| """Event that signals that the unit has fully started. | ||
|
|
||
|
|
@@ -66,6 +89,7 @@ class BaseEvents(ops.Object): | |
| """Handle all base events.""" | ||
|
|
||
| unit_fully_started = ops.EventSource(UnitFullyStarted) | ||
| restart_workload = ops.EventSource(RestartWorkloadEvent) | ||
|
|
||
| def __init__(self, charm: "ValkeyCharm"): | ||
| super().__init__(charm, key="base_events") | ||
|
|
@@ -81,6 +105,7 @@ def __init__(self, charm: "ValkeyCharm"): | |
| self.framework.observe(self.charm.on.config_changed, self._on_config_changed) | ||
| self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed) | ||
| self.framework.observe(self.unit_fully_started, self._on_unit_fully_started) | ||
| self.framework.observe(self.restart_workload, self._on_restart_workload) | ||
| self.framework.observe( | ||
| self.charm.on[DATA_STORAGE].storage_detaching, self._on_storage_detaching | ||
| ) | ||
|
|
@@ -230,7 +255,7 @@ def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None: | |
| if not self.charm.unit.is_leader(): | ||
| return | ||
|
|
||
| for lock in [StartLock(self.charm.state)]: | ||
| for lock in [StartLock(self.charm.state), RestartLock(self.charm.state)]: | ||
| lock.process() | ||
|
|
||
| def _on_update_status(self, event: ops.UpdateStatusEvent) -> None: | ||
|
|
@@ -287,12 +312,15 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None: | |
|
|
||
| def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: | ||
| """Handle the config_changed event.""" | ||
| self.charm.state.unit_server.update( | ||
| { | ||
| "hostname": self.charm.state.hostname, | ||
| "private_ip": self.charm.state.bind_address, | ||
| } | ||
| ) | ||
| # on k8s we use hostnames so we do not have to reconfigure on ip change | ||
| if ( | ||
| self.charm.state.unit_server.model.private_ip | ||
| and self.charm.state.bind_address != self.charm.state.unit_server.model.private_ip | ||
| and self.charm.state.substrate == Substrate.VM | ||
| ): | ||
| self.charm.config_manager.configure_services( | ||
| self.charm.sentinel_manager.get_primary_ip() | ||
| ) | ||
|
|
||
| if not self.charm.unit.is_leader(): | ||
| return | ||
|
|
@@ -524,3 +552,54 @@ def _set_state_for_going_away(self) -> None: | |
| ) | ||
|
|
||
| self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY}) | ||
|
|
||
| def _on_restart_workload(self, event: RestartWorkloadEvent) -> None: | ||
| """Handle the restart_workload event.""" | ||
| logger.info( | ||
| "Restarting workload Event. Restart Valkey: %s, Restart Sentinel: %s", | ||
| event.restart_valkey, | ||
| event.restart_sentinel, | ||
| ) | ||
| restart_lock = RestartLock(self.charm.state) | ||
| restart_lock.request_lock() | ||
| if not restart_lock.is_held_by_this_unit: | ||
| logger.info("Waiting for lock to restart workload") | ||
| event.defer() | ||
| return | ||
|
|
||
| if event.restart_valkey: | ||
| self.charm.workload.restart(self.charm.workload.valkey_service) | ||
| if event.restart_sentinel: | ||
| self.charm.sentinel_manager.restart_service() | ||
|
|
||
|
Comment on lines
+570
to
+574
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We need to add error handling here in case these methods raise, otherwise the event would crash. |
||
| if event.restart_valkey and not self.charm.cluster_manager.is_healthy( | ||
| check_replica_sync=False | ||
| ): | ||
| self.charm.status.set_running_status( | ||
| ClusterStatuses.VALKEY_UNHEALTHY_RESTART.value, | ||
| scope="unit", | ||
| component_name=self.charm.cluster_manager.name, | ||
| statuses_state=self.charm.state.statuses, | ||
| ) | ||
|
|
||
| self.charm.state.statuses.delete( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does this reset a status that might just previously have been set? Or am I missing something here? |
||
| ClusterStatuses.VALKEY_UNHEALTHY_RESTART.value, | ||
| scope="unit", | ||
| component=self.charm.cluster_manager.name, | ||
| ) | ||
|
|
||
| if event.restart_sentinel and not self.charm.sentinel_manager.is_healthy(): | ||
| self.charm.status.set_running_status( | ||
| ClusterStatuses.SENTINEL_UNHEALTHY_RESTART.value, | ||
| scope="unit", | ||
| component_name=self.charm.cluster_manager.name, | ||
| statuses_state=self.charm.state.statuses, | ||
| ) | ||
|
|
||
| self.charm.state.statuses.delete( | ||
| ClusterStatuses.SENTINEL_UNHEALTHY_RESTART.value, | ||
| scope="unit", | ||
| component=self.charm.cluster_manager.name, | ||
| ) | ||
|
|
||
|
Comment on lines
+575
to
+604
Comment on lines
+575
to
+604
|
||
| restart_lock.release_lock() | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |
| CLIENT_PORT, | ||
| CLIENT_TLS_RELATION_NAME, | ||
| PEER_RELATION, | ||
| Substrate, | ||
| TLSCARotationState, | ||
| TLSState, | ||
| ) | ||
|
|
@@ -78,6 +79,7 @@ def __init__(self, charm: "ValkeyCharm"): | |
| self.charm.on[PEER_RELATION].relation_changed, self._on_peer_relation_changed | ||
| ) | ||
| self.framework.observe(self.charm.on.update_status, self._on_update_status) | ||
| self.framework.observe(self.charm.on.config_changed, self._on_config_changed) | ||
|
|
||
| def _on_peer_relation_created(self, event: ops.RelationCreatedEvent) -> None: | ||
| """Set up self-signed certificates for peer TLS by default.""" | ||
|
|
@@ -299,6 +301,30 @@ def _enable_client_tls(self) -> None: | |
| self.charm.cluster_manager.reload_tls_settings(tls_config) | ||
| self.charm.sentinel_manager.restart_service() | ||
|
|
||
| def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Separating concerns is nice and I really like it! After reading the code though I think I need to revert my earlier comment and admit that it makes more sense to have it in the same event handler and method. Reason being that it seems to be overly complicated just to adhere to the structure, and now half of the logic here is not related to TLS concerns. Therefore, I'd say let's move this code to the |
||
| """Handle the `config-changed` event.""" | ||
| if ( | ||
| self.charm.state.unit_server.model.private_ip | ||
| and self.charm.state.bind_address != self.charm.state.unit_server.model.private_ip | ||
| ): | ||
| if self.charm.tls_manager.certificate_sans_require_update(): | ||
| if not self.charm.state.client_tls_relation: | ||
| self.charm.tls_manager.create_and_store_self_signed_certificate() | ||
| else: | ||
| self.charm.tls_events.refresh_tls_certificates_event.emit() | ||
| event.defer() | ||
| return | ||
|
|
||
| self.charm.state.unit_server.update( | ||
| { | ||
| "hostname": self.charm.state.hostname, | ||
| "private_ip": self.charm.state.bind_address, | ||
| } | ||
| ) | ||
| # only restart on VM because on k8s the hostname is stable and does not change with IP changes | ||
| if self.charm.state.substrate == Substrate.VM: | ||
| self.charm.base_events.restart_workload.emit() | ||
|
|
||
| def _orchestrate_ca_rotation(self) -> None: | ||
| """Orchestrate the workflow when a TLS CA rotation has been initiated.""" | ||
| match self.charm.state.unit_server.tls_ca_rotation_state: | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice! This will certainly be useful!