Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
185 commits
Select commit Hold shift + click to select a range
5fb720b
disable default user and add charmed-operator user and password gener…
skourta Jan 19, 2026
93f8b41
add secret handling and config for admin password
skourta Jan 19, 2026
a0e62d4
bind to 0.0.0.0
skourta Jan 20, 2026
c7caead
switch to glide
skourta Jan 20, 2026
af42d57
add unit tests
skourta Jan 20, 2026
8889bd5
add integration tests
skourta Jan 20, 2026
7157121
add install deps to ci unit tests
skourta Jan 20, 2026
90750a1
add sudo to apt
skourta Jan 20, 2026
6a80e46
install protobuf for glide on integration tests
skourta Jan 20, 2026
301e627
auto approve installing deps
skourta Jan 21, 2026
2be061c
update rust
skourta Jan 21, 2026
e2ea39f
sudo apt
skourta Jan 21, 2026
07353c3
set default rust on spread
skourta Jan 21, 2026
a8a2f18
save acl after updating password so the change persists across restarts
skourta Jan 21, 2026
87c443e
feedback from rene
skourta Jan 22, 2026
2cd5c8b
switch updating password to write acl file and then load it
skourta Jan 22, 2026
1f73be7
implement feedback
skourta Jan 26, 2026
b51956d
add different charm users
skourta Jan 27, 2026
1615306
update passwords on non leader units
skourta Jan 27, 2026
7aa4505
change scope of status for units and fix exception catching
skourta Jan 27, 2026
c63f21e
fixing unit tests WIP
skourta Jan 27, 2026
9093a80
Merge branch '9/edge' into create-all-charm-users
skourta Jan 28, 2026
d8e2754
small charm restructure and enhance unit tests
skourta Jan 28, 2026
fc9c9d3
fix integration tests
skourta Jan 28, 2026
3bc8774
add wrong username update test
skourta Jan 28, 2026
b812847
fix copilot feedback
skourta Jan 28, 2026
913f85f
fix unit tests
skourta Jan 28, 2026
073f087
add charm sentinel user
skourta Jan 28, 2026
f6a8489
initial scale up implementation
skourta Jan 29, 2026
9d36b0f
Merge branch 'DPE-9135-create-all-charm-users' into DPE-9174-scale-up
skourta Jan 29, 2026
12e63fb
set sentinel acl file
skourta Jan 29, 2026
935d794
add monitoring user
skourta Jan 29, 2026
be9de60
Merge branch 'DPE-9135-create-all-charm-users' into DPE-9174-scale-up
skourta Jan 29, 2026
9b27441
revert back secret name
skourta Jan 29, 2026
7a70828
Merge branch 'DPE-9135-create-all-charm-users' into DPE-9174-scale-up
skourta Jan 29, 2026
3f2177e
update users for acls and configs
skourta Jan 29, 2026
31c217a
add update primaryauth on password change
skourta Jan 29, 2026
b5a9bea
switch to ips instead of hostnames
skourta Jan 29, 2026
ef5415a
fix unit tests and remove checks from manager
skourta Jan 29, 2026
888ffbd
add statuses for starting
skourta Jan 29, 2026
5f2bc81
fix unit tests
skourta Jan 29, 2026
79b45f0
switch from valkey glide to valkey cli with subprocess
skourta Jan 30, 2026
b1b4f06
add unit local admin password and fix integration tests
skourta Jan 30, 2026
577171e
fix unit tests
skourta Jan 30, 2026
c74a27a
switch away from glide for integration tests
skourta Jan 30, 2026
2f00f1f
add replica password change and check on all units
skourta Jan 30, 2026
827e58d
add continuouswrites file
skourta Jan 30, 2026
199492b
Merge branch '9/edge' into DPE-9174-scale-up
skourta Feb 4, 2026
72e4b4f
add first integration test for scale up
skourta Feb 4, 2026
38923b1
add scaling spread file
skourta Feb 4, 2026
c436a4a
mock get_private_ip
skourta Feb 4, 2026
6ea5fa2
remove markers and etcd references
skourta Feb 4, 2026
bfc60e9
Merge branch '9/edge' into DPE-9174-scale-up
skourta Feb 5, 2026
21e1837
fix unit tests and add some workload functions
skourta Feb 5, 2026
c48daeb
add mode user and group to write file
skourta Feb 5, 2026
25d25b8
fix integration tests
skourta Feb 6, 2026
e8db36c
add one-by-one scaling up
skourta Feb 9, 2026
a6d02bd
add retries to sentinel discovery and replica sync check
skourta Feb 9, 2026
cde911e
better statuses
skourta Feb 9, 2026
1bf286e
Merge branch '9/edge' into DPE-9174-scale-up
skourta Feb 9, 2026
efac5a8
seed data and auto decode
skourta Feb 9, 2026
c165c68
add different scenarios for unit test non leader starting
skourta Feb 9, 2026
230b4e5
update vm tests
skourta Feb 9, 2026
3dbb471
remove primary ip from databag
skourta Feb 9, 2026
8b50dff
fix unit tests
skourta Feb 9, 2026
7c553be
fix vm startup
skourta Feb 10, 2026
40fb300
move spread file to correct position
skourta Feb 10, 2026
50db852
enable sentinel on VM
skourta Feb 10, 2026
d60a25b
mv cw to the base of integration tests
skourta Feb 10, 2026
76a9b52
add scaling tests on VM
skourta Feb 10, 2026
bc3b51b
fix typos
skourta Feb 10, 2026
47f5c12
fix typo
skourta Feb 10, 2026
9a6f877
remove resource from vm test scaling
skourta Feb 10, 2026
1fdc7e9
remove scaling comment and update cw to be pythonic
skourta Feb 11, 2026
3218f37
remove unused patch
skourta Feb 11, 2026
e46b5f3
turn off write logging for CW
skourta Feb 11, 2026
7c5afc2
add sentinel as daemon for vm and fix permissions for files
skourta Feb 11, 2026
5244669
fix role for valkey sentinel user
skourta Feb 11, 2026
9a0a081
update to the new rock and its user
skourta Feb 12, 2026
60504e2
only log the command no arguments to avoid leaking secrets
skourta Feb 12, 2026
ea71353
refactored start procedure and added rene feedback
skourta Feb 13, 2026
ab3e4c5
fix unit tests and fine tune statuses
skourta Feb 13, 2026
99562f0
fixes for rene feedback
skourta Feb 13, 2026
4258bd5
remove get_private_ip and replace it with bind_address
skourta Feb 13, 2026
d00c206
add unit tests for peer relation changed
skourta Feb 13, 2026
ee7f331
fix some feedback from mehdi
skourta Feb 16, 2026
0c4eb4e
refactor client and add health checks
skourta Feb 16, 2026
994e852
mock tenacity nap times and fix unit tests
skourta Feb 16, 2026
53a6285
update name of charmed_operator_password for units
skourta Feb 16, 2026
7e616ed
remove unnecessary check on admin app password
skourta Feb 16, 2026
a9f33da
add alive check in start
skourta Feb 16, 2026
d5c3a01
remove refresh argument from reading secret
skourta Feb 17, 2026
bf491e8
read and manage sentinel config via a dict
skourta Feb 17, 2026
27a6e23
move workload fields to be body annotations
skourta Feb 17, 2026
178f560
some minor changes based on feedback
skourta Feb 17, 2026
ed477cf
simplify and generalise start up logic
skourta Feb 18, 2026
d4aa771
remove unnecessary state and fix unit tests
skourta Feb 18, 2026
3beea80
only leader starts primary if num of units is 0
skourta Feb 18, 2026
eeddaad
clean the cases where primary ip is None and set a blocked status if …
skourta Feb 18, 2026
f2e80b0
extend unit test coverage and rename unit tests to reflect business l…
skourta Feb 18, 2026
68b89a4
leader has to start primary because non leaders might not see all uni…
skourta Feb 18, 2026
250e39b
add running status for better UX
skourta Feb 19, 2026
147240f
move to glide and wrap client requests in helpers
skourta Feb 19, 2026
cb3e0ec
install charmed-valkey snap
skourta Feb 19, 2026
95abc33
add sudo and snap to allowlist
skourta Feb 19, 2026
7d51cb4
mv from snap to downloading cli
skourta Feb 19, 2026
c1fa74e
switch creating glide client to context manager to close connection au…
skourta Feb 20, 2026
dfbde41
wip scale down
skourta Feb 20, 2026
ec578b7
revert back is sentinel discovered argument
skourta Feb 20, 2026
a14839d
statuses for scale down
skourta Feb 24, 2026
487ec64
refactor client to separate valkey and sentinel and use json where po…
skourta Feb 25, 2026
b300ccd
only recompute model when writing to databag
skourta Feb 25, 2026
40789bf
client refactoring and added delief
skourta Feb 25, 2026
46cb2a9
refactor locks to adhere to ux of advanced rollingops
skourta Feb 25, 2026
01e8a73
refactor managers to use the new clients
skourta Feb 25, 2026
c825223
refactor verify_expected_replica_count
skourta Feb 25, 2026
f9c37f8
update base events with new refactoring
skourta Feb 25, 2026
674b96f
remove unnecessary debug log
skourta Feb 25, 2026
610d333
Merge branch '9/edge' into DPE-9324-scale-down
skourta Feb 26, 2026
733dbd1
shorten statuses
skourta Feb 26, 2026
b1258b4
fix unit tests and change some function names on client
skourta Feb 26, 2026
cbe8f66
remove unnecessary catches
skourta Feb 26, 2026
b30e1e9
add scale down unit tests
skourta Feb 26, 2026
e2ba6ef
only try to update passwords on valkey if it is started
skourta Feb 26, 2026
ba0ccc0
add k8s scaledown tests
skourta Feb 26, 2026
9949da3
fix unit tests
skourta Feb 26, 2026
d4cfb59
handle scale down to 0
skourta Feb 26, 2026
ac4348b
fix unit test
skourta Feb 26, 2026
3fed063
add scaling down to 0 and back
skourta Feb 26, 2026
e2a4964
clear cw
skourta Feb 26, 2026
7288292
fix linter
skourta Feb 26, 2026
b521089
add remove app test
skourta Feb 26, 2026
e6267cb
copilot feedback
skourta Feb 27, 2026
f103091
port fix from tls for leader elected event
skourta Feb 27, 2026
a8f8912
cw use databag to filter units and use helper to remove units on both…
skourta Mar 3, 2026
d796c8c
Merge branch '9/edge' into DPE-9324-scale-down
skourta Mar 3, 2026
7ea8175
add c_writes to scale down
skourta Mar 3, 2026
64bb344
fail faster if any hostname is down
skourta Mar 3, 2026
320d17d
rename tests so we can easily run all scale down tests using -k
skourta Mar 3, 2026
ce045e4
vm agnostic test
skourta Mar 3, 2026
71008c5
add scale down primary test on vm
skourta Mar 3, 2026
3fef2bf
feedback from rene
skourta Mar 4, 2026
f500b43
add a todo comment
skourta Mar 4, 2026
49f8826
lint and add clearing c_writes
skourta Mar 4, 2026
fde927f
increase cw request timeout to 1s
skourta Mar 4, 2026
f56dd74
remove unneeded raises and augment unit test coverage for sentinel ma…
skourta Mar 4, 2026
2dee78c
reduce request timeout
skourta Mar 4, 2026
d327afd
Merge branch '9/edge' into DPE-9324-scale-down
skourta Mar 4, 2026
5fe97f0
fix conflicts
skourta Mar 4, 2026
f15a45a
add is_tls_enabled property
skourta Mar 10, 2026
c8e40d6
Merge branch '9/edge' into DPE-9324-scale-down
skourta Mar 10, 2026
f87db82
add primary ip to valkey lock
skourta Mar 10, 2026
10940e4
try to get primary ip for 40s and clean certificates on leader going out
skourta Mar 10, 2026
d6a0bce
fix and increase unit tests
skourta Mar 10, 2026
75a90d3
lint
skourta Mar 10, 2026
867e699
feedback from rene
skourta Mar 11, 2026
0bf408e
[DPE-9373]: Use hostnames instead of IPs for k8s (#19)
skourta Mar 13, 2026
c02cf29
add timestamp to lock
skourta Mar 16, 2026
184119c
fix lock bug
skourta Mar 16, 2026
2be86dd
network cut on k8s
skourta Mar 16, 2026
3412ba9
add handling ip change certs and tls in network cuts PR
skourta Mar 16, 2026
08fc054
skip tls on k8s and add spread files
skourta Mar 16, 2026
e798fc1
fix linter
skourta Mar 16, 2026
a0c6017
clean cwrites even when test fails
skourta Mar 16, 2026
d00a189
remove f strings in loggers
skourta Mar 17, 2026
abe43b9
charm level feedback
skourta Mar 17, 2026
d801de9
rename ip to endpoint and add existing app
skourta Mar 17, 2026
4dc6340
add support for existing app in scale tests
skourta Mar 17, 2026
4e399a8
patch is_failover_in_progress
skourta Mar 17, 2026
335100b
Merge branch 'DPE-9324-scale-down' into dpe-9325-network-ha
skourta Mar 17, 2026
b229e7d
simplify code
skourta Mar 17, 2026
be5bd08
fix bug and unit tests
skourta Mar 17, 2026
d62ea9b
add network cut without ip change for vm
skourta Mar 18, 2026
cb1138d
Merge branch '9/edge' into DPE-9324-scale-down
skourta Mar 18, 2026
a931e7c
only remove APP_NAME in tests
skourta Mar 18, 2026
d0aeff6
minor feedback
skourta Mar 18, 2026
abc1996
Merge branch 'DPE-9324-scale-down' into dpe-9325-network-ha
skourta Mar 18, 2026
dacaaba
small refactor
skourta Mar 18, 2026
a883967
fix bug in config gen
skourta Mar 18, 2026
2a78390
run tls on k8s too
skourta Mar 18, 2026
1ca51a3
add tls on for k8s
skourta Mar 18, 2026
3fbf5c9
remove skip on build and deploy
skourta Mar 18, 2026
e61cf2c
do not crash if deletion on key fails on valkey on cw clearing
skourta Mar 18, 2026
b9b961a
add rolling restart for ip change
skourta Mar 18, 2026
95c6a04
Merge branch '9/edge' into dpe-9325-network-ha
skourta Mar 18, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
270 changes: 268 additions & 2 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ python-dateutil = "*"
tenacity = "^9.1.2"
# https://github.com/valkey-io/valkey-glide/pull/5124 not yet released
valkey-glide = { git = "https://github.com/skourta/valkey-glide", subdirectory = "python/glide-async", branch = "add-build-rs-to-async-client" }
kubernetes = "^35.0.0"

[tool.coverage.run]
branch = true
Expand Down
51 changes: 41 additions & 10 deletions src/common/locks.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""Collection of locks for cluster operations."""

import logging
import time
from abc import abstractmethod
from typing import TYPE_CHECKING, Protocol, override

Expand Down Expand Up @@ -52,10 +53,16 @@ class DataBagLock(Lockable):

unit_request_lock_atr_name: str
member_with_lock_atr_name: str
lock_timestamp: str = "databaglock_timestamp"

def __init__(self, state: "ClusterState") -> None:
self.state = state

def __init_subclass__(cls) -> None:
"""Initialize subclass attributes."""
super().__init_subclass__()
cls.lock_timestamp = cls.__name__.lower() + "_timestamp"

@property
def units_requesting_lock(self) -> list[str]:
"""Get the list of units requesting the start lock."""
Expand All @@ -68,6 +75,8 @@ def units_requesting_lock(self) -> list[str]:
@property
def next_unit_to_give_lock(self) -> str | None:
    """Get the next unit to give the start lock to.

    The local unit takes priority when it has requested the lock itself;
    otherwise the first queued requester is chosen, or None when nobody
    is waiting.
    """
    own_request = self.state.unit_server.model[self.unit_request_lock_atr_name]
    if own_request:
        return self.state.unit_server.unit_name
    queue = self.units_requesting_lock
    if not queue:
        return None
    return queue[0]

@property
Expand Down Expand Up @@ -98,11 +107,13 @@ def is_held_by_this_unit(self) -> bool:

def request_lock(self) -> bool:
"""Request the lock for the local unit."""
self.state.unit_server.update(
{
self.unit_request_lock_atr_name: True,
}
)
if not self.state.unit_server.model[self.unit_request_lock_atr_name]:
self.state.unit_server.update(
{
self.unit_request_lock_atr_name: True,
self.lock_timestamp: time.time(),
}
)
if self.state.unit_server.unit.is_leader():
logger.info(
"Leader unit requesting %s lock. Triggering lock request processing.",
Expand All @@ -114,11 +125,13 @@ def request_lock(self) -> bool:

def release_lock(self) -> bool:
"""Release the lock from the local unit."""
self.state.unit_server.update(
{
self.unit_request_lock_atr_name: False,
}
)
if self.state.unit_server.model[self.unit_request_lock_atr_name]:
self.state.unit_server.update(
{
self.unit_request_lock_atr_name: False,
self.lock_timestamp: time.time(),
}
)
if self.state.unit_server.unit.is_leader():
logger.info(
"Leader unit releasing %s lock. Triggering lock request processing.",
Expand Down Expand Up @@ -157,6 +170,24 @@ def is_lock_free_to_give(self) -> bool:
not self.state.cluster.model.start_member
or not starting_unit
or starting_unit.is_started
or not starting_unit.model.request_start_lock
)


class RestartLock(DataBagLock):
    """Databag-backed lock guarding rolling restart operations."""

    unit_request_lock_atr_name = "request_restart_lock"
    member_with_lock_atr_name = "restart_member"

    @property
    def is_lock_free_to_give(self) -> bool:
        """Check if the unit with the restart lock has completed its operation."""
        holder = self.unit_with_lock
        # No member currently registered as holding the lock.
        if not self.state.cluster.model.restart_member:
            return True
        # The registered holder is gone (e.g. unit departed).
        if not holder:
            return True
        # The holder finished: it no longer requests the restart lock.
        return not holder.model.request_restart_lock


Expand Down
2 changes: 2 additions & 0 deletions src/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class PeerAppModel(PeerModel):
charmed_sentinel_peers_password: InternalUsersSecret = Field(default="")
charmed_sentinel_operator_password: InternalUsersSecret = Field(default="")
start_member: str = Field(default="")
restart_member: str = Field(default="")
internal_ca_certificate: InternalCertificatesSecret = Field(default="")
internal_ca_private_key: InternalCertificatesSecret = Field(default="")

Expand All @@ -65,6 +66,7 @@ class PeerUnitModel(PeerModel):
hostname: str = Field(default="")
private_ip: str = Field(default="")
request_start_lock: bool = Field(default=False)
request_restart_lock: bool = Field(default=False)
scale_down_state: str = Field(default="")
tls_client_state: str = Field(default="")
client_cert_ready: bool = Field(default=False)
Expand Down
95 changes: 87 additions & 8 deletions src/events/base_events.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
ValkeyServicesFailedToStartError,
ValkeyWorkloadCommandError,
)
from common.locks import ScaleDownLock, StartLock
from common.locks import RestartLock, ScaleDownLock, StartLock
from literals import (
CLIENT_PORT,
DATA_STORAGE,
Expand All @@ -41,6 +41,29 @@
logger = logging.getLogger(__name__)


class RestartWorkloadEvent(ops.EventBase):
    """Event for restarting the workload when certain events happen, e.g. IP change."""

    def __init__(
        self, handle: ops.Handle, restart_valkey: bool = True, restart_sentinel: bool = True
    ):
        """Initialize the event.

        Args:
            handle: ops framework handle identifying this event.
            restart_valkey: whether the Valkey service should be restarted.
            restart_sentinel: whether the Sentinel service should be restarted.
        """
        super().__init__(handle)
        self.restart_valkey = restart_valkey
        self.restart_sentinel = restart_sentinel

    def snapshot(self) -> dict[str, str]:
        """Save the state of the event so it survives a deferral.

        Booleans are serialized as the strings "True"/"False" because ops
        snapshots carry simple serializable values.
        """
        return {
            "restart_valkey": str(self.restart_valkey),
            "restart_sentinel": str(self.restart_sentinel),
        }

    def restore(self, snapshot: dict[str, str]) -> None:
        """Restore the state of the event from a snapshot.

        Missing keys default to "True", matching the constructor defaults.
        """
        self.restart_valkey = snapshot.get("restart_valkey", "True") == "True"
        self.restart_sentinel = snapshot.get("restart_sentinel", "True") == "True"


class UnitFullyStarted(ops.EventBase):
"""Event that signals that the unit's has fully started.

Expand All @@ -66,6 +89,7 @@ class BaseEvents(ops.Object):
"""Handle all base events."""

unit_fully_started = ops.EventSource(UnitFullyStarted)
restart_workload = ops.EventSource(RestartWorkloadEvent)

def __init__(self, charm: "ValkeyCharm"):
super().__init__(charm, key="base_events")
Expand All @@ -81,6 +105,7 @@ def __init__(self, charm: "ValkeyCharm"):
self.framework.observe(self.charm.on.config_changed, self._on_config_changed)
self.framework.observe(self.charm.on.secret_changed, self._on_secret_changed)
self.framework.observe(self.unit_fully_started, self._on_unit_fully_started)
self.framework.observe(self.restart_workload, self._on_restart_workload)
self.framework.observe(
self.charm.on[DATA_STORAGE].storage_detaching, self._on_storage_detaching
)
Expand Down Expand Up @@ -230,7 +255,7 @@ def _on_peer_relation_changed(self, event: ops.RelationChangedEvent) -> None:
if not self.charm.unit.is_leader():
return

for lock in [StartLock(self.charm.state)]:
for lock in [StartLock(self.charm.state), RestartLock(self.charm.state)]:
lock.process()

def _on_update_status(self, event: ops.UpdateStatusEvent) -> None:
Expand Down Expand Up @@ -287,12 +312,15 @@ def _on_leader_elected(self, event: ops.LeaderElectedEvent) -> None:

def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None:
"""Handle the config_changed event."""
self.charm.state.unit_server.update(
{
"hostname": self.charm.state.hostname,
"private_ip": self.charm.state.bind_address,
}
)
# on k8s we use hostnames so we do not have to reconfigure on ip change
if (
self.charm.state.unit_server.model.private_ip
and self.charm.state.bind_address != self.charm.state.unit_server.model.private_ip
and self.charm.state.substrate == Substrate.VM
):
self.charm.config_manager.configure_services(
self.charm.sentinel_manager.get_primary_ip()
)

if not self.charm.unit.is_leader():
return
Expand Down Expand Up @@ -524,3 +552,54 @@ def _set_state_for_going_away(self) -> None:
)

self.charm.state.unit_server.update({"scale_down_state": ScaleDownState.GOING_AWAY})

def _on_restart_workload(self, event: RestartWorkloadEvent) -> None:
    """Handle the restart_workload event.

    Serializes restarts across units via the RestartLock, restarts the
    requested services, and records a unit-scoped maintenance status when a
    restarted service does not come back healthy (clearing any stale status
    once the service is healthy again).
    """
    logger.info(
        "Restarting workload Event. Restart Valkey: %s, Restart Sentinel: %s",
        event.restart_valkey,
        event.restart_sentinel,
    )
    restart_lock = RestartLock(self.charm.state)
    restart_lock.request_lock()
    if not restart_lock.is_held_by_this_unit:
        # Another unit is restarting; retry this event once the lock frees up.
        logger.info("Waiting for lock to restart workload")
        event.defer()
        return

    try:
        if event.restart_valkey:
            self.charm.workload.restart(self.charm.workload.valkey_service)
        if event.restart_sentinel:
            self.charm.sentinel_manager.restart_service()

        if event.restart_valkey:
            if not self.charm.cluster_manager.is_healthy(check_replica_sync=False):
                self.charm.status.set_running_status(
                    ClusterStatuses.VALKEY_UNHEALTHY_RESTART.value,
                    scope="unit",
                    component_name=self.charm.cluster_manager.name,
                    statuses_state=self.charm.state.statuses,
                )
            else:
                # Healthy again: clear any stale unhealthy status from a
                # previous restart. (Previously this delete ran
                # unconditionally and wiped the status just set above.)
                self.charm.state.statuses.delete(
                    ClusterStatuses.VALKEY_UNHEALTHY_RESTART.value,
                    scope="unit",
                    component=self.charm.cluster_manager.name,
                )

        if event.restart_sentinel:
            if not self.charm.sentinel_manager.is_healthy():
                self.charm.status.set_running_status(
                    ClusterStatuses.SENTINEL_UNHEALTHY_RESTART.value,
                    scope="unit",
                    # NOTE(review): original used cluster_manager.name here;
                    # presumably the sentinel component was intended — confirm.
                    component_name=self.charm.cluster_manager.name,
                    statuses_state=self.charm.state.statuses,
                )
            else:
                self.charm.state.statuses.delete(
                    ClusterStatuses.SENTINEL_UNHEALTHY_RESTART.value,
                    scope="unit",
                    component=self.charm.cluster_manager.name,
                )
    finally:
        # Always release the lock, even if a restart or health check raises,
        # so the other units are not blocked indefinitely.
        restart_lock.release_lock()
26 changes: 26 additions & 0 deletions src/events/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
CLIENT_PORT,
CLIENT_TLS_RELATION_NAME,
PEER_RELATION,
Substrate,
TLSCARotationState,
TLSState,
)
Expand Down Expand Up @@ -78,6 +79,7 @@ def __init__(self, charm: "ValkeyCharm"):
self.charm.on[PEER_RELATION].relation_changed, self._on_peer_relation_changed
)
self.framework.observe(self.charm.on.update_status, self._on_update_status)
self.framework.observe(self.charm.on.config_changed, self._on_config_changed)

def _on_peer_relation_created(self, event: ops.RelationCreatedEvent) -> None:
"""Set up self-signed certificates for peer TLS by default."""
Expand Down Expand Up @@ -299,6 +301,30 @@ def _enable_client_tls(self) -> None:
self.charm.cluster_manager.reload_tls_settings(tls_config)
self.charm.sentinel_manager.restart_service()

def _on_config_changed(self, event: ops.ConfigChangedEvent) -> None:
    """Handle the `config-changed` event.

    Detects a change of the unit's IP address, refreshes TLS certificates
    whose SANs no longer match the new address, records the new address in
    the peer databag and (on VM only) triggers a workload restart.
    """
    # Only act when a previously recorded IP exists and differs from the
    # current bind address, i.e. the unit's address actually changed.
    if (
        self.charm.state.unit_server.model.private_ip
        and self.charm.state.bind_address != self.charm.state.unit_server.model.private_ip
    ):
        if self.charm.tls_manager.certificate_sans_require_update():
            if not self.charm.state.client_tls_relation:
                # Self-signed certificates can be regenerated locally right away.
                self.charm.tls_manager.create_and_store_self_signed_certificate()
            else:
                # Relation-provided certificates are renewed asynchronously;
                # retry this hook once the refreshed certificate has arrived.
                self.charm.tls_events.refresh_tls_certificates_event.emit()
                event.defer()
                return

        self.charm.state.unit_server.update(
            {
                "hostname": self.charm.state.hostname,
                "private_ip": self.charm.state.bind_address,
            }
        )
        # only restart on VM because on k8s the hostname is stable and does
        # not change with IP changes
        if self.charm.state.substrate == Substrate.VM:
            self.charm.base_events.restart_workload.emit()

def _orchestrate_ca_rotation(self) -> None:
"""Orchestrate the workflow when a TLS CA rotation has been initiated."""
match self.charm.state.unit_server.tls_ca_rotation_state:
Expand Down
3 changes: 1 addition & 2 deletions src/managers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@ def get_config_properties(self, primary_endpoint: str) -> dict[str, str]:
config_properties["aclfile"] = self.workload.acl_file.as_posix()
config_properties["dir"] = self.workload.working_dir.as_posix()

# bind to all interfaces
config_properties["bind"] = self.state.endpoint

# replica related config
Expand All @@ -93,7 +92,7 @@ def get_config_properties(self, primary_endpoint: str) -> dict[str, str]:

def _generate_replica_config(self, primary_endpoint: str) -> dict[str, str]:
"""Generate the config properties related to replica configuration based on the current cluster state."""
local_unit_endpoint = self.state.unit_server.get_endpoint(self.state.substrate)
local_unit_endpoint = self.state.endpoint
replica_config = {
"primaryuser": CharmUsers.VALKEY_REPLICA.value,
"primaryauth": self.state.cluster.internal_users_credentials.get(
Expand Down
47 changes: 47 additions & 0 deletions src/managers/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,53 @@ def start_ca_rotation_if_required(
)
return True

def get_current_sans(self) -> dict[str, set[str]]:
    """Get the current SANs of the unit's client certificate.

    Parses the `subjectAltName` extension of the on-disk certificate via
    `openssl x509` and separates DNS entries from IP entries.

    Returns:
        dict with keys "sans_ip" and "sans_dns", each a set of strings
        (both empty when the certificate carries no SAN extension).
    """
    cert_file = self.workload.tls_paths.client_cert

    sans_ip: set[str] = set()
    sans_dns: set[str] = set()
    if not (
        san_lines := self.workload.exec(
            [
                "openssl",
                "x509",
                "-ext",
                "subjectAltName",
                "-noout",
                "-in",
                cert_file.as_posix(),
            ]
        )[0].splitlines()
    ):
        return {"sans_ip": sans_ip, "sans_dns": sans_dns}

    for line in san_lines:
        for san in line.split(", "):
            # Skip blank lines and tokens without a "type:value" shape
            # (previously these raised ValueError on unpacking).
            if ":" not in san:
                continue
            # Split on the first colon only so IPv6 addresses
            # (e.g. "IP Address:2001:db8::1") are kept intact.
            san_type, san_value = san.split(":", 1)

            if san_type.strip() == "DNS":
                sans_dns.add(san_value)
            if san_type.strip() == "IP Address":
                sans_ip.add(san_value)

    return {"sans_ip": sans_ip, "sans_dns": sans_dns}

def certificate_sans_require_update(self) -> bool:
    """Check current certificate sans and determine if certificate requires update.

    Returns:
        bool: True if certificate sans have changed, False if they are still the same.
    """
    existing = self.get_current_sans()
    # Symmetric difference is non-empty when either side has an entry the
    # other lacks, i.e. the certificate no longer matches the desired SANs.
    ip_diff = self.build_sans_ip() ^ existing["sans_ip"]
    dns_diff = self.build_sans_dns() ^ existing["sans_dns"]
    return bool(ip_diff) or bool(dns_diff)

def get_statuses(self, scope: Scope, recompute: bool = False) -> list[StatusObject]:
"""Compute the TLS statuses."""
status_list: list[StatusObject] = []
Expand Down
12 changes: 12 additions & 0 deletions src/statuses.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,18 @@ class ClusterStatuses(Enum):
running="async",
)

VALKEY_UNHEALTHY_RESTART = StatusObject(
status="maintenance",
message="Valkey unhealthy after restart",
running="async",
)

SENTINEL_UNHEALTHY_RESTART = StatusObject(
status="maintenance",
message="Sentinel unhealthy after restart",
running="async",
)


class StartStatuses(Enum):
"""Collection of possible statuses related to the service start."""
Expand Down
Loading
Loading