Skip to content

Commit 54751d3

Browse files
authored
Merge pull request #1274 from liangxin1300/20230803_stop_cluster_all_crmsh46
[crmsh-4.6] Fix: ui_cluster: Improve the process of 'crm cluster stop' (bsc#1213889)
2 parents 4b74412 + db02a38 commit 54751d3

File tree

5 files changed

+217
-101
lines changed

5 files changed

+217
-101
lines changed

crmsh/ui_cluster.py

+60-25
Original file line numberDiff line numberDiff line change
@@ -193,42 +193,77 @@ def do_start(self, context, *args):
193193
for node in node_list:
194194
logger.info("The cluster stack started on {}".format(node))
195195

196-
@command.skill_level('administrator')
197-
def do_stop(self, context, *args):
198-
'''
199-
Stops the cluster stack on all nodes or specific node(s)
200-
'''
196+
@staticmethod
197+
def _node_ready_to_stop_cluster_service(node):
198+
"""
199+
Check if the specific node is ready to stop cluster service
200+
201+
If both corosync.service and pacemaker.service is active, return True
202+
If some services started, stop them first and return False
203+
"""
201204
service_manager = ServiceManager()
202-
node_list = parse_option_for_nodes(context, *args)
203-
for node in node_list[:]:
204-
if not service_manager.service_is_active("corosync.service", remote_addr=node):
205-
if service_manager.service_is_active("sbd.service", remote_addr=node):
206-
service_manager.stop_service("corosync", remote_addr=node)
207-
logger.info("The cluster stack stopped on {}".format(node))
208-
else:
209-
logger.info("The cluster stack already stopped on {}".format(node))
210-
node_list.remove(node)
211-
elif not service_manager.service_is_active("pacemaker.service", remote_addr=node):
205+
206+
corosync_active = service_manager.service_is_active("corosync.service", remote_addr=node)
207+
sbd_active = service_manager.service_is_active("sbd.service", remote_addr=node)
208+
pacemaker_active = service_manager.service_is_active("pacemaker.service", remote_addr=node)
209+
210+
if not corosync_active:
211+
if sbd_active:
212212
service_manager.stop_service("corosync", remote_addr=node)
213-
logger.info("The cluster stack stopped on {}".format(node))
214-
node_list.remove(node)
215-
if not node_list:
213+
logger.info(f"The cluster stack stopped on {node}")
214+
else:
215+
logger.info(f"The cluster stack already stopped on {node}")
216+
return False
217+
218+
elif not pacemaker_active:
219+
service_manager.stop_service("corosync", remote_addr=node)
220+
logger.info("The cluster stack stopped on {}".format(node))
221+
return False
222+
223+
return True
224+
225+
@staticmethod
226+
def _wait_for_dc(node=None):
227+
"""
228+
Wait for the cluster's DC to become available
229+
"""
230+
if not ServiceManager().service_is_active("pacemaker.service", remote_addr=node):
216231
return
217232

218-
dc_deadtime = utils.get_property("dc-deadtime") or str(constants.DC_DEADTIME_DEFAULT)
233+
dc_deadtime = utils.get_property("dc-deadtime", peer=node) or str(constants.DC_DEADTIME_DEFAULT)
219234
dc_timeout = int(dc_deadtime.strip('s')) + 5
220235
try:
221-
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout)
236+
utils.check_function_with_timeout(utils.get_dc, wait_timeout=dc_timeout, peer=node)
222237
except TimeoutError:
223238
logger.error("No DC found currently, please wait if the cluster is still starting")
224-
return False
239+
raise utils.TerminateSubCommand
225240

226-
# When dlm running and quorum is lost, before stop cluster service, should set
227-
# enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option
228-
if utils.is_dlm_running() and not utils.is_quorate():
241+
@staticmethod
242+
def _set_dlm(node=None):
243+
"""
244+
When dlm running and quorum is lost, before stop cluster service, should set
245+
enable_quorum_fencing=0, enable_quorum_lockspace=0 for dlm config option
246+
"""
247+
if utils.is_dlm_running(node) and not utils.is_quorate(node):
229248
logger.debug("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
230-
utils.set_dlm_option(enable_quorum_fencing=0, enable_quorum_lockspace=0)
249+
utils.set_dlm_option(peer=node, enable_quorum_fencing=0, enable_quorum_lockspace=0)
250+
251+
@command.skill_level('administrator')
252+
def do_stop(self, context, *args):
253+
'''
254+
Stops the cluster stack on all nodes or specific node(s)
255+
'''
256+
node_list = parse_option_for_nodes(context, *args)
257+
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
258+
if not node_list:
259+
return
260+
logger.debug(f"stop node list: {node_list}")
261+
262+
self._wait_for_dc(node_list[0])
231263

264+
self._set_dlm(node_list[0])
265+
266+
service_manager = ServiceManager()
232267
# Stop pacemaker since it can make sure cluster has quorum until stop corosync
233268
node_list = service_manager.stop_service("pacemaker", node_list=node_list)
234269
# Then, stop qdevice if is active

crmsh/utils.py

+38-23
Original file line numberDiff line numberDiff line change
@@ -957,14 +957,14 @@ def append_file(dest, src):
957957
return False
958958

959959

960-
def get_dc():
960+
def get_dc(peer=None):
961961
cmd = "crmadmin -D -t 1"
962-
rc, s, _ = ShellUtils().get_stdout_stderr(add_sudo(cmd))
963-
if rc != 0:
962+
_, out, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
963+
if not out:
964964
return None
965-
if not s.startswith("Designated"):
965+
if not out.startswith("Designated"):
966966
return None
967-
return s.split()[-1]
967+
return out.split()[-1]
968968

969969

970970
def wait4dc(what="", show_progress=True):
@@ -2741,48 +2741,63 @@ def is_standby(node):
27412741
return re.search(r'Node\s+{}:\s+standby'.format(node), out) is not None
27422742

27432743

2744-
def get_dlm_option_dict():
2744+
def get_dlm_option_dict(peer=None):
27452745
"""
27462746
Get dlm config option dictionary
27472747
"""
2748-
out = sh.cluster_shell().get_stdout_or_raise_error("dlm_tool dump_config")
2748+
out = sh.cluster_shell().get_stdout_or_raise_error("dlm_tool dump_config", peer)
27492749
return dict(re.findall("(\w+)=(\w+)", out))
27502750

27512751

2752-
def set_dlm_option(**kargs):
2752+
def set_dlm_option(peer=None, **kargs):
27532753
"""
27542754
Set dlm option
27552755
"""
27562756
shell = sh.cluster_shell()
2757-
dlm_option_dict = get_dlm_option_dict()
2757+
dlm_option_dict = get_dlm_option_dict(peer=peer)
27582758
for option, value in kargs.items():
27592759
if option not in dlm_option_dict:
2760-
raise ValueError('"{}" is not dlm config option'.format(option))
2760+
raise ValueError(f'"{option}" is not dlm config option')
27612761
if dlm_option_dict[option] != value:
2762-
shell.get_stdout_or_raise_error('dlm_tool set_config "{}={}"'.format(option, value))
2762+
shell.get_stdout_or_raise_error(f'dlm_tool set_config "{option}={value}"', peer)
27632763

27642764

2765-
def is_dlm_running():
2765+
def is_dlm_running(peer=None):
27662766
"""
27672767
Check if dlm ra controld is running
27682768
"""
2769-
from . import xmlutil
2770-
return xmlutil.CrmMonXmlParser().is_resource_started(constants.DLM_CONTROLD_RA)
2769+
return is_resource_running(constants.DLM_CONTROLD_RA, peer=peer)
2770+
2771+
2772+
def has_resource_configured(ra_type, peer=None):
    """
    Check if the RA is configured

    Runs "crm_mon -1rR" through the cluster shell and reports whether
    ra_type occurs anywhere in its output.

    ra_type: resource agent type to look for; matched literally
    peer: forwarded as the second argument of get_stdout_or_raise_error
    Return True when ra_type occurs in the output, otherwise False
    """
    # NOTE(review): the helper's name suggests it raises when the command
    # fails; confirm which exception callers should expect.
    out = sh.cluster_shell().get_stdout_or_raise_error("crm_mon -1rR", peer)
    # ra_type is a name, not a pattern: a substring test avoids regex
    # metacharacters in the agent name (e.g. ".") matching unrelated text.
    return ra_type in out
27712778

27722779

2773-
def is_dlm_configured():
2780+
def is_resource_running(ra_type, peer=None):
    """
    Check if the RA is running

    Runs "crm_mon -1rR" through the cluster shell and looks for an entry of
    the form "(<ra_type>): Started" in its output.

    ra_type: resource agent type to look for; matched literally
    peer: forwarded as the second argument of get_stdout_or_raise_error
    Return True when such an entry is found, otherwise False
    """
    # NOTE(review): the helper's name suggests it raises when the command
    # fails; confirm which exception callers should expect.
    out = sh.cluster_shell().get_stdout_or_raise_error("crm_mon -1rR", peer)
    # Raw string so that \( \) and \s reach the regex engine instead of being
    # invalid string escapes; re.escape keeps characters in ra_type from
    # acting as regex metacharacters.
    patt = rf"\({re.escape(ra_type)}\):\s*Started"
    return re.search(patt, out) is not None
2787+
2788+
2789+
def is_dlm_configured(peer=None):
27742790
"""
27752791
Check if dlm configured
27762792
"""
2777-
from . import xmlutil
2778-
return xmlutil.CrmMonXmlParser().is_resource_configured(constants.DLM_CONTROLD_RA)
2793+
return has_resource_configured(constants.DLM_CONTROLD_RA, peer=peer)
27792794

27802795

2781-
def is_quorate():
2796+
def is_quorate(peer=None):
27822797
"""
27832798
Check if cluster is quorated
27842799
"""
2785-
out = sh.cluster_shell().get_stdout_or_raise_error("corosync-quorumtool -s", success_exit_status={0, 2})
2800+
out = sh.cluster_shell().get_stdout_or_raise_error("corosync-quorumtool -s", peer, success_exit_status={0, 2})
27862801
res = re.search(r'Quorate:\s+(.*)', out)
27872802
if res:
27882803
return res.group(1) == "Yes"
@@ -2808,7 +2823,7 @@ def get_pcmk_delay_max(two_node_without_qdevice=False):
28082823
return 0
28092824

28102825

2811-
def get_property(name, property_type="crm_config"):
2826+
def get_property(name, property_type="crm_config", peer=None):
28122827
"""
28132828
Get cluster properties
28142829
@@ -2819,7 +2834,7 @@ def get_property(name, property_type="crm_config"):
28192834
cmd = "CIB_file={} sudo --preserve-env=CIB_file crm configure get_property {}".format(cib_path, name)
28202835
else:
28212836
cmd = "sudo crm_attribute -t {} -n {} -Gq".format(property_type, name)
2822-
rc, stdout, _ = ShellUtils().get_stdout_stderr(cmd)
2837+
rc, stdout, _ = sh.cluster_shell().get_rc_stdout_stderr_without_input(peer, cmd)
28232838
return stdout if rc == 0 else None
28242839

28252840

@@ -2952,7 +2967,7 @@ def detect_file(_file, remote=None):
29522967
return rc
29532968

29542969

2955-
def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
2970+
def check_function_with_timeout(check_function, wait_timeout=30, interval=1, *args, **kwargs):
29562971
"""
29572972
Run check_function in a loop
29582973
Return when check_function is true
@@ -2961,7 +2976,7 @@ def check_function_with_timeout(check_function, wait_timeout=30, interval=1):
29612976
current_time = int(time.time())
29622977
timeout = current_time + wait_timeout
29632978
while current_time <= timeout:
2964-
if check_function():
2979+
if check_function(*args, **kwargs):
29652980
return
29662981
time.sleep(interval)
29672982
current_time = int(time.time())

test/features/bootstrap_bugs.feature

+14
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,20 @@ Feature: Regression test for bootstrap bugs
132132
When Run "crm cluster stop" on "hanode1"
133133
Then Service "corosync" is "stopped" on "hanode1"
134134

135+
@clean
136+
Scenario: Can't stop all nodes' cluster service when local node's service is down(bsc#1213889)
137+
Given Cluster service is "stopped" on "hanode1"
138+
And Cluster service is "stopped" on "hanode2"
139+
When Run "crm cluster init -y" on "hanode1"
140+
Then Cluster service is "started" on "hanode1"
141+
When Run "crm cluster join -c hanode1 -y" on "hanode2"
142+
Then Cluster service is "started" on "hanode2"
143+
When Wait for DC
144+
And Run "crm cluster stop" on "hanode1"
145+
And Run "crm cluster stop --all" on "hanode1"
146+
Then Cluster service is "stopped" on "hanode1"
147+
And Cluster service is "stopped" on "hanode2"
148+
135149
@skip_non_root
136150
@clean
137151
Scenario: crm cluster join default behavior change in ssh key handling (bsc#1210693)

test/unittests/test_ui_cluster.py

+77-35
Original file line numberDiff line numberDiff line change
@@ -80,52 +80,94 @@ def test_do_start(self, mock_parse_nodes, mock_active, mock_start, mock_qdevice_
8080
mock_qdevice_configured.assert_called_once_with()
8181
mock_info.assert_called_once_with("The cluster stack started on node1")
8282

83-
@mock.patch('logging.Logger.info')
84-
@mock.patch('crmsh.service_manager.ServiceManager.service_is_active')
83+
@mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc')
84+
@mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service')
8585
@mock.patch('crmsh.ui_cluster.parse_option_for_nodes')
86-
def test_do_stop_already_stopped(self, mock_parse_nodes, mock_active, mock_info):
86+
def test_do_stop_return(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc):
87+
mock_parse_nodes.return_value = ["node1", "node2"]
88+
mock_node_ready_to_stop_cluster_service.side_effect = [False, False]
89+
8790
context_inst = mock.Mock()
88-
mock_parse_nodes.return_value = ["node1"]
89-
mock_active.side_effect = [False, False]
90-
self.ui_cluster_inst.do_stop(context_inst, "node1")
91-
mock_active.assert_has_calls([
92-
mock.call("corosync.service", remote_addr="node1"),
93-
mock.call("sbd.service", remote_addr="node1")
94-
])
95-
mock_info.assert_called_once_with("The cluster stack already stopped on node1")
91+
self.ui_cluster_inst.do_stop(context_inst, "node1", "node2")
92+
93+
mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2")
94+
mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")])
95+
mock_dc.assert_not_called()
9696

9797
@mock.patch('logging.Logger.debug')
9898
@mock.patch('logging.Logger.info')
99-
@mock.patch('crmsh.service_manager.ServiceManager.stop_service')
100-
@mock.patch('crmsh.utils.set_dlm_option')
101-
@mock.patch('crmsh.utils.is_quorate')
102-
@mock.patch('crmsh.utils.is_dlm_running')
103-
@mock.patch('crmsh.utils.get_dc')
104-
@mock.patch('crmsh.utils.check_function_with_timeout')
105-
@mock.patch('crmsh.utils.get_property')
106-
@mock.patch('crmsh.service_manager.ServiceManager.service_is_active')
99+
@mock.patch('crmsh.ui_cluster.ServiceManager')
100+
@mock.patch('crmsh.ui_cluster.Cluster._set_dlm')
101+
@mock.patch('crmsh.ui_cluster.Cluster._wait_for_dc')
102+
@mock.patch('crmsh.ui_cluster.Cluster._node_ready_to_stop_cluster_service')
107103
@mock.patch('crmsh.ui_cluster.parse_option_for_nodes')
108-
def test_do_stop(self, mock_parse_nodes, mock_active, mock_get_property, mock_check, mock_get_dc, mock_dlm_running, mock_is_quorate, mock_set_dlm, mock_stop, mock_info, mock_debug):
104+
def test_do_stop(self, mock_parse_nodes, mock_node_ready_to_stop_cluster_service, mock_dc,
105+
mock_set_dlm, mock_service_manager, mock_info, mock_debug):
106+
mock_parse_nodes.return_value = ["node1", "node2"]
107+
mock_node_ready_to_stop_cluster_service.side_effect = [True, False]
108+
mock_service_manager_inst = mock.Mock()
109+
mock_service_manager.return_value = mock_service_manager_inst
110+
mock_service_manager_inst.stop_service.side_effect = [["node1"], ["node1"], ["node1"]]
111+
mock_service_manager_inst.service_is_active.return_value = True
112+
109113
context_inst = mock.Mock()
110-
mock_stop.side_effect = [["node1"], ["ndoe1"], ["node1"]]
111-
mock_parse_nodes.return_value = ["node1"]
112-
mock_active.side_effect = [True, True, True]
113-
mock_dlm_running.return_value = True
114-
mock_is_quorate.return_value = False
115-
mock_get_property.return_value = "20s"
114+
self.ui_cluster_inst.do_stop(context_inst, "node1", "node2")
116115

117-
self.ui_cluster_inst.do_stop(context_inst, "node1")
116+
mock_parse_nodes.assert_called_once_with(context_inst, "node1", "node2")
117+
mock_node_ready_to_stop_cluster_service.assert_has_calls([mock.call("node1"), mock.call("node2")])
118+
mock_debug.assert_called_once_with("stop node list: ['node1']")
119+
mock_dc.assert_called_once_with("node1")
120+
mock_set_dlm.assert_called_once_with("node1")
121+
mock_service_manager_inst.stop_service.assert_has_calls([
122+
mock.call("pacemaker", node_list=["node1"]),
123+
mock.call("corosync-qdevice.service", node_list=["node1"]),
124+
mock.call("corosync", node_list=["node1"]),
125+
])
126+
mock_info.assert_called_once_with("The cluster stack stopped on node1")
118127

119-
mock_active.assert_has_calls([
128+
@mock.patch('logging.Logger.info')
129+
@mock.patch('crmsh.ui_cluster.ServiceManager')
130+
def test_node_ready_to_stop_cluster_service_corosync(self, mock_service_manager, mock_info):
131+
mock_service_manager_inst = mock.Mock()
132+
mock_service_manager.return_value = mock_service_manager_inst
133+
mock_service_manager_inst.service_is_active.side_effect = [False, True, False]
134+
res = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")
135+
assert res is False
136+
mock_service_manager_inst.service_is_active.assert_has_calls([
120137
mock.call("corosync.service", remote_addr="node1"),
138+
mock.call("sbd.service", remote_addr="node1"),
121139
mock.call("pacemaker.service", remote_addr="node1"),
122-
mock.call("corosync-qdevice.service")
123140
])
124-
mock_stop.assert_has_calls([
125-
mock.call("pacemaker", node_list=["node1"]),
126-
mock.call("corosync-qdevice.service", node_list=["node1"]),
127-
mock.call("corosync", node_list=["node1"])
141+
mock_service_manager_inst.stop_service.assert_called_once_with("corosync", remote_addr="node1")
142+
mock_info.assert_called_once_with("The cluster stack stopped on node1")
143+
144+
@mock.patch('logging.Logger.info')
@mock.patch('crmsh.ui_cluster.ServiceManager')
def test_node_ready_to_stop_cluster_service_pacemaker(self, mock_service_manager, mock_info):
    """
    service_is_active answers True, True, False for the corosync, sbd and
    pacemaker probes: expect False, one corosync stop and one info message
    """
    manager = mock.Mock()
    manager.service_is_active.side_effect = [True, True, False]
    mock_service_manager.return_value = manager

    ready = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")

    assert ready is False
    # Probes are expected in this order, all against the same remote node
    probed_units = ["corosync.service", "sbd.service", "pacemaker.service"]
    manager.service_is_active.assert_has_calls(
        [mock.call(unit, remote_addr="node1") for unit in probed_units]
    )
    manager.stop_service.assert_called_once_with("corosync", remote_addr="node1")
    mock_info.assert_called_once_with("The cluster stack stopped on node1")
130-
mock_debug.assert_called_once_with("Quorum is lost; Set enable_quorum_fencing=0 and enable_quorum_lockspace=0 for dlm")
131-
mock_check.assert_called_once_with(mock_get_dc, wait_timeout=25)
159+
160+
@mock.patch('logging.Logger.info')
@mock.patch('crmsh.ui_cluster.ServiceManager')
def test_node_ready_to_stop_cluster_service(self, mock_service_manager, mock_info):
    """
    service_is_active answers True for the corosync, sbd and pacemaker
    probes: expect True and no info message
    """
    manager = mock.Mock()
    manager.service_is_active.side_effect = [True, True, True]
    mock_service_manager.return_value = manager

    ready = self.ui_cluster_inst._node_ready_to_stop_cluster_service("node1")

    assert ready is True
    # Probes are expected in this order, all against the same remote node
    probed_units = ["corosync.service", "sbd.service", "pacemaker.service"]
    manager.service_is_active.assert_has_calls(
        [mock.call(unit, remote_addr="node1") for unit in probed_units]
    )
    mock_info.assert_not_called()

0 commit comments

Comments
 (0)