
Commit 788b5bf

Dev: sbd: Improve the process of leveraging maintenance mode
- Drop the function `restart_cluster_if_possible`
- Introduce a new function `utils.able_to_restart_cluster` to check whether the cluster can be restarted, and call it before changing any configuration
- Leverage maintenance mode in the `sbd device remove` and `sbd purge` commands
1 parent 99d958b commit 788b5bf

File tree: 4 files changed, +59 -32 lines changed

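All three call sites converge on the same pattern: check whether the cluster can be restarted before changing any configuration, optionally under maintenance mode. Below is a minimal sketch of that pattern assembled from the diffs that follow; the wrapper function `apply_sbd_change` and its `update_dict` argument are illustrative only and not part of this commit.

# Sketch of the post-commit call pattern, simplified from init_and_deploy_sbd,
# _device_remove and do_purge; assumes a crmsh environment with a configured cluster.
from crmsh import bootstrap, sbd, utils

def apply_sbd_change(update_dict: dict) -> bool:
    # `enabled` reports whether maintenance mode was actually entered
    # (typically only when the command runs with -F/--force).
    with utils.leverage_maintenance_mode() as enabled:
        # Bail out before touching any configuration if a restart is not safe,
        # e.g. non-stonith resources (or DLM) are still running.
        if not utils.able_to_restart_cluster(enabled):
            return False
        sbd.SBDManager.update_sbd_configuration(update_dict)
        bootstrap.restart_cluster()
        return True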

crmsh/sbd.py

Lines changed: 5 additions & 18 deletions
@@ -593,22 +593,6 @@ def enable_sbd_service(self):
                 logger.info("Enable %s on node %s", constants.SBD_SERVICE, node)
                 service_manager.enable_service(constants.SBD_SERVICE, node)
 
-    @staticmethod
-    def restart_cluster_if_possible(with_maintenance_mode=False):
-        if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
-            return
-        if not xmlutil.CrmMonXmlParser().is_non_stonith_resource_running():
-            bootstrap.restart_cluster()
-        elif with_maintenance_mode:
-            if not utils.is_dlm_running():
-                bootstrap.restart_cluster()
-            else:
-                logger.warning("Resource is running, need to restart cluster service manually on each node")
-        else:
-            logger.warning("Resource is running, need to restart cluster service manually on each node")
-            logger.warning("Or, run with `crm -F` or `--force` option, the `sbd` subcommand will leverage maintenance mode for any changes that require restarting sbd.service")
-            logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
-
     def configure_sbd(self):
         '''
         Configure fence_sbd resource and related properties
@@ -746,6 +730,9 @@ def init_and_deploy_sbd(self, restart_first=False):
         self._load_attributes_from_bootstrap()
 
         with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+
             self.initialize_sbd()
             self.update_configuration()
             self.enable_sbd_service()
@@ -760,7 +747,7 @@ def init_and_deploy_sbd(self, restart_first=False):
             restart_cluster_first = restart_first or \
                 (self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE))
             if restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
             self.configure_sbd()
             bootstrap.adjust_properties(with_sbd=True)
@@ -770,7 +757,7 @@ def init_and_deploy_sbd(self, restart_first=False):
             # This helps prevent unexpected issues, such as nodes being fenced
             # due to large SBD_WATCHDOG_TIMEOUT values combined with smaller timeouts.
             if not restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
     def join_sbd(self, remote_user, peer_host):
         '''

crmsh/ui_sbd.py

Lines changed: 20 additions & 14 deletions
@@ -517,8 +517,11 @@ def _device_remove(self, devices_to_remove: typing.List[str]):
 
         logger.info("Remove devices: %s", ';'.join(devices_to_remove))
         update_dict = {"SBD_DEVICE": ";".join(left_device_list)}
-        sbd.SBDManager.update_sbd_configuration(update_dict)
-        sbd.SBDManager.restart_cluster_if_possible()
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+            sbd.SBDManager.update_sbd_configuration(update_dict)
+            bootstrap.restart_cluster()
 
     @command.completers_repeating(sbd_device_completer)
     def do_device(self, context, *args) -> bool:
@@ -603,20 +606,23 @@ def do_purge(self, context, *args) -> bool:
 
         utils.check_all_nodes_reachable("purging SBD")
 
-        if args and args[0] == "crashdump":
-            if not self._is_crashdump_configured():
-                logger.error("SBD crashdump is not configured")
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
                 return False
-            self._set_crashdump_option(delete=True)
-            update_dict = self._set_crashdump_in_sysconfig(restore=True)
-            if update_dict:
-                sbd.SBDManager.update_sbd_configuration(update_dict)
-            sbd.SBDManager.restart_cluster_if_possible()
-            return True
 
-        sbd.purge_sbd_from_cluster()
-        sbd.SBDManager.restart_cluster_if_possible()
-        return True
+            if args and args[0] == "crashdump":
+                if not self._is_crashdump_configured():
+                    logger.error("SBD crashdump is not configured")
+                    return False
+                self._set_crashdump_option(delete=True)
+                update_dict = self._set_crashdump_in_sysconfig(restore=True)
+                if update_dict:
+                    sbd.SBDManager.update_sbd_configuration(update_dict)
+            else:
+                sbd.purge_sbd_from_cluster()
+
+            bootstrap.restart_cluster()
+            return True
 
     def _print_sbd_type(self):
         if not self.service_manager.service_is_active(constants.SBD_SERVICE):

crmsh/utils.py

Lines changed: 27 additions & 0 deletions
@@ -3306,4 +3306,31 @@ def validate_and_get_reachable_nodes(
             member_list.remove(node)
 
     return member_list + remote_list
+
+
+def able_to_restart_cluster(in_maintenance_mode: bool = False) -> bool:
+    """
+    Check whether it is able to restart cluster now
+    1. If pacemaker is not running, return True
+    2. If no non-stonith resource is running, return True
+    3. If in maintenance mode and DLM is not running, return True
+    4. Otherwise, return False with warning messages to guide user
+    """
+    if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
+        return True
+    crm_mon_parser = xmlutil.CrmMonXmlParser()
+    if not crm_mon_parser.is_non_stonith_resource_running():
+        return True
+    elif in_maintenance_mode:
+        if is_dlm_running():
+            dlm_related_ids = crm_mon_parser.get_resource_top_parent_id_set_via_type(constants.DLM_CONTROLD_RA)
+            logger.warning("Please stop DLM related resources (%s) and try again", ', '.join(dlm_related_ids))
+            return False
+        else:
+            return True
+    else:
+        logger.warning("Please stop all running resources and try again")
+        logger.warning("Or run this command with -F/--force option to leverage maintenance mode")
+        logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
+        return False
 # vim:ts=4:sw=4:et:

crmsh/xmlutil.py

Lines changed: 7 additions & 0 deletions
@@ -1627,6 +1627,13 @@ def is_resource_started(self, ra):
         xpath = f'//resource[(@id="{ra}" or @resource_agent="{ra}") and @active="true" and @role="Started"]'
         return bool(self.xml_elem.xpath(xpath))
 
+    def get_resource_top_parent_id_set_via_type(self, ra_type):
+        """
+        Given configured ra type, get the topmost parent ra id set
+        """
+        xpath = f'//resource[@resource_agent="{ra_type}"]'
+        return set([get_topmost_rsc(elem).get('id') for elem in self.xml_elem.xpath(xpath)])
+
     def get_resource_id_list_via_type(self, ra_type):
         """
         Given configured ra type, get the ra id list