
Commit 788b5bf

Dev: sbd: Improve the process of leveraging maintenance mode
- Drop the function `restart_cluster_if_possible`
- Introduce a new function `utils.able_to_restart_cluster` to check whether the cluster can be restarted, and call it before changing any configuration
- Leverage maintenance mode in the `sbd device remove` and `sbd purge` commands
1 parent 99d958b commit 788b5bf

File tree: 4 files changed, +59 -32 lines changed

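All three call sites converge on the same pattern: check whether the cluster can be restarted before changing any configuration, optionally under maintenance mode. Below is a minimal sketch of that pattern assembled from the diffs that follow; the wrapper function `apply_sbd_change` and its `update_dict` argument are illustrative only and not part of this commit.

# Sketch of the post-commit call pattern, simplified from init_and_deploy_sbd,
# _device_remove and do_purge; assumes a crmsh environment with a configured cluster.
from crmsh import bootstrap, sbd, utils

def apply_sbd_change(update_dict: dict) -> bool:
    # `enabled` reports whether maintenance mode was actually entered
    # (typically only when the command runs with -F/--force).
    with utils.leverage_maintenance_mode() as enabled:
        # Bail out before touching any configuration if a restart is not safe,
        # e.g. non-stonith resources (or DLM) are still running.
        if not utils.able_to_restart_cluster(enabled):
            return False
        sbd.SBDManager.update_sbd_configuration(update_dict)
        bootstrap.restart_cluster()
        return True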

crmsh/sbd.py

Lines changed: 5 additions & 18 deletions
@@ -593,22 +593,6 @@ def enable_sbd_service(self):
                 logger.info("Enable %s on node %s", constants.SBD_SERVICE, node)
                 service_manager.enable_service(constants.SBD_SERVICE, node)
 
-    @staticmethod
-    def restart_cluster_if_possible(with_maintenance_mode=False):
-        if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
-            return
-        if not xmlutil.CrmMonXmlParser().is_non_stonith_resource_running():
-            bootstrap.restart_cluster()
-        elif with_maintenance_mode:
-            if not utils.is_dlm_running():
-                bootstrap.restart_cluster()
-            else:
-                logger.warning("Resource is running, need to restart cluster service manually on each node")
-        else:
-            logger.warning("Resource is running, need to restart cluster service manually on each node")
-            logger.warning("Or, run with `crm -F` or `--force` option, the `sbd` subcommand will leverage maintenance mode for any changes that require restarting sbd.service")
-            logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
-
     def configure_sbd(self):
         '''
         Configure fence_sbd resource and related properties
@@ -746,6 +730,9 @@ def init_and_deploy_sbd(self, restart_first=False):
         self._load_attributes_from_bootstrap()
 
         with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+
             self.initialize_sbd()
             self.update_configuration()
             self.enable_sbd_service()
@@ -760,7 +747,7 @@ def init_and_deploy_sbd(self, restart_first=False):
             restart_cluster_first = restart_first or \
                 (self.diskless_sbd and not ServiceManager().service_is_active(constants.SBD_SERVICE))
             if restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
             self.configure_sbd()
             bootstrap.adjust_properties(with_sbd=True)
@@ -770,7 +757,7 @@ def init_and_deploy_sbd(self, restart_first=False):
             # This helps prevent unexpected issues, such as nodes being fenced
             # due to large SBD_WATCHDOG_TIMEOUT values combined with smaller timeouts.
             if not restart_cluster_first:
-                SBDManager.restart_cluster_if_possible(with_maintenance_mode=enabled)
+                bootstrap.restart_cluster()
 
     def join_sbd(self, remote_user, peer_host):
         '''

crmsh/ui_sbd.py

Lines changed: 20 additions & 14 deletions
@@ -517,8 +517,11 @@ def _device_remove(self, devices_to_remove: typing.List[str]):
 
         logger.info("Remove devices: %s", ';'.join(devices_to_remove))
         update_dict = {"SBD_DEVICE": ";".join(left_device_list)}
-        sbd.SBDManager.update_sbd_configuration(update_dict)
-        sbd.SBDManager.restart_cluster_if_possible()
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
+                return
+            sbd.SBDManager.update_sbd_configuration(update_dict)
+            bootstrap.restart_cluster()
 
     @command.completers_repeating(sbd_device_completer)
     def do_device(self, context, *args) -> bool:
@@ -603,20 +606,23 @@ def do_purge(self, context, *args) -> bool:
 
         utils.check_all_nodes_reachable("purging SBD")
 
-        if args and args[0] == "crashdump":
-            if not self._is_crashdump_configured():
-                logger.error("SBD crashdump is not configured")
+        with utils.leverage_maintenance_mode() as enabled:
+            if not utils.able_to_restart_cluster(enabled):
                 return False
-            self._set_crashdump_option(delete=True)
-            update_dict = self._set_crashdump_in_sysconfig(restore=True)
-            if update_dict:
-                sbd.SBDManager.update_sbd_configuration(update_dict)
-            sbd.SBDManager.restart_cluster_if_possible()
-            return True
 
-        sbd.purge_sbd_from_cluster()
-        sbd.SBDManager.restart_cluster_if_possible()
-        return True
+            if args and args[0] == "crashdump":
+                if not self._is_crashdump_configured():
+                    logger.error("SBD crashdump is not configured")
+                    return False
+                self._set_crashdump_option(delete=True)
+                update_dict = self._set_crashdump_in_sysconfig(restore=True)
+                if update_dict:
+                    sbd.SBDManager.update_sbd_configuration(update_dict)
+            else:
+                sbd.purge_sbd_from_cluster()
+
+            bootstrap.restart_cluster()
+            return True
 
     def _print_sbd_type(self):
         if not self.service_manager.service_is_active(constants.SBD_SERVICE):

crmsh/utils.py

Lines changed: 27 additions & 0 deletions
@@ -3306,4 +3306,31 @@ def validate_and_get_reachable_nodes(
             member_list.remove(node)
 
     return member_list + remote_list
+
+
+def able_to_restart_cluster(in_maintenance_mode: bool = False) -> bool:
+    """
+    Check whether it is able to restart cluster now
+    1. If pacemaker is not running, return True
+    2. If no non-stonith resource is running, return True
+    3. If in maintenance mode and DLM is not running, return True
+    4. Otherwise, return False with warning messages to guide user
+    """
+    if not ServiceManager().service_is_active(constants.PCMK_SERVICE):
+        return True
+    crm_mon_parser = xmlutil.CrmMonXmlParser()
+    if not crm_mon_parser.is_non_stonith_resource_running():
+        return True
+    elif in_maintenance_mode:
+        if is_dlm_running():
+            dlm_related_ids = crm_mon_parser.get_resource_top_parent_id_set_via_type(constants.DLM_CONTROLD_RA)
+            logger.warning("Please stop DLM related resources (%s) and try again", ', '.join(dlm_related_ids))
+            return False
+        else:
+            return True
+    else:
+        logger.warning("Please stop all running resources and try again")
+        logger.warning("Or run this command with -F/--force option to leverage maintenance mode")
+        logger.warning("Understand risks that running RA has no cluster protection while the cluster is in maintenance mode and restarting")
+        return False
 # vim:ts=4:sw=4:et:

crmsh/xmlutil.py

Lines changed: 7 additions & 0 deletions
@@ -1627,6 +1627,13 @@ def is_resource_started(self, ra):
         xpath = f'//resource[(@id="{ra}" or @resource_agent="{ra}") and @active="true" and @role="Started"]'
         return bool(self.xml_elem.xpath(xpath))
 
+    def get_resource_top_parent_id_set_via_type(self, ra_type):
+        """
+        Given configured ra type, get the topmost parent ra id set
+        """
+        xpath = f'//resource[@resource_agent="{ra_type}"]'
+        return set([get_topmost_rsc(elem).get('id') for elem in self.xml_elem.xpath(xpath)])
+
     def get_resource_id_list_via_type(self, ra_type):
         """
         Given configured ra type, get the ra id list