4 changes: 2 additions & 2 deletions crmsh/bootstrap.py
@@ -275,7 +275,7 @@ def _validate_nodes_option(self):
             utils.fatal(f"Overriding current user '{self.current_user}' by '{user}'. Ouch, don't do it.")
         self.user_at_node_list = [value for (user, node), value in zip(li, self.user_at_node_list) if node != me]
         for user, node in (utils.parse_user_at_host(x) for x in self.user_at_node_list):
-            utils.node_reachable_check(node)
+            utils.ssh_reachable_check(node)

     def _validate_cluster_node(self):
         """
@@ -2449,7 +2449,7 @@ def bootstrap_join(context):
     _context.initialize_user()

     remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
-    utils.node_reachable_check(cluster_node)
+    utils.ssh_reachable_check(cluster_node)
     join_ssh(cluster_node, remote_user)
     remote_user = utils.user_of(cluster_node)

5 changes: 1 addition & 4 deletions crmsh/qdevice.py
@@ -208,14 +208,11 @@ def check_qnetd_addr(qnetd_addr):
         except socket.error:
             raise ValueError("host \"{}\" is unreachable".format(qnetd_addr))

-        utils.node_reachable_check(qnetd_addr)
+        utils.ssh_reachable_check(qnetd_addr)

         if utils.InterfacesInfo.ip_in_local(qnetd_ip):
             raise ValueError("host for qnetd must be a remote one")

-        if not utils.check_port_open(qnetd_ip, 22):
-            raise ValueError("ssh service on \"{}\" not available".format(qnetd_addr))
-
     @staticmethod
     def check_qdevice_port(qdevice_port):
         if not utils.valid_port(qdevice_port):

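Note that the explicit port-22 probe removed here is not lost: ssh_reachable_check (see the crmsh/utils.py hunk below) performs the same socket test through the existing check_port_open helper. For orientation, a minimal sketch of what such a helper typically looks like, assuming it is a plain TCP connect with a timeout (the real implementation lives in crmsh/utils.py and may differ):

import socket

def check_port_open(host: str, port: int, timeout: int = 3) -> bool:
    """Return True if a TCP connection to host:port succeeds within timeout."""
    try:
        # create_connection resolves the host and performs the TCP handshake.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False
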
32 changes: 17 additions & 15 deletions crmsh/ui_cluster.py
@@ -26,7 +26,7 @@
 from .prun import prun
 from .service_manager import ServiceManager
 from .sh import ShellUtils
-from .ui_node import parse_option_for_nodes
+from . import ui_utils
 from . import constants


@@ -167,23 +167,24 @@ def do_start(self, context, *args):
         '''
         Starts the cluster stack on all nodes or specific node(s)
         '''
-        node_list = parse_option_for_nodes(context, *args)
+        try:
+            node_list = ui_utils.parse_and_validate_node_args("start", *args)
+        except utils.NoSSHError as msg:
+            logger.error('%s', msg)
+            logger.info("Please try 'crm cluster start' on each node")
+            return False

         service_check_list = ["pacemaker.service"]
         start_qdevice = False
         if corosync.is_qdevice_configured():
             start_qdevice = True
             service_check_list.append("corosync-qdevice.service")

         service_manager = ServiceManager()
-        try:
-            for node in node_list[:]:
-                if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
-                    logger.info("The cluster stack already started on {}".format(node))
-                    node_list.remove(node)
-        except utils.NoSSHError as msg:
-            logger.error('%s', msg)
-            logger.info("Please try 'crm cluster start' on each node")
-            return False
+        for node in node_list[:]:
+            if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
+                logger.info("The cluster stack already started on {}".format(node))
+                node_list.remove(node)
         if not node_list:
             return

@@ -248,13 +249,14 @@ def do_stop(self, context, *args):
         '''
         Stops the cluster stack on all nodes or specific node(s)
         '''
-        node_list = parse_option_for_nodes(context, *args)
         try:
-            node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
+            node_list = ui_utils.parse_and_validate_node_args("stop", *args)
         except utils.NoSSHError as msg:
             logger.error('%s', msg)
             logger.info("Please try 'crm cluster stop' on each node")
             return False

+        node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
         if not node_list:
             return
         logger.debug(f"stop node list: {node_list}")
@@ -297,7 +299,7 @@ def do_enable(self, context, *args):
         '''
         Enable the cluster services on this node
         '''
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("enable", *args)
         service_manager = ServiceManager()
         node_list = service_manager.enable_service("pacemaker.service", node_list=node_list)
         if service_manager.service_is_available("corosync-qdevice.service") and corosync.is_qdevice_configured():
@@ -310,7 +312,7 @@ def do_disable(self, context, *args):
         '''
         Disable the cluster services on this node
         '''
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("disable", *args)
         service_manager = ServiceManager()
         node_list = service_manager.disable_service("pacemaker.service", node_list=node_list)
         service_manager.disable_service("corosync-qdevice.service", node_list=node_list)

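Hoisting the parse/validate step above the service-status loop changes where NoSSHError can surface: it is raised and handled once, before any remote queries, so a mid-loop SSH failure can no longer leave node_list half-filtered. A condensed sketch of the shared pattern, with names taken from this diff and the module logger assumed to be in scope:

def _run_on_validated_nodes(command_name, *args):
    """Sketch: validate first, then operate; NoSSHError ends the command early."""
    try:
        node_list = ui_utils.parse_and_validate_node_args(command_name, *args)
    except utils.NoSSHError as msg:
        logger.error('%s', msg)
        logger.info("Please try 'crm cluster %s' on each node", command_name)
        return False
    # Only reachable nodes remain; per-node work happens after this point.
    for node in node_list:
        ...
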
46 changes: 2 additions & 44 deletions crmsh/ui_node.py
@@ -6,7 +6,6 @@
 import copy
 import subprocess
 from lxml import etree
-from argparse import ArgumentParser, RawDescriptionHelpFormatter

 from . import config
 from . import command
@@ -219,47 +218,6 @@ def print_node(uname, ident, node_type, other, inst_attr, offline):
     print(term.render("\t%s" % (s)))


-def parse_option_for_nodes(context, *args):
-    """
-    Parse option for nodes
-    Return a node list
-    """
-    action_type = context.get_command_name()
-    action_target = "node" if action_type in ["standby", "online"] else "cluster service"
-    action = "{} {}".format(action_type, action_target)
-    usage_template = """
-Specify node(s) on which to {action}.
-If no nodes are specified, {action} on the local node.
-If --all is specified, {action} on all nodes."""
-    addtion_usage = ""
-    if action_type == "standby":
-        usage_template += """
-\n\nAdditionally, you may specify a lifetime for the standby---if set to
-"reboot", the node will be back online once it reboots. "forever" will
-keep the node in standby after reboot. The life time defaults to
-"forever"."""
-        addtion_usage = " [lifetime]"
-
-    parser = ArgumentParser(description=usage_template.format(action=action),
-                            usage="{} [--all | <node>... ]{}".format(action_type, addtion_usage),
-                            add_help=False,
-                            formatter_class=RawDescriptionHelpFormatter)
-    parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message")
-    parser.add_argument("--all", help="To {} on all nodes".format(action), action="store_true", dest="all")
-
-    options, args = parser.parse_known_args(args)
-    if options.help:
-        parser.print_help()
-        raise utils.TerminateSubCommand(success=True)
-    if options is None or args is None:
-        raise utils.TerminateSubCommand
-    if options.all and args:
-        context.fatal_error("Should either use --all or specific node(s)")
-
-    include_remote = action_type in ["standby", "online"]
-    return utils.validate_and_get_reachable_nodes(args, options.all, include_remote)
-
-
 class NodeMgmt(command.UI):
     '''
     Nodes management class
@@ -348,7 +306,7 @@ def do_standby(self, context, *args):
         args = args[:-1]

         # Parse node option
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("standby", *args)
         if not node_list:
             return

@@ -436,7 +394,7 @@ def do_online(self, context, *args):
         To avoid race condition for --all option, melt all online values into one cib replace session
         """
         # Parse node option
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("online", *args)
         if not node_list:
             return

43 changes: 43 additions & 0 deletions crmsh/ui_utils.py
@@ -6,6 +6,7 @@
 import inspect
 from . import utils
 from . import log
+from argparse import ArgumentParser, RawDescriptionHelpFormatter


 logger = log.setup_logger(__name__)
@@ -162,3 +163,45 @@ def mknamed():
     if max_args >= 0 and len(args) > max_args:
         raise ValueError("Expected (%s), takes at most %d arguments (%d given)" %
                          (mknamed(), max_args-nskip, len(args)-nskip))
+
+
+def parse_and_validate_node_args(command_name, *args) -> list:
+    '''
+    Parses option for node-related commands
+    Then validates and returns the reachable node list
+    '''
+    action_target = "node" if command_name in ["standby", "online"] else "cluster service"
+    action = f"{command_name} {action_target}"
+    usage_template = """
+Specify node(s) on which to {action}.
+If no nodes are specified, {action} on the local node.
+If --all is specified, {action} on all nodes."""
+    addtion_usage = ""
+    if command_name == "standby":
+        usage_template += """
+\n\nAdditionally, you may specify a lifetime for the standby---if set to
+"reboot", the node will be back online once it reboots. "forever" will
+keep the node in standby after reboot. The life time defaults to
+"forever"."""
+        addtion_usage = " [lifetime]"
+
+    parser = ArgumentParser(
+        description=usage_template.format(action=action),
+        usage=f"{command_name} [--all | <node>... ]{addtion_usage}",
+        add_help=False,
+        formatter_class=RawDescriptionHelpFormatter
+    )
+    parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message")
+    parser.add_argument("--all", help=f"To {action} on all nodes", action="store_true", dest="all")
+
+    options, args = parser.parse_known_args(args)
+    if options.help:
+        parser.print_help()
+        raise utils.TerminateSubCommand(success=True)
+    if options is None or args is None:
+        raise utils.TerminateSubCommand
+    if options.all and args:
+        raise ValueError("Should either use --all or specific node(s)")
+
+    include_remote = command_name in ["standby", "online"]
+    return utils.validate_and_get_reachable_nodes(args, options.all, include_remote)
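
The moved function keeps the old parser behavior but takes the command name directly instead of reading it from a UI context; raising ValueError instead of calling context.fatal_error also makes it usable outside the UI layer. Hypothetical calls illustrating the argument handling (node names and cluster state are assumptions; the returned list ultimately comes from utils.validate_and_get_reachable_nodes):

# No args: act on the local node.
parse_and_validate_node_args("start")                      # e.g. ["hanode1"]
# --all: act on every reachable cluster node.
parse_and_validate_node_args("stop", "--all")              # e.g. ["hanode1", "hanode2"]
# Explicit nodes; "standby"/"online" also consider remote nodes.
parse_and_validate_node_args("standby", "hanode2")         # e.g. ["hanode2"] if its SSH port answers
# Mixing --all with explicit nodes is rejected.
parse_and_validate_node_args("start", "--all", "hanode1")  # raises ValueError
# -h/--help prints usage and raises TerminateSubCommand(success=True).
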
37 changes: 25 additions & 12 deletions crmsh/utils.py
@@ -2405,25 +2405,23 @@ def package_is_installed(pkg, remote_addr=None):
     return rc == 0


-def node_reachable_check(node, ping_count=1, port=22, timeout=3):
+def ssh_reachable_check(node):
     """
-    Check if node is reachable by using ping and socket to ssh port
+    Check if node is reachable by checking SSH port is open
     """
-    rc, _, _ = ShellUtils().get_stdout_stderr(f"ping -n -c {ping_count} -W {timeout} {node}")
-    if rc == 0:
-        return True
-    # ping failed, try to connect to ssh port by socket
-    if check_port_open(node, port, timeout):
+    if node == this_node() or check_port_open(node, 22):
         return True
-    # both ping and socket failed
-    raise ValueError(f"host \"{node}\" is unreachable")
+    if config.core.no_ssh:
+        raise NoSSHError(constants.NO_SSH_ERROR_MSG)
+    else:
+        raise ValueError(f"host \"{node}\" is unreachable via SSH")


 def get_reachable_node_list(node_list:list[str]) -> list[str]:
     reachable_node_list = []
     for node in node_list:
         try:
-            if node == this_node() or node_reachable_check(node):
+            if ssh_reachable_check(node):
                 reachable_node_list.append(node)
         except ValueError as e:
             logger.warning(str(e))
@@ -2451,6 +2449,12 @@ def __init__(self, msg: str, dead_nodes=None):
         self.dead_nodes = dead_nodes or []


+class UnreachableNodeError(ValueError):
+    def __init__(self, msg: str, unreachable_nodes=None):
+        super().__init__(msg)
+        self.unreachable_nodes = unreachable_nodes or []
+
+
 def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
     """
     Check if all cluster nodes are reachable
@@ -2461,7 +2465,7 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
     dead_nodes = []
     for node in offline_nodes:
         try:
-            node_reachable_check(node)
+            ssh_reachable_check(node)
         except ValueError:
             dead_nodes.append(node)
     if dead_nodes:
@@ -2472,8 +2476,17 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
         """
         raise DeadNodeError(msg, dead_nodes)

+    unreachable_nodes = []
     for node in online_nodes:
-        node_reachable_check(node)
+        try:
+            ssh_reachable_check(node)
+        except ValueError:
+            unreachable_nodes.append(node)
+    if unreachable_nodes:
+        msg = f"""There are unreachable nodes: {', '.join(unreachable_nodes)}.
+Please check the network connectivity before {action_to_do}.
+"""
+        raise UnreachableNodeError(msg, unreachable_nodes)


 def re_split_string(reg, string):

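UnreachableNodeError subclasses ValueError per this diff, so existing broad handlers keep working while new callers can distinguish the two failure modes. A hedged usage sketch, assuming the crmsh utils module and a configured logger are in scope, and assuming DeadNodeError is likewise a ValueError subclass (its base class is not shown in this hunk):

try:
    utils.check_all_nodes_reachable("stopping pacemaker")
except utils.DeadNodeError as e:
    # Offline in the CIB and not answering on port 22.
    logger.warning("dead nodes: %s", ', '.join(e.dead_nodes))
except utils.UnreachableNodeError as e:
    # Online in the CIB, yet SSH cannot reach them.
    logger.warning("unreachable nodes: %s", ', '.join(e.unreachable_nodes))
except ValueError as e:
    # Any other reachability failure (assuming both types derive from ValueError).
    logger.error("%s", e)
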
2 changes: 0 additions & 2 deletions test/features/cluster_blocking_ssh.feature
@@ -60,8 +60,6 @@ Feature: cluster testing with ssh blocked
     And Run "firewall-cmd --zone=public --add-rich-rule='rule port port=22 protocol=tcp drop' --permanent && firewall-cmd --reload" on "hanode2"
     And Try "ssh -o ConnectTimeout=5 hanode2" on "hanode1"
     Then Except "ssh: connect to host hanode2 port 22: Connection timed out" in stderr
-    When Run "timeout 5s crm report || echo "timeout"" on "hanode1"
-    Then Expected "timeout" in stdout
     When Write multi lines to file "/etc/crm/crm.conf" on "hanode1"
     """
     [core]

2 changes: 1 addition & 1 deletion test/features/qdevice_validate.feature
@@ -23,7 +23,7 @@ Feature: corosync qdevice/qnetd options validate
   Scenario: Service ssh on qnetd node not available
     When Run "systemctl stop sshd.service" on "node-without-ssh"
     When Try "crm cluster init --qnetd-hostname=node-without-ssh"
-    Then Except "ERROR: cluster.init: ssh service on "node-without-ssh" not available"
+    Then Except "ERROR: cluster.init: host "node-without-ssh" is unreachable via SSH"

   @clean
   Scenario: Option "--qdevice-port" set wrong port

2 changes: 1 addition & 1 deletion test/unittests/test_qdevice.py
@@ -203,7 +203,7 @@ def test_qdevice_p12_on_cluster(self):
         self.assertEqual(res, "/etc/corosync/qdevice/net/node1.com/qdevice-net-node.p12")

     @mock.patch('crmsh.utils.InterfacesInfo.ip_in_local')
-    @mock.patch('crmsh.utils.node_reachable_check')
+    @mock.patch('crmsh.utils.ssh_reachable_check')
     @mock.patch('socket.getaddrinfo')
     def test_check_qnetd_addr_local(self, mock_getaddrinfo, mock_reachable, mock_in_local):
         mock_getaddrinfo.return_value = [(None, ("10.10.10.123",)),]