4 changes: 2 additions & 2 deletions crmsh/bootstrap.py
@@ -275,7 +275,7 @@ def _validate_nodes_option(self):
             utils.fatal(f"Overriding current user '{self.current_user}' by '{user}'. Ouch, don't do it.")
         self.user_at_node_list = [value for (user, node), value in zip(li, self.user_at_node_list) if node != me]
         for user, node in (utils.parse_user_at_host(x) for x in self.user_at_node_list):
-            utils.node_reachable_check(node)
+            utils.ssh_reachable_check(node)

     def _validate_cluster_node(self):
         """
@@ -2449,7 +2449,7 @@ def bootstrap_join(context):
     _context.initialize_user()

     remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
-    utils.node_reachable_check(cluster_node)
+    utils.ssh_reachable_check(cluster_node)
     join_ssh(cluster_node, remote_user)
     remote_user = utils.user_of(cluster_node)

5 changes: 1 addition & 4 deletions crmsh/qdevice.py
@@ -208,14 +208,11 @@ def check_qnetd_addr(qnetd_addr):
         except socket.error:
             raise ValueError("host \"{}\" is unreachable".format(qnetd_addr))

-        utils.node_reachable_check(qnetd_addr)
+        utils.ssh_reachable_check(qnetd_addr)

         if utils.InterfacesInfo.ip_in_local(qnetd_ip):
             raise ValueError("host for qnetd must be a remote one")

-        if not utils.check_port_open(qnetd_ip, 22):
-            raise ValueError("ssh service on \"{}\" not available".format(qnetd_addr))
-
     @staticmethod
     def check_qdevice_port(qdevice_port):
         if not utils.valid_port(qdevice_port):

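Note that the explicit port-22 probe removed here is not lost: ssh_reachable_check (see the crmsh/utils.py hunk below) performs the same socket test through the existing check_port_open helper. For orientation, a minimal sketch of what such a helper typically looks like, assuming it is a plain TCP connect with a timeout (the real implementation lives in crmsh/utils.py and may differ):

import socket

def check_port_open(host: str, port: int, timeout: int = 3) -> bool:
    """Return True if a TCP connection to host:port succeeds within timeout."""
    try:
        # create_connection resolves the host and performs the TCP handshake.
        with socket.create_connection((host, port), timeout=timeout):
            return True
    except OSError:
        return False
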
32 changes: 17 additions & 15 deletions crmsh/ui_cluster.py
@@ -26,7 +26,7 @@
 from .prun import prun
 from .service_manager import ServiceManager
 from .sh import ShellUtils
-from .ui_node import parse_option_for_nodes
+from . import ui_utils
 from . import constants


@@ -167,23 +167,24 @@ def do_start(self, context, *args):
         '''
         Starts the cluster stack on all nodes or specific node(s)
         '''
-        node_list = parse_option_for_nodes(context, *args)
+        try:
+            node_list = ui_utils.parse_and_validate_node_args("start", *args)
+        except utils.NoSSHError as msg:
+            logger.error('%s', msg)
+            logger.info("Please try 'crm cluster start' on each node")
+            return False

         service_check_list = ["pacemaker.service"]
         start_qdevice = False
         if corosync.is_qdevice_configured():
             start_qdevice = True
             service_check_list.append("corosync-qdevice.service")

         service_manager = ServiceManager()
-        try:
-            for node in node_list[:]:
-                if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
-                    logger.info("The cluster stack already started on {}".format(node))
-                    node_list.remove(node)
-        except utils.NoSSHError as msg:
-            logger.error('%s', msg)
-            logger.info("Please try 'crm cluster start' on each node")
-            return False
+        for node in node_list[:]:
+            if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
+                logger.info("The cluster stack already started on {}".format(node))
+                node_list.remove(node)
         if not node_list:
             return

@@ -248,13 +249,14 @@ def do_stop(self, context, *args):
         '''
         Stops the cluster stack on all nodes or specific node(s)
         '''
-        node_list = parse_option_for_nodes(context, *args)
         try:
-            node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
+            node_list = ui_utils.parse_and_validate_node_args("stop", *args)
         except utils.NoSSHError as msg:
             logger.error('%s', msg)
             logger.info("Please try 'crm cluster stop' on each node")
             return False

+        node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
         if not node_list:
             return
         logger.debug(f"stop node list: {node_list}")
@@ -297,7 +299,7 @@ def do_enable(self, context, *args):
         '''
         Enable the cluster services on this node
         '''
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("enable", *args)
         service_manager = ServiceManager()
         node_list = service_manager.enable_service("pacemaker.service", node_list=node_list)
         if service_manager.service_is_available("corosync-qdevice.service") and corosync.is_qdevice_configured():
@@ -310,7 +312,7 @@ def do_disable(self, context, *args):
         '''
         Disable the cluster services on this node
         '''
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("disable", *args)
         service_manager = ServiceManager()
         node_list = service_manager.disable_service("pacemaker.service", node_list=node_list)
         service_manager.disable_service("corosync-qdevice.service", node_list=node_list)

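Hoisting the parse/validate step above the service-status loop changes where NoSSHError can surface: it is raised and handled once, before any remote queries, so a mid-loop SSH failure can no longer leave node_list half-filtered. A condensed sketch of the shared pattern, with names taken from this diff and the module logger assumed to be in scope:

def _run_on_validated_nodes(command_name, *args):
    """Sketch: validate first, then operate; NoSSHError ends the command early."""
    try:
        node_list = ui_utils.parse_and_validate_node_args(command_name, *args)
    except utils.NoSSHError as msg:
        logger.error('%s', msg)
        logger.info("Please try 'crm cluster %s' on each node", command_name)
        return False
    # Only reachable nodes remain; per-node work happens after this point.
    for node in node_list:
        ...
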
46 changes: 2 additions & 44 deletions crmsh/ui_node.py
@@ -6,7 +6,6 @@
 import copy
 import subprocess
 from lxml import etree
-from argparse import ArgumentParser, RawDescriptionHelpFormatter

 from . import config
 from . import command
@@ -219,47 +218,6 @@ def print_node(uname, ident, node_type, other, inst_attr, offline):
     print(term.render("\t%s" % (s)))


-def parse_option_for_nodes(context, *args):
-    """
-    Parse option for nodes
-    Return a node list
-    """
-    action_type = context.get_command_name()
-    action_target = "node" if action_type in ["standby", "online"] else "cluster service"
-    action = "{} {}".format(action_type, action_target)
-    usage_template = """
-Specify node(s) on which to {action}.
-If no nodes are specified, {action} on the local node.
-If --all is specified, {action} on all nodes."""
-    addtion_usage = ""
-    if action_type == "standby":
-        usage_template += """
-\n\nAdditionally, you may specify a lifetime for the standby---if set to
-"reboot", the node will be back online once it reboots. "forever" will
-keep the node in standby after reboot. The life time defaults to
-"forever"."""
-        addtion_usage = " [lifetime]"
-
-    parser = ArgumentParser(description=usage_template.format(action=action),
-                            usage="{} [--all | <node>... ]{}".format(action_type, addtion_usage),
-                            add_help=False,
-                            formatter_class=RawDescriptionHelpFormatter)
-    parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message")
-    parser.add_argument("--all", help="To {} on all nodes".format(action), action="store_true", dest="all")
-
-    options, args = parser.parse_known_args(args)
-    if options.help:
-        parser.print_help()
-        raise utils.TerminateSubCommand(success=True)
-    if options is None or args is None:
-        raise utils.TerminateSubCommand
-    if options.all and args:
-        context.fatal_error("Should either use --all or specific node(s)")
-
-    include_remote = action_type in ["standby", "online"]
-    return utils.validate_and_get_reachable_nodes(args, options.all, include_remote)
-
-
 class NodeMgmt(command.UI):
     '''
     Nodes management class
@@ -348,7 +306,7 @@ def do_standby(self, context, *args):
         args = args[:-1]

         # Parse node option
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("standby", *args)
         if not node_list:
             return

@@ -436,7 +394,7 @@ def do_online(self, context, *args):
         To avoid race condition for --all option, melt all online values into one cib replace session
         """
         # Parse node option
-        node_list = parse_option_for_nodes(context, *args)
+        node_list = ui_utils.parse_and_validate_node_args("online", *args)
         if not node_list:
             return

43 changes: 43 additions & 0 deletions crmsh/ui_utils.py
@@ -6,6 +6,7 @@
 import inspect
 from . import utils
 from . import log
+from argparse import ArgumentParser, RawDescriptionHelpFormatter


 logger = log.setup_logger(__name__)
@@ -162,3 +163,45 @@ def mknamed():
     if max_args >= 0 and len(args) > max_args:
         raise ValueError("Expected (%s), takes at most %d arguments (%d given)" %
                          (mknamed(), max_args-nskip, len(args)-nskip))
+
+
+def parse_and_validate_node_args(command_name, *args) -> list:
+    '''
+    Parses option for node-related commands
+    Then validates and returns the reachable node list
+    '''
+    action_target = "node" if command_name in ["standby", "online"] else "cluster service"
+    action = f"{command_name} {action_target}"
+    usage_template = """
+Specify node(s) on which to {action}.
+If no nodes are specified, {action} on the local node.
+If --all is specified, {action} on all nodes."""
+    addtion_usage = ""
+    if command_name == "standby":
+        usage_template += """
+\n\nAdditionally, you may specify a lifetime for the standby---if set to
+"reboot", the node will be back online once it reboots. "forever" will
+keep the node in standby after reboot. The life time defaults to
+"forever"."""
+        addtion_usage = " [lifetime]"
+
+    parser = ArgumentParser(
+        description=usage_template.format(action=action),
+        usage=f"{command_name} [--all | <node>... ]{addtion_usage}",
+        add_help=False,
+        formatter_class=RawDescriptionHelpFormatter
+    )
+    parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message")
+    parser.add_argument("--all", help=f"To {action} on all nodes", action="store_true", dest="all")
+
+    options, args = parser.parse_known_args(args)
+    if options.help:
+        parser.print_help()
+        raise utils.TerminateSubCommand(success=True)
+    if options is None or args is None:
+        raise utils.TerminateSubCommand
+    if options.all and args:
+        raise ValueError("Should either use --all or specific node(s)")
+
+    include_remote = command_name in ["standby", "online"]
+    return utils.validate_and_get_reachable_nodes(args, options.all, include_remote)
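
The moved function keeps the old parser behavior but takes the command name directly instead of reading it from a UI context; raising ValueError instead of calling context.fatal_error also makes it usable outside the UI layer. Hypothetical calls illustrating the argument handling (node names and cluster state are assumptions; the returned list ultimately comes from utils.validate_and_get_reachable_nodes):

# No args: act on the local node.
parse_and_validate_node_args("start")                      # e.g. ["hanode1"]
# --all: act on every reachable cluster node.
parse_and_validate_node_args("stop", "--all")              # e.g. ["hanode1", "hanode2"]
# Explicit nodes; "standby"/"online" also consider remote nodes.
parse_and_validate_node_args("standby", "hanode2")         # e.g. ["hanode2"] if its SSH port answers
# Mixing --all with explicit nodes is rejected.
parse_and_validate_node_args("start", "--all", "hanode1")  # raises ValueError
# -h/--help prints usage and raises TerminateSubCommand(success=True).
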
37 changes: 25 additions & 12 deletions crmsh/utils.py
@@ -2405,25 +2405,23 @@ def package_is_installed(pkg, remote_addr=None):
     return rc == 0


-def node_reachable_check(node, ping_count=1, port=22, timeout=3):
+def ssh_reachable_check(node):
     """
-    Check if node is reachable by using ping and socket to ssh port
+    Check if node is reachable by checking SSH port is open
     """
-    rc, _, _ = ShellUtils().get_stdout_stderr(f"ping -n -c {ping_count} -W {timeout} {node}")
-    if rc == 0:
-        return True
-    # ping failed, try to connect to ssh port by socket
-    if check_port_open(node, port, timeout):
+    if node == this_node() or check_port_open(node, 22):
         return True
-    # both ping and socket failed
-    raise ValueError(f"host \"{node}\" is unreachable")
+    if config.core.no_ssh:
+        raise NoSSHError(constants.NO_SSH_ERROR_MSG)
+    else:
+        raise ValueError(f"host \"{node}\" is unreachable via SSH")


 def get_reachable_node_list(node_list:list[str]) -> list[str]:
     reachable_node_list = []
     for node in node_list:
         try:
-            if node == this_node() or node_reachable_check(node):
+            if ssh_reachable_check(node):
                 reachable_node_list.append(node)
         except ValueError as e:
             logger.warning(str(e))
@@ -2451,6 +2449,12 @@ def __init__(self, msg: str, dead_nodes=None):
         self.dead_nodes = dead_nodes or []


+class UnreachableNodeError(ValueError):
+    def __init__(self, msg: str, unreachable_nodes=None):
+        super().__init__(msg)
+        self.unreachable_nodes = unreachable_nodes or []
+
+
 def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
     """
     Check if all cluster nodes are reachable
@@ -2461,7 +2465,7 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
     dead_nodes = []
     for node in offline_nodes:
         try:
-            node_reachable_check(node)
+            ssh_reachable_check(node)
         except ValueError:
             dead_nodes.append(node)
     if dead_nodes:
@@ -2472,8 +2476,17 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
         """
         raise DeadNodeError(msg, dead_nodes)

+    unreachable_nodes = []
     for node in online_nodes:
-        node_reachable_check(node)
+        try:
+            ssh_reachable_check(node)
+        except ValueError:
+            unreachable_nodes.append(node)
+    if unreachable_nodes:
+        msg = f"""There are unreachable nodes: {', '.join(unreachable_nodes)}.
+Please check the network connectivity before {action_to_do}.
+"""
+        raise UnreachableNodeError(msg, unreachable_nodes)


 def re_split_string(reg, string):

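UnreachableNodeError subclasses ValueError per this diff, so existing broad handlers keep working while new callers can distinguish the two failure modes. A hedged usage sketch, assuming the crmsh utils module and a configured logger are in scope, and assuming DeadNodeError is likewise a ValueError subclass (its base class is not shown in this hunk):

try:
    utils.check_all_nodes_reachable("stopping pacemaker")
except utils.DeadNodeError as e:
    # Offline in the CIB and not answering on port 22.
    logger.warning("dead nodes: %s", ', '.join(e.dead_nodes))
except utils.UnreachableNodeError as e:
    # Online in the CIB, yet SSH cannot reach them.
    logger.warning("unreachable nodes: %s", ', '.join(e.unreachable_nodes))
except ValueError as e:
    # Any other reachability failure (assuming both types derive from ValueError).
    logger.error("%s", e)
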
2 changes: 0 additions & 2 deletions test/features/cluster_blocking_ssh.feature
@@ -60,8 +60,6 @@ Feature: cluster testing with ssh blocked
     And Run "firewall-cmd --zone=public --add-rich-rule='rule port port=22 protocol=tcp drop' --permanent && firewall-cmd --reload" on "hanode2"
     And Try "ssh -o ConnectTimeout=5 hanode2" on "hanode1"
     Then Except "ssh: connect to host hanode2 port 22: Connection timed out" in stderr
-    When Run "timeout 5s crm report || echo "timeout"" on "hanode1"
-    Then Expected "timeout" in stdout
     When Write multi lines to file "/etc/crm/crm.conf" on "hanode1"
     """
     [core]

2 changes: 1 addition & 1 deletion test/features/qdevice_validate.feature
@@ -23,7 +23,7 @@ Feature: corosync qdevice/qnetd options validate
   Scenario: Service ssh on qnetd node not available
     When Run "systemctl stop sshd.service" on "node-without-ssh"
     When Try "crm cluster init --qnetd-hostname=node-without-ssh"
-    Then Except "ERROR: cluster.init: ssh service on "node-without-ssh" not available"
+    Then Except "ERROR: cluster.init: host "node-without-ssh" is unreachable via SSH"

   @clean
   Scenario: Option "--qdevice-port" set wrong port

2 changes: 1 addition & 1 deletion test/unittests/test_qdevice.py
@@ -203,7 +203,7 @@ def test_qdevice_p12_on_cluster(self):
         self.assertEqual(res, "/etc/corosync/qdevice/net/node1.com/qdevice-net-node.p12")

     @mock.patch('crmsh.utils.InterfacesInfo.ip_in_local')
-    @mock.patch('crmsh.utils.node_reachable_check')
+    @mock.patch('crmsh.utils.ssh_reachable_check')
     @mock.patch('socket.getaddrinfo')
     def test_check_qnetd_addr_local(self, mock_getaddrinfo, mock_reachable, mock_in_local):
         mock_getaddrinfo.return_value = [(None, ("10.10.10.123",)),]