Skip to content

Commit 1e10566

Browse files
authored
Fix: utils: Raise UnreachableNodeError for those ssh unreachable nodes (bsc#1250645) (#1860)
## Problem When SSH is not available at the peer node, configuring sbd will fail and might lead to an inconsistent state ``` # crm sbd configure watchdog-timeout=120 INFO: No 'msgwait-timeout=' specified in the command, use 2*watchdog timeout: 240 INFO: Configuring disk-based SBD INFO: Initializing SBD device /dev/sda5 INFO: Update SBD_WATCHDOG_DEV in /etc/sysconfig/sbd: /dev/watchdog0 ERROR: sbd.configure: Failed to run command test -d /etc/sysconfig || mkdir -p /etc/sysconfig on root@sle16-2: Cannot create SSH connection to root@sle16-2: ssh: connect to host sle16-2 port 22: Connection refused ``` See issue #1738 ## Changed Raise exception when detecting that any node's SSH port is not reachable ``` # crm sbd configure watchdog-timeout=120 WARNING: host "sle16-2" is unreachable ERROR: sbd.configure: There are unreachable nodes: sle16-2. Please check the network connectivity before configuring SBD. ```
2 parents 0c3bd0a + d5c9bd6 commit 1e10566

File tree

11 files changed

+102
-91
lines changed

11 files changed

+102
-91
lines changed

crmsh/bootstrap.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -273,7 +273,7 @@ def _validate_nodes_option(self):
273273
utils.fatal(f"Overriding current user '{self.current_user}' by '{user}'. Ouch, don't do it.")
274274
self.user_at_node_list = [value for (user, node), value in zip(li, self.user_at_node_list) if node != me]
275275
for user, node in (utils.parse_user_at_host(x) for x in self.user_at_node_list):
276-
utils.node_reachable_check(node)
276+
utils.ssh_reachable_check(node)
277277

278278
def _validate_cluster_node(self):
279279
"""
@@ -2360,7 +2360,7 @@ def bootstrap_join(context):
23602360
_context.initialize_user()
23612361

23622362
remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user)
2363-
utils.node_reachable_check(cluster_node)
2363+
utils.ssh_reachable_check(cluster_node)
23642364
join_ssh(cluster_node, remote_user)
23652365
remote_user = utils.user_of(cluster_node)
23662366

crmsh/qdevice.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -208,14 +208,11 @@ def check_qnetd_addr(qnetd_addr):
208208
except socket.error:
209209
raise ValueError("host \"{}\" is unreachable".format(qnetd_addr))
210210

211-
utils.node_reachable_check(qnetd_addr)
211+
utils.ssh_reachable_check(qnetd_addr)
212212

213213
if utils.InterfacesInfo.ip_in_local(qnetd_ip):
214214
raise ValueError("host for qnetd must be a remote one")
215215

216-
if not utils.check_port_open(qnetd_ip, 22):
217-
raise ValueError("ssh service on \"{}\" not available".format(qnetd_addr))
218-
219216
@staticmethod
220217
def check_qdevice_port(qdevice_port):
221218
if not utils.valid_port(qdevice_port):

crmsh/ui_cluster.py

Lines changed: 17 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
from .prun import prun
2727
from .service_manager import ServiceManager
2828
from .sh import ShellUtils
29-
from .ui_node import parse_option_for_nodes
29+
from . import ui_utils
3030
from . import constants
3131

3232

@@ -167,23 +167,24 @@ def do_start(self, context, *args):
167167
'''
168168
Starts the cluster stack on all nodes or specific node(s)
169169
'''
170-
node_list = parse_option_for_nodes(context, *args)
170+
try:
171+
node_list = ui_utils.parse_and_validate_node_args("start", *args)
172+
except utils.NoSSHError as msg:
173+
logger.error('%s', msg)
174+
logger.info("Please try 'crm cluster start' on each node")
175+
return False
176+
171177
service_check_list = ["pacemaker.service"]
172178
start_qdevice = False
173179
if corosync.is_qdevice_configured():
174180
start_qdevice = True
175181
service_check_list.append("corosync-qdevice.service")
176182

177183
service_manager = ServiceManager()
178-
try:
179-
for node in node_list[:]:
180-
if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
181-
logger.info("The cluster stack already started on {}".format(node))
182-
node_list.remove(node)
183-
except utils.NoSSHError as msg:
184-
logger.error('%s', msg)
185-
logger.info("Please try 'crm cluster start' on each node")
186-
return False
184+
for node in node_list[:]:
185+
if all([service_manager.service_is_active(srv, remote_addr=node) for srv in service_check_list]):
186+
logger.info("The cluster stack already started on {}".format(node))
187+
node_list.remove(node)
187188
if not node_list:
188189
return
189190

@@ -248,13 +249,14 @@ def do_stop(self, context, *args):
248249
'''
249250
Stops the cluster stack on all nodes or specific node(s)
250251
'''
251-
node_list = parse_option_for_nodes(context, *args)
252252
try:
253-
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
253+
node_list = ui_utils.parse_and_validate_node_args("stop", *args)
254254
except utils.NoSSHError as msg:
255255
logger.error('%s', msg)
256256
logger.info("Please try 'crm cluster stop' on each node")
257257
return False
258+
259+
node_list = [n for n in node_list if self._node_ready_to_stop_cluster_service(n)]
258260
if not node_list:
259261
return
260262
logger.debug(f"stop node list: {node_list}")
@@ -297,7 +299,7 @@ def do_enable(self, context, *args):
297299
'''
298300
Enable the cluster services on this node
299301
'''
300-
node_list = parse_option_for_nodes(context, *args)
302+
node_list = ui_utils.parse_and_validate_node_args("enable", *args)
301303
service_manager = ServiceManager()
302304
node_list = service_manager.enable_service("pacemaker.service", node_list=node_list)
303305
if service_manager.service_is_available("corosync-qdevice.service") and corosync.is_qdevice_configured():
@@ -310,7 +312,7 @@ def do_disable(self, context, *args):
310312
'''
311313
Disable the cluster services on this node
312314
'''
313-
node_list = parse_option_for_nodes(context, *args)
315+
node_list = ui_utils.parse_and_validate_node_args("disable", *args)
314316
service_manager = ServiceManager()
315317
node_list = service_manager.disable_service("pacemaker.service", node_list=node_list)
316318
service_manager.disable_service("corosync-qdevice.service", node_list=node_list)

crmsh/ui_node.py

Lines changed: 2 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
import copy
77
import subprocess
88
from lxml import etree
9-
from argparse import ArgumentParser, RawDescriptionHelpFormatter
109

1110
from . import config
1211
from . import command
@@ -219,47 +218,6 @@ def print_node(uname, ident, node_type, other, inst_attr, offline):
219218
print(term.render("\t%s" % (s)))
220219

221220

222-
def parse_option_for_nodes(context, *args):
223-
"""
224-
Parse option for nodes
225-
Return a node list
226-
"""
227-
action_type = context.get_command_name()
228-
action_target = "node" if action_type in ["standby", "online"] else "cluster service"
229-
action = "{} {}".format(action_type, action_target)
230-
usage_template = """
231-
Specify node(s) on which to {action}.
232-
If no nodes are specified, {action} on the local node.
233-
If --all is specified, {action} on all nodes."""
234-
addtion_usage = ""
235-
if action_type == "standby":
236-
usage_template += """
237-
\n\nAdditionally, you may specify a lifetime for the standby---if set to
238-
"reboot", the node will be back online once it reboots. "forever" will
239-
keep the node in standby after reboot. The life time defaults to
240-
"forever"."""
241-
addtion_usage = " [lifetime]"
242-
243-
parser = ArgumentParser(description=usage_template.format(action=action),
244-
usage="{} [--all | <node>... ]{}".format(action_type, addtion_usage),
245-
add_help=False,
246-
formatter_class=RawDescriptionHelpFormatter)
247-
parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message")
248-
parser.add_argument("--all", help="To {} on all nodes".format(action), action="store_true", dest="all")
249-
250-
options, args = parser.parse_known_args(args)
251-
if options.help:
252-
parser.print_help()
253-
raise utils.TerminateSubCommand(success=True)
254-
if options is None or args is None:
255-
raise utils.TerminateSubCommand
256-
if options.all and args:
257-
context.fatal_error("Should either use --all or specific node(s)")
258-
259-
include_remote = action_type in ["standby", "online"]
260-
return utils.validate_and_get_reachable_nodes(args, options.all, include_remote)
261-
262-
263221
class NodeMgmt(command.UI):
264222
'''
265223
Nodes management class
@@ -343,7 +301,7 @@ def do_standby(self, context, *args):
343301
args = args[:-1]
344302

345303
# Parse node option
346-
node_list = parse_option_for_nodes(context, *args)
304+
node_list = ui_utils.parse_and_validate_node_args("standby", *args)
347305
if not node_list:
348306
return
349307

@@ -431,7 +389,7 @@ def do_online(self, context, *args):
431389
To avoid race condition for --all option, melt all online values into one cib replace session
432390
"""
433391
# Parse node option
434-
node_list = parse_option_for_nodes(context, *args)
392+
node_list = ui_utils.parse_and_validate_node_args("online", *args)
435393
if not node_list:
436394
return
437395

crmsh/ui_utils.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import inspect
77
from . import utils
88
from . import log
9+
from argparse import ArgumentParser, RawDescriptionHelpFormatter
910

1011

1112
logger = log.setup_logger(__name__)
@@ -162,3 +163,45 @@ def mknamed():
162163
if max_args >= 0 and len(args) > max_args:
163164
raise ValueError("Expected (%s), takes at most %d arguments (%d given)" %
164165
(mknamed(), max_args-nskip, len(args)-nskip))
166+
167+
168+
def parse_and_validate_node_args(command_name, *args) -> list:
169+
'''
170+
Parses option for node-related commands
171+
Then validates and returns the reachable node list
172+
'''
173+
action_target = "node" if command_name in ["standby", "online"] else "cluster service"
174+
action = f"{command_name} {action_target}"
175+
usage_template = """
176+
Specify node(s) on which to {action}.
177+
If no nodes are specified, {action} on the local node.
178+
If --all is specified, {action} on all nodes."""
179+
addtion_usage = ""
180+
if command_name == "standby":
181+
usage_template += """
182+
\n\nAdditionally, you may specify a lifetime for the standby---if set to
183+
"reboot", the node will be back online once it reboots. "forever" will
184+
keep the node in standby after reboot. The life time defaults to
185+
"forever"."""
186+
addtion_usage = " [lifetime]"
187+
188+
parser = ArgumentParser(
189+
description=usage_template.format(action=action),
190+
usage=f"{command_name} [--all | <node>... ]{addtion_usage}",
191+
add_help=False,
192+
formatter_class=RawDescriptionHelpFormatter
193+
)
194+
parser.add_argument("-h", "--help", action="store_true", dest="help", help="Show this help message")
195+
parser.add_argument("--all", help=f"To {action} on all nodes", action="store_true", dest="all")
196+
197+
options, args = parser.parse_known_args(args)
198+
if options.help:
199+
parser.print_help()
200+
raise utils.TerminateSubCommand(success=True)
201+
if options is None or args is None:
202+
raise utils.TerminateSubCommand
203+
if options.all and args:
204+
raise ValueError("Should either use --all or specific node(s)")
205+
206+
include_remote = command_name in ["standby", "online"]
207+
return utils.validate_and_get_reachable_nodes(args, options.all, include_remote)

crmsh/utils.py

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -2428,25 +2428,23 @@ def package_is_installed(pkg, remote_addr=None):
24282428
return rc == 0
24292429

24302430

2431-
def node_reachable_check(node, ping_count=1, port=22, timeout=3):
2431+
def ssh_reachable_check(node):
24322432
"""
2433-
Check if node is reachable by using ping and socket to ssh port
2433+
Check if node is reachable by checking SSH port is open
24342434
"""
2435-
rc, _, _ = ShellUtils().get_stdout_stderr(f"ping -n -c {ping_count} -W {timeout} {node}")
2436-
if rc == 0:
2437-
return True
2438-
# ping failed, try to connect to ssh port by socket
2439-
if check_port_open(node, port, timeout):
2435+
if node == this_node() or check_port_open(node, 22):
24402436
return True
2441-
# both ping and socket failed
2442-
raise ValueError(f"host \"{node}\" is unreachable")
2437+
if config.core.no_ssh:
2438+
raise NoSSHError(constants.NO_SSH_ERROR_MSG)
2439+
else:
2440+
raise ValueError(f"host \"{node}\" is unreachable via SSH")
24432441

24442442

24452443
def get_reachable_node_list(node_list:list[str]) -> list[str]:
24462444
reachable_node_list = []
24472445
for node in node_list:
24482446
try:
2449-
if node == this_node() or node_reachable_check(node):
2447+
if ssh_reachable_check(node):
24502448
reachable_node_list.append(node)
24512449
except ValueError as e:
24522450
logger.warning(str(e))
@@ -2474,6 +2472,12 @@ def __init__(self, msg: str, dead_nodes=None):
24742472
self.dead_nodes = dead_nodes or []
24752473

24762474

2475+
class UnreachableNodeError(ValueError):
2476+
def __init__(self, msg: str, unreachable_nodes=None):
2477+
super().__init__(msg)
2478+
self.unreachable_nodes = unreachable_nodes or []
2479+
2480+
24772481
def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
24782482
"""
24792483
Check if all cluster nodes are reachable
@@ -2484,7 +2488,7 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
24842488
dead_nodes = []
24852489
for node in offline_nodes:
24862490
try:
2487-
node_reachable_check(node)
2491+
ssh_reachable_check(node)
24882492
except ValueError:
24892493
dead_nodes.append(node)
24902494
if dead_nodes:
@@ -2495,8 +2499,17 @@ def check_all_nodes_reachable(action_to_do: str, peer_node: str = None):
24952499
"""
24962500
raise DeadNodeError(msg, dead_nodes)
24972501

2502+
unreachable_nodes = []
24982503
for node in online_nodes:
2499-
node_reachable_check(node)
2504+
try:
2505+
ssh_reachable_check(node)
2506+
except ValueError:
2507+
unreachable_nodes.append(node)
2508+
if unreachable_nodes:
2509+
msg = f"""There are unreachable nodes: {', '.join(unreachable_nodes)}.
2510+
Please check the network connectivity before {action_to_do}.
2511+
"""
2512+
raise UnreachableNodeError(msg, unreachable_nodes)
25002513

25012514

25022515
def re_split_string(reg, string):

test/features/cluster_blocking_ssh.feature

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,8 +60,6 @@ Feature: cluster testing with ssh blocked
6060
And Run "firewall-cmd --zone=public --add-rich-rule='rule port port=22 protocol=tcp drop' --permanent && firewall-cmd --reload" on "hanode2"
6161
And Try "ssh -o ConnectTimeout=5 hanode2" on "hanode1"
6262
Then Except "ssh: connect to host hanode2 port 22: Connection timed out" in stderr
63-
When Run "timeout 5s crm report || echo "timeout"" on "hanode1"
64-
Then Expected "timeout" in stdout
6563
When Write multi lines to file "/etc/crm/crm.conf" on "hanode1"
6664
"""
6765
[core]

test/features/qdevice_validate.feature

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ Feature: corosync qdevice/qnetd options validate
2323
Scenario: Service ssh on qnetd node not available
2424
When Run "systemctl stop sshd.service" on "node-without-ssh"
2525
When Try "crm cluster init --qnetd-hostname=node-without-ssh"
26-
Then Except "ERROR: cluster.init: ssh service on "node-without-ssh" not available"
26+
Then Except "ERROR: cluster.init: host "node-without-ssh" is unreachable via SSH"
2727

2828
@clean
2929
Scenario: Option "--qdevice-port" set wrong port

test/unittests/test_qdevice.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,7 +203,7 @@ def test_qdevice_p12_on_cluster(self):
203203
self.assertEqual(res, "/etc/corosync/qdevice/net/node1.com/qdevice-net-node.p12")
204204

205205
@mock.patch('crmsh.utils.InterfacesInfo.ip_in_local')
206-
@mock.patch('crmsh.utils.node_reachable_check')
206+
@mock.patch('crmsh.utils.ssh_reachable_check')
207207
@mock.patch('socket.getaddrinfo')
208208
def test_check_qnetd_addr_local(self, mock_getaddrinfo, mock_reachable, mock_in_local):
209209
mock_getaddrinfo.return_value = [(None, ("10.10.10.123",)),]

0 commit comments

Comments
 (0)