Package: release.debian.org
Severity: normal
User: [email protected]
Usertags: unblock
X-Debbugs-Cc: [email protected]
Control: affects -1 + src:crmsh
Please unblock package crmsh

The new version of the crmsh package has already been uploaded to unstable,
but will not have enough time to transition to testing.

[ Reason ]
crmsh 5.0.0 finally brings support for Corosync 3, which is already
included in Debian.

[ Impact ]
Version 5.0.0~rc1 is already in testing, so a new upstream version would be
included either way, but rc1 might still contain some bugs.

[ Tests ]
Build tests and autopkgtests are passing for the new version.

[ Risks ]
The risk should be relatively low, as the rc2 version adds some fixes and
nothing else should depend on the crmsh package.

[ Checklist ]
  [x] all changes are documented in the d/changelog
  [x] I reviewed all changes and I approve them
  [x] attach debdiff against the package in testing

unblock crmsh/5.0.0~rc2-1

diff -Nru crmsh-5.0.0~rc1/ChangeLog crmsh-5.0.0~rc2/ChangeLog
--- crmsh-5.0.0~rc1/ChangeLog 2025-05-22 11:11:04.000000000 +0200
+++ crmsh-5.0.0~rc2/ChangeLog 2025-07-15 07:03:56.000000000 +0200
@@ -1,3 +1,36 @@
+* Tue Jul 15 2025 Xin Liang <[email protected]>
+- Release 5.0.0 rc2
+- Dev: migration: allow to run migration locally (jsc#PED-8252)
+- Dev: Remove unused code
+- Dev: utils: Validate if local node is a cluster member on peer node's view
+- Dev: ui_cluster: Enhance membership validation for `cluster run` command
+- Dev: corosync: Get value from runtime.config prefix and update default token value
+- Fix: bootstrap: should fallback to default user when `core.hosts` is not availabe from the seed node (bsc#1245343)
+- Fix: bootstrap: Refine qnetd passwordless configuration logic (bsc#1245387)
+- Fix: log: Improve function confirm's logic (bsc#1245386)
+- Dev: bootstrap: Remove dead node from the cluster
+- Dev: Prevent actions when offline nodes are unreachable
+- Dev: xmlutil: Address circular import issue
+- Dev: bootstrap: Remove user@host item from /root/.config/crm/crm.conf when removing node
+- Dev: run-functional-tests: Fetch container's IP address correctly
+- Dev: provide a friendly message when passwordless ssh does not work (bsc#1244525)
+- Fix: bootstrap: Reload corosync after sync corosync.conf (bsc#1244437)
+- Fix: bootstrap: setup_passwordless_with_other_nodes does not update the authorized_keys on localhost (bsc#1244314)
+- Dev: cibconfig: Prevent adding Pacemaker remote resources to groups, orders, or colocations
+- Fix: report.collect: Detect log existence before using it (bsc#1244515)
+- Dev: bootstrap: Improve node removal handling and messaging
+- Dev: ui_corosync: Write changes made by `corosync.set` to temporary file first
+- Dev: bootstrap: Improve configuration for admin IP
+- Dev: bootstrap: do not hide ssh-copy-id outputs in debug mode
+- Fix: bootstrap: add sleeps to avoid triggering sshd PerSourcePenalties (bsc#1243141)
+- Fix: crash_test: Correctly retrieve fence event information (bsc#1243786)
+- Dev: doc: Update help text for `corosync set` command
+- Dev: corosync_config_format: Skip comment key
+- ui_corosync: Add push reminder after called `corosync set`
+- Dev: ui_corosync: Call `corosync -t` to do verification
+- Dev: migration: use atomic write to modify corosync.conf on remote nodes (jsc#PED-8252)
+- Dev: Dockerfile: Install pacemaker-remote package
+
 * Wed May 21 2025 Xin Liang <[email protected]>
 - Release 5.0.0 rc1
 - Dev: Drop scripts and templates which include unsupported RAs (jsc#PED-8924) (#1800)
diff -Nru crmsh-5.0.0~rc1/codecov.yml crmsh-5.0.0~rc2/codecov.yml
--- crmsh-5.0.0~rc1/codecov.yml 2025-05-22 11:11:04.000000000 +0200
+++ crmsh-5.0.0~rc2/codecov.yml 2025-07-15 07:03:56.000000000 +0200
@@ -8,7 +8,7 @@
threshold: 0.35% codecov: notify: - after_n_builds: 31 + after_n_builds: 32 comment: - after_n_builds: 31 + after_n_builds: 32 layout: "condensed_header, flags, files, condensed_footer" diff -Nru crmsh-5.0.0~rc1/crmsh/bootstrap.py crmsh-5.0.0~rc2/crmsh/bootstrap.py --- crmsh-5.0.0~rc1/crmsh/bootstrap.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/bootstrap.py 2025-07-15 07:03:56.000000000 +0200 @@ -18,6 +18,7 @@ import re import tempfile import time +from time import sleep import readline import shutil import typing @@ -226,7 +227,7 @@ if self.stage == "sbd": if self.cluster_is_running: - utils.check_all_nodes_reachable() + utils.check_all_nodes_reachable("setup SBD") for node in utils.list_cluster_nodes(): if not utils.package_is_installed("sbd", node): utils.fatal(SBDManager.SBD_NOT_INSTALLED_MSG + f" on {node}") @@ -474,7 +475,7 @@ sh.cluster_shell().get_stdout_or_raise_error(f"crm -F configure load {action} {configuration_tmpfile}") -def wait_for_resource(message, resource, timeout_ms=WAIT_TIMEOUT_MS_DEFAULT): +def wait_for_resource(message, resource, timeout_ms=WAIT_TIMEOUT_MS_DEFAULT, fatal_on_timeout=True): """ Wait for resource started """ @@ -485,7 +486,12 @@ break status_progress(progress_bar) if 0 < timeout_ms <= (int(time.clock_gettime(time.CLOCK_MONOTONIC) * 1000) - start_time): - utils.fatal('Time out waiting for resource.') + error_msg = f'Time out waiting for resource "{resource}" to start.' + if fatal_on_timeout: + utils.fatal(error_msg) + else: + logger.error(error_msg) + break sleep(1) @@ -552,14 +558,6 @@ return "" -def sleep(t): - """ - Sleep for t seconds. - """ - t = float(t) - time.sleep(t) - - def status_progress(progress_bar): if not _context or not _context.quiet: progress_bar.progress() @@ -969,8 +967,11 @@ if utils.check_ssh_passwd_need(local_user, remote_user, remote_node, shell): configure_ssh_key(local_user) public_keys = ssh_key.fetch_public_key_file_list(None, local_user) + sleep(5) # bsc#1243141: sshd PerSourcePenalties logger.info("Configuring SSH passwordless with {}@{}".format(remote_user, remote_node)) - cmd = f"ssh-copy-id -i {public_keys[0].public_key_file()} '{remote_user}@{remote_node}' &> /dev/null" + cmd = f"ssh-copy-id -i {public_keys[0].public_key_file()} '{remote_user}@{remote_node}'" + if not config.core.debug: + cmd += ' &> /dev/null' result = shell.su_subprocess_run(local_user, cmd, tty=True) return SshCopyIdResult(result.returncode, public_keys) else: @@ -1525,7 +1526,12 @@ adminaddr = prompt_for_string('Virtual IP', valid_func=Validation.valid_admin_ip) crm_configure_load("update", 'primitive admin-ip IPaddr2 ip=%s op monitor interval=10 timeout=20' % (utils.doublequote(adminaddr))) - wait_for_resource("Configuring virtual IP ({})".format(adminaddr), "admin-ip") + wait_for_resource( + f"Configuring virtual IP ({adminaddr})", + "admin-ip", + timeout_ms=5000, + fatal_on_timeout=False + ) def configure_qdevice_interactive(): @@ -1624,6 +1630,7 @@ return logger.info("""Configure Qdevice/Qnetd:""") + utils.check_all_nodes_reachable("setup Qdevice") cluster_node_list = utils.list_cluster_nodes() for node in cluster_node_list: if not ServiceManager().service_is_available("corosync-qdevice.service", node): @@ -1889,12 +1896,23 @@ init_node_hostname = out # Swap ssh public key between join node and other cluster nodes for node in (node for node in cluster_node_list if node != init_node_hostname): - remote_user_to_swap = utils.user_of(node) - remote_privileged_user = remote_user_to_swap + try: + remote_privileged_user = 
utils.user_of(node) + except UserNotFoundError: + remote_privileged_user = local_user result = ssh_copy_id_no_raise(local_user, remote_privileged_user, node, local_shell) if result.returncode != 0: - utils.fatal("Failed to login to remote host {}@{}".format(remote_user_to_swap, node)) - _merge_ssh_authorized_keys(cluster_node_list) + utils.fatal("Failed to login to remote host {}@{}".format(remote_privileged_user, node)) + else: + user_by_host.add(remote_privileged_user, node) + user_by_host.save_local() + if utils.this_node() in cluster_node_list: + nodes_including_self = cluster_node_list + else: + nodes_including_self = [utils.this_node()] + nodes_including_self.extend(cluster_node_list) + # FIXME: 2 layers of loop is unnecessary? + _merge_ssh_authorized_keys(shell, user_of_host.UserOfHost.instance(), nodes_including_self) if local_user != 'hacluster': change_user_shell('hacluster', node) swap_public_ssh_key(node, 'hacluster', 'hacluster', local_user, remote_privileged_user, local_shell) @@ -1904,7 +1922,7 @@ user_by_host.save_remote(cluster_node_list) -def _merge_ssh_authorized_keys(nodes: typing.Sequence[str]): +def _merge_ssh_authorized_keys(shell: sh.ClusterShell, user_of_host: user_of_host.UserOfHost, nodes: typing.Sequence[str]): keys = set() with tempfile.TemporaryDirectory(prefix='crmsh-bootstrap-') as tmpdir: # sftp does not accept `~` @@ -1913,7 +1931,11 @@ for line in f: if line.startswith('ssh-'): keys.add(line.rstrip()) - parallax.parallax_run(nodes, _merge_line_into_file('~/.ssh/authorized_keys', keys)) + script = _merge_line_into_file('~/.ssh/authorized_keys', keys) + for node in nodes: + rc, error = shell.get_rc_and_error(node, user_of_host.user_of(node), script) + if rc != 0: + raise ValueError(error) def swap_key_for_hacluster(other_node_list): @@ -1989,10 +2011,11 @@ shell = sh.cluster_shell() - if is_qdevice_configured and not _context.use_ssh_agent: - # trigger init_qnetd_remote on init node - cmd = f"crm cluster init qnetd_remote {utils.this_node()} -y" - shell.get_stdout_or_raise_error(cmd, seed_host) + if is_qdevice_configured: + if not _context.use_ssh_agent or not _keys_from_ssh_agent(): + # trigger init_qnetd_remote on init node + cmd = f"crm cluster init qnetd_remote {utils.this_node()} -y" + shell.get_stdout_or_raise_error(cmd, seed_host) shutil.copy(corosync.conf(), COROSYNC_CONF_ORIG) @@ -2033,7 +2056,7 @@ except corosync.IPAlreadyConfiguredError as e: logger.warning(e) sync_file(corosync.conf()) - shell.get_stdout_or_raise_error('sudo corosync-cfgtool -R', seed_host) + shell.get_stdout_or_raise_error('corosync-cfgtool -R', seed_host) _context.sbd_manager.join_sbd(remote_user, seed_host) @@ -2045,27 +2068,25 @@ adjust_properties() - with logger_utils.status_long("Reloading cluster configuration"): + # Ditch no-quorum-policy=ignore + no_quorum_policy = utils.get_property("no-quorum-policy") + if no_quorum_policy == "ignore": + logger.info("Ditching no-quorum-policy=ignore") + if not utils.delete_property("no-quorum-policy"): + logger.error("Failed to delete no-quorum-policy=ignore") - # Ditch no-quorum-policy=ignore - no_quorum_policy = utils.get_property("no-quorum-policy") - if no_quorum_policy == "ignore": - logger.info("Ditching no-quorum-policy=ignore") - if not utils.delete_property("no-quorum-policy"): - logger.error("Failed to delete no-quorum-policy=ignore") + corosync.configure_two_node() + sync_file(corosync.conf()) + sync_files_to_disk() - invoke("crm cluster run 'crm corosync reload'") + with logger_utils.status_long("Reloading cluster 
configuration"): + shell.get_stdout_or_raise_error("corosync-cfgtool -R") if is_qdevice_configured: start_qdevice_on_join_node(seed_host) else: ServiceManager(sh.ClusterShellAdaptorForLocalShell(sh.LocalShell())).disable_service("corosync-qdevice.service") - if not is_qdevice_configured: - corosync.configure_two_node() - sync_file(corosync.conf()) - sync_files_to_disk() - def adjust_priority_in_rsc_defaults(is_2node_wo_qdevice): """ @@ -2154,14 +2175,15 @@ shell.get_stdout_or_raise_error(cmd, remote) -def remove_node_from_cluster(node): +def remove_node_from_cluster(node, dead_node=False): """ Remove node from running cluster and the corosync / pacemaker configuration. """ node_ip = get_cluster_node_ip(node) - stop_services(SERVICES_STOP_LIST, remote_addr=node) - qdevice.QDevice.remove_qdevice_db([node]) - rm_configuration_files(node) + if not dead_node: + stop_services(SERVICES_STOP_LIST, remote_addr=node) + qdevice.QDevice.remove_qdevice_db([node]) + rm_configuration_files(node) # execute the command : crm node delete $HOSTNAME logger.info("Removing node %s from CIB", node) @@ -2182,10 +2204,14 @@ sync_file(CSYNC2_CFG) sync_file(corosync.conf()) - # Trigger corosync config reload to ensure expected_votes is propagated - invoke("corosync-cfgtool -R") + sh.cluster_shell().get_stdout_or_raise_error("corosync-cfgtool -R") + + if not dead_node: + FirewallManager(peer=node).remove_service() - FirewallManager(peer=node).remove_service() + user_by_host = utils.HostUserConfig() + user_by_host.remove(node) + user_by_host.save_remote(utils.list_cluster_nodes()) def ssh_stage_finished(): @@ -2382,6 +2408,7 @@ try: with lock_inst.lock(): service_manager = ServiceManager() + utils.check_all_nodes_reachable("joining a node to the cluster", cluster_node) _context.node_list_in_cluster = utils.fetch_cluster_node_list_from_node(cluster_node) setup_passwordless_with_other_nodes(cluster_node) _context.skip_csync2 = not service_manager.service_is_active(CSYNC2_SERVICE, cluster_node) @@ -2423,7 +2450,7 @@ if not confirm("Removing QDevice service and configuration from cluster: Are you sure?"): return - utils.check_all_nodes_reachable() + utils.check_all_nodes_reachable("removing QDevice from the cluster") qdevice_reload_policy = qdevice.evaluate_qdevice_quorum_effect(qdevice.QDEVICE_REMOVE) logger.info("Disable corosync-qdevice.service") @@ -2439,7 +2466,7 @@ corosync.configure_two_node(removing=True) sync_file(corosync.conf()) if qdevice_reload_policy == qdevice.QdevicePolicy.QDEVICE_RELOAD: - invoke("crm cluster run 'crm corosync reload'") + sh.cluster_shell().get_stdout_or_raise_error("corosync-cfgtool -R") elif qdevice_reload_policy == qdevice.QdevicePolicy.QDEVICE_RESTART: restart_cluster() else: @@ -2488,7 +2515,23 @@ utils.fatal("No existing IP/hostname specified (use -c option)") remote_user, cluster_node = _parse_user_at_host(_context.cluster_node, _context.current_user) - cluster_node = get_node_canonical_hostname(cluster_node) + + try: + utils.check_all_nodes_reachable("removing a node from the cluster") + except utils.DeadNodeError as e: + if force_flag and cluster_node in e.dead_nodes: + remove_node_from_cluster(cluster_node, dead_node=True) + bootstrap_finished() + return + else: + raise + + if service_manager.service_is_active("pacemaker.service", cluster_node): + cluster_node = get_node_canonical_hostname(cluster_node) + else: + configured_nodes = utils.list_cluster_nodes() + if cluster_node not in configured_nodes: + utils.fatal(f"Node {cluster_node} is not configured in cluster! 
(valid nodes: {', '.join(configured_nodes)})") if not force_flag and not confirm("Removing node \"{}\" from the cluster: Are you sure?".format(cluster_node)): return diff -Nru crmsh-5.0.0~rc1/crmsh/cibconfig.py crmsh-5.0.0~rc2/crmsh/cibconfig.py --- crmsh-5.0.0~rc1/crmsh/cibconfig.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/cibconfig.py 2025-07-15 07:03:56.000000000 +0200 @@ -1702,14 +1702,21 @@ if self.node is None: # eh? logger.error("%s: no xml (strange)", self.obj_id) return utils.get_check_rc() + + rc = VerifyResult.SUCCESS l = get_resource_meta_list() if self.obj_type == "clone": l += constants.clone_meta_attributes elif self.obj_type == "ms": l += constants.clone_meta_attributes + constants.ms_meta_attributes elif self.obj_type == "group": + for c in self.node.iterchildren(): + if c.tag == "primitive" and c.get("class") == "ocf" and c.get("type") == "remote": + logger.error("Cannot put remote resource '%s' in a group", c.get("id")) + rc |= VerifyResult.FATAL_ERROR l += constants.group_meta_attributes - return sanity_check_meta(self.obj_id, self.node, l) + rc |= sanity_check_meta(self.obj_id, self.node, l) + return rc def repr_gv(self, gv_obj, from_grp=False): ''' @@ -1774,6 +1781,21 @@ return rc +def _check_if_primitive_in_constraint_is_remote(obj) -> VerifyResult: + rc = VerifyResult.SUCCESS + primitives = [] + if obj.obj_type == "colocation": + primitives = [obj.node.get("rsc"), obj.node.get("with-rsc")] + elif obj.obj_type == "order": + primitives = [obj.node.get("first"), obj.node.get("then")] + for rscid in primitives: + tgt = cib_factory.find_object(rscid) + if tgt and tgt.node.get("class") == "ocf" and tgt.node.get("type") == "remote": + logger.error("Cannot put remote resource '%s' in %s constraint", rscid, obj.obj_type) + rc |= VerifyResult.FATAL_ERROR + return rc + + class CibLocation(CibObject): ''' Location constraint. 
@@ -2000,7 +2022,9 @@ if self.node is None: logger.error("%s: no xml (strange)", self.obj_id) return utils.get_check_rc() - return _check_if_constraint_ref_is_child(self) + rc1 = _check_if_constraint_ref_is_child(self) + rc2 = _check_if_primitive_in_constraint_is_remote(self) + return rc1 | rc2 class CibRscTicket(CibSimpleConstraint): diff -Nru crmsh-5.0.0~rc1/crmsh/completers.py crmsh-5.0.0~rc2/crmsh/completers.py --- crmsh-5.0.0~rc1/crmsh/completers.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/completers.py 2025-07-15 07:03:56.000000000 +0200 @@ -70,7 +70,7 @@ nodes = call(xmlutil.listnodes) -online_nodes = call(lambda x: xmlutil.CrmMonXmlParser().get_node_list(x), "online") -standby_nodes = call(lambda x: xmlutil.CrmMonXmlParser().get_node_list(x), "standby") +online_nodes = call(lambda x: xmlutil.CrmMonXmlParser().get_node_list(standby=x), False) +standby_nodes = call(lambda x: xmlutil.CrmMonXmlParser().get_node_list(standby=x), True) shadows = call(xmlutil.listshadows) diff -Nru crmsh-5.0.0~rc1/crmsh/corosync_config_format.py crmsh-5.0.0~rc2/crmsh/corosync_config_format.py --- crmsh-5.0.0~rc1/crmsh/corosync_config_format.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/corosync_config_format.py 2025-07-15 07:03:56.000000000 +0200 @@ -186,6 +186,8 @@ match node: case dict(_): for key, value in node.items(): + if key.startswith(COMMENT_PREFIX): + continue queue.append((value, (*path, key))) case list(_) as li: for item in li: diff -Nru crmsh-5.0.0~rc1/crmsh/corosync.py crmsh-5.0.0~rc2/crmsh/corosync.py --- crmsh-5.0.0~rc1/crmsh/corosync.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/corosync.py 2025-07-15 07:03:56.000000000 +0200 @@ -23,7 +23,7 @@ logger = log.setup_logger(__name__) -COROSYNC_TOKEN_DEFAULT = 1000 # in ms units +COROSYNC_TOKEN_DEFAULT = 3000 # in ms units COROSYNC_CONF_TEMPLATE = """ # Generated by crmsh # For more details please see corosync.conf.5 man page @@ -313,12 +313,13 @@ inst.save() -def get_corosync_value(key): +def get_corosync_value(key, cmapctl_prefix="runtime.config"): """ Get corosync configuration value from corosync-cmapctl or corosync.conf """ try: - out = sh.cluster_shell().get_stdout_or_raise_error("corosync-cmapctl {}".format(key)) + cmapctl_prefix = f"{cmapctl_prefix.strip('.')}." 
if cmapctl_prefix else "" + out = sh.cluster_shell().get_stdout_or_raise_error(f"corosync-cmapctl {cmapctl_prefix}{key}") res = re.search(r'{}\s+.*=\s+(.*)'.format(key), out) return res.group(1) if res else None except ValueError: @@ -359,6 +360,8 @@ """ try: ConfParser(config_file=config_file) + corosync_verify_cmd = f"corosync -c {config_file} -t" if config_file else "corosync -t" + sh.cluster_shell().get_stdout_or_raise_error(corosync_verify_cmd) except ValueError as e: logger.error("Invalid %s: %s", config_file or conf(), e) return False @@ -499,12 +502,12 @@ return inst.get_all(path) @classmethod - def set_value(cls, path, value, index=0): + def set_value(cls, path, value, index=0, config_file=None): """ Class method to set value for path Then write back to config file """ - inst = cls() + inst = cls(config_file=config_file) inst.set(path, value, index) inst.save() diff -Nru crmsh-5.0.0~rc1/crmsh/crash_test/config.py crmsh-5.0.0~rc2/crmsh/crash_test/config.py --- crmsh-5.0.0~rc1/crmsh/crash_test/config.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/crash_test/config.py 2025-07-15 07:03:56.000000000 +0200 @@ -4,6 +4,5 @@ iptables -{action} OUTPUT -d {peer_ip} -j DROP''' REMOVE_PORT = "firewall-cmd --zone=public --remove-port={port}/udp" ADD_PORT = "firewall-cmd --zone=public --add-port={port}/udp" -FENCE_HISTORY = "stonith_admin -h {node}" SBD_CONF = "/etc/sysconfig/sbd" SBD_CHECK_CMD = "sbd -d {dev} dump" diff -Nru crmsh-5.0.0~rc1/crmsh/crash_test/task.py crmsh-5.0.0~rc2/crmsh/crash_test/task.py --- crmsh-5.0.0~rc1/crmsh/crash_test/task.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/crash_test/task.py 2025-07-15 07:03:56.000000000 +0200 @@ -1,11 +1,12 @@ import os -import re import time import threading import shutil import tempfile +from datetime import datetime from contextlib import contextmanager from crmsh import utils as crmshutils +from crmsh import xmlutil from crmsh import log from . import utils from . import config @@ -27,6 +28,7 @@ REBOOT_WARNING = """!!! WARNING WARNING WARNING !!! THIS CASE MAY LEAD TO NODE BE FENCED. TYPE Yes TO CONTINUE, OTHER INPUTS WILL CANCEL THIS CASE [Yes/No](No): """ + TIME_STR_FORMAT = '%Y/%m/%d %H:%M:%S' def __init__(self, description, flush=False, quiet=False): """ @@ -37,7 +39,7 @@ self.force = False self.quiet = quiet self.messages = [] - self.timestamp = utils.now() + self.timestamp = datetime.now() self.description = description utils.msg_info(self.description, to_stdout=False) self.flush = flush @@ -81,7 +83,7 @@ Build base results """ self.result = { - "Timestamp": self.timestamp, + "Timestamp": self.timestamp.strftime(self.TIME_STR_FORMAT), "Description": self.description, "Messages": ["{} {}:{}".format(m[2], m[0].upper(), m[1]) for m in self.messages] @@ -128,36 +130,24 @@ 1. There is one latest fence action successfully done 2. 
No fence action during fence timeout, thread_stop_event triggered by main thread """ - target_node = None - from_node = None - fence_timestamp = None - # Try to find out which node fire the fence action while not self.thread_stop_event.is_set(): - rc, out, _ = ShellUtils().get_stdout_stderr("crm_mon -1|grep -A1 \"Fencing Actions:\"") - if rc == 0 and out: - match = re.search(r"of (.*) pending: .*origin=(.*)$", out) - if match: - target_node, from_node = match.groups() - self.info("Node \"{}\" will be fenced by \"{}\"!".format(target_node, from_node)) + fence_event_dict = xmlutil.CrmMonXmlParser().get_last_fence_event_info() + if fence_event_dict: + target_node = fence_event_dict.get('target') + origin_node = fence_event_dict.get('origin') + complete_time = fence_event_dict.get('completed') + status = fence_event_dict.get('status') + if status == "pending" and not self.fence_start_event.is_set(): + self.info(f"Node \"{target_node}\" will be fenced by \"{origin_node}\"!") self.fence_start_event.set() - break - time.sleep(1) - - # Try to find out proof that fence happened - while not self.thread_stop_event.is_set(): - rc, out, _ = ShellUtils().get_stdout_stderr(config.FENCE_HISTORY.format(node=target_node)) - if rc == 0 and out: - match = re.search(r"Node {} last fenced at: (.*)".format(target_node), out) - if match: - fence_timestamp = match.group(1) - task_timestamp_dt = utils.str_to_datetime(self.timestamp, '%Y/%m/%d %H:%M:%S') - fence_timestamp_dt = utils.str_to_datetime(fence_timestamp, '%a %b %d %H:%M:%S %Y') - # If the fence action timestamp larger than this task's timestamp - # That is the proof - if task_timestamp_dt < fence_timestamp_dt: - self.info("Node \"{}\" was successfully fenced by \"{}\"".format(target_node, from_node)) - # Tell main thread fence happened + # Try to find out proof that fence happened + elif status == "success": + task_timestamp = self.timestamp.timestamp() + complete_timestamp = datetime.fromisoformat(complete_time).timestamp() + # This success event should after the task started + if task_timestamp < complete_timestamp: + self.info(f"Node \"{target_node}\" was fenced by \"{origin_node}\" at {complete_time}") self.fence_finish_event.set() break time.sleep(1) @@ -259,7 +249,7 @@ message = "{} [{}]".format(self.description, utils.CGREEN + "Pass" + utils.CEND) else: message = "{} [{}]".format(self.description, utils.CRED + "Fail" + utils.CEND) - logger.info(message, extra={'timestamp': '[{}]'.format(self.timestamp)}) + logger.info(message, extra={'timestamp': '[{}]'.format(self.timestamp.strftime(self.TIME_STR_FORMAT))}) for msg in self.messages: logger.log(utils.LEVEL[msg[0]], msg[1], extra={'timestamp': ' '}) diff -Nru crmsh-5.0.0~rc1/crmsh/log_patterns.py crmsh-5.0.0~rc2/crmsh/log_patterns.py --- crmsh-5.0.0~rc1/crmsh/log_patterns.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/log_patterns.py 2025-07-15 07:03:56.000000000 +0200 @@ -25,122 +25,6 @@ __all__ = ('patterns',) -_patterns_old = { - "resource": ( - ( # detail 0 - "lrmd.*%% (?:start|stop|promote|demote|migrate)", - "lrmd.*RA output: .%%:.*:stderr", - "lrmd.*WARN: Managed %%:.*exited", - "lrmd.*WARN: .* %% .*timed out$", - "crmd.*LRM operation %%_(?:start|stop|promote|demote|migrate)_.*confirmed=true", - "crmd.*LRM operation %%_.*Timed Out", - "[(]%%[)]\\[", - ), - ( # detail 1 - "lrmd.*%% (?:probe|notify)", - "lrmd.*Managed %%:.*exited", - ), - ), - "node": ( - ( # detail 0 - " %% .*Corosync.Cluster.Engine", - " %% .*Executive.Service.RELEASE", - " %% .*Requesting.shutdown", - " %% 
.*Shutdown.complete", - " %% .*Configuration.validated..Starting.heartbeat", - "pengine.*Scheduling Node %% for STONITH", - "crmd.* of %% failed", - "stonith-ng.*host '%%'", - "Exec.*on %% ", - "Node %% will be fenced", - "stonith-ng.*for %% timed", - "stonith-ng.*can not fence %%:", - "stonithd.*Succeeded.*node %%:", - "(?:lost|memb): %% ", - "crmd.*(?:NEW|LOST):.* %% ", - "Node return implies stonith of %% ", - ), - ( # detail 1 - ), - ), - "quorum": ( - ( # detail 0 - "crmd.*Updating.quorum.status", - "crmd.*quorum.(?:lost|ac?quir)", - ), - ( # detail 1 - ), - ), - "events": ( - ( # detail 0 - "CRIT:", - "ERROR:", - ), - ( # detail 1 - "WARN:", - ), - ), -} - -_patterns_118 = { - "resource": ( - ( # detail 0 - "crmd.*Initiating.*%%_(?:start|stop|promote|demote|migrate)_", - "lrmd.*operation_finished: %%_", - "lrmd.*executing - rsc:%% action:(?:start|stop|promote|demote|migrate)", - "lrmd.*finished - rsc:%% action:(?:start|stop|promote|demote|migrate)", - - "crmd.*LRM operation %%_(?:start|stop|promote|demote|migrate)_.*confirmed=true", - "crmd.*LRM operation %%_.*Timed Out", - "[(]%%[)]\\[", - ), - ( # detail 1 - "crmd.*Initiating.*%%_(?:monitor_0|notify)", - "lrmd.*executing - rsc:%% action:(?:monitor_0|notify)", - "lrmd.*finished - rsc:%% action:(?:monitor_0|notify)", - ), - ), - "node": ( - ( # detail 0 - " %% .*Corosync.Cluster.Engine", - " %% .*Executive.Service.RELEASE", - " %% .*crm_shutdown:.Requesting.shutdown", - " %% .*pcmk_shutdown:.Shutdown.complete", - " %% .*Configuration.validated..Starting.heartbeat", - "pengine.*Scheduling Node %% for STONITH", - "pengine.*Node %% will be fenced", - "crmd.*for %% failed", - "stonith-ng.*host '%%'", - "Exec.*on %% ", - "Node %% will be fenced", - "stonith-ng.*on %% for.*timed out", - "stonith-ng.*can not fence %%:", - "stonithd.*Succeeded.*node %%:", - "(?:lost|memb): %% ", - "crmd.*(?:NEW|LOST|new|lost):.* %% ", - "Node return implies stonith of %% ", - ), - ( # detail 1 - ), - ), - "quorum": ( - ( # detail 0 - "crmd.*Updating.(quorum).status", - r"crmd.*quorum.(?:lost|ac?quir[^\s]*)", - ), - ( # detail 1 - ), - ), - "events": ( - ( # detail 0 - "(CRIT|crit|ERROR|error|UNCLEAN|unclean):", - ), - ( # detail 1 - "(WARN|warning):", - ), - ), -} - _patterns_200 = { "resource": ( ( # detail 0 @@ -280,8 +164,3 @@ def patterns(cib_f=None): if utils.is_min_pcmk_ver(constants.PCMK_VERSION_DEFAULT, cib_f=cib_f): return _patterns_200 - is118 = utils.is_larger_than_pcmk_118(cib_f=cib_f) - if is118: - return _patterns_118 - else: - return _patterns_old diff -Nru crmsh-5.0.0~rc1/crmsh/log.py crmsh-5.0.0~rc2/crmsh/log.py --- crmsh-5.0.0~rc1/crmsh/log.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/log.py 2025-07-15 07:03:56.000000000 +0200 @@ -431,8 +431,9 @@ """ while True: ans = self.wait_input("{} (y/n)? ".format(msg.strip("? 
"))) - if ans: - return ans.lower() == "y" + if not ans or ans.lower() not in ('y', 'n'): + continue + return ans.lower() == 'y' def syntax_err(self, s, token='', context='', msg=''): err = "syntax" diff -Nru crmsh-5.0.0~rc1/crmsh/migration.py crmsh-5.0.0~rc2/crmsh/migration.py --- crmsh-5.0.0~rc1/crmsh/migration.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/migration.py 2025-07-15 07:03:56.000000000 +0200 @@ -156,9 +156,12 @@ return ret -def migrate(): +def migrate(args: typing.Sequence[str]): + parser = argparse.ArgumentParser(args[0]) + parser.add_argument('--local', action='store_true') + parsed_args = parser.parse_args(args[1:]) try: - match _check_impl(local=False, json='', summary=False): + match _check_impl(local=parsed_args.local, json='', summary=False): case CheckReturnCode.ALREADY_MIGRATED: logger.info("This cluster works on SLES 16. No migration is needed.") return 0 @@ -167,7 +170,7 @@ return 0 case CheckReturnCode.PASS_NEED_AUTO_FIX: logger.info('Starting migration...') - migrate_corosync_conf() + migrate_corosync_conf(local=parsed_args.local) logger.info('Finished migration.') return 0 case _: @@ -372,7 +375,7 @@ ) -def migrate_corosync_conf(): +def migrate_corosync_conf(local: bool): conf_path = corosync.conf() with open(conf_path, 'r', encoding='utf-8') as f: config = corosync_config_format.DomParser(f).dom() @@ -387,12 +390,17 @@ 'Finish migrating corosync configuration. The original configuration is renamed to %s.bak', os.path.basename(conf_path), ) - for host, result in prun.pcopy_to_remote(conf_path, utils.list_cluster_nodes_except_me(), conf_path).items(): - match result: - case None: - pass - case prun.PRunError() as e: - logger.error("Failed to copy crmsh.conf to host %s: %s", host, e) + if not local: + for host, result in prun.pcopy_to_remote( + conf_path, + utils.list_cluster_nodes_except_me(), conf_path, + atomic_write=True, + ).items(): + match result: + case None: + pass + case prun.PRunError() as e: + logger.error("Failed to copy crmsh.conf to host %s: %s", host, e) def migrate_corosync_conf_impl(config): diff -Nru crmsh-5.0.0~rc1/crmsh/prun/prun.py crmsh-5.0.0~rc2/crmsh/prun/prun.py --- crmsh-5.0.0~rc1/crmsh/prun/prun.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/prun/prun.py 2025-07-15 07:03:56.000000000 +0200 @@ -1,5 +1,6 @@ # prun.py - run command or copy files on multiple hosts concurrently import os +import random import socket import tempfile import typing @@ -147,6 +148,7 @@ hosts: typing.Sequence[str], dst: str, recursive: bool = False, *, + atomic_write: bool = False, timeout_seconds: int = -1, concurrency: int = _DEFAULT_CONCURRENCY, interceptor: PRunInterceptor = PRunInterceptor(), @@ -161,7 +163,11 @@ return {x: None for x in hosts} flags = '-pr' if recursive else '-p' local_sudoer, _ = UserOfHost.instance().user_pair_for_ssh(hosts[0]) - script = "put {} '{}' '{}'\n".format(flags, src, dst) + if atomic_write: + suffix = '{:x}-{}'.format(random.SystemRandom().getrandbits(64), socket.gethostname()) + script = f"put {flags} '{src}' '{dst}.{suffix}'\nrename '{dst}.{suffix}' '{dst}'\n" + else: + script = "put {} '{}' '{}'\n".format(flags, src, dst) ssh = None try: # sftp -S does not parse args, it accepts only a single executable. So we create one. 
diff -Nru crmsh-5.0.0~rc1/crmsh/qdevice.py crmsh-5.0.0~rc2/crmsh/qdevice.py --- crmsh-5.0.0~rc1/crmsh/qdevice.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/qdevice.py 2025-07-15 07:03:56.000000000 +0200 @@ -630,7 +630,6 @@ """ Adjust SBD_WATCHDOG_TIMEOUT when configuring qdevice and diskless SBD """ - utils.check_all_nodes_reachable() self.using_diskless_sbd = SBDUtils.is_using_diskless_sbd() # add qdevice after diskless sbd started if self.using_diskless_sbd: diff -Nru crmsh-5.0.0~rc1/crmsh/report/collect.py crmsh-5.0.0~rc2/crmsh/report/collect.py --- crmsh-5.0.0~rc1/crmsh/report/collect.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/report/collect.py 2025-07-15 07:03:56.000000000 +0200 @@ -10,7 +10,7 @@ import pwd import datetime from subprocess import TimeoutExpired -from typing import List +from typing import List, Optional import crmsh.user_of_host from crmsh import log, sh, corosync @@ -23,11 +23,11 @@ logger = log.setup_report_logger(__name__) -def get_corosync_log() -> str: +def get_corosync_log() -> Optional[str]: """ Get the path of the corosync log file """ - corosync_log = "" + corosync_log = None corosync_conf_path = corosync.conf() if os.path.exists(corosync_conf_path): corosync_log = corosync.get_value("logging.logfile") @@ -36,7 +36,7 @@ return corosync_log -def get_pcmk_log() -> str: +def get_pcmk_log() -> Optional[str]: """ Get the path of the pacemaker log file """ @@ -57,7 +57,7 @@ return log logger.warning("No valid pacemaker log file found") - return "" + return None def collect_ha_logs(context: core.Context) -> None: @@ -65,10 +65,10 @@ Collect pacemaker, corosync and extra logs """ log_list = [get_pcmk_log(), get_corosync_log()] + context.extra_log_list - log_list = [os.path.expanduser(log) for log in log_list] + log_list = [os.path.expanduser(log) for log in log_list if log is not None] log_list_marked_same_basename = utils.mark_duplicate_basenames(log_list) for log, same_basename in log_list_marked_same_basename: - if log and os.path.isfile(log): + if os.path.isfile(log): utils.dump_logset(context, log, create_dir=same_basename) diff -Nru crmsh-5.0.0~rc1/crmsh/ui_cluster.py crmsh-5.0.0~rc2/crmsh/ui_cluster.py --- crmsh-5.0.0~rc1/crmsh/ui_cluster.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/ui_cluster.py 2025-07-15 07:03:56.000000000 +0200 @@ -839,7 +839,7 @@ case 'sles16': try: if parsed_args.fix: - migration.migrate() + migration.migrate(['sles16'] + remaining_args) else: return 0 == migration.check(['sles16'] + remaining_args) except migration.MigrationFailure as e: @@ -895,13 +895,7 @@ ''' Execute the given command on all nodes/specific node(s), report outcome ''' - if nodes: - hosts = list(nodes) - else: - hosts = utils.list_cluster_nodes() - if hosts is None: - context.fatal_error("failed to get node list from cluster") - + hosts = utils.validate_and_get_reachable_nodes(nodes, all_nodes=True) for host, result in prun.prun({x: cmd for x in hosts}).items(): if isinstance(result, prun.PRunError): logger.error("[%s]: %s", host, result) diff -Nru crmsh-5.0.0~rc1/crmsh/ui_corosync.py crmsh-5.0.0~rc2/crmsh/ui_corosync.py --- crmsh-5.0.0~rc1/crmsh/ui_corosync.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/ui_corosync.py 2025-07-15 07:03:56.000000000 +0200 @@ -6,8 +6,10 @@ import json import re import subprocess +import os import sys import typing +import shutil from . import command, sh, parallax, iproute2 from . 
import completers @@ -369,6 +371,13 @@ nodes = utils.list_cluster_nodes() return corosync.diff_configuration(nodes, checksum=checksum) + @staticmethod + def _note_for_push(): + if len(utils.list_cluster_nodes()) > 1: + logger.warning("\"%s\" has changed, should be synced with other nodes", corosync.conf()) + logger.info("Use \"crm corosync diff\" to show the difference") + logger.info("Use \"crm corosync push\" to sync") + @command.skill_level('administrator') def do_edit(self, context): ''' @@ -377,10 +386,8 @@ cfg = corosync.conf() try: rc = utils.edit_file_ext(cfg, corosync.is_valid_corosync_conf) - if rc and len(utils.list_cluster_nodes()) > 1: - logger.warning(f"\"{cfg}\" has changed, should be synced with other nodes") - logger.info("Use \"crm corosync diff\" to show the difference") - logger.info("Use \"crm corosync push\" to sync") + if rc: + self._note_for_push() except IOError as e: context.fatal_error(str(e)) @@ -412,7 +419,16 @@ @command.completers(completers.call(corosync.get_all_paths)) def do_set(self, context, path, value, index: int = 0): """Set a corosync configuration value""" - corosync.set_value(path, value, index) + corosync_conf_file = corosync.conf() + with utils.create_tempfile(dir=os.path.dirname(corosync_conf_file)) as temp_file: + shutil.copyfile(corosync_conf_file, temp_file) + corosync.ConfParser.set_value(path, value, index=index, config_file=temp_file) + if corosync.is_valid_corosync_conf(temp_file): + os.rename(temp_file, corosync_conf_file) + self._note_for_push() + return True + else: + return False @command.level(Link) def do_link(self): diff -Nru crmsh-5.0.0~rc1/crmsh/ui_node.py crmsh-5.0.0~rc2/crmsh/ui_node.py --- crmsh-5.0.0~rc1/crmsh/ui_node.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/ui_node.py 2025-07-15 07:03:56.000000000 +0200 @@ -243,18 +243,7 @@ if options.all and args: context.fatal_error("Should either use --all or specific node(s)") - # return local node - if (not options.all and not args) or (len(args) == 1 and args[0] == utils.this_node()): - return [utils.this_node()] - member_list = utils.list_cluster_nodes() or utils.get_address_list_from_corosync_conf() - if not member_list: - context.fatal_error("Cannot get the node list from cluster") - for node in args: - if node not in member_list: - context.fatal_error(f"Node '{node}' is not a member of the cluster") - - node_list = member_list if options.all else args - return utils.get_reachable_node_list(node_list) + return utils.validate_and_get_reachable_nodes(args, options.all) class NodeMgmt(command.UI): @@ -268,19 +257,7 @@ node_delete = """cibadmin -D -o nodes -X '<node uname="%s"/>'""" node_delete_status = """cibadmin -D -o status -X '<node_state uname="%s"/>'""" node_cleanup_resources = "crm_resource --cleanup --node '%s'" - node_clear_state = _oneline("""cibadmin %s - -o status --xml-text - '<node_state id="%s" - uname="%s" - ha="active" - in_ccm="false" - crmd="offline" - join="member" - expected="down" - crm-debug-origin="manual_clear" - shutdown="0" - />'""") - node_clear_state_118 = "stonith_admin --confirm %s" + node_clear_state = "stonith_admin --confirm %s" crm_node = "crm_node" node_fence = "crm_attribute -t status -N '%s' -n terminate -v true" dc = "crmadmin -D" @@ -500,24 +477,21 @@ if not config.core.force and \ not utils.ask("Do you really want to drop state for node %s?" 
% node): return False - if utils.is_larger_than_pcmk_118(): - cib_elem = xmlutil.cibdump2elem() - if cib_elem is None: - return False - crmd = cib_elem.xpath("//node_state[@uname=\"%s\"]/@crmd" % node) - if not crmd: - logger.error("Node '%s' not found in CIB", node) - return False - if crmd == ["online"] or (crmd[0].isdigit() and int(crmd[0]) != 0): - return utils.ext_cmd(self.node_cleanup_resources % node) == 0 - in_ccm = cib_elem.xpath("//node_state[@uname=\"%s\"]/@in_ccm" % node) - if in_ccm == ["true"] or (in_ccm[0].isdigit() and int(in_ccm[0]) != 0): - logger.warning("Node is offline according to Pacemaker, but online according to corosync. First shut down node '%s'", node) - return False - return utils.ext_cmd(self.node_clear_state_118 % node) == 0 - else: - return utils.ext_cmd(self.node_clear_state % ("-M -c", node, node)) == 0 and \ - utils.ext_cmd(self.node_clear_state % ("-R", node, node)) == 0 + + cib_elem = xmlutil.cibdump2elem() + if cib_elem is None: + return False + crmd = cib_elem.xpath("//node_state[@uname=\"%s\"]/@crmd" % node) + if not crmd: + logger.error("Node '%s' not found in CIB", node) + return False + if crmd == ["online"] or (crmd[0].isdigit() and int(crmd[0]) != 0): + return utils.ext_cmd(self.node_cleanup_resources % node) == 0 + in_ccm = cib_elem.xpath("//node_state[@uname=\"%s\"]/@in_ccm" % node) + if in_ccm == ["true"] or (in_ccm[0].isdigit() and int(in_ccm[0]) != 0): + logger.warning("Node is offline according to Pacemaker, but online according to corosync. First shut down node '%s'", node) + return False + return utils.ext_cmd(self.node_clear_state % node) == 0 @classmethod def call_delnode(cls, node): diff -Nru crmsh-5.0.0~rc1/crmsh/ui_sbd.py crmsh-5.0.0~rc2/crmsh/ui_sbd.py --- crmsh-5.0.0~rc1/crmsh/ui_sbd.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/ui_sbd.py 2025-07-15 07:03:56.000000000 +0200 @@ -516,6 +516,8 @@ if len(args) < 2: raise self.SyntaxError("No device specified") + utils.check_all_nodes_reachable("configuring SBD device") + logger.info("Configured sbd devices: %s", ';'.join(self.device_list_from_config)) if len(args) == 2 and ";" in args[1]: device_list_from_args = args[1].split(";") @@ -549,6 +551,8 @@ if not self._service_is_active(service): return False + utils.check_all_nodes_reachable("configuring SBD") + parameter_dict = self._parse_args(args) if sbd.SBDUtils.is_using_disk_based_sbd(): self._configure_diskbase(parameter_dict) @@ -572,6 +576,8 @@ if not self._service_is_active(constants.SBD_SERVICE): return False + utils.check_all_nodes_reachable("purging SBD") + if args and args[0] == "crashdump": self._set_crashdump_option(delete=True) update_dict = self._set_crashdump_in_sysconfig(restore=True) diff -Nru crmsh-5.0.0~rc1/crmsh/user_of_host.py crmsh-5.0.0~rc2/crmsh/user_of_host.py --- crmsh-5.0.0~rc1/crmsh/user_of_host.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/user_of_host.py 2025-07-15 07:03:56.000000000 +0200 @@ -2,6 +2,7 @@ import os import socket import subprocess +import time import typing from . 
import config @@ -36,7 +37,7 @@ if cached is None: ret = self._get_user_of_host_from_config(host) if ret is None: - raise UserNotFoundError() + raise UserNotFoundError(f"UserNotFoundError: host={host}") else: self._user_cache[host] = ret return ret @@ -64,7 +65,7 @@ else: ret = self._guess_user_for_ssh(host) if ret is None: - raise UserNotFoundError from None + raise UserNotFoundError(f'Passwordless ssh to host "{host}" does not work.') from None else: self._user_pair_cache[host] = ret return ret @@ -102,6 +103,7 @@ args.extend(['-o', 'BatchMode=yes', host, 'sudo', 'true']) else: args.extend(['-o', 'BatchMode=yes', host, 'true']) + logger.debug("subprocess.call(%s)", args) rc = subprocess.call( args, stdin=subprocess.DEVNULL, @@ -113,6 +115,7 @@ user = userdir.getuser() return user, user else: + time.sleep(5) # bsc#1243141: sshd PerSourcePenalties return None diff -Nru crmsh-5.0.0~rc1/crmsh/utils.py crmsh-5.0.0~rc2/crmsh/utils.py --- crmsh-5.0.0~rc1/crmsh/utils.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/utils.py 2025-07-15 07:03:56.000000000 +0200 @@ -42,6 +42,7 @@ from . import options from . import term from . import log +from . import xmlutil from .prun import prun from .sh import ShellUtils from .service_manager import ServiceManager @@ -1638,10 +1639,6 @@ return is_larger_than_min_version(constants.pcmk_version, min_ver) -def is_larger_than_pcmk_118(cib_f=None): - return is_min_pcmk_ver("1.1.8", cib_f=cib_f) - - # quote function from python module shlex.py in python 3.3 _find_unsafe = re.compile(r'[^\w@%+=:,./-]').search @@ -1722,7 +1719,6 @@ ''' Returns a list of nodes in the cluster. ''' - from . import xmlutil rc, out, err = ShellUtils().get_stdout_stderr(constants.CIB_QUERY, no_reg=no_reg) # When cluster service running if rc == 0: @@ -2438,7 +2434,7 @@ reachable_node_list = [] for node in node_list: try: - if node_reachable_check(node): + if node == this_node() or node_reachable_check(node): reachable_node_list.append(node) except ValueError as e: logger.warning(str(e)) @@ -2460,12 +2456,34 @@ return dict(re.findall(r"(Expected|Total) votes:\s+(\d+)", out)) -def check_all_nodes_reachable(): +class DeadNodeError(ValueError): + def __init__(self, msg: str, dead_nodes=None): + super().__init__(msg) + self.dead_nodes = dead_nodes or [] + + +def check_all_nodes_reachable(action_to_do: str, peer_node: str = None): """ Check if all cluster nodes are reachable """ - out = sh.cluster_shell().get_stdout_or_raise_error("crm_node -l") - for node in re.findall(r"\d+ (.*) \w+", out): + crm_mon_inst = xmlutil.CrmMonXmlParser(peer_node) + online_nodes = crm_mon_inst.get_node_list() + offline_nodes = crm_mon_inst.get_node_list(online=False) + dead_nodes = [] + for node in offline_nodes: + try: + node_reachable_check(node) + except ValueError: + dead_nodes.append(node) + if dead_nodes: + # dead nodes bring risk to cluster, either bring them online or remove them + msg = f"""There are offline nodes also unreachable: {', '.join(dead_nodes)}. +Please bring them online before {action_to_do}. +Or use `crm cluster remove <offline_node> --force` to remove the offline node. 
+ """ + raise DeadNodeError(msg, dead_nodes) + + for node in online_nodes: node_reachable_check(node) @@ -3051,6 +3069,10 @@ def get(self, host): return self._hosts_users[host] + def remove(self, host): + if host in self._hosts_users: + del self._hosts_users[host] + def add(self, user, host): self._hosts_users[host] = user @@ -3220,4 +3242,46 @@ def __bool__(self): return self in self.SUCCESS | self.WARNING + + +def validate_and_get_reachable_nodes( + nodes: typing.List[str] = [], + all_nodes: bool = False + ) -> typing.List[str]: + + no_cib = False + cluster_member_list = list_cluster_nodes() + if not cluster_member_list: + cluster_member_list = get_address_list_from_corosync_conf() + if cluster_member_list: + no_cib = True + + if not cluster_member_list: + fatal("Cannot get the member list of the cluster") + for node in nodes: + if node not in cluster_member_list: + fatal(f"Node '{node}' is not a member of the cluster") + + local_node = this_node() + # Return local node if no nodes specified + if not nodes and not all_nodes: + return [local_node] + # Use all cluster members if no nodes specified and all_nodes is True + node_list = nodes or cluster_member_list + # Filter out unreachable nodes + node_list = get_reachable_node_list(node_list) + if no_cib: + return node_list + + shell = sh.cluster_shell() + crm_mon_inst = xmlutil.CrmMonXmlParser() + for node in node_list[:]: + if node == local_node or crm_mon_inst.is_node_online(node): + continue + out = shell.get_stdout_or_raise_error("crm node show", node) + if not re.search(rf"^{local_node}\(\d\): member", out, re.M): + logger.error("From the view of node '%s', node '%s' is not a member of the cluster", node, local_node) + node_list.remove(node) + + return node_list # vim:ts=4:sw=4:et: diff -Nru crmsh-5.0.0~rc1/crmsh/xmlutil.py crmsh-5.0.0~rc2/crmsh/xmlutil.py --- crmsh-5.0.0~rc1/crmsh/xmlutil.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/crmsh/xmlutil.py 2025-07-15 07:03:56.000000000 +0200 @@ -18,10 +18,7 @@ from . import constants from . import userdir from .sh import ShellUtils -from .utils import add_sudo, str2file, str2tmp, get_boolean, handle_role_for_ocf_1_1, copy_local_file, rmfile -from .utils import stdout2list, crm_msec, crm_time_cmp -from .utils import olist, get_cib_in_use, get_tempdir, to_ascii, is_boolean_true -from .utils import VerifyResult +from . import utils from . import log @@ -53,14 +50,14 @@ # because xmlparse function requires the function descriptor not the plain text # and this would be so much work to redo it. # It's not too bad, but it's still a workaround and better be refactored, so FIXME! 
- copy_local_file(s, cib_tmp_copy) + utils.copy_local_file(s, cib_tmp_copy) f = open(cib_tmp_copy, 'r') logger.debug("{} successfully read the cib.xml".format(userdir.getuser())) cib_elem = xmlparse(f) f.close() if cib_tmp_copy != '': - rmfile(cib_tmp_copy) + utils.rmfile(cib_tmp_copy) if options.regression_tests and cib_elem is None: print("Failed to read CIB from file: %s" % (s)) return cib_elem @@ -91,7 +88,7 @@ def sudocall(cmd): - cmd = add_sudo(cmd) + cmd = utils.add_sudo(cmd) if options.regression_tests: print(".EXT", cmd) p = subprocess.Popen( @@ -102,7 +99,7 @@ try: outp, errp = p.communicate() p.wait() - return p.returncode, to_ascii(outp), to_ascii(errp) + return p.returncode, utils.to_ascii(outp), utils.to_ascii(errp) except IOError as msg: logger.error("running %s: %s", cmd, msg) return None, None, None @@ -111,7 +108,7 @@ def cibdump2file(fname): _, outp, _ = sudocall(cib_dump) if outp is not None: - return str2file(outp, fname) + return utils.str2file(outp, fname) return None @@ -119,7 +116,7 @@ try: _, outp, _ = sudocall(cib_dump) if outp is not None: - return str2tmp(outp) + return utils.str2tmp(outp) except IOError as msg: logger.error(msg) return None @@ -158,17 +155,17 @@ def sanity_check_nvpairs(ident, node, attr_list): - rc = VerifyResult.SUCCESS + rc = utils.VerifyResult.SUCCESS for nvpair in node.iterchildren("nvpair"): n = nvpair.get("name") if n and n not in attr_list: logger.warning("%s: unknown attribute '%s'", ident, n) - rc |= VerifyResult.WARNING + rc |= utils.VerifyResult.WARNING return rc def sanity_check_meta(ident, node, attr_list): - rc = VerifyResult.SUCCESS + rc = utils.VerifyResult.SUCCESS if node is None or not attr_list: return rc for c in node.iterchildren(): @@ -397,7 +394,7 @@ def is_live_cib(): '''We working with the live cluster?''' - return not get_cib_in_use() and not os.getenv("CIB_file") + return not utils.get_cib_in_use() and not os.getenv("CIB_file") def is_crmuser(): @@ -413,14 +410,14 @@ home = userdir.gethomedir(config.core.user) if home and home.startswith(os.path.sep): return os.path.join(home, ".cib") - return get_tempdir() + return utils.get_tempdir() def listshadows(): d = cib_shadow_dir() if not os.path.isdir(d): return [] - rc, l = stdout2list("ls %s | fgrep shadow. | sed 's/^shadow\\.//'" % d) + rc, l = utils.stdout2list("ls %s | fgrep shadow. 
| sed 's/^shadow\\.//'" % d) return l @@ -564,7 +561,7 @@ def is_ms_or_promotable_clone(node): - is_promotable_type = is_boolean_true(is_attr_set(node, "promotable")) + is_promotable_type = utils.is_boolean_true(is_attr_set(node, "promotable")) is_ms_type = node.tag in ("master", "ms") return is_ms_type or is_promotable_type @@ -826,9 +823,9 @@ interval = interval or "0" for op in matching_name: opint = op.get("interval") - if interval == "non-0" and crm_msec(opint) > 0: + if interval == "non-0" and utils.crm_msec(opint) > 0: return op - if crm_time_cmp(opint, interval) == 0: + if utils.crm_time_cmp(opint, interval) == 0: return op return None @@ -837,7 +834,7 @@ interval = (op == "monitor" and "non-0" or "0") op_n = find_operation(rsc_node, op == "probe" and "monitor" or op, interval) timeout = op_n is not None and op_n.get("timeout") or default_timeout - return crm_msec(timeout) + return utils.crm_msec(timeout) def op2list(node): @@ -926,11 +923,11 @@ def is_resource_cli(s): - return s in olist(constants.resource_cli_names) + return s in utils.olist(constants.resource_cli_names) def is_constraint_cli(s): - return s in olist(constants.constraint_cli_names) + return s in utils.olist(constants.constraint_cli_names) def referenced_resources(node): @@ -1015,7 +1012,7 @@ l.append(rset) c_obj.updated = True c_modified = True - elif not get_boolean(rset.get("sequential"), True) and rref_cnt > 1: + elif not utils.get_boolean(rset.get("sequential"), True) and rref_cnt > 1: nonseq_rset = True cnt += rref_cnt rmnodes(l) @@ -1440,7 +1437,7 @@ """ <nvpair name="" value="" /> """ - value = handle_role_for_ocf_1_1(value, name=name) + value = utils.handle_role_for_ocf_1_1(value, name=name) return new("nvpair", name=name, value=value) @@ -1534,16 +1531,20 @@ xpath = f'//node[@name="{node}" and @online="true"]' return bool(self.xml_elem.xpath(xpath)) - def get_node_list(self, attr=None): + def get_node_list(self, online=True, standby=False, exclude_remote=True) -> list[str]: """ Get a list of nodes based on the given attribute """ - attr_dict = { - 'standby': '[@standby="true"]', - 'online': '[@standby="false"]' - } - xpath_str = f'//node{attr_dict.get(attr, "")}' - return [e.get('name') for e in self.xml_elem.xpath(xpath_str)] + xpath_str = '//nodes/node' + conditions = [] + online_value = "true" if online else "false" + conditions.append(f'@online="{online_value}"') + standby_value = "true" if standby else "false" + conditions.append(f'@standby="{standby_value}"') + if exclude_remote: + conditions.append('@type="member"') + xpath_str += '[' + ' and '.join(conditions) + ']' + return [elem.get('name') for elem in self.xml_elem.xpath(xpath_str)] def is_resource_configured(self, ra_type): """ @@ -1574,4 +1575,19 @@ """ xpath = f'//resource[@resource_agent="{ra_type}"]' return [elem.get('id') for elem in self.xml_elem.xpath(xpath)] + + def get_last_fence_event_info(self) -> dict: + fence_event_info = {} + fence_events = self.xml_elem.xpath(r'//fence_history/fence_event') + if not fence_events: + return fence_event_info + last_event = fence_events[0] + fence_event_info = { + 'origin': last_event.get('origin', ''), + 'target': last_event.get('target', ''), + 'action': last_event.get('action', ''), + 'status': last_event.get('status', ''), + 'completed': last_event.get('completed', '') + } + return fence_event_info # vim:ts=4:sw=4:et: diff -Nru crmsh-5.0.0~rc1/data-manifest crmsh-5.0.0~rc2/data-manifest --- crmsh-5.0.0~rc1/data-manifest 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/data-manifest 
2025-07-15 07:03:56.000000000 +0200 @@ -81,6 +81,7 @@ test/features/healthcheck.feature test/features/migration.feature test/features/ocfs2.feature +test/features/pacemaker_remote.feature test/features/qdevice_options.feature test/features/qdevice_setup_remove.feature test/features/qdevice_usercase.feature diff -Nru crmsh-5.0.0~rc1/debian/changelog crmsh-5.0.0~rc2/debian/changelog --- crmsh-5.0.0~rc1/debian/changelog 2025-06-19 21:08:58.000000000 +0200 +++ crmsh-5.0.0~rc2/debian/changelog 2025-07-15 20:42:43.000000000 +0200 @@ -1,3 +1,10 @@ +crmsh (5.0.0~rc2-1) unstable; urgency=medium + + * New upstream version 5.0.0~rc2 + * d/patches: refresh for new version + + -- Valentin Vidic <[email protected]> Tue, 15 Jul 2025 20:42:43 +0200 + crmsh (5.0.0~rc1-1) unstable; urgency=medium * New upstream version 5.0.0~rc1 diff -Nru crmsh-5.0.0~rc1/debian/patches/0013-Fix-cluster-bootstrap.patch crmsh-5.0.0~rc2/debian/patches/0013-Fix-cluster-bootstrap.patch --- crmsh-5.0.0~rc1/debian/patches/0013-Fix-cluster-bootstrap.patch 2025-06-14 13:32:03.000000000 +0200 +++ crmsh-5.0.0~rc2/debian/patches/0013-Fix-cluster-bootstrap.patch 2025-07-15 20:39:10.000000000 +0200 @@ -4,7 +4,7 @@ Last-Update: 2019-01-12 --- a/crmsh/utils.py +++ b/crmsh/utils.py -@@ -2410,13 +2410,13 @@ +@@ -2406,13 +2406,13 @@ """ Check if package is installed """ @@ -22,7 +22,7 @@ --- a/crmsh/bootstrap.py +++ b/crmsh/bootstrap.py -@@ -54,8 +54,8 @@ +@@ -55,8 +55,8 @@ logger_utils = log.LoggerUtils(logger) diff -Nru crmsh-5.0.0~rc1/debian/patches/0027-Rename_sbd_config.patch crmsh-5.0.0~rc2/debian/patches/0027-Rename_sbd_config.patch --- crmsh-5.0.0~rc1/debian/patches/0027-Rename_sbd_config.patch 2025-06-19 21:08:39.000000000 +0200 +++ crmsh-5.0.0~rc2/debian/patches/0027-Rename_sbd_config.patch 2025-07-15 20:40:33.000000000 +0200 @@ -5,10 +5,10 @@ This patch header follows DEP-3: http://dep.debian.net/deps/dep3/ --- a/crmsh/crash_test/config.py +++ b/crmsh/crash_test/config.py -@@ -5,5 +5,5 @@ +@@ -4,5 +4,5 @@ + iptables -{action} OUTPUT -d {peer_ip} -j DROP''' REMOVE_PORT = "firewall-cmd --zone=public --remove-port={port}/udp" ADD_PORT = "firewall-cmd --zone=public --add-port={port}/udp" - FENCE_HISTORY = "stonith_admin -h {node}" -SBD_CONF = "/etc/sysconfig/sbd" +SBD_CONF = "/etc/default/sbd" SBD_CHECK_CMD = "sbd -d {dev} dump" @@ -92,7 +92,7 @@ content_list = [ --- a/doc/crm.8.adoc +++ b/doc/crm.8.adoc -@@ -2138,7 +2138,7 @@ +@@ -2139,7 +2139,7 @@ Main functionailities include: - Show configured disk metadata @@ -142,7 +142,7 @@ A shared device must be available and visible on all nodes. --- a/test/unittests/test_bootstrap.py +++ b/test/unittests/test_bootstrap.py -@@ -1885,7 +1885,7 @@ +@@ -1906,7 +1906,7 @@ bootstrap.rm_configuration_files() mock_run.assert_has_calls([ mock.call('rm -f file1 file2', None), diff -Nru crmsh-5.0.0~rc1/doc/crm.8.adoc crmsh-5.0.0~rc2/doc/crm.8.adoc --- crmsh-5.0.0~rc1/doc/crm.8.adoc 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/doc/crm.8.adoc 2025-07-15 07:03:56.000000000 +0200 @@ -1010,20 +1010,16 @@ Verifies the health of a specified topic. ............... -health hawk2|sles16 [--fix] +health hawk2|sles16 [--local] [--fix] ............... * `hawk2`: check or fix key-based ssh authentication for user hacluster, which -is needed by hawk2. + is needed by hawk2. + ** `--fix`: attempts to automatically resolve any detected issues, eg. + hacluster passwordless * `sles16`: check whether the cluster is good to migrate to SLES 16. 
- -The optional `--fix` argument attempts to automatically resolve any detected -issues. - -.Note on sles16 -**************************** -`--fix` is only available after the OS is migrated to SLES 16. -**************************** + ** `--local`: run checks in local mode + ** `--fix`: attempts to automatically resolve any detected issues. [[cmdhelp.cluster.init,Initializes a new HA cluster,From Code]] ==== `init` @@ -1366,9 +1362,14 @@ exist in the configuration file, it will be added. However, if the section containing the value does not exist, the command will fail. +If there are multiple entries with the same path, you can specify index +`set <path> <value> <index>` to set a specific entry. +The default index is `0`. + Usage: ......... set quorum.expected_votes 2 +set nodelist.node.nodeid 3 1 ......... [[cmdhelp.corosync.show,Display the corosync configuration]] diff -Nru crmsh-5.0.0~rc1/.github/workflows/crmsh-ci.yml crmsh-5.0.0~rc2/.github/workflows/crmsh-ci.yml --- crmsh-5.0.0~rc1/.github/workflows/crmsh-ci.yml 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/.github/workflows/crmsh-ci.yml 2025-07-15 07:03:56.000000000 +0200 @@ -445,6 +445,19 @@ token: ${{ secrets.CODECOV_TOKEN }} flags: integration + functional_test_pacemaker_remote: + runs-on: ubuntu-24.04 + timeout-minutes: 40 + steps: + - uses: actions/checkout@v4 + - name: functional test for pacemaker remote + run: | + $CONTAINER_SCRIPT `$GET_INDEX_OF pacemaker_remote` + - uses: codecov/codecov-action@v4 + with: + token: ${{ secrets.CODECOV_TOKEN }} + flags: integration + original_regression_test: runs-on: ubuntu-24.04 timeout-minutes: 40 diff -Nru crmsh-5.0.0~rc1/test/features/bootstrap_bugs.feature crmsh-5.0.0~rc2/test/features/bootstrap_bugs.feature --- crmsh-5.0.0~rc1/test/features/bootstrap_bugs.feature 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/features/bootstrap_bugs.feature 2025-07-15 07:03:56.000000000 +0200 @@ -287,3 +287,17 @@ When Run "crm cluster join -c hanode1 -y" on "hanode2" And Run "crm configure show" on "hanode1" Then Expected "no-quorum-policy=ignore" not in stdout + + @clean + Scenario: Join when `core.hosts` is not available from the seed node (bsc#1245343) + Given Cluster service is "stopped" on "hanode1" + And Cluster service is "stopped" on "hanode2" + When Run "crm cluster init -y" on "hanode1" + And Run "crm cluster join -c hanode1 -y" on "hanode2" + And Run "rm -r /root/.config/crm" on "hanode1,hanode2" + And Run "crm cluster join -c hanode1 -y" on "hanode3" + Then Cluster service is "started" on "hanode3" + When Run "crm cluster stop --all" on "hanode3" + Then Cluster service is "stopped" on "hanode1" + And Cluster service is "stopped" on "hanode2" + And Cluster service is "stopped" on "hanode3" diff -Nru crmsh-5.0.0~rc1/test/features/bootstrap_init_join_remove.feature crmsh-5.0.0~rc2/test/features/bootstrap_init_join_remove.feature --- crmsh-5.0.0~rc1/test/features/bootstrap_init_join_remove.feature 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/features/bootstrap_init_join_remove.feature 2025-07-15 07:03:56.000000000 +0200 @@ -19,6 +19,8 @@ And Cluster is using "knet" transport mode Scenario: Init cluster service on node "hanode1", and join on node "hanode2" + Then Run "corosync-cmapctl|grep "votequorum.two_node .* = 1"" OK + Then Run "corosync-cmapctl|grep "votequorum.two_node .* = 1"" OK on "hanode2" Scenario: Support --all or specific node to manage cluster and nodes When Run "crm node standby --all" on "hanode1" @@ -73,6 +75,25 @@ Then Directory 
"/var/lib/pacemaker/cib/" is empty on "hanode2" Then Directory "/var/lib/pacemaker/pengine/" is empty on "hanode2" Then Directory "/var/lib/corosync/" is empty on "hanode2" + When Try "crm cluster run "crm cluster stop" hanode1" on "hanode2" + Then Expected "Cannot get the member list of the cluster" in stderr + + Scenario: Remove peer node when cluster is not running + Then File "/etc/corosync/authkey" exists on "hanode2" + Then File "/etc/corosync/corosync.conf" exists on "hanode2" + Then File "/etc/pacemaker/authkey" exists on "hanode2" + Then Directory "/var/lib/pacemaker/cib/" not empty on "hanode2" + Then Directory "/var/lib/corosync/" not empty on "hanode2" + When Run "crm cluster stop" on "hanode2" + When Try "crm cluster remove @hanode2.ip.0 -y" on "hanode1" + Then Expected "Node @hanode2.ip.0 is not configured in cluster! (valid nodes: hanode1, hanode2)" in stderr + When Run "crm cluster remove hanode2 -y" on "hanode1" + Then File "/etc/corosync/authkey" not exist on "hanode2" + Then File "/etc/corosync/corosync.conf" not exist on "hanode2" + Then File "/etc/pacemaker/authkey" not exist on "hanode2" + Then Directory "/var/lib/pacemaker/cib/" is empty on "hanode2" + Then Directory "/var/lib/pacemaker/pengine/" is empty on "hanode2" + Then Directory "/var/lib/corosync/" is empty on "hanode2" Scenario: Remove local node "hanode1" When Run "crm configure primitive d1 Dummy" on "hanode1" @@ -186,3 +207,16 @@ Then Cluster service is "started" on "hanode3" And Online nodes are "hanode1 hanode2 hanode3" And Check passwordless for hacluster between "hanode1 hanode2 hanode3" "successfully" + + @skip_non_root + Scenario: Remove offline and unreachable node + When Run "init 0" on "hanode2" + Then Online nodes are "hanode1" + When Run "sleep 10" on "hanode1" + When Try "crm cluster remove hanode2 -y" on "hanode1" + Then Expected "There are offline nodes also unreachable: hanode2" in stderr + When Try "crm status|grep "OFFLINE.*hanode2"" on "hanode1" + Then Expected return code is "0" + When Run "crm cluster remove hanode2 -y --force" on "hanode1" + When Try "crm status|grep "OFFLINE.*hanode2"" on "hanode1" + Then Expected return code is "1" diff -Nru crmsh-5.0.0~rc1/test/features/bootstrap_options.feature crmsh-5.0.0~rc2/test/features/bootstrap_options.feature --- crmsh-5.0.0~rc1/test/features/bootstrap_options.feature 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/features/bootstrap_options.feature 2025-07-15 07:03:56.000000000 +0200 @@ -7,7 +7,7 @@ "-n": Set the name of the configured cluster "-A": Configure IP address as an administration virtual IP Tag @clean means need to stop cluster service if the service is available - Need nodes: hanode1 hanode2 hanode3 + Need nodes: hanode1 hanode2 hanode3 qnetd-node @clean Scenario: Check help output @@ -134,6 +134,13 @@ And Cluster name is "hatest" And Cluster virtual IP is "@vip.0" And Show cluster status on "hanode1" + + @clean + Scenario: Invalid virtual IP address wouldn't block cluster init + Given Cluster service is "stopped" on "hanode1" + When Run "crm cluster init -A 60.60.60.6 --qnetd-hostname qnetd-node -y" on "hanode1" + Then Expected "Time out waiting for resource "admin-ip" to start" in stderr + Then Service "corosync-qdevice" is "started" on "hanode1" @clean Scenario: Detect multi IP in the same NIC diff -Nru crmsh-5.0.0~rc1/test/features/corosync_ui.feature crmsh-5.0.0~rc2/test/features/corosync_ui.feature --- crmsh-5.0.0~rc1/test/features/corosync_ui.feature 2025-05-22 11:11:04.000000000 +0200 +++ 
crmsh-5.0.0~rc2/test/features/corosync_ui.feature 2025-07-15 07:03:56.000000000 +0200 @@ -48,3 +48,19 @@ Then Expected "Duplicated" in stderr When Try "crm corosync link add hanode1=192.0.2.101 hanode2=192.0.2.102 options knet_link_priority=10" on "hanode1" Then Expected "not a configured interface address" in stderr + + Scenario: corosync set + Given Nodes ["hanode1", "hanode2"] are cleaned up + And Cluster service is "stopped" on "hanode1" + And Cluster service is "stopped" on "hanode2" + When Run "crm cluster init -y" on "hanode1" + Then Cluster service is "started" on "hanode1" + When Run "crm cluster join -c hanode1 -y" on "hanode2" + Then Cluster service is "started" on "hanode2" + And Online nodes are "hanode1 hanode2" + When Try "crm corosync set totem.version 3" on "hanode1" + Then Expected "parse error in config" in stderr + When Run "crm corosync get totem.version" on "hanode1" + Then Expected "2" in stdout + When Run "crm corosync set totem.token 6000" on "hanode1" + Then Expected "Use "crm corosync push" to sync" in stdout diff -Nru crmsh-5.0.0~rc1/test/features/crm_report_bugs.feature crmsh-5.0.0~rc2/test/features/crm_report_bugs.feature --- crmsh-5.0.0~rc1/test/features/crm_report_bugs.feature 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/features/crm_report_bugs.feature 2025-07-15 07:03:56.000000000 +0200 @@ -171,3 +171,11 @@ # found password Then Expected return code is "0" When Run "rm -rf report.tar.bz2 report" on "hanode1" + + @clean + Scenario: When no logfile in corosync.conf (bsc#1244515, bsc#1232821) + When Run "sed -i '/logfile:/d' /etc/corosync/corosync.conf" on "hanode1" + When Run "sed -i '/logfile:/d' /etc/corosync/corosync.conf" on "hanode2" + When Try "crm report report" on "hanode1" + # Should no exception here + Then Expected "TypeError:" not in stderr diff -Nru crmsh-5.0.0~rc1/test/features/crm_report_normal.feature crmsh-5.0.0~rc2/test/features/crm_report_normal.feature --- crmsh-5.0.0~rc1/test/features/crm_report_normal.feature 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/features/crm_report_normal.feature 2025-07-15 07:03:56.000000000 +0200 @@ -87,6 +87,10 @@ When Run "crm report -d /tmp/report" on "hanode1" Then Directory "/tmp/report" created + Then Directory "/tmp/report/hanode1/crm.conf.d/root/.config/crm" created + Then Directory "/tmp/report/hanode1/crm.conf.d/etc/crm" created + Then Directory "/tmp/report/hanode2/crm.conf.d/root/.config/crm" created + Then Directory "/tmp/report/hanode2/crm.conf.d/etc/crm" created When Try "crm report -d /tmp/report" on "hanode1" Then Expected "Destination directory /tmp/report exists, please cleanup or use -Z option" in stderr When Run "crm report -d -Z /tmp/report" on "hanode1" diff -Nru crmsh-5.0.0~rc1/test/features/pacemaker_remote.feature crmsh-5.0.0~rc2/test/features/pacemaker_remote.feature --- crmsh-5.0.0~rc1/test/features/pacemaker_remote.feature 1970-01-01 01:00:00.000000000 +0100 +++ crmsh-5.0.0~rc2/test/features/pacemaker_remote.feature 2025-07-15 07:03:56.000000000 +0200 @@ -0,0 +1,31 @@ +Feature: Test deployment of pacemaker remote + + Need nodes: hanode1 hanode2 pcmk-remote-node1 pcmk-remote-node2 + + Scenario: Setup a two nodes cluster with two pacemaker-remote nodes + Given Nodes ["hanode1", "hanode2"] are cleaned up + And Cluster service is "stopped" on "hanode1" + And Cluster service is "stopped" on "hanode2" + When Run "crm cluster init -y" on "hanode1" + Then Cluster service is "started" on "hanode1" + When Run "crm cluster join -c hanode1 -y" on "hanode2" + 
Then Cluster service is "started" on "hanode2" + And Online nodes are "hanode1 hanode2" + + When Run "scp -rp /etc/pacemaker pcmk-remote-node1:/etc" on "hanode1" + And Run "scp -rp /etc/pacemaker pcmk-remote-node2:/etc" on "hanode1" + And Run "systemctl start pacemaker_remote" on "pcmk-remote-node1" + And Run "systemctl start pacemaker_remote" on "pcmk-remote-node2" + And Run "crm configure primitive remote-node1 ocf:pacemaker:remote params server=pcmk-remote-node1 reconnect_interval=10m op monitor interval=30s" on "hanode1" + And Run "crm configure primitive remote-node2 ocf:pacemaker:remote params server=pcmk-remote-node2 reconnect_interval=10m op monitor interval=30s" on "hanode1" + And Wait "5" seconds + Then Remote online nodes are "remote-node1 remote-node2" + + Scenario: Prevent adding remote RA to group, order and colocation + When Run "crm configure primitive d Dummy" on "hanode1" + When Try "crm configure group g d remote-node1" + Then Expected "Cannot put remote resource 'remote-node1' in a group" in stderr + When Try "crm configure order o1 d remote-node1" + Then Expected "Cannot put remote resource 'remote-node1' in order constraint" in stderr + When Try "crm configure colocation c1 inf: d remote-node1" + Then Expected "Cannot put remote resource 'remote-node1' in colocation constraint" in stderr diff -Nru crmsh-5.0.0~rc1/test/features/steps/step_implementation.py crmsh-5.0.0~rc2/test/features/steps/step_implementation.py --- crmsh-5.0.0~rc1/test/features/steps/step_implementation.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/features/steps/step_implementation.py 2025-07-15 07:03:56.000000000 +0200 @@ -248,6 +248,13 @@ assert online(context, nodelist) is True +@then('Remote online nodes are "{nodelist}"') +def step_impl(context, nodelist): + _, out, _ = run_command(context, 'crm_mon -1|grep RemoteOnline:') + for node in nodelist.split(): + assert node in out, "Node {} not found in {}".format(node, out) + + @then('Node "{node}" is standby') def step_impl(context, node): assert crmutils.is_standby(node) is True diff -Nru crmsh-5.0.0~rc1/test/run-functional-tests crmsh-5.0.0~rc2/test/run-functional-tests --- crmsh-5.0.0~rc1/test/run-functional-tests 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/run-functional-tests 2025-07-15 07:03:56.000000000 +0200 @@ -204,7 +204,7 @@ podman_exec $node_name "echo 'StrictHostKeyChecking no' >> /etc/ssh/ssh_config" podman_exec $node_name "systemctl start sshd.service" - if [ "$node_name" != "qnetd-node" ];then + if [[ "$node_name" != "qnetd-node" && ! "$node_name" =~ ^pcmk-remote-node[0-9]$ ]];then podman cp $PROJECT_PATH $node_name:/opt/crmsh info "Building crmsh on \"$node_name\"..." podman_exec $node_name "$make_cmd" || \ @@ -246,7 +246,10 @@ config_cluster() { node_num=$# insert_str="" - container_ip_array=(`podman network inspect $HA_NETWORK_ARRAY -f '{{range .Containers}}{{printf "%s " .IPv4Address}}{{end}}'`) + for node in $*;do + ip=`podman container inspect $node -f "{{.NetworkSettings.Networks.$HA_NETWORK_ARRAY.IPAddress}}"|tr -d "\r"` + container_ip_array+=($ip) + done for i in $(seq $node_num -1 1);do ip=`echo ${container_ip_array[$((i-1))]}|awk -F/ '{print $1}'` @@ -320,7 +323,7 @@ cleanup_cluster() { info "Cleanup ha specific containers..." 
- podman ps --format json | jq -r '.[].Names[]' | grep -E 'hanode.*|qnetd-node' | xargs podman stop + podman ps --format json | jq -r '.[].Names[]' | grep -E 'hanode.*|qnetd-node|pcmk-remote-node[0-9]' | xargs podman stop info "Cleanup ha specific container networks..." exist_network_array=() for network in ${HA_NETWORK_ARRAY[@]};do diff -Nru crmsh-5.0.0~rc1/test/unittests/test_bootstrap.py crmsh-5.0.0~rc2/test/unittests/test_bootstrap.py --- crmsh-5.0.0~rc1/test/unittests/test_bootstrap.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/unittests/test_bootstrap.py 2025-07-15 07:03:56.000000000 +0200 @@ -181,7 +181,7 @@ ctx.cluster_is_running = True with self.assertRaises(ValueError): ctx._validate_sbd_option() - mock_check_all.assert_called_once_with() + mock_check_all.assert_called_once_with("setup SBD") mock_installed.assert_has_calls([ mock.call("sbd", "node1"), mock.call("sbd", "node2") @@ -950,9 +950,9 @@ mock.call('alice', 'node1'), mock.call('bob', 'node2'), ]) - mock_host_user_config.return_value.save_local.assert_called_once_with() + mock_host_user_config.return_value.save_local.assert_called() mock_ssh_copy_id.assert_called_once_with('carol', 'foo', 'node2', mock_local_shell.return_value) - mock_merge_ssh_authorized_keys.assert_called_once_with(['node1', 'node2']) + mock_merge_ssh_authorized_keys.assert_called_once_with(mock_cluster_shell.return_value, mock_user_of_host.instance.return_value, ['node3', 'node1', 'node2']) mock_change_user_shell.assert_called_once_with('hacluster', 'node2') mock_swap_public_ssh_key.assert_called_once_with('node2', 'hacluster', 'hacluster', 'carol', 'foo', mock_local_shell.return_value) mock_swap_key_for_hacluster.assert_called_once_with(['node1', 'node2']) @@ -1147,6 +1147,7 @@ mock_status.assert_not_called() mock_disable.assert_called_once_with("corosync-qdevice.service") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.bootstrap._select_user_pair_for_ssh_for_secondary_components') @mock.patch('crmsh.utils.HostUserConfig') @mock.patch('crmsh.user_of_host.UserOfHost.instance') @@ -1163,6 +1164,7 @@ mock_qdevice_configured, mock_confirm, mock_list_nodes, mock_user_of_host, mock_host_user_config_class, mock_select_user_pair_for_ssh, + mock_check_all_nodes ): mock_list_nodes.return_value = [] bootstrap._context = mock.Mock(qdevice_inst=self.qdevice_with_ip, current_user="bob") @@ -1186,7 +1188,9 @@ mock_qdevice_configured.assert_called_once_with() mock_confirm.assert_called_once_with("Qdevice is already configured - overwrite?") self.qdevice_with_ip.start_qdevice_service.assert_called_once_with() + mock_check_all_nodes.assert_called_once_with("setup Qdevice") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.bootstrap._select_user_pair_for_ssh_for_secondary_components') @mock.patch('crmsh.utils.HostUserConfig') @mock.patch('crmsh.user_of_host.UserOfHost.instance') @@ -1201,7 +1205,7 @@ @mock.patch('logging.Logger.info') def test_init_qdevice(self, mock_info, mock_local_shell, mock_ssh, mock_configure_ssh_key, mock_qdevice_configured, mock_this_node, mock_list_nodes, mock_adjust_priority, mock_adjust_fence_delay, - mock_user_of_host, mock_host_user_config_class, mock_select_user_pair_for_ssh): + mock_user_of_host, mock_host_user_config_class, mock_select_user_pair_for_ssh, mock_check_all_nodes): bootstrap._context = mock.Mock(qdevice_inst=self.qdevice_with_ip, current_user="bob") mock_this_node.return_value = "192.0.2.100" mock_list_nodes.return_value = [] @@ -1230,7 +1234,9 @@ 
self.qdevice_with_ip.set_cluster_name.assert_called_once_with() self.qdevice_with_ip.valid_qnetd.assert_called_once_with() self.qdevice_with_ip.config_and_start_qdevice.assert_called_once_with() + mock_check_all_nodes.assert_called_once_with("setup Qdevice") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.utils.fatal') @mock.patch('crmsh.utils.HostUserConfig') @mock.patch('crmsh.service_manager.ServiceManager.service_is_available') @@ -1241,6 +1247,7 @@ mock_info, mock_list_nodes, mock_available, mock_host_user_config_class, mock_fatal, + mock_check_all_nodes ): bootstrap._context = mock.Mock(qdevice_inst=self.qdevice_with_ip) mock_list_nodes.return_value = ["node1"] @@ -1255,6 +1262,7 @@ mock_fatal.assert_called_once_with("corosync-qdevice.service is not available on node1") mock_available.assert_called_once_with("corosync-qdevice.service", "node1") mock_info.assert_called_once_with("Configure Qdevice/Qnetd:") + mock_check_all_nodes.assert_called_once_with("setup Qdevice") @mock.patch('crmsh.bootstrap.prompt_for_string') def test_configure_qdevice_interactive_return(self, mock_prompt): @@ -1337,6 +1345,7 @@ mock_qdevice_configured.assert_called_once_with() mock_confirm.assert_called_once_with("Removing QDevice service and configuration from cluster: Are you sure?") + @mock.patch('crmsh.sh.cluster_shell') @mock.patch('crmsh.bootstrap.adjust_properties') @mock.patch('crmsh.bootstrap.sync_file') @mock.patch('crmsh.corosync.configure_two_node') @@ -1352,16 +1361,18 @@ @mock.patch('crmsh.corosync.is_qdevice_configured') def test_remove_qdevice_reload(self, mock_qdevice_configured, mock_confirm, mock_reachable, mock_evaluate, mock_status, mock_invoke, mock_status_long, mock_remove_config, mock_remove_db, - mock_remove_files, mock_config_two_node, mock_sync, mock_adjust_priority): + mock_remove_files, mock_config_two_node, mock_sync, mock_adjust_priority, mock_cluster_shell): mock_qdevice_configured.return_value = True mock_confirm.return_value = True mock_evaluate.return_value = qdevice.QdevicePolicy.QDEVICE_RELOAD + mock_cluster_shell_inst = mock.Mock() + mock_cluster_shell.return_value = mock_cluster_shell_inst bootstrap.remove_qdevice() mock_qdevice_configured.assert_called_once_with() mock_confirm.assert_called_once_with("Removing QDevice service and configuration from cluster: Are you sure?") - mock_reachable.assert_called_once_with() + mock_reachable.assert_called_once_with("removing QDevice from the cluster") mock_evaluate.assert_called_once_with(qdevice.QDEVICE_REMOVE) mock_status.assert_has_calls([ mock.call("Disable corosync-qdevice.service"), @@ -1370,11 +1381,11 @@ mock_invoke.assert_has_calls([ mock.call("crm cluster run 'systemctl disable corosync-qdevice'"), mock.call("crm cluster run 'systemctl stop corosync-qdevice'"), - mock.call("crm cluster run 'crm corosync reload'") ] ) mock_status_long.assert_called_once_with("Removing QDevice configuration from cluster") mock_remove_config.assert_called_once_with() mock_remove_db.assert_called_once_with() + mock_cluster_shell_inst.get_stdout_or_raise_error.assert_called_once_with("corosync-cfgtool -R") @mock.patch('crmsh.service_manager.ServiceManager.start_service') @mock.patch('crmsh.qdevice.QDevice') @@ -1696,6 +1707,7 @@ mock_prompt.assert_called_once_with("IP address or hostname of cluster node (e.g.: 192.168.1.1)", ".+") mock_error.assert_called_once_with("No existing IP/hostname specified (use -c option)") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.bootstrap.confirm') 
@mock.patch('crmsh.bootstrap.get_node_canonical_hostname') @mock.patch('crmsh.bootstrap.remove_qdevice') @@ -1704,7 +1716,7 @@ @mock.patch('crmsh.bootstrap.init') @mock.patch('crmsh.bootstrap.Context') def test_bootstrap_remove_no_confirm(self, mock_context, mock_init, mock_active, - mock_error, mock_qdevice, mock_hostname, mock_confirm): + mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_check_all_nodes): mock_context_inst = mock.Mock(cluster_node="node1", force=False, qdevice_rm_flag=None) mock_context.return_value = mock_context_inst mock_active.return_value = [True, True] @@ -1722,7 +1734,9 @@ mock_error.assert_not_called() mock_hostname.assert_called_once_with('node1') mock_confirm.assert_called_once_with('Removing node "node1" from the cluster: Are you sure?') + mock_check_all_nodes.assert_called_once_with("removing a node from the cluster") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.utils.this_node') @mock.patch('crmsh.bootstrap.confirm') @mock.patch('crmsh.bootstrap.get_node_canonical_hostname') @@ -1732,7 +1746,7 @@ @mock.patch('crmsh.bootstrap.init') @mock.patch('crmsh.bootstrap.Context') def test_bootstrap_remove_self_need_force(self, mock_context, mock_init, mock_active, - mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_this_node): + mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_this_node, mock_check_all_nodes): mock_context_inst = mock.Mock(cluster_node="node1", force=False, qdevice_rm_flag=None) mock_context.return_value = mock_context_inst mock_active.return_value = [True, True] @@ -1755,6 +1769,7 @@ mock_this_node.assert_called_once_with() mock_error.assert_called_once_with("Removing self requires --force") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.bootstrap.bootstrap_finished') @mock.patch('crmsh.sh.ClusterShell.get_stdout_or_raise_error') @mock.patch('crmsh.bootstrap.remove_self') @@ -1767,7 +1782,7 @@ @mock.patch('crmsh.bootstrap.init') @mock.patch('crmsh.bootstrap.Context') def test_bootstrap_remove_self(self, mock_context, mock_init, mock_active, - mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_this_node, mock_self, mock_run, mock_finished): + mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_this_node, mock_self, mock_run, mock_finished, mock_check_all_nodes): mock_context_inst = mock.Mock(cluster_node="node1", force=True, qdevice_rm_flag=None) mock_context.return_value = mock_context_inst mock_active.return_value = [True, True] @@ -1788,7 +1803,9 @@ mock_error.assert_not_called() mock_self.assert_called_once_with(True) mock_run.assert_called_once_with('rm -rf /var/lib/crmsh', 'node1') + mock_check_all_nodes.assert_called_once_with("removing a node from the cluster") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.xmlutil.listnodes') @mock.patch('crmsh.utils.this_node') @mock.patch('crmsh.bootstrap.confirm') @@ -1799,7 +1816,7 @@ @mock.patch('crmsh.bootstrap.init') @mock.patch('crmsh.bootstrap.Context') def test_bootstrap_remove_not_in_cluster(self, mock_context, mock_init, mock_active, - mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_this_node, mock_list): + mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_this_node, mock_list, mock_check_all_nodes): mock_context_inst = mock.Mock(cluster_node="node2", force=True, qdevice_rm_flag=None) mock_context.return_value = mock_context_inst mock_active.return_value = [True, True] @@ -1821,7 +1838,9 @@ mock_confirm.assert_not_called() 
mock_this_node.assert_called_once_with() mock_error.assert_called_once_with("Specified node node2 is not configured in cluster! Unable to remove.") + mock_check_all_nodes.assert_called_once_with("removing a node from the cluster") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.sh.ClusterShell.get_stdout_or_raise_error') @mock.patch('crmsh.utils.fetch_cluster_node_list_from_node') @mock.patch('crmsh.bootstrap.remove_node_from_cluster') @@ -1836,10 +1855,10 @@ @mock.patch('crmsh.bootstrap.Context') def test_bootstrap_remove(self, mock_context, mock_init, mock_active, mock_error, mock_qdevice, mock_hostname, mock_confirm, mock_this_node, - mock_list, mock_remove, mock_fetch, mock_run): + mock_list, mock_remove, mock_fetch, mock_run, mock_check_all_nodes): mock_context_inst = mock.Mock(cluster_node="node2", qdevice_rm_flag=None, force=True) mock_context.return_value = mock_context_inst - mock_active.side_effect = [True, False] + mock_active.side_effect = [True, False, True] mock_hostname.return_value = "node2" mock_this_node.return_value = "node1" mock_list.return_value = ["node1", "node2"] @@ -1850,7 +1869,8 @@ mock_init.assert_called_once_with() mock_active.assert_has_calls([ mock.call("corosync.service"), - mock.call("csync2.socket") + mock.call("csync2.socket"), + mock.call("pacemaker.service", "node2") ]) mock_qdevice.assert_not_called() mock_hostname.assert_called_once_with('node2') @@ -1858,6 +1878,7 @@ mock_error.assert_not_called() mock_remove.assert_called_once_with('node2') mock_run.assert_called_once_with('rm -rf /var/lib/crmsh', 'node2') + mock_check_all_nodes.assert_called_once_with("removing a node from the cluster") @mock.patch('crmsh.utils.fatal') @mock.patch('crmsh.sh.ClusterShell.get_rc_stdout_stderr_without_input') @@ -1983,6 +2004,8 @@ ]) mock_error.assert_called_once_with("Removing the node node1 from {} failed".format(bootstrap.CSYNC2_CFG)) + @mock.patch('crmsh.utils.HostUserConfig') + @mock.patch('crmsh.sh.cluster_shell') @mock.patch('crmsh.bootstrap.FirewallManager') @mock.patch.object(NodeMgmt, 'call_delnode') @mock.patch('crmsh.service_manager.ServiceManager.service_is_active') @@ -2000,7 +2023,7 @@ @mock.patch('crmsh.bootstrap.get_cluster_node_ip') def test_remove_node_from_cluster_hostname(self, mock_get_ip, mock_stop, mock_status, mock_invoke, mock_invokerc, mock_error, mock_get_values, mock_del, mock_csync2, - mock_adjust_priority, mock_adjust_fence_delay, mock_rm_conf_files, mock_is_active, mock_cal_delnode, mock_firewall): + mock_adjust_priority, mock_adjust_fence_delay, mock_rm_conf_files, mock_is_active, mock_cal_delnode, mock_firewall, mock_cluster_shell, mock_host_user_config): mock_get_ip.return_value = "10.10.10.1" mock_cal_delnode.return_value = True mock_invoke.side_effect = [(True, None, None)] @@ -2010,6 +2033,8 @@ mock_firewall_inst = mock.Mock() mock_firewall.return_value = mock_firewall_inst mock_firewall_inst.remove_service = mock.Mock() + mock_cluster_shell_inst = mock.Mock() + mock_cluster_shell.return_value = mock_cluster_shell_inst bootstrap._context = mock.Mock(cluster_node="node1", rm_list=["file1", "file2"]) bootstrap.remove_node_from_cluster('node1') @@ -2021,9 +2046,7 @@ ]) mock_stop.assert_called_once_with(bootstrap.SERVICES_STOP_LIST, remote_addr="node1") mock_cal_delnode.assert_called_once_with("node1") - mock_invoke.assert_has_calls([ - mock.call("corosync-cfgtool -R") - ]) + mock_cluster_shell_inst.get_stdout_or_raise_error.assert_called_once_with("corosync-cfgtool -R") 
mock_invokerc.assert_called_once_with("sed -i /node1/d {}".format(bootstrap.CSYNC2_CFG)) mock_error.assert_not_called() mock_get_values.assert_called_once_with("nodelist.node.ring0_addr") diff -Nru crmsh-5.0.0~rc1/test/unittests/test_corosync.py crmsh-5.0.0~rc2/test/unittests/test_corosync.py --- crmsh-5.0.0~rc1/test/unittests/test_corosync.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/unittests/test_corosync.py 2025-07-15 07:03:56.000000000 +0200 @@ -214,7 +214,7 @@ mock_run.side_effect = ValueError mock_get_value.return_value = None assert corosync.get_corosync_value("xxx") is None - mock_run.assert_called_once_with("corosync-cmapctl xxx") + mock_run.assert_called_once_with("corosync-cmapctl runtime.config.xxx") mock_get_value.assert_called_once_with("xxx") @@ -222,7 +222,7 @@ def test_get_corosync_value(mock_run): mock_run.return_value = "totem.token = 10000" assert corosync.get_corosync_value("totem.token") == "10000" - mock_run.assert_called_once_with("corosync-cmapctl totem.token") + mock_run.assert_called_once_with("corosync-cmapctl runtime.config.totem.token") class TestConfigParserSet(unittest.TestCase): diff -Nru crmsh-5.0.0~rc1/test/unittests/test_crashtest_task.py crmsh-5.0.0~rc2/test/unittests/test_crashtest_task.py --- crmsh-5.0.0~rc1/test/unittests/test_crashtest_task.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/unittests/test_crashtest_task.py 2025-07-15 07:03:56.000000000 +0200 @@ -203,14 +203,14 @@ """ @mock.patch('crmsh.crash_test.utils.msg_info') - @mock.patch('crmsh.crash_test.utils.now') - def setUp(self, mock_now, mock_msg_info): + def setUp(self, mock_msg_info): """ Test setUp. """ - mock_now.return_value = "2019/07/10 01:15:15" + fake_timedate = datetime(2019, 7, 10, 1, 15, 15) main.ctx = mock.Mock(task_list=[{"process_name": "xin", "age": 38}]) self.task_check_inst = task.TaskCheck("task check job1", quiet=False) + self.task_check_inst.timestamp = fake_timedate self.task_check_inst_quiet = task.TaskCheck("task check job1", quiet=True) def tearDown(self): @@ -559,15 +559,14 @@ """ @mock.patch('crmsh.crash_test.utils.msg_info') - @mock.patch('crmsh.crash_test.utils.now') - def setUp(self, mock_now, mock_info): + def setUp(self, mock_info): """ Test setUp. 
""" - mock_now.return_value = "2019/07/10 01:15:15" + fake_now = datetime(2019, 7, 10, 1, 15, 15) main.ctx = mock.Mock(task_list={"process_name": "xin", "age": 38}) self.task_inst = task.Task("task description", flush=True) - mock_now.assert_called_once_with() + self.task_inst.timestamp = fake_now def tearDown(self): """ @@ -647,7 +646,7 @@ def test_build_base_result(self): self.task_inst.build_base_result() expected_result = { - "Timestamp": self.task_inst.timestamp, + "Timestamp": '2019/07/10 01:15:15', "Description": self.task_inst.description, "Messages": [] } @@ -666,45 +665,33 @@ mock_ask.assert_called_once_with(task.Task.REBOOT_WARNING) self.task_inst.info.assert_called_once_with("Testcase cancelled") - @mock.patch('crmsh.crash_test.utils.str_to_datetime') @mock.patch('time.sleep') @mock.patch('crmsh.crash_test.task.Task.info') - @mock.patch('crmsh.sh.ShellUtils.get_stdout_stderr') - def test_fence_action_monitor(self, mock_run, mock_info, mock_sleep, mock_datetime): + @mock.patch('crmsh.xmlutil.CrmMonXmlParser') + def test_fence_action_monitor(self, mock_parser, mock_info, mock_sleep): self.task_inst.thread_stop_event = mock.Mock() - self.task_inst.thread_stop_event.is_set.side_effect = [False, False, False, False] + self.task_inst.thread_stop_event.is_set.side_effect = [False, False] self.task_inst.fence_start_event = mock.Mock() + self.task_inst.fence_start_event.is_set.side_effect = [False, True] self.task_inst.fence_finish_event = mock.Mock() - output = "Pending Fencing Actions:\n * reboot of 15sp2-2 pending: client=pacemaker-controld.2430, origin=15sp2-1" - output2 = "Node 15sp2-2 last fenced at: Tue Jan 19 16:08:37 2021" - mock_run.side_effect = [(1, None, None), (0, output, None), (1, None, None), (0, output2, None)] - self.task_inst.timestamp = "2021/01/19 16:08:24" - mock_datetime.side_effect = [ - datetime.strptime(self.task_inst.timestamp, '%Y/%m/%d %H:%M:%S'), - datetime.strptime("Tue Jan 19 16:08:37 2021", '%a %b %d %H:%M:%S %Y') + mock_parser_inst = mock.Mock() + mock_parser.return_value = mock_parser_inst + mock_parser_inst.get_last_fence_event_info.side_effect = [ + {"target": "node2", "origin": "node1", "status": "pending", "completed": ""}, + {"target": "node2", "origin": "node1", "status": "success", "completed": "2025-05-30 10:41:58.376958 +08:00"}, ] self.task_inst.fence_action_monitor() - self.task_inst.thread_stop_event.is_set.assert_has_calls([ - mock.call(), - mock.call(), - mock.call(), - mock.call() - ]) - mock_run.assert_has_calls([ - mock.call("crm_mon -1|grep -A1 \"Fencing Actions:\""), - mock.call("crm_mon -1|grep -A1 \"Fencing Actions:\""), - mock.call(config.FENCE_HISTORY.format(node="15sp2-2")), - mock.call(config.FENCE_HISTORY.format(node="15sp2-2")) - ]) + self.task_inst.thread_stop_event.is_set.assert_has_calls([mock.call(), mock.call()]) mock_info.assert_has_calls([ - mock.call("Node \"15sp2-2\" will be fenced by \"15sp2-1\"!"), - mock.call("Node \"15sp2-2\" was successfully fenced by \"15sp2-1\"") + mock.call("Node \"node2\" will be fenced by \"node1\"!"), + mock.call("Node \"node2\" was fenced by \"node1\" at 2025-05-30 10:41:58.376958 +08:00") ]) self.task_inst.fence_start_event.set.assert_called_once_with() self.task_inst.fence_finish_event.set.assert_called_once_with() + class TestFixSBD(TestCase): """ Class to test TaskFixSBD of task.py diff -Nru crmsh-5.0.0~rc1/test/unittests/test_qdevice.py crmsh-5.0.0~rc2/test/unittests/test_qdevice.py --- crmsh-5.0.0~rc1/test/unittests/test_qdevice.py 2025-05-22 11:11:04.000000000 +0200 +++ 
crmsh-5.0.0~rc2/test/unittests/test_qdevice.py 2025-07-15 07:03:56.000000000 +0200 @@ -842,15 +842,13 @@ @mock.patch('crmsh.sbd.SBDManager.update_sbd_configuration') @mock.patch('crmsh.sbd.SBDUtils.get_sbd_value_from_config') @mock.patch('crmsh.sbd.SBDUtils.is_using_diskless_sbd') - @mock.patch('crmsh.utils.check_all_nodes_reachable') - def test_adjust_sbd_watchdog_timeout_with_qdevice(self, mock_check_reachable, mock_using_diskless_sbd, mock_get_sbd_value, mock_update_config, mock_get_timeout, mock_set_property): + def test_adjust_sbd_watchdog_timeout_with_qdevice(self, mock_using_diskless_sbd, mock_get_sbd_value, mock_update_config, mock_get_timeout, mock_set_property): mock_using_diskless_sbd.return_value = True mock_get_sbd_value.return_value = "" mock_get_timeout.return_value = 100 self.qdevice_with_stage_cluster_name.adjust_sbd_watchdog_timeout_with_qdevice() - mock_check_reachable.assert_called_once_with() mock_using_diskless_sbd.assert_called_once_with() mock_get_sbd_value.assert_called_once_with("SBD_WATCHDOG_TIMEOUT") mock_update_config.assert_called_once_with({"SBD_WATCHDOG_TIMEOUT": str(sbd.SBDTimeout.SBD_WATCHDOG_TIMEOUT_DEFAULT_WITH_QDEVICE)}) diff -Nru crmsh-5.0.0~rc1/test/unittests/test_report_collect.py crmsh-5.0.0~rc2/test/unittests/test_report_collect.py --- crmsh-5.0.0~rc1/test/unittests/test_report_collect.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/unittests/test_report_collect.py 2025-07-15 07:03:56.000000000 +0200 @@ -13,7 +13,7 @@ def test_get_pcmk_log_no_config(self, mock_isfile, mock_warning): mock_isfile.side_effect = [False, False, False] res = collect.get_pcmk_log() - self.assertEqual(res, "") + self.assertIsNone(res) mock_isfile.assert_has_calls([ mock.call(constants.PCMKCONF), mock.call("/var/log/pacemaker/pacemaker.log"), @@ -74,7 +74,7 @@ def test_get_corosync_log_not_exist(self, mock_conf, mock_exists, mock_warning): mock_conf.return_value = "/etc/corosync/corosync.conf" mock_exists.return_value = False - self.assertEqual(collect.get_corosync_log(), "") + self.assertIsNone(collect.get_corosync_log()) @mock.patch('crmsh.corosync.get_value') @mock.patch('os.path.exists') diff -Nru crmsh-5.0.0~rc1/test/unittests/test_ui_sbd.py crmsh-5.0.0~rc2/test/unittests/test_ui_sbd.py --- crmsh-5.0.0~rc1/test/unittests/test_ui_sbd.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/unittests/test_ui_sbd.py 2025-07-15 07:03:56.000000000 +0200 @@ -535,9 +535,10 @@ mock_logger_error.assert_called_once_with('%s', "No device specified") mock_logger_info.assert_called_once_with("Usage: crm sbd device <add|remove> <device>...") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('logging.Logger.info') @mock.patch('crmsh.sbd.SBDUtils.is_using_disk_based_sbd') - def test_do_device_add(self, mock_is_using_disk_based_sbd, mock_logger_info): + def test_do_device_add(self, mock_is_using_disk_based_sbd, mock_logger_info, mock_check_all_nodes_reachable): mock_is_using_disk_based_sbd.return_value = True self.sbd_instance_diskbased.service_is_active = mock.Mock(return_value=True) self.sbd_instance_diskbased._load_attributes = mock.Mock() @@ -546,10 +547,12 @@ self.assertTrue(res) self.sbd_instance_diskbased._device_add.assert_called_once_with(["/dev/sda2", "/dev/sda3"]) mock_logger_info.assert_called_once_with("Configured sbd devices: %s", "/dev/sda1") + mock_check_all_nodes_reachable.assert_called_once_with("configuring SBD device") + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('logging.Logger.info') 
@mock.patch('crmsh.sbd.SBDUtils.is_using_disk_based_sbd') - def test_do_device_remove(self, mock_is_using_disk_based_sbd, mock_logger_info): + def test_do_device_remove(self, mock_is_using_disk_based_sbd, mock_logger_info, mock_check_all_nodes_reachable): mock_is_using_disk_based_sbd.return_value = True self.sbd_instance_diskbased.service_is_active = mock.Mock(return_value=True) self.sbd_instance_diskbased._load_attributes = mock.Mock() @@ -558,6 +561,7 @@ self.assertTrue(res) self.sbd_instance_diskbased._device_remove.assert_called_once_with(["/dev/sda1"]) mock_logger_info.assert_called_once_with("Configured sbd devices: %s", "/dev/sda1") + mock_check_all_nodes_reachable.assert_called_once_with("configuring SBD device") @mock.patch('crmsh.sbd.purge_sbd_from_cluster') def test_do_purge_no_service(self, mock_purge_sbd_from_cluster): @@ -567,8 +571,9 @@ self.assertFalse(res) mock_purge_sbd_from_cluster.assert_not_called() + @mock.patch('crmsh.utils.check_all_nodes_reachable') @mock.patch('crmsh.sbd.purge_sbd_from_cluster') - def test_do_purge(self, mock_purge_sbd_from_cluster): + def test_do_purge(self, mock_purge_sbd_from_cluster, mock_check_all_nodes_reachable): self.sbd_instance_diskbased._load_attributes = mock.Mock() self.sbd_instance_diskbased._service_is_active = mock.Mock(return_value=True) res = self.sbd_instance_diskbased.do_purge(mock.Mock()) @@ -577,6 +582,7 @@ self.sbd_instance_diskbased._load_attributes.assert_called_once() self.sbd_instance_diskbased._service_is_active.assert_called_once_with(constants.SBD_SERVICE) mock_purge_sbd_from_cluster.assert_called_once_with() + mock_check_all_nodes_reachable.assert_called_once_with("purging SBD") @mock.patch('crmsh.xmlutil.CrmMonXmlParser') def test_print_sbd_agent_status(self, mock_CrmMonXmlParser): diff -Nru crmsh-5.0.0~rc1/test/unittests/test_utils.py crmsh-5.0.0~rc2/test/unittests/test_utils.py --- crmsh-5.0.0~rc1/test/unittests/test_utils.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/unittests/test_utils.py 2025-07-15 07:03:56.000000000 +0200 @@ -971,12 +971,26 @@ @mock.patch('crmsh.utils.node_reachable_check') -@mock.patch('crmsh.sh.ClusterShell.get_stdout_or_raise_error') -def test_check_all_nodes_reachable(mock_run, mock_reachable): - mock_run.return_value = "1084783297 15sp2-1 member" - utils.check_all_nodes_reachable() - mock_run.assert_called_once_with("crm_node -l") - mock_reachable.assert_called_once_with("15sp2-1") +@mock.patch('crmsh.xmlutil.CrmMonXmlParser') +def test_check_all_nodes_reachable_dead_nodes(mock_xml, mock_reachable): + mock_xml_inst = mock.Mock() + mock_xml.return_value = mock_xml_inst + mock_xml_inst.get_node_list.side_effect = [["node1"], ["node2"]] + mock_reachable.side_effect = ValueError + + with pytest.raises(utils.DeadNodeError) as err: + utils.check_all_nodes_reachable("testing") + assert err.value.dead_nodes == ["node2"] + + +@mock.patch('crmsh.utils.node_reachable_check') +@mock.patch('crmsh.xmlutil.CrmMonXmlParser') +def test_check_all_nodes_reachable(mock_xml, mock_reachable): + mock_xml_inst = mock.Mock() + mock_xml.return_value = mock_xml_inst + mock_xml_inst.get_node_list.side_effect = [["node1"], []] + utils.check_all_nodes_reachable("testing") + mock_reachable.assert_called_once_with("node1") @mock.patch('crmsh.sh.ShellUtils.get_stdout_stderr') @@ -1425,3 +1439,73 @@ rc = rc1 | rc2 | rc3 assert bool(rc) is False assert utils.VerifyResult.NON_FATAL_ERROR in rc + + +@mock.patch('crmsh.utils.fatal')
+@mock.patch('crmsh.utils.get_address_list_from_corosync_conf') +@mock.patch('crmsh.utils.list_cluster_nodes') +def test_validate_and_get_reachable_nodes_cannot_get_member(mock_list_nodes, mock_get_address, mock_fatal): + mock_list_nodes.return_value = None + mock_get_address.return_value = None + mock_fatal.side_effect = ValueError + with pytest.raises(ValueError): + utils.validate_and_get_reachable_nodes([]) + mock_fatal.assert_called_once_with("Cannot get the member list of the cluster") + mock_get_address.assert_called_once_with() + + +@mock.patch('crmsh.utils.fatal') +@mock.patch('crmsh.utils.list_cluster_nodes') +def test_validate_and_get_reachable_nodes_not_a_member(mock_list_nodes, mock_fatal): + mock_list_nodes.return_value = ["node1", "node2"] + mock_fatal.side_effect = ValueError + with pytest.raises(ValueError): + utils.validate_and_get_reachable_nodes(["node3"]) + mock_fatal.assert_called_once_with("Node 'node3' is not a member of the cluster") + + +@mock.patch('crmsh.utils.this_node') +@mock.patch('crmsh.utils.list_cluster_nodes') +def test_validate_and_get_reachable_nodes_return_local(mock_list_nodes, mock_this_node): + mock_list_nodes.return_value = ["node1", "node2"] + mock_this_node.return_value = "node1" + res = utils.validate_and_get_reachable_nodes() + assert res == ["node1"] + + +@mock.patch('crmsh.utils.get_reachable_node_list') +@mock.patch('crmsh.utils.this_node') +@mock.patch('crmsh.utils.get_address_list_from_corosync_conf') +@mock.patch('crmsh.utils.list_cluster_nodes') +def test_validate_and_get_reachable_nodes_no_cib(mock_list_nodes, mock_get_address, mock_this_node, mock_get_reachable): + mock_list_nodes.return_value = None + mock_get_address.return_value = ["node1", "node2"] + mock_get_reachable.return_value = ["node1"] + mock_this_node.return_value = "node1" + res = utils.validate_and_get_reachable_nodes(all_nodes=True) + assert res == ["node1"] + + +@mock.patch('logging.Logger.error') +@mock.patch('crmsh.xmlutil.CrmMonXmlParser') +@mock.patch('crmsh.sh.cluster_shell') +@mock.patch('crmsh.utils.get_reachable_node_list') +@mock.patch('crmsh.utils.this_node') +@mock.patch('crmsh.utils.list_cluster_nodes') +def test_validate_and_get_reachable_nodes(mock_list_nodes, mock_this_node, mock_get_reachable, mock_shell, mock_xml, mock_error): + mock_list_nodes.return_value = ["node1", "node2"] + mock_get_reachable.return_value = ["node1", "node2"] + mock_this_node.return_value = "node2" + mock_shell_inst = mock.Mock() + mock_shell.return_value = mock_shell_inst + mock_shell_inst.get_stdout_or_raise_error.return_value = """ +node1(1): member + """ + mock_xml_inst = mock.Mock() + mock_xml.return_value = mock_xml_inst + mock_xml_inst.is_node_online.return_value = False + + res = utils.validate_and_get_reachable_nodes(all_nodes=True) + assert res == ["node2"] + + mock_error.assert_called_once_with("From the view of node '%s', node '%s' is not a member of the cluster", 'node1', 'node2') diff -Nru crmsh-5.0.0~rc1/test/unittests/test_xmlutil.py crmsh-5.0.0~rc2/test/unittests/test_xmlutil.py --- crmsh-5.0.0~rc1/test/unittests/test_xmlutil.py 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test/unittests/test_xmlutil.py 2025-07-15 07:03:56.000000000 +0200 @@ -41,8 +41,8 @@ assert self.parser_inst.is_node_online("tbw-2") is False def test_get_node_list(self): - assert self.parser_inst.get_node_list("standby") == ['tbw-1'] - assert self.parser_inst.get_node_list("online") == ['tbw-2'] + assert
self.parser_inst.get_node_list(standby=True) == ['tbw-1'] + assert self.parser_inst.get_node_list(online=False) == ['tbw-2'] def test_is_resource_configured(self): assert self.parser_inst.is_resource_configured("test") is False diff -Nru crmsh-5.0.0~rc1/test_container/Dockerfile crmsh-5.0.0~rc2/test_container/Dockerfile --- crmsh-5.0.0~rc1/test_container/Dockerfile 2025-05-22 11:11:04.000000000 +0200 +++ crmsh-5.0.0~rc2/test_container/Dockerfile 2025-07-15 07:03:56.000000000 +0200 @@ -7,7 +7,7 @@ firewalld iptables iptables-backend-nft \ make autoconf automake vim which libxslt-tools mailx iproute2 iputils bzip2 tar file glibc-locale-base dos2unix cpio gawk sudo \ python313 python313-pip python313-lxml python313-python-dateutil python313-build python313-PyYAML python313-curses python313-behave python313-coverage python313-packaging \ - csync2 corosync corosync-qdevice pacemaker booth corosync-qnetd + csync2 corosync corosync-qdevice pacemaker pacemaker-remote booth corosync-qnetd RUN ssh-keygen -t rsa -f /root/.ssh/id_rsa -N '' && \ cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys && \

