commit 6cd6a0ab9e47857a6d298feda4591680dcdae1d6
Merge: beee20a 4dc4e03
Author: Klaus Aehlig <[email protected]>
Date: Mon Jan 12 15:47:56 2015 +0100
Merge branch 'stable-2.13' into master
* stable-2.13
Also test that migration tags are identified correctly
Strictly adhere migration tags specification
Make usage of memory.memsw.limit_in_bytes optional
Allow for checking of individual cgroup parameter presence
Properly downgrade LXC parameters
Additional lxc_ prefix removal
Rename lxc_cgroup_use to extra_cgroups and improve docs
Rename lxc_tty to num_ttys
Rename lxc_startup_wait to startup_timeout
Remove the rapi-workload test suite
Consider upgrade case for renew-crypto
Redirect output of upgrade QA
Deal with nonexisting public key file
Fix readd wrt adding authorized keys
QA: Add more verification checks for SSH
Document '--verify-ssh-clutter' option
Make clutter test optional
ClusterVerify: check for clutter in 'authorized_keys'
Fix problems with node key adding and removing
Fix confusing messages during 'renew-crypto'
Renew the master node's SSH key
* stable-2.12
Look up RPC port
Add withDefaultOnIOError
Fix typo in gnt_cluster output
* stable-2.11
Also count your own vote
* stable-2.10
Fix typo in gnt_cluster output
Conflicts:
lib/cmdlib/cluster/verify.py: trivial
tools/cfgupgrade: use master version
Signed-off-by: Klaus Aehlig <[email protected]>
diff --cc lib/cmdlib/cluster/verify.py
index 7ce8c5a,0000000..c666d11
mode 100644,000000..100644
--- a/lib/cmdlib/cluster/verify.py
+++ b/lib/cmdlib/cluster/verify.py
@@@ -1,2190 -1,0 +1,2194 @@@
+#
+#
+
+# Copyright (C) 2014 Google Inc.
+# All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions are
+# met:
+#
+# 1. Redistributions of source code must retain the above copyright notice,
+# this list of conditions and the following disclaimer.
+#
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
+# IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+# TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR
+# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+"""Logical units for cluster verification."""
+
+import itertools
+import logging
+import operator
+import re
+import time
+import ganeti.masterd.instance
+import ganeti.rpc.node as rpc
+
+from ganeti import compat
+from ganeti import constants
+from ganeti import errors
+from ganeti import locking
+from ganeti import pathutils
+from ganeti import utils
+from ganeti import vcluster
+from ganeti import hypervisor
+from ganeti import opcodes
+
+from ganeti.cmdlib.base import LogicalUnit, NoHooksLU, ResultWithJobs
+from ganeti.cmdlib.common import ShareAll, ComputeAncillaryFiles, \
+ CheckNodePVs, ComputeIPolicyInstanceViolation, AnnotateDiskParams, \
+ SupportsOob
+
+
+def _GetAllHypervisorParameters(cluster, instances):
+ """Compute the set of all hypervisor parameters.
+
+ @type cluster: L{objects.Cluster}
+ @param cluster: the cluster object
+ @type instances: list of L{objects.Instance}
+ @param instances: additional instances from which to obtain parameters
+ @rtype: list of (origin, hypervisor, parameters)
+ @return: a list with all parameters found, indicating the hypervisor they
+ apply to, and the origin (can be "cluster", "os X", or "instance Y")
+
+ """
+ hvp_data = []
+
+ for hv_name in cluster.enabled_hypervisors:
+ hvp_data.append(("cluster", hv_name, cluster.GetHVDefaults(hv_name)))
+
+ for os_name, os_hvp in cluster.os_hvp.items():
+ for hv_name, hv_params in os_hvp.items():
+ if hv_params:
+ full_params = cluster.GetHVDefaults(hv_name, os_name=os_name)
+ hvp_data.append(("os %s" % os_name, hv_name, full_params))
+
+ # TODO: collapse identical parameter values in a single one
+ for instance in instances:
+ if instance.hvparams:
+ hvp_data.append(("instance %s" % instance.name, instance.hypervisor,
+ cluster.FillHV(instance)))
+
+ return hvp_data
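# Editorial sketch (not part of the diff): the shape of the list built by
# _GetAllHypervisorParameters, a flat sequence of (origin, hypervisor,
# parameters) triples. All names and values below are illustrative, not
# taken from a real cluster configuration.
hvp_data = [
    ("cluster", "kvm", {"kernel_path": "/boot/vmlinuz"}),
    ("os debian-image", "kvm", {"kernel_path": "/boot/vmlinuz-os"}),
    ("instance web1", "kvm", {"kernel_path": "/boot/vmlinuz-inst"}),
]
for origin, hv_name, hv_params in hvp_data:
    print("%s/%s: %d parameter(s)" % (origin, hv_name, len(hv_params)))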
+
+
+class _VerifyErrors(object):
+ """Mix-in for cluster/group verify LUs.
+
+ It provides _Error and _ErrorIf, and updates the self.bad boolean. (Expects
+ self.op and self._feedback_fn to be available.)
+
+ """
+
+ ETYPE_FIELD = "code"
+ ETYPE_ERROR = constants.CV_ERROR
+ ETYPE_WARNING = constants.CV_WARNING
+
+ def _Error(self, ecode, item, msg, *args, **kwargs):
+ """Format an error message.
+
+ Based on the opcode's error_codes parameter, either format a
+ parseable error code, or a simpler error string.
+
+ This must be called only from Exec and functions called from Exec.
+
+ """
+ ltype = kwargs.get(self.ETYPE_FIELD, self.ETYPE_ERROR)
+ itype, etxt, _ = ecode
+ # If the error code is in the list of ignored errors, demote the error to a
+ # warning
+ if etxt in self.op.ignore_errors: # pylint: disable=E1101
+ ltype = self.ETYPE_WARNING
+ # first complete the msg
+ if args:
+ msg = msg % args
+ # then format the whole message
+ if self.op.error_codes: # This is a mix-in. pylint: disable=E1101
+ msg = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
+ else:
+ if item:
+ item = " " + item
+ else:
+ item = ""
+ msg = "%s: %s%s: %s" % (ltype, itype, item, msg)
+ # and finally report it via the feedback_fn
+ self._feedback_fn(" - %s" % msg) # Mix-in. pylint: disable=E1101
+ # do not mark the operation as failed for WARN cases only
+ if ltype == self.ETYPE_ERROR:
+ self.bad = True
+
+ def _ErrorIf(self, cond, *args, **kwargs):
+ """Log an error message if the passed condition is True.
+
+ """
+ if (bool(cond)
+ or self.op.debug_simulate_errors): # pylint: disable=E1101
+ self._Error(*args, **kwargs)
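# Editorial sketch (not part of the diff): the two message formats _Error
# emits, shown with made-up values. With op.error_codes set the message is
# machine-parseable and colon-separated, otherwise human-readable.
ltype, etxt, itype, item, msg = ("ERROR", "ECLUSTERCFG", "cluster", "",
                                 "duplicate node")
parseable = "%s:%s:%s:%s:%s" % (ltype, etxt, itype, item, msg)
readable = "%s: %s%s: %s" % (ltype, itype, item and " " + item, msg)
assert parseable == "ERROR:ECLUSTERCFG:cluster::duplicate node"
assert readable == "ERROR: cluster: duplicate node"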
+
+
+class LUClusterVerify(NoHooksLU):
+ """Submits all jobs necessary to verify the cluster.
+
+ """
+ REQ_BGL = False
+
+ def ExpandNames(self):
+ self.needed_locks = {}
+
+ def Exec(self, feedback_fn):
+ jobs = []
+
+ if self.op.group_name:
+ groups = [self.op.group_name]
+ depends_fn = lambda: None
+ else:
+ groups = self.cfg.GetNodeGroupList()
+
+ # Verify global configuration
+ jobs.append([
+ opcodes.OpClusterVerifyConfig(ignore_errors=self.op.ignore_errors),
+ ])
+
+ # Always depend on global verification
+ depends_fn = lambda: [(-len(jobs), [])]
+
+ jobs.extend(
+ [opcodes.OpClusterVerifyGroup(group_name=group,
+ ignore_errors=self.op.ignore_errors,
- depends=depends_fn())]
++ depends=depends_fn(),
++ verify_clutter=self.op.verify_clutter)]
+ for group in groups)
+
+ # Fix up all parameters
+ for op in itertools.chain(*jobs): # pylint: disable=W0142
+ op.debug_simulate_errors = self.op.debug_simulate_errors
+ op.verbose = self.op.verbose
+ op.error_codes = self.op.error_codes
+ try:
+ op.skip_checks = self.op.skip_checks
+ except AttributeError:
+ assert not isinstance(op, opcodes.OpClusterVerifyGroup)
+
+ return ResultWithJobs(jobs)
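# Editorial sketch (not part of the diff): why depends_fn is a callable and
# not a fixed value. jobs.extend() consumes the generator lazily, so
# depends_fn() is re-evaluated as each group job is appended; -len(jobs)
# then always counts back to the first job of this submission, the global
# OpClusterVerifyConfig. (The empty list is Ganeti's set of required end
# states.) Job names below are stand-ins.
jobs = [["config-verify"]]
depends_fn = lambda: [(-len(jobs), [])]
deps = []
for group in ["g1", "g2", "g3"]:
    deps.append(depends_fn())
    jobs.append(["group-verify-%s" % group])
assert deps == [[(-1, [])], [(-2, [])], [(-3, [])]]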
+
+
+class LUClusterVerifyDisks(NoHooksLU):
+ """Verifies the cluster disks status.
+
+ """
+ REQ_BGL = False
+
+ def ExpandNames(self):
+ self.share_locks = ShareAll()
+ self.needed_locks = {
+ locking.LEVEL_NODEGROUP: locking.ALL_SET,
+ }
+
+ def Exec(self, feedback_fn):
+ group_names = self.owned_locks(locking.LEVEL_NODEGROUP)
+
+ # Submit one instance of L{opcodes.OpGroupVerifyDisks} per node group
+ return ResultWithJobs([[opcodes.OpGroupVerifyDisks(group_name=group)]
+ for group in group_names])
+
+
+class LUClusterVerifyConfig(NoHooksLU, _VerifyErrors):
+ """Verifies the cluster config.
+
+ """
+ REQ_BGL = False
+
+ def _VerifyHVP(self, hvp_data):
+ """Verifies locally the syntax of the hypervisor parameters.
+
+ """
+ for item, hv_name, hv_params in hvp_data:
+ msg = ("hypervisor %s parameters syntax check (source %s): %%s" %
+ (item, hv_name))
+ try:
+ hv_class = hypervisor.GetHypervisorClass(hv_name)
+ utils.ForceDictType(hv_params, constants.HVS_PARAMETER_TYPES)
+ hv_class.CheckParameterSyntax(hv_params)
+ except errors.GenericError, err:
+ self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg % str(err))
+
+ def ExpandNames(self):
+ self.needed_locks = dict.fromkeys(locking.LEVELS, locking.ALL_SET)
+ self.share_locks = ShareAll()
+
+ def CheckPrereq(self):
+ """Check prerequisites.
+
+ """
+ # Retrieve all information
+ self.all_group_info = self.cfg.GetAllNodeGroupsInfo()
+ self.all_node_info = self.cfg.GetAllNodesInfo()
+ self.all_inst_info = self.cfg.GetAllInstancesInfo()
+
+ def Exec(self, feedback_fn):
+ """Verify integrity of cluster, performing various test on nodes.
+
+ """
+ self.bad = False
+ self._feedback_fn = feedback_fn
+
+ feedback_fn("* Verifying cluster config")
+
+ for msg in self.cfg.VerifyConfig():
+ self._ErrorIf(True, constants.CV_ECLUSTERCFG, None, msg)
+
+ feedback_fn("* Verifying cluster certificate files")
+
+ for cert_filename in pathutils.ALL_CERT_FILES:
+ (errcode, msg) = utils.VerifyCertificate(cert_filename)
+ self._ErrorIf(errcode, constants.CV_ECLUSTERCERT, None, msg,
+ code=errcode)
+
+ self._ErrorIf(not utils.CanRead(constants.LUXID_USER,
+ pathutils.NODED_CERT_FILE),
+ constants.CV_ECLUSTERCERT,
+ None,
+ pathutils.NODED_CERT_FILE + " must be accessible by the " +
+ constants.LUXID_USER + " user")
+
+ feedback_fn("* Verifying hypervisor parameters")
+
+ self._VerifyHVP(_GetAllHypervisorParameters(self.cfg.GetClusterInfo(),
+ self.all_inst_info.values()))
+
+ feedback_fn("* Verifying all nodes belong to an existing group")
+
+ # We do this verification here because, should this bogus circumstance
+ # occur, it would never be caught by VerifyGroup, which only acts on
+ # nodes/instances reachable from existing node groups.
+
+ dangling_nodes = set(node for node in self.all_node_info.values()
+ if node.group not in self.all_group_info)
+
+ dangling_instances = {}
+ no_node_instances = []
+
+ for inst in self.all_inst_info.values():
+ if inst.primary_node in [node.uuid for node in dangling_nodes]:
+ dangling_instances.setdefault(inst.primary_node, []).append(inst)
+ elif inst.primary_node not in self.all_node_info:
+ no_node_instances.append(inst)
+
+ pretty_dangling = [
+ "%s (%s)" %
+ (node.name,
+ utils.CommaJoin(inst.name for
+ inst in dangling_instances.get(node.uuid, [])))
+ for node in dangling_nodes]
+
+ self._ErrorIf(bool(dangling_nodes), constants.CV_ECLUSTERDANGLINGNODES,
+ None,
+ "the following nodes (and their instances) belong to a non"
+ " existing group: %s", utils.CommaJoin(pretty_dangling))
+
+ self._ErrorIf(bool(no_node_instances), constants.CV_ECLUSTERDANGLINGINST,
+ None,
+ "the following instances have a non-existing primary-node:"
+ " %s", utils.CommaJoin(inst.name for
+ inst in no_node_instances))
+
+ return not self.bad
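# Editorial sketch (not part of the diff): the dangling-entity detection of
# Exec above, reduced to plain dicts. Group, node, and instance names are
# made up. A node is dangling when its group is unknown; an instance is
# "no-node" when its primary node is not in the configuration at all.
groups = set(["g1"])
nodes = {"n1": "g1", "n2": "ghost-group"}          # node -> group
instances = {"i1": "n1", "i2": "n2", "i3": "n9"}   # instance -> primary node
dangling_nodes = set(n for n, g in nodes.items() if g not in groups)
no_node_instances = [i for i, p in instances.items() if p not in nodes]
assert dangling_nodes == set(["n2"]) and no_node_instances == ["i3"]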
+
+
+class LUClusterVerifyGroup(LogicalUnit, _VerifyErrors):
+ """Verifies the status of a node group.
+
+ """
+ HPATH = "cluster-verify"
+ HTYPE = constants.HTYPE_CLUSTER
+ REQ_BGL = False
+
+ _HOOKS_INDENT_RE = re.compile("^", re.M)
+
+ class NodeImage(object):
+ """A class representing the logical and physical status of a node.
+
+ @type uuid: string
+ @ivar uuid: the node UUID to which this object refers
+ @ivar volumes: a structure as returned from
+ L{ganeti.backend.GetVolumeList} (runtime)
+ @ivar instances: a list of running instances (runtime)
+ @ivar pinst: list of configured primary instances (config)
+ @ivar sinst: list of configured secondary instances (config)
+ @ivar sbp: dictionary of {primary-node: list of instances} for all
+ instances for which this node is secondary (config)
+ @ivar mfree: free memory, as reported by hypervisor (runtime)
+ @ivar dfree: free disk, as reported by the node (runtime)
+ @ivar offline: the offline status (config)
+ @type rpc_fail: boolean
+ @ivar rpc_fail: whether the RPC verify call failed (overall,
+ not whether the individual keys were correct) (runtime)
+ @type lvm_fail: boolean
+ @ivar lvm_fail: whether the RPC call didn't return valid LVM data
+ @type hyp_fail: boolean
+ @ivar hyp_fail: whether the RPC call didn't return the instance list
+ @type ghost: boolean
+ @ivar ghost: whether this is a known node or not (config)
+ @type os_fail: boolean
+ @ivar os_fail: whether the RPC call didn't return valid OS data
+ @type oslist: list
+ @ivar oslist: list of OSes as diagnosed by DiagnoseOS
+ @type vm_capable: boolean
+ @ivar vm_capable: whether the node can host instances
+ @type pv_min: float
+ @ivar pv_min: size in MiB of the smallest PVs
+ @type pv_max: float
+ @ivar pv_max: size in MiB of the biggest PVs
+
+ """
+ def __init__(self, offline=False, uuid=None, vm_capable=True):
+ self.uuid = uuid
+ self.volumes = {}
+ self.instances = []
+ self.pinst = []
+ self.sinst = []
+ self.sbp = {}
+ self.mfree = 0
+ self.dfree = 0
+ self.offline = offline
+ self.vm_capable = vm_capable
+ self.rpc_fail = False
+ self.lvm_fail = False
+ self.hyp_fail = False
+ self.ghost = False
+ self.os_fail = False
+ self.oslist = {}
+ self.pv_min = None
+ self.pv_max = None
+
+ def ExpandNames(self):
+ # This raises errors.OpPrereqError on its own:
+ self.group_uuid = self.cfg.LookupNodeGroup(self.op.group_name)
+
+ # Get instances in node group; this is unsafe and needs verification later
+ inst_uuids = \
+ self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
+
+ self.needed_locks = {
+ locking.LEVEL_INSTANCE: self.cfg.GetInstanceNames(inst_uuids),
+ locking.LEVEL_NODEGROUP: [self.group_uuid],
+ locking.LEVEL_NODE: [],
+ }
+
+ self.share_locks = ShareAll()
+
+ def DeclareLocks(self, level):
+ if level == locking.LEVEL_NODE:
+ # Get members of node group; this is unsafe and needs verification later
+ nodes = set(self.cfg.GetNodeGroup(self.group_uuid).members)
+
+ # In Exec(), we warn about mirrored instances that have primary and
+ # secondary living in separate node groups. To fully verify that
+ # volumes for these instances are healthy, we will need to do an
+ # extra call to their secondaries. We ensure here those nodes will
+ # be locked.
+ for inst_name in self.owned_locks(locking.LEVEL_INSTANCE):
+ # Important: access only the instances whose lock is owned
+ instance = self.cfg.GetInstanceInfoByName(inst_name)
+ disks = self.cfg.GetInstanceDisks(instance.uuid)
+ if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
+ nodes.update(self.cfg.GetInstanceSecondaryNodes(instance.uuid))
+
+ self.needed_locks[locking.LEVEL_NODE] = nodes
+
+ def CheckPrereq(self):
+ assert self.group_uuid in self.owned_locks(locking.LEVEL_NODEGROUP)
+ self.group_info = self.cfg.GetNodeGroup(self.group_uuid)
+
+ group_node_uuids = set(self.group_info.members)
+ group_inst_uuids = \
+ self.cfg.GetNodeGroupInstances(self.group_uuid, primary_only=True)
+
+ unlocked_node_uuids = \
+ group_node_uuids.difference(self.owned_locks(locking.LEVEL_NODE))
+
+ unlocked_inst_uuids = \
+ group_inst_uuids.difference(
+ [self.cfg.GetInstanceInfoByName(name).uuid
+ for name in self.owned_locks(locking.LEVEL_INSTANCE)])
+
+ if unlocked_node_uuids:
+ raise errors.OpPrereqError(
+ "Missing lock for nodes: %s" %
+ utils.CommaJoin(self.cfg.GetNodeNames(unlocked_node_uuids)),
+ errors.ECODE_STATE)
+
+ if unlocked_inst_uuids:
+ raise errors.OpPrereqError(
+ "Missing lock for instances: %s" %
+ utils.CommaJoin(self.cfg.GetInstanceNames(unlocked_inst_uuids)),
+ errors.ECODE_STATE)
+
+ self.all_node_info = self.cfg.GetAllNodesInfo()
+ self.all_inst_info = self.cfg.GetAllInstancesInfo()
+ self.all_disks_info = self.cfg.GetAllDisksInfo()
+
+ self.my_node_uuids = group_node_uuids
+ self.my_node_info = dict((node_uuid, self.all_node_info[node_uuid])
+ for node_uuid in group_node_uuids)
+
+ self.my_inst_uuids = group_inst_uuids
+ self.my_inst_info = dict((inst_uuid, self.all_inst_info[inst_uuid])
+ for inst_uuid in group_inst_uuids)
+
+ # We detect here the nodes that will need the extra RPC calls for verifying
+ # split LV volumes; they should be locked.
+ extra_lv_nodes = set()
+
+ for inst in self.my_inst_info.values():
+ disks = self.cfg.GetInstanceDisks(inst.uuid)
+ if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
+ inst_nodes = self.cfg.GetInstanceNodes(inst.uuid)
+ for nuuid in inst_nodes:
+ if self.all_node_info[nuuid].group != self.group_uuid:
+ extra_lv_nodes.add(nuuid)
+
+ unlocked_lv_nodes = \
+ extra_lv_nodes.difference(self.owned_locks(locking.LEVEL_NODE))
+
+ if unlocked_lv_nodes:
+ raise errors.OpPrereqError("Missing node locks for LV check: %s" %
+ utils.CommaJoin(unlocked_lv_nodes),
+ errors.ECODE_STATE)
+ self.extra_lv_nodes = list(extra_lv_nodes)
+
+ def _VerifyNode(self, ninfo, nresult):
+ """Perform some basic validation on data returned from a node.
+
+ - check the result data structure is well formed and has all the
+ mandatory fields
+ - check ganeti version
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the results from the node
+ @rtype: boolean
+ @return: whether overall this call was successful (and we can expect
+ reasonable values in the response)
+
+ """
+ # main result, nresult should be a non-empty dict
+ test = not nresult or not isinstance(nresult, dict)
+ self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
+ "unable to verify node: no data returned")
+ if test:
+ return False
+
+ # compares ganeti version
+ local_version = constants.PROTOCOL_VERSION
+ remote_version = nresult.get("version", None)
+ test = not (remote_version and
+ isinstance(remote_version, (list, tuple)) and
+ len(remote_version) == 2)
+ self._ErrorIf(test, constants.CV_ENODERPC, ninfo.name,
+ "connection to node returned invalid data")
+ if test:
+ return False
+
+ test = local_version != remote_version[0]
+ self._ErrorIf(test, constants.CV_ENODEVERSION, ninfo.name,
+ "incompatible protocol versions: master %s,"
+ " node %s", local_version, remote_version[0])
+ if test:
+ return False
+
+ # node seems compatible, we can actually try to look into its results
+
+ # full package version
+ self._ErrorIf(constants.RELEASE_VERSION != remote_version[1],
+ constants.CV_ENODEVERSION, ninfo.name,
+ "software version mismatch: master %s, node %s",
+ constants.RELEASE_VERSION, remote_version[1],
+ code=self.ETYPE_WARNING)
+
+ hyp_result = nresult.get(constants.NV_HYPERVISOR, None)
+ if ninfo.vm_capable and isinstance(hyp_result, dict):
+ for hv_name, hv_result in hyp_result.iteritems():
+ test = hv_result is not None
+ self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
+ "hypervisor %s verify failure: '%s'", hv_name,
hv_result)
+
+ hvp_result = nresult.get(constants.NV_HVPARAMS, None)
+ if ninfo.vm_capable and isinstance(hvp_result, list):
+ for item, hv_name, hv_result in hvp_result:
+ self._ErrorIf(True, constants.CV_ENODEHV, ninfo.name,
+ "hypervisor %s parameter verify failure (source %s):
%s",
+ hv_name, item, hv_result)
+
+ test = nresult.get(constants.NV_NODESETUP,
+ ["Missing NODESETUP results"])
+ self._ErrorIf(test, constants.CV_ENODESETUP, ninfo.name,
+ "node setup error: %s", "; ".join(test))
+
+ return True
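# Editorial sketch (not part of the diff): the version handshake performed
# by _VerifyNode. A node reports a (protocol, release) pair; a protocol
# mismatch aborts the node's verification, a release mismatch is only
# warned about. The version values below are invented.
local_protocol, local_release = (2130000, "2.13.0")
remote_version = (2130000, "2.13.1")
assert remote_version[0] == local_protocol      # hard requirement
if remote_version[1] != local_release:
    print("software version mismatch (warning only)")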
+
+ def _VerifyNodeTime(self, ninfo, nresult,
+ nvinfo_starttime, nvinfo_endtime):
+ """Check the node time.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param nvinfo_starttime: the start time of the RPC call
+ @param nvinfo_endtime: the end time of the RPC call
+
+ """
+ ntime = nresult.get(constants.NV_TIME, None)
+ try:
+ ntime_merged = utils.MergeTime(ntime)
+ except (ValueError, TypeError):
+ self._ErrorIf(True, constants.CV_ENODETIME, ninfo.name,
+ "Node returned invalid time")
+ return
+
+ if ntime_merged < (nvinfo_starttime - constants.NODE_MAX_CLOCK_SKEW):
+ ntime_diff = "%.01fs" % abs(nvinfo_starttime - ntime_merged)
+ elif ntime_merged > (nvinfo_endtime + constants.NODE_MAX_CLOCK_SKEW):
+ ntime_diff = "%.01fs" % abs(ntime_merged - nvinfo_endtime)
+ else:
+ ntime_diff = None
+
+ self._ErrorIf(ntime_diff is not None, constants.CV_ENODETIME, ninfo.name,
+ "Node time diverges by at least %s from master node time",
+ ntime_diff)
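# Editorial sketch (not part of the diff): the clock-skew window used
# above. A node time is accepted anywhere inside
# [rpc_start - MAX_SKEW, rpc_end + MAX_SKEW], so the RPC duration itself
# never produces false positives. NODE_MAX_CLOCK_SKEW is a Ganeti constant;
# 150 seconds is assumed here.
MAX_SKEW = 150.0
def skew(node_time, rpc_start, rpc_end):
    if node_time < rpc_start - MAX_SKEW:
        return rpc_start - node_time
    if node_time > rpc_end + MAX_SKEW:
        return node_time - rpc_end
    return None  # within tolerance
assert skew(1000.0, 1100.0, 1101.0) is None     # 100s behind: tolerated
assert skew(1000.0, 1200.0, 1201.0) == 200.0    # 200s behind: flagged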
+
+ def _UpdateVerifyNodeLVM(self, ninfo, nresult, vg_name, nimg):
+ """Check the node LVM results and update info for cross-node checks.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param vg_name: the configured VG name
+ @type nimg: L{NodeImage}
+ @param nimg: node image
+
+ """
+ if vg_name is None:
+ return
+
+ # checks vg existence and size > 20G
+ vglist = nresult.get(constants.NV_VGLIST, None)
+ test = not vglist
+ self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
+ "unable to check volume groups")
+ if not test:
+ vgstatus = utils.CheckVolumeGroupSize(vglist, vg_name,
+ constants.MIN_VG_SIZE)
+ self._ErrorIf(vgstatus, constants.CV_ENODELVM, ninfo.name, vgstatus)
+
+ # Check PVs
+ (errmsgs, pvminmax) = CheckNodePVs(nresult, self._exclusive_storage)
+ for em in errmsgs:
+ self._Error(constants.CV_ENODELVM, ninfo.name, em)
+ if pvminmax is not None:
+ (nimg.pv_min, nimg.pv_max) = pvminmax
+
+ def _VerifyGroupDRBDVersion(self, node_verify_infos):
+ """Check cross-node DRBD version consistency.
+
+ @type node_verify_infos: dict
+ @param node_verify_infos: infos about nodes as returned from the
+ node_verify call.
+
+ """
+ node_versions = {}
+ for node_uuid, ndata in node_verify_infos.items():
+ nresult = ndata.payload
+ if nresult:
+ version = nresult.get(constants.NV_DRBDVERSION, None)
+ if version:
+ node_versions[node_uuid] = version
+
+ if len(set(node_versions.values())) > 1:
+ for node_uuid, version in sorted(node_versions.items()):
+ msg = "DRBD version mismatch: %s" % version
+ self._Error(constants.CV_ENODEDRBDHELPER, node_uuid, msg,
+ code=self.ETYPE_WARNING)
+
+ def _VerifyGroupLVM(self, node_image, vg_name):
+ """Check cross-node consistency in LVM.
+
+ @type node_image: dict
+ @param node_image: info about nodes, mapping from node to names to
+ L{NodeImage} objects
+ @param vg_name: the configured VG name
+
+ """
+ if vg_name is None:
+ return
+
+ # Only exclusive storage needs this kind of checks
+ if not self._exclusive_storage:
+ return
+
+ # exclusive_storage wants all PVs to have the same size (approximately),
+ # if the smallest and the biggest ones are okay, everything is fine.
+ # pv_min is None iff pv_max is None
+ vals = filter((lambda ni: ni.pv_min is not None), node_image.values())
+ if not vals:
+ return
+ (pvmin, minnode_uuid) = min((ni.pv_min, ni.uuid) for ni in vals)
+ (pvmax, maxnode_uuid) = max((ni.pv_max, ni.uuid) for ni in vals)
+ bad = utils.LvmExclusiveTestBadPvSizes(pvmin, pvmax)
+ self._ErrorIf(bad, constants.CV_EGROUPDIFFERENTPVSIZE, self.group_info.name,
+ "PV sizes differ too much in the group; smallest (%s MB) is"
+ " on %s, biggest (%s MB) is on %s",
+ pvmin, self.cfg.GetNodeName(minnode_uuid),
+ pvmax, self.cfg.GetNodeName(maxnode_uuid))
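# Editorial sketch (not part of the diff): the min()/max() calls above lean
# on tuple comparison, so the owning node uuid travels with the extreme PV
# size. Sizes (MiB) and node names are illustrative.
vals = [(1024.0, "node-a"), (1020.0, "node-b"), (4096.0, "node-c")]
(pvmin, minnode) = min(vals)
(pvmax, maxnode) = max(vals)
assert (pvmin, minnode) == (1020.0, "node-b")
assert (pvmax, maxnode) == (4096.0, "node-c")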
+
+ def _VerifyNodeBridges(self, ninfo, nresult, bridges):
+ """Check the node bridges.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param bridges: the expected list of bridges
+
+ """
+ if not bridges:
+ return
+
+ missing = nresult.get(constants.NV_BRIDGES, None)
+ test = not isinstance(missing, list)
+ self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
+ "did not return valid bridge information")
+ if not test:
+ self._ErrorIf(bool(missing), constants.CV_ENODENET, ninfo.name,
+ "missing bridges: %s" % utils.CommaJoin(sorted(missing)))
+
+ def _VerifyNodeUserScripts(self, ninfo, nresult):
+ """Check the results of user scripts presence and executability on the
node
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+
+ """
+ test = not constants.NV_USERSCRIPTS in nresult
+ self._ErrorIf(test, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
+ "did not return user scripts information")
+
+ broken_scripts = nresult.get(constants.NV_USERSCRIPTS, None)
+ if not test:
+ self._ErrorIf(broken_scripts, constants.CV_ENODEUSERSCRIPTS, ninfo.name,
+ "user scripts not present or not executable: %s" %
+ utils.CommaJoin(sorted(broken_scripts)))
+
+ def _VerifyNodeNetwork(self, ninfo, nresult):
+ """Check the node network connectivity results.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+
+ """
+ test = constants.NV_NODELIST not in nresult
+ self._ErrorIf(test, constants.CV_ENODESSH, ninfo.name,
+ "node hasn't returned node ssh connectivity data")
+ if not test:
+ if nresult[constants.NV_NODELIST]:
+ for a_node, a_msg in nresult[constants.NV_NODELIST].items():
+ self._ErrorIf(True, constants.CV_ENODESSH, ninfo.name,
+ "ssh communication with node '%s': %s", a_node, a_msg)
+
+ test = constants.NV_NODENETTEST not in nresult
+ self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
+ "node hasn't returned node tcp connectivity data")
+ if not test:
+ if nresult[constants.NV_NODENETTEST]:
+ nlist = utils.NiceSort(nresult[constants.NV_NODENETTEST].keys())
+ for anode in nlist:
+ self._ErrorIf(True, constants.CV_ENODENET, ninfo.name,
+ "tcp communication with node '%s': %s",
+ anode, nresult[constants.NV_NODENETTEST][anode])
+
+ test = constants.NV_MASTERIP not in nresult
+ self._ErrorIf(test, constants.CV_ENODENET, ninfo.name,
+ "node hasn't returned node master IP reachability data")
+ if not test:
+ if not nresult[constants.NV_MASTERIP]:
+ if ninfo.uuid == self.master_node:
+ msg = "the master node cannot reach the master IP (not configured?)"
+ else:
+ msg = "cannot reach the master IP"
+ self._ErrorIf(True, constants.CV_ENODENET, ninfo.name, msg)
+
+ def _VerifyInstance(self, instance, node_image, diskstatus):
+ """Verify an instance.
+
+ This function checks to see if the required block devices are
+ available on the instance's node, and that the nodes are in the correct
+ state.
+
+ """
+ pnode_uuid = instance.primary_node
+ pnode_img = node_image[pnode_uuid]
+ groupinfo = self.cfg.GetAllNodeGroupsInfo()
+
+ node_vol_should = {}
+ self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
+
+ cluster = self.cfg.GetClusterInfo()
+ ipolicy = ganeti.masterd.instance.CalculateGroupIPolicy(cluster,
+ self.group_info)
+ err = ComputeIPolicyInstanceViolation(ipolicy, instance, self.cfg)
+ self._ErrorIf(err, constants.CV_EINSTANCEPOLICY, instance.name,
+ utils.CommaJoin(err), code=self.ETYPE_WARNING)
+
+ for node_uuid in node_vol_should:
+ n_img = node_image[node_uuid]
+ if n_img.offline or n_img.rpc_fail or n_img.lvm_fail:
+ # ignore missing volumes on offline or broken nodes
+ continue
+ for volume in node_vol_should[node_uuid]:
+ test = volume not in n_img.volumes
+ self._ErrorIf(test, constants.CV_EINSTANCEMISSINGDISK, instance.name,
+ "volume %s missing on node %s", volume,
+ self.cfg.GetNodeName(node_uuid))
+
+ if instance.admin_state == constants.ADMINST_UP:
+ test = instance.uuid not in pnode_img.instances and not pnode_img.offline
+ self._ErrorIf(test, constants.CV_EINSTANCEDOWN, instance.name,
+ "instance not running on its primary node %s",
+ self.cfg.GetNodeName(pnode_uuid))
+ self._ErrorIf(pnode_img.offline, constants.CV_EINSTANCEBADNODE,
+ instance.name, "instance is marked as running and lives
on"
+ " offline node %s", self.cfg.GetNodeName(pnode_uuid))
+
+ diskdata = [(nname, success, status, idx)
+ for (nname, disks) in diskstatus.items()
+ for idx, (success, status) in enumerate(disks)]
+
+ for nname, success, bdev_status, idx in diskdata:
+ # the 'ghost node' construction in Exec() ensures that we have a
+ # node here
+ snode = node_image[nname]
+ bad_snode = snode.ghost or snode.offline
+ self._ErrorIf(instance.disks_active and
+ not success and not bad_snode,
+ constants.CV_EINSTANCEFAULTYDISK, instance.name,
+ "couldn't retrieve status for disk/%s on %s: %s",
+ idx, self.cfg.GetNodeName(nname), bdev_status)
+
+ if instance.disks_active and success and bdev_status.is_degraded:
+ msg = "disk/%s on %s is degraded" % (idx, self.cfg.GetNodeName(nname))
+
+ code = self.ETYPE_ERROR
+ accepted_lds = [constants.LDS_OKAY, constants.LDS_SYNC]
+
+ if bdev_status.ldisk_status in accepted_lds:
+ code = self.ETYPE_WARNING
+
+ msg += "; local disk state is '%s'" % \
+ constants.LDS_NAMES[bdev_status.ldisk_status]
+
+ self._Error(constants.CV_EINSTANCEFAULTYDISK, instance.name, msg,
+ code=code)
+
+ self._ErrorIf(pnode_img.rpc_fail and not pnode_img.offline,
+ constants.CV_ENODERPC, self.cfg.GetNodeName(pnode_uuid),
+ "instance %s, connection to primary node failed",
+ instance.name)
+
+ secondary_nodes = self.cfg.GetInstanceSecondaryNodes(instance.uuid)
+ self._ErrorIf(len(secondary_nodes) > 1,
+ constants.CV_EINSTANCELAYOUT, instance.name,
+ "instance has multiple secondary nodes: %s",
+ utils.CommaJoin(secondary_nodes),
+ code=self.ETYPE_WARNING)
+
+ inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
+ es_flags = rpc.GetExclusiveStorageForNodes(self.cfg, inst_nodes)
+ disks = self.cfg.GetInstanceDisks(instance.uuid)
+ if any(es_flags.values()):
+ if not utils.AllDiskOfType(disks, constants.DTS_EXCL_STORAGE):
+ # Disk template not compatible with exclusive_storage: no instance
+ # node should have the flag set
+ es_nodes = [n
+ for (n, es) in es_flags.items()
+ if es]
+ unsupported = [d.dev_type for d in disks
+ if d.dev_type not in constants.DTS_EXCL_STORAGE]
+ self._Error(constants.CV_EINSTANCEUNSUITABLENODE, instance.name,
+ "instance uses disk types %s, which are not supported on"
+ " nodes that have exclusive storage set: %s",
+ utils.CommaJoin(unsupported),
+ utils.CommaJoin(self.cfg.GetNodeNames(es_nodes)))
+ for (idx, disk) in enumerate(disks):
+ self._ErrorIf(disk.spindles is None,
+ constants.CV_EINSTANCEMISSINGCFGPARAMETER, instance.name,
+ "number of spindles not configured for disk %s while"
+ " exclusive storage is enabled, try running"
+ " gnt-cluster repair-disk-sizes", idx)
+
+ if utils.AnyDiskOfType(disks, constants.DTS_INT_MIRROR):
+ instance_nodes = utils.NiceSort(inst_nodes)
+ instance_groups = {}
+
+ for node_uuid in instance_nodes:
+ instance_groups.setdefault(self.all_node_info[node_uuid].group,
+ []).append(node_uuid)
+
+ pretty_list = [
+ "%s (group %s)" % (utils.CommaJoin(self.cfg.GetNodeNames(nodes)),
+ groupinfo[group].name)
+ # Sort so that we always list the primary node first.
+ for group, nodes in sorted(instance_groups.items(),
+ key=lambda (_, nodes): pnode_uuid in nodes,
+ reverse=True)]
+
+ self._ErrorIf(len(instance_groups) > 1,
+ constants.CV_EINSTANCESPLITGROUPS,
+ instance.name, "instance has primary and secondary nodes
in"
+ " different groups: %s", utils.CommaJoin(pretty_list),
+ code=self.ETYPE_WARNING)
+
+ inst_nodes_offline = []
+ for snode in secondary_nodes:
+ s_img = node_image[snode]
+ self._ErrorIf(s_img.rpc_fail and not s_img.offline, constants.CV_ENODERPC,
+ self.cfg.GetNodeName(snode),
+ "instance %s, connection to secondary node failed",
+ instance.name)
+
+ if s_img.offline:
+ inst_nodes_offline.append(snode)
+
+ # warn that the instance lives on offline nodes
+ self._ErrorIf(inst_nodes_offline, constants.CV_EINSTANCEBADNODE,
+ instance.name, "instance has offline secondary node(s) %s",
+ utils.CommaJoin(self.cfg.GetNodeNames(inst_nodes_offline)))
+ # ... or ghost/non-vm_capable nodes
+ for node_uuid in inst_nodes:
+ self._ErrorIf(node_image[node_uuid].ghost, constants.CV_EINSTANCEBADNODE,
+ instance.name, "instance lives on ghost node %s",
+ self.cfg.GetNodeName(node_uuid))
+ self._ErrorIf(not node_image[node_uuid].vm_capable,
+ constants.CV_EINSTANCEBADNODE, instance.name,
+ "instance lives on non-vm_capable node %s",
+ self.cfg.GetNodeName(node_uuid))
+
+ def _VerifyOrphanVolumes(self, node_vol_should, node_image, reserved):
+ """Verify if there are any unknown volumes in the cluster.
+
+ The .os, .swap and backup volumes are ignored. All other volumes are
+ reported as unknown.
+
+ @type reserved: L{ganeti.utils.FieldSet}
+ @param reserved: a FieldSet of reserved volume names
+
+ """
+ for node_uuid, n_img in node_image.items():
+ if (n_img.offline or n_img.rpc_fail or n_img.lvm_fail or
+ self.all_node_info[node_uuid].group != self.group_uuid):
+ # skip non-healthy nodes
+ continue
+ for volume in n_img.volumes:
+ test = ((node_uuid not in node_vol_should or
+ volume not in node_vol_should[node_uuid]) and
+ not reserved.Matches(volume))
+ self._ErrorIf(test, constants.CV_ENODEORPHANLV,
+ self.cfg.GetNodeName(node_uuid),
+ "volume %s is unknown", volume,
+ code=_VerifyErrors.ETYPE_WARNING)
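# Editorial sketch (not part of the diff): the orphan-volume predicate
# above, on plain data. A volume is flagged when it is neither expected on
# its node nor matched by a reserved pattern (the real code uses
# ganeti.utils.FieldSet; plain anchored regexes stand in here). Names are
# invented.
import re
node_vol_should = {"n1": ["xenvg/disk0"]}
reserved = [r"xenvg/.*\.snap"]
def is_orphan(node, volume):
    expected = volume in node_vol_should.get(node, [])
    return not expected and not any(re.match(p + "$", volume)
                                    for p in reserved)
assert is_orphan("n1", "xenvg/stale")
assert not is_orphan("n1", "xenvg/disk0")
assert not is_orphan("n1", "xenvg/backup.snap")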
+
+ def _VerifyNPlusOneMemory(self, node_image, all_insts):
+ """Verify N+1 Memory Resilience.
+
+ Check that if one single node dies we can still start all the
+ instances it was primary for.
+
+ """
+ cluster_info = self.cfg.GetClusterInfo()
+ for node_uuid, n_img in node_image.items():
+ # This code checks that every node which is now listed as
+ # secondary has enough memory to host all instances it is
+ # supposed to, should a single other node in the cluster fail.
+ # FIXME: not ready for failover to an arbitrary node
+ # FIXME: does not support file-backed instances
+ # WARNING: we currently take into account down instances as well
+ # as up ones, considering that even if they're down someone
+ # might want to start them even in the event of a node failure.
+ if n_img.offline or \
+ self.all_node_info[node_uuid].group != self.group_uuid:
+ # we're skipping nodes marked offline and nodes in other groups from
+ # the N+1 warning, since most likely we don't have good memory
+ # information from them; we already list instances living on such
+ # nodes, and that's enough warning
+ continue
+ #TODO(dynmem): also consider ballooning out other instances
+ for prinode, inst_uuids in n_img.sbp.items():
+ needed_mem = 0
+ for inst_uuid in inst_uuids:
+ bep = cluster_info.FillBE(all_insts[inst_uuid])
+ if bep[constants.BE_AUTO_BALANCE]:
+ needed_mem += bep[constants.BE_MINMEM]
+ test = n_img.mfree < needed_mem
+ self._ErrorIf(test, constants.CV_ENODEN1,
+ self.cfg.GetNodeName(node_uuid),
+ "not enough memory to accomodate instance failovers"
+ " should node %s fail (%dMiB needed, %dMiB available)",
+ self.cfg.GetNodeName(prinode), needed_mem, n_img.mfree)
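# Editorial sketch (not part of the diff): the N+1 arithmetic above,
# flattened. For one candidate node, sbp maps each primary node to the
# instances this node is secondary for; the node must have enough free
# memory to absorb any single primary's auto-balanced instances. Numbers
# are made-up MiB values.
mfree = 2048
sbp = {"prim-1": [1024, 512], "prim-2": [4096]}  # minmem per instance
for prinode, minmems in sorted(sbp.items()):
    needed_mem = sum(minmems)
    if mfree < needed_mem:
        print("N+1 failure should %s fail: %d needed, %d available"
              % (prinode, needed_mem, mfree))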
+
+ def _VerifyClientCertificates(self, nodes, all_nvinfo):
+ """Verifies the consistency of the client certificates.
+
+ This includes several aspects:
+ - the individual validation of all nodes' certificates
+ - the consistency of the master candidate certificate map
+ - the consistency of the master candidate certificate map with the
+ certificates that the master candidates are actually using.
+
+ @param nodes: the list of nodes to consider in this verification
+ @param all_nvinfo: the map of results of the verify_node call to
+ all nodes
+
+ """
+ candidate_certs = self.cfg.GetClusterInfo().candidate_certs
+ if candidate_certs is None or len(candidate_certs) == 0:
+ self._ErrorIf(
+ True, constants.CV_ECLUSTERCLIENTCERT, None,
+ "The cluster's list of master candidate certificates is empty."
+ " If you just updated the cluster, please run"
+ " 'gnt-cluster renew-crypto --new-node-certificates'.")
+ return
+
+ self._ErrorIf(
+ len(candidate_certs) != len(set(candidate_certs.values())),
+ constants.CV_ECLUSTERCLIENTCERT, None,
+ "There are at least two master candidates configured to use the same"
+ " certificate.")
+
+ # collect the client certificate
+ for node in nodes:
+ if node.offline:
+ continue
+
+ nresult = all_nvinfo[node.uuid]
+ if nresult.fail_msg or not nresult.payload:
+ continue
+
+ (errcode, msg) = nresult.payload.get(constants.NV_CLIENT_CERT, None)
+
+ self._ErrorIf(
+ errcode is not None, constants.CV_ECLUSTERCLIENTCERT, None,
+ "Client certificate of node '%s' failed validation: %s (code '%s')",
+ node.uuid, msg, errcode)
+
+ if not errcode:
+ digest = msg
+ if node.master_candidate:
+ if node.uuid in candidate_certs:
+ self._ErrorIf(
+ digest != candidate_certs[node.uuid],
+ constants.CV_ECLUSTERCLIENTCERT, None,
+ "Client certificate digest of master candidate '%s' does not"
+ " match its entry in the cluster's map of master candidate"
+ " certificates. Expected: %s Got: %s", node.uuid,
+ digest, candidate_certs[node.uuid])
+ else:
+ self._ErrorIf(
+ True, constants.CV_ECLUSTERCLIENTCERT, None,
+ "The master candidate '%s' does not have an entry in the"
+ " map of candidate certificates.", node.uuid)
+ self._ErrorIf(
+ digest in candidate_certs.values(),
+ constants.CV_ECLUSTERCLIENTCERT, None,
+ "Master candidate '%s' is using a certificate of another node.",
+ node.uuid)
+ else:
+ self._ErrorIf(
+ node.uuid in candidate_certs,
+ constants.CV_ECLUSTERCLIENTCERT, None,
+ "Node '%s' is not a master candidate, but still listed in the"
+ " map of master candidate certificates.", node.uuid)
+ self._ErrorIf(
+ (node.uuid not in candidate_certs) and
+ (digest in candidate_certs.values()),
+ constants.CV_ECLUSTERCLIENTCERT, None,
+ "Node '%s' is not a master candidate and is incorrectly using a"
+ " certificate of another node which is master candidate.",
+ node.uuid)
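# Editorial sketch (not part of the diff): the first consistency test of
# the method above, duplicate detection in the candidate certificate map,
# on a plain uuid -> digest dict with invented digests.
candidate_certs = {"node-uuid-1": "digest-a", "node-uuid-2": "digest-a"}
assert len(candidate_certs) != len(set(candidate_certs.values()))
# two master candidates share one certificate: reported as an error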
+
+ def _VerifySshSetup(self, nodes, all_nvinfo):
- """Evaluates the verification results of the SSH setup.
++ """Evaluates the verification results of the SSH setup and clutter test.
+
+ @param nodes: List of L{objects.Node} objects
+ @param all_nvinfo: RPC results
+
+ """
+ for node in nodes:
+ if not node.offline:
+ nresult = all_nvinfo[node.uuid]
+ if nresult.fail_msg or not nresult.payload:
+ self._ErrorIf(True, constants.CV_ENODESSH, node.name,
+ "Could not verify the SSH setup of this node.")
+ return
- result = nresult.payload.get(constants.NV_SSH_SETUP, None)
- error_msg = ""
- if isinstance(result, list):
- error_msg = " ".join(result)
- self._ErrorIf(result,
- constants.CV_ENODESSH, None, error_msg)
++ for ssh_test in [constants.NV_SSH_SETUP, constants.NV_SSH_CLUTTER]:
++ result = nresult.payload.get(ssh_test, None)
++ error_msg = ""
++ if isinstance(result, list):
++ error_msg = " ".join(result)
++ self._ErrorIf(result,
++ constants.CV_ENODESSH, None, error_msg)
+
+ def _VerifyFiles(self, nodes, master_node_uuid, all_nvinfo,
+ (files_all, files_opt, files_mc, files_vm)):
+ """Verifies file checksums collected from all nodes.
+
+ @param nodes: List of L{objects.Node} objects
+ @param master_node_uuid: UUID of master node
+ @param all_nvinfo: RPC results
+
+ """
+ # Define functions determining which nodes to consider for a file
+ files2nodefn = [
+ (files_all, None),
+ (files_mc, lambda node: (node.master_candidate or
+ node.uuid == master_node_uuid)),
+ (files_vm, lambda node: node.vm_capable),
+ ]
+
+ # Build mapping from filename to list of nodes which should have the file
+ nodefiles = {}
+ for (files, fn) in files2nodefn:
+ if fn is None:
+ filenodes = nodes
+ else:
+ filenodes = filter(fn, nodes)
+ nodefiles.update((filename,
+ frozenset(map(operator.attrgetter("uuid"),
filenodes)))
+ for filename in files)
+
+ assert set(nodefiles) == (files_all | files_mc | files_vm)
+
+ fileinfo = dict((filename, {}) for filename in nodefiles)
+ ignore_nodes = set()
+
+ for node in nodes:
+ if node.offline:
+ ignore_nodes.add(node.uuid)
+ continue
+
+ nresult = all_nvinfo[node.uuid]
+
+ if nresult.fail_msg or not nresult.payload:
+ node_files = None
+ else:
+ fingerprints = nresult.payload.get(constants.NV_FILELIST, {})
+ node_files = dict((vcluster.LocalizeVirtualPath(key), value)
+ for (key, value) in fingerprints.items())
+ del fingerprints
+
+ test = not (node_files and isinstance(node_files, dict))
+ self._ErrorIf(test, constants.CV_ENODEFILECHECK, node.name,
+ "Node did not return file checksum data")
+ if test:
+ ignore_nodes.add(node.uuid)
+ continue
+
+ # Build per-checksum mapping from filename to nodes having it
+ for (filename, checksum) in node_files.items():
+ assert filename in nodefiles
+ fileinfo[filename].setdefault(checksum, set()).add(node.uuid)
+
+ for (filename, checksums) in fileinfo.items():
+ assert compat.all(len(i) > 10 for i in checksums), "Invalid checksum"
+
+ # Nodes having the file
+ with_file = frozenset(node_uuid
+ for node_uuids in fileinfo[filename].values()
+ for node_uuid in node_uuids) - ignore_nodes
+
+ expected_nodes = nodefiles[filename] - ignore_nodes
+
+ # Nodes missing file
+ missing_file = expected_nodes - with_file
+
+ if filename in files_opt:
+ # All or no nodes
+ self._ErrorIf(missing_file and missing_file != expected_nodes,
+ constants.CV_ECLUSTERFILECHECK, None,
+ "File %s is optional, but it must exist on all or no"
+ " nodes (not found on %s)",
+ filename,
+ utils.CommaJoin(
+ utils.NiceSort(
+ map(self.cfg.GetNodeName, missing_file))))
+ else:
+ self._ErrorIf(missing_file, constants.CV_ECLUSTERFILECHECK, None,
+ "File %s is missing from node(s) %s", filename,
+ utils.CommaJoin(
+ utils.NiceSort(
+ map(self.cfg.GetNodeName, missing_file))))
+
+ # Warn if a node has a file it shouldn't
+ unexpected = with_file - expected_nodes
+ self._ErrorIf(unexpected,
+ constants.CV_ECLUSTERFILECHECK, None,
+ "File %s should not exist on node(s) %s",
+ filename, utils.CommaJoin(
+ utils.NiceSort(map(self.cfg.GetNodeName, unexpected))))
+
+ # See if there are multiple versions of the file
+ test = len(checksums) > 1
+ if test:
+ variants = ["variant %s on %s" %
+ (idx + 1,
+ utils.CommaJoin(utils.NiceSort(
+ map(self.cfg.GetNodeName, node_uuids))))
+ for (idx, (checksum, node_uuids)) in
+ enumerate(sorted(checksums.items()))]
+ else:
+ variants = []
+
+ self._ErrorIf(test, constants.CV_ECLUSTERFILECHECK, None,
+ "File %s found with %s different checksums (%s)",
+ filename, len(checksums), "; ".join(variants))
+
+ def _VerifyNodeDrbdHelper(self, ninfo, nresult, drbd_helper):
+ """Verify the drbd helper.
+
+ """
+ if drbd_helper:
+ helper_result = nresult.get(constants.NV_DRBDHELPER, None)
+ test = (helper_result is None)
+ self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
+ "no drbd usermode helper returned")
+ if helper_result:
+ status, payload = helper_result
+ test = not status
+ self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
+ "drbd usermode helper check unsuccessful: %s", payload)
+ test = status and (payload != drbd_helper)
+ self._ErrorIf(test, constants.CV_ENODEDRBDHELPER, ninfo.name,
+ "wrong drbd usermode helper: %s", payload)
+
+ @staticmethod
+ def _ComputeDrbdMinors(ninfo, instanceinfo, disks_info, drbd_map, error_if):
+ """Gives the DRBD information in a map for a node.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param instanceinfo: the dict of instances
+ @param disks_info: the dict of disks
+ @param drbd_map: the DRBD map as returned by
+ L{ganeti.config.ConfigWriter.ComputeDRBDMap}
+ @type error_if: callable like L{_ErrorIf}
+ @param error_if: The error reporting function
+ @return: dict from minor number to (disk_uuid, instance_uuid, active)
+
+ """
+ node_drbd = {}
+ for minor, disk_uuid in drbd_map[ninfo.uuid].items():
+ test = disk_uuid not in disks_info
+ error_if(test, constants.CV_ECLUSTERCFG, None,
+ "ghost disk '%s' in temporary DRBD map", disk_uuid)
+ # ghost disk should not be active, but otherwise we
+ # don't give double warnings (both ghost disk and
+ # unallocated minor in use)
+ if test:
+ node_drbd[minor] = (disk_uuid, None, False)
+ else:
+ disk_active = False
+ disk_instance = None
+ for (inst_uuid, inst) in instanceinfo.items():
+ if disk_uuid in inst.disks:
+ disk_active = inst.disks_active
+ disk_instance = inst_uuid
+ break
+ node_drbd[minor] = (disk_uuid, disk_instance, disk_active)
+ return node_drbd
+
+ def _VerifyNodeDrbd(self, ninfo, nresult, instanceinfo, disks_info,
+ drbd_helper, drbd_map):
+ """Verifies and the node DRBD status.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param instanceinfo: the dict of instances
+ @param disks_info: the dict of disks
+ @param drbd_helper: the configured DRBD usermode helper
+ @param drbd_map: the DRBD map as returned by
+ L{ganeti.config.ConfigWriter.ComputeDRBDMap}
+
+ """
+ self._VerifyNodeDrbdHelper(ninfo, nresult, drbd_helper)
+
+ # compute the DRBD minors
+ node_drbd = self._ComputeDrbdMinors(ninfo, instanceinfo, disks_info,
+ drbd_map, self._ErrorIf)
+
+ # and now check them
+ used_minors = nresult.get(constants.NV_DRBDLIST, [])
+ test = not isinstance(used_minors, (tuple, list))
+ self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
+ "cannot parse drbd status file: %s", str(used_minors))
+ if test:
+ # we cannot check drbd status
+ return
+
+ for minor, (disk_uuid, inst_uuid, must_exist) in node_drbd.items():
+ test = minor not in used_minors and must_exist
+ if inst_uuid is not None:
+ attached = "(attached in instance '%s')" % \
+ self.cfg.GetInstanceName(inst_uuid)
+ else:
+ attached = "(detached)"
+ self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
+ "drbd minor %d of disk %s %s is not active",
+ minor, disk_uuid, attached)
+ for minor in used_minors:
+ test = minor not in node_drbd
+ self._ErrorIf(test, constants.CV_ENODEDRBD, ninfo.name,
+ "unallocated drbd minor %d is in use", minor)
+
+ def _UpdateNodeOS(self, ninfo, nresult, nimg):
+ """Builds the node OS structures.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param nimg: the node image object
+
+ """
+ remote_os = nresult.get(constants.NV_OSLIST, None)
+ test = (not isinstance(remote_os, list) or
+ not compat.all(isinstance(v, list) and len(v) == 8
+ for v in remote_os))
+
+ self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
+ "node hasn't returned valid OS data")
+
+ nimg.os_fail = test
+
+ if test:
+ return
+
+ os_dict = {}
+
+ for (name, os_path, status, diagnose,
+ variants, parameters, api_ver,
+ trusted) in nresult[constants.NV_OSLIST]:
+
+ if name not in os_dict:
+ os_dict[name] = []
+
+ # parameters is a list of lists instead of list of tuples due to
+ # JSON lacking a real tuple type, fix it:
+ parameters = [tuple(v) for v in parameters]
+ os_dict[name].append((os_path, status, diagnose,
+ set(variants), set(parameters), set(api_ver),
+ trusted))
+
+ nimg.oslist = os_dict
+
+ def _VerifyNodeOS(self, ninfo, nimg, base):
+ """Verifies the node OS list.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nimg: the node image object
+ @param base: the 'template' node we match against (e.g. from the master)
+
+ """
+ assert not nimg.os_fail, "Entered _VerifyNodeOS with failed OS rpc?"
+
+ beautify_params = lambda l: ["%s: %s" % (k, v) for (k, v) in l]
+ for os_name, os_data in nimg.oslist.items():
+ assert os_data, "Empty OS status for OS %s?!" % os_name
+ f_path, f_status, f_diag, f_var, f_param, f_api, f_trusted = os_data[0]
+ self._ErrorIf(not f_status, constants.CV_ENODEOS, ninfo.name,
+ "Invalid OS %s (located at %s): %s",
+ os_name, f_path, f_diag)
+ self._ErrorIf(len(os_data) > 1, constants.CV_ENODEOS, ninfo.name,
+ "OS '%s' has multiple entries"
+ " (first one shadows the rest): %s",
+ os_name, utils.CommaJoin([v[0] for v in os_data]))
+ # comparisons with the 'base' image
+ test = os_name not in base.oslist
+ self._ErrorIf(test, constants.CV_ENODEOS, ninfo.name,
+ "Extra OS %s not present on reference node (%s)",
+ os_name, self.cfg.GetNodeName(base.uuid))
+ if test:
+ continue
+ assert base.oslist[os_name], "Base node has empty OS status?"
+ _, b_status, _, b_var, b_param, b_api, b_trusted = base.oslist[os_name][0]
+ if not b_status:
+ # base OS is invalid, skipping
+ continue
+ for kind, a, b in [("API version", f_api, b_api),
+ ("variants list", f_var, b_var),
+ ("parameters", beautify_params(f_param),
+ beautify_params(b_param))]:
+ self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
+ "OS %s for %s differs from reference node %s:"
+ " [%s] vs. [%s]", kind, os_name,
+ self.cfg.GetNodeName(base.uuid),
+ utils.CommaJoin(sorted(a)), utils.CommaJoin(sorted(b)))
+ for kind, a, b in [("trusted", f_trusted, b_trusted)]:
+ self._ErrorIf(a != b, constants.CV_ENODEOS, ninfo.name,
+ "OS %s for %s differs from reference node %s:"
+ " %s vs. %s", kind, os_name,
+ self.cfg.GetNodeName(base.uuid), a, b)
+
+ # check any missing OSes
+ missing = set(base.oslist.keys()).difference(nimg.oslist.keys())
+ self._ErrorIf(missing, constants.CV_ENODEOS, ninfo.name,
+ "OSes present on reference node %s"
+ " but missing on this node: %s",
+ self.cfg.GetNodeName(base.uuid), utils.CommaJoin(missing))
+
+ def _VerifyAcceptedFileStoragePaths(self, ninfo, nresult, is_master):
+ """Verifies paths in L{pathutils.FILE_STORAGE_PATHS_FILE}.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @type is_master: bool
+ @param is_master: Whether node is the master node
+
+ """
+ cluster = self.cfg.GetClusterInfo()
+ if (is_master and
+ (cluster.IsFileStorageEnabled() or
+ cluster.IsSharedFileStorageEnabled())):
+ try:
+ fspaths = nresult[constants.NV_ACCEPTED_STORAGE_PATHS]
+ except KeyError:
+ # This should never happen
+ self._ErrorIf(True, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
+ "Node did not return forbidden file storage paths")
+ else:
+ self._ErrorIf(fspaths, constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
+ "Found forbidden file storage paths: %s",
+ utils.CommaJoin(fspaths))
+ else:
+ self._ErrorIf(constants.NV_ACCEPTED_STORAGE_PATHS in nresult,
+ constants.CV_ENODEFILESTORAGEPATHS, ninfo.name,
+ "Node should not have returned forbidden file storage"
+ " paths")
+
+ def _VerifyStoragePaths(self, ninfo, nresult, file_disk_template,
+ verify_key, error_key):
+ """Verifies (file) storage paths.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @type file_disk_template: string
+ @param file_disk_template: file-based disk template, whose directory
+ is supposed to be verified
+ @type verify_key: string
+ @param verify_key: key for the verification map of this file
+ verification step
+ @param error_key: error key to be added to the verification results
+ in case something goes wrong in this verification step
+
+ """
+ assert (file_disk_template in
+ utils.storage.GetDiskTemplatesOfStorageTypes(
+ constants.ST_FILE, constants.ST_SHARED_FILE, constants.ST_GLUSTER))
+
+ cluster = self.cfg.GetClusterInfo()
+ if cluster.IsDiskTemplateEnabled(file_disk_template):
+ self._ErrorIf(
+ verify_key in nresult,
+ error_key, ninfo.name,
+ "The configured %s storage path is unusable: %s" %
+ (file_disk_template, nresult.get(verify_key)))
+
+ def _VerifyFileStoragePaths(self, ninfo, nresult):
+ """Verifies (file) storage paths.
+
+ @see: C{_VerifyStoragePaths}
+
+ """
+ self._VerifyStoragePaths(
+ ninfo, nresult, constants.DT_FILE,
+ constants.NV_FILE_STORAGE_PATH,
+ constants.CV_ENODEFILESTORAGEPATHUNUSABLE)
+
+ def _VerifySharedFileStoragePaths(self, ninfo, nresult):
+ """Verifies (file) storage paths.
+
+ @see: C{_VerifyStoragePaths}
+
+ """
+ self._VerifyStoragePaths(
+ ninfo, nresult, constants.DT_SHARED_FILE,
+ constants.NV_SHARED_FILE_STORAGE_PATH,
+ constants.CV_ENODESHAREDFILESTORAGEPATHUNUSABLE)
+
+ def _VerifyGlusterStoragePaths(self, ninfo, nresult):
+ """Verifies (file) storage paths.
+
+ @see: C{_VerifyStoragePaths}
+
+ """
+ self._VerifyStoragePaths(
+ ninfo, nresult, constants.DT_GLUSTER,
+ constants.NV_GLUSTER_STORAGE_PATH,
+ constants.CV_ENODEGLUSTERSTORAGEPATHUNUSABLE)
+
+ def _VerifyOob(self, ninfo, nresult):
+ """Verifies out of band functionality of a node.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+
+ """
+ # We just have to verify the paths on master and/or master candidates
+ # as the oob helper is invoked on the master
+ if ((ninfo.master_candidate or ninfo.master_capable) and
+ constants.NV_OOB_PATHS in nresult):
+ for path_result in nresult[constants.NV_OOB_PATHS]:
+ self._ErrorIf(path_result, constants.CV_ENODEOOBPATH,
+ ninfo.name, path_result)
+
+ def _UpdateNodeVolumes(self, ninfo, nresult, nimg, vg_name):
+ """Verifies and updates the node volume data.
+
+ This function will update a L{NodeImage}'s internal structures
+ with data from the remote call.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param nimg: the node image object
+ @param vg_name: the configured VG name
+
+ """
+ nimg.lvm_fail = True
+ lvdata = nresult.get(constants.NV_LVLIST, "Missing LV data")
+ if vg_name is None:
+ pass
+ elif isinstance(lvdata, basestring):
+ self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
+ "LVM problem on node: %s", utils.SafeEncode(lvdata))
+ elif not isinstance(lvdata, dict):
+ self._ErrorIf(True, constants.CV_ENODELVM, ninfo.name,
+ "rpc call to node failed (lvlist)")
+ else:
+ nimg.volumes = lvdata
+ nimg.lvm_fail = False
+
+ def _UpdateNodeInstances(self, ninfo, nresult, nimg):
+ """Verifies and updates the node instance list.
+
+ If the listing was successful, then updates this node's instance
+ list. Otherwise, it marks the RPC call as failed for the instance
+ list key.
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param nimg: the node image object
+
+ """
+ idata = nresult.get(constants.NV_INSTANCELIST, None)
+ test = not isinstance(idata, list)
+ self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
+ "rpc call to node failed (instancelist): %s",
+ utils.SafeEncode(str(idata)))
+ if test:
+ nimg.hyp_fail = True
+ else:
+ nimg.instances = [inst.uuid for (_, inst) in
+ self.cfg.GetMultiInstanceInfoByName(idata)]
+
+ def _UpdateNodeInfo(self, ninfo, nresult, nimg, vg_name):
+ """Verifies and computes a node information map
+
+ @type ninfo: L{objects.Node}
+ @param ninfo: the node to check
+ @param nresult: the remote results for the node
+ @param nimg: the node image object
+ @param vg_name: the configured VG name
+
+ """
+ # try to read free memory (from the hypervisor)
+ hv_info = nresult.get(constants.NV_HVINFO, None)
+ test = not isinstance(hv_info, dict) or "memory_free" not in hv_info
+ self._ErrorIf(test, constants.CV_ENODEHV, ninfo.name,
+ "rpc call to node failed (hvinfo)")
+ if not test:
+ try:
+ nimg.mfree = int(hv_info["memory_free"])
+ except (ValueError, TypeError):
+ self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
+ "node returned invalid nodeinfo, check hypervisor")
+
+ # FIXME: devise a free space model for file based instances as well
+ if vg_name is not None:
+ test = (constants.NV_VGLIST not in nresult or
+ vg_name not in nresult[constants.NV_VGLIST])
+ self._ErrorIf(test, constants.CV_ENODELVM, ninfo.name,
+ "node didn't return data for the volume group '%s'"
+ " - it is either missing or broken", vg_name)
+ if not test:
+ try:
+ nimg.dfree = int(nresult[constants.NV_VGLIST][vg_name])
+ except (ValueError, TypeError):
+ self._ErrorIf(True, constants.CV_ENODERPC, ninfo.name,
+ "node returned invalid LVM info, check LVM status")
+
+ def _CollectDiskInfo(self, node_uuids, node_image, instanceinfo):
+ """Gets per-disk status information for all instances.
+
+ @type node_uuids: list of strings
+ @param node_uuids: Node UUIDs
+ @type node_image: dict of (UUID, L{objects.Node})
+ @param node_image: Node objects
+ @type instanceinfo: dict of (UUID, L{objects.Instance})
+ @param instanceinfo: Instance objects
+ @rtype: {instance: {node: [(success, payload)]}}
+ @return: a dictionary of per-instance dictionaries with nodes as
+ keys and disk information as values; the disk information is a
+ list of tuples (success, payload)
+
+ """
+ node_disks = {}
+ node_disks_dev_inst_only = {}
+ diskless_instances = set()
+ nodisk_instances = set()
+
+ for nuuid in node_uuids:
+ node_inst_uuids = list(itertools.chain(node_image[nuuid].pinst,
+ node_image[nuuid].sinst))
+ diskless_instances.update(uuid for uuid in node_inst_uuids
+ if not instanceinfo[uuid].disks)
+ disks = [(inst_uuid, disk)
+ for inst_uuid in node_inst_uuids
+ for disk in self.cfg.GetInstanceDisks(inst_uuid)]
+
+ if not disks:
+ nodisk_instances.update(uuid for uuid in node_inst_uuids
+ if instanceinfo[uuid].disks)
+ # No need to collect data
+ continue
+
+ node_disks[nuuid] = disks
+
+ # _AnnotateDiskParams makes already copies of the disks
+ dev_inst_only = []
+ for (inst_uuid, dev) in disks:
+ (anno_disk,) = AnnotateDiskParams(instanceinfo[inst_uuid], [dev],
+ self.cfg)
+ dev_inst_only.append((anno_disk, instanceinfo[inst_uuid]))
+
+ node_disks_dev_inst_only[nuuid] = dev_inst_only
+
+ assert len(node_disks) == len(node_disks_dev_inst_only)
+
+ # Collect data from all nodes with disks
+ result = self.rpc.call_blockdev_getmirrorstatus_multi(
+ node_disks.keys(), node_disks_dev_inst_only)
+
+ assert len(result) == len(node_disks)
+
+ instdisk = {}
+
+ for (nuuid, nres) in result.items():
+ node = self.cfg.GetNodeInfo(nuuid)
+ disks = node_disks[node.uuid]
+
+ if nres.offline:
+ # No data from this node
+ data = len(disks) * [(False, "node offline")]
+ else:
+ msg = nres.fail_msg
+ self._ErrorIf(msg, constants.CV_ENODERPC, node.name,
+ "while getting disk information: %s", msg)
+ if msg:
+ # No data from this node
+ data = len(disks) * [(False, msg)]
+ else:
+ data = []
+ for idx, i in enumerate(nres.payload):
+ if isinstance(i, (tuple, list)) and len(i) == 2:
+ data.append(i)
+ else:
+ logging.warning("Invalid result from node %s, entry %d: %s",
+ node.name, idx, i)
+ data.append((False, "Invalid result from the remote node"))
+
+ for ((inst_uuid, _), status) in zip(disks, data):
+ instdisk.setdefault(inst_uuid, {}).setdefault(node.uuid, []) \
+ .append(status)
+
+ # Add empty entries for diskless instances.
+ for inst_uuid in diskless_instances:
+ assert inst_uuid not in instdisk
+ instdisk[inst_uuid] = {}
+ # ...and disk-full instances that happen to have no disks
+ for inst_uuid in nodisk_instances:
+ assert inst_uuid not in instdisk
+ instdisk[inst_uuid] = {}
+
+ assert compat.all(len(statuses) == len(instanceinfo[inst].disks) and
+ len(nuuids) <= len(
+ self.cfg.GetInstanceNodes(instanceinfo[inst].uuid)) and
+ compat.all(isinstance(s, (tuple, list)) and
+ len(s) == 2 for s in statuses)
+ for inst, nuuids in instdisk.items()
+ for nuuid, statuses in nuuids.items())
+ if __debug__:
+ instdisk_keys = set(instdisk)
+ instanceinfo_keys = set(instanceinfo)
+ assert instdisk_keys == instanceinfo_keys, \
+ ("instdisk keys (%s) do not match instanceinfo keys (%s)" %
+ (instdisk_keys, instanceinfo_keys))
+
+ return instdisk
+
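The assembly step at the end relies purely on ordering: each node's response list is zipped against the request list that was sent to it, and setdefault chains build the nested instance-to-node-to-statuses map. A toy sketch of that step, with made-up node and instance names and no RPC involved:

  node_disks = {
    "node-A": [("inst-1", "disk0"), ("inst-2", "disk0")],
  }
  payloads = {
    "node-A": [(True, "ok"), (False, "degraded")],
  }

  instdisk = {}
  for nuuid, disks in node_disks.items():
    for ((inst_uuid, _), status) in zip(disks, payloads[nuuid]):
      instdisk.setdefault(inst_uuid, {}).setdefault(nuuid, []).append(status)

  print(instdisk)
  # {'inst-1': {'node-A': [(True, 'ok')]},
  #  'inst-2': {'node-A': [(False, 'degraded')]}}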
+ @staticmethod
+ def _SshNodeSelector(group_uuid, all_nodes):
+ """Create endless iterators for all potential SSH check hosts.
+
+ """
+ nodes = [node for node in all_nodes
+ if (node.group != group_uuid and
+ not node.offline)]
+ keyfunc = operator.attrgetter("group")
+
+ return map(itertools.cycle,
+ [sorted(map(operator.attrgetter("name"), names))
+ for _, names in itertools.groupby(sorted(nodes, key=keyfunc),
+ keyfunc)])
+
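itertools.groupby only merges adjacent items, which is why the foreign nodes are sorted by group before grouping; each group then becomes an endless cycle, so callers can keep drawing one peer per foreign group without exhausting anything. A standalone sketch with (name, group) tuples standing in for node objects, the caller's own group already filtered out:

  import itertools
  import operator

  nodes = [("n3", "g2"), ("n1", "g2"), ("n4", "g3")]
  keyfunc = operator.itemgetter(1)

  selectors = [itertools.cycle(sorted(name for (name, _) in members))
               for _, members in itertools.groupby(sorted(nodes, key=keyfunc),
                                                   keyfunc)]

  # Each round of draws yields one node per foreign group, wrapping around.
  print([next(sel) for sel in selectors])  # ['n1', 'n4']
  print([next(sel) for sel in selectors])  # ['n3', 'n4']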
+ @classmethod
+ def _SelectSshCheckNodes(cls, group_nodes, group_uuid, all_nodes):
+ """Choose which nodes should talk to which other nodes.
+
+ We will make nodes contact all nodes in their group, and one node from
+ every other group.
+
+ @rtype: tuple of (list of strings, dict of strings to lists of strings,
+ list of strings)
+ @return: a tuple containing the list of all online nodes, a dictionary
+ mapping node names to additional nodes of other node groups to which
+ connectivity should be tested, and a list of all online master
+ candidates
+
+ @warning: This algorithm has a known issue if one node group is much
+ smaller than others (e.g. just one node). In such a case all other
+ nodes will talk to the single node.
+
+ """
+ online_nodes = sorted(node.name for node in group_nodes if not node.offline)
+ online_mcs = sorted(node.name for node in group_nodes
+ if (node.master_candidate and not node.offline))
+ sel = cls._SshNodeSelector(group_uuid, all_nodes)
+
+ return (online_nodes,
+ dict((name, sorted([i.next() for i in sel]))
+ for name in online_nodes),
+ online_mcs)
+
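Every online node then draws once from each group selector, so each node gets exactly one contact per foreign group; because a single-node group's cycle keeps yielding the same name, this is also where the warned-about hot spot comes from. A self-contained sketch of the pairing, using the same tuple-based stand-ins as above:

  import itertools
  import operator

  def make_selectors(own_group, all_nodes):
    # One endless iterator of sorted names per foreign group.
    others = sorted((n for n in all_nodes if n[1] != own_group),
                    key=operator.itemgetter(1))
    return [itertools.cycle(sorted(name for (name, _) in grp))
            for _, grp in itertools.groupby(others, operator.itemgetter(1))]

  group_nodes = ["a1", "a2"]  # online nodes of group "a"
  all_nodes = [("a1", "a"), ("a2", "a"), ("b1", "b"), ("c1", "c"), ("c2", "c")]

  sel = make_selectors("a", all_nodes)
  pairing = dict((name, sorted(next(s) for s in sel)) for name in group_nodes)
  print(pairing)  # {'a1': ['b1', 'c1'], 'a2': ['b1', 'c2']}

Group "b" has a single node, so b1 is drawn by every caller, which is exactly the small-group imbalance the docstring warns about.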
+ def _PrepareSshSetupCheck(self):
+ """Prepare the input data for the SSH setup verification.
+
+ """
+ all_nodes_info = self.cfg.GetAllNodesInfo()
+ potential_master_candidates = self.cfg.GetPotentialMasterCandidates()
+ node_status = [
+ (uuid, node_info.name, node_info.master_candidate,
+ node_info.name in potential_master_candidates)
+ for (uuid, node_info) in all_nodes_info.items()]
+ return node_status
+
+ def BuildHooksEnv(self):
+ """Build hooks env.
+
+ Cluster-Verify hooks run only in the post phase; their failure is logged
+ in the verify output and makes the verification fail.
+
+ """
+ env = {
+ "CLUSTER_TAGS": " ".join(self.cfg.GetClusterInfo().GetTags()),
+ }
+
+ env.update(("NODE_TAGS_%s" % node.name, " ".join(node.GetTags()))
+ for node in self.my_node_info.values())
+
+ return env
+
+ def BuildHooksNodes(self):
+ """Build hooks nodes.
+
+ """
+ return ([], list(self.my_node_info.keys()))
+
+ @staticmethod
+ def _VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
+ i_offline, n_offline, n_drained):
+ feedback_fn("* Other Notes")
+ if i_non_redundant:
+ feedback_fn(" - NOTICE: %d non-redundant instance(s) found."
+ % len(i_non_redundant))
+
+ if i_non_a_balanced:
+ feedback_fn(" - NOTICE: %d non-auto-balanced instance(s) found."
+ % len(i_non_a_balanced))
+
+ if i_offline:
+ feedback_fn(" - NOTICE: %d offline instance(s) found." % i_offline)
+
+ if n_offline:
+ feedback_fn(" - NOTICE: %d offline node(s) found." % n_offline)
+
+ if n_drained:
+ feedback_fn(" - NOTICE: %d drained node(s) found." % n_drained)
+
+ def Exec(self, feedback_fn):
+ """Verify integrity of the node group, performing various test on nodes.
+
+ """
+ # This method has too many local variables. pylint: disable=R0914
+ feedback_fn("* Verifying group '%s'" % self.group_info.name)
+
+ if not self.my_node_uuids:
+ # empty node group
+ feedback_fn("* Empty node group, skipping verification")
+ return True
+
+ self.bad = False
+ verbose = self.op.verbose
+ self._feedback_fn = feedback_fn
+
+ vg_name = self.cfg.GetVGName()
+ drbd_helper = self.cfg.GetDRBDHelper()
+ cluster = self.cfg.GetClusterInfo()
+ hypervisors = cluster.enabled_hypervisors
+ node_data_list = self.my_node_info.values()
+
+ i_non_redundant = [] # Non redundant instances
+ i_non_a_balanced = [] # Non auto-balanced instances
+ i_offline = 0 # Count of offline instances
+ n_offline = 0 # Count of offline nodes
+ n_drained = 0 # Count of nodes being drained
+ node_vol_should = {}
+
+ # FIXME: verify OS list
+
+ # File verification
+ filemap = ComputeAncillaryFiles(cluster, False)
+
+ # do local checksums
+ master_node_uuid = self.master_node = self.cfg.GetMasterNode()
+ master_ip = self.cfg.GetMasterIP()
+
+ feedback_fn("* Gathering data (%d nodes)" % len(self.my_node_uuids))
+
+ user_scripts = []
+ if self.cfg.GetUseExternalMipScript():
+ user_scripts.append(pathutils.EXTERNAL_MASTER_SETUP_SCRIPT)
+
+ node_verify_param = {
+ constants.NV_FILELIST:
+ map(vcluster.MakeVirtualPath,
+ utils.UniqueSequence(filename
+ for files in filemap
+ for filename in files)),
+ constants.NV_NODELIST:
+ self._SelectSshCheckNodes(node_data_list, self.group_uuid,
+ self.all_node_info.values()),
+ constants.NV_HYPERVISOR: hypervisors,
+ constants.NV_HVPARAMS:
+ _GetAllHypervisorParameters(cluster, self.all_inst_info.values()),
+ constants.NV_NODENETTEST: [(node.name, node.primary_ip, node.secondary_ip)
+ for node in node_data_list
+ if not node.offline],
+ constants.NV_INSTANCELIST: hypervisors,
+ constants.NV_VERSION: None,
+ constants.NV_HVINFO: self.cfg.GetHypervisorType(),
+ constants.NV_NODESETUP: None,
+ constants.NV_TIME: None,
+ constants.NV_MASTERIP: (self.cfg.GetMasterNodeName(), master_ip),
+ constants.NV_OSLIST: None,
+ constants.NV_NONVMNODES: self.cfg.GetNonVmCapableNodeNameList(),
+ constants.NV_USERSCRIPTS: user_scripts,
+ constants.NV_CLIENT_CERT: None,
+ }
+
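node_verify_param is effectively a request map: each NV_* key names one check, its value carries whatever input that check needs (a file list, hypervisor names, node/IP triples, or None when no input is required), and the node replies with one result per key. A hedged sketch of that request/response shape, with invented key names and handlers rather than the real NV_* constants and noded dispatch:

  # Invented check names and handlers, for illustration only.
  params = {
    "version": None,               # no input needed
    "hypervisor": ["kvm", "lxc"],  # which hypervisors to probe
    "vglist": None,
  }

  def run_checks(params):
    handlers = {
      "version": lambda _: (2, 13),
      "hypervisor": lambda hvs: dict((hv, "ok") for hv in hvs),
      "vglist": lambda _: {"xenvg": 51200},
    }
    return dict((key, handlers[key](arg)) for key, arg in params.items())

  print(run_checks(params))
  # {'version': (2, 13), 'hypervisor': {'kvm': 'ok', 'lxc': 'ok'},
  #  'vglist': {'xenvg': 51200}}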
+ if self.cfg.GetClusterInfo().modify_ssh_setup:
+ node_verify_param[constants.NV_SSH_SETUP] = self._PrepareSshSetupCheck()
++ if self.op.verify_clutter:
++ node_verify_param[constants.NV_SSH_CLUTTER] = True
+
+ if vg_name is not None:
+ node_verify_param[constants.NV_VGLIST] = None
+ node_verify_param[constants.NV_LVLIST] = vg_name
+ node_verify_param[constants.NV_PVLIST] = [vg_name]
+
+ if cluster.IsDiskTemplateEnabled(constants.DT_DRBD8):
+ if drbd_helper:
+ node_verify_param[constants.NV_DRBDVERSION] = None
+ node_verify_param[constants.NV_DRBDLIST] = None
+ node_verify_param[constants.NV_DRBDHELPER] = drbd_helper
+
+ if cluster.IsFileStorageEnabled() or \
+ cluster.IsSharedFileStorageEnabled():
+ # Load file storage paths only from master node
+ node_verify_param[constants.NV_ACCEPTED_STORAGE_PATHS] = \
+ self.cfg.GetMasterNodeName()
+ if cluster.IsFileStorageEnabled():
+ node_verify_param[constants.NV_FILE_STORAGE_PATH] = \
+ cluster.file_storage_dir
+ if cluster.IsSharedFileStorageEnabled():
+ node_verify_param[constants.NV_SHARED_FILE_STORAGE_PATH] = \
+ cluster.shared_file_storage_dir
+
+ # bridge checks
+ # FIXME: this needs to be changed per node-group, not cluster-wide
+ bridges = set()
+ default_nicpp = cluster.nicparams[constants.PP_DEFAULT]
+ if default_nicpp[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
+ bridges.add(default_nicpp[constants.NIC_LINK])
+ for instance in self.my_inst_info.values():
+ for nic in instance.nics:
+ full_nic = cluster.SimpleFillNIC(nic.nicparams)
+ if full_nic[constants.NIC_MODE] == constants.NIC_MODE_BRIDGED:
+ bridges.add(full_nic[constants.NIC_LINK])
+
+ if bridges:
+ node_verify_param[constants.NV_BRIDGES] = list(bridges)
+
+ # Build our expected cluster state
+ node_image = dict((node.uuid, self.NodeImage(offline=node.offline,
+ uuid=node.uuid,
+ vm_capable=node.vm_capable))
+ for node in node_data_list)
+
+ # Gather OOB paths
+ oob_paths = []
+ for node in self.all_node_info.values():
+ path = SupportsOob(self.cfg, node)
+ if path and path not in oob_paths:
+ oob_paths.append(path)
+
+ if oob_paths:
+ node_verify_param[constants.NV_OOB_PATHS] = oob_paths
+
+ for inst_uuid in self.my_inst_uuids:
+ instance = self.my_inst_info[inst_uuid]
+ if instance.admin_state == constants.ADMINST_OFFLINE:
+ i_offline += 1
+
+ inst_nodes = self.cfg.GetInstanceNodes(instance.uuid)
+ for nuuid in inst_nodes:
+ if nuuid not in node_image:
+ gnode = self.NodeImage(uuid=nuuid)
+ gnode.ghost = (nuuid not in self.all_node_info)
+ node_image[nuuid] = gnode
+
+ self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
+
+ pnode = instance.primary_node
+ node_image[pnode].pinst.append(instance.uuid)
+
+ for snode in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
+ nimg = node_image[snode]
+ nimg.sinst.append(instance.uuid)
+ if pnode not in nimg.sbp:
+ nimg.sbp[pnode] = []
+ nimg.sbp[pnode].append(instance.uuid)
+
+ es_flags = rpc.GetExclusiveStorageForNodes(self.cfg,
+ self.my_node_info.keys())
+ # The value of exclusive_storage should be the same across the group, so if
+ # it's True for at least one node, we act as if it were set for all the nodes
+ self._exclusive_storage = compat.any(es_flags.values())
+ if self._exclusive_storage:
+ node_verify_param[constants.NV_EXCLUSIVEPVS] = True
+
+ node_group_uuids = dict(map(lambda n: (n.name, n.group),
+ self.cfg.GetAllNodesInfo().values()))
+ groups_config = self.cfg.GetAllNodeGroupsInfoDict()
+
+ # At this point, we have the in-memory data structures complete,
+ # except for the runtime information, which we'll gather next
+
+ # NOTE: Here we lock the configuration for the duration of RPC calls,
+ # which means that the cluster configuration changes are blocked during
+ # this period.
+ # This is something that should be done only exceptionally and only for
+ # justified cases!
+ # In this case, we need the lock because we can verify the integrity of
+ # configuration files on MCs only if we know nobody else is modifying them.
+ # FIXME: The check for integrity of config.data should be moved to
+ # WConfD, which is the only one who can otherwise ensure nobody
+ # will modify the configuration during the check.
+ with self.cfg.GetConfigManager(shared=True, forcelock=True):
+ feedback_fn("* Gathering information about nodes (%s nodes)" %
+ len(self.my_node_uuids))
+ # Force the configuration to be fully distributed before doing any tests
+ self.cfg.FlushConfig()
+ # Due to the way our RPC system works, exact response times cannot be
+ # guaranteed (e.g. a broken node could run into a timeout). By keeping
+ # the time before and after executing the request, we can at least have
+ # a time window.
+ nvinfo_starttime = time.time()
+ # Get lock on the configuration so that nobody modifies it concurrently.
+ # Otherwise it can be modified by other jobs, failing the consistency
+ # test.
+ # NOTE: This is an exceptional situation, we should otherwise avoid
+ # locking the configuration for anything but very fast, pure operations.
+ cluster_name = self.cfg.GetClusterName()
+ hvparams = self.cfg.GetClusterInfo().hvparams
+ all_nvinfo = self.rpc.call_node_verify(self.my_node_uuids,
+ node_verify_param,
+ cluster_name,
+ hvparams,
+ node_group_uuids,
+ groups_config)
+ nvinfo_endtime = time.time()
+
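Keeping timestamps from both sides of the RPC turns an unmeasurable quantity (when exactly each node answered) into a usable window: a node's reported clock can then be accepted whenever it falls inside the padded [starttime, endtime] interval, which is presumably how the node-time check consumes these values. A minimal sketch of such a window test, with a made-up drift allowance:

  import time

  MAX_DRIFT = 150  # made-up allowance for clock skew, in seconds

  def node_time_ok(node_time, starttime, endtime):
    # Accept anything inside the observed request window, padded by the
    # allowed drift on both sides.
    return (starttime - MAX_DRIFT) <= node_time <= (endtime + MAX_DRIFT)

  start = time.time()
  # ... the node RPC would happen here ...
  end = time.time()
  print(node_time_ok(end + 10, start, end))       # True: within the window
  print(node_time_ok(end + 10 ** 6, start, end))  # False: clock is way off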
+ if self.extra_lv_nodes and vg_name is not None:
+ feedback_fn("* Gathering information about extra nodes (%s nodes)" %
+ len(self.extra_lv_nodes))
+ extra_lv_nvinfo = \
+ self.rpc.call_node_verify(self.extra_lv_nodes,
+ {constants.NV_LVLIST: vg_name},
+ self.cfg.GetClusterName(),
+ self.cfg.GetClusterInfo().hvparams,
+ node_group_uuids,
+ groups_config)
+ else:
+ extra_lv_nvinfo = {}
+
+ # If not all nodes are being checked, we need to make sure the master
+ # node and a non-checked vm_capable node are in the list.
+ absent_node_uuids = set(self.all_node_info).difference(self.my_node_info)
+ if absent_node_uuids:
+ vf_nvinfo = all_nvinfo.copy()
+ vf_node_info = list(self.my_node_info.values())
+ additional_node_uuids = []
+ if master_node_uuid not in self.my_node_info:
+ additional_node_uuids.append(master_node_uuid)
+ vf_node_info.append(self.all_node_info[master_node_uuid])
+ # Add the first vm_capable node we find which is not included,
+ # excluding the master node (which we already have)
+ for node_uuid in absent_node_uuids:
+ nodeinfo = self.all_node_info[node_uuid]
+ if (nodeinfo.vm_capable and not nodeinfo.offline and
+ node_uuid != master_node_uuid):
+ additional_node_uuids.append(node_uuid)
+ vf_node_info.append(self.all_node_info[node_uuid])
+ break
+ key = constants.NV_FILELIST
+
+ feedback_fn("* Gathering information about the master node")
+ vf_nvinfo.update(self.rpc.call_node_verify(
+ additional_node_uuids, {key: node_verify_param[key]},
+ self.cfg.GetClusterName(), self.cfg.GetClusterInfo().hvparams,
+ node_group_uuids,
+ groups_config))
+ else:
+ vf_nvinfo = all_nvinfo
+ vf_node_info = self.my_node_info.values()
+
+ all_drbd_map = self.cfg.ComputeDRBDMap()
+
+ feedback_fn("* Gathering disk information (%s nodes)" %
+ len(self.my_node_uuids))
+ instdisk = self._CollectDiskInfo(self.my_node_info.keys(), node_image,
+ self.my_inst_info)
+
+ feedback_fn("* Verifying configuration file consistency")
+
+ self._VerifyClientCertificates(self.my_node_info.values(), all_nvinfo)
+ if self.cfg.GetClusterInfo().modify_ssh_setup:
+ self._VerifySshSetup(self.my_node_info.values(), all_nvinfo)
+ self._VerifyFiles(vf_node_info, master_node_uuid, vf_nvinfo, filemap)
+
+ feedback_fn("* Verifying node status")
+
+ refos_img = None
+
+ for node_i in node_data_list:
+ nimg = node_image[node_i.uuid]
+
+ if node_i.offline:
+ if verbose:
+ feedback_fn("* Skipping offline node %s" % (node_i.name,))
+ n_offline += 1
+ continue
+
+ if node_i.uuid == master_node_uuid:
+ ntype = "master"
+ elif node_i.master_candidate:
+ ntype = "master candidate"
+ elif node_i.drained:
+ ntype = "drained"
+ n_drained += 1
+ else:
+ ntype = "regular"
+ if verbose:
+ feedback_fn("* Verifying node %s (%s)" % (node_i.name, ntype))
+
+ msg = all_nvinfo[node_i.uuid].fail_msg
+ self._ErrorIf(msg, constants.CV_ENODERPC, node_i.name,
+ "while contacting node: %s", msg)
+ if msg:
+ nimg.rpc_fail = True
+ continue
+
+ nresult = all_nvinfo[node_i.uuid].payload
+
+ nimg.call_ok = self._VerifyNode(node_i, nresult)
+ self._VerifyNodeTime(node_i, nresult, nvinfo_starttime, nvinfo_endtime)
+ self._VerifyNodeNetwork(node_i, nresult)
+ self._VerifyNodeUserScripts(node_i, nresult)
+ self._VerifyOob(node_i, nresult)
+ self._VerifyAcceptedFileStoragePaths(node_i, nresult,
+ node_i.uuid == master_node_uuid)
+ self._VerifyFileStoragePaths(node_i, nresult)
+ self._VerifySharedFileStoragePaths(node_i, nresult)
+ self._VerifyGlusterStoragePaths(node_i, nresult)
+
+ if nimg.vm_capable:
+ self._UpdateVerifyNodeLVM(node_i, nresult, vg_name, nimg)
+ if constants.DT_DRBD8 in cluster.enabled_disk_templates:
+ self._VerifyNodeDrbd(node_i, nresult, self.all_inst_info,
+ self.all_disks_info, drbd_helper, all_drbd_map)
+
+ if (constants.DT_PLAIN in cluster.enabled_disk_templates) or \
+ (constants.DT_DRBD8 in cluster.enabled_disk_templates):
+ self._UpdateNodeVolumes(node_i, nresult, nimg, vg_name)
+ self._UpdateNodeInstances(node_i, nresult, nimg)
+ self._UpdateNodeInfo(node_i, nresult, nimg, vg_name)
+ self._UpdateNodeOS(node_i, nresult, nimg)
+
+ if not nimg.os_fail:
+ if refos_img is None:
+ refos_img = nimg
+ self._VerifyNodeOS(node_i, nimg, refos_img)
+ self._VerifyNodeBridges(node_i, nresult, bridges)
+
+ # Check whether all running instances are primary for the node. (This
+ # can no longer be done from _VerifyInstance below, since some of the
+ # wrong instances could be from other node groups.)
+ non_primary_inst_uuids = set(nimg.instances).difference(nimg.pinst)
+
+ for inst_uuid in non_primary_inst_uuids:
+ test = inst_uuid in self.all_inst_info
+ self._ErrorIf(test, constants.CV_EINSTANCEWRONGNODE,
+ self.cfg.GetInstanceName(inst_uuid),
+ "instance should not run on node %s", node_i.name)
+ self._ErrorIf(not test, constants.CV_ENODEORPHANINSTANCE, node_i.name,
+ "node is running unknown instance %s", inst_uuid)
+
+ self._VerifyGroupDRBDVersion(all_nvinfo)
+ self._VerifyGroupLVM(node_image, vg_name)
+
+ for node_uuid, result in extra_lv_nvinfo.items():
+ self._UpdateNodeVolumes(self.all_node_info[node_uuid], result.payload,
+ node_image[node_uuid], vg_name)
+
+ feedback_fn("* Verifying instance status")
+ for inst_uuid in self.my_inst_uuids:
+ instance = self.my_inst_info[inst_uuid]
+ if verbose:
+ feedback_fn("* Verifying instance %s" % instance.name)
+ self._VerifyInstance(instance, node_image, instdisk[inst_uuid])
+
+ # If the instance is not fully redundant we cannot survive losing its
+ # primary node, so we are not N+1 compliant.
+ inst_disks = self.cfg.GetInstanceDisks(instance.uuid)
+ if not utils.AllDiskOfType(inst_disks, constants.DTS_MIRRORED):
+ i_non_redundant.append(instance)
+
+ if not cluster.FillBE(instance)[constants.BE_AUTO_BALANCE]:
+ i_non_a_balanced.append(instance)
+
+ feedback_fn("* Verifying orphan volumes")
+ reserved = utils.FieldSet(*cluster.reserved_lvs)
+
+ # We will get spurious "unknown volume" warnings if any node of this group
+ # is secondary for an instance whose primary is in another group. To avoid
+ # them, we find these instances and add their volumes to node_vol_should.
+ for instance in self.all_inst_info.values():
+ for secondary in self.cfg.GetInstanceSecondaryNodes(instance.uuid):
+ if (secondary in self.my_node_info
+ and instance.name not in self.my_inst_info):
+ self.cfg.GetInstanceLVsByNode(instance.uuid, lvmap=node_vol_should)
+ break
+
+ self._VerifyOrphanVolumes(node_vol_should, node_image, reserved)
+
+ if constants.VERIFY_NPLUSONE_MEM not in self.op.skip_checks:
+ feedback_fn("* Verifying N+1 Memory redundancy")
+ self._VerifyNPlusOneMemory(node_image, self.my_inst_info)
+
+ self._VerifyOtherNotes(feedback_fn, i_non_redundant, i_non_a_balanced,
+ i_offline, n_offline, n_drained)
+
+ return not self.bad
+
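The wrong-node/orphan classification near the end of Exec reduces to two set operations: whatever a node reports beyond its expected primaries is suspect, and whether the cluster knows the instance at all decides which of the two errors is raised. A standalone sketch with stand-in data:

  reported = set(["inst-1", "inst-2", "inst-9"])  # what the node runs
  expected_primaries = set(["inst-1"])            # what the config expects
  known_instances = set(["inst-1", "inst-2"])     # all cluster instances

  for inst in sorted(reported - expected_primaries):
    if inst in known_instances:
      print("%s: instance should not run on this node" % inst)
    else:
      print("%s: node is running unknown instance" % inst)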
+ def HooksCallBack(self, phase, hooks_results, feedback_fn, lu_result):
+ """Analyze the post-hooks' result
+
+ This method analyzes the hook result, handles it, and sends some
+ nicely-formatted feedback back to the user.
+
+ @param phase: one of L{constants.HOOKS_PHASE_POST} or
+ L{constants.HOOKS_PHASE_PRE}; it denotes the hooks phase
+ @param hooks_results: the results of the multi-node hooks rpc call
+ @param feedback_fn: function used to send feedback back to the caller
+ @param lu_result: previous Exec result
+ @return: the new Exec result, based on the previous result
+ and hook results
+
+ """
+ # We only really run POST phase hooks, only for non-empty groups,
+ # and are only interested in their results
+ if not self.my_node_uuids:
+ # empty node group
+ pass
+ elif phase == constants.HOOKS_PHASE_POST:
+ # Used to change hooks' output to proper indentation
+ feedback_fn("* Hooks Results")
+ assert hooks_results, "invalid result from hooks"
+
+ for node_name in hooks_results:
+ res = hooks_results[node_name]
+ msg = res.fail_msg
+ test = msg and not res.offline
+ self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
+ "Communication failure in hooks execution: %s", msg)
+ if test:
+ lu_result = False
+ continue
+ if res.offline:
+ # No need to investigate payload if node is offline
+ continue
+ for script, hkr, output in res.payload:
+ test = hkr == constants.HKR_FAIL
+ self._ErrorIf(test, constants.CV_ENODEHOOKS, node_name,
+ "Script %s failed, output:", script)
+ if test:
+ output = self._HOOKS_INDENT_RE.sub(" ", output)
+ feedback_fn("%s" % output)
+ lu_result = False
+
+ return lu_result
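Each node's hook payload is a list of (script, status, output) triples, and a single failed script is enough to flip the overall result while the remaining output is still shown. A toy run of the same loop, with stand-in constants and data:

  HKR_FAIL = "FAIL"  # stand-in for constants.HKR_FAIL

  def analyze_hooks(hooks_results, lu_result=True):
    for node_name, payload in hooks_results.items():
      for script, status, output in payload:
        if status == HKR_FAIL:
          print("Script %s failed on %s, output:" % (script, node_name))
          print("      %s" % output)
          lu_result = False
    return lu_result

  results = {
    "node1": [("10-check", "SUCCESS", ""),
              ("20-custom", HKR_FAIL, "disk check failed")],
  }
  print(analyze_hooks(results))  # False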
diff --cc tools/cfgupgrade
index 7c1a119,398ac70..8f6a20e
--- a/tools/cfgupgrade
+++ b/tools/cfgupgrade
@@@ -529,52 -470,69 +529,80 @@@ def UpgradeAll(config_data)
# DOWNGRADE ------------------------------------------------------------
-def DowngradeExtAccess(config_data):
- # Remove 'access' for ext storage from cluster diskparams
- cluster_extparams = config_data["cluster"]["diskparams"].get("ext", None)
- if (cluster_extparams is not None and
- "access" in cluster_extparams):
- del cluster_extparams["access"]
+def _RecursiveRemoveNodes(disk):
+ if "nodes" in disk:
+ del disk["nodes"]
+ for disk in disk.get("children", []):
+ _RecursiveRemoveNodes(disk)
- # Remove 'access' for ext storage from nodegroup diskparams
- for group in config_data["nodegroups"].values():
- group_extparams = group["diskparams"].get("ext", None)
- if (group_extparams is not None and
- "access" in group_extparams):
- del group_extparams["access"]
+def DowngradeDiskNodes(config_data):
+ if "disks" not in config_data:
+ raise Error("Can't find the 'disks' dictionary in the configuration.")
+ for disk in config_data["disks"].itervalues():
+ _RecursiveRemoveNodes(disk)
-def DowngradeDataCollectors(config_data):
- cluster = config_data["cluster"]
- if "data_collectors" in cluster:
- del cluster["data_collectors"]
+
+def DowngradeForthcomingInstances(config_data):
+ if "instances" not in config_data:
+ raise Error("Can't find the 'instances' dictionary in the configuration.")
+ instances = config_data["instances"]
+ uuids = instances.keys()
+ for uuid in uuids:
+ if instances[uuid].get("forthcoming"):
+ del instances[uuid]
-def DowngradeFilters(config_data):
- if "filters" in config_data:
- del config_data["filters"]
+def DowngradeForthcomingDisks(config_data):
+ if "instances" not in config_data:
+ raise Error("Can't find the 'instances' dictionary in the configuration.")
+ instances = config_data["instances"]
+ if "disks" not in config_data:
+ raise Error("Can't find the 'disks' dictionary in the configuration.")
+ disks = config_data["disks"]
+ uuids = disks.keys()
+ for uuid in uuids:
+ if disks[uuid].get("forthcoming"):
+ del disks[uuid]
+ for inst in instances.values():
+ if "disks" in inst and uuid in inst["disks"]:
+ inst["disks"].remove(uuid)
+ def DowngradeLxcParams(hvparams):
+ hv = "lxc"
+ if hv not in hvparams:
+ return
+
+ params_to_del = [
+ "devices",
+ "drop_capabilities",
+ "extra_cgroups",
+ "extra_config",
+ "num_ttys",
+ "startup_timeout",
+ ]
+ for param in params_to_del:
+ if param in hvparams[hv]:
+ del hvparams[hv][param]
+
+
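On downgrade the renamed and newly added LXC parameters are simply deleted, since the older release would reject them as unknown. A quick illustration of the deletion on a sample hvparams dict (the values are made up):

  hvparams = {
    "lxc": {"startup_timeout": 30, "num_ttys": 6, "cpu_mask": "all"},
  }

  for param in ["devices", "drop_capabilities", "extra_cgroups",
                "extra_config", "num_ttys", "startup_timeout"]:
    if param in hvparams["lxc"]:
      del hvparams["lxc"][param]

  print(hvparams)  # {'lxc': {'cpu_mask': 'all'}}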
+ def DowngradeAllLxcParams(config_data):
+ cluster = config_data["cluster"]
+ if "hvparams" in cluster:
+ DowngradeLxcParams(cluster["hvparams"])
+
+ for iobj in cluster.get("instances", {}):
+ if "hvparams" in iobj:
+ DowngradeLxcParams(iobj["hvparams"])
+
+
def DowngradeAll(config_data):
- # Any code specific to a particular version should be labeled that way, so
- # it can be removed when updating to the next version.
config_data["version"] = version.BuildVersion(DOWNGRADE_MAJOR,
DOWNGRADE_MINOR, 0)
- DowngradeExtAccess(config_data)
- DowngradeDataCollectors(config_data)
- DowngradeFilters(config_data)
- DowngradeAllLxcParams(config_data)
+ DowngradeForthcomingInstances(config_data)
+ DowngradeForthcomingDisks(config_data)
+ DowngradeDiskNodes(config_data)
def _ParseOptions():
--
Klaus Aehlig
Google Germany GmbH, Dienerstr. 12, 80331 Muenchen
Registergericht und -nummer: Hamburg, HRB 86891
Sitz der Gesellschaft: Hamburg
Geschaeftsfuehrer: Graham Law, Christine Elizabeth Flores