Hello Martin Peřina, Federico Simoncelli,

I'd like you to do a code review.  Please visit

    http://gerrit.ovirt.org/31148

to review the following change.

Change subject: fencing: Introduce getHostLeaseStatus API
......................................................................

fencing: Introduce getHostLeaseStatus API

When fencing another host, we like to check if the other host has access
to storage, to prevent unwanted fencing of a lively host. This patch
adds an internal API for getting another host status on monitored
domains.

The new getHostLeaseStatus API returns a dictionary of monitored domains
UUIDs and host lease status for each domain, as reported by the cluster
lock.  The caller will apply a fencing policy using this data to decide
if host is lively enough to prevent fencing.

Here is an example result:

    {'04e604d3-71eb-41e0-a8a3-74404bdd9d75': 'live',
     '09049dc6-2007-492e-afd9-3ec201775b2a': 'dead',
     '20501ea5-cf6b-453a-b23d-a708c28cec1f': 'live',
     '2b02c759-2e4a-4cf0-9b8e-37a6ec77f821': 'unavailable',
     ...
     'fc348be8-18f4-42ff-8421-9043203e13a6': 'live'}

See clusterlock.py for the possible statuses and their meaning.

Change-Id: Iccd62e58a194aa0ceb0f5e2503b8ec7e4349971b
Bug-Url: https://bugzilla.redhat.com/1110172
Signed-off-by: Nir Soffer <[email protected]>
Reviewed-on: http://gerrit.ovirt.org/28873
Reviewed-by: Martin Peřina <[email protected]>
Tested-by: Martin Peřina <[email protected]>
Reviewed-by: Federico Simoncelli <[email protected]>
---
M tests/testrunner.py
M vdsm/storage/clusterlock.py
M vdsm/storage/domainMonitor.py
M vdsm/storage/hsm.py
M vdsm/storage/sd.py
5 files changed, 111 insertions(+), 2 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/vdsm refs/changes/48/31148/1

diff --git a/tests/testrunner.py b/tests/testrunner.py
index eed743f..37b4c95 100644
--- a/tests/testrunner.py
+++ b/tests/testrunner.py
@@ -52,8 +52,19 @@
 
 PERMUTATION_ATTR = "_permutations_"
 
-# At the moment of this writing the tests don't need a mock object for sanlock
-sys.modules.update({'sanlock': object()})
+
+class FakeSanlock(object):
+    """
+    Minimal test double exposing what the tests needs at this point.
+    """
+    HOST_UNKNOWN = 1
+    HOST_FREE = 2
+    HOST_LIVE = 3
+    HOST_FAIL = 4
+    HOST_DEAD = 5
+
+
+sys.modules.update({'sanlock': FakeSanlock()})
 
 
 def dummyTextGenerator(size):
diff --git a/vdsm/storage/clusterlock.py b/vdsm/storage/clusterlock.py
index 6ca3e03..24a5d81 100644
--- a/vdsm/storage/clusterlock.py
+++ b/vdsm/storage/clusterlock.py
@@ -41,6 +41,33 @@
 SDM_LEASE_NAME = 'SDM'
 SDM_LEASE_OFFSET = 512 * 2048
 
+# Host status - currently only sanlock supports this, and the documentaion
+# describes the sanlock implementation. For more info see:
+# https://git.fedorahosted.org/cgit/sanlock.git/tree/src/lockspace.c
+
+# Cannot tell because clusterlock does not implement this or call failed.
+HOST_STATUS_UNAVAILABLE = "unavailable"
+
+# Host has a lease on the storage, but the clusterlock cannot tell if the host
+# is live or dead yet. Would typically last for 10-20 seconds, but it's
+# possible that this could persist for up to 80 seconds before host is
+# considered live or fail.
+HOST_STATUS_UNKNOWN = "unknown"
+
+# There is no lease for this host id.
+HOST_STATUS_FREE = "free"
+
+# Host has renewed its lease in the last 80 seconds. It may be renewing its
+# lease now or not, we can tell that only by checking again later.
+HOST_STATUS_LIVE = "live"
+
+# Host has not renewed its lease for 80 seconds. Would last for 60 seconds
+# before host is considered dead.
+HOST_STATUS_FAIL = "fail"
+
+# Host has not renewed its lease for 140 seconds.
+HOST_STATUS_DEAD = "dead"
+
 
 class InquireNotSupportedError(Exception):
     """Raised when the clusterlock class is not supporting inquire"""
@@ -90,6 +117,9 @@
 
     def hasHostId(self, hostId):
         return True
+
+    def getHostStatus(self, hostId):
+        return HOST_STATUS_UNAVAILABLE
 
     def acquire(self, hostID):
         leaseTimeMs = self._leaseTimeSec * 1000
@@ -153,6 +183,15 @@
 
 
 class SANLock(object):
+
+    STATUS_NAME = {
+        sanlock.HOST_UNKNOWN: HOST_STATUS_UNKNOWN,
+        sanlock.HOST_FREE: HOST_STATUS_FREE,
+        sanlock.HOST_LIVE: HOST_STATUS_LIVE,
+        sanlock.HOST_FAIL: HOST_STATUS_FAIL,
+        sanlock.HOST_DEAD: HOST_STATUS_DEAD,
+    }
+
     log = logging.getLogger("Storage.SANLock")
 
     _sanlock_fd = None
@@ -224,6 +263,28 @@
                 self.log.debug("Unable to inquire sanlock lockspace "
                                "status, returning False", exc_info=True)
                 return False
+
+    def getHostStatus(self, hostId):
+        # Note: get_hosts has off-by-one bug when asking for particular host
+        # id, so get all hosts info and filter.
+        # See https://bugzilla.redhat.com/1111210
+        try:
+            hosts = sanlock.get_hosts(self._sdUUID)
+        except sanlock.SanlockException as e:
+            self.log.debug("Unable to get host %d status in lockspace %s: %s",
+                           hostId, self._sdUUID, e)
+            return HOST_STATUS_UNAVAILABLE
+
+        for info in hosts:
+            if info['host_id'] == hostId:
+                status = info['flags']
+                return self.STATUS_NAME[status]
+
+        # get_hosts with host_id=0 returns only hosts with timestamp != 0,
+        # which means that no host is using this host id now. If there a
+        # timestamp, sanlock will return HOST_UNKNOWN and then HOST_LIVE or
+        # HOST_FAIL.
+        return HOST_STATUS_FREE
 
     # The hostId parameter is maintained here only for compatibility with
     # ClusterLock. We could consider to remove it in the future but keeping it
@@ -358,6 +419,9 @@
             currentHostId, lockFile = self._getLease()
             return currentHostId == hostId
 
+    def getHostStatus(self, hostId):
+        return HOST_STATUS_UNAVAILABLE
+
     def acquire(self, hostId):
         with self._globalLockMapSync:
             self.log.info("Acquiring local lock for domain %s (id: %s)",
diff --git a/vdsm/storage/domainMonitor.py b/vdsm/storage/domainMonitor.py
index 8a50b4f..d9e515c 100644
--- a/vdsm/storage/domainMonitor.py
+++ b/vdsm/storage/domainMonitor.py
@@ -23,6 +23,7 @@
 import weakref
 
 import logging
+import clusterlock
 import misc
 from vdsm import utils
 from vdsm.config import config
@@ -121,6 +122,17 @@
     def getStatus(self, sdUUID):
         return self._domains[sdUUID].getStatus()
 
+    def getHostStatus(self, domains):
+        status = {}
+        for sdUUID, hostId in domains.iteritems():
+            try:
+                monitor = self._domains[sdUUID]
+            except KeyError:
+                status[sdUUID] = clusterlock.HOST_STATUS_UNAVAILABLE
+            else:
+                status[sdUUID] = monitor.getHostStatus(hostId)
+        return status
+
     def close(self):
         self.log.info("Stopping domain monitors")
         for sdUUID in self._domains.keys():
@@ -160,6 +172,11 @@
     def getStatus(self):
         return self.status.copy()
 
+    def getHostStatus(self, hostId):
+        if not self.domain:
+            return clusterlock.HOST_STATUS_UNAVAILABLE
+        return self.domain.getHostStatus(hostId)
+
     @utils.traceback(on=log.name)
     def _monitorLoop(self):
         self.log.debug("Starting domain monitor for %s", self.sdUUID)
diff --git a/vdsm/storage/hsm.py b/vdsm/storage/hsm.py
index cbdf108..d743759 100644
--- a/vdsm/storage/hsm.py
+++ b/vdsm/storage/hsm.py
@@ -3647,3 +3647,17 @@
         with rmanager.acquireResource(STORAGE, HSM_DOM_MON_LOCK,
                                       rm.LockType.exclusive):
             self.domainMonitor.stopMonitoring(sdUUID)
+
+    @public
+    def getHostLeaseStatus(self, domains):
+        """
+        Returns host lease status for specified domains.
+
+        Warning: Internal use only.
+
+        :param domains:     mapping of host id indexed by domain uuid.
+        :returns:           mapping of host lease status indexed by domain
+                            uuid.  See clusterlock.py for possible values and
+                            their meaning.
+        """
+        return {'domains': self.domainMonitor.getHostStatus(domains)}
diff --git a/vdsm/storage/sd.py b/vdsm/storage/sd.py
index c65fd03..b34cc79 100644
--- a/vdsm/storage/sd.py
+++ b/vdsm/storage/sd.py
@@ -482,6 +482,9 @@
     def hasHostId(self, hostId):
         return self._clusterLock.hasHostId(hostId)
 
+    def getHostStatus(self, hostId):
+        return self._clusterLock.getHostStatus(hostId)
+
     def hasVolumeLeases(self):
         domVersion = self.getVersion()
         try:


-- 
To view, visit http://gerrit.ovirt.org/31148
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iccd62e58a194aa0ceb0f5e2503b8ec7e4349971b
Gerrit-PatchSet: 1
Gerrit-Project: vdsm
Gerrit-Branch: ovirt-3.5
Gerrit-Owner: Nir Soffer <[email protected]>
Gerrit-Reviewer: Federico Simoncelli <[email protected]>
Gerrit-Reviewer: Martin Peřina <[email protected]>
_______________________________________________
vdsm-patches mailing list
[email protected]
https://lists.fedorahosted.org/mailman/listinfo/vdsm-patches

Reply via email to