Jiří Moskovčák has uploaded a new change for review.

Change subject: try harder when initializing
......................................................................

try harder when initializing

The storage initialization is an async task, so sometimes
we try to access files before they exist; in
that case we should just wait a while and try again.

Change-Id: Iac39bedfa78479f71674c3f5e673ba3814f52279
Signed-off-by: Jiri Moskovcak <[email protected]>
---
M ovirt_hosted_engine_ha/agent/constants.py.in
M ovirt_hosted_engine_ha/agent/hosted_engine.py
M ovirt_hosted_engine_ha/lib/exceptions.py
3 files changed, 67 insertions(+), 35 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-hosted-engine-ha 
refs/changes/37/29937/1

diff --git a/ovirt_hosted_engine_ha/agent/constants.py.in 
b/ovirt_hosted_engine_ha/agent/constants.py.in
index 869b833..df94c8e 100644
--- a/ovirt_hosted_engine_ha/agent/constants.py.in
+++ b/ovirt_hosted_engine_ha/agent/constants.py.in
@@ -88,3 +88,6 @@
 
 # The length of history
 STATS_HISTORY_SECS = 15 * 60
+
+WAIT_FOR_STORAGE_RETRY = 5
+WAIT_FOR_STORAGE_DELAY = 5
diff --git a/ovirt_hosted_engine_ha/agent/hosted_engine.py 
b/ovirt_hosted_engine_ha/agent/hosted_engine.py
index 6078720..5b79950 100644
--- a/ovirt_hosted_engine_ha/agent/hosted_engine.py
+++ b/ovirt_hosted_engine_ha/agent/hosted_engine.py
@@ -295,10 +295,14 @@
         error_count = 0
 
         # make sure everything is initialized
-        self._initialize_broker()
+        # VDSM has to be initialized first, because it prepares the
+        # storage domain connection
+        # Broker then initializes the pieces needed for metadata and leases
+        # which are then used by sanlock
         self._initialize_vdsm()
-        self._initialize_sanlock()
         self._initialize_domain_monitor()
+        self._initialize_broker()
+        self._initialize_sanlock()
 
         for old_state, state, delay in self.fsm:
             if self._shutdown_requested_callback():
@@ -312,10 +316,10 @@
 
             try:
                 # make sure everything is still initialized
-                self._initialize_broker()
                 self._initialize_vdsm()
-                self._initialize_sanlock()
                 self._initialize_domain_monitor()
+                self._initialize_broker()
+                self._initialize_sanlock()
 
                 # log state
                 self._log.info("Current state %s (score: %d)",
@@ -383,6 +387,7 @@
                 raise
             else:
                 self._local_monitors[m['field']] = lm
+
         self._log.info("Broker initialized, all submonitors started")
 
     def _initialize_vdsm(self):
@@ -470,37 +475,53 @@
                            " is acquired (file: %s)",
                            constants.LOCKSPACE_NAME, self.host_id, lease_file)
 
-        try:
-            sanlock.add_lockspace(constants.LOCKSPACE_NAME,
-                                  self.host_id, lease_file)
-        except sanlock.SanlockException as e:
-            acquired_lock = False
-            msg = None
-            if hasattr(e, 'errno'):
-                if e.errno == errno.EEXIST:
-                    self._log.debug("Host already holds lock")
-                    acquired_lock = True
-                elif e.errno == errno.EINVAL:
-                    msg = ("cannot get lock on host id {0}:"
-                           " host already holds lock on a different host id"
-                           .format(self.host_id))
-                elif e.errno == errno.EINTR:
-                    msg = ("cannot get lock on host id {0}:"
-                           " sanlock operation interrupted (will retry)"
-                           .format(self.host_id))
-                elif e.errno == errno.EINPROGRESS:
-                    msg = ("cannot get lock on host id {0}:"
-                           " sanlock operation in progress (will retry)"
-                           .format(self.host_id))
-            if not acquired_lock:
-                if not msg:
-                    msg = ("cannot get lock on host id {0}: {1}"
-                           .format(self.host_id, str(e)))
-                self._log.error(msg, exc_info=True)
-                raise Exception("Failed to initialize sanlock: {0}"
-                                .format(msg))
-        else:
-            self._log.info("Acquired lock on host id %d", self.host_id)
+        for attempt in range(constants.WAIT_FOR_STORAGE_RETRY):
+            try:
+                sanlock.add_lockspace(constants.LOCKSPACE_NAME,
+                                      self.host_id, lease_file)
+            except sanlock.SanlockException as e:
+                if hasattr(e, 'errno'):
+                    if e.errno == errno.EEXIST:
+                        self._log.debug("Host already holds lock")
+                        break
+                    elif e.errno == errno.EINVAL:
+                        self._log.error(
+                            "cannot get lock on host id {0}: "
+                            "host already holds lock on a different"
+                            " host id"
+                            .format(self.host_id))
+                        raise  # this shouldn't happen, so throw the exception
+                    elif e.errno == errno.EINTR:
+                        self._log.warn("cannot get lock on host id {0}:"
+                                       " sanlock operation interrupted"
+                                       " (will retry)"
+                                       .format(self.host_id))
+                    elif e.errno == errno.EINPROGRESS:
+                        self._log.warn("cannot get lock on host id {0}:"
+                                       " sanlock operation in progress"
+                                       "(will retry)"
+                                       .format(self.host_id))
+                    elif e.errno == errno.ENOENT:
+                        self._log.warn("cannot get lock on host id {0}:"
+                                       " the lock file '{1}' is missing"
+                                       "(will retry)"
+                                       .format(self.host_id, lease_file))
+            else:  # no exception, we acquired the lock
+                self._log.info("Acquired lock on host id %d", self.host_id)
+                break
+
+            # some temporary problem has occurred (usually waiting for
+            # the storage), so wait a while and try again
+            self._log.info("Failed to acquire the lock. Waiting '{0}'s before"
+                           " the next attempt".
+                           format(constants.WAIT_FOR_STORAGE_DELAY))
+            time.sleep(constants.WAIT_FOR_STORAGE_DELAY)
+        else:  # happens only if all attempts are exhausted
+            raise ex.SanlockInitializationError(
+                "Failed to initialize sanlock, the number of errors has"
+                " exceeded the limit")
+
+        # we get here only if the lock is acquired
         self._sanlock_initialized = True
 
     def _initialize_domain_monitor(self):
diff --git a/ovirt_hosted_engine_ha/lib/exceptions.py 
b/ovirt_hosted_engine_ha/lib/exceptions.py
index 824b059..5eb6a83 100644
--- a/ovirt_hosted_engine_ha/lib/exceptions.py
+++ b/ovirt_hosted_engine_ha/lib/exceptions.py
@@ -45,3 +45,11 @@
 
 class FatalMetadataError(Exception):
     pass
+
+
+class SanlockInitializationError(Exception):
+    pass
+
+
+class BrokerInitializationError(Exception):
+    pass


-- 
To view, visit http://gerrit.ovirt.org/29937
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Iac39bedfa78479f71674c3f5e673ba3814f52279
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-hosted-engine-ha
Gerrit-Branch: ovirt-hosted-engine-ha-1.1
Gerrit-Owner: Jiří Moskovčák <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to