Liron Ar has uploaded a new change for review.

Change subject: core: introducing host immediate domain recovery mechanism
......................................................................

core: introducing host immediate domain recovery mechanism

The oVirt engine allows hosts to be activated even if they can't access
some of the data center's storage domains, provided those domains are
marked as "inactive", which means that all the hosts that are already in
status Up reported them as problematic (therefore there's no need to
prevent "new" hosts from being activated).

In case we have an inactive domain whose storage server we failed to
connect to, we won't have the link for that domain and we won't be able
to produce it (as the mount was possibly unavailable when we attempted
to connect to the storage server).
If the connectivity to that domain returns, a host that was already
active before might report that it has access to the domain, which will
cause the engine to change that domain's status to "active". The issue
is that hosts that were activated after the connectivity was lost would
move to non-operational (causing VM migration, etc.) as they possibly
won't have a connection to the domain (it's a race between the domain
status being changed to Active and the domain auto recovery mechanism)
and won't have the needed links of that domain.

The implemented solution attempts to prevent hosts from moving to
non-operational status, to avoid its related effects.
A new quartz job is set to run every 30 seconds; that job will inspect
all host reports that were gathered since its last run. The
motivation for that implementation is to aggregate the operations on the
different hosts together, to avoid long wait times and blocking other
"pool" operations.
If any host has a "new" report on a domain that is active or unknown
that it can't access for a "storage" reason, that host will be
reconnected to the storage servers of the active/unknown domains and
will refresh its storage pool metadata.

The engine will attempt to "recover" each host only once for each
problematic report to avoid flooding the system with recovery attempts;
if the host still has a problem accessing the domain, it'll be moved
to non-operational as usual.

Change-Id: Idb7b2fe8c87805986aaf25cd0f24f605d67d4186
Bug-Url: https://bugzilla.redhat.com/show_bug.cgi?id=1093924
Signed-off-by: Liron Aravot <[email protected]>
---
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java
M 
backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java
M 
backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java
M 
backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
M 
backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java
M 
backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java
M 
backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java
M 
backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java
M packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
9 files changed, 298 insertions(+), 45 deletions(-)


  git pull ssh://gerrit.ovirt.org:29418/ovirt-engine refs/changes/23/27523/1

diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java
index 2421f9c..a11eaf2 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/MaintenanceVdsCommand.java
@@ -33,6 +33,8 @@
 import org.ovirt.engine.core.common.eventqueue.EventType;
 import org.ovirt.engine.core.common.job.Step;
 import org.ovirt.engine.core.common.job.StepEnum;
+import org.ovirt.engine.core.common.locks.LockingGroup;
+import org.ovirt.engine.core.common.utils.Pair;
 import 
org.ovirt.engine.core.common.vdscommands.DisconnectStoragePoolVDSCommandParameters;
 import 
org.ovirt.engine.core.common.vdscommands.SetHaMaintenanceModeVDSCommandParameters;
 import 
org.ovirt.engine.core.common.vdscommands.SetVdsStatusVDSCommandParameters;
@@ -43,6 +45,8 @@
 import org.ovirt.engine.core.utils.ejb.BeanProxyType;
 import org.ovirt.engine.core.utils.ejb.BeanType;
 import org.ovirt.engine.core.utils.ejb.EjbUtils;
+import org.ovirt.engine.core.utils.lock.EngineLock;
+import org.ovirt.engine.core.utils.lock.LockManagerFactory;
 import org.ovirt.engine.core.vdsbroker.irsbroker.IrsBrokerCommand;
 
 @NonTransactiveCommandAttribute
@@ -222,21 +226,29 @@
         // Clear the problematic timers since the VDS is in maintenance so it 
doesn't make sense to check it
         // anymore.
         if (!Guid.Empty.equals(vds.getStoragePoolId())) {
-            clearDomainCache(vds);
+            EngineLock lock = new 
EngineLock(Collections.singletonMap(vds.getId().toString(),
+                    new 
Pair<>(LockingGroup.VDS_POOL_AND_STORAGE_CONNECTIONS.toString(),
+                            
VdcBllMessages.ACTION_TYPE_FAILED_OBJECT_LOCKED.toString())), null);
+            try {
+                LockManagerFactory.getLockManager().acquireLockWait(lock);
+                clearDomainCache(vds);
 
-            StoragePool storage_pool = DbFacade.getInstance()
-                    .getStoragePoolDao()
-                    .get(vds.getStoragePoolId());
-            if (StoragePoolStatus.Uninitialized != storage_pool
-                    .getStatus()) {
-                Backend.getInstance().getResourceManager()
-                        .RunVdsCommand(
-                                VDSCommandType.DisconnectStoragePool,
-                                new 
DisconnectStoragePoolVDSCommandParameters(vds.getId(),
-                                        vds.getStoragePoolId(), 
vds.getVdsSpmId()));
-                HostStoragePoolParametersBase params =
-                        new HostStoragePoolParametersBase(storage_pool, vds);
-                
Backend.getInstance().runInternalAction(VdcActionType.DisconnectHostFromStoragePoolServers,
 params);
+                StoragePool storage_pool = DbFacade.getInstance()
+                        .getStoragePoolDao()
+                        .get(vds.getStoragePoolId());
+                if (StoragePoolStatus.Uninitialized != storage_pool
+                        .getStatus()) {
+                    Backend.getInstance().getResourceManager()
+                            .RunVdsCommand(
+                                    VDSCommandType.DisconnectStoragePool,
+                                    new 
DisconnectStoragePoolVDSCommandParameters(vds.getId(),
+                                            vds.getStoragePoolId(), 
vds.getVdsSpmId()));
+                    HostStoragePoolParametersBase params =
+                            new HostStoragePoolParametersBase(storage_pool, 
vds);
+                    
Backend.getInstance().runInternalAction(VdcActionType.DisconnectHostFromStoragePoolServers,
 params);
+                }
+            } finally {
+                LockManagerFactory.getLockManager().releaseLock(lock);
             }
         }
     }
diff --git 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java
 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java
index 73c6a15..659c830 100644
--- 
a/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java
+++ 
b/backend/manager/modules/bll/src/main/java/org/ovirt/engine/core/bll/VdsEventListener.java
@@ -18,6 +18,7 @@
 import org.ovirt.engine.core.bll.storage.StoragePoolStatusHandler;
 import org.ovirt.engine.core.common.AuditLogType;
 import org.ovirt.engine.core.common.action.AddVmFromScratchParameters;
+import 
org.ovirt.engine.core.common.action.ConnectHostToStoragePoolServersParameters;
 import org.ovirt.engine.core.common.action.FenceVdsActionParameters;
 import org.ovirt.engine.core.common.action.HostStoragePoolParametersBase;
 import org.ovirt.engine.core.common.action.MaintenanceNumberOfVdssParameters;
@@ -197,6 +198,13 @@
         return isSucceeded;
     }
 
+    @Override
+    public boolean connectHostToDomainsInActiveOrUnknownStatus(VDS vds) {
+        ConnectHostToStoragePoolServersParameters params = new 
ConnectHostToStoragePoolServersParameters(vds, false);
+        return 
Backend.getInstance().runInternalAction(VdcActionType.ConnectHostToStoragePoolServers,
 params).getSucceeded();
+    }
+
+
     private List<VdcActionParametersBase> 
createMigrateVmToServerParametersList(List<VmStatic> vmsToMigrate, final VDS 
vds) {
         return LinqUtils.foreach(vmsToMigrate,
                 new Function<VmStatic, VdcActionParametersBase>() {
diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java
 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java
index b80987a..9481fa3 100644
--- 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java
+++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/businessentities/IVdsEventListener.java
@@ -27,6 +27,8 @@
 
     boolean vdsUpEvent(VDS vds);
 
+    boolean connectHostToDomainsInActiveOrUnknownStatus(VDS vds);
+
     void processOnClientIpChange(VDS vds, Guid vmId);
 
     void processOnCpuFlagsChange(Guid vdsId);
diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
index 96bc571..2131290 100644
--- 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
+++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/config/ConfigValues.java
@@ -481,6 +481,9 @@
     @TypeConverterAttribute(Integer.class)
     @DefaultValueAttribute("3")
     StoragePoolRefreshTimeInSeconds,
+    @TypeConverterAttribute(Integer.class)
+    @DefaultValueAttribute("30")
+    HostStorageConnectionAndPoolRefreshTimeInSeconds,
     @Reloadable
     @TypeConverterAttribute(Integer.class)
     @DefaultValueAttribute("3")
diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java
 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java
index 38b6fa3..c56ab26 100644
--- 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java
+++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/eventqueue/EventType.java
@@ -8,5 +8,6 @@
     VDSSTOARGEPROBLEMS,
     DOMAINMONITORING,
     VDSCLEARCACHE,
-    VDSCONNECTTOPOOL;
+    VDSCONNECTTOPOOL,
+    VDSSPOOLREFRESH;
 }
diff --git 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java
 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java
index 6539a6f..f9bedf0 100644
--- 
a/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java
+++ 
b/backend/manager/modules/common/src/main/java/org/ovirt/engine/core/common/locks/LockingGroup.java
@@ -24,6 +24,7 @@
     REMOTE_VM,
     OVF_UPDATE,
     /** This group is used for indication that an operation is executed using 
the specified host */
-    VDS_EXECUTION;
+    VDS_EXECUTION,
+    VDS_POOL_AND_STORAGE_CONNECTIONS;
 
 }
diff --git 
a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java
 
b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java
index 69262b3..e58d8f7 100644
--- 
a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java
+++ 
b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/irsbroker/IrsBrokerCommand.java
@@ -42,10 +42,12 @@
 import org.ovirt.engine.core.common.config.ConfigValues;
 import org.ovirt.engine.core.common.errors.VDSError;
 import org.ovirt.engine.core.common.errors.VdcBllErrors;
+import org.ovirt.engine.core.common.errors.VdcBllMessages;
 import org.ovirt.engine.core.common.eventqueue.Event;
 import org.ovirt.engine.core.common.eventqueue.EventQueue;
 import org.ovirt.engine.core.common.eventqueue.EventResult;
 import org.ovirt.engine.core.common.eventqueue.EventType;
+import org.ovirt.engine.core.common.locks.LockingGroup;
 import org.ovirt.engine.core.common.utils.Pair;
 import 
org.ovirt.engine.core.common.vdscommands.ConnectStoragePoolVDSCommandParameters;
 import 
org.ovirt.engine.core.common.vdscommands.DisconnectStoragePoolVDSCommandParameters;
@@ -67,6 +69,8 @@
 import org.ovirt.engine.core.utils.ejb.BeanProxyType;
 import org.ovirt.engine.core.utils.ejb.BeanType;
 import org.ovirt.engine.core.utils.ejb.EjbUtils;
+import org.ovirt.engine.core.utils.lock.EngineLock;
+import org.ovirt.engine.core.utils.lock.LockManagerFactory;
 import org.ovirt.engine.core.utils.log.Log;
 import org.ovirt.engine.core.utils.log.LogFactory;
 import org.ovirt.engine.core.utils.log.Logged;
@@ -143,6 +147,7 @@
         public Object syncObj = new Object();
 
         private final String storagePoolRefreshJobId;
+        private final String domainRecoverOnHostJobId;
         private final HashSet<Guid> mTriedVdssList = new HashSet<Guid>();
         private Guid mCurrentVdsId;
 
@@ -181,6 +186,14 @@
             storagePoolRefreshJobId = 
SchedulerUtilQuartzImpl.getInstance().scheduleAFixedDelayJob(this,
                     "_updatingTimer_Elapsed", new Class[0], new Object[0], 
storagePoolRefreshTime,
                     storagePoolRefreshTime, TimeUnit.SECONDS);
+            domainRecoverOnHostJobId =
+                    
SchedulerUtilQuartzImpl.getInstance().scheduleAFixedDelayJob(this,
+                            "hostsStorageConnectionsAndPoolMetadataRefresh",
+                            new Class[0],
+                            new Object[0],
+                            Config.<Integer> 
getValue(ConfigValues.HostStorageConnectionAndPoolRefreshTimeInSeconds),
+                            storagePoolRefreshTime,
+                            TimeUnit.SECONDS);
         }
 
         @OnTimerMethodAnnotation("_updatingTimer_Elapsed")
@@ -1059,12 +1072,13 @@
 
         private final Map<Guid, HashSet<Guid>> _domainsInProblem = new 
ConcurrentHashMap<Guid, HashSet<Guid>>();
         private final Map<Guid, HashSet<Guid>> _domainsInMaintenance = new 
ConcurrentHashMap<Guid, HashSet<Guid>>();
+        private final Map<Guid, Guid> vdsReportsOnUnseenDomain = new 
ConcurrentHashMap<Guid, Guid>();
+        private final Map<Guid, Guid> vdsHandeledReportsOnUnseenDomains = new 
ConcurrentHashMap<Guid, Guid>();
         private final Map<Guid, String> _timers = new HashMap<Guid, String>();
 
         public void updateVdsDomainsData(final Guid vdsId, final String 
vdsName,
                 final ArrayList<VDSDomainsData> data) {
 
-            Set<Guid> domainsInProblems = null;
             Set<Guid> domainsInMaintenance = null;
             StoragePool storagePool =
                     
DbFacade.getInstance().getStoragePoolDao().get(_storagePoolId);
@@ -1072,6 +1086,7 @@
                     && (storagePool.getStatus() == StoragePoolStatus.Up || 
storagePool.getStatus() == StoragePoolStatus.NonResponsive)) {
 
                 try {
+                    Map<Guid, DomainMonitoringResult> 
domainsProblematicReportInfo = new HashMap<>();
                     // build a list of all domains in pool
                     // which are in status Active or Unknown
                     Set<Guid> domainsInPool = new HashSet<Guid>(
@@ -1087,14 +1102,13 @@
                     // build a list of all the domains in
                     // pool (domainsInPool) that are not
                     // visible by the host.
-                    List<Guid> domainsInPoolThatNonVisibleByVds = new 
ArrayList<Guid>();
                     Set<Guid> dataDomainIds = new HashSet<Guid>();
                     for (VDSDomainsData tempData : data) {
                         dataDomainIds.add(tempData.getDomainId());
                     }
                     for (Guid tempDomainId : domainsInPool) {
                         if (!dataDomainIds.contains(tempDomainId)) {
-                            domainsInPoolThatNonVisibleByVds.add(tempDomainId);
+                            domainsProblematicReportInfo.put(tempDomainId, 
DomainMonitoringResult.NOT_REPORTED);
                         }
                     }
 
@@ -1104,16 +1118,16 @@
                     // ConfigValues.MaxStorageVdsTimeoutCheckSec)
                     // and are contained in the Active or
                     // Unknown domains in pool
-                    List<Guid> domainsSeenByVdsInProblem = new 
ArrayList<Guid>();
                     for (VDSDomainsData tempData : data) {
                         if (domainsInPool.contains(tempData.getDomainId())) {
-                            if (isDomainReportedAsProblematic(tempData, 
false)) {
-                                
domainsSeenByVdsInProblem.add(tempData.getDomainId());
+                            DomainMonitoringResult domainMonitoringResult = 
analyzeDomainReport(tempData, false);
+                            if (domainMonitoringResult.isInvalid()) {
+                                
domainsProblematicReportInfo.put(tempData.getDomainId(), 
domainMonitoringResult);
                             } else if (tempData.getDelay() > Config.<Double> 
getValue(ConfigValues.MaxStorageVdsDelayCheckSec)) {
                                 logDelayedDomain(vdsId, tempData);
                             }
                         } else if 
(inActiveDomainsInPool.contains(tempData.getDomainId())
-                                && !isDomainReportedAsProblematic(tempData, 
false)) {
+                                && analyzeDomainReport(tempData, 
false).isValid()) {
                             log.warnFormat("Storage Domain {0} was reported by 
Host {1} as Active in Pool {2}, moving to active status",
                                     getDomainIdTuple(tempData.getDomainId()),
                                     vdsName,
@@ -1140,35 +1154,26 @@
                         }
                     }
 
-                    // build a list of all potential domains
-                    // in problem
-                    domainsInProblems = new HashSet<Guid>();
-                    domainsInProblems.addAll(domainsInPoolThatNonVisibleByVds);
-                    domainsInProblems.addAll(domainsSeenByVdsInProblem);
-
+                    updateDomainInProblem(vdsId, vdsName, 
domainsProblematicReportInfo, domainsInMaintenance);
                 } catch (RuntimeException ex) {
                     log.error("error in updateVdsDomainsData", ex);
                 }
-
             }
-            updateDomainInProblem(vdsId, vdsName, domainsInProblems, 
domainsInMaintenance);
         }
 
-        private void updateDomainInProblem(final Guid vdsId, final String 
vdsName, final Set<Guid> domainsInProblems,
+        private void updateDomainInProblem(final Guid vdsId, final String 
vdsName, final Map<Guid, DomainMonitoringResult> domainsInProblem,
                                            final Set<Guid> 
domainsInMaintenance) {
-            if (domainsInProblems != null) {
                 ((EventQueue) EjbUtils.findBean(BeanType.EVENTQUEUE_MANAGER, 
BeanProxyType.LOCAL)).submitEventSync(new Event(_storagePoolId,
                         null, vdsId, EventType.DOMAINMONITORING, ""),
                         new Callable<EventResult>() {
                             @Override
                             public EventResult call() {
                                 EventResult result = new EventResult(true, 
EventType.DOMAINMONITORING);
-                                updateProblematicVdsData(vdsId, vdsName, 
domainsInProblems);
+                                updateProblematicVdsData(vdsId, vdsName, 
domainsInProblem);
                                 updateMaintenanceVdsData(vdsId, vdsName, 
domainsInMaintenance);
                                 return result;
                             }
                         });
-            }
         }
 
         private void logDelayedDomain(final Guid vdsId, VDSDomainsData 
tempData) {
@@ -1191,7 +1196,7 @@
             List<Guid> domainWhichWereSeen = new ArrayList<Guid>();
             for (VDSDomainsData vdsDomainData : vdsDomainsData) {
                 if (domainsInPool.contains(vdsDomainData.getDomainId())) {
-                    if (isDomainReportedAsProblematic(vdsDomainData, true)) {
+                    if (analyzeDomainReport(vdsDomainData, true).isInvalid()) {
                         domainsInProblem.add(vdsDomainData.getDomainId());
                     }
                     domainWhichWereSeen.add(vdsDomainData.getDomainId());
@@ -1207,14 +1212,38 @@
             return domainsInProblem;
         }
 
-        private boolean isDomainReportedAsProblematic(VDSDomainsData tempData, 
boolean isLog) {
+        private enum DomainMonitoringResult {
+            PROBLEMATIC(false), STORAGE_ACCCESS_ERROR(false), OK(true), 
NOT_REPORTED(false);
+
+            private boolean valid;
+
+            private DomainMonitoringResult(boolean valid) {
+                this.valid = valid;
+            }
+
+            public boolean isValid() {
+                return valid;
+            }
+
+            public boolean isInvalid() {
+                return !valid;
+            }
+        }
+
+        private DomainMonitoringResult analyzeDomainReport(VDSDomainsData 
tempData, boolean isLog) {
             if (tempData.getCode() != 0) {
                 if (isLog) {
                     log.errorFormat("Domain {0} was reported with error code 
{1}",
                             getDomainIdTuple(tempData.getDomainId()),
                             tempData.getCode());
                 }
-                return true;
+
+                if (tempData.getCode() == 
VdcBllErrors.StorageDomainDoesNotExist.getValue()
+                        || tempData.getCode() == 
VdcBllErrors.StorageException.getValue()) {
+                    return DomainMonitoringResult.STORAGE_ACCCESS_ERROR;
+                }
+
+                return DomainMonitoringResult.PROBLEMATIC;
             }
             if (tempData.getLastCheck() > Config
                     .<Double> 
getValue(ConfigValues.MaxStorageVdsTimeoutCheckSec)) {
@@ -1223,9 +1252,10 @@
                             getDomainIdTuple(tempData.getDomainId()),
                             tempData.getLastCheck());
                 }
-                return true;
+                return DomainMonitoringResult.PROBLEMATIC;
             }
-            return false;
+
+            return DomainMonitoringResult.OK;
         }
 
         private void updateMaintenanceVdsData(final Guid vdsId, final String 
vdsName, Set<Guid> domainsInMaintenance) {
@@ -1252,23 +1282,48 @@
             }
         }
 
-        private void updateProblematicVdsData(final Guid vdsId, final String 
vdsName, Set<Guid> domainsInProblems) {
+        private void clearVdsReportInfoOnUnseenDomain(Guid vdsId) {
+            vdsReportsOnUnseenDomain.remove(vdsId);
+        }
+
+        private void updateProblematicVdsData(final Guid vdsId, final String 
vdsName, Map<Guid, DomainMonitoringResult> domainsInProblem) {
             // for all problematic domains
             // update cache of _domainsInProblem
             // and _vdssInProblem and add a new
             // timer for new domains in problem
-            Set<Guid> domainsInProblemKeySet = _domainsInProblem.keySet();
-            for (Guid domainId : domainsInProblems) {
-                if (domainsInProblemKeySet.contains(domainId)) {
+            boolean newDomainUnreachableByHost = false;
+            int domainsUnreachableByHost = 0;
+            for (Map.Entry<Guid, DomainMonitoringResult> entry : 
domainsInProblem.entrySet()) {
+                Guid domainId = entry.getKey();
+                DomainMonitoringResult domainMonitoringResult = 
entry.getValue();
+                HashSet<Guid> hostsReportedDomainAsProblematic = 
_domainsInProblem.get(domainId);
+                boolean domainNotFound = domainMonitoringResult == 
DomainMonitoringResult.STORAGE_ACCCESS_ERROR;
+                if (domainNotFound) {
+                    domainsUnreachableByHost++;
+                }
+                if (hostsReportedDomainAsProblematic != null) {
+                    if (!hostsReportedDomainAsProblematic.contains(vdsId) && 
domainNotFound) {
+                        newDomainUnreachableByHost = true;
+                    }
                     // existing domains in problem
                     updateDomainInProblemData(domainId, vdsId, vdsName);
                 } else {
+                    if (domainNotFound) {
+                        newDomainUnreachableByHost = true;
+                    }
                     // new domains in problems
                     addDomainInProblemData(domainId, vdsId, vdsName);
                 }
             }
+
+            if (domainsUnreachableByHost == 0) {
+                clearVdsReportInfoOnUnseenDomain(vdsId);
+            } else if (newDomainUnreachableByHost) {
+                vdsReportsOnUnseenDomain.put(vdsId, Guid.newGuid());
+            }
+
             Set<Guid> notReportedDomainsByHost = new 
HashSet<Guid>(_domainsInProblem.keySet());
-            notReportedDomainsByHost.removeAll(domainsInProblems);
+            notReportedDomainsByHost.removeAll(domainsInProblem.keySet());
             for (Guid domainId : notReportedDomainsByHost) {
                 Set<Guid> vdsForDomain = _domainsInProblem.get(domainId);
                 if (vdsForDomain != null && vdsForDomain.contains(vdsId)) {
@@ -1317,6 +1372,107 @@
                             return result;
                         }
                     });
+        }
+
+        
@OnTimerMethodAnnotation("hostsStorageConnectionsAndPoolMetadataRefresh")
+        public void hostsStorageConnectionsAndPoolMetadataRefresh() {
+            if (vdsReportsOnUnseenDomain.isEmpty()) {
+                if (!vdsHandeledReportsOnUnseenDomains.isEmpty()) {
+                    vdsHandeledReportsOnUnseenDomains.clear();
+                }
+
+                return;
+            }
+
+            Map<Guid, Guid> reportsToHandle = new HashMap<>();
+            reportsToHandle.putAll(vdsReportsOnUnseenDomain);
+
+            for (Map.Entry<Guid, Guid> entry : 
vdsHandeledReportsOnUnseenDomains.entrySet()) {
+                Guid vdsId = entry.getKey();
+                Guid currentReportId = reportsToHandle.get(vdsId);
+                if (currentReportId == null) {
+                    vdsHandeledReportsOnUnseenDomains.remove(vdsId);
+                } else {
+                    Guid handledReportId = entry.getValue();
+                    if (currentReportId.equals(handledReportId)) {
+                        reportsToHandle.remove(vdsId);
+                    }
+                }
+            }
+
+            if (reportsToHandle.isEmpty()) {
+                return;
+            }
+
+            List<Callable<Void>> connectStorageTasks = new ArrayList<>();
+            final List<Callable<Void>> refreshStoragePoolMetadata = new 
ArrayList<>();
+            final StoragePool storagePool = 
DbFacade.getInstance().getStoragePoolDao().get(_storagePoolId);
+            final Guid masterDomainId =
+                    
DbFacade.getInstance().getStorageDomainDao().getMasterStorageDomainIdForPool(_storagePoolId);
+
+            Map<String, Pair<String, String>> acquiredLocks = new HashMap<>();
+            try {
+                for (Map.Entry<Guid, Guid> entry : reportsToHandle.entrySet()) 
{
+                    Guid vdsId = entry.getKey();
+                    Guid currentReportId = entry.getValue();
+
+                    vdsHandeledReportsOnUnseenDomains.put(vdsId, 
currentReportId);
+                    Map<String, Pair<String, String>> lockMap = 
Collections.singletonMap(vdsId.toString(),
+                            new 
Pair<>(LockingGroup.VDS_POOL_AND_STORAGE_CONNECTIONS.toString(),
+                                    
VdcBllMessages.ACTION_TYPE_FAILED_OBJECT_LOCKED.toString()));
+                    if (!LockManagerFactory.getLockManager()
+                            .acquireLock(new EngineLock(lockMap, null))
+                            .getFirst()) {
+                        continue;
+                    }
+
+                    acquiredLocks.putAll(lockMap);
+                    // this check is to verify after the lock is taken that 
the host wasn't moved to maintenance to
+                    // avoid connecting it to the storage servers and to the 
pool when it's on maintenance.
+                    if (!vdsReportsOnUnseenDomain.containsKey(vdsId)) {
+                        continue;
+                    }
+
+                    final VDS vds = 
DbFacade.getInstance().getVdsDao().get(entry.getKey());
+                    connectStorageTasks.add(new Callable<Void>() {
+
+                        @Override
+                        public Void call() {
+                            ResourceManager.getInstance()
+                                    
.getEventListener().connectHostToDomainsInActiveOrUnknownStatus(vds);
+                            return null;
+                        }
+                    });
+
+                    refreshStoragePoolMetadata.add(new Callable<Void>() {
+
+                        @Override
+                        public Void call() {
+                            
StoragePoolDomainHelper.refreshHostPoolMetadata(vds, storagePool, 
masterDomainId);
+                            return null;
+                        }
+                    });
+                }
+
+                ThreadPoolUtil.invokeAll(connectStorageTasks);
+
+                ((EventQueue) EjbUtils.findBean(BeanType.EVENTQUEUE_MANAGER, 
BeanProxyType.LOCAL)).submitEventSync(new Event(_storagePoolId,
+                        null,
+                        null,
+                        EventType.VDSSPOOLREFRESH,
+                        ""),
+                        new Callable<EventResult>() {
+                            @Override
+                            public EventResult call() {
+                                
ThreadPoolUtil.invokeAll(refreshStoragePoolMetadata);
+                                return new EventResult(true, 
EventType.VDSSPOOLREFRESH);
+                            }
+                        });
+            } finally {
+                if (!acquiredLocks.isEmpty()) {
+                    LockManagerFactory.getLockManager().releaseLock(new 
EngineLock(acquiredLocks, null));
+                }
+            }
         }
 
         private void updateDomainInProblemData(Guid domainId, Guid vdsId, 
String vdsName) {
@@ -1457,6 +1613,7 @@
             }
             removeVdsAsProblematic(nonOpVdss);
             removeVdsFromDomainMaintenance(nonOpVdss);
+            removeVdsFromUnseenDomainsReport(nonOpVdss);
         }
 
         private void removeVdsAsProblematic(List<Guid> nonOpVdss) {
@@ -1483,6 +1640,13 @@
                 if (entry.getValue().isEmpty()) {
                     iterDomainsInProblem.remove();
                 }
+            }
+        }
+
+        private void removeVdsFromUnseenDomainsReport(List<Guid> nonOpVdss) {
+            log.infoFormat("Removing host(s) {0} from hosts unseen domain 
report cache", nonOpVdss);
+            for(Guid id : nonOpVdss) {
+                clearVdsReportInfoOnUnseenDomain(id);
             }
         }
 
@@ -1530,6 +1694,7 @@
                 log.info("IrsProxyData::disposing");
                 resetIrs();
                 
SchedulerUtilQuartzImpl.getInstance().deleteJob(storagePoolRefreshJobId);
+                
SchedulerUtilQuartzImpl.getInstance().deleteJob(domainRecoverOnHostJobId);
                 _disposed = true;
             }
         }
diff --git 
a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java
 
b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java
index bde2488..f6abe84 100644
--- 
a/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java
+++ 
b/backend/manager/modules/vdsbroker/src/main/java/org/ovirt/engine/core/vdsbroker/storage/StoragePoolDomainHelper.java
@@ -8,11 +8,21 @@
 import java.util.Set;
 
 import org.ovirt.engine.core.common.businessentities.StorageDomainStatus;
+import org.ovirt.engine.core.common.businessentities.StoragePool;
 import org.ovirt.engine.core.common.businessentities.StoragePoolIsoMap;
+import org.ovirt.engine.core.common.businessentities.VDS;
+import org.ovirt.engine.core.common.errors.VDSError;
+import org.ovirt.engine.core.common.errors.VdcBLLException;
+import org.ovirt.engine.core.common.errors.VdcBllErrors;
+import 
org.ovirt.engine.core.common.vdscommands.ConnectStoragePoolVDSCommandParameters;
+import 
org.ovirt.engine.core.common.vdscommands.RefreshStoragePoolVDSCommandParameters;
+import org.ovirt.engine.core.common.vdscommands.VDSCommandType;
+import org.ovirt.engine.core.common.vdscommands.VDSReturnValue;
 import org.ovirt.engine.core.compat.Guid;
 import org.ovirt.engine.core.dal.dbbroker.DbFacade;
 import org.ovirt.engine.core.utils.log.Log;
 import org.ovirt.engine.core.utils.log.LogFactory;
+import org.ovirt.engine.core.vdsbroker.ResourceManager;
 
 
 public class StoragePoolDomainHelper {
@@ -38,6 +48,56 @@
         return storageDomains;
     }
 
+    /**
+     * Refreshes the given vds pool metadata, if the host isn't connected to 
the pool it'll be connected.
+     *
+     * @return boolean indicating whether the host pool metadata was 
"refreshed" successfully (either by refresh or
+     *         connect)
+     */
+    public static boolean refreshHostPoolMetadata(VDS vds, StoragePool 
storagePool, Guid masterDomainId) {
+        try {
+            ResourceManager.getInstance().runVdsCommand(
+                    VDSCommandType.RefreshStoragePool,
+                    new RefreshStoragePoolVDSCommandParameters(vds.getId(),
+                            storagePool.getId(),
+                            masterDomainId,
+                            storagePool.getmaster_domain_version()));
+        } catch (VdcBLLException ex) {
+            VDSError error = ex.getVdsError();
+            if (error.getCode() != VdcBllErrors.StoragePoolUnknown) {
+                log.infoFormat("Failed to refresh host {0} pool {1} metadata 
with error {2} (message: {3})",
+                        vds.getName(),
+                        storagePool.getId(), error.getCode(), 
error.getMessage());
+                return false;
+            }
+
+            error = null;
+
+            try {
+                VDSReturnValue vdsReturnValue = 
ResourceManager.getInstance().runVdsCommand(
+                        VDSCommandType.ConnectStoragePool,
+                        new ConnectStoragePoolVDSCommandParameters(vds.getId(),
+                                storagePool.getId(), vds.getVdsSpmId(),
+                                masterDomainId, storagePool
+                                        .getmaster_domain_version()));
+                if (!vdsReturnValue.getSucceeded()) {
+                    error = vdsReturnValue.getVdsError();
+                }
+            } catch (VdcBLLException e) {
+                error = e.getVdsError();
+            }
+
+            if (error != null) {
+                log.infoFormat("Failed to connect host {0} to pool {1} with 
error {2} (message: {3})",
+                        vds.getName(),
+                        storagePool.getId(), error.getCode(), 
error.getMessage());
+                return false;
+            }
+        }
+
+        return true;
+    }
+
     public static void updateApplicablePoolDomainsStatuses(Guid storagePoolId,
             Set<StorageDomainStatus> applicableStatusesForUpdate,
             StorageDomainStatus newStatus, String reason) {
diff --git a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql 
b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
index 155f22f..ade2e86 100644
--- a/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
+++ b/packaging/dbscripts/upgrade/pre_upgrade/0000_config.sql
@@ -496,6 +496,7 @@
 select fn_db_add_config_value('StoragePoolNameSizeLimit','40','general');
 select 
fn_db_add_config_value('StoragePoolNonOperationalResetTimeoutInMin','3','general');
 select 
fn_db_add_config_value('StoragePoolRefreshTimeInSeconds','10','general');
+select 
fn_db_add_config_value('HostStorageConnectionAndPoolRefreshTimeInSeconds','30','general');
 select 
fn_db_add_config_value('SucceededJobCleanupTimeInMinutes','10','general');
 select fn_db_add_config_value('SupportedClusterLevels','3.0','general');
 select fn_db_add_config_value('SupportedStorageFormats','0,2','3.0');


-- 
To view, visit http://gerrit.ovirt.org/27523
To unsubscribe, visit http://gerrit.ovirt.org/settings

Gerrit-MessageType: newchange
Gerrit-Change-Id: Idb7b2fe8c87805986aaf25cd0f24f605d67d4186
Gerrit-PatchSet: 1
Gerrit-Project: ovirt-engine
Gerrit-Branch: master
Gerrit-Owner: Liron Ar <[email protected]>
_______________________________________________
Engine-patches mailing list
[email protected]
http://lists.ovirt.org/mailman/listinfo/engine-patches

Reply via email to