DaanHoogland commented on code in PR #8089:
URL: https://github.com/apache/cloudstack/pull/8089#discussion_r1358156550
##########
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java:
##########
@@ -801,46 +802,56 @@ public boolean stop() {
protected boolean handleDisconnectWithoutInvestigation(final AgentAttache
attache, final Status.Event event, final boolean transitState, final boolean
removeAgent) {
Review Comment:
This is now a very complex method with a nested try block. Can you dissect it
a bit?
##########
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java:
##########
@@ -1120,18 +1131,26 @@ private AgentAttache handleConnectedAgent(final Link
link, final StartupCommand[
final HostVO host =
_resourceMgr.createHostVOForConnectedAgent(startup);
if (host != null) {
- ready = new ReadyCommand(host.getDataCenterId(), host.getId(),
NumbersUtil.enableHumanReadableSizes);
-
- if (!indirectAgentLB.compareManagementServerList(host.getId(),
host.getDataCenterId(), agentMSHostList, lbAlgorithm)) {
- final List<String> newMSList =
indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(),
null);
- ready.setMsHostList(newMSList);
- ready.setLbAlgorithm(indirectAgentLB.getLBAlgorithmName());
-
ready.setLbCheckInterval(indirectAgentLB.getLBPreferredHostCheckInterval(host.getClusterId()));
- s_logger.debug("Agent's management server host list is not
up to date, sending list update:" + newMSList);
- }
+ GlobalLock joinLock = getHostJoinLock(host.getId());
+ if (joinLock.lock(60)) {
+ try {
+ ready = new ReadyCommand(host.getDataCenterId(),
host.getId(), NumbersUtil.enableHumanReadableSizes);
+
+ if
(!indirectAgentLB.compareManagementServerList(host.getId(),
host.getDataCenterId(), agentMSHostList, lbAlgorithm)) {
+ final List<String> newMSList =
indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(),
null);
+ ready.setMsHostList(newMSList);
+
ready.setLbAlgorithm(indirectAgentLB.getLBAlgorithmName());
+
ready.setLbCheckInterval(indirectAgentLB.getLBPreferredHostCheckInterval(host.getClusterId()));
+ s_logger.debug("Agent's management server host
list is not up to date, sending list update:" + newMSList);
+ }
- attache = createAttacheForConnect(host, link);
- attache = notifyMonitorsOfConnection(attache, startup, false);
+ attache = createAttacheForConnect(host, link);
+ attache = notifyMonitorsOfConnection(attache, startup,
false);
+ } finally {
+ joinLock.unlock();
+ }
+ }
+ joinLock.releaseRef();
Review Comment:
can this be a new method please?
##########
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java:
##########
@@ -1337,12 +1358,19 @@ protected void processRequest(final Link link, final
Request request) {
"Host [" + hostDesc + "] lost
connection to gateway (default route) and is possibly having network connection
issues.");
} else {
_alertMgr.clearAlert(AlertManager.AlertType.ALERT_TYPE_ROUTING,
host.getDataCenterId(), host.getPodId());
+ if (host.getStatus() != Status.Up) {
+ // Only transit state when the
status is not Up to avoid unnecessary db calls
+ requestStartupCommand = true;
+ }
}
} else {
s_logger.debug("Not processing " +
PingRoutingCommand.class.getSimpleName() + " for agent id=" + cmdHostId + ";
can't find the host in the DB");
}
+ } else if (host != null && host.getStatus() !=
Status.Up) {
+ // Only transit state when the status is not
Up to avoid unnecessary db calls
+ requestStartupCommand = true;
}
Review Comment:
The same check happens here as well; can you unify this with lines 1361-1364?
##########
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java:
##########
@@ -1120,18 +1131,26 @@ private AgentAttache handleConnectedAgent(final Link
link, final StartupCommand[
final HostVO host =
_resourceMgr.createHostVOForConnectedAgent(startup);
if (host != null) {
- ready = new ReadyCommand(host.getDataCenterId(), host.getId(),
NumbersUtil.enableHumanReadableSizes);
-
- if (!indirectAgentLB.compareManagementServerList(host.getId(),
host.getDataCenterId(), agentMSHostList, lbAlgorithm)) {
- final List<String> newMSList =
indirectAgentLB.getManagementServerList(host.getId(), host.getDataCenterId(),
null);
- ready.setMsHostList(newMSList);
- ready.setLbAlgorithm(indirectAgentLB.getLBAlgorithmName());
-
ready.setLbCheckInterval(indirectAgentLB.getLBPreferredHostCheckInterval(host.getClusterId()));
- s_logger.debug("Agent's management server host list is not
up to date, sending list update:" + newMSList);
- }
+ GlobalLock joinLock = getHostJoinLock(host.getId());
Review Comment:
The exact same lock is used for connect and disconnect; can't that lead to a
livelock? Not sure, as the agent might function as arbiter, so this is a genuine
question.
##########
test/integration/smoke/test_host_ping.py:
##########
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Check state transition of host from Alert to Up on Ping
+"""
+
+# Import Local Modules
+from marvin.cloudstackTestCase import *
+from marvin.lib.utils import *
+from marvin.lib.base import *
+from marvin.lib.common import *
+from nose.plugins.attrib import attr
+
+_multiprocess_shared_ = False
+
+
+class TestHostHA(cloudstackTestCase):
+
+ def setUp(self):
+ self.logger = logging.getLogger('TestHM')
+ self.stream_handler = logging.StreamHandler()
+ self.logger.setLevel(logging.DEBUG)
+ self.logger.addHandler(self.stream_handler)
+ self.apiclient = self.testClient.getApiClient()
+ self.hypervisor = self.testClient.getHypervisorInfo()
+ self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
+ self.dbConnection = self.testClient.getDbConnection()
+ self.services = self.testClient.getParsedTestDataConfig()
+ self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests())
+ self.pod = get_pod(self.apiclient, self.zone.id)
+ self.cleanup = []
+
+ def tearDown(self):
+ try:
+ # Clean up, terminate the created templates
+ cleanup_resources(self.apiclient, self.cleanup)
+
+ except Exception as e:
+ raise Exception("Warning: Exception during cleanup : %s" % e)
+
+ return
Review Comment:
```suggestion
def tearDown(self):
super(TestHostHA, self).tearDown()
```
##########
test/integration/smoke/test_host_ping.py:
##########
@@ -0,0 +1,112 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+""" Check state transition of host from Alert to Up on Ping
+"""
+
+# Import Local Modules
+from marvin.cloudstackTestCase import *
+from marvin.lib.utils import *
+from marvin.lib.base import *
+from marvin.lib.common import *
+from nose.plugins.attrib import attr
+
+_multiprocess_shared_ = False
+
+
+class TestHostHA(cloudstackTestCase):
+
+ def setUp(self):
+ self.logger = logging.getLogger('TestHM')
+ self.stream_handler = logging.StreamHandler()
+ self.logger.setLevel(logging.DEBUG)
+ self.logger.addHandler(self.stream_handler)
+ self.apiclient = self.testClient.getApiClient()
+ self.hypervisor = self.testClient.getHypervisorInfo()
+ self.mgtSvrDetails = self.config.__dict__["mgtSvr"][0].__dict__
+ self.dbConnection = self.testClient.getDbConnection()
+ self.services = self.testClient.getParsedTestDataConfig()
+ self.zone = get_zone(self.apiclient, self.testClient.getZoneForTests())
+ self.pod = get_pod(self.apiclient, self.zone.id)
+ self.cleanup = []
+
+ def tearDown(self):
+ try:
+ # Clean up, terminate the created templates
+ cleanup_resources(self.apiclient, self.cleanup)
+
+ except Exception as e:
+ raise Exception("Warning: Exception during cleanup : %s" % e)
+
+ return
+
+ def checkHostStateInCloudstack(self, state, host_id):
+ try:
+ listHost = Host.list(
+ self.apiclient,
+ type='Routing',
+ zoneid=self.zone.id,
+ podid=self.pod.id,
+ id=host_id
+ )
+ self.assertEqual(
+ isinstance(listHost, list),
+ True,
+ "Check if listHost returns a valid response"
+ )
+
+ self.assertEqual(
+ len(listHost),
+ 1,
+ "Check if listHost returns a host"
+ )
+ self.logger.debug(" Host state is %s " % listHost[0].state)
+ if listHost[0].state == state:
+ return True, 1
+ else:
+ return False, 1
+ except Exception as e:
+ self.logger.debug("Got exception %s" % e)
+ return False, 1
+
+
+ @attr(
+ tags=[
+ "advanced",
+ "advancedns",
+ "smoke",
+ "basic"],
+ required_hardware="true")
+ def test_01_host_ping_on_alert(self):
+ listHost = Host.list(
+ self.apiclient,
+ type='Routing',
+ zoneid=self.zone.id,
+ podid=self.pod.id,
+ )
+ for host in listHost:
+ self.logger.debug('Hypervisor = {}'.format(host.id))
+
+
+ hostToTest = listHost[0]
+ sql_query = "UPDATE host SET status = 'Alert' WHERE uuid = '" +
hostToTest.id + "'"
+ self.dbConnection.execute(sql_query)
+
+ hostUpInCloudstack = wait_until(40, 10,
self.checkHostStateInCloudstack, "Up", hostToTest.id)
Review Comment:
Is there any logic behind the wait period and retry count here?
The default ping interval times two would be my logic.
##########
engine/orchestration/src/main/java/com/cloud/agent/manager/AgentManagerImpl.java:
##########
@@ -1337,12 +1358,19 @@ protected void processRequest(final Link link, final
Request request) {
"Host [" + hostDesc + "] lost
connection to gateway (default route) and is possibly having network connection
issues.");
} else {
_alertMgr.clearAlert(AlertManager.AlertType.ALERT_TYPE_ROUTING,
host.getDataCenterId(), host.getPodId());
+ if (host.getStatus() != Status.Up) {
+ // Only transit state when the
status is not Up to avoid unnecessary db calls
+ requestStartupCommand = true;
+ }
Review Comment:
this block ...
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]