DaanHoogland commented on code in PR #9885:
URL: https://github.com/apache/cloudstack/pull/9885#discussion_r1837748547
##########
engine/schema/src/main/resources/META-INF/db/schema-41910to42000.sql:
##########
@@ -425,3 +425,10 @@ INSERT IGNORE INTO `cloud`.`guest_os_hypervisor` (uuid,
hypervisor_type, hypervi
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.vm_instance', 'delete_protection',
'boolean DEFAULT FALSE COMMENT "delete protection for vm" ');
CALL `cloud`.`IDEMPOTENT_ADD_COLUMN`('cloud.volumes', 'delete_protection',
'boolean DEFAULT FALSE COMMENT "delete protection for volumes" ');
+
+-- Modify index for mshost_peer
+DELETE FROM `cloud`.`mshost_peer`;
+CALL
`cloud`.`IDEMPOTENT_DROP_FOREIGN_KEY`('cloud.mshost_peer','fk_mshost_peer__owner_mshost');
+CALL
`cloud`.`IDEMPOTENT_DROP_INDEX`('i_mshost_peer__owner_peer_runid','mshost_peer');
+CALL `cloud`.`IDEMPOTENT_ADD_UNIQUE_KEY`('cloud.mshost_peer',
'i_mshost_peer__owner_peer', '(owner_mshost, peer_mshost)');
+CALL `cloud`.`IDEMPOTENT_ADD_FOREIGN_KEY`('cloud.mshost_peer',
'fk_mshost_peer__owner_mshost', '(owner_mshost)', '`mshost`(`id`)');
Review Comment:
doesn't this mean it's going to be 21 instead of 20.1?
in any case not in this file, unless we add it to 4.20.
##########
framework/cluster/src/main/java/com/cloud/cluster/ClusterManagerImpl.java:
##########
@@ -898,6 +903,44 @@ private void peerScan() throws ActiveFencingException {
final Profiler profilerInvalidatedNodeList = new Profiler();
profilerInvalidatedNodeList.start();
+ processInvalidatedNodes(invalidatedNodeList);
+ profilerInvalidatedNodeList.stop();
+
+ final Profiler profilerRemovedList = new Profiler();
+ profilerRemovedList.start();
+ processRemovedNodes(cutTime, removedNodeList);
+ profilerRemovedList.stop();
+
+ final Profiler profilerNewList = new Profiler();
+ profilerNewList.start();
+ processNewNodes(cutTime, currentList);
+ profilerNewList.stop();
+
+ final Profiler profilerInactiveList = new Profiler();
+ profilerInactiveList.start();
+ processInactiveNodes(cutTime);
+ profilerInactiveList.stop();
+
+ profiler.stop();
+
+ logger.debug(String.format("Peer scan is finished. profiler: %s ,
profilerQueryActiveList: %s, " +
+ ", profilerSyncClusterInfo: %s,
profilerInvalidatedNodeList: %s, profilerRemovedList: %s," +
+ ", profilerNewList: %s, profilerInactiveList: %s",
+ profiler, profilerQueryActiveList, profilerSyncClusterInfo,
profilerInvalidatedNodeList, profilerRemovedList,
+ profilerNewList, profilerInactiveList));
+
+ if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
+ if (logger.isDebugEnabled()) {
+ logger.debug(String.format("Peer scan takes too long to
finish. profiler: %s , profilerQueryActiveList: %s, " +
+ ", profilerSyncClusterInfo: %s,
profilerInvalidatedNodeList: %s, profilerRemovedList: %s," +
+ ", profilerNewList: %s, profilerInactiveList: %s",
+ profiler, profilerQueryActiveList,
profilerSyncClusterInfo, profilerInvalidatedNodeList, profilerRemovedList,
+ profilerNewList, profilerInactiveList));
+ }
+ }
Review Comment:
```suggestion
if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
if (logger.isDebugEnabled()) {
logger.debug("Peer scan takes too long to finish. profiler:
{} , profilerQueryActiveList: {}, " +
", profilerSyncClusterInfo: {},
profilerInvalidatedNodeList: {}, profilerRemovedList: {}," +
", profilerNewList: {}, profilerInactiveList: {}",
profiler, profilerQueryActiveList,
profilerSyncClusterInfo,
profilerInvalidatedNodeList, profilerRemovedList,
profilerNewList, profilerInactiveList);
}
}
```
##########
framework/cluster/src/main/java/com/cloud/cluster/ClusterManagerImpl.java:
##########
@@ -898,6 +903,44 @@ private void peerScan() throws ActiveFencingException {
final Profiler profilerInvalidatedNodeList = new Profiler();
profilerInvalidatedNodeList.start();
+ processInvalidatedNodes(invalidatedNodeList);
+ profilerInvalidatedNodeList.stop();
+
+ final Profiler profilerRemovedList = new Profiler();
+ profilerRemovedList.start();
+ processRemovedNodes(cutTime, removedNodeList);
+ profilerRemovedList.stop();
+
+ final Profiler profilerNewList = new Profiler();
+ profilerNewList.start();
+ processNewNodes(cutTime, currentList);
+ profilerNewList.stop();
+
+ final Profiler profilerInactiveList = new Profiler();
+ profilerInactiveList.start();
+ processInactiveNodes(cutTime);
+ profilerInactiveList.stop();
+
+ profiler.stop();
+
+ logger.debug(String.format("Peer scan is finished. profiler: %s ,
profilerQueryActiveList: %s, " +
+ ", profilerSyncClusterInfo: %s,
profilerInvalidatedNodeList: %s, profilerRemovedList: %s," +
+ ", profilerNewList: %s, profilerInactiveList: %s",
+ profiler, profilerQueryActiveList, profilerSyncClusterInfo,
profilerInvalidatedNodeList, profilerRemovedList,
+ profilerNewList, profilerInactiveList));
+
+ if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
+ if (logger.isDebugEnabled()) {
+ logger.debug(String.format("Peer scan takes too long to
finish. profiler: %s , profilerQueryActiveList: %s, " +
+ ", profilerSyncClusterInfo: %s,
profilerInvalidatedNodeList: %s, profilerRemovedList: %s," +
+ ", profilerNewList: %s, profilerInactiveList: %s",
+ profiler, profilerQueryActiveList,
profilerSyncClusterInfo, profilerInvalidatedNodeList, profilerRemovedList,
+ profilerNewList, profilerInactiveList));
+ }
+ }
+ }
+
+ private void processInvalidatedNodes(List<ManagementServerHostVO>
invalidatedNodeList) {
Review Comment:
:+1:
##########
framework/cluster/src/main/java/com/cloud/cluster/ClusterManagerImpl.java:
##########
@@ -911,33 +954,34 @@ private void peerScan() throws ActiveFencingException {
queueNotification(new
ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved,
invalidatedNodeList));
}
- profilerInvalidatedNodeList.stop();
+ }
- final Profiler profilerRemovedList = new Profiler();
- profilerRemovedList.start();
+ private void processRemovedNodes(Date cutTime,
List<ManagementServerHostVO> removedNodeList) {
// process removed node list
final Iterator<ManagementServerHostVO> it = removedNodeList.iterator();
while (it.hasNext()) {
final ManagementServerHostVO mshost = it.next();
- if (!pingManagementNode(mshost)) {
- logger.warn("Management node " + mshost.getId() + " is
detected inactive by timestamp and also not pingable");
+ // Check if peer state is Up in the period
+ if (!_mshostPeerDao.isPeerUpState(_mshostId, mshost.getId(), new
Date(cutTime.getTime() - HeartbeatThreshold.value()))) {
+ logger.warn("Management node " + mshost.getId() + " is
detected inactive by timestamp and did not send node status to this node");
Review Comment:
```suggestion
logger.warn("Management node {} is detected inactive by
timestamp and did not send node status to this node", mshost.getId());
```
##########
framework/cluster/src/main/java/com/cloud/cluster/ClusterManagerImpl.java:
##########
@@ -959,18 +1003,31 @@ private void peerScan() throws ActiveFencingException {
if (newNodeList.size() > 0) {
queueNotification(new
ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeAdded,
newNodeList));
}
+ }
- profiler.stop();
-
- if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
- if (logger.isDebugEnabled()) {
- logger.debug("Peer scan takes too long to finish. profiler: "
+ profiler.toString() + ", profilerQueryActiveList: " +
- profilerQueryActiveList.toString() + ",
profilerSyncClusterInfo: " + profilerSyncClusterInfo.toString() + ",
profilerInvalidatedNodeList: " +
- profilerInvalidatedNodeList.toString() + ",
profilerRemovedList: " + profilerRemovedList.toString());
+ private void processInactiveNodes(Date cutTime) {
+ final List<ManagementServerHostVO> inactiveList =
_mshostDao.getInactiveList(new Date(cutTime.getTime() -
HeartbeatThreshold.value()));
+ if (inactiveList.size() > 0) {
+ if (logger.isInfoEnabled()) {
+ logger.info(String.format("Found %s inactive management server
node based on timestamp", inactiveList.size()));
Review Comment:
```suggestion
logger.info("Found {} inactive management server node based
on timestamp", inactiveList.size());
```
##########
framework/cluster/src/main/java/com/cloud/cluster/ClusterManagerImpl.java:
##########
@@ -911,33 +954,34 @@ private void peerScan() throws ActiveFencingException {
queueNotification(new
ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeRemoved,
invalidatedNodeList));
}
- profilerInvalidatedNodeList.stop();
+ }
- final Profiler profilerRemovedList = new Profiler();
- profilerRemovedList.start();
+ private void processRemovedNodes(Date cutTime,
List<ManagementServerHostVO> removedNodeList) {
// process removed node list
final Iterator<ManagementServerHostVO> it = removedNodeList.iterator();
while (it.hasNext()) {
final ManagementServerHostVO mshost = it.next();
- if (!pingManagementNode(mshost)) {
- logger.warn("Management node " + mshost.getId() + " is
detected inactive by timestamp and also not pingable");
+ // Check if peer state is Up in the period
+ if (!_mshostPeerDao.isPeerUpState(_mshostId, mshost.getId(), new
Date(cutTime.getTime() - HeartbeatThreshold.value()))) {
+ logger.warn("Management node " + mshost.getId() + " is
detected inactive by timestamp and did not send node status to this node");
_activePeers.remove(mshost.getId());
try {
JmxUtil.unregisterMBean("ClusterManager", "Node " +
mshost.getId());
} catch (final Exception e) {
logger.warn("Unable to deregiester cluster node from JMX
monitoring due to exception " + e.toString());
}
} else {
- logger.info("Management node " + mshost.getId() + " is
detected inactive by timestamp but is pingable");
+ logger.info("Management node " + mshost.getId() + " is
detected inactive by timestamp but sent node status to this node");
Review Comment:
```suggestion
logger.info("Management node {} is detected inactive by
timestamp but sent node status to this node", mshost.getId());
```
##########
framework/cluster/src/main/java/com/cloud/cluster/ClusterManagerImpl.java:
##########
@@ -959,18 +1003,31 @@ private void peerScan() throws ActiveFencingException {
if (newNodeList.size() > 0) {
queueNotification(new
ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeAdded,
newNodeList));
}
+ }
- profiler.stop();
-
- if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
- if (logger.isDebugEnabled()) {
- logger.debug("Peer scan takes too long to finish. profiler: "
+ profiler.toString() + ", profilerQueryActiveList: " +
- profilerQueryActiveList.toString() + ",
profilerSyncClusterInfo: " + profilerSyncClusterInfo.toString() + ",
profilerInvalidatedNodeList: " +
- profilerInvalidatedNodeList.toString() + ",
profilerRemovedList: " + profilerRemovedList.toString());
+ private void processInactiveNodes(Date cutTime) {
+ final List<ManagementServerHostVO> inactiveList =
_mshostDao.getInactiveList(new Date(cutTime.getTime() -
HeartbeatThreshold.value()));
+ if (inactiveList.size() > 0) {
+ if (logger.isInfoEnabled()) {
+ logger.info(String.format("Found %s inactive management server
node based on timestamp", inactiveList.size()));
}
+ for (final ManagementServerHostVO host : inactiveList) {
+ logger.info(String.format("management server node msid: %s,
name: %s, service ip: %s, version: %s",
+ host.getMsid(), host.getName(), host.getServiceIP(),
host.getVersion()));
Review Comment:
```suggestion
logger.info("management server node msid: {}, name: {},
service ip: {}, version: {}",
host.getMsid(), host.getName(),
host.getServiceIP(), host.getVersion());
```
##########
server/src/main/java/com/cloud/server/StatsCollector.java:
##########
@@ -1153,6 +1157,17 @@ public String newStatus(ClusterServicePdu pdu) {
try {
hostStatsEntry = gson.fromJson(pdu.getJsonPackage(),new
TypeToken<ManagementServerHostStatsEntry>(){}.getType());
managementServerHostStats.put(hostStatsEntry.getManagementServerHostUuid(),
hostStatsEntry);
+
+ // Update peer state to Up in mshost_peer
+ ManagementServerHostVO mgmtServerVo =
managementServerHostDao.findByMsid(managementServerNodeId);
+ if (mgmtServerVo != null) {
+ msId = mgmtServerVo.getId();
+ if (msId != hostStatsEntry.getManagementServerHostId()) {
+ managementServerHostPeerDao.updatePeerInfo(msId,
hostStatsEntry.getManagementServerHostId(),
hostStatsEntry.getManagementServerRunId(), ManagementServerHost.State.Up);
+ }
+ } else {
+ logger.warn(String.format("Cannot find management server
with msid [%s]. Therefore, do not update peer info.", managementServerNodeId));
+ }
Review Comment:
new method?
##########
framework/cluster/src/main/java/com/cloud/cluster/ClusterManagerImpl.java:
##########
@@ -959,18 +1003,31 @@ private void peerScan() throws ActiveFencingException {
if (newNodeList.size() > 0) {
queueNotification(new
ClusterManagerMessage(ClusterManagerMessage.MessageType.nodeAdded,
newNodeList));
}
+ }
- profiler.stop();
-
- if (profiler.getDurationInMillis() >= HeartbeatInterval.value()) {
- if (logger.isDebugEnabled()) {
- logger.debug("Peer scan takes too long to finish. profiler: "
+ profiler.toString() + ", profilerQueryActiveList: " +
- profilerQueryActiveList.toString() + ",
profilerSyncClusterInfo: " + profilerSyncClusterInfo.toString() + ",
profilerInvalidatedNodeList: " +
- profilerInvalidatedNodeList.toString() + ",
profilerRemovedList: " + profilerRemovedList.toString());
+ private void processInactiveNodes(Date cutTime) {
+ final List<ManagementServerHostVO> inactiveList =
_mshostDao.getInactiveList(new Date(cutTime.getTime() -
HeartbeatThreshold.value()));
+ if (inactiveList.size() > 0) {
+ if (logger.isInfoEnabled()) {
+ logger.info(String.format("Found %s inactive management server
node based on timestamp", inactiveList.size()));
}
+ for (final ManagementServerHostVO host : inactiveList) {
+ logger.info(String.format("management server node msid: %s,
name: %s, service ip: %s, version: %s",
+ host.getMsid(), host.getName(), host.getServiceIP(),
host.getVersion()));
+ // Check if any peer state is Up in the period
+ if (ManagementServerHost.State.Up.equals(host.getState()) &&
+ !_mshostPeerDao.isPeerUpState(host.getId(), new
Date(cutTime.getTime() - HeartbeatThreshold.value()))) {
+ logger.warn("Management node " + host.getId() + " is
detected inactive by timestamp and did not send node status to all other
nodes");
Review Comment:
```suggestion
logger.warn("Management node {} is detected inactive by
timestamp and did not send node status to all other nodes", host.getId());
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]