This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 5545f86577 HDDS-9902. Decommission: Admin monitor should call
RM.checkContainerState to check for under-replication (#5770)
5545f86577 is described below
commit 5545f865773aa19c59ef9ad0a1ba98cb3d5b9dc0
Author: Stephen O'Donnell <[email protected]>
AuthorDate: Wed Dec 13 21:34:18 2023 +0000
HDDS-9902. Decommission: Admin monitor should call RM.checkContainerState
to check for under-replication (#5770)
---
.../replication/ContainerHealthResult.java | 22 ++++++
.../health/ECReplicationCheckHandler.java | 20 +++++-
.../hdds/scm/node/DatanodeAdminMonitorImpl.java | 79 ++++++++++++----------
.../hdds/scm/node/NodeDecommissionMetrics.java | 43 ++++++------
.../health/TestECReplicationCheckHandler.java | 14 ++--
.../scm/node/DatanodeAdminMonitorTestUtil.java | 19 ++++++
.../hdds/scm/node/TestDatanodeAdminMonitor.java | 10 ++-
.../hdds/scm/node/TestNodeDecommissionMetrics.java | 14 ++--
8 files changed, 149 insertions(+), 72 deletions(-)
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
index 5724056ca7..a2262cdafd 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/ContainerHealthResult.java
@@ -111,6 +111,7 @@ public class ContainerHealthResult {
private boolean isMissing = false;
private boolean hasHealthyReplicas;
private boolean hasUnReplicatedOfflineIndexes = false;
+ private boolean offlineIndexesOkAfterPending = false;
private int requeueCount = 0;
public UnderReplicatedHealthResult(ContainerInfo containerInfo,
@@ -231,6 +232,27 @@ public class ContainerHealthResult {
return hasUnReplicatedOfflineIndexes;
}
+ /**
+ * Pass true if a container has some indexes which are only on nodes which
+ * are DECOMMISSIONING or ENTERING_MAINTENANCE, and the container has a
+ * pending add to correct the under replication caused by decommission,
+ * though that add has not completed yet.
+ * @param val Pass True if the container has a pending add to correct the
+ * under replication caused by decommission. False otherwise.
+ */
+ public void setOfflineIndexesOkAfterPending(boolean val) {
+ offlineIndexesOkAfterPending = val;
+ }
+
+ /**
+ * Returns true if a container has under-replication caused by offline
+ * indexes, but it is corrected by a pending add.
+ * @return True if the under-replication caused by offline indexes is
+ * corrected by a pending add, false otherwise.
+ */
+ public boolean offlineIndexesOkAfterPending() {
+ return offlineIndexesOkAfterPending;
+ }
+
public boolean hasHealthyReplicas() {
return hasHealthyReplicas;
}
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
index 197b53ed90..1b58694441 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/container/replication/health/ECReplicationCheckHandler.java
@@ -74,13 +74,19 @@ public class ECReplicationCheckHandler extends
AbstractCheck {
report.incrementAndSample(
ReplicationManagerReport.HealthState.UNHEALTHY, containerID);
}
+ // An EC container can be both unrecoverable and have offline
replicas. In this case, we need
+ // to report both states as the decommission monitor needs to wait for
an extra copy to be
+ // made of the offline replica before decommission can complete.
+ if (underHealth.hasUnreplicatedOfflineIndexes()) {
+
report.incrementAndSample(ReplicationManagerReport.HealthState.UNDER_REPLICATED,
containerID);
+ }
} else {
report.incrementAndSample(
ReplicationManagerReport.HealthState.UNDER_REPLICATED,
containerID);
}
if (!underHealth.isReplicatedOkAfterPending() &&
(!underHealth.isUnrecoverable()
- || underHealth.hasUnreplicatedOfflineIndexes())) {
+ || (underHealth.hasUnreplicatedOfflineIndexes() &&
!underHealth.offlineIndexesOkAfterPending()))) {
request.getReplicationQueue().enqueue(underHealth);
}
LOG.debug("Container {} is Under Replicated. isReplicatedOkAfterPending "
@@ -140,9 +146,17 @@ public class ECReplicationCheckHandler extends
AbstractCheck {
container, remainingRedundancy, dueToOutOfService,
replicaCount.isSufficientlyReplicated(true),
replicaCount.isUnrecoverable());
- if (replicaCount.decommissioningOnlyIndexes(true).size() > 0
- || replicaCount.maintenanceOnlyIndexes(true).size() > 0) {
+ // If the container has a pending add to correct the under replication
caused by decommission
+ // then we need to wait for that to complete before we can say the
container is replicated OK.
+ // Therefore we should call setHasUnReplicatedOfflineIndexes without
considering
+ // pending replicas.
+ if (!replicaCount.decommissioningOnlyIndexes(false).isEmpty()
+ || !replicaCount.maintenanceOnlyIndexes(false).isEmpty()) {
result.setHasUnReplicatedOfflineIndexes(true);
+ if (replicaCount.decommissioningOnlyIndexes(true).isEmpty()
+ && replicaCount.maintenanceOnlyIndexes(true).isEmpty()) {
+ result.setOfflineIndexesOkAfterPending(true);
+ }
}
result.setIsMissing(replicaCount.isMissing());
return result;
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
index 693a3474de..455307c6be 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorImpl.java
@@ -25,6 +25,7 @@ import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaCount;
import org.apache.hadoop.hdds.scm.container.replication.ReplicationManager;
import org.apache.hadoop.hdds.scm.events.SCMEvents;
@@ -84,7 +85,7 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
private long sufficientlyReplicatedContainers = 0;
private long trackedDecomMaintenance = 0;
private long trackedRecommission = 0;
- private long unhealthyContainers = 0;
+ private long unClosedContainers = 0;
private long underReplicatedContainers = 0;
private Map<String, ContainerStateInWorkflow> containerStateByHost;
@@ -196,7 +197,7 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
synchronized void setMetricsToGauge() {
synchronized (metrics) {
- metrics.setContainersUnhealthyTotal(unhealthyContainers);
+ metrics.setContainersUnClosedTotal(unClosedContainers);
metrics.setRecommissionNodesTotal(trackedRecommission);
metrics.setDecommissioningMaintenanceNodesTotal(
trackedDecomMaintenance);
@@ -212,7 +213,7 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
void resetContainerMetrics() {
pipelinesWaitingToClose = 0;
sufficientlyReplicatedContainers = 0;
- unhealthyContainers = 0;
+ unClosedContainers = 0;
underReplicatedContainers = 0;
}
@@ -341,9 +342,9 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
int sufficientlyReplicated = 0;
int deleting = 0;
int underReplicated = 0;
- int unhealthy = 0;
+ int unclosed = 0;
List<ContainerID> underReplicatedIDs = new ArrayList<>();
- List<ContainerID> unhealthyIDs = new ArrayList<>();
+ List<ContainerID> unClosedIDs = new ArrayList<>();
Set<ContainerID> containers =
nodeManager.getContainers(dn);
for (ContainerID cid : containers) {
@@ -361,20 +362,6 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
continue;
}
- if (replicaSet.isSufficientlyReplicatedForOffline(dn, nodeManager)) {
- sufficientlyReplicated++;
- } else {
- if (LOG.isDebugEnabled()) {
- underReplicatedIDs.add(cid);
- }
- if (underReplicated < containerDetailsLoggingLimit
- || LOG.isDebugEnabled()) {
- LOG.info("Under Replicated Container {} {}; {}",
- cid, replicaSet, replicaDetails(replicaSet.getReplicas()));
- }
- underReplicated++;
- }
-
boolean isHealthy;
/*
If LegacyReplicationManager is enabled, then use the
@@ -390,42 +377,66 @@ public class DatanodeAdminMonitorImpl implements
DatanodeAdminMonitor {
}
if (!isHealthy) {
if (LOG.isDebugEnabled()) {
- unhealthyIDs.add(cid);
+ unClosedIDs.add(cid);
}
- if (unhealthy < containerDetailsLoggingLimit
+ if (unclosed < containerDetailsLoggingLimit
|| LOG.isDebugEnabled()) {
- LOG.info("Unhealthy Container {} {}; {}",
- cid, replicaSet, replicaDetails(replicaSet.getReplicas()));
+ LOG.info("Unclosed Container {} {}; {}", cid, replicaSet,
replicaDetails(replicaSet.getReplicas()));
+ }
+ unclosed++;
+ continue;
+ }
+
+ // If we get here, the container is closed or quasi-closed and all the
replicas match that
+ // state, except for any which are unhealthy. As the container is
closed, we can check
+ // if it is sufficiently replicated using replicationManager, but this
only works if the
+ // legacy RM is not enabled.
+ boolean replicatedOK;
+ if (legacyEnabled) {
+ replicatedOK = replicaSet.isSufficientlyReplicatedForOffline(dn,
nodeManager);
+ } else {
+ ReplicationManagerReport report = new ReplicationManagerReport();
+ replicationManager.checkContainerStatus(replicaSet.getContainer(),
report);
+ replicatedOK =
report.getStat(ReplicationManagerReport.HealthState.UNDER_REPLICATED) == 0;
+ }
+
+ if (replicatedOK) {
+ sufficientlyReplicated++;
+ } else {
+ if (LOG.isDebugEnabled()) {
+ underReplicatedIDs.add(cid);
+ }
+ if (underReplicated < containerDetailsLoggingLimit ||
LOG.isDebugEnabled()) {
+ LOG.info("Under Replicated Container {} {}; {}", cid, replicaSet,
replicaDetails(replicaSet.getReplicas()));
}
- unhealthy++;
+ underReplicated++;
}
} catch (ContainerNotFoundException e) {
- LOG.warn("ContainerID {} present in node list for {} but not found " +
- "in containerManager", cid, dn);
+ LOG.warn("ContainerID {} present in node list for {} but not found in
containerManager", cid, dn);
}
}
LOG.info("{} has {} sufficientlyReplicated, {} deleting, {} " +
- "underReplicated and {} unhealthy containers",
- dn, sufficientlyReplicated, deleting, underReplicated, unhealthy);
+ "underReplicated and {} unclosed containers",
+ dn, sufficientlyReplicated, deleting, underReplicated, unclosed);
containerStateByHost.put(dn.getHostName(),
new ContainerStateInWorkflow(dn.getHostName(),
sufficientlyReplicated,
underReplicated,
- unhealthy,
+ unclosed,
0L));
sufficientlyReplicatedContainers += sufficientlyReplicated;
underReplicatedContainers += underReplicated;
- unhealthyContainers += unhealthy;
+ unClosedContainers += unclosed;
if (LOG.isDebugEnabled() && underReplicatedIDs.size() < 10000 &&
- unhealthyIDs.size() < 10000) {
- LOG.debug("{} has {} underReplicated [{}] and {} unhealthy [{}] " +
+ unClosedIDs.size() < 10000) {
+ LOG.debug("{} has {} underReplicated [{}] and {} unclosed [{}] " +
"containers", dn, underReplicated,
underReplicatedIDs.stream().map(
Object::toString).collect(Collectors.joining(", ")),
- unhealthy, unhealthyIDs.stream().map(
+ unclosed, unClosedIDs.stream().map(
Object::toString).collect(Collectors.joining(", ")));
}
- return underReplicated == 0 && unhealthy == 0;
+ return underReplicated == 0 && unclosed == 0;
}
private String replicaDetails(Collection<ContainerReplica> replicas) {
diff --git
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java
index 8be3d2a6d0..8217594ecf 100644
---
a/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java
+++
b/hadoop-hdds/server-scm/src/main/java/org/apache/hadoop/hdds/scm/node/NodeDecommissionMetrics.java
@@ -55,8 +55,8 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
@Metric("Number of containers under replicated in tracked nodes.")
private MutableGaugeLong containersUnderReplicatedTotal;
- @Metric("Number of containers unhealthy in tracked nodes.")
- private MutableGaugeLong containersUnhealthyTotal;
+ @Metric("Number of containers not fully closed in tracked nodes.")
+ private MutableGaugeLong containersUnClosedTotal;
@Metric("Number of containers sufficiently replicated in tracked nodes.")
private MutableGaugeLong containersSufficientlyReplicatedTotal;
@@ -67,7 +67,7 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
*/
public static final class ContainerStateInWorkflow {
private long sufficientlyReplicated = 0;
- private long unhealthyContainers = 0;
+ private long unclosedContainers = 0;
private long underReplicatedContainers = 0;
private String host = "";
private long pipelinesWaitingToClose = 0;
@@ -91,22 +91,18 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
+ "for host in decommissioning and "
+ "maintenance mode");
- private static final MetricsInfo HOST_UNHEALTHY_CONTAINERS = Interns.info(
- "UnhealthyContainersDN",
- "Number of unhealthy containers "
- + "for host in decommissioning and "
- + "maintenance mode");
-
+ private static final MetricsInfo HOST_UNCLOSED_CONTAINERS =
Interns.info("UnclosedContainersDN",
+ "Number of containers not fully closed for host in decommissioning and
maintenance mode");
public ContainerStateInWorkflow(String host,
long sufficiently,
long under,
- long unhealthy,
+ long unclosed,
long pipelinesToClose) {
this.host = host;
sufficientlyReplicated = sufficiently;
underReplicatedContainers = under;
- unhealthyContainers = unhealthy;
+ unclosedContainers = unclosed;
pipelinesWaitingToClose = pipelinesToClose;
}
@@ -126,8 +122,8 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
return underReplicatedContainers;
}
- public long getUnhealthyContainers() {
- return unhealthyContainers;
+ public long getUnclosedContainers() {
+ return unclosedContainers;
}
}
@@ -166,7 +162,7 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
recommissionNodesTotal.snapshot(builder, all);
pipelinesWaitingToCloseTotal.snapshot(builder, all);
containersUnderReplicatedTotal.snapshot(builder, all);
- containersUnhealthyTotal.snapshot(builder, all);
+ containersUnClosedTotal.snapshot(builder, all);
containersSufficientlyReplicatedTotal.snapshot(builder, all);
MetricsRecordBuilder recordBuilder = builder;
@@ -182,8 +178,8 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
e.getValue().getUnderReplicatedContainers())
.addGauge(ContainerStateInWorkflow.HOST_SUFFICIENTLY_REPLICATED,
e.getValue().getSufficientlyReplicated())
- .addGauge(ContainerStateInWorkflow.HOST_UNHEALTHY_CONTAINERS,
- e.getValue().getUnhealthyContainers());
+ .addGauge(ContainerStateInWorkflow.HOST_UNCLOSED_CONTAINERS,
+ e.getValue().getUnclosedContainers());
}
recordBuilder.endRecord();
}
@@ -218,10 +214,9 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
.set(numTrackedUnderReplicated);
}
- public synchronized void setContainersUnhealthyTotal(
- long numTrackedUnhealthy) {
- containersUnhealthyTotal
- .set(numTrackedUnhealthy);
+ public synchronized void setContainersUnClosedTotal(
+ long numTrackedUnclosed) {
+ containersUnClosedTotal.set(numTrackedUnclosed);
}
public synchronized void setContainersSufficientlyReplicatedTotal(
@@ -246,8 +241,8 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
return containersUnderReplicatedTotal.value();
}
- public synchronized long getContainersUnhealthyTotal() {
- return containersUnhealthyTotal.value();
+ public synchronized long getContainersUnClosedTotal() {
+ return containersUnClosedTotal.value();
}
public synchronized long getContainersSufficientlyReplicatedTotal() {
@@ -282,9 +277,9 @@ public final class NodeDecommissionMetrics implements
MetricsSource {
}
@VisibleForTesting
- public Long getUnhealthyContainersByHost(String host) {
+ public Long getUnClosedContainersByHost(String host) {
ContainerStateInWorkflow workflowMetrics = metricsByHost.get(host);
return workflowMetrics == null ? null :
- workflowMetrics.getUnhealthyContainers();
+ workflowMetrics.getUnclosedContainers();
}
}
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
index ccb239637a..415f6261e1 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/container/replication/health/TestECReplicationCheckHandler.java
@@ -341,12 +341,13 @@ public class TestECReplicationCheckHandler {
assertFalse(result.underReplicatedDueToOutOfService());
assertTrue(result.isUnrecoverable());
assertTrue(result.hasUnreplicatedOfflineIndexes());
+ assertFalse(result.offlineIndexesOkAfterPending());
assertTrue(healthCheck.handle(request));
// Unrecoverable so not added to the queue
assertEquals(1, repQueue.underReplicatedQueueSize());
assertEquals(0, repQueue.overReplicatedQueueSize());
- assertEquals(0, report.getStat(
+ assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.UNDER_REPLICATED));
assertEquals(1, report.getStat(
ReplicationManagerReport.HealthState.MISSING));
@@ -354,12 +355,12 @@ public class TestECReplicationCheckHandler {
@Test
public void testUnderReplicatedAndUnrecoverableWithDecommissionPending() {
- testUnderReplicatedAndUnrecoverableWithOffline(DECOMMISSIONING);
+ testUnderReplicatedAndUnrecoverableWithOfflinePending(DECOMMISSIONING);
}
@Test
public void testUnderReplicatedAndUnrecoverableWithMaintenancePending() {
- testUnderReplicatedAndUnrecoverableWithOffline(ENTERING_MAINTENANCE);
+
testUnderReplicatedAndUnrecoverableWithOfflinePending(ENTERING_MAINTENANCE);
}
private void testUnderReplicatedAndUnrecoverableWithOfflinePending(
@@ -383,10 +384,13 @@ public class TestECReplicationCheckHandler {
assertFalse(result.isReplicatedOkAfterPending());
assertFalse(result.underReplicatedDueToOutOfService());
assertTrue(result.isUnrecoverable());
- assertFalse(result.hasUnreplicatedOfflineIndexes());
+ // The pending has not completed yet, so the container is still
under-replicated.
+ assertTrue(result.hasUnreplicatedOfflineIndexes());
+ assertTrue(result.offlineIndexesOkAfterPending());
assertTrue(healthCheck.handle(request));
- // Unrecoverable so not added to the queue
+ // Unrecoverable, but there are offline indexes which need to be
replicated. As there is a pending
+ // add on the offline index, it is not added to the queue.
assertEquals(0, repQueue.underReplicatedQueueSize());
assertEquals(0, repQueue.overReplicatedQueueSize());
assertEquals(1, report.getStat(
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java
index 9b5c348595..4433c0cb6f 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/DatanodeAdminMonitorTestUtil.java
@@ -27,6 +27,7 @@ import org.apache.hadoop.hdds.scm.container.ContainerID;
import org.apache.hadoop.hdds.scm.container.ContainerInfo;
import org.apache.hadoop.hdds.scm.container.ContainerNotFoundException;
import org.apache.hadoop.hdds.scm.container.ContainerReplica;
+import org.apache.hadoop.hdds.scm.container.ReplicationManagerReport;
import org.apache.hadoop.hdds.scm.container.replication.ContainerReplicaCount;
import
org.apache.hadoop.hdds.scm.container.replication.ECContainerReplicaCount;
import
org.apache.hadoop.hdds.scm.container.replication.RatisContainerReplicaCount;
@@ -151,6 +152,7 @@ public final class DatanodeAdminMonitorTestUtil {
*/
public static void mockGetContainerReplicaCount(
ReplicationManager repManager,
+ boolean underReplicated,
HddsProtos.LifeCycleState containerState,
HddsProtos.NodeOperationalState...replicaStates)
throws ContainerNotFoundException {
@@ -160,6 +162,7 @@ public final class DatanodeAdminMonitorTestUtil {
.thenAnswer(invocation ->
generateReplicaCount((ContainerID)invocation.getArguments()[0],
containerState, replicaStates));
+ mockCheckContainerState(repManager, underReplicated);
}
/**
@@ -175,6 +178,7 @@ public final class DatanodeAdminMonitorTestUtil {
*/
public static void mockGetContainerReplicaCountForEC(
ReplicationManager repManager,
+ boolean underReplicated,
HddsProtos.LifeCycleState containerState,
ECReplicationConfig repConfig,
Triple<HddsProtos.NodeOperationalState, DatanodeDetails,
@@ -186,6 +190,21 @@ public final class DatanodeAdminMonitorTestUtil {
.thenAnswer(invocation ->
generateECReplicaCount((ContainerID)invocation.getArguments()[0],
repConfig, containerState, replicaStates));
+ mockCheckContainerState(repManager, underReplicated);
+ }
+
+ private static void mockCheckContainerState(ReplicationManager repManager,
boolean underReplicated)
+ throws ContainerNotFoundException {
+
Mockito.when(repManager.checkContainerStatus(Mockito.any(ContainerInfo.class),
+ Mockito.any(ReplicationManagerReport.class)))
+ .then(invocation -> {
+ ReplicationManagerReport report = invocation.getArgument(1);
+ if (underReplicated) {
+
report.increment(ReplicationManagerReport.HealthState.UNDER_REPLICATED);
+ return true;
+ }
+ return false;
+ });
}
/**
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
index 09ff7cb0ff..4b389fbcf2 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestDatanodeAdminMonitor.java
@@ -179,6 +179,7 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(
repManager,
+ true,
HddsProtos.LifeCycleState.CLOSED,
DECOMMISSIONED,
IN_SERVICE,
@@ -205,6 +206,7 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(
repManager,
+ false,
HddsProtos.LifeCycleState.CLOSED,
IN_SERVICE,
IN_SERVICE,
@@ -376,6 +378,7 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(
repManager,
+ true,
HddsProtos.LifeCycleState.DELETING,
DECOMMISSIONED,
IN_SERVICE);
@@ -404,6 +407,7 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCountForEC(
repManager,
+ true,
HddsProtos.LifeCycleState.CLOSED,
new ECReplicationConfig(3, 2),
Triple.of(DECOMMISSIONING, dn1, 1),
@@ -432,6 +436,7 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCountForEC(
repManager,
+ false,
HddsProtos.LifeCycleState.CLOSED,
new ECReplicationConfig(3, 2),
Triple.of(DECOMMISSIONING, dn1, 1),
@@ -459,6 +464,7 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(
repManager,
+ true,
HddsProtos.LifeCycleState.CLOSED,
DECOMMISSIONED,
IN_SERVICE,
@@ -496,6 +502,7 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(
repManager,
+ true,
HddsProtos.LifeCycleState.CLOSED,
DECOMMISSIONED,
IN_SERVICE,
@@ -590,12 +597,13 @@ public class TestDatanodeAdminMonitor {
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(
repManager,
+ true,
HddsProtos.LifeCycleState.CLOSED,
IN_MAINTENANCE,
ENTERING_MAINTENANCE,
IN_MAINTENANCE);
- // Add the node to the monitor, it should transiting to
+ // Add the node to the monitor, it should transition to
// REPLICATE_CONTAINERS as the containers are under-replicated for
// maintenance.
monitor.startMonitoring(dn1);
diff --git
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java
index 0383ef122d..7a6a888780 100644
---
a/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java
+++
b/hadoop-hdds/server-scm/src/test/java/org/apache/hadoop/hdds/scm/node/TestNodeDecommissionMetrics.java
@@ -175,6 +175,7 @@ public class TestNodeDecommissionMetrics {
nodeManager.setContainers(dn1, containers);
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(repManager,
+ true,
HddsProtos.LifeCycleState.CLOSED,
DECOMMISSIONED,
IN_SERVICE,
@@ -214,6 +215,7 @@ public class TestNodeDecommissionMetrics {
nodeManager.setContainers(dn1, containers);
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(repManager,
+ false,
HddsProtos.LifeCycleState.CLOSED,
IN_SERVICE,
IN_SERVICE,
@@ -233,11 +235,11 @@ public class TestNodeDecommissionMetrics {
}
/**
- * Test for collecting metric for unhealthy containers
+ * Test for collecting metric for unclosed containers
* from nodes in decommissioning and maintenance workflow.
*/
@Test
- public void testDecommMonitorCollectUnhealthyContainers()
+ public void testDecommMonitorCollectUnclosedContainers()
throws ContainerNotFoundException, NodeNotFoundException {
DatanodeDetails dn1 = MockDatanodeDetails.createDatanodeDetails(
"datanode_host1",
@@ -249,22 +251,23 @@ public class TestNodeDecommissionMetrics {
containers.add(ContainerID.valueOf(1));
// set OPEN container with 1 replica CLOSED replica state,
- // in-service node, generates monitored unhealthy container replica
+ // in-service node, generates monitored unclosed container replica
nodeManager.setContainers(dn1, containers);
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(repManager,
+ true,
HddsProtos.LifeCycleState.OPEN,
IN_SERVICE);
monitor.startMonitoring(dn1);
monitor.run();
Assertions.assertEquals(1,
- metrics.getContainersUnhealthyTotal());
+ metrics.getContainersUnClosedTotal());
// should have host specific metric collected
// for datanode_host1
Assertions.assertEquals(1,
- metrics.getUnhealthyContainersByHost(
+ metrics.getUnClosedContainersByHost(
"datanode_host1"));
}
@@ -299,6 +302,7 @@ public class TestNodeDecommissionMetrics {
nodeManager.setContainers(dn2, containersDn2);
DatanodeAdminMonitorTestUtil
.mockGetContainerReplicaCount(repManager,
+ true,
HddsProtos.LifeCycleState.CLOSED,
DECOMMISSIONED,
IN_SERVICE,
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]