This is an automated email from the ASF dual-hosted git repository.
sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 85c60f2354 HDDS-8448. Send FCR to SCM on volume failure. (#4773)
85c60f2354 is described below
commit 85c60f2354a8983db870166a9dc6883066fe75ff
Author: ashishkumar50 <[email protected]>
AuthorDate: Tue Jun 6 18:48:29 2023 +0530
HDDS-8448. Send FCR to SCM on volume failure. (#4773)
---
.../ozone/container/common/impl/ContainerSet.java | 42 +++++--
.../ozone/container/ozoneimpl/OzoneContainer.java | 2 +-
.../common/volume/TestStorageVolumeChecker.java | 9 +-
.../common/volume/TestVolumeSetDiskChecks.java | 128 +++++++++++++++++++++
4 files changed, 163 insertions(+), 18 deletions(-)
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
index 726e149db5..2df5d40ef9 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
@@ -21,10 +21,12 @@ package org.apache.hadoop.ozone.container.common.impl;
import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;
+import com.google.protobuf.Message;
import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReportsProto;
import
org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
import org.apache.hadoop.ozone.container.common.interfaces.Container;
+import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -41,6 +43,8 @@ import java.util.Set;
import java.util.concurrent.ConcurrentNavigableMap;
import java.util.concurrent.ConcurrentSkipListMap;
import java.util.concurrent.ConcurrentSkipListSet;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
import static
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.RECOVERING;
@@ -179,21 +183,39 @@ public class ContainerSet implements
Iterable<Container<?>> {
return containerMap.size();
}
- public void handleVolumeFailures() {
+  /**
+   * Removes all containers belonging to failed volumes, then sends a full
+   * container report (FCR) that no longer includes the removed containers.
+   *
+   * @param context the StateContext used to refresh the full report and
+   *                trigger a heartbeat to SCM
+   */
+ public void handleVolumeFailures(StateContext context) {
+ AtomicBoolean failedVolume = new AtomicBoolean(false);
+ AtomicInteger containerCount = new AtomicInteger(0);
containerMap.values().forEach(c -> {
if (c.getContainerData().getVolume().isFailed()) {
- try {
- c.markContainerUnhealthy();
- LOG.info("Marking Container {} UNHEALTHY as the Volume {} " +
+ removeContainer(c.getContainerData().getContainerID());
+ LOG.debug("Removing Container {} as the Volume {} " +
"has failed", c.getContainerData().getContainerID(),
- c.getContainerData().getVolume());
- } catch (StorageContainerException e) {
- LOG.error("Failed to move container {} to UNHEALTHY state in "
- + "volume {}", c.getContainerData().getContainerID(),
- c.getContainerData().getVolume(), e);
- }
+ c.getContainerData().getVolume());
+ failedVolume.set(true);
+ containerCount.incrementAndGet();
}
});
+
+ if (failedVolume.get()) {
+ try {
+ LOG.info("Removed {} containers on failed volumes",
+ containerCount.get());
+        // At least one volume (and its containers) failed; send an FCR to SCM
+ Message report = context.getFullContainerReportDiscardPendingICR();
+ context.refreshFullReport(report);
+ context.getParent().triggerHeartbeat();
+ } catch (Exception e) {
+ LOG.error("Failed to send FCR in Volume failure", e);
+ }
+ }
}
@Override
diff --git
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index bc2461b3ff..4e52d25e47 100644
---
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -437,7 +437,7 @@ public class OzoneContainer {
public void handleVolumeFailures() {
if (containerSet != null) {
- containerSet.handleVolumeFailures();
+ containerSet.handleVolumeFailures(context);
}
}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
index 6694b3e380..9b7170f8fa 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
@@ -273,13 +273,8 @@ public class TestStorageVolumeChecker {
Assert.assertEquals(1, volumeSet.getVolumesList().size());
Assert.assertEquals(1, volumeSet.getFailedVolumesList().size());
- i = 0;
- for (ContainerDataProto.State state : ContainerDataProto.State.values()) {
- if (!state.equals(ContainerDataProto.State.INVALID)) {
- Assert.assertEquals(ContainerDataProto.State.UNHEALTHY,
- containerSet.getContainer(++i).getContainerState());
- }
- }
+ // All containers should be removed from containerSet
+ Assert.assertEquals(0, containerSet.getContainerMap().size());
ozoneContainer.stop();
}
diff --git
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
index 84263de93c..bb281f1eb5 100644
---
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
+++
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
@@ -20,18 +20,35 @@ package org.apache.hadoop.ozone.container.common.volume;
import java.io.File;
import java.io.IOException;
+import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Collection;
+import java.util.HashMap;
import java.util.List;
+import java.util.Map;
import java.util.Set;
import java.util.UUID;
+import com.google.protobuf.Message;
import org.apache.hadoop.hdds.DFSConfigKeysLegacy;
+import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.ConfigurationSource;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
import org.apache.hadoop.ozone.OzoneConfigKeys;
+import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.ozone.container.ContainerTestHelper;
import org.apache.hadoop.ozone.container.common.ContainerTestUtils;
+import org.apache.hadoop.ozone.container.common.TestDatanodeStateMachine;
+import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion;
+import org.apache.hadoop.ozone.container.common.impl.ContainerSet;
import
org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
+import
org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine;
+import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
+import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer;
+import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
+import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
import org.apache.ozone.test.GenericTestUtils;
import org.apache.hadoop.util.DiskChecker.DiskErrorException;
import org.apache.hadoop.util.Timer;
@@ -44,10 +61,13 @@ import org.junit.After;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertThat;
import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
import org.junit.Assert;
import org.junit.Rule;
import org.junit.Test;
+import org.junit.jupiter.api.Assertions;
import org.junit.rules.ExpectedException;
import org.junit.rules.Timeout;
import org.slf4j.Logger;
@@ -226,6 +246,114 @@ public class TestVolumeSetDiskChecks {
return ozoneConf;
}
+  /**
+   * Verify that when a volume fails, its containers are removed from the
+   * containerSet and an FCR report is sent.
+   * @throws IOException if container or volume setup fails
+   */
+ @Test
+ public void testVolumeFailure() throws IOException {
+ final int numVolumes = 5;
+
+ conf = getConfWithDataNodeDirs(numVolumes);
+ ContainerTestUtils.enableSchemaV3(conf);
+ UUID datanodeId = UUID.randomUUID();
+ StorageVolumeChecker dummyChecker =
+ new DummyChecker(conf, new Timer(), 0);
+
+ OzoneContainer ozoneContainer = mock(OzoneContainer.class);
+ ContainerSet conSet = new ContainerSet(20);
+ when(ozoneContainer.getContainerSet()).thenReturn(conSet);
+
+ String path = GenericTestUtils
+ .getTempPath(TestDatanodeStateMachine.class.getSimpleName());
+ File testRoot = new File(path);
+
+ conf.set(HddsConfigKeys.OZONE_METADATA_DIRS,
+ new File(testRoot, "scm").getAbsolutePath());
+ path = new File(testRoot, "datanodeID").getAbsolutePath();
+ conf.set(ScmConfigKeys.OZONE_SCM_DATANODE_ID_DIR, path);
+
+ long containerID = ContainerTestHelper.getTestContainerID();
+ ContainerLayoutVersion layout = ContainerLayoutVersion.FILE_PER_CHUNK;
+ KeyValueContainerData data =
+ new KeyValueContainerData(containerID, layout,
+ ContainerTestHelper.CONTAINER_MAX_SIZE,
+ UUID.randomUUID().toString(), datanodeId.toString());
+ data.closeContainer();
+ data.setSchemaVersion(OzoneConsts.SCHEMA_V3);
+
+ long containerID1 = ContainerTestHelper.getTestContainerID();
+ KeyValueContainerData data1 =
+ new KeyValueContainerData(containerID1, layout,
+ ContainerTestHelper.CONTAINER_MAX_SIZE,
+ UUID.randomUUID().toString(), datanodeId.toString());
+ data1.closeContainer();
+ data1.setSchemaVersion(OzoneConsts.SCHEMA_V3);
+
+ final MutableVolumeSet volumeSet = new MutableVolumeSet(
+ UUID.randomUUID().toString(), conf, null,
+ StorageVolume.VolumeType.DATA_VOLUME,
+ dummyChecker);
+
+ final MutableVolumeSet volumeSet1 = new MutableVolumeSet(
+ UUID.randomUUID().toString(), conf, null,
+ StorageVolume.VolumeType.DATA_VOLUME,
+ dummyChecker);
+
+ KeyValueContainer container = new KeyValueContainer(data, conf);
+ container.create(volumeSet,
+ new RoundRobinVolumeChoosingPolicy(), UUID.randomUUID().toString());
+ conSet.addContainer(container);
+
+ KeyValueContainer container1 = new KeyValueContainer(data1, conf);
+ container1.create(volumeSet1,
+ new RoundRobinVolumeChoosingPolicy(), UUID.randomUUID().toString());
+ conSet.addContainer(container1);
+ DatanodeStateMachine datanodeStateMachineMock =
+ mock(DatanodeStateMachine.class);
+ StateContext stateContext = new StateContext(
+ new OzoneConfiguration(), DatanodeStateMachine
+ .DatanodeStates.getInitState(),
+ datanodeStateMachineMock);
+ InetSocketAddress scm1 = new InetSocketAddress("scm1", 9001);
+ stateContext.addEndpoint(scm1);
+ when(datanodeStateMachineMock.getContainer()).thenReturn(ozoneContainer);
+
+ Map<String, Integer> expectedReportCount = new HashMap<>();
+ checkReportCount(stateContext.getAllAvailableReports(scm1),
+ expectedReportCount);
+
+ // Fail one volume
+ volumeSet1.failVolume(volumeSet1.getVolumesList().get(0)
+ .getStorageDir().getPath());
+
+ conSet.handleVolumeFailures(stateContext);
+    // containerID1, which belongs to the failed volume, should be removed
+    Assert.assertNull(conSet.getContainer(containerID1));
+    // containerID, which belongs to the healthy volume, should still exist
+    Assert.assertNotNull(conSet.getContainer(containerID));
+ expectedReportCount.put(
+ StorageContainerDatanodeProtocolProtos.ContainerReportsProto
+ .getDescriptor().getFullName(), 1);
+
+ // Check FCR report should be present
+ checkReportCount(stateContext.getAllAvailableReports(scm1),
+ expectedReportCount);
+ volumeSet.shutdown();
+ }
+
+ void checkReportCount(List<Message> reports,
+ Map<String, Integer> expectedReportCount) {
+ Map<String, Integer> reportCount = new HashMap<>();
+ for (Message report : reports) {
+ final String reportName = report.getDescriptorForType().getFullName();
+ reportCount.put(reportName, reportCount.getOrDefault(reportName, 0) + 1);
+ }
+ // Verify
+ Assertions.assertEquals(expectedReportCount, reportCount);
+ }
+
/**
* A no-op checker that fails the given number of volumes and succeeds
* the rest.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]