This is an automated email from the ASF dual-hosted git repository.

sodonnell pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new 85c60f2354 HDDS-8448. Send FCR to SCM on volume failure. (#4773)
85c60f2354 is described below

commit 85c60f2354a8983db870166a9dc6883066fe75ff
Author: ashishkumar50 <[email protected]>
AuthorDate: Tue Jun 6 18:48:29 2023 +0530

    HDDS-8448. Send FCR to SCM on volume failure. (#4773)
---
 .../ozone/container/common/impl/ContainerSet.java  |  42 +++++--
 .../ozone/container/ozoneimpl/OzoneContainer.java  |   2 +-
 .../common/volume/TestStorageVolumeChecker.java    |   9 +-
 .../common/volume/TestVolumeSetDiskChecks.java     | 128 +++++++++++++++++++++
 4 files changed, 163 insertions(+), 18 deletions(-)

diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
index 726e149db5..2df5d40ef9 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/common/impl/ContainerSet.java
@@ -21,10 +21,12 @@ package org.apache.hadoop.ozone.container.common.impl;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
 import com.google.common.collect.ImmutableMap;
+import com.google.protobuf.Message;
 import org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos;
 import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos.ContainerReportsProto;
 import 
org.apache.hadoop.hdds.scm.container.common.helpers.StorageContainerException;
 import org.apache.hadoop.ozone.container.common.interfaces.Container;
+import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
 import org.apache.hadoop.ozone.container.common.volume.HddsVolume;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -41,6 +43,8 @@ import java.util.Set;
 import java.util.concurrent.ConcurrentNavigableMap;
 import java.util.concurrent.ConcurrentSkipListMap;
 import java.util.concurrent.ConcurrentSkipListSet;
+import java.util.concurrent.atomic.AtomicBoolean;
+import java.util.concurrent.atomic.AtomicInteger;
 
 import static 
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerDataProto.State.RECOVERING;
 
@@ -179,21 +183,39 @@ public class ContainerSet implements 
Iterable<Container<?>> {
     return containerMap.size();
   }
 
-  public void handleVolumeFailures() {
+  /**
+   * Remove all containers belonging to failed volumes and send a full
+   * container report (FCR) to SCM, which will no longer include the
+   * removed containers.
+   *
+   * @param context the StateContext used to refresh and send the FCR
+   */
+  public void handleVolumeFailures(StateContext context) {
+    AtomicBoolean failedVolume = new AtomicBoolean(false);
+    AtomicInteger containerCount = new AtomicInteger(0);
     containerMap.values().forEach(c -> {
       if (c.getContainerData().getVolume().isFailed()) {
-        try {
-          c.markContainerUnhealthy();
-          LOG.info("Marking Container {} UNHEALTHY as the Volume {} " +
+        removeContainer(c.getContainerData().getContainerID());
+        LOG.debug("Removing Container {} as the Volume {} " +
               "has failed", c.getContainerData().getContainerID(),
-              c.getContainerData().getVolume());
-        } catch (StorageContainerException e) {
-          LOG.error("Failed to move container {} to UNHEALTHY state in "
-                  + "volume {}", c.getContainerData().getContainerID(),
-              c.getContainerData().getVolume(), e);
-        }
+            c.getContainerData().getVolume());
+        failedVolume.set(true);
+        containerCount.incrementAndGet();
       }
     });
+
+    if (failedVolume.get()) {
+      try {
+        LOG.info("Removed {} containers on failed volumes",
+            containerCount.get());
+        // One or more volumes failed and their containers were removed; send an FCR to SCM
+        Message report = context.getFullContainerReportDiscardPendingICR();
+        context.refreshFullReport(report);
+        context.getParent().triggerHeartbeat();
+      } catch (Exception e) {
+        LOG.error("Failed to send FCR in Volume failure", e);
+      }
+    }
   }
 
   @Override
diff --git 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index bc2461b3ff..4e52d25e47 100644
--- 
a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++ 
b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -437,7 +437,7 @@ public class OzoneContainer {
 
   public void handleVolumeFailures() {
     if (containerSet != null) {
-      containerSet.handleVolumeFailures();
+      containerSet.handleVolumeFailures(context);
     }
   }
 
diff --git 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
index 6694b3e380..9b7170f8fa 100644
--- 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
+++ 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestStorageVolumeChecker.java
@@ -273,13 +273,8 @@ public class TestStorageVolumeChecker {
     Assert.assertEquals(1, volumeSet.getVolumesList().size());
     Assert.assertEquals(1, volumeSet.getFailedVolumesList().size());
 
-    i = 0;
-    for (ContainerDataProto.State state : ContainerDataProto.State.values()) {
-      if (!state.equals(ContainerDataProto.State.INVALID)) {
-        Assert.assertEquals(ContainerDataProto.State.UNHEALTHY,
-            containerSet.getContainer(++i).getContainerState());
-      }
-    }
+    // All containers should be removed from containerSet
+    Assert.assertEquals(0, containerSet.getContainerMap().size());
 
     ozoneContainer.stop();
   }
diff --git 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
index 84263de93c..bb281f1eb5 100644
--- 
a/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
+++ 
b/hadoop-hdds/container-service/src/test/java/org/apache/hadoop/ozone/container/common/volume/TestVolumeSetDiskChecks.java
@@ -20,18 +20,35 @@ package org.apache.hadoop.ozone.container.common.volume;
 
 import java.io.File;
 import java.io.IOException;
+import java.net.InetSocketAddress;
 import java.util.ArrayList;
 import java.util.Collection;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
 
+import com.google.protobuf.Message;
 import org.apache.hadoop.hdds.DFSConfigKeysLegacy;
+import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.conf.ConfigurationSource;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
+import 
org.apache.hadoop.hdds.protocol.proto.StorageContainerDatanodeProtocolProtos;
+import org.apache.hadoop.hdds.scm.ScmConfigKeys;
 import org.apache.hadoop.ozone.OzoneConfigKeys;
+import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.ozone.container.ContainerTestHelper;
 import org.apache.hadoop.ozone.container.common.ContainerTestUtils;
+import org.apache.hadoop.ozone.container.common.TestDatanodeStateMachine;
+import org.apache.hadoop.ozone.container.common.impl.ContainerLayoutVersion;
+import org.apache.hadoop.ozone.container.common.impl.ContainerSet;
 import 
org.apache.hadoop.ozone.container.common.statemachine.DatanodeConfiguration;
+import 
org.apache.hadoop.ozone.container.common.statemachine.DatanodeStateMachine;
+import org.apache.hadoop.ozone.container.common.statemachine.StateContext;
+import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainer;
+import org.apache.hadoop.ozone.container.keyvalue.KeyValueContainerData;
+import org.apache.hadoop.ozone.container.ozoneimpl.OzoneContainer;
 import org.apache.ozone.test.GenericTestUtils;
 import org.apache.hadoop.util.DiskChecker.DiskErrorException;
 import org.apache.hadoop.util.Timer;
@@ -44,10 +61,13 @@ import org.junit.After;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertThat;
 import static org.junit.Assert.assertTrue;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
 
 import org.junit.Assert;
 import org.junit.Rule;
 import org.junit.Test;
+import org.junit.jupiter.api.Assertions;
 import org.junit.rules.ExpectedException;
 import org.junit.rules.Timeout;
 import org.slf4j.Logger;
@@ -226,6 +246,114 @@ public class TestVolumeSetDiskChecks {
     return ozoneConf;
   }
 
+  /**
+   * Verify that when a volume fails, its containers are removed from the
+   * containerSet and an FCR report is sent.
+   * @throws IOException if container or volume setup fails
+   */
+  @Test
+  public void testVolumeFailure() throws IOException {
+    final int numVolumes = 5;
+
+    conf = getConfWithDataNodeDirs(numVolumes);
+    ContainerTestUtils.enableSchemaV3(conf);
+    UUID datanodeId = UUID.randomUUID();
+    StorageVolumeChecker dummyChecker =
+        new DummyChecker(conf, new Timer(), 0);
+
+    OzoneContainer ozoneContainer = mock(OzoneContainer.class);
+    ContainerSet conSet = new ContainerSet(20);
+    when(ozoneContainer.getContainerSet()).thenReturn(conSet);
+
+    String path = GenericTestUtils
+        .getTempPath(TestDatanodeStateMachine.class.getSimpleName());
+    File testRoot = new File(path);
+
+    conf.set(HddsConfigKeys.OZONE_METADATA_DIRS,
+        new File(testRoot, "scm").getAbsolutePath());
+    path = new File(testRoot, "datanodeID").getAbsolutePath();
+    conf.set(ScmConfigKeys.OZONE_SCM_DATANODE_ID_DIR, path);
+
+    long containerID = ContainerTestHelper.getTestContainerID();
+    ContainerLayoutVersion layout = ContainerLayoutVersion.FILE_PER_CHUNK;
+    KeyValueContainerData data =
+        new KeyValueContainerData(containerID, layout,
+            ContainerTestHelper.CONTAINER_MAX_SIZE,
+            UUID.randomUUID().toString(), datanodeId.toString());
+    data.closeContainer();
+    data.setSchemaVersion(OzoneConsts.SCHEMA_V3);
+
+    long containerID1 = ContainerTestHelper.getTestContainerID();
+    KeyValueContainerData data1 =
+        new KeyValueContainerData(containerID1, layout,
+            ContainerTestHelper.CONTAINER_MAX_SIZE,
+            UUID.randomUUID().toString(), datanodeId.toString());
+    data1.closeContainer();
+    data1.setSchemaVersion(OzoneConsts.SCHEMA_V3);
+
+    final MutableVolumeSet volumeSet = new MutableVolumeSet(
+        UUID.randomUUID().toString(), conf, null,
+        StorageVolume.VolumeType.DATA_VOLUME,
+        dummyChecker);
+
+    final MutableVolumeSet volumeSet1 = new MutableVolumeSet(
+        UUID.randomUUID().toString(), conf, null,
+        StorageVolume.VolumeType.DATA_VOLUME,
+        dummyChecker);
+
+    KeyValueContainer container = new KeyValueContainer(data, conf);
+    container.create(volumeSet,
+        new RoundRobinVolumeChoosingPolicy(), UUID.randomUUID().toString());
+    conSet.addContainer(container);
+
+    KeyValueContainer container1 = new KeyValueContainer(data1, conf);
+    container1.create(volumeSet1,
+        new RoundRobinVolumeChoosingPolicy(), UUID.randomUUID().toString());
+    conSet.addContainer(container1);
+    DatanodeStateMachine datanodeStateMachineMock =
+        mock(DatanodeStateMachine.class);
+    StateContext stateContext = new StateContext(
+        new OzoneConfiguration(), DatanodeStateMachine
+        .DatanodeStates.getInitState(),
+        datanodeStateMachineMock);
+    InetSocketAddress scm1 = new InetSocketAddress("scm1", 9001);
+    stateContext.addEndpoint(scm1);
+    when(datanodeStateMachineMock.getContainer()).thenReturn(ozoneContainer);
+
+    Map<String, Integer> expectedReportCount = new HashMap<>();
+    checkReportCount(stateContext.getAllAvailableReports(scm1),
+        expectedReportCount);
+
+    // Fail one volume
+    volumeSet1.failVolume(volumeSet1.getVolumesList().get(0)
+        .getStorageDir().getPath());
+
+    conSet.handleVolumeFailures(stateContext);
+    // containerID1, which belonged to the failed volume, should be removed
+    Assert.assertNull(conSet.getContainer(containerID1));
+    // containerID, which belongs to a healthy volume, should still exist
+    Assert.assertNotNull(conSet.getContainer(containerID));
+    expectedReportCount.put(
+        StorageContainerDatanodeProtocolProtos.ContainerReportsProto
+            .getDescriptor().getFullName(), 1);
+
+    // Verify that the FCR report is now present
+    checkReportCount(stateContext.getAllAvailableReports(scm1),
+        expectedReportCount);
+    volumeSet.shutdown();
+  }
+
+  void checkReportCount(List<Message> reports,
+                        Map<String, Integer> expectedReportCount) {
+    Map<String, Integer> reportCount = new HashMap<>();
+    for (Message report : reports) {
+      final String reportName = report.getDescriptorForType().getFullName();
+      reportCount.put(reportName, reportCount.getOrDefault(reportName, 0) + 1);
+    }
+    // Verify
+    Assertions.assertEquals(expectedReportCount, reportCount);
+  }
+
   /**
    * A no-op checker that fails the given number of volumes and succeeds
    * the rest.


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to