This is an automated email from the ASF dual-hosted git repository.

erose pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git


The following commit(s) were added to refs/heads/master by this push:
     new eaf9a7db26 HDDS-9852. Intermittent timeout in testCorruptionDetected waiting for container to become unhealthy (#5948)
eaf9a7db26 is described below

commit eaf9a7db2675a9a5bd0a7f6f45a0cbb79f1cf6d4
Author: Doroszlai, Attila <[email protected]>
AuthorDate: Mon Jan 8 23:25:28 2024 +0100

    HDDS-9852. Intermittent timeout in testCorruptionDetected waiting for container to become unhealthy (#5948)
---
 .../AbstractBackgroundContainerScanner.java        | 49 ++++++++++++++++------
 .../ozone/container/ozoneimpl/OzoneContainer.java  | 16 +++++++
 ...tBackgroundContainerDataScannerIntegration.java |  6 ++-
 .../TestContainerScannerIntegrationAbstract.java   | 47 ++++++++++++++++-----
 4 files changed, 93 insertions(+), 25 deletions(-)

diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java
index 139952d212..0ba01a191f 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java
@@ -39,6 +39,7 @@ public abstract class AbstractBackgroundContainerScanner extends Thread {
   private final long dataScanInterval;
 
   private final AtomicBoolean stopping;
+  private final AtomicBoolean pausing = new AtomicBoolean();
 
   public AbstractBackgroundContainerScanner(String name,
       long dataScanInterval) {
@@ -69,30 +70,44 @@ public abstract class AbstractBackgroundContainerScanner extends Thread {
 
   @VisibleForTesting
   public final void runIteration() {
+    final boolean paused = pausing.get();
     long startTime = System.nanoTime();
-    scanContainers();
+    if (!paused) {
+      scanContainers();
+    }
     long totalDuration = System.nanoTime() - startTime;
     if (stopping.get()) {
       return;
     }
-    AbstractContainerScannerMetrics metrics = getMetrics();
-    metrics.incNumScanIterations();
-    LOG.info("Completed an iteration in {} minutes." +
-            " Number of iterations (since the data-node restart) : {}" +
-            ", Number of containers scanned in this iteration : {}" +
-            ", Number of unhealthy containers found in this iteration : {}",
-        TimeUnit.NANOSECONDS.toMinutes(totalDuration),
-        metrics.getNumScanIterations(),
-        metrics.getNumContainersScanned(),
-        metrics.getNumUnHealthyContainers());
+    if (paused) {
+      LOG.debug("Skipped iteration due to pause");
+    } else {
+      AbstractContainerScannerMetrics metrics = getMetrics();
+      metrics.incNumScanIterations();
+      LOG.info("Completed an iteration in {} minutes." +
+              " Number of iterations (since the data-node restart) : {}" +
+              ", Number of containers scanned in this iteration : {}" +
+              ", Number of unhealthy containers found in this iteration : {}",
+          TimeUnit.NANOSECONDS.toMinutes(totalDuration),
+          metrics.getNumScanIterations(),
+          metrics.getNumContainersScanned(),
+          metrics.getNumUnHealthyContainers());
+    }
     long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(totalDuration);
     long remainingSleep = dataScanInterval - elapsedMillis;
     handleRemainingSleep(remainingSleep);
   }
 
-  public final void scanContainers() {
+  private void scanContainers() {
     Iterator<Container<?>> itr = getContainerIterator();
-    while (!stopping.get() && itr.hasNext()) {
+    while (itr.hasNext()) {
+      final boolean stopped = stopping.get();
+      final boolean paused = pausing.get();
+      if (stopped || paused) {
+        LOG.info("{} exits scan loop stop={} pause={}", this, stopped, paused);
+        break;
+      }
+
       Container<?> c = itr.next();
       try {
         scanContainer(c);
@@ -139,6 +154,14 @@ public abstract class AbstractBackgroundContainerScanner extends Thread {
     }
   }
 
+  public void pause() {
+    pausing.getAndSet(true);
+  }
+
+  public void unpause() {
+    pausing.getAndSet(false);
+  }
+
   @VisibleForTesting
   public abstract AbstractContainerScannerMetrics getMetrics();
 }
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 277ab4464e..f050c96a45 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -70,6 +70,7 @@ import java.io.IOException;
 import java.time.Duration;
 import java.util.ArrayList;
 import java.util.Iterator;
+import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ThreadFactory;
@@ -111,6 +112,7 @@ public class OzoneContainer {
   private final ContainerController controller;
   private BackgroundContainerMetadataScanner metadataScanner;
   private List<BackgroundContainerDataScanner> dataScanners;
+  private List<AbstractBackgroundContainerScanner> backgroundScanners;
   private final BlockDeletingService blockDeletingService;
   private final StaleRecoveringContainerScrubbingService
       recoveringContainerScrubbingService;
@@ -338,8 +340,10 @@ public class OzoneContainer {
           "the on-demand container scanner have been disabled.");
       return;
     }
+
     initOnDemandContainerScanner(c);
 
+    backgroundScanners = new LinkedList<>();
     // This config is for testing the scanners in isolation.
     if (c.isMetadataScanEnabled()) {
       initMetadataScanner(c);
@@ -363,6 +367,7 @@ public class OzoneContainer {
           new BackgroundContainerDataScanner(c, controller, (HddsVolume) v);
       s.start();
       dataScanners.add(s);
+      backgroundScanners.add(s);
     }
   }
 
@@ -370,6 +375,7 @@ public class OzoneContainer {
     if (this.metadataScanner == null) {
       this.metadataScanner =
           new BackgroundContainerMetadataScanner(c, controller);
+      backgroundScanners.add(metadataScanner);
     }
     this.metadataScanner.start();
   }
@@ -402,6 +408,16 @@ public class OzoneContainer {
     OnDemandContainerDataScanner.shutdown();
   }
 
+  @VisibleForTesting
+  public void pauseContainerScrub() {
+    backgroundScanners.forEach(AbstractBackgroundContainerScanner::pause);
+  }
+
+  @VisibleForTesting
+  public void resumeContainerScrub() {
+    backgroundScanners.forEach(AbstractBackgroundContainerScanner::unpause);
+  }
+
   /**
    * Starts serving requests to ozone container.
    *
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java
index 218c35c7d3..adc1234c2e 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java
@@ -71,6 +71,8 @@ class TestBackgroundContainerDataScannerIntegration
   @EnumSource
   void testCorruptionDetected(ContainerCorruptions corruption)
       throws Exception {
+    pauseScanner();
+
     long containerID = writeDataThenCloseContainer();
     // Container corruption has not yet been introduced.
     Container<?> container = getDnContainer(containerID);
@@ -78,10 +80,12 @@ class TestBackgroundContainerDataScannerIntegration
 
     corruption.applyTo(container);
 
+    resumeScanner();
+
     // Wait for the scanner to detect corruption.
     GenericTestUtils.waitFor(
         () -> container.getContainerState() == State.UNHEALTHY,
-        500, 5000);
+        500, 15_000);
 
     // Wait for SCM to get a report of the unhealthy replica.
     waitForScmToSeeUnhealthyReplica(containerID);
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
index ce66700094..f53e041b54 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
@@ -20,6 +20,7 @@
 package org.apache.hadoop.ozone.dn.scanner;
 
 import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.hadoop.hdds.HddsConfigKeys;
 import org.apache.hadoop.hdds.conf.OzoneConfiguration;
 import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
@@ -52,15 +53,16 @@ import java.io.File;
 import java.io.IOException;
 import java.io.UncheckedIOException;
 import java.nio.file.Files;
+import java.nio.file.Path;
 import java.nio.file.StandardOpenOption;
 import java.time.Duration;
 import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.Objects;
 import java.util.Optional;
-import java.util.Random;
 import java.util.Set;
 import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
 import java.util.concurrent.TimeUnit;
 import java.util.function.Consumer;
 
@@ -115,6 +117,15 @@ public abstract class TestContainerScannerIntegrationAbstract {
     bucket = volume.getBucket(bucketName);
   }
 
+  void pauseScanner() {
+    getOzoneContainer().pauseContainerScrub();
+  }
+
+  void resumeScanner() {
+    getOzoneContainer().resumeContainerScrub();
+  }
+
+
   @AfterAll
   static void shutdown() throws IOException {
     if (ozClient != null) {
@@ -142,11 +153,14 @@ public abstract class TestContainerScannerIntegrationAbstract {
             != HddsProtos.LifeCycleState.OPEN);
   }
 
-  protected Container<?> getDnContainer(long containerID) {
+  private static OzoneContainer getOzoneContainer() {
     assertEquals(1, cluster.getHddsDatanodes().size());
     HddsDatanodeService dn = cluster.getHddsDatanodes().get(0);
-    OzoneContainer oc = dn.getDatanodeStateMachine().getContainer();
-    return oc.getContainerSet().getContainer(containerID);
+    return dn.getDatanodeStateMachine().getContainer();
+  }
+
+  protected Container<?> getDnContainer(long containerID) {
+    return getOzoneContainer().getContainerSet().getContainer(containerID);
   }
 
   protected long writeDataThenCloseContainer() throws Exception {
@@ -308,7 +322,6 @@ public abstract class TestContainerScannerIntegrationAbstract {
 
     private final Consumer<Container<?>> corruption;
     private final ScanResult.FailureType expectedResult;
-    private static final Random RANDOM = new Random();
 
     ContainerCorruptions(Consumer<Container<?>> corruption,
                          ScanResult.FailureType expectedResult) {
@@ -345,11 +358,21 @@ public abstract class TestContainerScannerIntegrationAbstract {
      * Overwrite the file with random bytes.
      */
     private static void corruptFile(File file) {
-      byte[] corruptedBytes = new byte[(int)file.length()];
-      RANDOM.nextBytes(corruptedBytes);
       try {
-        Files.write(file.toPath(), corruptedBytes,
-            StandardOpenOption.TRUNCATE_EXISTING);
+        final int length = (int) file.length();
+
+        Path path = file.toPath();
+        final byte[] original = IOUtils.readFully(Files.newInputStream(path), length);
+
+        final byte[] corruptedBytes = new byte[length];
+        ThreadLocalRandom.current().nextBytes(corruptedBytes);
+
+        Files.write(path, corruptedBytes,
+            StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC);
+
+        assertThat(IOUtils.readFully(Files.newInputStream(path), length))
+            .isEqualTo(corruptedBytes)
+            .isNotEqualTo(original);
       } catch (IOException ex) {
         // Fail the test.
         throw new UncheckedIOException(ex);
@@ -361,8 +384,10 @@ public abstract class TestContainerScannerIntegrationAbstract {
      */
     private static void truncateFile(File file) {
       try {
-        Files.write(file.toPath(), new byte[]{},
-            StandardOpenOption.TRUNCATE_EXISTING);
+        Files.write(file.toPath(), new byte[0],
+            StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC);
+
+        assertEquals(0, file.length());
       } catch (IOException ex) {
         // Fail the test.
         throw new UncheckedIOException(ex);


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to