This is an automated email from the ASF dual-hosted git repository.
erose pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new eaf9a7db26 HDDS-9852. Intermittent timeout in testCorruptionDetected
waiting for container to become unhealthy (#5948)
eaf9a7db26 is described below
commit eaf9a7db2675a9a5bd0a7f6f45a0cbb79f1cf6d4
Author: Doroszlai, Attila <[email protected]>
AuthorDate: Mon Jan 8 23:25:28 2024 +0100
HDDS-9852. Intermittent timeout in testCorruptionDetected waiting for
container to become unhealthy (#5948)
---
.../AbstractBackgroundContainerScanner.java | 49 ++++++++++++++++------
.../ozone/container/ozoneimpl/OzoneContainer.java | 16 +++++++
...tBackgroundContainerDataScannerIntegration.java | 6 ++-
.../TestContainerScannerIntegrationAbstract.java | 47 ++++++++++++++++-----
4 files changed, 93 insertions(+), 25 deletions(-)
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java
index 139952d212..0ba01a191f 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/AbstractBackgroundContainerScanner.java
@@ -39,6 +39,7 @@ public abstract class AbstractBackgroundContainerScanner extends Thread {
private final long dataScanInterval;
private final AtomicBoolean stopping;
+ private final AtomicBoolean pausing = new AtomicBoolean();
public AbstractBackgroundContainerScanner(String name,
long dataScanInterval) {
@@ -69,30 +70,44 @@ public abstract class AbstractBackgroundContainerScanner extends Thread {
@VisibleForTesting
public final void runIteration() {
+ final boolean paused = pausing.get();
long startTime = System.nanoTime();
- scanContainers();
+ if (!paused) {
+ scanContainers();
+ }
long totalDuration = System.nanoTime() - startTime;
if (stopping.get()) {
return;
}
- AbstractContainerScannerMetrics metrics = getMetrics();
- metrics.incNumScanIterations();
- LOG.info("Completed an iteration in {} minutes." +
- " Number of iterations (since the data-node restart) : {}" +
- ", Number of containers scanned in this iteration : {}" +
- ", Number of unhealthy containers found in this iteration : {}",
- TimeUnit.NANOSECONDS.toMinutes(totalDuration),
- metrics.getNumScanIterations(),
- metrics.getNumContainersScanned(),
- metrics.getNumUnHealthyContainers());
+ if (paused) {
+ LOG.debug("Skipped iteration due to pause");
+ } else {
+ AbstractContainerScannerMetrics metrics = getMetrics();
+ metrics.incNumScanIterations();
+ LOG.info("Completed an iteration in {} minutes." +
+ " Number of iterations (since the data-node restart) : {}" +
+ ", Number of containers scanned in this iteration : {}" +
+ ", Number of unhealthy containers found in this iteration : {}",
+ TimeUnit.NANOSECONDS.toMinutes(totalDuration),
+ metrics.getNumScanIterations(),
+ metrics.getNumContainersScanned(),
+ metrics.getNumUnHealthyContainers());
+ }
long elapsedMillis = TimeUnit.NANOSECONDS.toMillis(totalDuration);
long remainingSleep = dataScanInterval - elapsedMillis;
handleRemainingSleep(remainingSleep);
}
- public final void scanContainers() {
+ private void scanContainers() {
Iterator<Container<?>> itr = getContainerIterator();
- while (!stopping.get() && itr.hasNext()) {
+ while (itr.hasNext()) {
+ final boolean stopped = stopping.get();
+ final boolean paused = pausing.get();
+ if (stopped || paused) {
+ LOG.info("{} exits scan loop stop={} pause={}", this, stopped, paused);
+ break;
+ }
+
Container<?> c = itr.next();
try {
scanContainer(c);
@@ -139,6 +154,14 @@ public abstract class AbstractBackgroundContainerScanner extends Thread {
}
}
+ public void pause() {
+ pausing.getAndSet(true);
+ }
+
+ public void unpause() {
+ pausing.getAndSet(false);
+ }
+
@VisibleForTesting
public abstract AbstractContainerScannerMetrics getMetrics();
}
diff --git a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
index 277ab4464e..f050c96a45 100644
--- a/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
+++ b/hadoop-hdds/container-service/src/main/java/org/apache/hadoop/ozone/container/ozoneimpl/OzoneContainer.java
@@ -70,6 +70,7 @@ import java.io.IOException;
import java.time.Duration;
import java.util.ArrayList;
import java.util.Iterator;
+import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ThreadFactory;
@@ -111,6 +112,7 @@ public class OzoneContainer {
private final ContainerController controller;
private BackgroundContainerMetadataScanner metadataScanner;
private List<BackgroundContainerDataScanner> dataScanners;
+ private List<AbstractBackgroundContainerScanner> backgroundScanners;
private final BlockDeletingService blockDeletingService;
private final StaleRecoveringContainerScrubbingService
recoveringContainerScrubbingService;
@@ -338,8 +340,10 @@ public class OzoneContainer {
"the on-demand container scanner have been disabled.");
return;
}
+
initOnDemandContainerScanner(c);
+ backgroundScanners = new LinkedList<>();
// This config is for testing the scanners in isolation.
if (c.isMetadataScanEnabled()) {
initMetadataScanner(c);
@@ -363,6 +367,7 @@ public class OzoneContainer {
new BackgroundContainerDataScanner(c, controller, (HddsVolume) v);
s.start();
dataScanners.add(s);
+ backgroundScanners.add(s);
}
}
@@ -370,6 +375,7 @@ public class OzoneContainer {
if (this.metadataScanner == null) {
this.metadataScanner =
new BackgroundContainerMetadataScanner(c, controller);
+ backgroundScanners.add(metadataScanner);
}
this.metadataScanner.start();
}
@@ -402,6 +408,16 @@ public class OzoneContainer {
OnDemandContainerDataScanner.shutdown();
}
+ @VisibleForTesting
+ public void pauseContainerScrub() {
+ backgroundScanners.forEach(AbstractBackgroundContainerScanner::pause);
+ }
+
+ @VisibleForTesting
+ public void resumeContainerScrub() {
+ backgroundScanners.forEach(AbstractBackgroundContainerScanner::unpause);
+ }
+
/**
* Starts serving requests to ozone container.
*
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java
index 218c35c7d3..adc1234c2e 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestBackgroundContainerDataScannerIntegration.java
@@ -71,6 +71,8 @@ class TestBackgroundContainerDataScannerIntegration
@EnumSource
void testCorruptionDetected(ContainerCorruptions corruption)
throws Exception {
+ pauseScanner();
+
long containerID = writeDataThenCloseContainer();
// Container corruption has not yet been introduced.
Container<?> container = getDnContainer(containerID);
@@ -78,10 +80,12 @@ class TestBackgroundContainerDataScannerIntegration
corruption.applyTo(container);
+ resumeScanner();
+
// Wait for the scanner to detect corruption.
GenericTestUtils.waitFor(
() -> container.getContainerState() == State.UNHEALTHY,
- 500, 5000);
+ 500, 15_000);
// Wait for SCM to get a report of the unhealthy replica.
waitForScmToSeeUnhealthyReplica(containerID);
diff --git a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
index ce66700094..f53e041b54 100644
--- a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
+++ b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/dn/scanner/TestContainerScannerIntegrationAbstract.java
@@ -20,6 +20,7 @@
package org.apache.hadoop.ozone.dn.scanner;
import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.IOUtils;
import org.apache.hadoop.hdds.HddsConfigKeys;
import org.apache.hadoop.hdds.conf.OzoneConfiguration;
import org.apache.hadoop.hdds.protocol.proto.HddsProtos;
@@ -52,15 +53,16 @@ import java.io.File;
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
+import java.nio.file.Path;
import java.nio.file.StandardOpenOption;
import java.time.Duration;
import java.util.Arrays;
import java.util.EnumSet;
import java.util.Objects;
import java.util.Optional;
-import java.util.Random;
import java.util.Set;
import java.util.UUID;
+import java.util.concurrent.ThreadLocalRandom;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
@@ -115,6 +117,15 @@ public abstract class TestContainerScannerIntegrationAbstract {
bucket = volume.getBucket(bucketName);
}
+ void pauseScanner() {
+ getOzoneContainer().pauseContainerScrub();
+ }
+
+ void resumeScanner() {
+ getOzoneContainer().resumeContainerScrub();
+ }
+
+
@AfterAll
static void shutdown() throws IOException {
if (ozClient != null) {
@@ -142,11 +153,14 @@ public abstract class TestContainerScannerIntegrationAbstract {
!= HddsProtos.LifeCycleState.OPEN);
}
- protected Container<?> getDnContainer(long containerID) {
+ private static OzoneContainer getOzoneContainer() {
assertEquals(1, cluster.getHddsDatanodes().size());
HddsDatanodeService dn = cluster.getHddsDatanodes().get(0);
- OzoneContainer oc = dn.getDatanodeStateMachine().getContainer();
- return oc.getContainerSet().getContainer(containerID);
+ return dn.getDatanodeStateMachine().getContainer();
+ }
+
+ protected Container<?> getDnContainer(long containerID) {
+ return getOzoneContainer().getContainerSet().getContainer(containerID);
}
protected long writeDataThenCloseContainer() throws Exception {
@@ -308,7 +322,6 @@ public abstract class TestContainerScannerIntegrationAbstract {
private final Consumer<Container<?>> corruption;
private final ScanResult.FailureType expectedResult;
- private static final Random RANDOM = new Random();
ContainerCorruptions(Consumer<Container<?>> corruption,
ScanResult.FailureType expectedResult) {
@@ -345,11 +358,21 @@ public abstract class TestContainerScannerIntegrationAbstract {
* Overwrite the file with random bytes.
*/
private static void corruptFile(File file) {
- byte[] corruptedBytes = new byte[(int)file.length()];
- RANDOM.nextBytes(corruptedBytes);
try {
- Files.write(file.toPath(), corruptedBytes,
- StandardOpenOption.TRUNCATE_EXISTING);
+ final int length = (int) file.length();
+
+ Path path = file.toPath();
+ final byte[] original = IOUtils.readFully(Files.newInputStream(path), length);
+
+ final byte[] corruptedBytes = new byte[length];
+ ThreadLocalRandom.current().nextBytes(corruptedBytes);
+
+ Files.write(path, corruptedBytes,
+ StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC);
+
+ assertThat(IOUtils.readFully(Files.newInputStream(path), length))
+ .isEqualTo(corruptedBytes)
+ .isNotEqualTo(original);
} catch (IOException ex) {
// Fail the test.
throw new UncheckedIOException(ex);
@@ -361,8 +384,10 @@ public abstract class TestContainerScannerIntegrationAbstract {
*/
private static void truncateFile(File file) {
try {
- Files.write(file.toPath(), new byte[]{},
- StandardOpenOption.TRUNCATE_EXISTING);
+ Files.write(file.toPath(), new byte[0],
+ StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.SYNC);
+
+ assertEquals(0, file.length());
} catch (IOException ex) {
// Fail the test.
throw new UncheckedIOException(ex);
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]