This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/master by this push:
new 99a5703372 HDDS-10652. EC Reconstruction fails with "IOException: None
of the block data have checksum" after upgrade (#6520)
99a5703372 is described below
commit 99a5703372994038186381ae93aafe8f5a066fa4
Author: Siddhant Sangwan <[email protected]>
AuthorDate: Thu Apr 18 21:44:44 2024 +0530
HDDS-10652. EC Reconstruction fails with "IOException: None of the block
data have checksum" after upgrade (#6520)
---
.../hdds/scm/storage/ECBlockOutputStream.java | 24 +++++--
.../storage/TestBlockOutputStreamCorrectness.java | 83 ++++++++++++++++++++++
2 files changed, 100 insertions(+), 7 deletions(-)
diff --git
a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/ECBlockOutputStream.java
b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/ECBlockOutputStream.java
index adecc3e4c1..3c545b776f 100644
---
a/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/ECBlockOutputStream.java
+++
b/hadoop-hdds/client/src/main/java/org/apache/hadoop/hdds/scm/storage/ECBlockOutputStream.java
@@ -115,17 +115,28 @@ public class ECBlockOutputStream extends
BlockOutputStream {
}
BlockData checksumBlockData = null;
+ BlockID blockID = null;
//Reverse Traversal as all parity will have checksumBytes
for (int i = blockData.length - 1; i >= 0; i--) {
BlockData bd = blockData[i];
if (bd == null) {
continue;
}
+ if (blockID == null) {
+ // store the BlockID for logging
+ blockID = bd.getBlockID();
+ }
List<ChunkInfo> chunks = bd.getChunks();
- if (chunks != null && chunks.size() > 0 && chunks.get(0)
- .hasStripeChecksum()) {
- checksumBlockData = bd;
- break;
+ if (chunks != null && chunks.size() > 0) {
+ if (chunks.get(0).hasStripeChecksum()) {
+ checksumBlockData = bd;
+ break;
+ } else {
+ ChunkInfo chunk = chunks.get(0);
+ LOG.debug("The first chunk in block with index {} does not have
stripeChecksum. BlockID: {}, Block " +
+ "size: {}. Chunk length: {}, Chunk offset: {},
hasChecksumData: {}, chunks size: {}.", i,
+ bd.getBlockID(), bd.getSize(), chunk.getLen(),
chunk.getOffset(), chunk.hasChecksumData(), chunks.size());
+ }
}
}
@@ -158,9 +169,8 @@ public class ECBlockOutputStream extends BlockOutputStream {
getContainerBlockData().clearChunks();
getContainerBlockData().addAllChunks(newChunkList);
} else {
- throw new IOException("None of the block data have checksum " +
- "which means " + parity + "(parity)+1 blocks are " +
- "not present");
+ LOG.warn("Could not find checksum data in any index for blockData with
BlockID {}, length {} and " +
+ "blockGroupLength {}.", blockID, blockData.length, blockGroupLength);
}
return executePutBlock(close, force, blockGroupLength);
diff --git
a/hadoop-hdds/client/src/test/java/org/apache/hadoop/hdds/scm/storage/TestBlockOutputStreamCorrectness.java
b/hadoop-hdds/client/src/test/java/org/apache/hadoop/hdds/scm/storage/TestBlockOutputStreamCorrectness.java
index d06c9cf684..bf4830c6fc 100644
---
a/hadoop-hdds/client/src/test/java/org/apache/hadoop/hdds/scm/storage/TestBlockOutputStreamCorrectness.java
+++
b/hadoop-hdds/client/src/test/java/org/apache/hadoop/hdds/scm/storage/TestBlockOutputStreamCorrectness.java
@@ -23,9 +23,13 @@ import java.util.Map;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.atomic.AtomicInteger;
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.ImmutableMap;
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.hdds.client.BlockID;
+import org.apache.hadoop.hdds.client.ECReplicationConfig;
import org.apache.hadoop.hdds.protocol.DatanodeDetails;
+import org.apache.hadoop.hdds.protocol.MockDatanodeDetails;
import
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ChecksumType;
import
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandRequestProto;
import
org.apache.hadoop.hdds.protocol.datanode.proto.ContainerProtos.ContainerCommandResponseProto;
@@ -42,8 +46,13 @@ import org.apache.hadoop.hdds.scm.XceiverClientReply;
import org.apache.hadoop.hdds.scm.XceiverClientSpi;
import org.apache.hadoop.hdds.scm.pipeline.MockPipeline;
import org.apache.hadoop.hdds.scm.pipeline.Pipeline;
+import org.apache.hadoop.hdds.scm.pipeline.PipelineID;
import org.apache.hadoop.ozone.OzoneConsts;
+import org.apache.hadoop.ozone.container.common.helpers.BlockData;
+import org.apache.hadoop.ozone.container.common.helpers.ChunkInfo;
import org.apache.ratis.thirdparty.com.google.protobuf.ByteString;
+import org.junit.jupiter.api.Assertions;
+import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;
@@ -83,6 +92,66 @@ class TestBlockOutputStreamCorrectness {
}
}
+ /**
+ * Tests an EC offline reconstruction scenario in which none of the
ChunkInfo in an EC stripe have stripeChecksum.
+ * Such ChunkInfo will exist for any EC data that was written in a version
in which the ChunkInfo protobuf message did
+ * not have the stripeChecksum field. Here, we assert that executePutBlock
during reconstruction does not throw an
+ * exception because of missing stripeChecksum. This essentially tests
compatibility between an Ozone version that
+ * did not have stripeChecksum and a version that has stripeChecksum.
+ */
+ @Test
+ public void
testMissingStripeChecksumDoesNotMakeExecutePutBlockFailDuringECReconstruction()
throws IOException {
+ // setup some parameters required for creating ECBlockOutputStream
+ OzoneClientConfig config = new OzoneClientConfig();
+ ECReplicationConfig replicationConfig = new ECReplicationConfig(3, 2);
+ BlockID blockID = new BlockID(1, 1);
+ DatanodeDetails datanodeDetails =
MockDatanodeDetails.randomDatanodeDetails();
+ Pipeline pipeline = Pipeline.newBuilder()
+ .setId(PipelineID.valueOf(datanodeDetails.getUuid()))
+ .setReplicationConfig(replicationConfig)
+ .setNodes(ImmutableList.of(datanodeDetails))
+ .setState(Pipeline.PipelineState.CLOSED)
+ // we'll executePutBlock for the parity index 5 because stripeChecksum
is written to either the first or the
+ // parity indexes
+ .setReplicaIndexes(ImmutableMap.of(datanodeDetails, 5)).build();
+
+ BlockLocationInfo locationInfo = new BlockLocationInfo.Builder()
+ .setBlockID(blockID)
+ .setOffset(1)
+ .setLength(10)
+ .setPipeline(pipeline).build();
+
+ /*
+ The array of BlockData contains metadata about blocks and their chunks,
and is read in executePutBlock. In
+ this test, we deliberately don't write stripeChecksum to any chunk. The
expectation is that executePutBlock
+ should not throw an exception because of missing stripeChecksum.
+ */
+ BlockData[] blockData = createBlockDataWithoutStripeChecksum(blockID,
replicationConfig);
+ try (ECBlockOutputStream ecBlockOutputStream =
createECBlockOutputStream(config, replicationConfig, blockID,
+ pipeline)) {
+ Assertions.assertDoesNotThrow(() ->
ecBlockOutputStream.executePutBlock(true, true, locationInfo.getLength(),
+ blockData));
+ }
+ }
+
+ /**
+ * Creates a BlockData array with {@link
ECReplicationConfig#getRequiredNodes()} number of elements.
+ */
+ private BlockData[] createBlockDataWithoutStripeChecksum(BlockID blockID,
ECReplicationConfig replicationConfig) {
+ int requiredNodes = replicationConfig.getRequiredNodes();
+ BlockData[] blockDataArray = new BlockData[requiredNodes];
+
+ // add just one ChunkInfo to each BlockData.
+ for (int i = 0; i < requiredNodes; i++) {
+ BlockData data = new BlockData(blockID);
+ // create a ChunkInfo with no stripeChecksum
+ ChunkInfo chunkInfo = new ChunkInfo("abc", 0, 10);
+ data.addChunk(chunkInfo.getProtoBufMessage());
+ blockDataArray[i] = data;
+ }
+ return blockDataArray;
+ }
+
private BlockOutputStream createBlockOutputStream(BufferPool bufferPool)
throws IOException {
@@ -114,6 +183,20 @@ class TestBlockOutputStreamCorrectness {
() -> newFixedThreadPool(10));
}
+ private ECBlockOutputStream createECBlockOutputStream(OzoneClientConfig
clientConfig,
+ ECReplicationConfig repConfig, BlockID blockID, Pipeline pipeline)
throws IOException {
+ final XceiverClientManager xcm = mock(XceiverClientManager.class);
+ when(xcm.acquireClient(any()))
+ .thenReturn(new MockXceiverClientSpi(pipeline));
+
+ ContainerClientMetrics clientMetrics = ContainerClientMetrics.acquire();
+ StreamBufferArgs streamBufferArgs =
+ StreamBufferArgs.getDefaultStreamBufferArgs(repConfig, clientConfig);
+
+ return new ECBlockOutputStream(blockID, xcm, pipeline, BufferPool.empty(),
clientConfig, null,
+ clientMetrics, streamBufferArgs, () -> newFixedThreadPool(2));
+ }
+
/**
* XCeiverClient which simulates responses.
*/
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]