steveloughran commented on code in PR #6069:
URL: https://github.com/apache/hadoop/pull/6069#discussion_r1350497313


##########
hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsClient.java:
##########
@@ -1412,6 +1447,97 @@ private void appendIfNotEmpty(StringBuilder sb, String 
regEx,
     }
   }
 
+  /**
+   * Add MD5 hash as request header to the append request
+   * @param requestHeaders to be updated with checksum header
+   * @param reqParams for getting offset and length
+   * @param buffer for getting input data for MD5 computation
+   * @throws AbfsRestOperationException if Md5 computation fails
+   */
+  private void addCheckSumHeaderForWrite(List<AbfsHttpHeader> requestHeaders,
+      final AppendRequestParameters reqParams, final byte[] buffer)
+      throws AbfsRestOperationException {
+    String md5Hash = computeMD5Hash(buffer, reqParams.getoffset(),
+        reqParams.getLength());
+    requestHeaders.add(new AbfsHttpHeader(CONTENT_MD5, md5Hash));
+  }
+
+  /**
+   * To verify the checksum information received from server for the data read
+   * @param buffer stores the data received from server
+   * @param result HTTP Operation Result
+   * @param bufferOffset Position where data returned by server is saved in 
buffer
+   * @throws AbfsRestOperationException if Md5Mismatch
+   */
+  private void verifyCheckSumForRead(final byte[] buffer,
+      final AbfsHttpOperation result, final int bufferOffset)
+      throws AbfsRestOperationException {
+    // Number of bytes returned by server could be less than or equal to what
+    // caller requests. In case it is less, extra bytes will be initialized to 0
+    // Server returned MD5 Hash will be computed on what server returned.
+    // We need to get exact data that server returned and compute its md5 hash
+    // Computed hash should be equal to what server returned
+    int numberOfBytesRead = (int) result.getBytesReceived();
+    if (numberOfBytesRead == 0) {
+      return;
+    }
+    String md5HashComputed = computeMD5Hash(buffer, bufferOffset,
+        numberOfBytesRead);
+    String md5HashActual = result.getResponseHeader(CONTENT_MD5);

Review Comment:
   is this ever not going to be returned?



##########
hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsClient.java:
##########
@@ -1412,6 +1447,97 @@ private void appendIfNotEmpty(StringBuilder sb, String 
regEx,
     }
   }
 
+  /**
+   * Add MD5 hash as request header to the append request
+   * @param requestHeaders to be updated with checksum header
+   * @param reqParams for getting offset and length
+   * @param buffer for getting input data for MD5 computation
+   * @throws AbfsRestOperationException if Md5 computation fails
+   */
+  private void addCheckSumHeaderForWrite(List<AbfsHttpHeader> requestHeaders,
+      final AppendRequestParameters reqParams, final byte[] buffer)
+      throws AbfsRestOperationException {
+    String md5Hash = computeMD5Hash(buffer, reqParams.getoffset(),
+        reqParams.getLength());
+    requestHeaders.add(new AbfsHttpHeader(CONTENT_MD5, md5Hash));
+  }
+
+  /**
+   * To verify the checksum information received from server for the data read
+   * @param buffer stores the data received from server
+   * @param result HTTP Operation Result
+   * @param bufferOffset Position where data returned by server is saved in 
buffer
+   * @throws AbfsRestOperationException if Md5Mismatch
+   */
+  private void verifyCheckSumForRead(final byte[] buffer,
+      final AbfsHttpOperation result, final int bufferOffset)
+      throws AbfsRestOperationException {
+    // Number of bytes returned by server could be less than or equal to what
+    // caller requests. In case it is less, extra bytes will be initialized to 0
+    // Server returned MD5 Hash will be computed on what server returned.
+    // We need to get exact data that server returned and compute its md5 hash
+    // Computed hash should be equal to what server returned
+    int numberOfBytesRead = (int) result.getBytesReceived();
+    if (numberOfBytesRead == 0) {
+      return;
+    }
+    String md5HashComputed = computeMD5Hash(buffer, bufferOffset,
+        numberOfBytesRead);
+    String md5HashActual = result.getResponseHeader(CONTENT_MD5);
+    if (!md5HashComputed.equals(md5HashActual)) {
+      throw new AbfsInvalidChecksumException(result.getRequestId());
+    }
+  }
+
+  /**
+   * Conditions check for allowing checksum support for read operation.
+   * As per the azure documentation following conditions should be met before
+   * Sending MD5 Hash in request headers.
+   * {@link <a 
href="https://learn.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/path/read";></a>}
+   * 1. Range header should be present as one of the request headers

Review Comment:
   1. use HTML <ol><li> indexing so that it renders well.
   2. use MUST over "should" so that the RFC 2119 MUST/SHOULD/MAY terms apply.
   



##########
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFileSystemChecksum.java:
##########
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs;
+
+import java.security.SecureRandom;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsInvalidChecksumException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters;
+import org.apache.hadoop.fs.azurebfs.services.AbfsClient;
+import org.apache.hadoop.fs.impl.OpenFileParameters;
+
+import static 
org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.MD5_ERROR_SERVER_MESSAGE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_BUFFERED_PREAD_DISABLE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_MB;
+import static 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters.Mode.APPEND_MODE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.mockito.ArgumentMatchers.any;
+
+/**
+ * Test For Verifying Checksum Related Operations
+ */
+public class ITestAzureBlobFileSystemChecksum extends 
AbstractAbfsIntegrationTest {
+
+  public ITestAzureBlobFileSystemChecksum() throws Exception {
+    super();
+  }
+
+  @Test
+  public void testWriteReadWithChecksum() throws Exception {
+    testWriteReadWithChecksumInternal(true);
+    testWriteReadWithChecksumInternal(false);
+  }
+
+  @Test
+  public void testAppendWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    Path path = path("testPath");
+    fs.create(path);
+    byte[] data= generateRandomBytes(4 * ONE_MB);
+
+    appendWithOffsetHelper(client, path, data, fs, 0);
+    appendWithOffsetHelper(client, path, data, fs, 1 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 2 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 4 * ONE_MB - 1);
+  }
+
+  @Test
+  public void testReadWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);

Review Comment:
   make all of these values named constants, such as MB4 and MB16, for reuse



##########
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFileSystemChecksum.java:
##########
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs;
+
+import java.security.SecureRandom;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsInvalidChecksumException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters;
+import org.apache.hadoop.fs.azurebfs.services.AbfsClient;
+import org.apache.hadoop.fs.impl.OpenFileParameters;
+
+import static 
org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.MD5_ERROR_SERVER_MESSAGE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_BUFFERED_PREAD_DISABLE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_MB;
+import static 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters.Mode.APPEND_MODE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.mockito.ArgumentMatchers.any;
+
+/**
+ * Test For Verifying Checksum Related Operations
+ */
+public class ITestAzureBlobFileSystemChecksum extends 
AbstractAbfsIntegrationTest {
+
+  public ITestAzureBlobFileSystemChecksum() throws Exception {
+    super();
+  }
+
+  @Test
+  public void testWriteReadWithChecksum() throws Exception {
+    testWriteReadWithChecksumInternal(true);
+    testWriteReadWithChecksumInternal(false);
+  }
+
+  @Test
+  public void testAppendWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    Path path = path("testPath");

Review Comment:
   use methodPath() to get a new path with the method name; this guarantees
   there's no conflict with others.



##########
hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsClient.java:
##########
@@ -875,10 +873,15 @@ private boolean checkUserError(int responseStatusCode) {
         && responseStatusCode < HttpURLConnection.HTTP_INTERNAL_ERROR);
   }
 
-  private boolean isMd5ChecksumError(final AzureBlobFileSystemException e) {
+  /**
+   * To check if the failure exception returned by server is due to MD5 
Mismatch
+   * @param e Exception returned by AbfsRestOperation
+   * @return boolean whether exception is due to MD5Mismatch or not
+   */
+  protected boolean isMd5ChecksumError(final AzureBlobFileSystemException e) {
     return ((AbfsRestOperationException) e).getStatusCode()
         == HttpURLConnection.HTTP_BAD_REQUEST
-        && ((AbfsRestOperationException) 
e).getErrorMessage().contains(MD5_ERROR);
+        && e.getMessage().contains(MD5_ERROR_SERVER_MESSAGE);

Review Comment:
   If the errorMessage is that explicit "Md5Mismatch" string, then we should be
   looking for that.



##########
hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/contracts/exceptions/AbfsRuntimeException.java:
##########
@@ -0,0 +1,54 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * <p>
+ * http://www.apache.org/licenses/LICENSE-2.0
+ * <p>
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package org.apache.hadoop.fs.azurebfs.contracts.exceptions;
+
+import org.apache.hadoop.classification.InterfaceAudience;
+import org.apache.hadoop.classification.InterfaceStability;
+import org.apache.hadoop.fs.azurebfs.contracts.services.AzureServiceErrorCode;
+
+/**
+ * Exception to wrap invalid checksum verification on client side.

Review Comment:
   * the javadoc is incorrect.
   * unsure about the name, as a "RuntimeException" class is usually a way of
   saying "this is an RTE", rather than, as here, "we caught an RTE".
   
   proposed
   1. a more accurate name "AbfsDriverException"
   2. take a Throwable in case ever needed



##########
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFileSystemChecksum.java:
##########
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs;
+
+import java.security.SecureRandom;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsInvalidChecksumException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters;
+import org.apache.hadoop.fs.azurebfs.services.AbfsClient;
+import org.apache.hadoop.fs.impl.OpenFileParameters;
+
+import static 
org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.MD5_ERROR_SERVER_MESSAGE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_BUFFERED_PREAD_DISABLE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_MB;
+import static 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters.Mode.APPEND_MODE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.mockito.ArgumentMatchers.any;
+
+/**
+ * Test For Verifying Checksum Related Operations
+ */
+public class ITestAzureBlobFileSystemChecksum extends 
AbstractAbfsIntegrationTest {
+
+  public ITestAzureBlobFileSystemChecksum() throws Exception {
+    super();
+  }
+
+  @Test
+  public void testWriteReadWithChecksum() throws Exception {
+    testWriteReadWithChecksumInternal(true);
+    testWriteReadWithChecksumInternal(false);
+  }
+
+  @Test
+  public void testAppendWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    Path path = path("testPath");
+    fs.create(path);
+    byte[] data= generateRandomBytes(4 * ONE_MB);
+
+    appendWithOffsetHelper(client, path, data, fs, 0);
+    appendWithOffsetHelper(client, path, data, fs, 1 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 2 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 4 * ONE_MB - 1);
+  }
+
+  @Test
+  public void testReadWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    fs.getAbfsStore().setClient(client);
+    Path path = path("testPath");
+
+    byte[] data = generateRandomBytes(16 * ONE_MB);
+    FSDataOutputStream out = fs.create(path);

Review Comment:
   use try-with-resources so that close() is always called.



##########
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFileSystemChecksum.java:
##########
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs;
+
+import java.security.SecureRandom;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsInvalidChecksumException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters;
+import org.apache.hadoop.fs.azurebfs.services.AbfsClient;
+import org.apache.hadoop.fs.impl.OpenFileParameters;
+
+import static 
org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.MD5_ERROR_SERVER_MESSAGE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_BUFFERED_PREAD_DISABLE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_MB;
+import static 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters.Mode.APPEND_MODE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.mockito.ArgumentMatchers.any;
+
+/**
+ * Test For Verifying Checksum Related Operations
+ */
+public class ITestAzureBlobFileSystemChecksum extends 
AbstractAbfsIntegrationTest {
+
+  public ITestAzureBlobFileSystemChecksum() throws Exception {
+    super();
+  }
+
+  @Test
+  public void testWriteReadWithChecksum() throws Exception {
+    testWriteReadWithChecksumInternal(true);
+    testWriteReadWithChecksumInternal(false);
+  }
+
+  @Test
+  public void testAppendWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);

Review Comment:
   does this need a close() afterwards to clean up threads? If so: because most
   tests do this, add a field "abfs", then in teardown use
   IOUtils.cleanupWithLogger() to close it, and then call the superclass teardown.



##########
hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/services/AbfsClient.java:
##########
@@ -1412,6 +1447,97 @@ private void appendIfNotEmpty(StringBuilder sb, String 
regEx,
     }
   }
 
+  /**
+   * Add MD5 hash as request header to the append request
+   * @param requestHeaders to be updated with checksum header
+   * @param reqParams for getting offset and length
+   * @param buffer for getting input data for MD5 computation
+   * @throws AbfsRestOperationException if Md5 computation fails
+   */
+  private void addCheckSumHeaderForWrite(List<AbfsHttpHeader> requestHeaders,
+      final AppendRequestParameters reqParams, final byte[] buffer)
+      throws AbfsRestOperationException {
+    String md5Hash = computeMD5Hash(buffer, reqParams.getoffset(),
+        reqParams.getLength());
+    requestHeaders.add(new AbfsHttpHeader(CONTENT_MD5, md5Hash));
+  }
+
+  /**
+   * To verify the checksum information received from server for the data read
+   * @param buffer stores the data received from server
+   * @param result HTTP Operation Result
+   * @param bufferOffset Position where data returned by server is saved in 
buffer
+   * @throws AbfsRestOperationException if Md5Mismatch
+   */
+  private void verifyCheckSumForRead(final byte[] buffer,
+      final AbfsHttpOperation result, final int bufferOffset)
+      throws AbfsRestOperationException {
+    // Number of bytes returned by server could be less than or equal to what
+    // caller requests. In case it is less, extra bytes will be initialized to 0
+    // Server returned MD5 Hash will be computed on what server returned.
+    // We need to get exact data that server returned and compute its md5 hash
+    // Computed hash should be equal to what server returned
+    int numberOfBytesRead = (int) result.getBytesReceived();
+    if (numberOfBytesRead == 0) {
+      return;
+    }
+    String md5HashComputed = computeMD5Hash(buffer, bufferOffset,
+        numberOfBytesRead);
+    String md5HashActual = result.getResponseHeader(CONTENT_MD5);
+    if (!md5HashComputed.equals(md5HashActual)) {
+      throw new AbfsInvalidChecksumException(result.getRequestId());
+    }
+  }
+
+  /**
+   * Conditions check for allowing checksum support for read operation.
+   * As per the azure documentation following conditions should be met before
+   * Sending MD5 Hash in request headers.
+   * {@link <a 
href="https://learn.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/path/read";></a>}
+   * 1. Range header should be present as one of the request headers
+   * 2. buffer length should not exceed 4MB.
+   * @param requestHeaders to be checked for range header
+   * @param rangeHeader must be present
+   * @param bufferLength must be less than 4MB

Review Comment:
   nit: less than or equal



##########
hadoop-tools/hadoop-azure/src/main/java/org/apache/hadoop/fs/azurebfs/constants/ConfigurationKeys.java:
##########
@@ -241,6 +241,9 @@ public final class ConfigurationKeys {
   /** Add extra resilience to rename failures, at the expense of performance. 
*/
   public static final String FS_AZURE_ABFS_RENAME_RESILIENCE = 
"fs.azure.enable.rename.resilience";
 
+  /** Add extra layer of verification of the integrity of the request content 
during transport. */
+  public static final String FS_AZURE_ABFS_ENABLE_CHECKSUM_VALIDATION = 
"fs.azure.enable.checksum.validation";

Review Comment:
   Add a {@value} entry to the javadoc so that IDEs as well as the generated
   javadocs show the value.



##########
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFileSystemChecksum.java:
##########
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs;
+
+import java.security.SecureRandom;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsInvalidChecksumException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters;
+import org.apache.hadoop.fs.azurebfs.services.AbfsClient;
+import org.apache.hadoop.fs.impl.OpenFileParameters;
+
+import static 
org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.MD5_ERROR_SERVER_MESSAGE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_BUFFERED_PREAD_DISABLE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_MB;
+import static 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters.Mode.APPEND_MODE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.mockito.ArgumentMatchers.any;
+
+/**
+ * Test For Verifying Checksum Related Operations
+ */
+public class ITestAzureBlobFileSystemChecksum extends 
AbstractAbfsIntegrationTest {
+
+  public ITestAzureBlobFileSystemChecksum() throws Exception {
+    super();
+  }
+
+  @Test
+  public void testWriteReadWithChecksum() throws Exception {
+    testWriteReadWithChecksumInternal(true);
+    testWriteReadWithChecksumInternal(false);
+  }
+
+  @Test
+  public void testAppendWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    Path path = path("testPath");
+    fs.create(path);
+    byte[] data= generateRandomBytes(4 * ONE_MB);
+
+    appendWithOffsetHelper(client, path, data, fs, 0);
+    appendWithOffsetHelper(client, path, data, fs, 1 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 2 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 4 * ONE_MB - 1);
+  }
+
+  @Test
+  public void testReadWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    fs.getAbfsStore().setClient(client);
+    Path path = path("testPath");
+
+    byte[] data = generateRandomBytes(16 * ONE_MB);
+    FSDataOutputStream out = fs.create(path);
+    out.write(data);
+    out.hflush();
+    out.close();
+
+    readWithOffsetAndPositionHelper(client, path, data, fs, 0, 0);
+    readWithOffsetAndPositionHelper(client, path, data, fs, 4 * ONE_MB, 0);
+    readWithOffsetAndPositionHelper(client, path, data, fs, 4 * ONE_MB, 1 * 
ONE_MB);
+    readWithOffsetAndPositionHelper(client, path, data, fs, 8 * ONE_MB, 2 * 
ONE_MB);
+    readWithOffsetAndPositionHelper(client, path, data, fs, 15 * ONE_MB, 4 * 
ONE_MB - 1);
+  }
+
+  @Test
+  public void testWriteReadWithChecksumAndOptions() throws Exception {
+    testWriteReadWithChecksumAndOptionsInternal(true);
+    testWriteReadWithChecksumAndOptionsInternal(false);
+  }
+
+  @Test
+  public void testAbfsInvalidChecksumExceptionInAppend() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient spiedClient = Mockito.spy(fs.getAbfsStore().getClient());
+    fs.getAbfsStore().setClient(spiedClient);
+    Path path = path("testPath");
+    fs.create(path);
+    byte[] data= generateRandomBytes(4 * ONE_MB);
+    String invalidMD5Hash = 
spiedClient.computeMD5Hash("InvalidData".getBytes(), 0, 11);
+    Mockito.doReturn(invalidMD5Hash).when(spiedClient).computeMD5Hash(any(),
+        any(Integer.class), any(Integer.class));
+    AbfsRestOperationException ex = 
intercept(AbfsInvalidChecksumException.class, () -> {
+      appendWithOffsetHelper(spiedClient, path, data, fs, 0);
+    });
+
+    Assertions.assertThat(ex.getErrorMessage())
+        .describedAs("Exception Message should contain MD5Mismatch")
+        .contains(MD5_ERROR_SERVER_MESSAGE);
+  }
+
+  @Test
+  public void testAbfsInvalidChecksumExceptionInRead() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient spiedClient = Mockito.spy(fs.getAbfsStore().getClient());
+    fs.getAbfsStore().setClient(spiedClient);
+    Path path = path("testPath");
+    byte[] data = generateRandomBytes(3 * ONE_MB);
+    FSDataOutputStream out = fs.create(path);
+    out.write(data);
+    out.hflush();
+    out.close();
+
+    String invalidMD5Hash = 
spiedClient.computeMD5Hash("InvalidData".getBytes(), 0, 11);
+    Mockito.doReturn(invalidMD5Hash).when(spiedClient).computeMD5Hash(any(),
+        any(Integer.class), any(Integer.class));
+
+    intercept(AbfsInvalidChecksumException.class, () -> {
+      readWithOffsetAndPositionHelper(spiedClient, path, data, fs, 0, 0);
+    });
+  }
+
+  private void testWriteReadWithChecksumInternal(final boolean 
readAheadEnabled)
+      throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
readAheadEnabled);
+    final int dataSize = 16 * ONE_MB + 1000;
+
+    Path testPath = path("testPath");
+    byte[] bytesUploaded = generateRandomBytes(dataSize);
+    FSDataOutputStream out = fs.create(testPath);
+    out.write(bytesUploaded);
+    out.hflush();
+    out.close();
+
+    FSDataInputStream in = fs.open(testPath);

Review Comment:
   this input stream also needs closing.



##########
hadoop-tools/hadoop-azure/src/site/markdown/abfs.md:
##########
@@ -909,22 +909,31 @@ DelegatingSSLSocketFactory.SSLChannelMode. The default 
value will be
 DelegatingSSLSocketFactory.SSLChannelMode.Default.
 
 ### <a name="serverconfigoptions"></a> Server Options
-When the config `fs.azure.io.read.tolerate.concurrent.append` is made true, the
+`fs.azure.io.read.tolerate.concurrent.append`: When the config is made true, 
the
 If-Match header sent to the server for read calls will be set as * otherwise 
the
 same will be set with ETag. This is basically a mechanism in place to handle 
the
 reads with optimistic concurrency.
 Please refer the following links for further information.
 1. 
https://docs.microsoft.com/en-us/rest/api/storageservices/datalakestoragegen2/path/read
 2. 
https://azure.microsoft.com/de-de/blog/managing-concurrency-in-microsoft-azure-storage-2/
 
-listStatus API fetches the FileStatus information from server in a page by page
-manner. The config `fs.azure.list.max.results` used to set the maxResults URI
- param which sets the pagesize(maximum results per call). The value should
- be >  0. By default this will be 5000. Server has a maximum value for this
- parameter as 5000. So even if the config is above 5000 the response will only
+`fs.azure.list.max.results`: listStatus API fetches the FileStatus information
+from server in a page by page manner. The config used to set the maxResults URI

Review Comment:
   nit: change "The config used to set" to "The config is used to set"



##########
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFileSystemChecksum.java:
##########
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs;
+
+import java.security.SecureRandom;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsInvalidChecksumException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters;
+import org.apache.hadoop.fs.azurebfs.services.AbfsClient;
+import org.apache.hadoop.fs.impl.OpenFileParameters;
+
+import static 
org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.MD5_ERROR_SERVER_MESSAGE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_BUFFERED_PREAD_DISABLE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_MB;
+import static 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters.Mode.APPEND_MODE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.mockito.ArgumentMatchers.any;
+
+/**
+ * Test For Verifying Checksum Related Operations
+ */
+public class ITestAzureBlobFileSystemChecksum extends 
AbstractAbfsIntegrationTest {
+
+  public ITestAzureBlobFileSystemChecksum() throws Exception {
+    super();
+  }
+
+  @Test
+  public void testWriteReadWithChecksum() throws Exception {
+    testWriteReadWithChecksumInternal(true);
+    testWriteReadWithChecksumInternal(false);
+  }
+
+  @Test
+  public void testAppendWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    Path path = path("testPath");
+    fs.create(path);

Review Comment:
   needs closing after line 73; wrap it in a try-with-resources block or close it in a finally block



##########
hadoop-tools/hadoop-azure/src/test/java/org/apache/hadoop/fs/azurebfs/ITestAzureBlobFileSystemChecksum.java:
##########
@@ -0,0 +1,259 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.fs.azurebfs;
+
+import java.security.SecureRandom;
+import java.util.Arrays;
+import java.util.HashSet;
+
+import org.assertj.core.api.Assertions;
+import org.junit.Test;
+import org.mockito.Mockito;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FSDataInputStream;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsInvalidChecksumException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.exceptions.AbfsRestOperationException;
+import 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters;
+import org.apache.hadoop.fs.azurebfs.services.AbfsClient;
+import org.apache.hadoop.fs.impl.OpenFileParameters;
+
+import static 
org.apache.hadoop.fs.azurebfs.constants.AbfsHttpConstants.MD5_ERROR_SERVER_MESSAGE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.ConfigurationKeys.FS_AZURE_BUFFERED_PREAD_DISABLE;
+import static 
org.apache.hadoop.fs.azurebfs.constants.FileSystemConfigurations.ONE_MB;
+import static 
org.apache.hadoop.fs.azurebfs.contracts.services.AppendRequestParameters.Mode.APPEND_MODE;
+import static org.apache.hadoop.test.LambdaTestUtils.intercept;
+import static org.mockito.ArgumentMatchers.any;
+
+/**
+ * Test For Verifying Checksum Related Operations
+ */
+public class ITestAzureBlobFileSystemChecksum extends 
AbstractAbfsIntegrationTest {
+
+  public ITestAzureBlobFileSystemChecksum() throws Exception {
+    super();
+  }
+
+  @Test
+  public void testWriteReadWithChecksum() throws Exception {
+    testWriteReadWithChecksumInternal(true);
+    testWriteReadWithChecksumInternal(false);
+  }
+
+  @Test
+  public void testAppendWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    Path path = path("testPath");
+    fs.create(path);
+    byte[] data= generateRandomBytes(4 * ONE_MB);
+
+    appendWithOffsetHelper(client, path, data, fs, 0);
+    appendWithOffsetHelper(client, path, data, fs, 1 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 2 * ONE_MB);
+    appendWithOffsetHelper(client, path, data, fs, 4 * ONE_MB - 1);
+  }
+
+  @Test
+  public void testReadWithChecksumAtDifferentOffsets() throws Exception {
+    AzureBlobFileSystem fs = getConfiguredFileSystem(4 * ONE_MB, 4 * ONE_MB, 
true);
+    AbfsClient client = fs.getAbfsStore().getClient();
+    fs.getAbfsStore().setClient(client);
+    Path path = path("testPath");

Review Comment:
   use methodPath() here and below



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: common-issues-unsubscr...@hadoop.apache.org
For additional commands, e-mail: common-issues-h...@hadoop.apache.org


Reply via email to