This is an automated email from the ASF dual-hosted git repository.
weichiu pushed a commit to branch HDDS-7593
in repository https://gitbox.apache.org/repos/asf/ozone.git
The following commit(s) were added to refs/heads/HDDS-7593 by this push:
new 370b9d7c5c HDDS-10256. Retry block allocation when SCM is in safe
mode. (#6189)
370b9d7c5c is described below
commit 370b9d7c5cb85344cef4ab360776030baf395b27
Author: ashishkumar50 <[email protected]>
AuthorDate: Sat Feb 10 06:40:32 2024 +0530
HDDS-10256. Retry block allocation when SCM is in safe mode. (#6189)
Co-authored-by: ashishk <[email protected]>
---
...OzoneManagerProtocolClientSideTranslatorPB.java | 41 +++++++++++++---
.../apache/hadoop/ozone/om/TestScmSafeMode.java | 54 ++++++++++++++++++++++
2 files changed, 89 insertions(+), 6 deletions(-)
diff --git
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java
index bd40dfcf02..7d029ba044 100644
---
a/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java
+++
b/hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/protocolPB/OzoneManagerProtocolClientSideTranslatorPB.java
@@ -229,6 +229,7 @@ import org.apache.hadoop.util.ProtobufUtils;
import static org.apache.hadoop.ozone.OzoneConsts.OM_S3_CALLER_CONTEXT_PREFIX;
import static org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes;
+import static
org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.SCM_IN_SAFE_MODE;
import static
org.apache.hadoop.ozone.om.exceptions.OMException.ResultCodes.TOKEN_ERROR_OTHER;
import static
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.CancelPrepareRequest;
import static
org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.CancelPrepareResponse;
@@ -258,6 +259,10 @@ public final class
OzoneManagerProtocolClientSideTranslatorPB
= new ThreadLocal<>();
private boolean s3AuthCheck;
+
+ public static final int BLOCK_ALLOCATION_RETRY_COUNT = 5;
+ public static final int BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS = 3000;
+
public OzoneManagerProtocolClientSideTranslatorPB(OmTransport omTransport,
String clientId) {
this.clientID = clientId;
@@ -728,8 +733,7 @@ public final class
OzoneManagerProtocolClientSideTranslatorPB
.setCreateKeyRequest(req)
.build();
- CreateKeyResponse keyResponse =
- handleError(submitRequest(omRequest)).getCreateKeyResponse();
+ CreateKeyResponse keyResponse =
handleSubmitRequestAndSCMSafeModeRetry(omRequest).getCreateKeyResponse();
return new OpenKeySession(keyResponse.getID(),
OmKeyInfo.getFromProtobuf(keyResponse.getKeyInfo()),
keyResponse.getOpenVersion());
@@ -774,8 +778,7 @@ public final class
OzoneManagerProtocolClientSideTranslatorPB
.setAllocateBlockRequest(req)
.build();
- AllocateBlockResponse resp = handleError(submitRequest(omRequest))
- .getAllocateBlockResponse();
+ AllocateBlockResponse resp =
handleSubmitRequestAndSCMSafeModeRetry(omRequest).getAllocateBlockResponse();
return OmKeyLocationInfo.getFromProtobuf(resp.getKeyLocation());
}
@@ -2243,12 +2246,38 @@ public final class
OzoneManagerProtocolClientSideTranslatorPB
OMRequest omRequest = createOMRequest(Type.CreateFile)
.setCreateFileRequest(createFileRequest)
.build();
- CreateFileResponse resp =
- handleError(submitRequest(omRequest)).getCreateFileResponse();
+ CreateFileResponse resp =
handleSubmitRequestAndSCMSafeModeRetry(omRequest).getCreateFileResponse();
+
return new OpenKeySession(resp.getID(),
OmKeyInfo.getFromProtobuf(resp.getKeyInfo()), resp.getOpenVersion());
}
+
+ @Nonnull
+ private OMResponse handleSubmitRequestAndSCMSafeModeRetry(OMRequest
omRequest) throws IOException {
+ int retryCount = BLOCK_ALLOCATION_RETRY_COUNT;
+ while (true) {
+ try {
+ return handleError(submitRequest(omRequest));
+ } catch (OMException e) {
+ if (e.getResult().equals(SCM_IN_SAFE_MODE) && retryCount > 0) {
+ System.err.println("SCM is in safe mode. Will retry in " +
+ BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS + "ms");
+ retryCount--;
+ try {
+ Thread.sleep(BLOCK_ALLOCATION_RETRY_WAIT_TIME_MS);
+ continue;
+ } catch (InterruptedException ex) {
+ throw new OMException(ex.getMessage(),
ResultCodes.SCM_IN_SAFE_MODE);
+ }
+ } else if (e.getResult().equals(SCM_IN_SAFE_MODE) && retryCount == 0) {
+ throw new OMException(e.getMessage(), ResultCodes.SCM_IN_SAFE_MODE);
+ }
+ throw e;
+ }
+ }
+ }
+
@Override
public List<OzoneFileStatus> listStatus(OmKeyArgs args, boolean recursive,
String startKey, long numEntries, boolean allowPartialPrefixes)
diff --git
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
index 14b1a30b44..a5e3c69a53 100644
---
a/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
+++
b/hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestScmSafeMode.java
@@ -17,6 +17,12 @@
*/
package org.apache.hadoop.ozone.om;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
+import org.apache.hadoop.fs.FSDataOutputStream;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.fs.SafeMode;
+import org.apache.hadoop.fs.SafeModeAction;
import org.apache.hadoop.hdds.utils.IOUtils;
import org.apache.commons.lang3.RandomStringUtils;
import org.apache.hadoop.hdds.HddsConfigKeys;
@@ -55,11 +61,14 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import java.util.concurrent.TimeoutException;
import static org.apache.hadoop.hdds.client.ReplicationType.RATIS;
import static org.apache.hadoop.hdds.client.ReplicationFactor.ONE;
import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_DEADNODE_INTERVAL;
import static
org.apache.hadoop.hdds.scm.ScmConfigKeys.OZONE_SCM_STALENODE_INTERVAL;
+import static org.apache.hadoop.ozone.OzoneConsts.OZONE_OFS_URI_SCHEME;
+import static org.apache.hadoop.ozone.om.OMConfigKeys.OZONE_OM_ADDRESS_KEY;
import static org.assertj.core.api.Assertions.assertThat;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertFalse;
@@ -334,4 +343,49 @@ public class TestScmSafeMode {
cluster.restartStorageContainerManager(true);
assertFalse(scm.isInSafeMode());
}
+
+ @Test
+ public void testCreateRetryWhileSCMSafeMode() throws Exception {
+ // Test1: Test safe mode when there are no containers in system.
+ cluster.stop();
+
+ try {
+ cluster = builder.build();
+ } catch (IOException e) {
+ fail("Cluster startup failed.");
+ }
+
+ final String rootPath = String.format("%s://%s/",
+ OZONE_OFS_URI_SCHEME, conf.get(OZONE_OM_ADDRESS_KEY));
+ conf.set(CommonConfigurationKeysPublic.FS_DEFAULT_NAME_KEY, rootPath);
+ OMMetrics omMetrics = cluster.getOzoneManager().getMetrics();
+ long allocateBlockReqCount = omMetrics.getNumBlockAllocateFails();
+
+ try (FileSystem fs = FileSystem.get(conf)) {
+ assertTrue(((SafeMode)fs).setSafeMode(SafeModeAction.GET));
+
+ Thread t = new Thread(() -> {
+ try {
+ LOG.info("Wait for allocate block fails at least once");
+ GenericTestUtils.waitFor(() -> omMetrics.getNumBlockAllocateFails()
> allocateBlockReqCount,
+ 100, 10000);
+
+ cluster.startHddsDatanodes();
+ cluster.waitForClusterToBeReady();
+ cluster.waitTobeOutOfSafeMode();
+ } catch (InterruptedException | TimeoutException e) {
+ throw new RuntimeException(e);
+ }
+ });
+ t.start();
+
+ final Path file = new Path("file");
+ try (FSDataOutputStream outputStream = fs.create(file, true)) {
+ LOG.info("Successfully created a file");
+ }
+ t.join();
+ }
+
+ assertFalse(cluster.getStorageContainerManager().isInSafeMode());
+ }
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]