This is an automated email from the ASF dual-hosted git repository.

zhangduo pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hbase.git


The following commit(s) were added to refs/heads/branch-3 by this push:
     new 3d49d100323 HBASE-29294 Master crashed because of failing to update 
master region (#6979)
3d49d100323 is described below

commit 3d49d10032332479ca6609a4ab62d4d1feaf5ecb
Author: Duo Zhang <[email protected]>
AuthorDate: Wed May 14 20:00:37 2025 +0800

    HBASE-29294 Master crashed because of failing to update master region 
(#6979)
    
    Signed-off-by: Viraj Jasani <[email protected]>
    (cherry picked from commit 91ecd467410d413d08193f9720ff946063712d51)
---
 .../org/apache/hadoop/hbase/ipc/RpcServer.java     | 11 ++--
 .../hadoop/hbase/master/region/MasterRegion.java   | 29 +++++++--
 .../store/region/RegionProcedureStore.java         | 75 ++++++++--------------
 .../hbase/master/region/MasterRegionTestBase.java  |  4 +-
 .../master/region/TestMasterRegionRpcTimeout.java  | 58 +++++++++++++++++
 .../TestMasterRegionWALSyncTimeoutIOException.java |  3 +
 6 files changed, 118 insertions(+), 62 deletions(-)

diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java
index fce51f36014..2db08fd7398 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/ipc/RpcServer.java
@@ -770,10 +770,9 @@ public abstract class RpcServer implements 
RpcServerInterface, ConfigurationObse
   }
 
   /**
-   * Used by {@link 
org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore}. For
-   * master's rpc call, it may generate new procedure and mutate the region 
which store procedure.
-   * There are some check about rpc when mutate region, such as rpc timeout 
check. So unset the rpc
-   * call to avoid the rpc check.
+   * Used by {@link org.apache.hadoop.hbase.master.region.MasterRegion}, to 
avoid hit row lock
+   * timeout when updating master region in a rpc call. See HBASE-23895, 
HBASE-29251 and HBASE-29294
+   * for more details.
    * @return the currently ongoing rpc call
    */
   public static Optional<RpcCall> unsetCurrentCall() {
@@ -783,8 +782,8 @@ public abstract class RpcServer implements 
RpcServerInterface, ConfigurationObse
   }
 
   /**
-   * Used by {@link 
org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore}. Set the
-   * rpc call back after mutate region.
+   * Used by {@link org.apache.hadoop.hbase.master.region.MasterRegion}. Set 
the rpc call back after
+   * mutate region.
    */
   public static void setCurrentCall(RpcCall rpcCall) {
     CurCall.set(rpcCall);
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
index f8d7e7a1823..97447e37b7c 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/region/MasterRegion.java
@@ -22,6 +22,7 @@ import static 
org.apache.hadoop.hbase.HConstants.HREGION_LOGDIR_NAME;
 import com.google.errorprone.annotations.RestrictedApi;
 import java.io.IOException;
 import java.util.List;
+import java.util.Optional;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileStatus;
 import org.apache.hadoop.fs.FileSystem;
@@ -40,6 +41,8 @@ import org.apache.hadoop.hbase.client.ResultScanner;
 import org.apache.hadoop.hbase.client.Scan;
 import org.apache.hadoop.hbase.client.TableDescriptor;
 import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
+import org.apache.hadoop.hbase.ipc.RpcCall;
+import org.apache.hadoop.hbase.ipc.RpcServer;
 import org.apache.hadoop.hbase.log.HBaseMarkers;
 import org.apache.hadoop.hbase.regionserver.HRegion;
 import org.apache.hadoop.hbase.regionserver.HRegion.FlushResult;
@@ -154,12 +157,7 @@ public final class MasterRegion {
     }
   }
 
-  /**
-   * Performs the mutation to the master region using UpdateMasterRegion 
update action.
-   * @param action Update region action.
-   * @throws IOException IO error that causes active master to abort.
-   */
-  public void update(UpdateMasterRegion action) throws IOException {
+  private void update0(UpdateMasterRegion action) throws IOException {
     for (int tries = 0; tries < maxRetriesForRegionUpdates; tries++) {
       try {
         // If the update is successful, return immediately.
@@ -191,6 +189,25 @@ public final class MasterRegion {
     }
   }
 
+  /**
+   * Performs the mutation to the master region using UpdateMasterRegion 
update action.
+   * @param action Update region action.
+   * @throws IOException IO error that causes active master to abort.
+   */
+  public void update(UpdateMasterRegion action) throws IOException {
+    // Since now we will abort master when updating master region fails, and 
when updating, if the
+    // rpc is already timed out, we will hit a TimeoutIOException which 
indicates that we can not
+    // get the row lock in time, so here we need to unset the rpc call to 
prevent this, otherwise
+    // master will abort with a rpc timeout, which is not necessary...
+    // See HBASE-29294.
+    Optional<RpcCall> rpcCall = RpcServer.unsetCurrentCall();
+    try {
+      update0(action);
+    } finally {
+      rpcCall.ifPresent(RpcServer::setCurrentCall);
+    }
+  }
+
   /**
    * Log the error and abort the master daemon immediately. Use this utility 
only when procedure
    * state store update fails and the only way to recover is by terminating 
the active master so
diff --git 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
index 9cc42bf587f..0e666fe6c94 100644
--- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
+++ 
b/hbase-server/src/main/java/org/apache/hadoop/hbase/procedure2/store/region/RegionProcedureStore.java
@@ -29,7 +29,6 @@ import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
-import java.util.Optional;
 import org.apache.commons.lang3.mutable.MutableLong;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
@@ -41,8 +40,6 @@ import org.apache.hadoop.hbase.client.Delete;
 import org.apache.hadoop.hbase.client.Mutation;
 import org.apache.hadoop.hbase.client.Put;
 import org.apache.hadoop.hbase.client.Scan;
-import org.apache.hadoop.hbase.ipc.RpcCall;
-import org.apache.hadoop.hbase.ipc.RpcServer;
 import org.apache.hadoop.hbase.log.HBaseMarkers;
 import org.apache.hadoop.hbase.master.assignment.AssignProcedure;
 import org.apache.hadoop.hbase.master.assignment.MoveRegionProcedure;
@@ -304,20 +301,6 @@ public class RegionProcedureStore extends 
ProcedureStoreBase {
     rowsToLock.add(row);
   }
 
-  /**
-   * Insert procedure may be called by master's rpc call. There are some check 
about the rpc call
-   * when mutate region. Here unset the current rpc call and set it back in 
finally block. See
-   * HBASE-23895 for more details.
-   */
-  private void runWithoutRpcCall(Runnable runnable) {
-    Optional<RpcCall> rpcCall = RpcServer.unsetCurrentCall();
-    try {
-      runnable.run();
-    } finally {
-      rpcCall.ifPresent(RpcServer::setCurrentCall);
-    }
-  }
-
   @Override
   public void insert(Procedure<?> proc, Procedure<?>[] subProcs) {
     if (subProcs == null || subProcs.length == 0) {
@@ -327,50 +310,44 @@ public class RegionProcedureStore extends 
ProcedureStoreBase {
     }
     List<Mutation> mutations = new ArrayList<>(subProcs.length + 1);
     List<byte[]> rowsToLock = new ArrayList<>(subProcs.length + 1);
-    runWithoutRpcCall(() -> {
-      try {
-        serializePut(proc, mutations, rowsToLock);
-        for (Procedure<?> subProc : subProcs) {
-          serializePut(subProc, mutations, rowsToLock);
-        }
-        region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock, 
NO_NONCE, NO_NONCE));
-      } catch (IOException e) {
-        LOG.error(HBaseMarkers.FATAL, "Failed to insert proc {}, sub procs 
{}", proc,
-          Arrays.toString(subProcs), e);
-        throw new UncheckedIOException(e);
+    try {
+      serializePut(proc, mutations, rowsToLock);
+      for (Procedure<?> subProc : subProcs) {
+        serializePut(subProc, mutations, rowsToLock);
       }
-    });
+      region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock, 
NO_NONCE, NO_NONCE));
+    } catch (IOException e) {
+      LOG.error(HBaseMarkers.FATAL, "Failed to insert proc {}, sub procs {}", 
proc,
+        Arrays.toString(subProcs), e);
+      throw new UncheckedIOException(e);
+    }
   }
 
   @Override
   public void insert(Procedure<?>[] procs) {
     List<Mutation> mutations = new ArrayList<>(procs.length);
     List<byte[]> rowsToLock = new ArrayList<>(procs.length);
-    runWithoutRpcCall(() -> {
-      try {
-        for (Procedure<?> proc : procs) {
-          serializePut(proc, mutations, rowsToLock);
-        }
-        region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock, 
NO_NONCE, NO_NONCE));
-      } catch (IOException e) {
-        LOG.error(HBaseMarkers.FATAL, "Failed to insert procs {}", 
Arrays.toString(procs), e);
-        throw new UncheckedIOException(e);
+    try {
+      for (Procedure<?> proc : procs) {
+        serializePut(proc, mutations, rowsToLock);
       }
-    });
+      region.update(r -> r.mutateRowsWithLocks(mutations, rowsToLock, 
NO_NONCE, NO_NONCE));
+    } catch (IOException e) {
+      LOG.error(HBaseMarkers.FATAL, "Failed to insert procs {}", 
Arrays.toString(procs), e);
+      throw new UncheckedIOException(e);
+    }
   }
 
   @Override
   public void update(Procedure<?> proc) {
-    runWithoutRpcCall(() -> {
-      try {
-        ProcedureProtos.Procedure proto = 
ProcedureUtil.convertToProtoProcedure(proc);
-        region.update(r -> r.put(new 
Put(Bytes.toBytes(proc.getProcId())).addColumn(PROC_FAMILY,
-          PROC_QUALIFIER, proto.toByteArray())));
-      } catch (IOException e) {
-        LOG.error(HBaseMarkers.FATAL, "Failed to update proc {}", proc, e);
-        throw new UncheckedIOException(e);
-      }
-    });
+    try {
+      ProcedureProtos.Procedure proto = 
ProcedureUtil.convertToProtoProcedure(proc);
+      region.update(r -> r.put(new 
Put(Bytes.toBytes(proc.getProcId())).addColumn(PROC_FAMILY,
+        PROC_QUALIFIER, proto.toByteArray())));
+    } catch (IOException e) {
+      LOG.error(HBaseMarkers.FATAL, "Failed to update proc {}", proc, e);
+      throw new UncheckedIOException(e);
+    }
   }
 
   @Override
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
index 2b5080ec80a..0526fd3ba70 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/MasterRegionTestBase.java
@@ -53,6 +53,8 @@ public class MasterRegionTestBase {
 
   protected DirScanPool logCleanerPool;
 
+  protected Server server;
+
   protected static byte[] CF1 = Bytes.toBytes("f1");
 
   protected static byte[] CF2 = Bytes.toBytes("f2");
@@ -94,7 +96,7 @@ public class MasterRegionTestBase {
     choreService = new ChoreService(getClass().getSimpleName());
     hfileCleanerPool = DirScanPool.getHFileCleanerScanPool(conf);
     logCleanerPool = DirScanPool.getLogCleanerScanPool(conf);
-    Server server = mock(Server.class);
+    server = mock(Server.class);
     when(server.getConfiguration()).thenReturn(conf);
     when(server.getServerName())
       .thenReturn(ServerName.valueOf("localhost", 12345, 
EnvironmentEdgeManager.currentTime()));
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionRpcTimeout.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionRpcTimeout.java
new file mode 100644
index 00000000000..66cdaf4cc76
--- /dev/null
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionRpcTimeout.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.region;
+
+import static org.junit.Assert.assertSame;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import java.io.IOException;
+import org.apache.hadoop.hbase.HBaseClassTestRule;
+import org.apache.hadoop.hbase.client.Put;
+import org.apache.hadoop.hbase.ipc.RpcCall;
+import org.apache.hadoop.hbase.ipc.RpcServer;
+import org.apache.hadoop.hbase.testclassification.MasterTests;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.EnvironmentEdgeManager;
+import org.junit.ClassRule;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+/**
+ * Make sure that we will not get rpc timeout when updating master region.
+ */
+@Category({ MasterTests.class, MediumTests.class })
+public class TestMasterRegionRpcTimeout extends MasterRegionTestBase {
+
+  @ClassRule
+  public static final HBaseClassTestRule CLASS_RULE =
+    HBaseClassTestRule.forClass(TestMasterRegionRpcTimeout.class);
+
+  @Test
+  public void testRpcTimeout() throws IOException {
+    RpcCall call = mock(RpcCall.class);
+    // fake a RpcCall which is already timed out
+    when(call.getDeadline()).thenReturn(EnvironmentEdgeManager.currentTime() - 
10000);
+    RpcServer.setCurrentCall(call);
+    // make sure that the update operation does not throw timeout exception
+    region.update(
+      r -> r.put(new Put(Bytes.toBytes("row")).addColumn(CF1, QUALIFIER, 
Bytes.toBytes("value"))));
+    assertSame(call, RpcServer.getCurrentCall().get());
+  }
+}
diff --git 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
index 30b9978f84c..02cad9d8c4b 100644
--- 
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
+++ 
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/region/TestMasterRegionWALSyncTimeoutIOException.java
@@ -18,6 +18,8 @@
 package org.apache.hadoop.hbase.master.region;
 
 import static org.junit.Assert.assertThrows;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.verify;
 
 import java.io.IOException;
 import java.time.Duration;
@@ -79,6 +81,7 @@ public class TestMasterRegionWALSyncTimeoutIOException 
extends MasterRegionTestB
         Thread.sleep(Duration.ofSeconds(1).toMillis());
       }
     });
+    verify(server).abort(any(), any());
   }
 
   public static class SlowAsyncFSWAL extends AsyncFSWAL {

Reply via email to