This is an automated email from the ASF dual-hosted git repository. zhangduo pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/master by this push: new dfb0d3eb0b7 HBASE-29259 Master crash when loading procedures (#6905) dfb0d3eb0b7 is described below commit dfb0d3eb0b771d9131531e55646f9c6a5b4c01d2 Author: Duo Zhang <zhang...@apache.org> AuthorDate: Sun Apr 20 13:57:22 2025 +0800 HBASE-29259 Master crash when loading procedures (#6905) Signed-off-by: Nick Dimiduk <ndimi...@apache.org> --- .../apache/hadoop/hbase/procedure2/Procedure.java | 19 ++++ .../hadoop/hbase/procedure2/ProcedureExecutor.java | 4 + .../assignment/RegionRemoteProcedureBase.java | 35 ++++-- .../assignment/TransitRegionStateProcedure.java | 31 +++-- .../TestTRSPPersistUninitializedSubProc.java | 125 +++++++++++++++++++++ 5 files changed, 196 insertions(+), 18 deletions(-) diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java index 7b6e7ab8e98..9b4d013e7b6 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/Procedure.java @@ -346,6 +346,25 @@ public abstract class Procedure<TEnvironment> implements Comparable<Procedure<TE // no-op } + /** + * Called before we call the execute method of this procedure, but after we acquire the execution + * lock and procedure scheduler lock. + */ + protected void beforeExec(TEnvironment env) throws ProcedureSuspendedException { + // no-op + } + + /** + * Called after we call the execute method of this procedure, and also after we initialize all the + * sub procedures and persist the state if persistence is needed. + * <p> + * This is for doing some hooks after we initialize the sub procedures. See HBASE-29259 for more + * details on why we can not release the region lock inside the execute method. 
+ */ + protected void afterExec(TEnvironment env) { + // no-op + } + /** * Called when the procedure is marked as completed (success or rollback). The procedure * implementor may use this method to cleanup in-memory states. This operation will not be retried diff --git a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java index b6df5759134..1f0029e9874 100644 --- a/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java +++ b/hbase-procedure/src/main/java/org/apache/hadoop/hbase/procedure2/ProcedureExecutor.java @@ -1822,6 +1822,7 @@ public class ProcedureExecutor<TEnvironment> { reExecute = false; procedure.resetPersistence(); try { + procedure.beforeExec(getEnvironment()); subprocs = procedure.doExecute(getEnvironment()); if (subprocs != null && subprocs.length == 0) { subprocs = null; @@ -1831,11 +1832,13 @@ public class ProcedureExecutor<TEnvironment> { suspended = true; } catch (ProcedureYieldException e) { LOG.trace("Yield {}", procedure, e); + procedure.afterExec(getEnvironment()); yieldProcedure(procedure); return; } catch (InterruptedException e) { LOG.trace("Yield interrupt {}", procedure, e); handleInterruptedException(procedure, e); + procedure.afterExec(getEnvironment()); yieldProcedure(procedure); return; } catch (Throwable e) { @@ -1907,6 +1910,7 @@ public class ProcedureExecutor<TEnvironment> { updateStoreOnExec(procStack, procedure, subprocs); } } + procedure.afterExec(getEnvironment()); // if the store is not running we are aborting if (!store.isRunning()) { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index c14af9e9cc7..a828b5b668f 100644 --- 
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -296,12 +296,37 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur } @Override - protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) - throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { + protected void beforeExec(MasterProcedureEnv env) throws ProcedureSuspendedException { RegionStateNode regionNode = getRegionNode(env); if (!regionNode.isLockedBy(this)) { - regionNode.lock(this, () -> ProcedureFutureUtil.wakeUp(this, env)); + // The wake up action will be called under the lock inside RegionStateNode for implementing + // RegionStateNodeLock, so if we call ProcedureFutureUtil.wakeUp, which acquires the procedure + // execution lock directly, it may cause a deadlock, since in the normal procedure execution + // case we acquire the procedure execution lock first and then acquire the lock inside + // RegionStateNodeLock. This is the reason why we need to schedule the task to a thread pool + // and execute asynchronously. 
+ regionNode.lock(this, + () -> env.getAsyncTaskExecutor().execute(() -> ProcedureFutureUtil.wakeUp(this, env))); } + } + + @Override + protected void afterExec(MasterProcedureEnv env) { + // only release the lock if there is no pending updating meta operation + if (future == null) { + RegionStateNode regionNode = getRegionNode(env); + // in beforeExec, we may throw ProcedureSuspendedException which means we do not get the lock, + // in this case we should not call unlock + if (regionNode.isLockedBy(this)) { + regionNode.unlock(this); + } + } + } + + @Override + protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) + throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { + RegionStateNode regionNode = getRegionNode(env); try { switch (state) { case REGION_REMOTE_PROCEDURE_DISPATCH: { @@ -356,10 +381,6 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); LOG.warn("Failed updating meta, suspend {}secs {}; {};", backoff / 1000, this, regionNode, e); throw suspend(Math.toIntExact(backoff), true); - } finally { - if (future == null) { - regionNode.unlock(this); - } } } diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java index e1952964a2a..12ddb255936 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/TransitRegionStateProcedure.java @@ -40,7 +40,6 @@ import org.apache.hadoop.hbase.master.ServerManager; import org.apache.hadoop.hbase.master.procedure.AbstractStateMachineRegionProcedure; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; 
-import org.apache.hadoop.hbase.procedure2.Procedure; import org.apache.hadoop.hbase.procedure2.ProcedureFutureUtil; import org.apache.hadoop.hbase.procedure2.ProcedureMetrics; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; @@ -443,21 +442,31 @@ public class TransitRegionStateProcedure return Flow.HAS_MORE_STATE; } - // Override to lock RegionStateNode - @SuppressWarnings("rawtypes") @Override - protected Procedure[] execute(MasterProcedureEnv env) - throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { + protected void beforeExec(MasterProcedureEnv env) throws ProcedureSuspendedException { RegionStateNode regionNode = env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion()); if (!regionNode.isLockedBy(this)) { - regionNode.lock(this, () -> ProcedureFutureUtil.wakeUp(this, env)); + // The wake up action will be called under the lock inside RegionStateNode for implementing + // RegionStateNodeLock, so if we call ProcedureFutureUtil.wakeUp, which acquires the procedure + // execution lock directly, it may cause a deadlock, since in the normal procedure execution + // case we acquire the procedure execution lock first and then acquire the lock inside + // RegionStateNodeLock. This is the reason why we need to schedule the task to a thread pool + // and execute asynchronously. 
+ regionNode.lock(this, + () -> env.getAsyncTaskExecutor().execute(() -> ProcedureFutureUtil.wakeUp(this, env))); } - try { - return super.execute(env); - } finally { - if (future == null) { - // release the lock if there is no pending updating meta operation + } + + @Override + protected void afterExec(MasterProcedureEnv env) { + // only release the lock if there is no pending updating meta operation + if (future == null) { + RegionStateNode regionNode = + env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(getRegion()); + // in beforeExec, we may throw ProcedureSuspendedException which means we do not get the lock, + // in this case we should not call unlock + if (regionNode.isLockedBy(this)) { regionNode.unlock(this); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTRSPPersistUninitializedSubProc.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTRSPPersistUninitializedSubProc.java new file mode 100644 index 00000000000..8199edc9b5a --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestTRSPPersistUninitializedSubProc.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master.assignment; + +import java.io.IOException; +import java.io.UncheckedIOException; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtil; +import org.apache.hadoop.hbase.ServerName; +import org.apache.hadoop.hbase.TableName; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.HMaster; +import org.apache.hadoop.hbase.master.assignment.TransitRegionStateProcedure.TransitionType; +import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; +import org.apache.hadoop.hbase.master.procedure.ServerCrashProcedure; +import org.apache.hadoop.hbase.procedure2.Procedure; +import org.apache.hadoop.hbase.procedure2.ProcedureExecutor; +import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; +import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility; +import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.testclassification.MediumTests; +import org.apache.hadoop.hbase.util.Bytes; +import org.junit.AfterClass; +import org.junit.BeforeClass; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +import org.apache.hadoop.hbase.shaded.protobuf.generated.ProcedureProtos.ProcedureState; + +/** + * Testcase for HBASE-29259 + */ +@Category({ MasterTests.class, MediumTests.class }) +public class TestTRSPPersistUninitializedSubProc { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestTRSPPersistUninitializedSubProc.class); + + private static HBaseTestingUtil UTIL = new HBaseTestingUtil(); + + private static byte[] CF = Bytes.toBytes("cf"); + + private static TableName TN = TableName.valueOf("tn"); + + public static class 
TRSPForTest extends TransitRegionStateProcedure { + + private boolean injected = false; + + public TRSPForTest() { + } + + public TRSPForTest(MasterProcedureEnv env, RegionInfo hri, ServerName assignCandidate, + boolean forceNewPlan, TransitionType type) { + super(env, hri, assignCandidate, forceNewPlan, type); + } + + @Override + protected Procedure[] execute(MasterProcedureEnv env) + throws ProcedureSuspendedException, ProcedureYieldException, InterruptedException { + Procedure[] subProcs = super.execute(env); + if (!injected && subProcs != null && subProcs[0] instanceof CloseRegionProcedure) { + injected = true; + ServerName sn = ((CloseRegionProcedure) subProcs[0]).targetServer; + env.getMasterServices().getServerManager().expireServer(sn); + try { + UTIL.waitFor(15000, () -> env.getMasterServices().getProcedures().stream().anyMatch( + p -> p instanceof ServerCrashProcedure && p.getState() != ProcedureState.INITIALIZING)); + } catch (IOException e) { + throw new UncheckedIOException(e); + } + // sleep 10 seconds to let the SCP interrupt the TRSP, where we will call TRSP.serverCrashed + Thread.sleep(10000); + } + return subProcs; + } + } + + @BeforeClass + public static void setUpBeforeClass() throws Exception { + UTIL.startMiniCluster(2); + UTIL.getAdmin().balancerSwitch(false, true); + UTIL.createTable(TN, CF); + UTIL.waitTableAvailable(TN); + } + + @AfterClass + public static void tearDownAfterClass() throws Exception { + UTIL.shutdownMiniCluster(); + } + + @Test + public void testServerCrash() throws Exception { + HMaster master = UTIL.getHBaseCluster().getMaster(); + ProcedureExecutor<MasterProcedureEnv> procExec = master.getMasterProcedureExecutor(); + RegionInfo region = UTIL.getAdmin().getRegions(TN).get(0); + RegionStateNode rsn = + master.getAssignmentManager().getRegionStates().getRegionStateNode(region); + TRSPForTest trsp = + new TRSPForTest(procExec.getEnvironment(), region, null, false, TransitionType.REOPEN); + // attach it to RegionStateNode, to 
simulate normal reopen + rsn.setProcedure(trsp); + procExec.submitProcedure(trsp); + ProcedureTestingUtility.waitProcedure(procExec, trsp); + // make sure we do not store invalid procedure to procedure store + ProcedureTestingUtility.restart(procExec); + } +}