This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch branch-2.5
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.5 by this push:
new 99f56960148 HBASE-29315: Fail Split/MergeTableRegionProcedure if table
modification in progress (branch-2.5) (#7036)
99f56960148 is described below
commit 99f5696014886a69eabc8218156bad9a8d97d603
Author: Charles Connell <[email protected]>
AuthorDate: Wed May 28 10:51:37 2025 -0400
HBASE-29315: Fail Split/MergeTableRegionProcedure if table modification in
progress (branch-2.5) (#7036)
Signed-off-by: Duo Zhang <[email protected]>
---
.../assignment/MergeTableRegionsProcedure.java | 13 ++++
.../assignment/SplitTableRegionProcedure.java | 16 +++++
.../AbstractStateMachineTableProcedure.java | 11 +++
...ionServerHostingReplicaSlowOpenCoprocessor.java | 59 +++++++++++++++
.../assignment/TestMergeTableRegionsProcedure.java | 29 ++++++++
.../assignment/TestSplitTableRegionProcedure.java | 83 ++++++++++------------
6 files changed, 166 insertions(+), 45 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
index 3b6fa561b01..c9c3442451f 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
@@ -450,6 +450,19 @@ public class MergeTableRegionsProcedure
+ RegionInfo.getShortNameToLog(regionsToMerge) + ", because we are
snapshotting " + tn);
}
+ /*
+ * Sometimes a ModifyTableProcedure has edited a table descriptor to
change the number of region
+ * replicas for a table, but it has not yet opened/closed the new
replicas. The
+ * ModifyTableProcedure assumes that nobody else will do the
opening/closing of the new
+ * replicas, but a concurrent MergeTableRegionProcedure would violate that
assumption.
+ */
+ if (isTableModificationInProgress(env)) {
+ setFailure(getClass().getSimpleName(),
+ new IOException("Skip merging regions " +
RegionInfo.getShortNameToLog(regionsToMerge)
+ + ", because there is an active procedure that is modifying the
table " + tn));
+ return false;
+ }
+
// Mostly this check is not used because we already check the switch
before submit a merge
// procedure. Just for safe, check the switch again. This procedure can be
rollbacked if
// the switch was set to false after submit.
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
index a989f19b05e..40c720681bd 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
@@ -103,6 +103,8 @@ public class SplitTableRegionProcedure
private RegionInfo daughterTwoRI;
private byte[] bestSplitRow;
private RegionSplitPolicy splitPolicy;
+ // exposed for unit testing
+ boolean checkTableModifyInProgress = true;
public SplitTableRegionProcedure() {
// Required by the Procedure framework to create the procedure on replay
@@ -513,6 +515,20 @@ public class SplitTableRegionProcedure
+ ", because we are taking snapshot for the table " +
getParentRegion().getTable()));
return false;
}
+
+ /*
+ * Sometimes a ModifyTableProcedure has edited a table descriptor to
change the number of region
+ * replicas for a table, but it has not yet opened/closed the new
replicas. The
+ * ModifyTableProcedure assumes that nobody else will do the
opening/closing of the new
+ * replicas, but a concurrent SplitTableRegionProcedure would violate that
assumption.
+ */
+ if (checkTableModifyInProgress && isTableModificationInProgress(env)) {
+ setFailure(new IOException("Skip splitting region " +
getParentRegion().getShortNameToLog()
+ + ", because there is an active procedure that is modifying the table "
+ + getParentRegion().getTable()));
+ return false;
+ }
+
// Check whether the region is splittable
RegionStateNode node =
env.getAssignmentManager().getRegionStates().getRegionStateNode(getParentRegion());
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
index b6dce5198d4..67076507496 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
@@ -192,4 +192,15 @@ public abstract class
AbstractStateMachineTableProcedure<TState>
}
regionNode.checkOnline();
}
+
+ /**
+ * Some procedures cannot safely run while a table is being modified. Helper
method to allow them
+ * to check if such a procedure is running on a table.
+ */
+ protected final boolean isTableModificationInProgress(MasterProcedureEnv
env) {
+ return
env.getMasterServices().getMasterProcedureExecutor().getProcedures().stream()
+ .filter(p -> !p.isFinished()).filter(p -> p instanceof
TableProcedureInterface)
+ .map(p -> (TableProcedureInterface) p).filter(p ->
getTableName().equals(p.getTableName()))
+ .anyMatch(TableQueue::requireTableExclusiveLock);
+ }
}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/RegionServerHostingReplicaSlowOpenCoprocessor.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/RegionServerHostingReplicaSlowOpenCoprocessor.java
new file mode 100644
index 00000000000..bd5a28868dd
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/RegionServerHostingReplicaSlowOpenCoprocessor.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import java.util.Optional;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
+import org.apache.hadoop.hbase.coprocessor.RegionObserver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This coprocessor is used to slow down opening of the replica regions.
+ */
+public class RegionServerHostingReplicaSlowOpenCoprocessor
+ implements RegionCoprocessor, RegionObserver {
+
+ private static final Logger LOG =
+
LoggerFactory.getLogger(RegionServerHostingReplicaSlowOpenCoprocessor.class);
+
+ static volatile boolean slowDownReplicaOpen = false;
+
+ @Override
+ public Optional<RegionObserver> getRegionObserver() {
+ return Optional.of(this);
+ }
+
+ @Override
+ public void preOpen(ObserverContext<RegionCoprocessorEnvironment> c) {
+ int replicaId =
c.getEnvironment().getRegion().getRegionInfo().getReplicaId();
+ if (replicaId != RegionInfo.DEFAULT_REPLICA_ID) {
+ while (slowDownReplicaOpen) {
+ LOG.info("Slow down replica region open a bit");
+ try {
+ Thread.sleep(250);
+ } catch (InterruptedException ignored) {
+ return;
+ }
+ }
+ }
+ }
+}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
index 41267a19373..0c95057bbf0 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
@@ -39,6 +39,7 @@ import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
+import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
@@ -92,6 +93,8 @@ public class TestMergeTableRegionsProcedure {
conf.setInt("hbase.master.maximum.ping.server.attempts", 3);
conf.setInt("hbase.master.ping.server.retry.sleep.interval", 1);
conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
+ conf.set("hbase.coprocessor.region.classes",
+ RegionServerHostingReplicaSlowOpenCoprocessor.class.getName());
}
@BeforeClass
@@ -345,6 +348,32 @@ public class TestMergeTableRegionsProcedure {
assertRegionCount(tableName, initialRegionCount - 1);
}
+ @Test
+ public void testMergeDetectsModifyTableProcedure() throws Exception {
+ final TableName tableName = TableName.valueOf(name.getMethodName());
+ final ProcedureExecutor<MasterProcedureEnv> procExec =
getMasterProcedureExecutor();
+
+ List<RegionInfo> regions = createTable(tableName);
+
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = true;
+ TableDescriptor td =
TableDescriptorBuilder.newBuilder(admin.getDescriptor(tableName))
+ .setRegionReplication(2).build();
+ long modifyProcId =
+ procExec.submitProcedure(new
ModifyTableProcedure(procExec.getEnvironment(), td));
+
+ // Merge regions of the table, the MergeTableRegionsProcedure will fail
because there is a
+ // ModifyTableProcedure in progress
+ MergeTableRegionsProcedure mergeProcedure = new MergeTableRegionsProcedure(
+ procExec.getEnvironment(), regions.toArray(new RegionInfo[0]), false);
+ long mergeProcId = procExec.submitProcedure(mergeProcedure);
+ ProcedureTestingUtility.waitProcedure(procExec, mergeProcId);
+ ProcedureTestingUtility.assertProcFailed(procExec, mergeProcId);
+
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = false;
+ ProcedureTestingUtility.waitProcedure(procExec, modifyProcId);
+ ProcedureTestingUtility.assertProcNotFailed(procExec, modifyProcId);
+ }
+
private List<RegionInfo> createTable(final TableName tableName) throws
Exception {
TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY)).build();
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
index 020f090f1dd..d9ea9231099 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
@@ -25,7 +25,6 @@ import static org.junit.Assert.fail;
import java.io.IOException;
import java.util.List;
-import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
@@ -43,14 +42,12 @@ import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.Result;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
-import org.apache.hadoop.hbase.coprocessor.ObserverContext;
-import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
-import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
-import org.apache.hadoop.hbase.coprocessor.RegionObserver;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
+import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
@@ -107,38 +104,7 @@ public class TestSplitTableRegionProcedure {
conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
conf.setLong(HConstants.MAJOR_COMPACTION_PERIOD, 0);
conf.set("hbase.coprocessor.region.classes",
- RegionServerHostingReplicaSlowOpenCopro.class.getName());
- conf.setInt("hbase.client.sync.wait.timeout.msec", 1500);
- }
-
- /**
- * This copro is used to slow down opening of the replica regions.
- */
- public static class RegionServerHostingReplicaSlowOpenCopro
- implements RegionCoprocessor, RegionObserver {
- static int countForReplica = 0;
- static boolean slowDownReplicaOpen = false;
-
- @Override
- public Optional<RegionObserver> getRegionObserver() {
- return Optional.of(this);
- }
-
- @Override
- public void preOpen(ObserverContext<RegionCoprocessorEnvironment> c)
throws IOException {
- int replicaId =
c.getEnvironment().getRegion().getRegionInfo().getReplicaId();
- if ((replicaId != RegionInfo.DEFAULT_REPLICA_ID) && (countForReplica ==
0)) {
- countForReplica++;
- while (slowDownReplicaOpen) {
- LOG.info("Slow down replica region open a bit");
- try {
- Thread.sleep(100);
- } catch (InterruptedException ie) {
- // Ingore
- }
- }
- }
- }
+ RegionServerHostingReplicaSlowOpenCoprocessor.class.getName());
}
@BeforeClass
@@ -183,15 +149,13 @@ public class TestSplitTableRegionProcedure {
final TableName tableName = TableName.valueOf(name.getMethodName());
final ProcedureExecutor<MasterProcedureEnv> procExec =
getMasterProcedureExecutor();
- RegionServerHostingReplicaSlowOpenCopro.slowDownReplicaOpen = true;
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = true;
RegionInfo[] regions =
MasterProcedureTestingUtility.createTable(procExec, tableName, null,
columnFamilyName1);
- try {
- HBaseTestingUtility.setReplicas(UTIL.getAdmin(), tableName, 2);
- } catch (IOException ioe) {
-
- }
+ TableDescriptor td =
TableDescriptorBuilder.newBuilder(UTIL.getAdmin().getDescriptor(tableName))
+ .setRegionReplication(2).build();
+ procExec.submitProcedure(new
ModifyTableProcedure(procExec.getEnvironment(), td));
// wait until the primary region is online.
HBaseTestingUtility.await(2000, () -> {
@@ -215,7 +179,7 @@ public class TestSplitTableRegionProcedure {
ProcedureTestingUtility.waitProcedure(procExec, procId);
// Let replica parent region open.
- RegionServerHostingReplicaSlowOpenCopro.slowDownReplicaOpen = false;
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = false;
// wait until the replica region is online.
HBaseTestingUtility.await(2000, () -> {
@@ -223,7 +187,10 @@ public class TestSplitTableRegionProcedure {
AssignmentManager am =
UTIL.getHBaseCluster().getMaster().getAssignmentManager();
if (am == null) return false;
RegionInfo replicaRegion =
RegionReplicaUtil.getRegionInfoForReplica(regions[0], 1);
- if (am.getRegionStates().getRegionState(replicaRegion).isOpened()) {
+ if (
+ am.getRegionStates().getRegionState(replicaRegion) != null
+ && am.getRegionStates().getRegionState(replicaRegion).isOpened()
+ ) {
return true;
}
return false;
@@ -556,6 +523,32 @@ public class TestSplitTableRegionProcedure {
verify(tableName, splitRowNum);
}
+ @Test
+ public void testSplitDetectsModifyTableProcedure() throws Exception {
+ final TableName tableName = TableName.valueOf(name.getMethodName());
+ final ProcedureExecutor<MasterProcedureEnv> procExec =
getMasterProcedureExecutor();
+
+ RegionInfo[] regions =
+ MasterProcedureTestingUtility.createTable(procExec, tableName, null,
columnFamilyName1);
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = true;
+ TableDescriptor td =
TableDescriptorBuilder.newBuilder(UTIL.getAdmin().getDescriptor(tableName))
+ .setRegionReplication(2).build();
+ long modifyProcId =
+ procExec.submitProcedure(new
ModifyTableProcedure(procExec.getEnvironment(), td));
+
+ // Split region of the table, the SplitTableRegionProcedure will fail
because there is a
+ // ModifyTableProcedure in progress
+ SplitTableRegionProcedure splitProcedure = new SplitTableRegionProcedure(
+ procExec.getEnvironment(), regions[0], HConstants.CATALOG_FAMILY);
+ long splitProcId = procExec.submitProcedure(splitProcedure);
+ ProcedureTestingUtility.waitProcedure(procExec, splitProcId);
+ ProcedureTestingUtility.assertProcFailed(procExec, splitProcId);
+
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = false;
+ ProcedureTestingUtility.waitProcedure(procExec, modifyProcId);
+ ProcedureTestingUtility.assertProcNotFailed(procExec, modifyProcId);
+ }
+
private void deleteData(final TableName tableName, final int
startDeleteRowNum)
throws IOException, InterruptedException {
Table t = UTIL.getConnection().getTable(tableName);