This is an automated email from the ASF dual-hosted git repository.
zhangduo pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-3 by this push:
new debc0e2b773 HBASE-29315 Fail Split/MergeTableRegionProcedure if table
modification in progress (#6991)
debc0e2b773 is described below
commit debc0e2b7731392df1f63e4b2f08a29ea2b9c906
Author: Charles Connell <[email protected]>
AuthorDate: Fri May 23 10:56:45 2025 -0400
HBASE-29315 Fail Split/MergeTableRegionProcedure if table modification in
progress (#6991)
Signed-off-by: Duo Zhang <[email protected]>
(cherry picked from commit c4d8b00267adb8246df991397c0748cc625f756d)
---
.../assignment/MergeTableRegionsProcedure.java | 13 ++++
.../assignment/SplitTableRegionProcedure.java | 16 +++++
.../AbstractStateMachineTableProcedure.java | 11 +++
...ionServerHostingReplicaSlowOpenCoprocessor.java | 59 +++++++++++++++
.../assignment/TestMergeTableRegionsProcedure.java | 29 ++++++++
.../assignment/TestSplitTableRegionProcedure.java | 84 ++++++++++------------
.../procedure/TestTruncateRegionProcedure.java | 3 -
7 files changed, 166 insertions(+), 49 deletions(-)
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
index 4ad45757ad5..38eae148004 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/MergeTableRegionsProcedure.java
@@ -450,6 +450,19 @@ public class MergeTableRegionsProcedure
"Skip merging regions " + regionNamesToLog + ", because we are
snapshotting " + tn);
}
+ /*
+ * Sometimes a ModifyTableProcedure has edited a table descriptor to
change the number of region
+ * replicas for a table, but it has not yet opened/closed the new
replicas. The
+ * ModifyTableProcedure assumes that nobody else will do the
opening/closing of the new
+ * replicas, but a concurrent MergeTableRegionProcedure would violate that
assumption.
+ */
+ if (isTableModificationInProgress(env)) {
+ setFailure(getClass().getSimpleName(),
+ new IOException("Skip merging regions " + regionNamesToLog
+ + ", because there is an active procedure that is modifying the
table " + tn));
+ return false;
+ }
+
// Mostly the below two checks are not used because we already check the
switches before
// submitting the merge procedure. Just for safety, we are checking the
switch again here.
// Also, in case the switch was set to false after submission, this
procedure can be rollbacked,
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
index 1dc6768cb05..3d3d3d18de2 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/SplitTableRegionProcedure.java
@@ -103,6 +103,8 @@ public class SplitTableRegionProcedure
private RegionInfo daughterTwoRI;
private byte[] bestSplitRow;
private RegionSplitPolicy splitPolicy;
+ // exposed for unit testing
+ boolean checkTableModifyInProgress = true;
public SplitTableRegionProcedure() {
// Required by the Procedure framework to create the procedure on replay
@@ -514,6 +516,20 @@ public class SplitTableRegionProcedure
+ ", because we are taking snapshot for the table " +
getParentRegion().getTable()));
return false;
}
+
+ /*
+ * Sometimes a ModifyTableProcedure has edited a table descriptor to
change the number of region
+ * replicas for a table, but it has not yet opened/closed the new
replicas. The
+ * ModifyTableProcedure assumes that nobody else will do the
opening/closing of the new
+ * replicas, but a concurrent SplitTableRegionProcedure would violate that
assumption.
+ */
+ if (checkTableModifyInProgress && isTableModificationInProgress(env)) {
+ setFailure(new IOException("Skip splitting region " +
getParentRegion().getShortNameToLog()
+ + ", because there is an active procedure that is modifying the table "
+ + getParentRegion().getTable()));
+ return false;
+ }
+
// Check whether the region is splittable
RegionStateNode node =
env.getAssignmentManager().getRegionStates().getRegionStateNode(getParentRegion());
diff --git
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
index b6dce5198d4..67076507496 100644
---
a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
+++
b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/AbstractStateMachineTableProcedure.java
@@ -192,4 +192,15 @@ public abstract class
AbstractStateMachineTableProcedure<TState>
}
regionNode.checkOnline();
}
+
+ /**
+ * Some procedures cannot safely run while a table is being modified. Helper
method to allow them
+ * to check if such a procedure is running on a table.
+ */
+ protected final boolean isTableModificationInProgress(MasterProcedureEnv
env) {
+ return
env.getMasterServices().getMasterProcedureExecutor().getProcedures().stream()
+ .filter(p -> !p.isFinished()).filter(p -> p instanceof
TableProcedureInterface)
+ .map(p -> (TableProcedureInterface) p).filter(p ->
getTableName().equals(p.getTableName()))
+ .anyMatch(TableQueue::requireTableExclusiveLock);
+ }
}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/RegionServerHostingReplicaSlowOpenCoprocessor.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/RegionServerHostingReplicaSlowOpenCoprocessor.java
new file mode 100644
index 00000000000..67fafbbe4be
--- /dev/null
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/RegionServerHostingReplicaSlowOpenCoprocessor.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.master.assignment;
+
+import java.util.Optional;
+import org.apache.hadoop.hbase.client.RegionInfo;
+import org.apache.hadoop.hbase.coprocessor.ObserverContext;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
+import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
+import org.apache.hadoop.hbase.coprocessor.RegionObserver;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This coprocessor is used to slow down opening of the replica regions.
+ */
+public class RegionServerHostingReplicaSlowOpenCoprocessor
+ implements RegionCoprocessor, RegionObserver {
+
+ private static final Logger LOG =
+
LoggerFactory.getLogger(RegionServerHostingReplicaSlowOpenCoprocessor.class);
+
+ static volatile boolean slowDownReplicaOpen = false;
+
+ @Override
+ public Optional<RegionObserver> getRegionObserver() {
+ return Optional.of(this);
+ }
+
+ @Override
+ public void preOpen(ObserverContext<? extends RegionCoprocessorEnvironment>
c) {
+ int replicaId =
c.getEnvironment().getRegion().getRegionInfo().getReplicaId();
+ if (replicaId != RegionInfo.DEFAULT_REPLICA_ID) {
+ while (slowDownReplicaOpen) {
+ LOG.info("Slow down replica region open a bit");
+ try {
+ Thread.sleep(250);
+ } catch (InterruptedException ignored) {
+ return;
+ }
+ }
+ }
+ }
+}
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
index c0c4e355f2b..936cdc032fb 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestMergeTableRegionsProcedure.java
@@ -42,6 +42,7 @@ import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
+import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
import org.apache.hadoop.hbase.master.procedure.TestSnapshotProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
@@ -100,6 +101,8 @@ public class TestMergeTableRegionsProcedure {
conf.setInt("hbase.master.maximum.ping.server.attempts", 3);
conf.setInt("hbase.master.ping.server.retry.sleep.interval", 1);
conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
+ conf.set("hbase.coprocessor.region.classes",
+ RegionServerHostingReplicaSlowOpenCoprocessor.class.getName());
}
@BeforeClass
@@ -391,6 +394,32 @@ public class TestMergeTableRegionsProcedure {
assertEquals(initialRegionCount,
UTIL.getAdmin().getRegions(tableName).size());
}
+ @Test
+ public void testMergeDetectsModifyTableProcedure() throws Exception {
+ final TableName tableName = TableName.valueOf(name.getMethodName());
+ final ProcedureExecutor<MasterProcedureEnv> procExec =
getMasterProcedureExecutor();
+
+ List<RegionInfo> regions = createTable(tableName);
+
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = true;
+ TableDescriptor td =
TableDescriptorBuilder.newBuilder(admin.getDescriptor(tableName))
+ .setRegionReplication(2).build();
+ long modifyProcId =
+ procExec.submitProcedure(new
ModifyTableProcedure(procExec.getEnvironment(), td));
+
+ // Merge regions of the table, the MergeTableRegionsProcedure will fail
because there is a
+ // ModifyTableProcedure in progress
+ MergeTableRegionsProcedure mergeProcedure = new MergeTableRegionsProcedure(
+ procExec.getEnvironment(), regions.toArray(new RegionInfo[0]), false);
+ long mergeProcId = procExec.submitProcedure(mergeProcedure);
+ ProcedureTestingUtility.waitProcedure(procExec, mergeProcId);
+ ProcedureTestingUtility.assertProcFailed(procExec, mergeProcId);
+
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = false;
+ ProcedureTestingUtility.waitProcedure(procExec, modifyProcId);
+ ProcedureTestingUtility.assertProcNotFailed(procExec, modifyProcId);
+ }
+
private List<RegionInfo> createTable(final TableName tableName) throws
Exception {
TableDescriptor desc = TableDescriptorBuilder.newBuilder(tableName)
.setColumnFamily(ColumnFamilyDescriptorBuilder.of(FAMILY)).build();
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
index 3d79fe208cf..e9dd0d3ac98 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestSplitTableRegionProcedure.java
@@ -25,7 +25,6 @@ import static org.junit.Assert.fail;
import java.io.IOException;
import java.util.List;
-import java.util.Optional;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.Cell;
import org.apache.hadoop.hbase.CellUtil;
@@ -45,14 +44,12 @@ import org.apache.hadoop.hbase.client.SnapshotDescription;
import org.apache.hadoop.hbase.client.SnapshotType;
import org.apache.hadoop.hbase.client.Table;
import org.apache.hadoop.hbase.client.TableDescriptor;
-import org.apache.hadoop.hbase.coprocessor.ObserverContext;
-import org.apache.hadoop.hbase.coprocessor.RegionCoprocessor;
-import org.apache.hadoop.hbase.coprocessor.RegionCoprocessorEnvironment;
-import org.apache.hadoop.hbase.coprocessor.RegionObserver;
+import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
import org.apache.hadoop.hbase.master.RegionState;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureConstants;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv;
import org.apache.hadoop.hbase.master.procedure.MasterProcedureTestingUtility;
+import org.apache.hadoop.hbase.master.procedure.ModifyTableProcedure;
import org.apache.hadoop.hbase.master.procedure.TestSnapshotProcedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureMetrics;
@@ -114,39 +111,7 @@ public class TestSplitTableRegionProcedure {
conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
conf.setLong(HConstants.MAJOR_COMPACTION_PERIOD, 0);
conf.set("hbase.coprocessor.region.classes",
- RegionServerHostingReplicaSlowOpenCopro.class.getName());
- conf.setInt("hbase.client.sync.wait.timeout.msec", 1500);
- }
-
- /**
- * This copro is used to slow down opening of the replica regions.
- */
- public static class RegionServerHostingReplicaSlowOpenCopro
- implements RegionCoprocessor, RegionObserver {
- static int countForReplica = 0;
- static boolean slowDownReplicaOpen = false;
-
- @Override
- public Optional<RegionObserver> getRegionObserver() {
- return Optional.of(this);
- }
-
- @Override
- public void preOpen(ObserverContext<? extends
RegionCoprocessorEnvironment> c)
- throws IOException {
- int replicaId =
c.getEnvironment().getRegion().getRegionInfo().getReplicaId();
- if ((replicaId != RegionInfo.DEFAULT_REPLICA_ID) && (countForReplica ==
0)) {
- countForReplica++;
- while (slowDownReplicaOpen) {
- LOG.info("Slow down replica region open a bit");
- try {
- Thread.sleep(100);
- } catch (InterruptedException ie) {
- // Ingore
- }
- }
- }
- }
+ RegionServerHostingReplicaSlowOpenCoprocessor.class.getName());
}
@BeforeClass
@@ -191,15 +156,13 @@ public class TestSplitTableRegionProcedure {
final TableName tableName = TableName.valueOf(name.getMethodName());
final ProcedureExecutor<MasterProcedureEnv> procExec =
getMasterProcedureExecutor();
- RegionServerHostingReplicaSlowOpenCopro.slowDownReplicaOpen = true;
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = true;
RegionInfo[] regions =
MasterProcedureTestingUtility.createTable(procExec, tableName, null,
columnFamilyName1);
- try {
- HBaseTestingUtil.setReplicas(UTIL.getAdmin(), tableName, 2);
- } catch (IOException ioe) {
-
- }
+ TableDescriptor td =
TableDescriptorBuilder.newBuilder(UTIL.getAdmin().getDescriptor(tableName))
+ .setRegionReplication(2).build();
+ procExec.submitProcedure(new
ModifyTableProcedure(procExec.getEnvironment(), td));
// wait until the primary region is online.
HBaseTestingUtil.await(2000, () -> {
@@ -223,7 +186,7 @@ public class TestSplitTableRegionProcedure {
ProcedureTestingUtility.waitProcedure(procExec, procId);
// Let replica parent region open.
- RegionServerHostingReplicaSlowOpenCopro.slowDownReplicaOpen = false;
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = false;
// wait until the replica region is online.
HBaseTestingUtil.await(2000, () -> {
@@ -231,7 +194,10 @@ public class TestSplitTableRegionProcedure {
AssignmentManager am =
UTIL.getHBaseCluster().getMaster().getAssignmentManager();
if (am == null) return false;
RegionInfo replicaRegion =
RegionReplicaUtil.getRegionInfoForReplica(regions[0], 1);
- if (am.getRegionStates().getRegionState(replicaRegion).isOpened()) {
+ if (
+ am.getRegionStates().getRegionState(replicaRegion) != null
+ && am.getRegionStates().getRegionState(replicaRegion).isOpened()
+ ) {
return true;
}
return false;
@@ -610,6 +576,32 @@ public class TestSplitTableRegionProcedure {
assertEquals(splitFailedCount + 1,
splitProcMetrics.getFailedCounter().getCount());
}
+ @Test
+ public void testSplitDetectsModifyTableProcedure() throws Exception {
+ final TableName tableName = TableName.valueOf(name.getMethodName());
+ final ProcedureExecutor<MasterProcedureEnv> procExec =
getMasterProcedureExecutor();
+
+ RegionInfo[] regions =
+ MasterProcedureTestingUtility.createTable(procExec, tableName, null,
columnFamilyName1);
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = true;
+ TableDescriptor td =
TableDescriptorBuilder.newBuilder(UTIL.getAdmin().getDescriptor(tableName))
+ .setRegionReplication(2).build();
+ long modifyProcId =
+ procExec.submitProcedure(new
ModifyTableProcedure(procExec.getEnvironment(), td));
+
+ // Split region of the table, the SplitTableRegionProcedure will fail
because there is a
+ // ModifyTableProcedure in progress
+ SplitTableRegionProcedure splitProcedure = new SplitTableRegionProcedure(
+ procExec.getEnvironment(), regions[0], HConstants.CATALOG_FAMILY);
+ long splitProcId = procExec.submitProcedure(splitProcedure);
+ ProcedureTestingUtility.waitProcedure(procExec, splitProcId);
+ ProcedureTestingUtility.assertProcFailed(procExec, splitProcId);
+
+ RegionServerHostingReplicaSlowOpenCoprocessor.slowDownReplicaOpen = false;
+ ProcedureTestingUtility.waitProcedure(procExec, modifyProcId);
+ ProcedureTestingUtility.assertProcNotFailed(procExec, modifyProcId);
+ }
+
private void deleteData(final TableName tableName, final int
startDeleteRowNum)
throws IOException, InterruptedException {
Table t = UTIL.getConnection().getTable(tableName);
diff --git
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedure.java
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedure.java
index 8ce60ee5550..8ea493984e6 100644
---
a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedure.java
+++
b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/procedure/TestTruncateRegionProcedure.java
@@ -35,7 +35,6 @@ import org.apache.hadoop.hbase.client.RegionInfo;
import org.apache.hadoop.hbase.client.RegionReplicaUtil;
import org.apache.hadoop.hbase.client.TableDescriptor;
import org.apache.hadoop.hbase.client.TableDescriptorBuilder;
-import org.apache.hadoop.hbase.master.assignment.TestSplitTableRegionProcedure;
import org.apache.hadoop.hbase.procedure2.Procedure;
import org.apache.hadoop.hbase.procedure2.ProcedureExecutor;
import org.apache.hadoop.hbase.procedure2.ProcedureTestingUtility;
@@ -70,8 +69,6 @@ public class TestTruncateRegionProcedure extends
TestTableDDLProcedureBase {
private static void setupConf(Configuration conf) {
conf.setInt(MasterProcedureConstants.MASTER_PROCEDURE_THREADS, 1);
conf.setLong(HConstants.MAJOR_COMPACTION_PERIOD, 0);
- conf.set("hbase.coprocessor.region.classes",
-
TestSplitTableRegionProcedure.RegionServerHostingReplicaSlowOpenCopro.class.getName());
conf.setInt("hbase.client.sync.wait.timeout.msec", 60000);
}