This is an automated email from the ASF dual-hosted git repository. huaxiangsun pushed a commit to branch branch-2.3 in repository https://gitbox.apache.org/repos/asf/hbase.git
The following commit(s) were added to refs/heads/branch-2.3 by this push: new 0a4ddd6 HBASE-25639 meta replica state is not respected during active master switch (#3052) 0a4ddd6 is described below commit 0a4ddd6c3bcf50ce7f3563e1699145d38cdf9de0 Author: huaxiangsun <huaxiang...@apache.org> AuthorDate: Thu Mar 18 11:35:46 2021 -0700 HBASE-25639 meta replica state is not respected during active master switch (#3052) Signed-off-by: stack <st...@duboce.net> --- .../org/apache/hadoop/hbase/master/HMaster.java | 2 +- .../hadoop/hbase/master/MasterMetaBootstrap.java | 27 +++--- .../hbase/master/assignment/AssignmentManager.java | 36 +++++--- .../master/TestMasterFailoverWithMetaReplica.java | 102 +++++++++++++++++++++ 4 files changed, 138 insertions(+), 29 deletions(-) diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java index b78f7f3..5f8b63d 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/HMaster.java @@ -1018,7 +1018,7 @@ public class HMaster extends HRegionServer implements MasterServices { RegionState rs = this.assignmentManager.getRegionStates(). getRegionState(RegionInfoBuilder.FIRST_META_REGIONINFO); LOG.info("hbase:meta {}", rs); - if (rs != null && rs.isOffline()) { + if ((rs == null) || (rs != null && rs.isOffline())) { Optional<InitMetaProcedure> optProc = procedureExecutor.getProcedures().stream() .filter(p -> p instanceof InitMetaProcedure).map(o -> (InitMetaProcedure) o).findAny(); initMetaProc = optProc.orElseGet(() -> { diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java index da8d228..800ae97 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/MasterMetaBootstrap.java @@ -21,7 +21,6 @@ package org.apache.hadoop.hbase.master; import java.io.IOException; import java.util.List; import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.RegionReplicaUtil; @@ -61,25 +60,25 @@ class MasterMetaBootstrap { throw new IllegalStateException("hbase:meta must be initialized first before we can " + "assign out its replicas"); } - ServerName metaServername = MetaTableLocator.getMetaRegionLocation(this.master.getZooKeeper()); + for (int i = 1; i < numReplicas; i++) { // Get current meta state for replica from zk. - RegionState metaState = MetaTableLocator.getMetaRegionState(master.getZooKeeper(), i); RegionInfo hri = RegionReplicaUtil.getRegionInfoForReplica( - RegionInfoBuilder.FIRST_META_REGIONINFO, i); - LOG.debug(hri.getRegionNameAsString() + " replica region state from zookeeper=" + metaState); - if (metaServername.equals(metaState.getServerName())) { - metaState = null; - LOG.info(hri.getRegionNameAsString() + - " old location is same as current hbase:meta location; setting location as null..."); - } + RegionInfoBuilder.FIRST_META_REGIONINFO, i); + + RegionState rs = assignmentManager.getRegionStates().getRegionState(hri); + LOG.debug(hri.getRegionNameAsString() + " replica region state from zookeeper=" + rs); + // These assigns run inline. All is blocked till they complete. Only interrupt is shutting // down hosting server which calls AM#stop. - if (metaState != null && metaState.getServerName() != null) { - // Try to retain old assignment. - assignmentManager.assignAsync(hri, metaState.getServerName()); - } else { + if (rs == null) { assignmentManager.assignAsync(hri); + } else if (rs != null && rs.isOffline()) { + if (rs.getServerName() != null) { + assignmentManager.assignAsync(hri, rs.getServerName()); + } else { + assignmentManager.assignAsync(hri); + } } } unassignExcessMetaReplica(numReplicas); diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index edc7dee..880de2a 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -45,6 +45,7 @@ import org.apache.hadoop.hbase.client.DoNotRetryRegionException; import org.apache.hadoop.hbase.client.MasterSwitchType; import org.apache.hadoop.hbase.client.RegionInfo; import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.client.RegionStatesCount; import org.apache.hadoop.hbase.client.Result; import org.apache.hadoop.hbase.client.TableState; @@ -225,21 +226,28 @@ public class AssignmentManager { ZKWatcher zkw = master.getZooKeeper(); // it could be null in some tests if (zkw != null) { - // here we are still in the early steps of active master startup. There is only one thread(us) - // can access AssignmentManager and create region node, so here we do not need to lock the - // region node. - RegionState regionState = MetaTableLocator.getMetaRegionState(zkw); - RegionStateNode regionNode = - regionStates.getOrCreateRegionStateNode(RegionInfoBuilder.FIRST_META_REGIONINFO); - regionNode.setRegionLocation(regionState.getServerName()); - regionNode.setState(regionState.getState()); - if (regionNode.getProcedure() != null) { - regionNode.getProcedure().stateLoaded(this, regionNode); - } - if (regionState.getServerName() != null) { - regionStates.addRegionToServer(regionNode); + List<String> metaZNodes = zkw.getMetaReplicaNodes(); + LOG.debug("hbase:meta replica znodes: {}", metaZNodes); + for (String metaZNode : metaZNodes) { + int replicaId = zkw.getZNodePaths().getMetaReplicaIdFromZnode(metaZNode); + // here we are still in the early steps of active master startup. There is only one thread(us) + // can access AssignmentManager and create region node, so here we do not need to lock the + // region node. + RegionState regionState = MetaTableLocator.getMetaRegionState(zkw, replicaId); + RegionStateNode regionNode = regionStates.getOrCreateRegionStateNode(regionState.getRegion()); + regionNode.setRegionLocation(regionState.getServerName()); + regionNode.setState(regionState.getState()); + if (regionNode.getProcedure() != null) { + regionNode.getProcedure().stateLoaded(this, regionNode); + } + if (regionState.getServerName() != null) { + regionStates.addRegionToServer(regionNode); + } + if (RegionReplicaUtil.isDefaultReplica(replicaId)) { + setMetaAssigned(regionState.getRegion(), regionState.getState() == State.OPEN); + } + LOG.debug("Loaded hbase:meta {}", regionNode); } - setMetaAssigned(regionState.getRegion(), regionState.getState() == State.OPEN); } } diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailoverWithMetaReplica.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailoverWithMetaReplica.java new file mode 100644 index 0000000..c58dc06 --- /dev/null +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestMasterFailoverWithMetaReplica.java @@ -0,0 +1,102 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hbase.master; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +import java.util.List; +import org.apache.hadoop.hbase.HBaseClassTestRule; +import org.apache.hadoop.hbase.HBaseTestingUtility; +import org.apache.hadoop.hbase.HConstants; +import org.apache.hadoop.hbase.MiniHBaseCluster; +import org.apache.hadoop.hbase.StartMiniClusterOption; +import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.client.RegionInfoBuilder; +import org.apache.hadoop.hbase.client.RegionReplicaUtil; +import org.apache.hadoop.hbase.master.assignment.AssignmentTestingUtil; +import org.apache.hadoop.hbase.testclassification.LargeTests; +import org.apache.hadoop.hbase.testclassification.MasterTests; +import org.apache.hadoop.hbase.util.JVMClusterUtil; +import org.junit.ClassRule; +import org.junit.Test; +import org.junit.experimental.categories.Category; + +@Category({MasterTests.class, LargeTests.class}) +public class TestMasterFailoverWithMetaReplica { + + @ClassRule + public static final HBaseClassTestRule CLASS_RULE = + HBaseClassTestRule.forClass(TestMasterFailoverWithMetaReplica.class); + private static final int num_of_meta_replica = 2; + + /** + * Test that during master failover, there is no state change for meta replica regions + * + * @throws Exception + */ + @Test + public void testMasterFailoverWithMetaReplica() throws Exception { + // Start the cluster + HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility(); + TEST_UTIL.getConfiguration().setInt(HConstants.META_REPLICAS_NUM, num_of_meta_replica); + + StartMiniClusterOption option = StartMiniClusterOption.builder() + .numMasters(2).numRegionServers(num_of_meta_replica).build(); + TEST_UTIL.startMiniCluster(option); + MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); + + assertTrue(cluster.waitForActiveAndReadyMaster()); + HMaster oldMaster = cluster.getMaster(); + + // Make sure meta replica regions are assigned. + for (int replicaId = 1; replicaId < num_of_meta_replica; replicaId++) { + RegionInfo h = RegionReplicaUtil + .getRegionInfoForReplica(RegionInfoBuilder.FIRST_META_REGIONINFO, replicaId); + AssignmentTestingUtil.waitForAssignment(oldMaster.getAssignmentManager(), h); + } + + int oldProcedureNum = oldMaster.getProcedures().size(); + + int activeIndex = -1; + List<JVMClusterUtil.MasterThread> masterThreads = cluster.getMasterThreads(); + + for (int i = 0; i < masterThreads.size(); i++) { + if (masterThreads.get(i).getMaster().isActiveMaster()) { + activeIndex = i; + } + } + + // Stop the active master and wait for new master to come online. + cluster.stopMaster(activeIndex); + cluster.waitOnMaster(activeIndex); + assertTrue(cluster.waitForActiveAndReadyMaster()); + // double check this is actually a new master + HMaster newMaster = cluster.getMaster(); + assertFalse(oldMaster == newMaster); + + int newProcedureNum = newMaster.getProcedures().size(); + + // Make sure all region servers report back and there is no new procedures. + assertEquals(newMaster.getServerManager().getOnlineServers().size(), num_of_meta_replica); + assertEquals(oldProcedureNum, newProcedureNum); + + // Stop the cluster + TEST_UTIL.shutdownMiniCluster(); + } +}