Repository: hbase Updated Branches: refs/heads/master bd0435892 -> 3afe9fb7e
HBASE-21017 Revisit the expected states for open/close Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/3afe9fb7 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/3afe9fb7 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/3afe9fb7 Branch: refs/heads/master Commit: 3afe9fb7e6ebfa71187cbe131558a83fae61cecd Parents: bd04358 Author: zhangduo <zhang...@apache.org> Authored: Tue Aug 28 06:46:00 2018 +0800 Committer: Duo Zhang <zhang...@apache.org> Committed: Tue Aug 28 14:49:22 2018 +0800 ---------------------------------------------------------------------- .../master/assignment/AssignmentManager.java | 55 +++++++++++++--- .../master/assignment/CloseRegionProcedure.java | 6 ++ .../master/assignment/OpenRegionProcedure.java | 6 ++ .../assignment/RegionRemoteProcedureBase.java | 40 +++++++++--- .../hbase/master/assignment/RegionStates.java | 16 ----- .../hadoop/hbase/client/TestEnableTable.java | 68 -------------------- 6 files changed, 88 insertions(+), 103 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/3afe9fb7/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java index a91f8e4..745703f 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java @@ -552,7 +552,7 @@ public class AssignmentManager implements ServerListener { TransitRegionStateProcedure proc; regionNode.lock(); try { - preTransitCheck(regionNode, RegionStates.STATES_EXPECTED_ON_OPEN); + preTransitCheck(regionNode, STATES_EXPECTED_ON_ASSIGN); proc = TransitRegionStateProcedure.assign(getProcedureEnvironment(), regionInfo, sn); regionNode.setProcedure(proc); } finally { @@ -573,7 +573,7 @@ public class AssignmentManager implements ServerListener { TransitRegionStateProcedure proc; regionNode.lock(); try { - preTransitCheck(regionNode, RegionStates.STATES_EXPECTED_ON_CLOSE); + preTransitCheck(regionNode, STATES_EXPECTED_ON_UNASSIGN_OR_MOVE); proc = TransitRegionStateProcedure.unassign(getProcedureEnvironment(), regionInfo); regionNode.setProcedure(proc); } finally { @@ -591,7 +591,7 @@ public class AssignmentManager implements ServerListener { TransitRegionStateProcedure proc; regionNode.lock(); try { - preTransitCheck(regionNode, RegionStates.STATES_EXPECTED_ON_CLOSE); + preTransitCheck(regionNode, STATES_EXPECTED_ON_UNASSIGN_OR_MOVE); regionNode.checkOnline(); proc = TransitRegionStateProcedure.move(getProcedureEnvironment(), regionInfo, targetServer); regionNode.setProcedure(proc); @@ -1411,6 +1411,35 @@ public class AssignmentManager implements ServerListener { } // ============================================================================================ + // Expected states on region state transition. + // Notice that there is expected states for transiting to OPENING state, this is because SCP. + // See the comments in regionOpening method for more details. + // ============================================================================================ + private static final State[] STATES_EXPECTED_ON_OPEN = { + State.OPENING, // Normal case + State.OPEN // Retrying + }; + + private static final State[] STATES_EXPECTED_ON_CLOSING = { + State.OPEN, // Normal case + State.CLOSING, // Retrying + State.SPLITTING, // Offline the split parent + State.MERGING // Offline the merge parents + }; + + private static final State[] STATES_EXPECTED_ON_CLOSED = { + State.CLOSING, // Normal case + State.CLOSED // Retrying + }; + + // This is for manually scheduled region assign, can add other states later if we find out other + // usages + private static final State[] STATES_EXPECTED_ON_ASSIGN = { State.CLOSED, State.OFFLINE }; + + // We only allow unassign or move a region which is in OPEN state. + private static final State[] STATES_EXPECTED_ON_UNASSIGN_OR_MOVE = { State.OPEN }; + + // ============================================================================================ // Region Status update // Should only be called in TransitRegionStateProcedure // ============================================================================================ @@ -1432,7 +1461,10 @@ public class AssignmentManager implements ServerListener { // should be called within the synchronized block of RegionStateNode void regionOpening(RegionStateNode regionNode) throws IOException { - transitStateAndUpdate(regionNode, State.OPENING, RegionStates.STATES_EXPECTED_ON_OPEN); + // As in SCP, for performance reason, there is no TRSP attached with this region, we will not + // update the region state, which means that the region could be in any state when we want to + // assign it after a RS crash. So here we do not pass the expectedStates parameter. + transitStateAndUpdate(regionNode, State.OPENING); regionStates.addRegionToServer(regionNode); // update the operation count metrics metrics.incrementOperationCounter(); @@ -1468,7 +1500,7 @@ public class AssignmentManager implements ServerListener { void regionOpened(RegionStateNode regionNode) throws IOException { // TODO: OPENING Updates hbase:meta too... we need to do both here and there? // That is a lot of hbase:meta writing. - transitStateAndUpdate(regionNode, State.OPEN, RegionStates.STATES_EXPECTED_ON_OPEN); + transitStateAndUpdate(regionNode, State.OPEN, STATES_EXPECTED_ON_OPEN); RegionInfo hri = regionNode.getRegionInfo(); if (isMetaRegion(hri)) { // Usually we'd set a table ENABLED at this stage but hbase:meta is ALWAYs enabled, it @@ -1483,8 +1515,7 @@ public class AssignmentManager implements ServerListener { // should be called within the synchronized block of RegionStateNode void regionClosing(RegionStateNode regionNode) throws IOException { - transitStateAndUpdate(regionNode, State.CLOSING, RegionStates.STATES_EXPECTED_ON_CLOSE); - regionStateStore.updateRegionLocation(regionNode); + transitStateAndUpdate(regionNode, State.CLOSING, STATES_EXPECTED_ON_CLOSING); RegionInfo hri = regionNode.getRegionInfo(); // Set meta has not initialized early. so people trying to create/edit tables will wait @@ -1502,8 +1533,13 @@ public class AssignmentManager implements ServerListener { void regionClosed(RegionStateNode regionNode, boolean normally) throws IOException { RegionState.State state = regionNode.getState(); ServerName regionLocation = regionNode.getRegionLocation(); - regionNode.transitionState(normally ? State.CLOSED : State.ABNORMALLY_CLOSED, - RegionStates.STATES_EXPECTED_ON_CLOSE); + if (normally) { + regionNode.transitionState(State.CLOSED, STATES_EXPECTED_ON_CLOSED); + } else { + // For SCP + regionNode.transitionState(State.ABNORMALLY_CLOSED); + } + regionNode.setRegionLocation(null); boolean succ = false; try { regionStateStore.updateRegionLocation(regionNode); @@ -1517,7 +1553,6 @@ public class AssignmentManager implements ServerListener { } if (regionLocation != null) { regionNode.setLastHost(regionLocation); - regionNode.setRegionLocation(null); regionStates.removeRegionFromServer(regionLocation, regionNode); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/3afe9fb7/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java index e446e17..0d22a9c 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/CloseRegionProcedure.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionCloseOperation; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; @@ -79,4 +80,9 @@ public class CloseRegionProcedure extends RegionRemoteProcedureBase { assignCandidate = ProtobufUtil.toServerName(data.getAssignCandidate()); } } + + @Override + protected boolean shouldDispatch(RegionStateNode regionNode) { + return !regionNode.isInState(RegionState.State.CLOSED, RegionState.State.ABNORMALLY_CLOSED); + } } http://git-wip-us.apache.org/repos/asf/hbase/blob/3afe9fb7/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java index 1a79697..125ef11 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/OpenRegionProcedure.java @@ -20,6 +20,7 @@ package org.apache.hadoop.hbase.master.assignment; import java.io.IOException; import org.apache.hadoop.hbase.ServerName; import org.apache.hadoop.hbase.client.RegionInfo; +import org.apache.hadoop.hbase.master.RegionState; import org.apache.hadoop.hbase.master.procedure.MasterProcedureEnv; import org.apache.hadoop.hbase.master.procedure.RSProcedureDispatcher.RegionOpenOperation; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; @@ -64,4 +65,9 @@ public class OpenRegionProcedure extends RegionRemoteProcedureBase { super.deserializeStateData(serializer); serializer.deserialize(OpenRegionProcedureStateData.class); } + + @Override + protected boolean shouldDispatch(RegionStateNode regionNode) { + return !regionNode.isInState(RegionState.State.OPEN); + } } http://git-wip-us.apache.org/repos/asf/hbase/blob/3afe9fb7/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java index 6770e68..3e05cde 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionRemoteProcedureBase.java @@ -77,16 +77,16 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur throw new UnsupportedOperationException(); } - private ProcedureEvent<?> getRegionEvent(MasterProcedureEnv env) { - return env.getAssignmentManager().getRegionStates().getOrCreateRegionStateNode(region) - .getProcedureEvent(); + private RegionStateNode getRegionNode(MasterProcedureEnv env) { + return env.getAssignmentManager().getRegionStates().getRegionStateNode(region); } @Override - public void remoteCallFailed(MasterProcedureEnv env, ServerName remote, - IOException exception) { - ProcedureEvent<?> event = getRegionEvent(env); - synchronized (event) { + public void remoteCallFailed(MasterProcedureEnv env, ServerName remote, IOException exception) { + RegionStateNode regionNode = getRegionNode(env); + regionNode.lock(); + try { + ProcedureEvent<?> event = regionNode.getProcedureEvent(); if (event.isReady()) { LOG.warn( "The procedure event of procedure {} for region {} to server {} is not suspended, " + @@ -97,6 +97,8 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur LOG.warn("The remote operation {} for region {} to server {} failed", this, region, targetServer, exception); event.wake(env.getProcedureScheduler()); + } finally { + regionNode.unlock(); } } @@ -115,6 +117,17 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur return false; } + /** + * Check whether we still need to make the call to RS. + * <p/> + * Usually this will not happen if we do not allow assigning a already onlined region. But if we + * have something wrong in the RSProcedureDispatcher, where we have already sent the request to + * RS, but then we tell the upper layer the remote call is failed due to rpc timeout or connection + * closed or anything else, then this issue can still happen. So here we add a check to make it + * more robust. + */ + protected abstract boolean shouldDispatch(RegionStateNode regionNode); + @Override protected Procedure<MasterProcedureEnv>[] execute(MasterProcedureEnv env) throws ProcedureYieldException, ProcedureSuspendedException, InterruptedException { @@ -122,8 +135,15 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur // we are done, the parent procedure will check whether we are succeeded. return null; } - ProcedureEvent<?> event = getRegionEvent(env); - synchronized (event) { + RegionStateNode regionNode = getRegionNode(env); + regionNode.lock(); + try { + if (!shouldDispatch(regionNode)) { + return null; + } + // The code which wakes us up also needs to lock the RSN so here we do not need to synchronize + // on the event. + ProcedureEvent<?> event = regionNode.getProcedureEvent(); try { env.getRemoteDispatcher().addOperationToNode(targetServer, this); } catch (FailedRemoteDispatchException e) { @@ -136,6 +156,8 @@ public abstract class RegionRemoteProcedureBase extends Procedure<MasterProcedur event.suspend(); event.suspendIfNotReady(this); throw new ProcedureSuspendedException(); + } finally { + regionNode.unlock(); } } http://git-wip-us.apache.org/repos/asf/hbase/blob/3afe9fb7/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java index 26a6884..eeb9252 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStates.java @@ -55,22 +55,6 @@ import org.apache.hbase.thirdparty.com.google.common.annotations.VisibleForTesti public class RegionStates { private static final Logger LOG = LoggerFactory.getLogger(RegionStates.class); - // TODO: need to be more specific, i.e, OPENING vs. OPEN, CLOSING vs. CLOSED. - static final State[] STATES_EXPECTED_ON_OPEN = new State[] { - State.OPEN, // State may already be OPEN if we died after receiving the OPEN from regionserver - // but before complete finish of AssignProcedure. HBASE-20100. - State.OFFLINE, State.CLOSED, State.ABNORMALLY_CLOSED, // disable/offline - State.SPLITTING, // ServerCrashProcedure - State.OPENING, State.FAILED_OPEN, // already in-progress (retrying) - State.MERGED, State.SPLITTING_NEW - }; - - static final State[] STATES_EXPECTED_ON_CLOSE = new State[] { - State.SPLITTING, State.MERGING, State.OPENING, // ServerCrashProcedure - State.OPEN, // enabled/open - State.CLOSING // already in-progress (retrying) - }; - // This comparator sorts the RegionStates by time stamp then Region name. // Comparing by timestamp alone can lead us to discard different RegionStates that happen // to share a timestamp. http://git-wip-us.apache.org/repos/asf/hbase/blob/3afe9fb7/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestEnableTable.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestEnableTable.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestEnableTable.java index 7a1bc55..4e2d6ba 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestEnableTable.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/client/TestEnableTable.java @@ -18,33 +18,26 @@ package org.apache.hadoop.hbase.client; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; -import java.util.ArrayList; -import java.util.List; import java.util.Optional; import java.util.concurrent.CountDownLatch; import org.apache.hadoop.hbase.HBaseClassTestRule; import org.apache.hadoop.hbase.HBaseTestingUtility; import org.apache.hadoop.hbase.HColumnDescriptor; import org.apache.hadoop.hbase.HConstants; -import org.apache.hadoop.hbase.HRegionInfo; import org.apache.hadoop.hbase.HTableDescriptor; import org.apache.hadoop.hbase.MetaTableAccessor; -import org.apache.hadoop.hbase.MiniHBaseCluster; import org.apache.hadoop.hbase.TableName; import org.apache.hadoop.hbase.coprocessor.CoprocessorHost; import org.apache.hadoop.hbase.coprocessor.MasterCoprocessor; import org.apache.hadoop.hbase.coprocessor.MasterCoprocessorEnvironment; import org.apache.hadoop.hbase.coprocessor.MasterObserver; import org.apache.hadoop.hbase.coprocessor.ObserverContext; -import org.apache.hadoop.hbase.master.HMaster; import org.apache.hadoop.hbase.testclassification.MasterTests; import org.apache.hadoop.hbase.testclassification.MediumTests; import org.apache.hadoop.hbase.util.Bytes; -import org.apache.hadoop.hbase.util.JVMClusterUtil; import org.junit.After; import org.junit.Before; import org.junit.ClassRule; @@ -55,10 +48,6 @@ import org.junit.rules.TestName; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.hbase.thirdparty.com.google.common.base.Predicate; -import org.apache.hbase.thirdparty.com.google.common.collect.Iterables; -import org.apache.hbase.thirdparty.com.google.common.collect.Lists; - @Category({ MasterTests.class, MediumTests.class }) public class TestEnableTable { @@ -85,63 +74,6 @@ public class TestEnableTable { TEST_UTIL.shutdownMiniCluster(); } - @Test - public void testEnableTableWithNoRegionServers() throws Exception { - final TableName tableName = TableName.valueOf(name.getMethodName()); - final MiniHBaseCluster cluster = TEST_UTIL.getHBaseCluster(); - final HMaster m = cluster.getMaster(); - final Admin admin = TEST_UTIL.getAdmin(); - final HTableDescriptor desc = new HTableDescriptor(tableName); - desc.addFamily(new HColumnDescriptor(FAMILYNAME)); - admin.createTable(desc); - admin.disableTable(tableName); - TEST_UTIL.waitTableDisabled(tableName.getName()); - - admin.enableTable(tableName); - TEST_UTIL.waitTableEnabled(tableName); - // disable once more - admin.disableTable(tableName); - - TEST_UTIL.waitUntilNoRegionsInTransition(60000); - // now stop region servers - JVMClusterUtil.RegionServerThread rs = cluster.getRegionServerThreads().get(0); - rs.getRegionServer().stop("stop"); - cluster.waitForRegionServerToStop(rs.getRegionServer().getServerName(), 10000); - - // We used to enable the table here but AMv2 would hang waiting on a RS to check-in. - // Revisit. - - JVMClusterUtil.RegionServerThread rs2 = cluster.startRegionServer(); - cluster.waitForRegionServerToStart(rs2.getRegionServer().getServerName().getHostname(), - rs2.getRegionServer().getServerName().getPort(), 60000); - - LOG.debug("Now enabling table " + tableName); - admin.enableTable(tableName); - assertTrue(admin.isTableEnabled(tableName)); - - List<HRegionInfo> regions = TEST_UTIL.getAdmin().getTableRegions(tableName); - assertEquals(1, regions.size()); - for (HRegionInfo region : regions) { - TEST_UTIL.getAdmin().assign(region.getEncodedNameAsBytes()); - } - LOG.debug("Waiting for table assigned " + tableName); - TEST_UTIL.waitUntilAllRegionsAssigned(tableName); - List<HRegionInfo> onlineRegions = admin.getOnlineRegions( - rs2.getRegionServer().getServerName()); - ArrayList<HRegionInfo> tableRegions = filterTableRegions(tableName, onlineRegions); - assertEquals(1, tableRegions.size()); - } - - private ArrayList<HRegionInfo> filterTableRegions(final TableName tableName, - List<HRegionInfo> onlineRegions) { - return Lists.newArrayList(Iterables.filter(onlineRegions, new Predicate<HRegionInfo>() { - @Override - public boolean apply(HRegionInfo input) { - return input.getTable().equals(tableName); - } - })); - } - /** * We were only clearing rows that had a hregioninfo column in hbase:meta. Mangled rows that * were missing the hregioninfo because of error were being left behind messing up any