HBASE-19815 Flakey TestAssignmentManager.testAssignWithRandExec (Part Two).
Part One cleaned up a ClassCastException. Part Two adds the ServerCrashProcedure#handleRIT behavior to RecoverMetaProcedure. Adds debug in the test. Project: http://git-wip-us.apache.org/repos/asf/hbase/repo Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/581fabe7 Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/581fabe7 Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/581fabe7 Branch: refs/heads/HBASE-19064 Commit: 581fabe7b2177a090af33517f2f7cb1cdab2c64b Parents: 646770d Author: Michael Stack <st...@apache.org> Authored: Wed Jan 17 22:35:35 2018 -0800 Committer: Michael Stack <st...@apache.org> Committed: Thu Jan 18 11:32:21 2018 -0800 ---------------------------------------------------------------------- .../master/procedure/RecoverMetaProcedure.java | 38 +++++++++++++++++--- .../assignment/TestAssignmentManager.java | 9 ++++- 2 files changed, 41 insertions(+), 6 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hbase/blob/581fabe7/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java index 50ef3e0..70d0d55 100644 --- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java +++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/procedure/RecoverMetaProcedure.java @@ -28,6 +28,8 @@ import org.apache.hadoop.hbase.client.RegionInfoBuilder; import org.apache.hadoop.hbase.client.RegionReplicaUtil; import org.apache.hadoop.hbase.master.MasterServices; import org.apache.hadoop.hbase.master.assignment.AssignProcedure; +import org.apache.hadoop.hbase.master.assignment.AssignmentManager; +import org.apache.hadoop.hbase.master.assignment.RegionTransitionProcedure; import org.apache.hadoop.hbase.procedure2.ProcedureStateSerializer; import org.apache.hadoop.hbase.procedure2.ProcedureSuspendedException; import org.apache.hadoop.hbase.procedure2.ProcedureYieldException; @@ -126,17 +128,17 @@ public class RecoverMetaProcedure RegionInfoBuilder.FIRST_META_REGIONINFO, this.replicaId); AssignProcedure metaAssignProcedure; + AssignmentManager am = master.getAssignmentManager(); if (failedMetaServer != null) { - LOG.info(this + "; Assigning meta with new plan. previous meta server=" + - failedMetaServer); - metaAssignProcedure = master.getAssignmentManager().createAssignProcedure(hri); + handleRIT(env, hri, this.failedMetaServer); + LOG.info(this + "; Assigning meta with new plan; previous server=" + failedMetaServer); + metaAssignProcedure = am.createAssignProcedure(hri); } else { // get server carrying meta from zk ServerName metaServer = MetaTableLocator.getMetaRegionState(master.getZooKeeper()).getServerName(); LOG.info(this + "; Retaining meta assignment to server=" + metaServer); - metaAssignProcedure = - master.getAssignmentManager().createAssignProcedure(hri, metaServer); + metaAssignProcedure = am.createAssignProcedure(hri, metaServer); } addChildProcedure(metaAssignProcedure); @@ -152,6 +154,32 @@ public class RecoverMetaProcedure return Flow.HAS_MORE_STATE; } + /** + * Is the region stuck assigning to this failedMetaServer? If so, cancel the call + * just as we do over in ServerCrashProcedure#handleRIT except less to do here; less context + * to carry. + */ + private void handleRIT(MasterProcedureEnv env, RegionInfo ri, ServerName crashedServerName) { + AssignmentManager am = env.getAssignmentManager(); + RegionTransitionProcedure rtp = am.getRegionStates().getRegionTransitionProcedure(ri); + if (rtp == null) { + return; // Nothing to do. Not in RIT. + } + // Make sure the RIT is against this crashed server. In the case where there are many + // processings of a crashed server -- backed up for whatever reason (slow WAL split) + // -- then a previous SCP may have already failed an assign, etc., and it may have a + // new location target; DO NOT fail these else we make for assign flux. + ServerName rtpServerName = rtp.getServer(env); + if (rtpServerName == null) { + LOG.warn("RIT with ServerName null! " + rtp); + } else if (rtpServerName.equals(crashedServerName)) { + LOG.info("pid=" + getProcId() + " found RIT " + rtp + "; " + + rtp.getRegionState(env).toShortString()); + rtp.remoteCallFailed(env, crashedServerName, + new ServerCrashException(getProcId(), crashedServerName)); + } + } + @Override protected void rollbackState(MasterProcedureEnv env, MasterProcedureProtos.RecoverMetaState recoverMetaState) http://git-wip-us.apache.org/repos/asf/hbase/blob/581fabe7/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java ---------------------------------------------------------------------- diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java index 3ab915b..9b9f624 100644 --- a/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java +++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/master/assignment/TestAssignmentManager.java @@ -24,6 +24,7 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import java.io.IOException; +import java.io.InterruptedIOException; import java.net.SocketTimeoutException; import java.util.NavigableMap; import java.util.Random; @@ -206,7 +207,7 @@ public class TestAssignmentManager { rsDispatcher.setMockRsExecutor(new RandRsExecutor()); // Loop a bunch of times so we hit various combos of exceptions. for (int i = 0; i < 10; i++) { - LOG.info("" + i); + LOG.info("ROUND=" + i); AssignProcedure proc = am.createAssignProcedure(hri); waitOnFuture(submitProcedure(proc)); } @@ -445,6 +446,12 @@ public class TestAssignmentManager { return future.get(5, TimeUnit.SECONDS); } catch (ExecutionException e) { LOG.info("ExecutionException", e); + Exception ee = (Exception)e.getCause(); + if (ee instanceof InterruptedIOException) { + for (Procedure p: this.master.getMasterProcedureExecutor().getProcedures()) { + LOG.info(p.toStringDetails()); + } + } throw (Exception)e.getCause(); } }