bharathv commented on a change in pull request #1755: URL: https://github.com/apache/hbase/pull/1755#discussion_r432774809
########## File path: hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ########## @@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region, } return; } + long sleepTime = 0; try { // Send CLOSE RPC - if (serverManager.sendRegionClose(server, region, - versionOfClosingNode, dest, transitionInZK)) { - LOG.debug("Sent CLOSE to " + server + " for region " + - region.getRegionNameAsString()); + if (serverManager.sendRegionClose(server, region, versionOfClosingNode, dest, + transitionInZK)) { + LOG.debug("Sent CLOSE to " + server + " for region " + region.getRegionNameAsString()); if (useZKForAssignment && !transitionInZK && state != null) { // Retry to make sure the region is // closed so as to avoid double assignment. - unassign(region, state, versionOfClosingNode, - dest, transitionInZK, src); + unassign(region, state, versionOfClosingNode, dest, transitionInZK, src); } return; } // This never happens. Currently regionserver close always return true. // Todo; this can now happen (0.96) if there is an exception in a coprocessor - LOG.warn("Server " + server + " region CLOSE RPC returned false for " + - region.getRegionNameAsString()); - } catch (Throwable t) { - long sleepTime = 0; + LOG.warn("Server " + server + " region CLOSE RPC returned false for " + + region.getRegionNameAsString()); + } catch (Throwable t) { Configuration conf = this.server.getConfiguration(); if (t instanceof RemoteException) { - t = ((RemoteException)t).unwrapRemoteException(); + t = ((RemoteException) t).unwrapRemoteException(); } boolean logRetries = true; - if (t instanceof RegionServerAbortedException - || t instanceof RegionServerStoppedException + if (t instanceof RegionServerAbortedException || t instanceof RegionServerStoppedException Review comment: nit: (t instanceof RegionServerAbortedException) is redundant (per static analysis check in the IDE)... ########## File path: hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java ########## @@ -598,6 +601,68 @@ public void testCloseFailed() throws Exception { } } + /** + * This tests region close with exponential backoff + */ + @Test(timeout = 60000) + public void testCloseRegionWithExponentialBackOff() throws Exception { + String table = "testCloseRegionWithExponentialBackOff"; Review comment: nit: We use TestTableName.getTableName() that automatically gets the TableName with the running test name. ########## File path: hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ########## @@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region, } return; } + long sleepTime = 0; try { // Send CLOSE RPC - if (serverManager.sendRegionClose(server, region, - versionOfClosingNode, dest, transitionInZK)) { - LOG.debug("Sent CLOSE to " + server + " for region " + - region.getRegionNameAsString()); + if (serverManager.sendRegionClose(server, region, versionOfClosingNode, dest, + transitionInZK)) { + LOG.debug("Sent CLOSE to " + server + " for region " + region.getRegionNameAsString()); if (useZKForAssignment && !transitionInZK && state != null) { // Retry to make sure the region is // closed so as to avoid double assignment. - unassign(region, state, versionOfClosingNode, - dest, transitionInZK, src); + unassign(region, state, versionOfClosingNode, dest, transitionInZK, src); } return; } // This never happens. Currently regionserver close always return true. // Todo; this can now happen (0.96) if there is an exception in a coprocessor - LOG.warn("Server " + server + " region CLOSE RPC returned false for " + - region.getRegionNameAsString()); - } catch (Throwable t) { - long sleepTime = 0; + LOG.warn("Server " + server + " region CLOSE RPC returned false for " + + region.getRegionNameAsString()); + } catch (Throwable t) { Configuration conf = this.server.getConfiguration(); if (t instanceof RemoteException) { - t = ((RemoteException)t).unwrapRemoteException(); + t = ((RemoteException) t).unwrapRemoteException(); } boolean logRetries = true; - if (t instanceof RegionServerAbortedException - || t instanceof RegionServerStoppedException + if (t instanceof RegionServerAbortedException || t instanceof RegionServerStoppedException || t instanceof ServerNotRunningYetException) { // RS is aborting or stopping, we cannot offline the region since the region may need - // to do WAL recovery. Until we see the RS expiration, we should retry. + // to do WAL recovery. Until we see the RS expiration, we should retry. sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); } else if (t instanceof NotServingRegionException) { - LOG.debug("Offline " + region.getRegionNameAsString() - + ", it's not any more on " + server, t); + LOG.debug( + "Offline " + region.getRegionNameAsString() + ", it's not any more on " + server, t); if (transitionInZK) { deleteClosingOrClosedNode(region, server); } if (state != null) { regionOffline(region); } return; - } else if ((t instanceof FailedServerException) || (state != null && - t instanceof RegionAlreadyInTransitionException)) { - if (t instanceof FailedServerException) { - sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + } else if ((t instanceof FailedServerException) + || (state != null && t instanceof RegionAlreadyInTransitionException)) { + if (t instanceof FailedServerException) { + sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); - } else { - // RS is already processing this region, only need to update the timestamp - LOG.debug("update " + state + " the timestamp."); - state.updateTimestampToNow(); - if (maxWaitTime < 0) { - maxWaitTime = - EnvironmentEdgeManager.currentTime() - + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME, - DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); - } - long now = EnvironmentEdgeManager.currentTime(); - if (now < maxWaitTime) { - LOG.debug("Region is already in transition; " - + "waiting up to " + (maxWaitTime - now) + "ms", t); - sleepTime = 100; - i--; // reset the try count - logRetries = false; + } else { + // RS is already processing this region, only need to update the timestamp + LOG.debug("update " + state + " the timestamp."); + state.updateTimestampToNow(); + if (maxWaitTime < 0) { + maxWaitTime = EnvironmentEdgeManager.currentTime() + conf.getLong( + ALREADY_IN_TRANSITION_WAITTIME, DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); + } + long now = EnvironmentEdgeManager.currentTime(); + if (now < maxWaitTime) { + LOG.debug("Region is already in transition; " + "waiting up to " + + (maxWaitTime - now) + "ms", + t); + sleepTime = 100; + i--; // reset the try count + logRetries = false; + } + } } - } - } - - try { - if (sleepTime > 0) { - Thread.sleep(sleepTime); - } - } catch (InterruptedException ie) { - LOG.warn("Failed to unassign " - + region.getRegionNameAsString() + " since interrupted", ie); - Thread.currentThread().interrupt(); - if (state != null) { - regionStates.updateRegionState(region, State.FAILED_CLOSE); - } - return; - } if (logRetries) { - LOG.info("Server " + server + " returned " + t + " for " - + region.getRegionNameAsString() + ", try=" + i - + " of " + this.maximumAttempts, t); + LOG.info("Server " + server + " returned " + t + " for " + region.getRegionNameAsString() + + ", try=" + i + " of " + this.maximumAttempts, + t); // Presume retry or server will expire. } } + // If sleepTime is not set by any of the cases, set it to sleep for Review comment: nit: Can you please add a few comments on the code flow so that its easier for other to follow? ########## File path: hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ########## @@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region, } return; } + long sleepTime = 0; try { // Send CLOSE RPC - if (serverManager.sendRegionClose(server, region, - versionOfClosingNode, dest, transitionInZK)) { - LOG.debug("Sent CLOSE to " + server + " for region " + - region.getRegionNameAsString()); + if (serverManager.sendRegionClose(server, region, versionOfClosingNode, dest, + transitionInZK)) { + LOG.debug("Sent CLOSE to " + server + " for region " + region.getRegionNameAsString()); if (useZKForAssignment && !transitionInZK && state != null) { // Retry to make sure the region is // closed so as to avoid double assignment. - unassign(region, state, versionOfClosingNode, - dest, transitionInZK, src); + unassign(region, state, versionOfClosingNode, dest, transitionInZK, src); } return; } // This never happens. Currently regionserver close always return true. // Todo; this can now happen (0.96) if there is an exception in a coprocessor - LOG.warn("Server " + server + " region CLOSE RPC returned false for " + - region.getRegionNameAsString()); - } catch (Throwable t) { - long sleepTime = 0; + LOG.warn("Server " + server + " region CLOSE RPC returned false for " + + region.getRegionNameAsString()); + } catch (Throwable t) { Configuration conf = this.server.getConfiguration(); if (t instanceof RemoteException) { - t = ((RemoteException)t).unwrapRemoteException(); + t = ((RemoteException) t).unwrapRemoteException(); } boolean logRetries = true; - if (t instanceof RegionServerAbortedException - || t instanceof RegionServerStoppedException + if (t instanceof RegionServerAbortedException || t instanceof RegionServerStoppedException || t instanceof ServerNotRunningYetException) { // RS is aborting or stopping, we cannot offline the region since the region may need - // to do WAL recovery. Until we see the RS expiration, we should retry. + // to do WAL recovery. Until we see the RS expiration, we should retry. sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); } else if (t instanceof NotServingRegionException) { - LOG.debug("Offline " + region.getRegionNameAsString() - + ", it's not any more on " + server, t); + LOG.debug( + "Offline " + region.getRegionNameAsString() + ", it's not any more on " + server, t); if (transitionInZK) { deleteClosingOrClosedNode(region, server); } if (state != null) { regionOffline(region); } return; - } else if ((t instanceof FailedServerException) || (state != null && - t instanceof RegionAlreadyInTransitionException)) { - if (t instanceof FailedServerException) { - sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + } else if ((t instanceof FailedServerException) + || (state != null && t instanceof RegionAlreadyInTransitionException)) { + if (t instanceof FailedServerException) { + sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); - } else { - // RS is already processing this region, only need to update the timestamp - LOG.debug("update " + state + " the timestamp."); - state.updateTimestampToNow(); - if (maxWaitTime < 0) { - maxWaitTime = - EnvironmentEdgeManager.currentTime() - + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME, - DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); - } - long now = EnvironmentEdgeManager.currentTime(); - if (now < maxWaitTime) { - LOG.debug("Region is already in transition; " - + "waiting up to " + (maxWaitTime - now) + "ms", t); - sleepTime = 100; - i--; // reset the try count - logRetries = false; + } else { + // RS is already processing this region, only need to update the timestamp + LOG.debug("update " + state + " the timestamp."); + state.updateTimestampToNow(); + if (maxWaitTime < 0) { + maxWaitTime = EnvironmentEdgeManager.currentTime() + conf.getLong( + ALREADY_IN_TRANSITION_WAITTIME, DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); + } + long now = EnvironmentEdgeManager.currentTime(); + if (now < maxWaitTime) { + LOG.debug("Region is already in transition; " + "waiting up to " + + (maxWaitTime - now) + "ms", + t); + sleepTime = 100; + i--; // reset the try count + logRetries = false; + } + } } - } - } - - try { - if (sleepTime > 0) { - Thread.sleep(sleepTime); - } - } catch (InterruptedException ie) { - LOG.warn("Failed to unassign " - + region.getRegionNameAsString() + " since interrupted", ie); - Thread.currentThread().interrupt(); - if (state != null) { - regionStates.updateRegionState(region, State.FAILED_CLOSE); - } - return; - } if (logRetries) { - LOG.info("Server " + server + " returned " + t + " for " - + region.getRegionNameAsString() + ", try=" + i - + " of " + this.maximumAttempts, t); + LOG.info("Server " + server + " returned " + t + " for " + region.getRegionNameAsString() Review comment: merge this logging and the logging in L2074 and log towards the end? ########## File path: hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ########## @@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region, } return; } + long sleepTime = 0; try { // Send CLOSE RPC - if (serverManager.sendRegionClose(server, region, - versionOfClosingNode, dest, transitionInZK)) { - LOG.debug("Sent CLOSE to " + server + " for region " + - region.getRegionNameAsString()); + if (serverManager.sendRegionClose(server, region, versionOfClosingNode, dest, + transitionInZK)) { + LOG.debug("Sent CLOSE to " + server + " for region " + region.getRegionNameAsString()); if (useZKForAssignment && !transitionInZK && state != null) { // Retry to make sure the region is // closed so as to avoid double assignment. - unassign(region, state, versionOfClosingNode, - dest, transitionInZK, src); + unassign(region, state, versionOfClosingNode, dest, transitionInZK, src); } return; } // This never happens. Currently regionserver close always return true. // Todo; this can now happen (0.96) if there is an exception in a coprocessor - LOG.warn("Server " + server + " region CLOSE RPC returned false for " + - region.getRegionNameAsString()); - } catch (Throwable t) { - long sleepTime = 0; + LOG.warn("Server " + server + " region CLOSE RPC returned false for " + + region.getRegionNameAsString()); + } catch (Throwable t) { Configuration conf = this.server.getConfiguration(); if (t instanceof RemoteException) { - t = ((RemoteException)t).unwrapRemoteException(); + t = ((RemoteException) t).unwrapRemoteException(); } boolean logRetries = true; - if (t instanceof RegionServerAbortedException - || t instanceof RegionServerStoppedException + if (t instanceof RegionServerAbortedException || t instanceof RegionServerStoppedException || t instanceof ServerNotRunningYetException) { // RS is aborting or stopping, we cannot offline the region since the region may need - // to do WAL recovery. Until we see the RS expiration, we should retry. + // to do WAL recovery. Until we see the RS expiration, we should retry. sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); } else if (t instanceof NotServingRegionException) { - LOG.debug("Offline " + region.getRegionNameAsString() - + ", it's not any more on " + server, t); + LOG.debug( + "Offline " + region.getRegionNameAsString() + ", it's not any more on " + server, t); if (transitionInZK) { deleteClosingOrClosedNode(region, server); } if (state != null) { regionOffline(region); } return; - } else if ((t instanceof FailedServerException) || (state != null && - t instanceof RegionAlreadyInTransitionException)) { - if (t instanceof FailedServerException) { - sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + } else if ((t instanceof FailedServerException) + || (state != null && t instanceof RegionAlreadyInTransitionException)) { + if (t instanceof FailedServerException) { + sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); - } else { - // RS is already processing this region, only need to update the timestamp - LOG.debug("update " + state + " the timestamp."); - state.updateTimestampToNow(); - if (maxWaitTime < 0) { - maxWaitTime = - EnvironmentEdgeManager.currentTime() - + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME, - DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); - } - long now = EnvironmentEdgeManager.currentTime(); - if (now < maxWaitTime) { - LOG.debug("Region is already in transition; " - + "waiting up to " + (maxWaitTime - now) + "ms", t); - sleepTime = 100; - i--; // reset the try count - logRetries = false; + } else { + // RS is already processing this region, only need to update the timestamp + LOG.debug("update " + state + " the timestamp."); + state.updateTimestampToNow(); + if (maxWaitTime < 0) { + maxWaitTime = EnvironmentEdgeManager.currentTime() + conf.getLong( + ALREADY_IN_TRANSITION_WAITTIME, DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); + } + long now = EnvironmentEdgeManager.currentTime(); + if (now < maxWaitTime) { + LOG.debug("Region is already in transition; " + "waiting up to " + + (maxWaitTime - now) + "ms", + t); + sleepTime = 100; + i--; // reset the try count + logRetries = false; + } + } } - } - } - - try { - if (sleepTime > 0) { - Thread.sleep(sleepTime); - } - } catch (InterruptedException ie) { - LOG.warn("Failed to unassign " - + region.getRegionNameAsString() + " since interrupted", ie); - Thread.currentThread().interrupt(); - if (state != null) { - regionStates.updateRegionState(region, State.FAILED_CLOSE); - } - return; - } if (logRetries) { - LOG.info("Server " + server + " returned " + t + " for " - + region.getRegionNameAsString() + ", try=" + i - + " of " + this.maximumAttempts, t); + LOG.info("Server " + server + " returned " + t + " for " + region.getRegionNameAsString() + + ", try=" + i + " of " + this.maximumAttempts, + t); // Presume retry or server will expire. } } + // If sleepTime is not set by any of the cases, set it to sleep for + // configured exponential backoff time + if (sleepTime == 0 && i != maximumAttempts) { Review comment: How about a more readable condition here? if (regionCloseFailed || anyOtherExceptionThrown) { // set these flags above } I think that makes the flow easier to understand. ########## File path: hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java ########## @@ -1995,97 +1995,100 @@ private void unassign(final HRegionInfo region, } return; } + long sleepTime = 0; try { // Send CLOSE RPC - if (serverManager.sendRegionClose(server, region, - versionOfClosingNode, dest, transitionInZK)) { - LOG.debug("Sent CLOSE to " + server + " for region " + - region.getRegionNameAsString()); + if (serverManager.sendRegionClose(server, region, versionOfClosingNode, dest, + transitionInZK)) { + LOG.debug("Sent CLOSE to " + server + " for region " + region.getRegionNameAsString()); if (useZKForAssignment && !transitionInZK && state != null) { // Retry to make sure the region is // closed so as to avoid double assignment. - unassign(region, state, versionOfClosingNode, - dest, transitionInZK, src); + unassign(region, state, versionOfClosingNode, dest, transitionInZK, src); } return; } // This never happens. Currently regionserver close always return true. // Todo; this can now happen (0.96) if there is an exception in a coprocessor - LOG.warn("Server " + server + " region CLOSE RPC returned false for " + - region.getRegionNameAsString()); - } catch (Throwable t) { - long sleepTime = 0; + LOG.warn("Server " + server + " region CLOSE RPC returned false for " + + region.getRegionNameAsString()); + } catch (Throwable t) { Configuration conf = this.server.getConfiguration(); if (t instanceof RemoteException) { - t = ((RemoteException)t).unwrapRemoteException(); + t = ((RemoteException) t).unwrapRemoteException(); } boolean logRetries = true; - if (t instanceof RegionServerAbortedException - || t instanceof RegionServerStoppedException + if (t instanceof RegionServerAbortedException || t instanceof RegionServerStoppedException || t instanceof ServerNotRunningYetException) { // RS is aborting or stopping, we cannot offline the region since the region may need - // to do WAL recovery. Until we see the RS expiration, we should retry. + // to do WAL recovery. Until we see the RS expiration, we should retry. sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); } else if (t instanceof NotServingRegionException) { - LOG.debug("Offline " + region.getRegionNameAsString() - + ", it's not any more on " + server, t); + LOG.debug( + "Offline " + region.getRegionNameAsString() + ", it's not any more on " + server, t); if (transitionInZK) { deleteClosingOrClosedNode(region, server); } if (state != null) { regionOffline(region); } return; - } else if ((t instanceof FailedServerException) || (state != null && - t instanceof RegionAlreadyInTransitionException)) { - if (t instanceof FailedServerException) { - sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, + } else if ((t instanceof FailedServerException) + || (state != null && t instanceof RegionAlreadyInTransitionException)) { + if (t instanceof FailedServerException) { + sleepTime = 1L + conf.getInt(RpcClient.FAILED_SERVER_EXPIRY_KEY, RpcClient.FAILED_SERVER_EXPIRY_DEFAULT); - } else { - // RS is already processing this region, only need to update the timestamp - LOG.debug("update " + state + " the timestamp."); - state.updateTimestampToNow(); - if (maxWaitTime < 0) { - maxWaitTime = - EnvironmentEdgeManager.currentTime() - + conf.getLong(ALREADY_IN_TRANSITION_WAITTIME, - DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); - } - long now = EnvironmentEdgeManager.currentTime(); - if (now < maxWaitTime) { - LOG.debug("Region is already in transition; " - + "waiting up to " + (maxWaitTime - now) + "ms", t); - sleepTime = 100; - i--; // reset the try count - logRetries = false; + } else { + // RS is already processing this region, only need to update the timestamp + LOG.debug("update " + state + " the timestamp."); + state.updateTimestampToNow(); + if (maxWaitTime < 0) { + maxWaitTime = EnvironmentEdgeManager.currentTime() + conf.getLong( + ALREADY_IN_TRANSITION_WAITTIME, DEFAULT_ALREADY_IN_TRANSITION_WAITTIME); + } + long now = EnvironmentEdgeManager.currentTime(); + if (now < maxWaitTime) { + LOG.debug("Region is already in transition; " + "waiting up to " + + (maxWaitTime - now) + "ms", + t); + sleepTime = 100; + i--; // reset the try count + logRetries = false; + } + } } - } - } - - try { - if (sleepTime > 0) { - Thread.sleep(sleepTime); - } - } catch (InterruptedException ie) { - LOG.warn("Failed to unassign " - + region.getRegionNameAsString() + " since interrupted", ie); - Thread.currentThread().interrupt(); - if (state != null) { - regionStates.updateRegionState(region, State.FAILED_CLOSE); - } - return; - } if (logRetries) { - LOG.info("Server " + server + " returned " + t + " for " - + region.getRegionNameAsString() + ", try=" + i - + " of " + this.maximumAttempts, t); + LOG.info("Server " + server + " returned " + t + " for " + region.getRegionNameAsString() + + ", try=" + i + " of " + this.maximumAttempts, + t); // Presume retry or server will expire. } } + // If sleepTime is not set by any of the cases, set it to sleep for + // configured exponential backoff time + if (sleepTime == 0 && i != maximumAttempts) { + sleepTime = backoffPolicy.getBackoffTime(retryConfig, i); + LOG.info("Waiting for " + sleepTime + "milliseconds exponential backoff time for " + + region.getRegionNameAsString() + " before next retry " + (i + 1) + " of " + + this.maximumAttempts); + } + try { + if (sleepTime > 0 && i != maximumAttempts) { + Thread.sleep(sleepTime); + } + } catch (InterruptedException ie) { + LOG.warn("Failed to unassign " + region.getRegionNameAsString() + " since interrupted", ie); + Thread.currentThread().interrupt(); + if (state != null) { Review comment: do this before propagating the interrupt flag? ########## File path: hbase-server/src/test/java/org/apache/hadoop/hbase/master/TestAssignmentManagerOnCluster.java ########## @@ -598,6 +601,68 @@ public void testCloseFailed() throws Exception { } } + /** + * This tests region close with exponential backoff + */ + @Test(timeout = 60000) + public void testCloseRegionWithExponentialBackOff() throws Exception { + String table = "testCloseRegionWithExponentialBackOff"; + // Set the backoff time between each retry for failed close + TEST_UTIL.getMiniHBaseCluster().getConf().setLong("hbase.assignment.retry.sleep.initial", 1000); + HMaster activeMaster = TEST_UTIL.getHBaseCluster().getMaster(); + TEST_UTIL.getMiniHBaseCluster().stopMaster(activeMaster.getServerName()); + TEST_UTIL.getMiniHBaseCluster().startMaster(); // restart the master for conf take into affect + + try { + ScheduledThreadPoolExecutor scheduledThreadPoolExecutor = + new ScheduledThreadPoolExecutor(1, Threads.newDaemonThreadFactory("ExponentialBackOff")); + + HTableDescriptor desc = new HTableDescriptor(TableName.valueOf(table)); + desc.addFamily(new HColumnDescriptor(FAMILY)); + admin.createTable(desc); + + Table meta = new HTable(conf, TableName.META_TABLE_NAME); + HRegionInfo hri = + new HRegionInfo(desc.getTableName(), Bytes.toBytes("A"), Bytes.toBytes("Z")); + MetaTableAccessor.addRegionToMeta(meta, hri); + + HMaster master = TEST_UTIL.getHBaseCluster().getMaster(); + AssignmentManager am = master.getAssignmentManager(); + assertTrue(TEST_UTIL.assignRegion(hri)); + ServerName sn = am.getRegionStates().getRegionServerOfRegion(hri); + TEST_UTIL.assertRegionOnServer(hri, sn, 6000); + + MyRegionObserver.preCloseEnabled.set(true); + // Unset the precloseEnabled flag after 1 second for next retry to succeed + scheduledThreadPoolExecutor.schedule(new Runnable() { + @Override + public void run() { + MyRegionObserver.preCloseEnabled.set(false); Review comment: Does the test reliably fail without the patch, meaning it ends up in FAILED_CLOSE? ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org