saintstack commented on a change in pull request #1305: HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in tea… URL: https://github.com/apache/hbase/pull/1305#discussion_r395161763
########## File path: hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java ########## @@ -76,75 +79,75 @@ private HRegionServer getServer() { @Override public void process() throws IOException { - HRegionServer rs = getServer(); - byte[] encodedNameBytes = Bytes.toBytes(encodedName); - Boolean previous = rs.getRegionsInTransitionInRS().putIfAbsent(encodedNameBytes, Boolean.FALSE); - if (previous != null) { - if (previous) { - // This could happen as we will update the region state to OPEN when calling - // reportRegionStateTransition, so the HMaster will think the region is online, before we - // actually open the region, as reportRegionStateTransition is part of the opening process. - long backoff = retryCounter.getBackoffTimeAndIncrementAttempts(); - LOG.warn("Received CLOSE for the region: {}, which we are already " + - "trying to OPEN. try again after {}ms", encodedName, backoff); - rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS); - } else { - LOG.info("Received CLOSE for the region: {}, which we are already trying to CLOSE," + - " but not completed yet", encodedName); + HRegionServer rss = getServer(); + try { + byte[] encodedRegionNameAsBytes = Bytes.toBytes(this.encodedRegionName); + Boolean previous = rss.getRegionsInTransitionInRS(). + putIfAbsent(encodedRegionNameAsBytes, Boolean.FALSE); + if (previous != null) { + if (previous) { + // This could happen as we will update the region state to OPEN when calling + // reportRegionStateTransition, so the HMaster will think the region is online, before + // we actually open the region, as reportRegionStateTransition is part of the opening + // process. + long backoff = this.retryCounter.getBackoffTimeAndIncrementAttempts(); + LOG.warn("Received CLOSE for {} which is being OPENED; try again after {}ms", + encodedRegionNameAsBytes, backoff); + rss.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS); + } else { + LOG.info("Received CLOSE for {}, which is CLOSING but not complete yet", + encodedRegionNameAsBytes); + } + return; } - return; - } - HRegion region = rs.getRegion(encodedName); - if (region == null) { - LOG.debug( - "Received CLOSE for a region {} which is not online, and we're not opening/closing.", - encodedName); - rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE); - return; - } - String regionName = region.getRegionInfo().getEncodedName(); - LOG.info("Close {}", regionName); - if (region.getCoprocessorHost() != null) { - // XXX: The behavior is a bit broken. At master side there is no FAILED_CLOSE state, so if - // there are exception thrown from the CP, we can not report the error to master, and if here - // we just return without calling reportRegionStateTransition, the TRSP at master side will - // hang there for ever. So here if the CP throws an exception out, the only way is to abort - // the RS... - region.getCoprocessorHost().preClose(abort); - } - if (region.close(abort) == null) { - // XXX: Is this still possible? The old comment says about split, but now split is done at - // master side, so... - LOG.warn("Can't close region {}, was already closed during close()", regionName); - rs.getRegionsInTransitionInRS().remove(encodedNameBytes, Boolean.FALSE); - return; - } - rs.removeRegion(region, destination); - if (!rs.reportRegionStateTransition(new RegionStateTransitionContext(TransitionCode.CLOSED, - HConstants.NO_SEQNUM, closeProcId, -1, region.getRegionInfo()))) { - throw new IOException("Failed to report close to master: " + regionName); + HRegion region = rss.getRegion(this.encodedRegionName); + // Can't unassign a Region that is not online. + if (region == null) { + LOG.debug("Received CLOSE for {} which is not online, and we're not opening/closing.", + this.encodedRegionName); + // Remove what we added above. + rss.getRegionsInTransitionInRS().remove(encodedRegionNameAsBytes, Boolean.FALSE); + return; + } + CloseRegionHandler crh = new CloseRegionHandler(rss, region.getRegionInfo(), this.abort, + this.destination) { + @Override boolean reportRegionStateTransition() throws IOException { + boolean b = super.reportRegionStateTransition(); + if (!b) { + String name = getRegionInfo().getRegionNameAsString(); + throw new IOException("Failed report close of " + name + " to master"); + } + return b; + } + + @Override long getClosePID() { + return UnassignRegionHandler.this.closeProcId; + } + }; + crh.process(); + } finally { + // Cache the close region procedure id + rss.finishRegionProcedure(closeProcId); Review comment: This is the old URH code: https://github.com/saintstack/hbase/blob/master/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/UnassignRegionHandler.java Any exception thrown calling the CP preclose or running the region close would exit the URH#process, get caught by the executor run method and then fed to the URH#handleException which would abort the RS. This refactor does not change this behavior or add to it; it just adds the cleanup of getRegionsInTransitionInRS references on exception -- of less use -- but on success too. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services