ndimiduk commented on a change in pull request #1305: HBASE-23984 [Flakey Tests] TestMasterAbortAndRSGotKilled fails in tea…
URL: https://github.com/apache/hbase/pull/1305#discussion_r394694740
##########
File path: hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/handler/AssignRegionHandler.java
##########

@@ -105,50 +102,54 @@ public void process() throws IOException {
     if (previous != null) {
       if (previous) {
         // The region is opening and this maybe a retry on the rpc call, it is safe to ignore it.
-        LOG.info("Receiving OPEN for the region:{}, which we are already trying to OPEN" +
-          " - ignoring this new request for this region.", regionName);
+        LOG.info("Receiving OPEN for {} which is OPENING; ignoring request", regionName);
       } else {
         // The region is closing. This is possible as we will update the region state to CLOSED when
         // calling reportRegionStateTransition, so the HMaster will think the region is offline,
         // before we actually close the region, as reportRegionStateTransition is part of the
         // closing process.
         long backoff = retryCounter.getBackoffTimeAndIncrementAttempts();
-        LOG.info(
-          "Receiving OPEN for the region:{}, which we are trying to close, try again after {}ms",
+        LOG.info("Receiving OPEN for {} which we are trying to CLOSE, try again after {}ms",
           regionName, backoff);
         rs.getExecutorService().delayedSubmit(this, backoff, TimeUnit.MILLISECONDS);
       }
       return;
     }
     LOG.info("Open {}", regionName);
     HRegion region;
+    // Start a try. Above we registered region with rs.getRegionsInTransitionInRS. Need to clear
+    // it on success or error on the end.
     try {
-      TableDescriptor htd =
-        tableDesc != null ? tableDesc : rs.getTableDescriptors().get(regionInfo.getTable());
-      if (htd == null) {
-        throw new IOException("Missing table descriptor for " + regionName);
+      try {
+        TableDescriptor htd =
+          tableDesc != null ? tableDesc : rs.getTableDescriptors().get(regionInfo.getTable());
+        if (htd == null) {
+          throw new IOException("Missing table descriptor for " + regionName);
+        }
+        // pass null for the last parameter, which used to be a CancelableProgressable, as now the
+        // opening can not be interrupted by a close request any more.
+        region = HRegion
+          .openHRegion(regionInfo, htd, rs.getWAL(regionInfo), rs.getConfiguration(), rs, null);
+      } catch (IOException e) {
+        cleanUpAndReportFailure(e);
+        return;
       }
-      // pass null for the last parameter, which used to be a CancelableProgressable, as now the
-      // opening can not be interrupted by a close request any more.
-      region = HRegion.openHRegion(regionInfo, htd, rs.getWAL(regionInfo), rs.getConfiguration(),
-        rs, null);
-    } catch (IOException e) {
-      cleanUpAndReportFailure(e);
-      return;
-    }
-    rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime));
-    rs.addRegion(region);
-    LOG.info("Opened {}", regionName);
-    // Cache the open region procedure id after report region transition succeed.
-    rs.finishRegionProcedure(openProcId);
-    Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
-    if (current == null) {
-      // Should NEVER happen, but let's be paranoid.
-      LOG.error("Bad state: we've just opened a region that was NOT in transition. Region={}",
-        regionName);
-    } else if (!current) {
-      // Should NEVER happen, but let's be paranoid.
-      LOG.error("Bad state: we've just opened a region that was closing. Region={}", regionName);
+      rs.postOpenDeployTasks(new PostOpenDeployContext(region, openProcId, masterSystemTime));
+      rs.addRegion(region);
+      LOG.info("Opened {}", regionName);
+    } finally {
+      // Remove from regionsInTransitionRS whether success or failure.
+      rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes(), Boolean.TRUE);
+      Boolean current = rs.getRegionsInTransitionInRS().remove(regionInfo.getEncodedNameAsBytes());
+      if (current == null) {
+        // Should NEVER happen, but let's be paranoid.

Review comment:
   this is a lie. there's nothing stopping multiple of these from executing concurrently, is there? the only guarantee our data structures give us is that changes to the RIT map are atomic. or do these handlers operate under a region lock at a higher level? I haven't noticed one during this review. Based on the above thinking, this should be logged at DEBUG, not ERROR.
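
For readers following the atomicity point: a concurrent map only guarantees that each individual operation (put, remove, remove-if-mapped-to-value) is atomic; it does nothing to stop two handlers for the same region from interleaving. The sketch below is a minimal, self-contained illustration of that distinction, not the actual HBase code. The String-keyed map and the region name are hypothetical stand-ins for rs.getRegionsInTransitionInRS(), which uses different key and lookup types.

    import java.util.concurrent.ConcurrentHashMap;
    import java.util.concurrent.ConcurrentMap;

    public class RitMapSemantics {
      public static void main(String[] args) {
        // Hypothetical stand-in for the regions-in-transition map:
        // Boolean.TRUE means "opening", Boolean.FALSE means "closing".
        ConcurrentMap<String, Boolean> regionsInTransition = new ConcurrentHashMap<>();
        regionsInTransition.put("encodedRegionName", Boolean.TRUE);

        // remove(key, value) is atomic: the entry is removed only if it is still
        // mapped to the expected value, and the return value says whether it was.
        boolean removedAsOpening =
          regionsInTransition.remove("encodedRegionName", Boolean.TRUE);
        System.out.println("removed while marked OPENING? " + removedAsOpening); // true

        // A second handler (or a second remove, as in the patch above) now sees
        // no entry at all: the one-arg remove returns null when another thread,
        // or an earlier call, already cleared it.
        Boolean current = regionsInTransition.remove("encodedRegionName");
        System.out.println("second remove saw: " + current); // null
      }
    }

Under that model, finding the entry already gone (or mapped to the other value) is an expected race between concurrent handlers rather than a corrupted state, which is the argument for logging those branches at DEBUG instead of ERROR.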