[ 
https://issues.apache.org/jira/browse/HBASE-23895?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17046270#comment-17046270
 ] 

Duo Zhang commented on HBASE-23895:
-----------------------------------

Oh, this is a serious problem; I hadn't considered this...

The call to HMaster may trigger a region insertion and then the timeout of the 
rpc call will affect the behavior of the insertion...

> STUCK Region-In-Transition when failed to insert procedure to procedure store
> -----------------------------------------------------------------------------
>
>                 Key: HBASE-23895
>                 URL: https://issues.apache.org/jira/browse/HBASE-23895
>             Project: HBase
>          Issue Type: Bug
>          Components: proc-v2, RegionProcedureStore
>            Reporter: Guanghao Zhang
>            Priority: Major
>             Fix For: 3.0.0, 2.3.0
>
>
> When moving a region, it will generate a TRSP first and set the procedure on 
> the region state node. But if submitting the TRSP fails, the procedure cannot 
> currently be unset and the region will be stuck in RIT.
> hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/AssignmentManager.java
> {code:java}
> public Future<byte[]> moveAsync(RegionPlan regionPlan) throws 
> HBaseIOException {
>     TransitRegionStateProcedure proc =
>       createMoveRegionProcedure(regionPlan.getRegionInfo(), 
> regionPlan.getDestination());
>     return 
> ProcedureSyncWait.submitProcedure(master.getMasterProcedureExecutor(), proc);
>   }
>   public TransitRegionStateProcedure createMoveRegionProcedure(RegionInfo 
> regionInfo,
>       ServerName targetServer) throws HBaseIOException {
>     RegionStateNode regionNode = 
> this.regionStates.getRegionStateNode(regionInfo);
>     if (regionNode == null) {
>       throw new UnknownRegionException("No RegionStateNode found for " +
>           regionInfo.getEncodedName() + "(Closed/Deleted?)");
>     }    
>     TransitRegionStateProcedure proc;
>     regionNode.lock();
>     try {
>       preTransitCheck(regionNode, STATES_EXPECTED_ON_UNASSIGN_OR_MOVE);
>       regionNode.checkOnline();
>       proc = TransitRegionStateProcedure.move(getProcedureEnvironment(), 
> regionInfo, targetServer);
>       regionNode.setProcedure(proc);
>     } finally {
>       regionNode.unlock();
>     }    
>     return proc;
>   }
> {code}
> hbase-server/src/main/java/org/apache/hadoop/hbase/master/assignment/RegionStateNode.java
> {code:java}
>   public void setProcedure(TransitRegionStateProcedure proc) {
>     assert this.procedure == null;
>     this.procedure = proc;
>     ritMap.put(regionInfo, this);
>   }
>   public void unsetProcedure(TransitRegionStateProcedure proc) {
>     assert this.procedure == proc;
>     this.procedure = null;
>     ritMap.remove(regionInfo, this);
>   } 
> {code}
> {code:java}
> 2020-02-26,13:45:21,344 ERROR 
> [RpcServer.default.RWQ.Fifo.read.handler=437,queue=5,port=21500] 
> org.apache.hadoop.hbase.ipc.RpcServer: Unexpected throwable object
> java.io.UncheckedIOException: 
> org.apache.hadoop.hbase.exceptions.TimeoutIOException: Timed out waiting for 
> lock for row: \x00\x00\x00\x00\x00\x0B\xAB\xD2 in region 
> 9731aea823e7f83264b14713ae486fb7
>         at 
> org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.update(RegionProcedureStore.java:588)
>         at 
> org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.insert(RegionProcedureStore.java:545)
>         at 
> org.apache.hadoop.hbase.procedure2.ProcedureExecutor.submitProcedure(ProcedureExecutor.java:1042)
>         at 
> org.apache.hadoop.hbase.procedure2.ProcedureExecutor.submitProcedure(ProcedureExecutor.java:860)
>         at 
> org.apache.hadoop.hbase.master.procedure.ProcedureSyncWait.submitProcedure(ProcedureSyncWait.java:123)
>         at 
> org.apache.hadoop.hbase.master.assignment.AssignmentManager.moveAsync(AssignmentManager.java:657)
>         at 
> org.apache.hadoop.hbase.master.HMaster.executeRegionPlansWithThrottling(HMaster.java:1793)
>         at org.apache.hadoop.hbase.master.HMaster.balance(HMaster.java:1761)
>         at 
> org.apache.hadoop.hbase.master.MasterRpcServices.balance(MasterRpcServices.java:654)
>         at 
> org.apache.hadoop.hbase.shaded.protobuf.generated.MasterProtos$MasterService$2.callBlockingMethod(MasterProtos.java)
>         at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:374)
>         at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:135)
>         at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:352)
>         at 
> org.apache.hadoop.hbase.ipc.RpcExecutor$Handler.run(RpcExecutor.java:332)
> Caused by: org.apache.hadoop.hbase.exceptions.TimeoutIOException: Timed out 
> waiting for lock for row: \x00\x00\x00\x00\x00\x0B\xAB\xD2 in region 
> 9731aea823e7f83264b14713ae486fb7
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.getRowLockInternal(HRegion.java:6158)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion$BatchOperation.lockRowsAndBuildMiniBatch(HRegion.java:3488)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.doMiniBatchMutate(HRegion.java:4235)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4208)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4134)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4125)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(HRegion.java:4139)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.doBatchMutate(HRegion.java:4511)
>         at org.apache.hadoop.hbase.regionserver.HRegion.put(HRegion.java:3209)
>         at 
> org.apache.hadoop.hbase.procedure2.store.region.RegionProcedureStore.update(RegionProcedureStore.java:584)
>         ... 13 more
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to