Enis Soztutar created PHOENIX-2667:
--------------------------------------

             Summary: Race condition between IndexBuilder and Split for region 
lock
                 Key: PHOENIX-2667
                 URL: https://issues.apache.org/jira/browse/PHOENIX-2667
             Project: Phoenix
          Issue Type: Bug
            Reporter: Enis Soztutar


In a production cluster, we have seen a condition where the split did not 
finish for 30+ minutes. Also due to this, no request was being serviced in this 
time frame affectively making the region offline. 

The jstack provides 3 types of threads waiting on the regions read or write 
locks. 
First, the handlers are all blocked on trying to acquire the read lock on the 
region in multi(), most of the handlers are like this:
{code}
Thread 2328: (state = BLOCKED)
 - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may 
be imprecise)
 - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) 
@bci=20, line=226 (Compiled frame)
 - 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedNanos(int, 
long) @bci=122, line=1033 (Compiled frame)
 - 
java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(int,
 long) @bci=25, line=1326 (Compiled frame)
 - java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.tryLock(long, 
java.util.concurrent.TimeUnit) @bci=10, line=873 (Compiled frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock,
 int) @bci=27, line=7754 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock)
 @bci=3, line=7741 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.startRegionOperation(org.apache.hadoop.hbase.regionserver.Region$Operation)
 @bci=211, line=7650 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(org.apache.hadoop.hbase.regionserver.HRegion$BatchOperationInProgress)
 @bci=21, line=2803 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.batchMutate(org.apache.hadoop.hbase.client.Mutation[],
 long, long) @bci=12, line=2760 (Compiled frame)
 - 
org.apache.hadoop.hbase.regionserver.RSRpcServices.doBatchOp(org.apache.hadoop.hbase.protobuf.generated.ClientProtos$RegionActionResult$Builder,
 org.apache.hadoop.hbase.regionserver.Region, org.apache.
 - 
org.apache.hadoop.hbase.regionserver.RSRpcServices.doNonAtomicRegionMutation(org.apache.hadoop.hbase.regionserver.Region,
 org.apache.hadoop.hbase.quotas.OperationQuota, org.apache.hadoop.hbase.protobuf
 - 
org.apache.hadoop.hbase.regionserver.RSRpcServices.multi(com.google.protobuf.RpcController,
 org.apache.hadoop.hbase.protobuf.generated.ClientProtos$MultiRequest) 
@bci=407, line=2032 (Compiled frame)
 - 
org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(com.google.protobuf.Descriptors$MethodDescriptor,
 com.google.protobuf.RpcController, com.google.protobuf.Messa
 - 
org.apache.hadoop.hbase.ipc.RpcServer.call(com.google.protobuf.BlockingService, 
com.google.protobuf.Descriptors$MethodDescriptor, com.google.protobuf.Message, 
org.apache.hadoop.hbase.CellScanner, long,
 - org.apache.hadoop.hbase.ipc.CallRunner.run() @bci=345, line=101 (Compiled 
frame)
 - 
org.apache.hadoop.hbase.ipc.RpcExecutor.consumerLoop(java.util.concurrent.BlockingQueue)
 @bci=54, line=130 (Compiled frame)
 - org.apache.hadoop.hbase.ipc.RpcExecutor$1.run() @bci=20, line=107 
(Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)
{code}

second, the IndexBuilder threads from Phoenix index are also blocked waiting on 
the region read locks: 
{code}
Thread 17566: (state = BLOCKED)
 - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may 
be imprecise)
 - java.util.concurrent.locks.LockSupport.parkNanos(java.lang.Object, long) 
@bci=20, line=226 (Compiled frame)
 - 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedNanos(int, 
long) @bci=122, line=1033 (Compiled frame)
 - 
java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(int,
 long) @bci=25, line=1326 (Compiled frame)
 - java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.tryLock(long, 
java.util.concurrent.TimeUnit) @bci=10, line=873 (Compiled frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock,
 int) @bci=27, line=7754 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.lock(java.util.concurrent.locks.Lock)
 @bci=3, line=7741 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.startRegionOperation(org.apache.hadoop.hbase.regionserver.Region$Operation)
 @bci=211, line=7650 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.getScanner(org.apache.hadoop.hbase.client.Scan,
 java.util.List) @bci=4, line=2484 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.HRegion.getScanner(org.apache.hadoop.hbase.client.Scan)
 @bci=3, line=2479 (Compiled frame)
 - 
org.apache.phoenix.hbase.index.covered.data.LocalTable.getCurrentRowState(org.apache.hadoop.hbase.client.Mutation,
 java.util.Collection) @bci=43, line=63 (Compiled frame)
 - 
org.apache.phoenix.hbase.index.covered.LocalTableState.ensureLocalStateInitialized(java.util.Collection)
 @bci=29, line=158 (Compiled frame)
 - 
org.apache.phoenix.hbase.index.covered.LocalTableState.getIndexedColumnsTableState(java.util.Collection)
 @bci=2, line=126 (Compiled frame)
 - 
org.apache.phoenix.index.PhoenixIndexCodec.getIndexUpdates(org.apache.phoenix.hbase.index.covered.TableState,
 boolean) @bci=213, line=162 (Compiled frame)
 - 
org.apache.phoenix.index.PhoenixIndexCodec.getIndexDeletes(org.apache.phoenix.hbase.index.covered.TableState)
 @bci=3, line=120 (Compiled frame)
 - 
org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.addDeleteUpdatesToMap(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager,
 org.apache.phoenix.hbase.index.covered.LocalTab
 - 
org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.addCleanupForCurrentBatch(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager,
 long, org.apache.phoenix.hbase.index.covere
 - 
org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.addMutationsForBatch(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager,
 org.apache.phoenix.hbase.index.covered.Batch, or
 - 
org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.batchMutationAndAddUpdates(org.apache.phoenix.hbase.index.covered.update.IndexUpdateManager,
 org.apache.hadoop.hbase.client.Mutation) @
 - 
org.apache.phoenix.hbase.index.covered.CoveredColumnsIndexBuilder.getIndexUpdate(org.apache.hadoop.hbase.client.Mutation)
 @bci=11, line=99 (Compiled frame)
 - org.apache.phoenix.hbase.index.builder.IndexBuildManager$1.call() @bci=11, 
line=133 (Compiled frame)
 - org.apache.phoenix.hbase.index.builder.IndexBuildManager$1.call() @bci=1, 
line=129 (Compiled frame)
 - java.util.concurrent.FutureTask.run() @bci=42, line=262 (Compiled frame)
 - 
java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker)
 @bci=95, line=1145 (Compiled frame)
 - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 
(Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)
{code}

Finally, the splitting of the region is being executed, which is trying to 
close the region. This operation acquires the regions write lock, but it is 
also blocked: 
{code}
Thread 18417: (state = BLOCKED)
 - sun.misc.Unsafe.park(boolean, long) @bci=0 (Compiled frame; information may 
be imprecise)
 - java.util.concurrent.locks.LockSupport.park(java.lang.Object) @bci=14, 
line=186 (Compiled frame)
 - 
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt() 
@bci=1, line=834 (Compiled frame)
 - 
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(java.util.concurrent.locks.AbstractQueuedSynchronizer$Node,
 int) @bci=67, line=867 (Compiled frame)
 - java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(int) @bci=17, 
line=1197 (Compiled frame)
 - java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lock() @bci=5, 
line=945 (Interpreted frame)
 - org.apache.hadoop.hbase.regionserver.HRegion.doClose(boolean, 
org.apache.hadoop.hbase.monitoring.MonitoredTask) @bci=278, line=1401 
(Interpreted frame)
 - org.apache.hadoop.hbase.regionserver.HRegion.close(boolean) @bci=62, 
line=1349 (Interpreted frame)
 - 
org.apache.hadoop.hbase.regionserver.SplitTransactionImpl.stepsBeforePONR(org.apache.hadoop.hbase.Server,
 org.apache.hadoop.hbase.regionserver.RegionServerServices, boolean) @bci=238, 
line=394 (Interpr
 - 
org.apache.hadoop.hbase.regionserver.SplitTransactionImpl.createDaughters(org.apache.hadoop.hbase.Server,
 org.apache.hadoop.hbase.regionserver.RegionServerServices, 
org.apache.hadoop.hbase.security.Use
 - 
org.apache.hadoop.hbase.regionserver.SplitTransactionImpl.execute(org.apache.hadoop.hbase.Server,
 org.apache.hadoop.hbase.regionserver.RegionServerServices, 
org.apache.hadoop.hbase.security.User) @bci=
 - 
org.apache.hadoop.hbase.regionserver.SplitRequest.doSplitting(org.apache.hadoop.hbase.security.User)
 @bci=357, line=82 (Interpreted frame)
 - org.apache.hadoop.hbase.regionserver.SplitRequest.run() @bci=74, line=154 
(Interpreted frame)
 - 
java.util.concurrent.ThreadPoolExecutor.runWorker(java.util.concurrent.ThreadPoolExecutor$Worker)
 @bci=95, line=1145 (Compiled frame)
 - java.util.concurrent.ThreadPoolExecutor$Worker.run() @bci=5, line=615 
(Interpreted frame)
 - java.lang.Thread.run() @bci=11, line=745 (Interpreted frame)
{code}

This looks like a race between the split thread and the global index updates 
racing for the region lock.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to