[ https://issues.apache.org/jira/browse/HBASE-11380?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14038199#comment-14038199 ]
Enis Soztutar commented on HBASE-11380: --------------------------------------- From what I remember, HRegion#processRowsWithLocks will only be called for operations that have a RowProcessor, which we only use for meta edits (MultiRowMutationProcessor). We should do the patch, but that might not be the root cause. [~ccondit] you see this on a regular table? Are you using any coprocessors? > HRegion lock object is not being released properly, leading to snapshot > failure > ------------------------------------------------------------------------------- > > Key: HBASE-11380 > URL: https://issues.apache.org/jira/browse/HBASE-11380 > Project: HBase > Issue Type: Bug > Components: regionserver > Affects Versions: 0.98.3 > Reporter: Craig Condit > Attachments: 11380-v1.txt > > > Background: > We are attempting to create ~ 750 table snapshots on a nightly basis for use > in MR jobs. The jobs are run in batches, with a maximum of around 20 jobs > running simultaneously. > We have started to see the following in our region server logs (after < 1 day > uptime): > {noformat} > java.lang.Error: Maximum lock count exceeded > at > java.util.concurrent.locks.ReentrantReadWriteLock$Sync.fullTryAcquireShared(ReentrantReadWriteLock.java:531) > at > java.util.concurrent.locks.ReentrantReadWriteLock$Sync.tryAcquireShared(ReentrantReadWriteLock.java:491) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1326) > at > java.util.concurrent.locks.ReentrantReadWriteLock$ReadLock.tryLock(ReentrantReadWriteLock.java:873) > at org.apache.hadoop.hbase.regionserver.HRegion.lock(HRegion.java:5904) > at org.apache.hadoop.hbase.regionserver.HRegion.lock(HRegion.java:5891) > at > org.apache.hadoop.hbase.regionserver.HRegion.startRegionOperation(HRegion.java:5798) > at > org.apache.hadoop.hbase.regionserver.HRegion.startRegionOperation(HRegion.java:5761) > at > 
org.apache.hadoop.hbase.regionserver.HRegion.processRowsWithLocks(HRegion.java:4891) > at > org.apache.hadoop.hbase.regionserver.HRegion.mutateRowsWithLocks(HRegion.java:4856) > at > org.apache.hadoop.hbase.regionserver.HRegion.mutateRowsWithLocks(HRegion.java:4838) > at > org.apache.hadoop.hbase.regionserver.HRegion.mutateRow(HRegion.java:4829) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.mutateRows(HRegionServer.java:4390) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.multi(HRegionServer.java:3362) > at > org.apache.hadoop.hbase.protobuf.generated.ClientProtos$ClientService$2.callBlockingMethod(ClientProtos.java:29503) > at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2012) > at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:98) > at > org.apache.hadoop.hbase.ipc.SimpleRpcScheduler.consumerLoop(SimpleRpcScheduler.java:168) > at > org.apache.hadoop.hbase.ipc.SimpleRpcScheduler.access$000(SimpleRpcScheduler.java:39) > at > org.apache.hadoop.hbase.ipc.SimpleRpcScheduler$1.run(SimpleRpcScheduler.java:111) > at java.lang.Thread.run(Thread.java:744) > {noformat} > Not sure of the cause, but the result is that snapshots cannot be created. We > see this in our client logs: > {noformat} > Exception in thread "main" > org.apache.hadoop.hbase.snapshot.HBaseSnapshotException: > org.apache.hadoop.hbase.snapshot.HBaseSnapshotException: Snapshot { > ss=test-snapshot-20140619143753294 table=test type=FLUSH } had an error. 
> Procedure test-snapshot-20140619143753294 { > waiting=[p3plpadata038.internal,60020,1403140682587, > p3plpadata056.internal,60020,1403140865123, > p3plpadata072.internal,60020,1403141022569] > done=[p3plpadata023.internal,60020,1403140552227, > p3plpadata009.internal,60020,1403140487826] } > at > org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:342) > at > org.apache.hadoop.hbase.master.HMaster.isSnapshotDone(HMaster.java:2907) > at > org.apache.hadoop.hbase.protobuf.generated.MasterProtos$MasterService$2.callBlockingMethod(MasterProtos.java:40494) > at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2012) > at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:98) > at > org.apache.hadoop.hbase.ipc.FifoRpcScheduler$1.run(FifoRpcScheduler.java:73) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) > at java.util.concurrent.FutureTask.run(FutureTask.java:262) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:744) > Caused by: > org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable via > p3plpadata060.internal,60020,1403140935958:org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable: > > at > org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher.rethrowException(ForeignExceptionDispatcher.java:83) > at > org.apache.hadoop.hbase.master.snapshot.TakeSnapshotHandler.rethrowExceptionIfFailed(TakeSnapshotHandler.java:320) > at > org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:332) > ... 
10 more > Caused by: > org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable: > at > org.apache.hadoop.hbase.procedure.Subprocedure.cancel(Subprocedure.java:270) > at > org.apache.hadoop.hbase.procedure.ProcedureMember.submitSubprocedure(ProcedureMember.java:171) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs.startNewSubprocedure(ZKProcedureMemberRpcs.java:214) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs.waitForNewProcedures(ZKProcedureMemberRpcs.java:172) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs.access$100(ZKProcedureMemberRpcs.java:55) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs$1.nodeChildrenChanged(ZKProcedureMemberRpcs.java:107) > at > org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.process(ZooKeeperWatcher.java:348) > at > org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:522) > at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498) > at sun.reflect.GeneratedConstructorAccessor17.newInstance(Unknown > Source) > at > sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) > at java.lang.reflect.Constructor.newInstance(Constructor.java:526) > at > org.apache.hadoop.ipc.RemoteException.instantiateException(RemoteException.java:106) > at > org.apache.hadoop.ipc.RemoteException.unwrapRemoteException(RemoteException.java:95) > at > org.apache.hadoop.hbase.client.RpcRetryingCaller.translateException(RpcRetryingCaller.java:207) > at > org.apache.hadoop.hbase.client.RpcRetryingCaller.translateException(RpcRetryingCaller.java:221) > at > org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:121) > at > org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:90) > at > org.apache.hadoop.hbase.client.HBaseAdmin.executeCallable(HBaseAdmin.java:3327) > at > org.apache.hadoop.hbase.client.HBaseAdmin.snapshot(HBaseAdmin.java:2722) > 
at > org.apache.hadoop.hbase.client.HBaseAdmin.snapshot(HBaseAdmin.java:2655) > at > org.apache.hadoop.hbase.client.HBaseAdmin.snapshot(HBaseAdmin.java:2596) > at > [SNIP] > Caused by: > org.apache.hadoop.hbase.ipc.RemoteWithExtrasException(org.apache.hadoop.hbase.snapshot.HBaseSnapshotException): > org.apache.hadoop.hbase.snapshot.HBaseSnapshotException: Snapshot { > ss=test-snapshot-20140619143753294 table=test type=FLUSH } had an error. > Procedure test-snapshot-20140619143753294 { > waiting=[p3plpadata038.internal,60020,1403140682587, > p3plpadata056.internal,60020,1403140865123, > p3plpadata072.internal,60020,1403141022569] > done=[p3plpadata023.internal,60020,1403140552227, > p3plpadata009.internal,60020,1403140487826] } > at > org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:342) > at > org.apache.hadoop.hbase.master.HMaster.isSnapshotDone(HMaster.java:2907) > at > org.apache.hadoop.hbase.protobuf.generated.MasterProtos$MasterService$2.callBlockingMethod(MasterProtos.java:40494) > at org.apache.hadoop.hbase.ipc.RpcServer.call(RpcServer.java:2012) > at org.apache.hadoop.hbase.ipc.CallRunner.run(CallRunner.java:98) > at > org.apache.hadoop.hbase.ipc.FifoRpcScheduler$1.run(FifoRpcScheduler.java:73) > at > java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:471) > at java.util.concurrent.FutureTask.run(FutureTask.java:262) > at > java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615) > at java.lang.Thread.run(Thread.java:744) > Caused by: > org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable via > p3plpadata060.internal,60020,1403140935958:org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable: > > at > org.apache.hadoop.hbase.errorhandling.ForeignExceptionDispatcher.rethrowException(ForeignExceptionDispatcher.java:83) > at > 
org.apache.hadoop.hbase.master.snapshot.TakeSnapshotHandler.rethrowExceptionIfFailed(TakeSnapshotHandler.java:320) > at > org.apache.hadoop.hbase.master.snapshot.SnapshotManager.isSnapshotDone(SnapshotManager.java:332) > ... 10 more > Caused by: > org.apache.hadoop.hbase.errorhandling.ForeignException$ProxyThrowable: > at > org.apache.hadoop.hbase.procedure.Subprocedure.cancel(Subprocedure.java:270) > at > org.apache.hadoop.hbase.procedure.ProcedureMember.submitSubprocedure(ProcedureMember.java:171) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs.startNewSubprocedure(ZKProcedureMemberRpcs.java:214) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs.waitForNewProcedures(ZKProcedureMemberRpcs.java:172) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs.access$100(ZKProcedureMemberRpcs.java:55) > at > org.apache.hadoop.hbase.procedure.ZKProcedureMemberRpcs$1.nodeChildrenChanged(ZKProcedureMemberRpcs.java:107) > at > org.apache.hadoop.hbase.zookeeper.ZooKeeperWatcher.process(ZooKeeperWatcher.java:348) > at > org.apache.zookeeper.ClientCnxn$EventThread.processEvent(ClientCnxn.java:522) > at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:498) > at org.apache.hadoop.hbase.ipc.RpcClient.call(RpcClient.java:1453) > at > org.apache.hadoop.hbase.ipc.RpcClient.callBlockingMethod(RpcClient.java:1657) > at > org.apache.hadoop.hbase.ipc.RpcClient$BlockingRpcChannelImplementation.callBlockingMethod(RpcClient.java:1715) > at > org.apache.hadoop.hbase.protobuf.generated.MasterProtos$MasterService$BlockingStub.isSnapshotDone(MasterProtos.java:42861) > at > org.apache.hadoop.hbase.client.HConnectionManager$HConnectionImplementation$5.isSnapshotDone(HConnectionManager.java:2048) > at > org.apache.hadoop.hbase.client.HBaseAdmin$24.call(HBaseAdmin.java:2725) > at > org.apache.hadoop.hbase.client.HBaseAdmin$24.call(HBaseAdmin.java:2722) > at > 
org.apache.hadoop.hbase.client.RpcRetryingCaller.callWithRetries(RpcRetryingCaller.java:114) > ... 16 more > {noformat} > -- This message was sent by Atlassian JIRA (v6.2#6252)