[
https://issues.apache.org/jira/browse/HBASE-659?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=12601247#action_12601247
]
stack commented on HBASE-659:
-----------------------------
Here's the thread dump on exit. Gives a clean view of the deadlock.
Regionserver will not go down when we have this condition, not without a kill
-9:
{code}
Full thread dump Java HotSpot(TM) 64-Bit Server VM (1.5.0_13-b05 mixed mode):
"Thread-8" prio=1 tid=0x00002aab3c696e60 nid=0x38db in Object.wait()
[0x0000000042f7f000..0x0000000042f7fc80]
at java.lang.Object.wait(Native Method)
- waiting on <0x00002aaab69a2e90> (a
org.apache.hadoop.hbase.HRegionServer$CompactSplitThread)
at java.lang.Thread.join(Unknown Source)
- locked <0x00002aaab69a2e90> (a
org.apache.hadoop.hbase.HRegionServer$CompactSplitThread)
at java.lang.Thread.join(Unknown Source)
at org.apache.hadoop.hbase.HRegionServer.join(HRegionServer.java:1081)
at org.apache.hadoop.hbase.HRegionServer.join(HRegionServer.java:1074)
at
org.apache.hadoop.hbase.HRegionServer$ShutdownThread.run(HRegionServer.java:158)
"SIGTERM handler" daemon prio=1 tid=0x00002aab37d61eb0 nid=0x38d8 in
Object.wait() [0x0000000043282000..0x0000000043282e00]
at java.lang.Object.wait(Native Method)
- waiting on <0x00002aaab693ad28> (a
org.apache.hadoop.hbase.HRegionServer$ShutdownThread)
at java.lang.Thread.join(Unknown Source)
- locked <0x00002aaab693ad28> (a
org.apache.hadoop.hbase.HRegionServer$ShutdownThread)
at java.lang.Thread.join(Unknown Source)
at java.lang.Shutdown.runHooks(Unknown Source)
at java.lang.Shutdown.sequence(Unknown Source)
at java.lang.Shutdown.exit(Unknown Source)
- locked <0x00002aaab1bdced8> (a java.lang.Class)
at java.lang.Terminator$1.handle(Unknown Source)
at sun.misc.Signal$1.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
"IPC Client connection to coral-dfs.cluster.powerset.com/208.76.44.135:10000"
daemon prio=1 tid=0x00002aab3c5157d0 nid=0x7db5 runnable
[0x0000000043585000..0x0000000043585c80]
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.read(Unknown Source)
at java.io.FilterInputStream.read(Unknown Source)
at org.apache.hadoop.ipc.Client$Connection$1.read(Client.java:190)
at java.io.BufferedInputStream.fill(Unknown Source)
at java.io.BufferedInputStream.read(Unknown Source)
- locked <0x00002aaab6ef97c8> (a java.io.BufferedInputStream)
at java.io.DataInputStream.readInt(Unknown Source)
at org.apache.hadoop.ipc.Client$Connection.run(Client.java:276)
"regionserver/0:0:0:0:0:0:0:0:60020.compactor" daemon prio=1
tid=0x00002aab381e5fd0 nid=0x6195 waiting on condition
[0x0000000041c6c000..0x0000000041c6ce00]
at sun.misc.Unsafe.park(Native Method)
at java.util.concurrent.locks.LockSupport.park(Unknown Source)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(Unknown
Source)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(Unknown
Source)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(Unknown Source)
at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(Unknown
Source)
at java.util.concurrent.locks.ReentrantLock.lock(Unknown Source)
at org.apache.hadoop.hbase.HLog.startCacheFlush(HLog.java:459)
at org.apache.hadoop.hbase.HRegion.internalFlushcache(HRegion.java:1089)
at org.apache.hadoop.hbase.HRegion.close(HRegion.java:594)
- locked <0x00002aaab6fa4e60> (a java.lang.Integer)
at org.apache.hadoop.hbase.HRegion.splitRegion(HRegion.java:759)
- locked <0x00002aaab6fa4e60> (a java.lang.Integer)
at
org.apache.hadoop.hbase.HRegionServer$CompactSplitThread.split(HRegionServer.java:248)
at
org.apache.hadoop.hbase.HRegionServer$CompactSplitThread.run(HRegionServer.java:204)
"regionserver/0:0:0:0:0:0:0:0:60020.logRoller" daemon prio=1
tid=0x00002aab38181d70 nid=0x6193 waiting on condition
[0x0000000041a6a000..0x0000000041a6ab00]
at sun.misc.Unsafe.park(Native Method)
at java.util.concurrent.locks.LockSupport.park(Unknown Source)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(Unknown
Source)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(Unknown
Source)
at
java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(Unknown Source)
at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(Unknown
Source)
at java.util.concurrent.locks.ReentrantLock.lock(Unknown Source)
at org.apache.hadoop.hbase.HLog.rollWriter(HLog.java:219)
at
org.apache.hadoop.hbase.HRegionServer$LogRoller.run(HRegionServer.java:615)
- locked <0x00002aaab69a3790> (a java.lang.Integer)
"org.apache.hadoop.io.ObjectWritable Connection Culler" daemon prio=1
tid=0x00002aab38001280 nid=0x6185 waiting on condition
[0x0000000041666000..0x0000000041666d00]
at java.lang.Thread.sleep(Native Method)
at org.apache.hadoop.ipc.Client$ConnectionCuller.run(Client.java:423)
"org.apache.hadoop.hbase.io.HbaseObjectWritable Connection Culler" daemon
prio=1 tid=0x00002aab3771f2a0 nid=0x5ea2 waiting on condition
[0x0000000041464000..0x0000000041464d00]
at java.lang.Thread.sleep(Native Method)
at org.apache.hadoop.ipc.Client$ConnectionCuller.run(Client.java:423)
"DestroyJavaVM" prio=1 tid=0x0000000040116a20 nid=0x5e70 waiting on condition
[0x0000000000000000..0x00007fffb7bdab80]
"regionserver/0:0:0:0:0:0:0:0:60020" prio=1 tid=0x00002aab37b76d60 nid=0x5ea1
waiting for monitor entry [0x0000000041363000..0x0000000041363d80]
at org.apache.hadoop.hbase.HRegionServer.run(HRegionServer.java:862)
- waiting to lock <0x00002aaab69a3790> (a java.lang.Integer)
at java.lang.Thread.run(Unknown Source)
"Low Memory Detector" daemon prio=1 tid=0x00002aab37148810 nid=0x5e9f runnable
[0x0000000000000000..0x0000000000000000]
"CompilerThread1" daemon prio=1 tid=0x00002aab37146be0 nid=0x5e9e waiting on
condition [0x0000000000000000..0x000000004105f3e0]
"CompilerThread0" daemon prio=1 tid=0x00002aab37145630 nid=0x5e9d waiting on
condition [0x0000000000000000..0x0000000040f5e4a0]
"AdapterThread" daemon prio=1 tid=0x00002aab37143ed0 nid=0x5e9c waiting on
condition [0x0000000000000000..0x0000000000000000]
"Signal Dispatcher" daemon prio=1 tid=0x00002aab37136fd0 nid=0x5e9b waiting on
condition [0x0000000000000000..0x0000000040d5dc50]
"Finalizer" daemon prio=1 tid=0x00002aab37130020 nid=0x5e9a in Object.wait()
[0x0000000040c5c000..0x0000000040c5cd00]
at java.lang.Object.wait(Native Method)
at java.lang.ref.ReferenceQueue.remove(Unknown Source)
- locked <0x00002aaab69b91e0> (a java.lang.ref.ReferenceQueue$Lock)
at java.lang.ref.ReferenceQueue.remove(Unknown Source)
at java.lang.ref.Finalizer$FinalizerThread.run(Unknown Source)
"Reference Handler" daemon prio=1 tid=0x00002aab3712f740 nid=0x5e99 in
Object.wait() [0x0000000040b5b000..0x0000000040b5bd80]
at java.lang.Object.wait(Native Method)
at java.lang.Object.wait(Unknown Source)
at java.lang.ref.Reference$ReferenceHandler.run(Unknown Source)
- locked <0x00002aaab69b6fa0> (a java.lang.ref.Reference$Lock)
"VM Thread" prio=1 tid=0x00002aab371283d0 nid=0x5e98 runnable
"GC task thread#0 (ParallelGC)" prio=1 tid=0x0000000040133610 nid=0x5e90
runnable
"GC task thread#1 (ParallelGC)" prio=1 tid=0x0000000040134490 nid=0x5e91
runnable
"GC task thread#2 (ParallelGC)" prio=1 tid=0x0000000040135310 nid=0x5e92
runnable
"GC task thread#3 (ParallelGC)" prio=1 tid=0x0000000040136190 nid=0x5e93
runnable
"GC task thread#4 (ParallelGC)" prio=1 tid=0x0000000040137010 nid=0x5e94
runnable
"GC task thread#5 (ParallelGC)" prio=1 tid=0x0000000040137e90 nid=0x5e95
runnable
"GC task thread#6 (ParallelGC)" prio=1 tid=0x0000000040138d10 nid=0x5e96
runnable
"GC task thread#7 (ParallelGC)" prio=1 tid=0x0000000040139b90 nid=0x5e97
runnable
"VM Periodic Task Thread" prio=1 tid=0x000000004012a520 nid=0x5ea0 waiting on
condition
{code}
> HLog#cacheFlushLock not cleared; hangs a region
> -----------------------------------------------
>
> Key: HBASE-659
> URL: https://issues.apache.org/jira/browse/HBASE-659
> Project: Hadoop HBase
> Issue Type: Bug
> Affects Versions: 0.1.2
> Reporter: stack
> Priority: Blocker
> Fix For: 0.2.0
>
>
> I have a region that is stuck in a close that was ordained by a split. Here
> is what I have from the log pertaining to the stuck region:
> {code}
> 4 6416 2008-05-29 22:29:03,433 INFO org.apache.hadoop.hbase.HRegion:
> checking compaction completed on region
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061 in 12sec
> 5 6417 2008-05-29 22:29:03,439 INFO org.apache.hadoop.hbase.HRegion:
> Splitting enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061 because largest
> aggregate size is 288.3m and desired size is 256.0m
>
> 6 6418 2008-05-29 22:29:03,443 DEBUG org.apache.hadoop.hbase.HRegion:
> compactions and cache flushes disabled for region
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
> 7 6419 2008-05-29 22:29:03,443 DEBUG org.apache.hadoop.hbase.HRegion: new
> updates and scanners for region enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
> disabled
>
> 8 6420 2008-05-29 22:29:03,443 DEBUG org.apache.hadoop.hbase.HRegion: no
> more active scanners for region enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
> 9 6421 2008-05-29 22:29:03,443 DEBUG org.apache.hadoop.hbase.HRegion: no
> more row locks outstanding on region
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
>
> 10 6422 2008-05-29 22:29:03,443 DEBUG
> org.apache.hadoop.hbase.HRegionServer:
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061 closing (Adding to
> retiringRegions)
> 11 6423 2008-05-29 22:29:03,443 DEBUG org.apache.hadoop.hbase.HRegion:
> Started memcache flush for region
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061. Current region memcache size
> 2.1m
>
> 12 6424 2008-05-29 22:29:03,561 INFO org.apache.hadoop.ipc.Server: IPC
> Server handler 0 on 60020, call
> batchUpdate(enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061, 1171081390000,
> [EMAIL PROTECTED]) from 208.76.44.139:49358: err or: org.
> apache.hadoop.hbase.NotServingRegionException:
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
>
>
> 13 6425 org.apache.hadoop.hbase.NotServingRegionException:
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
> 14 6434 2008-05-29 22:29:03,982 INFO org.apache.hadoop.ipc.Server: IPC
> Server handler 9 on 60020, call
> batchUpdate(enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061, 1202595259000,
> [EMAIL PROTECTED]) from 208.76.44.139:49358: err or: org.
> apache.hadoop.hbase.NotServingRegionException:
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
> 15 6435 org.apache.hadoop.hbase.NotServingRegionException:
> enwiki,IK9sWdHJe6ffGZgFPsqIvk==,1212092907061
> {code}
> Then in thread dump, I have two threads blocked on the HLog#cacheFlushLock
> but looking in code, there is no obvious code path that would get a situation
> where a lock is held and then not released.
> {code}
> "regionserver/0:0:0:0:0:0:0:0:60020.compactor" daemon prio=1
> tid=0x00002aab381e5fd0 nid=0x6195 waiting on condition
> [0x0000000041c6c000..0x0000000041c6ce00]
> at sun.misc.Unsafe.park(Native Method)
> at java.util.concurrent.locks.LockSupport.park(Unknown Source)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(Unknown
> Source)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(Unknown
> Source)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(Unknown Source)
> at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(Unknown
> Source)
> at java.util.concurrent.locks.ReentrantLock.lock(Unknown Source)
> at org.apache.hadoop.hbase.HLog.startCacheFlush(HLog.java:459)
> at
> org.apache.hadoop.hbase.HRegion.internalFlushcache(HRegion.java:1089)
> at org.apache.hadoop.hbase.HRegion.close(HRegion.java:594)
> - locked <0x00002aaab70bf3a0> (a java.lang.Integer)
> at org.apache.hadoop.hbase.HRegion.splitRegion(HRegion.java:759)
> - locked <0x00002aaab70bf3a0> (a java.lang.Integer)
> at
> org.apache.hadoop.hbase.HRegionServer$CompactSplitThread.split(HRegionServer.java:248)
> at
> org.apache.hadoop.hbase.HRegionServer$CompactSplitThread.run(HRegionServer.java:204)
> ...
> "regionserver/0:0:0:0:0:0:0:0:60020.logRoller" daemon prio=1
> tid=0x00002aab38181d70 nid=0x6193 waiting on condition
> [0x0000000041a6a000..0x0000000041a6ab00]
> at sun.misc.Unsafe.park(Native Method)
> at java.util.concurrent.locks.LockSupport.park(Unknown Source)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(Unknown
> Source)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(Unknown
> Source)
> at
> java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(Unknown Source)
> at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(Unknown
> Source)
> at java.util.concurrent.locks.ReentrantLock.lock(Unknown Source)
> at org.apache.hadoop.hbase.HLog.rollWriter(HLog.java:219)
> at
> org.apache.hadoop.hbase.HRegionServer$LogRoller.run(HRegionServer.java:615)
> - locked <0x00002aaab69ccf00> (a java.lang.Integer)
> ...
> {code}
--
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.