[ 
https://issues.apache.org/jira/browse/HBASE-3196?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Prakash Khemani updated HBASE-3196:
-----------------------------------

          Description: 
The region server is stuck with the following jstack

2010-11-03 22:23:41
Full thread dump Java HotSpot(TM) 64-Bit Server VM (14.0-b16 mixed mode):

"Attach Listener" daemon prio=10 tid=0x00002aaeb6774000 nid=0x3974 waiting on 
condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" prio=10 
tid=0x00002aaeb8449000 nid=0x3bbc waiting on condition [0x0000000043f67000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab7fd1130> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
        at java.lang.Thread.run(Thread.java:619)

"RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" prio=10 
tid=0x00002aaeb843f800 nid=0x3bbb waiting on condition [0x0000000043e66000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab7fd1130> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
        at java.lang.Thread.run(Thread.java:619)

"RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" prio=10 
tid=0x00002aaeb8447800 nid=0x3bba waiting on condition [0x0000000044068000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab7fd1130> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
        at java.lang.Thread.run(Thread.java:619)

"RMI Scheduler(0)" daemon prio=10 tid=0x00002aaeb48c4800 nid=0x1c97 waiting on 
condition [0x00000000580a7000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab773a118> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:198)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:1963)
        at java.util.concurrent.DelayQueue.take(DelayQueue.java:164)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:583)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:576)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
        at java.lang.Thread.run(Thread.java:619)

"RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" daemon 
prio=10 tid=0x00002aaeb4804800 nid=0x17a0 waiting on condition 
[0x00000000582a9000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab7fca538> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
        at java.lang.Thread.run(Thread.java:619)

"RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" daemon 
prio=10 tid=0x00002aaeb490a000 nid=0x179f waiting on condition 
[0x000000004345c000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab7fca538> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
        at java.lang.Thread.run(Thread.java:619)

"RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" daemon 
prio=10 tid=0x00002aaeb4909000 nid=0x179e waiting on condition 
[0x000000004335b000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab7fca538> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
        at java.lang.Thread.run(Thread.java:619)

"LruBlockCache.EvictionThread" daemon prio=10 tid=0x00002aaeb889b000 nid=0x1767 
in Object.wait() [0x0000000057ea5000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.lang.Object.wait(Object.java:485)
        at 
org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread.run(LruBlockCache.java:514)
        - locked <0x00002aaab7fdb728> (a 
org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread)

"Timer thread for monitoring jvm" daemon prio=10 tid=0x00002aaebc0a5000 
nid=0x1620 in Object.wait() [0x0000000043c64000]
   java.lang.Thread.State: TIMED_WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.util.TimerThread.mainLoop(Timer.java:509)
        - locked <0x00002aaab7e89c38> (a java.util.TaskQueue)
        at java.util.TimerThread.run(Timer.java:462)

"Timer thread for monitoring hbase" daemon prio=10 tid=0x00002aaebc0a3800 
nid=0x161f in Object.wait() [0x0000000043b63000]
   java.lang.Thread.State: TIMED_WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.util.TimerThread.mainLoop(Timer.java:509)
        - locked <0x00002aaab7fe95d8> (a java.util.TaskQueue)
        at java.util.TimerThread.run(Timer.java:462)

"DestroyJavaVM" prio=10 tid=0x00002aaebc092000 nid=0x15ce waiting on condition 
[0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"regionserver60020-EventThread" daemon prio=10 tid=0x00002aaeb5276800 
nid=0x1606 waiting on condition [0x000000004325a000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab6bf6bb8> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477)

"regionserver60020-SendThread(pumahbasectrl001.snc5.facebook.com:2181)" daemon 
prio=10 tid=0x00002aaeb5275800 nid=0x1605 runnable [0x0000000043159000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69)
        - locked <0x00002aaab6c02078> (a sun.nio.ch.Util$1)
        - locked <0x00002aaab6c02090> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00002aaab66713d0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80)
        at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066)

"regionserver60020" prio=10 tid=0x00002aaeb858c800 nid=0x1604 waiting on 
condition [0x0000000043058000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at org.apache.hadoop.hbase.util.Threads.sleep(Threads.java:126)
        at 
org.apache.hadoop.hbase.regionserver.HRegionServer.waitOnAllRegionsToClose(HRegionServer.java:645)
        at 
org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:611)
        at java.lang.Thread.run(Thread.java:619)

"Timer thread for monitoring rpc" daemon prio=10 tid=0x00002aaeb8583000 
nid=0x1602 in Object.wait() [0x0000000042f57000]
   java.lang.Thread.State: TIMED_WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.util.TimerThread.mainLoop(Timer.java:509)
        - locked <0x00002aaab6be7038> (a java.util.TaskQueue)
        at java.util.TimerThread.run(Timer.java:462)

"main-EventThread" daemon prio=10 tid=0x00002aaeb842d000 nid=0x15f7 waiting on 
condition [0x000000004244c000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00002aaab6bc0720> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
        at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477)

"main-SendThread(pumahbasectrl062.snc5.facebook.com:2181)" daemon prio=10 
tid=0x00002aaeb8451800 nid=0x15f6 runnable [0x000000004234b000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69)
        - locked <0x00002aaab6bb08f0> (a sun.nio.ch.Util$1)
        - locked <0x00002aaab6bb0908> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00002aaab764df90> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80)
        at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066)

"RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83b9800 nid=0x15f2 runnable 
[0x0000000042149000]
   java.lang.Thread.State: RUNNABLE
        at java.net.PlainSocketImpl.socketAccept(Native Method)
        at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390)
        - locked <0x00002aaab6b935e8> (a java.net.SocksSocketImpl)
        at java.net.ServerSocket.implAccept(ServerSocket.java:453)
        at java.net.ServerSocket.accept(ServerSocket.java:421)
        at 
sun.management.jmxremote.LocalRMIServerSocketFactory$1.accept(LocalRMIServerSocketFactory.java:34)
        at 
sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369)
        at 
sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341)
        at java.lang.Thread.run(Thread.java:619)

"RMI TCP Accept-8091" daemon prio=10 tid=0x00002aaeb83b3000 nid=0x15f1 runnable 
[0x0000000042048000]
   java.lang.Thread.State: RUNNABLE
        at java.net.PlainSocketImpl.socketAccept(Native Method)
        at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390)
        - locked <0x00002aaab7659d50> (a java.net.SocksSocketImpl)
        at java.net.ServerSocket.implAccept(ServerSocket.java:453)
        at java.net.ServerSocket.accept(ServerSocket.java:421)
        at 
sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369)
        at 
sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341)
        at java.lang.Thread.run(Thread.java:619)

"RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83a0000 nid=0x15f0 runnable 
[0x0000000041f47000]
   java.lang.Thread.State: RUNNABLE
        at java.net.PlainSocketImpl.socketAccept(Native Method)
        at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390)
        - locked <0x00002aaab765c140> (a java.net.SocksSocketImpl)
        at java.net.ServerSocket.implAccept(ServerSocket.java:453)
        at java.net.ServerSocket.accept(ServerSocket.java:421)
        at 
sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369)
        at 
sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341)
        at java.lang.Thread.run(Thread.java:619)

"Low Memory Detector" daemon prio=10 tid=0x00002aaeb8016000 nid=0x15eb runnable 
[0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"CompilerThread1" daemon prio=10 tid=0x00002aaeb8013800 nid=0x15ea waiting on 
condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"CompilerThread0" daemon prio=10 tid=0x00002aaeb8011800 nid=0x15e9 waiting on 
condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"JDWP Event Helper Thread" daemon prio=10 tid=0x00002aaeb800f800 nid=0x15e8 
runnable [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"JDWP Transport Listener: dt_socket" daemon prio=10 tid=0x00002aaeb800c000 
nid=0x15e7 runnable [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Signal Dispatcher" daemon prio=10 tid=0x00002aaeb45d4800 nid=0x15e6 runnable 
[0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Surrogate Locker Thread (CMS)" daemon prio=10 tid=0x00002aaeb45d2800 
nid=0x15e5 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Finalizer" daemon prio=10 tid=0x00002aaeb440b000 nid=0x15e3 in Object.wait() 
[0x000000004173f000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:118)
        - locked <0x00002aaab6663ac0> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:134)
        at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159)

"Reference Handler" daemon prio=10 tid=0x00002aaeb4409000 nid=0x15e2 in 
Object.wait() [0x000000004163e000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        at java.lang.Object.wait(Object.java:485)
        at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116)
        - locked <0x00002aaab6c01940> (a java.lang.ref.Reference$Lock)

"VM Thread" prio=10 tid=0x00002aaeb4402000 nid=0x15e1 runnable

"Gang worker#0 (Parallel GC Threads)" prio=10 tid=0x000000004011e800 nid=0x15cf 
runnable

"Gang worker#1 (Parallel GC Threads)" prio=10 tid=0x0000000040120800 nid=0x15d0 
runnable

"Gang worker#2 (Parallel GC Threads)" prio=10 tid=0x0000000040122000 nid=0x15d1 
runnable

"Gang worker#3 (Parallel GC Threads)" prio=10 tid=0x0000000040124000 nid=0x15d2 
runnable

"Gang worker#4 (Parallel GC Threads)" prio=10 tid=0x0000000040126000 nid=0x15d3 
runnable

"Gang worker#5 (Parallel GC Threads)" prio=10 tid=0x0000000040127800 nid=0x15d4 
runnable

"Gang worker#6 (Parallel GC Threads)" prio=10 tid=0x0000000040129800 nid=0x15d5 
runnable

"Gang worker#7 (Parallel GC Threads)" prio=10 tid=0x00002aaab2526800 nid=0x15d6 
runnable

"Gang worker#8 (Parallel GC Threads)" prio=10 tid=0x00002aaab2528000 nid=0x15d7 
runnable

"Gang worker#9 (Parallel GC Threads)" prio=10 tid=0x00002aaab252a000 nid=0x15d8 
runnable

"Gang worker#10 (Parallel GC Threads)" prio=10 tid=0x00002aaab252c000 
nid=0x15d9 runnable

"Gang worker#11 (Parallel GC Threads)" prio=10 tid=0x00002aaab252d800 
nid=0x15da runnable

"Gang worker#12 (Parallel GC Threads)" prio=10 tid=0x00002aaab252f800 
nid=0x15db runnable

"Concurrent Mark-Sweep GC Thread" prio=10 tid=0x00002aaea3b8f000 nid=0x15e0 
runnable
"Gang worker#0 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a7e800 
nid=0x15dc runnable

"Gang worker#1 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a80000 
nid=0x15dd runnable

"Gang worker#2 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acb800 
nid=0x15de runnable

"Gang worker#3 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acd000 
nid=0x15df runnable

"VM Periodic Task Thread" prio=10 tid=0x00002aaeb83bc800 nid=0x15f3 waiting on 
condition

JNI global references: 4205

===

In the RS logs all the IPC Server handlers have died because HDFS failure to 
write to the log.

First failure to append to the logs
2010-11-02 17:05:58,620 WARN org.apache.hadoop.hdfs.DFSClient: Packet 243111 
[offsetInBlock=241298432 pktLen=2173] of blk_-4978761844464053668_783579 is 
timed out
2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery 
for block blk_-4978761844464053668_783579 bad datanode[2] 10.38.47.49:50010
2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery 
for block blk_-4978761844464053668_783579 in pipeline 10.38.15.45:50010, 
10.38.47.59:50010, 10.38.47.49:50010: bad datanode 10.38.47.49:50010
2010-11-02 17:05:58,755 WARN org.apache.hadoop.hdfs.DFSClient: Failed recovery 
attempt #0 from primary datanode 10.38.15.45:50010
org.apache.hadoop.ipc.RPC$VersionMismatch: Protocol 
org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol version mismatch. 
(client = 5, server = 3)
        at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:452)
        at 
org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687)
...
2010-11-02 17:11:09,667 WARN org.apache.hadoop.hdfs.DFSClient: Failed recovery 
attempt #0 from primary datanode 10.38.47.49:50010
java.net.SocketTimeoutException: Call to /10.38.47.49:50020 failed on socket 
timeout exception: java.net.SocketTimeoutException: 300000 millis timeout while 
waiting for channel to be ready for read. ch : 
java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 
remote=/10.38.47.49:50020]
        at org.apache.hadoop.ipc.Client.wrapException(Client.java:855)
        at org.apache.hadoop.ipc.Client.call(Client.java:827)
        at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:222)
        at $Proxy3.getProtocolVersion(Unknown Source)
        at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:447)
        at 
org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687)
Caused by: java.net.SocketTimeoutException: 300000 millis timeout while waiting 
for channel to be ready for read. ch : 
java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 
remote=/10.38.47.49:50020]
        at 
org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)
        at 
org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:155)
        at 
org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:128)
        at java.io.FilterInputStream.read(FilterInputStream.java:116)
        at 
org.apache.hadoop.ipc.Client$Connection$PingInputStream.read(Client.java:325)
        at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
        at java.io.BufferedInputStream.read(BufferedInputStream.java:237)
        at java.io.DataInputStream.readInt(DataInputStream.java:370)
        at 
org.apache.hadoop.ipc.Client$Connection.receiveResponse(Client.java:572)
        at org.apache.hadoop.ipc.Client$Connection.run(Client.java:500)
2010-11-02 17:11:09,669 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery 
for block blk_-4978761844464053668_783579 file 
/PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683
 failed  because recovery from primary datanode 10.38.47.49:50010 failed 1 
times.  Pipeline was 10.38.47.59:50010, 10.38.47.49:50010. Will retry...

Error recovery fails.
2010-11-02 17:36:20,919 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: 
Could not append. Requesting close of hlog
java.io.IOException: Reflection
        at 
org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:147)
        at org.apache.hadoop.hbase.regionserver.wal.HLog.hflush(HLog.java:1042)
        at 
org.apache.hadoop.hbase.regionserver.wal.HLog$LogSyncer.run(HLog.java:966)
Caused by: java.lang.reflect.InvocationTargetException
        at sun.reflect.GeneratedMethodAccessor22.invoke(Unknown Source)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
        at java.lang.reflect.Method.invoke(Method.java:597)
        at 
org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:145)
        ... 2 more
Caused by: java.io.IOException: Error Recovery for block 
blk_-4978761844464053668_783579 file 
/PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683
 failed  because recovery from primary datanode 10.38.47.59:50010 failed 6 
times.  Pipeline was 10.38.47.59:50010. Aborting...
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3065)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687)


Regions start getting aborted  because hlog close fails because filesystem is 
not available


2010-11-02 17:36:24,055 FATAL 
org.apache.hadoop.hbase.regionserver.HRegionServer: ABORTING region server 
serverName=pumahbase028.snc5.facebook.com,60020,1288733355197, 
load=(requests=492, regions=55, usedHeap=3376, maxHeap=15993): File System not 
available
java.io.IOException: File system is not available
        at 
org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:130)
        at 
org.apache.hadoop.hbase.regionserver.HRegionServer.checkFileSystem(HRegionServer.java:949)
        at 
org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2312)
        at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
        at java.lang.reflect.Method.invoke(Method.java:597)
        at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561)
        at 
org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025)
Caused by: java.io.IOException: Filesystem closed
        at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282)
        at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:875)
        at 
org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:484)
        at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:766)
        at 
org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:119)
        ... 7 more

All the IPC Server handlers exit

2010-11-02 17:36:24,097 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server 
Responder, call increment([...@aa32978, 
row=b4da93229cfc2cfb7d731f46da44f108info.compartilo1.59624 lfi 450313269132, 
families={(family=COUNTERS_0, columns={+=1, a13g1+=1, cCL+=1, les_LA+=1}), 
(family=COUNTERS_3600, columns={7ffa89a0+=1}}) from 10.38.15.37:49593: output 
error
2010-11-02 17:36:24,094 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: 
Could not append. Requesting close of hlog
java.io.IOException: Filesystem closed
        at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282)
        at org.apache.hadoop.hdfs.DFSClient.access$800(DFSClient.java:71)
        at 
org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.writeChunk(DFSClient.java:3465)
        at 
org.apache.hadoop.fs.FSOutputSummer.writeChecksumChunk(FSOutputSummer.java:150)
        at 
org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:132)
        at 
org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:121)
        at org.apache.hadoop.fs.FSOutputSummer.write1(FSOutputSummer.java:112)
        at org.apache.hadoop.fs.FSOutputSummer.write(FSOutputSummer.java:86)
        at 
org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:49)
        at java.io.DataOutputStream.write(DataOutputStream.java:90)
        at 
org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1073)
        at 
org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1037)
        at 
org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.append(SequenceFileLogWriter.java:133)
        at org.apache.hadoop.hbase.regionserver.wal.HLog.doWrite(HLog.java:1129)
        at org.apache.hadoop.hbase.regionserver.wal.HLog.append(HLog.java:914)
        at 
org.apache.hadoop.hbase.regionserver.HRegion.increment(HRegion.java:3038)
        at 
org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2309)
        at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source)
        at 
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
        at java.lang.reflect.Method.invoke(Method.java:597)
        at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561)
        at 
org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025)
2010-11-02 17:36:24,108 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server 
handler 69 on 60020 caught: java.nio.channels.ClosedChannelException
        at 
sun.nio.ch.SocketChannelImpl.ensureWriteOpen(SocketChannelImpl.java:126)
        at sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:324)
        at 
org.apache.hadoop.hbase.ipc.HBaseServer.channelWrite(HBaseServer.java:1282)
        at 
org.apache.hadoop.hbase.ipc.HBaseServer$Responder.processResponse(HBaseServer.java:713)
        at 
org.apache.hadoop.hbase.ipc.HBaseServer$Responder.doRespond(HBaseServer.java:778)
        at 
org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1053)

2010-11-02 17:36:24,113 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server 
handler 69 on 60020: exiting

===

If all the IPC Server handlers exit for whatever reason then the Region Server 
should cleanly exit. I think my clients are getting stuck in 
locateRegionInMeta() because of this stuck region server.

===




    Affects Version/s: 0.90.0
             Assignee: Jonathan Gray
              Summary: Regionserver stuck when after all IPC Server handlers 
fatal'd  (was: Regionserver stuck when after all IPC Server handlers fatal')

> Regionserver stuck when after all IPC Server handlers fatal'd
> -------------------------------------------------------------
>
>                 Key: HBASE-3196
>                 URL: https://issues.apache.org/jira/browse/HBASE-3196
>             Project: HBase
>          Issue Type: Bug
>    Affects Versions: 0.90.0
>            Reporter: Prakash Khemani
>            Assignee: Jonathan Gray
>
> The region server is stuck with the following jstack
> 2010-11-03 22:23:41
> Full thread dump Java HotSpot(TM) 64-Bit Server VM (14.0-b16 mixed mode):
> "Attach Listener" daemon prio=10 tid=0x00002aaeb6774000 nid=0x3974 waiting on 
> condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" 
> prio=10 tid=0x00002aaeb8449000 nid=0x3bbc waiting on condition 
> [0x0000000043f67000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab7fd1130> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
>         at java.lang.Thread.run(Thread.java:619)
> "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" 
> prio=10 tid=0x00002aaeb843f800 nid=0x3bbb waiting on condition 
> [0x0000000043e66000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab7fd1130> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
>         at java.lang.Thread.run(Thread.java:619)
> "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" 
> prio=10 tid=0x00002aaeb8447800 nid=0x3bba waiting on condition 
> [0x0000000044068000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab7fd1130> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
>         at java.lang.Thread.run(Thread.java:619)
> "RMI Scheduler(0)" daemon prio=10 tid=0x00002aaeb48c4800 nid=0x1c97 waiting 
> on condition [0x00000000580a7000]
>    java.lang.Thread.State: TIMED_WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab773a118> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at 
> java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:198)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:1963)
>         at java.util.concurrent.DelayQueue.take(DelayQueue.java:164)
>         at 
> java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:583)
>         at 
> java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:576)
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
>         at java.lang.Thread.run(Thread.java:619)
> "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" daemon 
> prio=10 tid=0x00002aaeb4804800 nid=0x17a0 waiting on condition 
> [0x00000000582a9000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab7fca538> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
>         at java.lang.Thread.run(Thread.java:619)
> "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" daemon 
> prio=10 tid=0x00002aaeb490a000 nid=0x179f waiting on condition 
> [0x000000004345c000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab7fca538> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
>         at java.lang.Thread.run(Thread.java:619)
> "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" daemon 
> prio=10 tid=0x00002aaeb4909000 nid=0x179e waiting on condition 
> [0x000000004335b000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab7fca538> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947)
>         at 
> java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907)
>         at java.lang.Thread.run(Thread.java:619)
> "LruBlockCache.EvictionThread" daemon prio=10 tid=0x00002aaeb889b000 
> nid=0x1767 in Object.wait() [0x0000000057ea5000]
>    java.lang.Thread.State: WAITING (on object monitor)
>         at java.lang.Object.wait(Native Method)
>         at java.lang.Object.wait(Object.java:485)
>         at 
> org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread.run(LruBlockCache.java:514)
>         - locked <0x00002aaab7fdb728> (a 
> org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread)
> "Timer thread for monitoring jvm" daemon prio=10 tid=0x00002aaebc0a5000 
> nid=0x1620 in Object.wait() [0x0000000043c64000]
>    java.lang.Thread.State: TIMED_WAITING (on object monitor)
>         at java.lang.Object.wait(Native Method)
>         at java.util.TimerThread.mainLoop(Timer.java:509)
>         - locked <0x00002aaab7e89c38> (a java.util.TaskQueue)
>         at java.util.TimerThread.run(Timer.java:462)
> "Timer thread for monitoring hbase" daemon prio=10 tid=0x00002aaebc0a3800 
> nid=0x161f in Object.wait() [0x0000000043b63000]
>    java.lang.Thread.State: TIMED_WAITING (on object monitor)
>         at java.lang.Object.wait(Native Method)
>         at java.util.TimerThread.mainLoop(Timer.java:509)
>         - locked <0x00002aaab7fe95d8> (a java.util.TaskQueue)
>         at java.util.TimerThread.run(Timer.java:462)
> "DestroyJavaVM" prio=10 tid=0x00002aaebc092000 nid=0x15ce waiting on 
> condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "regionserver60020-EventThread" daemon prio=10 tid=0x00002aaeb5276800 
> nid=0x1606 waiting on condition [0x000000004325a000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab6bf6bb8> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477)
> "regionserver60020-SendThread(pumahbasectrl001.snc5.facebook.com:2181)" 
> daemon prio=10 tid=0x00002aaeb5275800 nid=0x1605 runnable [0x0000000043159000]
>    java.lang.Thread.State: RUNNABLE
>         at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
>         at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215)
>         at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65)
>         at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69)
>         - locked <0x00002aaab6c02078> (a sun.nio.ch.Util$1)
>         - locked <0x00002aaab6c02090> (a 
> java.util.Collections$UnmodifiableSet)
>         - locked <0x00002aaab66713d0> (a sun.nio.ch.EPollSelectorImpl)
>         at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80)
>         at 
> org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066)
> "regionserver60020" prio=10 tid=0x00002aaeb858c800 nid=0x1604 waiting on 
> condition [0x0000000043058000]
>    java.lang.Thread.State: TIMED_WAITING (sleeping)
>         at java.lang.Thread.sleep(Native Method)
>         at org.apache.hadoop.hbase.util.Threads.sleep(Threads.java:126)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.waitOnAllRegionsToClose(HRegionServer.java:645)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:611)
>         at java.lang.Thread.run(Thread.java:619)
> "Timer thread for monitoring rpc" daemon prio=10 tid=0x00002aaeb8583000 
> nid=0x1602 in Object.wait() [0x0000000042f57000]
>    java.lang.Thread.State: TIMED_WAITING (on object monitor)
>         at java.lang.Object.wait(Native Method)
>         at java.util.TimerThread.mainLoop(Timer.java:509)
>         - locked <0x00002aaab6be7038> (a java.util.TaskQueue)
>         at java.util.TimerThread.run(Timer.java:462)
> "main-EventThread" daemon prio=10 tid=0x00002aaeb842d000 nid=0x15f7 waiting 
> on condition [0x000000004244c000]
>    java.lang.Thread.State: WAITING (parking)
>         at sun.misc.Unsafe.park(Native Method)
>         - parking to wait for  <0x00002aaab6bc0720> (a 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
>         at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158)
>         at 
> java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925)
>         at 
> java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358)
>         at 
> org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477)
> "main-SendThread(pumahbasectrl062.snc5.facebook.com:2181)" daemon prio=10 
> tid=0x00002aaeb8451800 nid=0x15f6 runnable [0x000000004234b000]
>    java.lang.Thread.State: RUNNABLE
>         at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
>         at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215)
>         at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65)
>         at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69)
>         - locked <0x00002aaab6bb08f0> (a sun.nio.ch.Util$1)
>         - locked <0x00002aaab6bb0908> (a 
> java.util.Collections$UnmodifiableSet)
>         - locked <0x00002aaab764df90> (a sun.nio.ch.EPollSelectorImpl)
>         at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80)
>         at 
> org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066)
> "RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83b9800 nid=0x15f2 runnable 
> [0x0000000042149000]
>    java.lang.Thread.State: RUNNABLE
>         at java.net.PlainSocketImpl.socketAccept(Native Method)
>         at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390)
>         - locked <0x00002aaab6b935e8> (a java.net.SocksSocketImpl)
>         at java.net.ServerSocket.implAccept(ServerSocket.java:453)
>         at java.net.ServerSocket.accept(ServerSocket.java:421)
>         at 
> sun.management.jmxremote.LocalRMIServerSocketFactory$1.accept(LocalRMIServerSocketFactory.java:34)
>         at 
> sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369)
>         at 
> sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341)
>         at java.lang.Thread.run(Thread.java:619)
> "RMI TCP Accept-8091" daemon prio=10 tid=0x00002aaeb83b3000 nid=0x15f1 
> runnable [0x0000000042048000]
>    java.lang.Thread.State: RUNNABLE
>         at java.net.PlainSocketImpl.socketAccept(Native Method)
>         at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390)
>         - locked <0x00002aaab7659d50> (a java.net.SocksSocketImpl)
>         at java.net.ServerSocket.implAccept(ServerSocket.java:453)
>         at java.net.ServerSocket.accept(ServerSocket.java:421)
>         at 
> sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369)
>         at 
> sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341)
>         at java.lang.Thread.run(Thread.java:619)
> "RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83a0000 nid=0x15f0 runnable 
> [0x0000000041f47000]
>    java.lang.Thread.State: RUNNABLE
>         at java.net.PlainSocketImpl.socketAccept(Native Method)
>         at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390)
>         - locked <0x00002aaab765c140> (a java.net.SocksSocketImpl)
>         at java.net.ServerSocket.implAccept(ServerSocket.java:453)
>         at java.net.ServerSocket.accept(ServerSocket.java:421)
>         at 
> sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369)
>         at 
> sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341)
>         at java.lang.Thread.run(Thread.java:619)
> "Low Memory Detector" daemon prio=10 tid=0x00002aaeb8016000 nid=0x15eb 
> runnable [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "CompilerThread1" daemon prio=10 tid=0x00002aaeb8013800 nid=0x15ea waiting on 
> condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "CompilerThread0" daemon prio=10 tid=0x00002aaeb8011800 nid=0x15e9 waiting on 
> condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "JDWP Event Helper Thread" daemon prio=10 tid=0x00002aaeb800f800 nid=0x15e8 
> runnable [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "JDWP Transport Listener: dt_socket" daemon prio=10 tid=0x00002aaeb800c000 
> nid=0x15e7 runnable [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "Signal Dispatcher" daemon prio=10 tid=0x00002aaeb45d4800 nid=0x15e6 runnable 
> [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "Surrogate Locker Thread (CMS)" daemon prio=10 tid=0x00002aaeb45d2800 
> nid=0x15e5 waiting on condition [0x0000000000000000]
>    java.lang.Thread.State: RUNNABLE
> "Finalizer" daemon prio=10 tid=0x00002aaeb440b000 nid=0x15e3 in Object.wait() 
> [0x000000004173f000]
>    java.lang.Thread.State: WAITING (on object monitor)
>         at java.lang.Object.wait(Native Method)
>         at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:118)
>         - locked <0x00002aaab6663ac0> (a java.lang.ref.ReferenceQueue$Lock)
>         at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:134)
>         at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159)
> "Reference Handler" daemon prio=10 tid=0x00002aaeb4409000 nid=0x15e2 in 
> Object.wait() [0x000000004163e000]
>    java.lang.Thread.State: WAITING (on object monitor)
>         at java.lang.Object.wait(Native Method)
>         at java.lang.Object.wait(Object.java:485)
>         at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116)
>         - locked <0x00002aaab6c01940> (a java.lang.ref.Reference$Lock)
> "VM Thread" prio=10 tid=0x00002aaeb4402000 nid=0x15e1 runnable
> "Gang worker#0 (Parallel GC Threads)" prio=10 tid=0x000000004011e800 
> nid=0x15cf runnable
> "Gang worker#1 (Parallel GC Threads)" prio=10 tid=0x0000000040120800 
> nid=0x15d0 runnable
> "Gang worker#2 (Parallel GC Threads)" prio=10 tid=0x0000000040122000 
> nid=0x15d1 runnable
> "Gang worker#3 (Parallel GC Threads)" prio=10 tid=0x0000000040124000 
> nid=0x15d2 runnable
> "Gang worker#4 (Parallel GC Threads)" prio=10 tid=0x0000000040126000 
> nid=0x15d3 runnable
> "Gang worker#5 (Parallel GC Threads)" prio=10 tid=0x0000000040127800 
> nid=0x15d4 runnable
> "Gang worker#6 (Parallel GC Threads)" prio=10 tid=0x0000000040129800 
> nid=0x15d5 runnable
> "Gang worker#7 (Parallel GC Threads)" prio=10 tid=0x00002aaab2526800 
> nid=0x15d6 runnable
> "Gang worker#8 (Parallel GC Threads)" prio=10 tid=0x00002aaab2528000 
> nid=0x15d7 runnable
> "Gang worker#9 (Parallel GC Threads)" prio=10 tid=0x00002aaab252a000 
> nid=0x15d8 runnable
> "Gang worker#10 (Parallel GC Threads)" prio=10 tid=0x00002aaab252c000 
> nid=0x15d9 runnable
> "Gang worker#11 (Parallel GC Threads)" prio=10 tid=0x00002aaab252d800 
> nid=0x15da runnable
> "Gang worker#12 (Parallel GC Threads)" prio=10 tid=0x00002aaab252f800 
> nid=0x15db runnable
> "Concurrent Mark-Sweep GC Thread" prio=10 tid=0x00002aaea3b8f000 nid=0x15e0 
> runnable
> "Gang worker#0 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a7e800 
> nid=0x15dc runnable
> "Gang worker#1 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a80000 
> nid=0x15dd runnable
> "Gang worker#2 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acb800 
> nid=0x15de runnable
> "Gang worker#3 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acd000 
> nid=0x15df runnable
> "VM Periodic Task Thread" prio=10 tid=0x00002aaeb83bc800 nid=0x15f3 waiting 
> on condition
> JNI global references: 4205
> ===
> In the RS logs all the IPC Server handlers have died because HDFS failure to 
> write to the log.
> First failure to append to the logs
> 2010-11-02 17:05:58,620 WARN org.apache.hadoop.hdfs.DFSClient: Packet 243111 
> [offsetInBlock=241298432 pktLen=2173] of blk_-4978761844464053668_783579 is 
> timed out
> 2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery 
> for block blk_-4978761844464053668_783579 bad datanode[2] 10.38.47.49:50010
> 2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery 
> for block blk_-4978761844464053668_783579 in pipeline 10.38.15.45:50010, 
> 10.38.47.59:50010, 10.38.47.49:50010: bad datanode 10.38.47.49:50010
> 2010-11-02 17:05:58,755 WARN org.apache.hadoop.hdfs.DFSClient: Failed 
> recovery attempt #0 from primary datanode 10.38.15.45:50010
> org.apache.hadoop.ipc.RPC$VersionMismatch: Protocol 
> org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol version mismatch. 
> (client = 5, server = 3)
>         at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:452)
>         at 
> org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687)
> ...
> 2010-11-02 17:11:09,667 WARN org.apache.hadoop.hdfs.DFSClient: Failed 
> recovery attempt #0 from primary datanode 10.38.47.49:50010
> java.net.SocketTimeoutException: Call to /10.38.47.49:50020 failed on socket 
> timeout exception: java.net.SocketTimeoutException: 300000 millis timeout 
> while waiting for channel to be ready for read. ch : 
> java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 
> remote=/10.38.47.49:50020]
>         at org.apache.hadoop.ipc.Client.wrapException(Client.java:855)
>         at org.apache.hadoop.ipc.Client.call(Client.java:827)
>         at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:222)
>         at $Proxy3.getProtocolVersion(Unknown Source)
>         at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:447)
>         at 
> org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687)
> Caused by: java.net.SocketTimeoutException: 300000 millis timeout while 
> waiting for channel to be ready for read. ch : 
> java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 
> remote=/10.38.47.49:50020]
>         at 
> org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164)
>         at 
> org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:155)
>         at 
> org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:128)
>         at java.io.FilterInputStream.read(FilterInputStream.java:116)
>         at 
> org.apache.hadoop.ipc.Client$Connection$PingInputStream.read(Client.java:325)
>         at java.io.BufferedInputStream.fill(BufferedInputStream.java:218)
>         at java.io.BufferedInputStream.read(BufferedInputStream.java:237)
>         at java.io.DataInputStream.readInt(DataInputStream.java:370)
>         at 
> org.apache.hadoop.ipc.Client$Connection.receiveResponse(Client.java:572)
>         at org.apache.hadoop.ipc.Client$Connection.run(Client.java:500)
> 2010-11-02 17:11:09,669 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery 
> for block blk_-4978761844464053668_783579 file 
> /PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683
>  failed  because recovery from primary datanode 10.38.47.49:50010 failed 1 
> times.  Pipeline was 10.38.47.59:50010, 10.38.47.49:50010. Will retry...
> Error recovery fails.
> 2010-11-02 17:36:20,919 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: 
> Could not append. Requesting close of hlog
> java.io.IOException: Reflection
>         at 
> org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:147)
>         at 
> org.apache.hadoop.hbase.regionserver.wal.HLog.hflush(HLog.java:1042)
>         at 
> org.apache.hadoop.hbase.regionserver.wal.HLog$LogSyncer.run(HLog.java:966)
> Caused by: java.lang.reflect.InvocationTargetException
>         at sun.reflect.GeneratedMethodAccessor22.invoke(Unknown Source)
>         at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
>         at java.lang.reflect.Method.invoke(Method.java:597)
>         at 
> org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:145)
>         ... 2 more
> Caused by: java.io.IOException: Error Recovery for block 
> blk_-4978761844464053668_783579 file 
> /PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683
>  failed  because recovery from primary datanode 10.38.47.59:50010 failed 6 
> times.  Pipeline was 10.38.47.59:50010. Aborting...
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3065)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687)
> Regions start getting aborted  because hlog close fails because filesystem is 
> not available
> 2010-11-02 17:36:24,055 FATAL 
> org.apache.hadoop.hbase.regionserver.HRegionServer: ABORTING region server 
> serverName=pumahbase028.snc5.facebook.com,60020,1288733355197, 
> load=(requests=492, regions=55, usedHeap=3376, maxHeap=15993): File System 
> not available
> java.io.IOException: File system is not available
>         at 
> org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:130)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.checkFileSystem(HRegionServer.java:949)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2312)
>         at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source)
>         at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
>         at java.lang.reflect.Method.invoke(Method.java:597)
>         at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561)
>         at 
> org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025)
> Caused by: java.io.IOException: Filesystem closed
>         at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282)
>         at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:875)
>         at 
> org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:484)
>         at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:766)
>         at 
> org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:119)
>         ... 7 more
> All the IPC Server handlers exit
> 2010-11-02 17:36:24,097 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server 
> Responder, call increment([...@aa32978, 
> row=b4da93229cfc2cfb7d731f46da44f108info.compartilo1.59624 lfi 450313269132, 
> families={(family=COUNTERS_0, columns={+=1, a13g1+=1, cCL+=1, les_LA+=1}), 
> (family=COUNTERS_3600, columns={7ffa89a0+=1}}) from 10.38.15.37:49593: output 
> error
> 2010-11-02 17:36:24,094 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: 
> Could not append. Requesting close of hlog
> java.io.IOException: Filesystem closed
>         at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282)
>         at org.apache.hadoop.hdfs.DFSClient.access$800(DFSClient.java:71)
>         at 
> org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.writeChunk(DFSClient.java:3465)
>         at 
> org.apache.hadoop.fs.FSOutputSummer.writeChecksumChunk(FSOutputSummer.java:150)
>         at 
> org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:132)
>         at 
> org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:121)
>         at org.apache.hadoop.fs.FSOutputSummer.write1(FSOutputSummer.java:112)
>         at org.apache.hadoop.fs.FSOutputSummer.write(FSOutputSummer.java:86)
>         at 
> org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:49)
>         at java.io.DataOutputStream.write(DataOutputStream.java:90)
>         at 
> org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1073)
>         at 
> org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1037)
>         at 
> org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.append(SequenceFileLogWriter.java:133)
>         at 
> org.apache.hadoop.hbase.regionserver.wal.HLog.doWrite(HLog.java:1129)
>         at org.apache.hadoop.hbase.regionserver.wal.HLog.append(HLog.java:914)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegion.increment(HRegion.java:3038)
>         at 
> org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2309)
>         at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source)
>         at 
> sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25)
>         at java.lang.reflect.Method.invoke(Method.java:597)
>         at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561)
>         at 
> org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025)
> 2010-11-02 17:36:24,108 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server 
> handler 69 on 60020 caught: java.nio.channels.ClosedChannelException
>         at 
> sun.nio.ch.SocketChannelImpl.ensureWriteOpen(SocketChannelImpl.java:126)
>         at sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:324)
>         at 
> org.apache.hadoop.hbase.ipc.HBaseServer.channelWrite(HBaseServer.java:1282)
>         at 
> org.apache.hadoop.hbase.ipc.HBaseServer$Responder.processResponse(HBaseServer.java:713)
>         at 
> org.apache.hadoop.hbase.ipc.HBaseServer$Responder.doRespond(HBaseServer.java:778)
>         at 
> org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1053)
> 2010-11-02 17:36:24,113 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server 
> handler 69 on 60020: exiting
> ===
> If all the IPC Server handlers exit for whatever reason then the Region 
> Server should cleanly exit. I think my clients are getting stuck in 
> locateRegionInMeta() because of this stuck region server.
> ===

-- 
This message is automatically generated by JIRA.
-
You can reply to this email to add a comment to the issue online.

Reply via email to