[ https://issues.apache.org/jira/browse/HBASE-3196?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Prakash Khemani updated HBASE-3196: ----------------------------------- Description: The region server is stuck with the following jstack 2010-11-03 22:23:41 Full thread dump Java HotSpot(TM) 64-Bit Server VM (14.0-b16 mixed mode): "Attach Listener" daemon prio=10 tid=0x00002aaeb6774000 nid=0x3974 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" prio=10 tid=0x00002aaeb8449000 nid=0x3bbc waiting on condition [0x0000000043f67000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab7fd1130> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) at java.lang.Thread.run(Thread.java:619) "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" prio=10 tid=0x00002aaeb843f800 nid=0x3bbb waiting on condition [0x0000000043e66000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab7fd1130> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) at java.lang.Thread.run(Thread.java:619) "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" prio=10 tid=0x00002aaeb8447800 nid=0x3bba waiting on condition [0x0000000044068000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab7fd1130> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) at java.lang.Thread.run(Thread.java:619) "RMI Scheduler(0)" daemon prio=10 tid=0x00002aaeb48c4800 nid=0x1c97 waiting on condition [0x00000000580a7000] java.lang.Thread.State: TIMED_WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab773a118> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:198) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:1963) at java.util.concurrent.DelayQueue.take(DelayQueue.java:164) at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:583) at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:576) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) at java.lang.Thread.run(Thread.java:619) "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" daemon prio=10 tid=0x00002aaeb4804800 nid=0x17a0 waiting on condition [0x00000000582a9000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab7fca538> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) at java.lang.Thread.run(Thread.java:619) "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" daemon prio=10 tid=0x00002aaeb490a000 nid=0x179f waiting on condition [0x000000004345c000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab7fca538> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) at java.lang.Thread.run(Thread.java:619) "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" daemon prio=10 tid=0x00002aaeb4909000 nid=0x179e waiting on condition [0x000000004335b000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab7fca538> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) at java.lang.Thread.run(Thread.java:619) "LruBlockCache.EvictionThread" daemon prio=10 tid=0x00002aaeb889b000 nid=0x1767 in Object.wait() [0x0000000057ea5000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.lang.Object.wait(Object.java:485) at org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread.run(LruBlockCache.java:514) - locked <0x00002aaab7fdb728> (a org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread) "Timer thread for monitoring jvm" daemon prio=10 tid=0x00002aaebc0a5000 nid=0x1620 in Object.wait() [0x0000000043c64000] java.lang.Thread.State: TIMED_WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.util.TimerThread.mainLoop(Timer.java:509) - locked <0x00002aaab7e89c38> (a java.util.TaskQueue) at java.util.TimerThread.run(Timer.java:462) "Timer thread for monitoring hbase" daemon prio=10 tid=0x00002aaebc0a3800 nid=0x161f in Object.wait() [0x0000000043b63000] java.lang.Thread.State: TIMED_WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.util.TimerThread.mainLoop(Timer.java:509) - locked <0x00002aaab7fe95d8> (a java.util.TaskQueue) at java.util.TimerThread.run(Timer.java:462) "DestroyJavaVM" prio=10 tid=0x00002aaebc092000 nid=0x15ce waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "regionserver60020-EventThread" daemon prio=10 tid=0x00002aaeb5276800 nid=0x1606 waiting on condition [0x000000004325a000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab6bf6bb8> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477) "regionserver60020-SendThread(pumahbasectrl001.snc5.facebook.com:2181)" daemon prio=10 tid=0x00002aaeb5275800 nid=0x1605 runnable [0x0000000043159000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69) - locked <0x00002aaab6c02078> (a sun.nio.ch.Util$1) - locked <0x00002aaab6c02090> (a java.util.Collections$UnmodifiableSet) - locked <0x00002aaab66713d0> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80) at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066) "regionserver60020" prio=10 tid=0x00002aaeb858c800 nid=0x1604 waiting on condition [0x0000000043058000] java.lang.Thread.State: TIMED_WAITING (sleeping) at java.lang.Thread.sleep(Native Method) at org.apache.hadoop.hbase.util.Threads.sleep(Threads.java:126) at org.apache.hadoop.hbase.regionserver.HRegionServer.waitOnAllRegionsToClose(HRegionServer.java:645) at org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:611) at java.lang.Thread.run(Thread.java:619) "Timer thread for monitoring rpc" daemon prio=10 tid=0x00002aaeb8583000 nid=0x1602 in Object.wait() [0x0000000042f57000] java.lang.Thread.State: TIMED_WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.util.TimerThread.mainLoop(Timer.java:509) - locked <0x00002aaab6be7038> (a java.util.TaskQueue) at java.util.TimerThread.run(Timer.java:462) "main-EventThread" daemon prio=10 tid=0x00002aaeb842d000 nid=0x15f7 waiting on condition [0x000000004244c000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00002aaab6bc0720> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477) "main-SendThread(pumahbasectrl062.snc5.facebook.com:2181)" daemon prio=10 tid=0x00002aaeb8451800 nid=0x15f6 runnable [0x000000004234b000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69) - locked <0x00002aaab6bb08f0> (a sun.nio.ch.Util$1) - locked <0x00002aaab6bb0908> (a java.util.Collections$UnmodifiableSet) - locked <0x00002aaab764df90> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80) at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066) "RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83b9800 nid=0x15f2 runnable [0x0000000042149000] java.lang.Thread.State: RUNNABLE at java.net.PlainSocketImpl.socketAccept(Native Method) at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390) - locked <0x00002aaab6b935e8> (a java.net.SocksSocketImpl) at java.net.ServerSocket.implAccept(ServerSocket.java:453) at java.net.ServerSocket.accept(ServerSocket.java:421) at sun.management.jmxremote.LocalRMIServerSocketFactory$1.accept(LocalRMIServerSocketFactory.java:34) at sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369) at sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341) at java.lang.Thread.run(Thread.java:619) "RMI TCP Accept-8091" daemon prio=10 tid=0x00002aaeb83b3000 nid=0x15f1 runnable [0x0000000042048000] java.lang.Thread.State: RUNNABLE at java.net.PlainSocketImpl.socketAccept(Native Method) at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390) - locked <0x00002aaab7659d50> (a java.net.SocksSocketImpl) at java.net.ServerSocket.implAccept(ServerSocket.java:453) at java.net.ServerSocket.accept(ServerSocket.java:421) at sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369) at sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341) at java.lang.Thread.run(Thread.java:619) "RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83a0000 nid=0x15f0 runnable [0x0000000041f47000] java.lang.Thread.State: RUNNABLE at java.net.PlainSocketImpl.socketAccept(Native Method) at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390) - locked <0x00002aaab765c140> (a java.net.SocksSocketImpl) at java.net.ServerSocket.implAccept(ServerSocket.java:453) at java.net.ServerSocket.accept(ServerSocket.java:421) at sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369) at sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341) at java.lang.Thread.run(Thread.java:619) "Low Memory Detector" daemon prio=10 tid=0x00002aaeb8016000 nid=0x15eb runnable [0x0000000000000000] java.lang.Thread.State: RUNNABLE "CompilerThread1" daemon prio=10 tid=0x00002aaeb8013800 nid=0x15ea waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "CompilerThread0" daemon prio=10 tid=0x00002aaeb8011800 nid=0x15e9 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "JDWP Event Helper Thread" daemon prio=10 tid=0x00002aaeb800f800 nid=0x15e8 runnable [0x0000000000000000] java.lang.Thread.State: RUNNABLE "JDWP Transport Listener: dt_socket" daemon prio=10 tid=0x00002aaeb800c000 nid=0x15e7 runnable [0x0000000000000000] java.lang.Thread.State: RUNNABLE "Signal Dispatcher" daemon prio=10 tid=0x00002aaeb45d4800 nid=0x15e6 runnable [0x0000000000000000] java.lang.Thread.State: RUNNABLE "Surrogate Locker Thread (CMS)" daemon prio=10 tid=0x00002aaeb45d2800 nid=0x15e5 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "Finalizer" daemon prio=10 tid=0x00002aaeb440b000 nid=0x15e3 in Object.wait() [0x000000004173f000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:118) - locked <0x00002aaab6663ac0> (a java.lang.ref.ReferenceQueue$Lock) at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:134) at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159) "Reference Handler" daemon prio=10 tid=0x00002aaeb4409000 nid=0x15e2 in Object.wait() [0x000000004163e000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) at java.lang.Object.wait(Object.java:485) at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116) - locked <0x00002aaab6c01940> (a java.lang.ref.Reference$Lock) "VM Thread" prio=10 tid=0x00002aaeb4402000 nid=0x15e1 runnable "Gang worker#0 (Parallel GC Threads)" prio=10 tid=0x000000004011e800 nid=0x15cf runnable "Gang worker#1 (Parallel GC Threads)" prio=10 tid=0x0000000040120800 nid=0x15d0 runnable "Gang worker#2 (Parallel GC Threads)" prio=10 tid=0x0000000040122000 nid=0x15d1 runnable "Gang worker#3 (Parallel GC Threads)" prio=10 tid=0x0000000040124000 nid=0x15d2 runnable "Gang worker#4 (Parallel GC Threads)" prio=10 tid=0x0000000040126000 nid=0x15d3 runnable "Gang worker#5 (Parallel GC Threads)" prio=10 tid=0x0000000040127800 nid=0x15d4 runnable "Gang worker#6 (Parallel GC Threads)" prio=10 tid=0x0000000040129800 nid=0x15d5 runnable "Gang worker#7 (Parallel GC Threads)" prio=10 tid=0x00002aaab2526800 nid=0x15d6 runnable "Gang worker#8 (Parallel GC Threads)" prio=10 tid=0x00002aaab2528000 nid=0x15d7 runnable "Gang worker#9 (Parallel GC Threads)" prio=10 tid=0x00002aaab252a000 nid=0x15d8 runnable "Gang worker#10 (Parallel GC Threads)" prio=10 tid=0x00002aaab252c000 nid=0x15d9 runnable "Gang worker#11 (Parallel GC Threads)" prio=10 tid=0x00002aaab252d800 nid=0x15da runnable "Gang worker#12 (Parallel GC Threads)" prio=10 tid=0x00002aaab252f800 nid=0x15db runnable "Concurrent Mark-Sweep GC Thread" prio=10 tid=0x00002aaea3b8f000 nid=0x15e0 runnable "Gang worker#0 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a7e800 nid=0x15dc runnable "Gang worker#1 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a80000 nid=0x15dd runnable "Gang worker#2 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acb800 nid=0x15de runnable "Gang worker#3 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acd000 nid=0x15df runnable "VM Periodic Task Thread" prio=10 tid=0x00002aaeb83bc800 nid=0x15f3 waiting on condition JNI global references: 4205 === In the RS logs all the IPC Server handlers have died because HDFS failure to write to the log. First failure to append to the logs 2010-11-02 17:05:58,620 WARN org.apache.hadoop.hdfs.DFSClient: Packet 243111 [offsetInBlock=241298432 pktLen=2173] of blk_-4978761844464053668_783579 is timed out 2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery for block blk_-4978761844464053668_783579 bad datanode[2] 10.38.47.49:50010 2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery for block blk_-4978761844464053668_783579 in pipeline 10.38.15.45:50010, 10.38.47.59:50010, 10.38.47.49:50010: bad datanode 10.38.47.49:50010 2010-11-02 17:05:58,755 WARN org.apache.hadoop.hdfs.DFSClient: Failed recovery attempt #0 from primary datanode 10.38.15.45:50010 org.apache.hadoop.ipc.RPC$VersionMismatch: Protocol org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol version mismatch. (client = 5, server = 3) at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:452) at org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687) ... 2010-11-02 17:11:09,667 WARN org.apache.hadoop.hdfs.DFSClient: Failed recovery attempt #0 from primary datanode 10.38.47.49:50010 java.net.SocketTimeoutException: Call to /10.38.47.49:50020 failed on socket timeout exception: java.net.SocketTimeoutException: 300000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 remote=/10.38.47.49:50020] at org.apache.hadoop.ipc.Client.wrapException(Client.java:855) at org.apache.hadoop.ipc.Client.call(Client.java:827) at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:222) at $Proxy3.getProtocolVersion(Unknown Source) at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:447) at org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687) Caused by: java.net.SocketTimeoutException: 300000 millis timeout while waiting for channel to be ready for read. ch : java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 remote=/10.38.47.49:50020] at org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164) at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:155) at org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:128) at java.io.FilterInputStream.read(FilterInputStream.java:116) at org.apache.hadoop.ipc.Client$Connection$PingInputStream.read(Client.java:325) at java.io.BufferedInputStream.fill(BufferedInputStream.java:218) at java.io.BufferedInputStream.read(BufferedInputStream.java:237) at java.io.DataInputStream.readInt(DataInputStream.java:370) at org.apache.hadoop.ipc.Client$Connection.receiveResponse(Client.java:572) at org.apache.hadoop.ipc.Client$Connection.run(Client.java:500) 2010-11-02 17:11:09,669 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery for block blk_-4978761844464053668_783579 file /PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683 failed because recovery from primary datanode 10.38.47.49:50010 failed 1 times. Pipeline was 10.38.47.59:50010, 10.38.47.49:50010. Will retry... Error recovery fails. 2010-11-02 17:36:20,919 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: Could not append. Requesting close of hlog java.io.IOException: Reflection at org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:147) at org.apache.hadoop.hbase.regionserver.wal.HLog.hflush(HLog.java:1042) at org.apache.hadoop.hbase.regionserver.wal.HLog$LogSyncer.run(HLog.java:966) Caused by: java.lang.reflect.InvocationTargetException at sun.reflect.GeneratedMethodAccessor22.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) at java.lang.reflect.Method.invoke(Method.java:597) at org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:145) ... 2 more Caused by: java.io.IOException: Error Recovery for block blk_-4978761844464053668_783579 file /PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683 failed because recovery from primary datanode 10.38.47.59:50010 failed 6 times. Pipeline was 10.38.47.59:50010. Aborting... at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3065) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687) Regions start getting aborted because hlog close fails because filesystem is not available 2010-11-02 17:36:24,055 FATAL org.apache.hadoop.hbase.regionserver.HRegionServer: ABORTING region server serverName=pumahbase028.snc5.facebook.com,60020,1288733355197, load=(requests=492, regions=55, usedHeap=3376, maxHeap=15993): File System not available java.io.IOException: File system is not available at org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:130) at org.apache.hadoop.hbase.regionserver.HRegionServer.checkFileSystem(HRegionServer.java:949) at org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2312) at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) at java.lang.reflect.Method.invoke(Method.java:597) at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561) at org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025) Caused by: java.io.IOException: Filesystem closed at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282) at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:875) at org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:484) at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:766) at org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:119) ... 7 more All the IPC Server handlers exit 2010-11-02 17:36:24,097 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server Responder, call increment([...@aa32978, row=b4da93229cfc2cfb7d731f46da44f108info.compartilo1.59624 lfi 450313269132, families={(family=COUNTERS_0, columns={+=1, a13g1+=1, cCL+=1, les_LA+=1}), (family=COUNTERS_3600, columns={7ffa89a0+=1}}) from 10.38.15.37:49593: output error 2010-11-02 17:36:24,094 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: Could not append. Requesting close of hlog java.io.IOException: Filesystem closed at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282) at org.apache.hadoop.hdfs.DFSClient.access$800(DFSClient.java:71) at org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.writeChunk(DFSClient.java:3465) at org.apache.hadoop.fs.FSOutputSummer.writeChecksumChunk(FSOutputSummer.java:150) at org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:132) at org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:121) at org.apache.hadoop.fs.FSOutputSummer.write1(FSOutputSummer.java:112) at org.apache.hadoop.fs.FSOutputSummer.write(FSOutputSummer.java:86) at org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:49) at java.io.DataOutputStream.write(DataOutputStream.java:90) at org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1073) at org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1037) at org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.append(SequenceFileLogWriter.java:133) at org.apache.hadoop.hbase.regionserver.wal.HLog.doWrite(HLog.java:1129) at org.apache.hadoop.hbase.regionserver.wal.HLog.append(HLog.java:914) at org.apache.hadoop.hbase.regionserver.HRegion.increment(HRegion.java:3038) at org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2309) at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source) at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) at java.lang.reflect.Method.invoke(Method.java:597) at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561) at org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025) 2010-11-02 17:36:24,108 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server handler 69 on 60020 caught: java.nio.channels.ClosedChannelException at sun.nio.ch.SocketChannelImpl.ensureWriteOpen(SocketChannelImpl.java:126) at sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:324) at org.apache.hadoop.hbase.ipc.HBaseServer.channelWrite(HBaseServer.java:1282) at org.apache.hadoop.hbase.ipc.HBaseServer$Responder.processResponse(HBaseServer.java:713) at org.apache.hadoop.hbase.ipc.HBaseServer$Responder.doRespond(HBaseServer.java:778) at org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1053) 2010-11-02 17:36:24,113 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server handler 69 on 60020: exiting === If all the IPC Server handlers exit for whatever reason then the Region Server should cleanly exit. I think my clients are getting stuck in locateRegionInMeta() because of this stuck region server. === Affects Version/s: 0.90.0 Assignee: Jonathan Gray Summary: Regionserver stuck when after all IPC Server handlers fatal'd (was: Regionserver stuck when after all IPC Server handlers fatal') > Regionserver stuck when after all IPC Server handlers fatal'd > ------------------------------------------------------------- > > Key: HBASE-3196 > URL: https://issues.apache.org/jira/browse/HBASE-3196 > Project: HBase > Issue Type: Bug > Affects Versions: 0.90.0 > Reporter: Prakash Khemani > Assignee: Jonathan Gray > > The region server is stuck with the following jstack > 2010-11-03 22:23:41 > Full thread dump Java HotSpot(TM) 64-Bit Server VM (14.0-b16 mixed mode): > "Attach Listener" daemon prio=10 tid=0x00002aaeb6774000 nid=0x3974 waiting on > condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" > prio=10 tid=0x00002aaeb8449000 nid=0x3bbc waiting on condition > [0x0000000043f67000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab7fd1130> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) > at java.lang.Thread.run(Thread.java:619) > "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" > prio=10 tid=0x00002aaeb843f800 nid=0x3bbb waiting on condition > [0x0000000043e66000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab7fd1130> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) > at java.lang.Thread.run(Thread.java:619) > "RS_CLOSE_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" > prio=10 tid=0x00002aaeb8447800 nid=0x3bba waiting on condition > [0x0000000044068000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab7fd1130> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) > at java.lang.Thread.run(Thread.java:619) > "RMI Scheduler(0)" daemon prio=10 tid=0x00002aaeb48c4800 nid=0x1c97 waiting > on condition [0x00000000580a7000] > java.lang.Thread.State: TIMED_WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab773a118> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at > java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:198) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:1963) > at java.util.concurrent.DelayQueue.take(DelayQueue.java:164) > at > java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:583) > at > java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:576) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) > at java.lang.Thread.run(Thread.java:619) > "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-2" daemon > prio=10 tid=0x00002aaeb4804800 nid=0x17a0 waiting on condition > [0x00000000582a9000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab7fca538> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) > at java.lang.Thread.run(Thread.java:619) > "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-1" daemon > prio=10 tid=0x00002aaeb490a000 nid=0x179f waiting on condition > [0x000000004345c000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab7fca538> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) > at java.lang.Thread.run(Thread.java:619) > "RS_OPEN_REGION-pumahbase028.snc5.facebook.com,60020,1288733355197-0" daemon > prio=10 tid=0x00002aaeb4909000 nid=0x179e waiting on condition > [0x000000004335b000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab7fca538> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:947) > at > java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:907) > at java.lang.Thread.run(Thread.java:619) > "LruBlockCache.EvictionThread" daemon prio=10 tid=0x00002aaeb889b000 > nid=0x1767 in Object.wait() [0x0000000057ea5000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.lang.Object.wait(Object.java:485) > at > org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread.run(LruBlockCache.java:514) > - locked <0x00002aaab7fdb728> (a > org.apache.hadoop.hbase.io.hfile.LruBlockCache$EvictionThread) > "Timer thread for monitoring jvm" daemon prio=10 tid=0x00002aaebc0a5000 > nid=0x1620 in Object.wait() [0x0000000043c64000] > java.lang.Thread.State: TIMED_WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.util.TimerThread.mainLoop(Timer.java:509) > - locked <0x00002aaab7e89c38> (a java.util.TaskQueue) > at java.util.TimerThread.run(Timer.java:462) > "Timer thread for monitoring hbase" daemon prio=10 tid=0x00002aaebc0a3800 > nid=0x161f in Object.wait() [0x0000000043b63000] > java.lang.Thread.State: TIMED_WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.util.TimerThread.mainLoop(Timer.java:509) > - locked <0x00002aaab7fe95d8> (a java.util.TaskQueue) > at java.util.TimerThread.run(Timer.java:462) > "DestroyJavaVM" prio=10 tid=0x00002aaebc092000 nid=0x15ce waiting on > condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "regionserver60020-EventThread" daemon prio=10 tid=0x00002aaeb5276800 > nid=0x1606 waiting on condition [0x000000004325a000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab6bf6bb8> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477) > "regionserver60020-SendThread(pumahbasectrl001.snc5.facebook.com:2181)" > daemon prio=10 tid=0x00002aaeb5275800 nid=0x1605 runnable [0x0000000043159000] > java.lang.Thread.State: RUNNABLE > at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) > at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215) > at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65) > at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69) > - locked <0x00002aaab6c02078> (a sun.nio.ch.Util$1) > - locked <0x00002aaab6c02090> (a > java.util.Collections$UnmodifiableSet) > - locked <0x00002aaab66713d0> (a sun.nio.ch.EPollSelectorImpl) > at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80) > at > org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066) > "regionserver60020" prio=10 tid=0x00002aaeb858c800 nid=0x1604 waiting on > condition [0x0000000043058000] > java.lang.Thread.State: TIMED_WAITING (sleeping) > at java.lang.Thread.sleep(Native Method) > at org.apache.hadoop.hbase.util.Threads.sleep(Threads.java:126) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.waitOnAllRegionsToClose(HRegionServer.java:645) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.run(HRegionServer.java:611) > at java.lang.Thread.run(Thread.java:619) > "Timer thread for monitoring rpc" daemon prio=10 tid=0x00002aaeb8583000 > nid=0x1602 in Object.wait() [0x0000000042f57000] > java.lang.Thread.State: TIMED_WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.util.TimerThread.mainLoop(Timer.java:509) > - locked <0x00002aaab6be7038> (a java.util.TaskQueue) > at java.util.TimerThread.run(Timer.java:462) > "main-EventThread" daemon prio=10 tid=0x00002aaeb842d000 nid=0x15f7 waiting > on condition [0x000000004244c000] > java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x00002aaab6bc0720> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:158) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:1925) > at > java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:358) > at > org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:477) > "main-SendThread(pumahbasectrl062.snc5.facebook.com:2181)" daemon prio=10 > tid=0x00002aaeb8451800 nid=0x15f6 runnable [0x000000004234b000] > java.lang.Thread.State: RUNNABLE > at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) > at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:215) > at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:65) > at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:69) > - locked <0x00002aaab6bb08f0> (a sun.nio.ch.Util$1) > - locked <0x00002aaab6bb0908> (a > java.util.Collections$UnmodifiableSet) > - locked <0x00002aaab764df90> (a sun.nio.ch.EPollSelectorImpl) > at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:80) > at > org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1066) > "RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83b9800 nid=0x15f2 runnable > [0x0000000042149000] > java.lang.Thread.State: RUNNABLE > at java.net.PlainSocketImpl.socketAccept(Native Method) > at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390) > - locked <0x00002aaab6b935e8> (a java.net.SocksSocketImpl) > at java.net.ServerSocket.implAccept(ServerSocket.java:453) > at java.net.ServerSocket.accept(ServerSocket.java:421) > at > sun.management.jmxremote.LocalRMIServerSocketFactory$1.accept(LocalRMIServerSocketFactory.java:34) > at > sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369) > at > sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341) > at java.lang.Thread.run(Thread.java:619) > "RMI TCP Accept-8091" daemon prio=10 tid=0x00002aaeb83b3000 nid=0x15f1 > runnable [0x0000000042048000] > java.lang.Thread.State: RUNNABLE > at java.net.PlainSocketImpl.socketAccept(Native Method) > at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390) > - locked <0x00002aaab7659d50> (a java.net.SocksSocketImpl) > at java.net.ServerSocket.implAccept(ServerSocket.java:453) > at java.net.ServerSocket.accept(ServerSocket.java:421) > at > sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369) > at > sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341) > at java.lang.Thread.run(Thread.java:619) > "RMI TCP Accept-0" daemon prio=10 tid=0x00002aaeb83a0000 nid=0x15f0 runnable > [0x0000000041f47000] > java.lang.Thread.State: RUNNABLE > at java.net.PlainSocketImpl.socketAccept(Native Method) > at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:390) > - locked <0x00002aaab765c140> (a java.net.SocksSocketImpl) > at java.net.ServerSocket.implAccept(ServerSocket.java:453) > at java.net.ServerSocket.accept(ServerSocket.java:421) > at > sun.rmi.transport.tcp.TCPTransport$AcceptLoop.executeAcceptLoop(TCPTransport.java:369) > at > sun.rmi.transport.tcp.TCPTransport$AcceptLoop.run(TCPTransport.java:341) > at java.lang.Thread.run(Thread.java:619) > "Low Memory Detector" daemon prio=10 tid=0x00002aaeb8016000 nid=0x15eb > runnable [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "CompilerThread1" daemon prio=10 tid=0x00002aaeb8013800 nid=0x15ea waiting on > condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "CompilerThread0" daemon prio=10 tid=0x00002aaeb8011800 nid=0x15e9 waiting on > condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "JDWP Event Helper Thread" daemon prio=10 tid=0x00002aaeb800f800 nid=0x15e8 > runnable [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "JDWP Transport Listener: dt_socket" daemon prio=10 tid=0x00002aaeb800c000 > nid=0x15e7 runnable [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "Signal Dispatcher" daemon prio=10 tid=0x00002aaeb45d4800 nid=0x15e6 runnable > [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "Surrogate Locker Thread (CMS)" daemon prio=10 tid=0x00002aaeb45d2800 > nid=0x15e5 waiting on condition [0x0000000000000000] > java.lang.Thread.State: RUNNABLE > "Finalizer" daemon prio=10 tid=0x00002aaeb440b000 nid=0x15e3 in Object.wait() > [0x000000004173f000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:118) > - locked <0x00002aaab6663ac0> (a java.lang.ref.ReferenceQueue$Lock) > at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:134) > at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:159) > "Reference Handler" daemon prio=10 tid=0x00002aaeb4409000 nid=0x15e2 in > Object.wait() [0x000000004163e000] > java.lang.Thread.State: WAITING (on object monitor) > at java.lang.Object.wait(Native Method) > at java.lang.Object.wait(Object.java:485) > at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:116) > - locked <0x00002aaab6c01940> (a java.lang.ref.Reference$Lock) > "VM Thread" prio=10 tid=0x00002aaeb4402000 nid=0x15e1 runnable > "Gang worker#0 (Parallel GC Threads)" prio=10 tid=0x000000004011e800 > nid=0x15cf runnable > "Gang worker#1 (Parallel GC Threads)" prio=10 tid=0x0000000040120800 > nid=0x15d0 runnable > "Gang worker#2 (Parallel GC Threads)" prio=10 tid=0x0000000040122000 > nid=0x15d1 runnable > "Gang worker#3 (Parallel GC Threads)" prio=10 tid=0x0000000040124000 > nid=0x15d2 runnable > "Gang worker#4 (Parallel GC Threads)" prio=10 tid=0x0000000040126000 > nid=0x15d3 runnable > "Gang worker#5 (Parallel GC Threads)" prio=10 tid=0x0000000040127800 > nid=0x15d4 runnable > "Gang worker#6 (Parallel GC Threads)" prio=10 tid=0x0000000040129800 > nid=0x15d5 runnable > "Gang worker#7 (Parallel GC Threads)" prio=10 tid=0x00002aaab2526800 > nid=0x15d6 runnable > "Gang worker#8 (Parallel GC Threads)" prio=10 tid=0x00002aaab2528000 > nid=0x15d7 runnable > "Gang worker#9 (Parallel GC Threads)" prio=10 tid=0x00002aaab252a000 > nid=0x15d8 runnable > "Gang worker#10 (Parallel GC Threads)" prio=10 tid=0x00002aaab252c000 > nid=0x15d9 runnable > "Gang worker#11 (Parallel GC Threads)" prio=10 tid=0x00002aaab252d800 > nid=0x15da runnable > "Gang worker#12 (Parallel GC Threads)" prio=10 tid=0x00002aaab252f800 > nid=0x15db runnable > "Concurrent Mark-Sweep GC Thread" prio=10 tid=0x00002aaea3b8f000 nid=0x15e0 > runnable > "Gang worker#0 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a7e800 > nid=0x15dc runnable > "Gang worker#1 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3a80000 > nid=0x15dd runnable > "Gang worker#2 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acb800 > nid=0x15de runnable > "Gang worker#3 (Parallel CMS Threads)" prio=10 tid=0x00002aaea3acd000 > nid=0x15df runnable > "VM Periodic Task Thread" prio=10 tid=0x00002aaeb83bc800 nid=0x15f3 waiting > on condition > JNI global references: 4205 > === > In the RS logs all the IPC Server handlers have died because HDFS failure to > write to the log. > First failure to append to the logs > 2010-11-02 17:05:58,620 WARN org.apache.hadoop.hdfs.DFSClient: Packet 243111 > [offsetInBlock=241298432 pktLen=2173] of blk_-4978761844464053668_783579 is > timed out > 2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery > for block blk_-4978761844464053668_783579 bad datanode[2] 10.38.47.49:50010 > 2010-11-02 17:05:58,623 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery > for block blk_-4978761844464053668_783579 in pipeline 10.38.15.45:50010, > 10.38.47.59:50010, 10.38.47.49:50010: bad datanode 10.38.47.49:50010 > 2010-11-02 17:05:58,755 WARN org.apache.hadoop.hdfs.DFSClient: Failed > recovery attempt #0 from primary datanode 10.38.15.45:50010 > org.apache.hadoop.ipc.RPC$VersionMismatch: Protocol > org.apache.hadoop.hdfs.protocol.ClientDatanodeProtocol version mismatch. > (client = 5, server = 3) > at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:452) > at > org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687) > ... > 2010-11-02 17:11:09,667 WARN org.apache.hadoop.hdfs.DFSClient: Failed > recovery attempt #0 from primary datanode 10.38.47.49:50010 > java.net.SocketTimeoutException: Call to /10.38.47.49:50020 failed on socket > timeout exception: java.net.SocketTimeoutException: 300000 millis timeout > while waiting for channel to be ready for read. ch : > java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 > remote=/10.38.47.49:50020] > at org.apache.hadoop.ipc.Client.wrapException(Client.java:855) > at org.apache.hadoop.ipc.Client.call(Client.java:827) > at org.apache.hadoop.ipc.RPC$Invoker.invoke(RPC.java:222) > at $Proxy3.getProtocolVersion(Unknown Source) > at org.apache.hadoop.ipc.RPC.getProxy(RPC.java:447) > at > org.apache.hadoop.hdfs.DFSClient.createClientDatanodeProtocolProxy(DFSClient.java:194) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3027) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687) > Caused by: java.net.SocketTimeoutException: 300000 millis timeout while > waiting for channel to be ready for read. ch : > java.nio.channels.SocketChannel[connected local=/10.38.15.45:56492 > remote=/10.38.47.49:50020] > at > org.apache.hadoop.net.SocketIOWithTimeout.doIO(SocketIOWithTimeout.java:164) > at > org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:155) > at > org.apache.hadoop.net.SocketInputStream.read(SocketInputStream.java:128) > at java.io.FilterInputStream.read(FilterInputStream.java:116) > at > org.apache.hadoop.ipc.Client$Connection$PingInputStream.read(Client.java:325) > at java.io.BufferedInputStream.fill(BufferedInputStream.java:218) > at java.io.BufferedInputStream.read(BufferedInputStream.java:237) > at java.io.DataInputStream.readInt(DataInputStream.java:370) > at > org.apache.hadoop.ipc.Client$Connection.receiveResponse(Client.java:572) > at org.apache.hadoop.ipc.Client$Connection.run(Client.java:500) > 2010-11-02 17:11:09,669 WARN org.apache.hadoop.hdfs.DFSClient: Error Recovery > for block blk_-4978761844464053668_783579 file > /PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683 > failed because recovery from primary datanode 10.38.47.49:50010 failed 1 > times. Pipeline was 10.38.47.59:50010, 10.38.47.49:50010. Will retry... > Error recovery fails. > 2010-11-02 17:36:20,919 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: > Could not append. Requesting close of hlog > java.io.IOException: Reflection > at > org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:147) > at > org.apache.hadoop.hbase.regionserver.wal.HLog.hflush(HLog.java:1042) > at > org.apache.hadoop.hbase.regionserver.wal.HLog$LogSyncer.run(HLog.java:966) > Caused by: java.lang.reflect.InvocationTargetException > at sun.reflect.GeneratedMethodAccessor22.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) > at java.lang.reflect.Method.invoke(Method.java:597) > at > org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.sync(SequenceFileLogWriter.java:145) > ... 2 more > Caused by: java.io.IOException: Error Recovery for block > blk_-4978761844464053668_783579 file > /PUMAHBASE001-SNC5-HBASE/.logs/pumahbase028.snc5.facebook.com,60020,1288733355197/10.38.15.45%3A60020.1288742259683 > failed because recovery from primary datanode 10.38.47.59:50010 failed 6 > times. Pipeline was 10.38.47.59:50010. Aborting... > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.processDatanodeError(DFSClient.java:3065) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.access$1800(DFSClient.java:2467) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream$DataStreamer.run(DFSClient.java:2687) > Regions start getting aborted because hlog close fails because filesystem is > not available > 2010-11-02 17:36:24,055 FATAL > org.apache.hadoop.hbase.regionserver.HRegionServer: ABORTING region server > serverName=pumahbase028.snc5.facebook.com,60020,1288733355197, > load=(requests=492, regions=55, usedHeap=3376, maxHeap=15993): File System > not available > java.io.IOException: File system is not available > at > org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:130) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.checkFileSystem(HRegionServer.java:949) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2312) > at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) > at java.lang.reflect.Method.invoke(Method.java:597) > at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561) > at > org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025) > Caused by: java.io.IOException: Filesystem closed > at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282) > at org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:875) > at > org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:484) > at org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:766) > at > org.apache.hadoop.hbase.util.FSUtils.checkFileSystemAvailable(FSUtils.java:119) > ... 7 more > All the IPC Server handlers exit > 2010-11-02 17:36:24,097 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server > Responder, call increment([...@aa32978, > row=b4da93229cfc2cfb7d731f46da44f108info.compartilo1.59624 lfi 450313269132, > families={(family=COUNTERS_0, columns={+=1, a13g1+=1, cCL+=1, les_LA+=1}), > (family=COUNTERS_3600, columns={7ffa89a0+=1}}) from 10.38.15.37:49593: output > error > 2010-11-02 17:36:24,094 FATAL org.apache.hadoop.hbase.regionserver.wal.HLog: > Could not append. Requesting close of hlog > java.io.IOException: Filesystem closed > at org.apache.hadoop.hdfs.DFSClient.checkOpen(DFSClient.java:282) > at org.apache.hadoop.hdfs.DFSClient.access$800(DFSClient.java:71) > at > org.apache.hadoop.hdfs.DFSClient$DFSOutputStream.writeChunk(DFSClient.java:3465) > at > org.apache.hadoop.fs.FSOutputSummer.writeChecksumChunk(FSOutputSummer.java:150) > at > org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:132) > at > org.apache.hadoop.fs.FSOutputSummer.flushBuffer(FSOutputSummer.java:121) > at org.apache.hadoop.fs.FSOutputSummer.write1(FSOutputSummer.java:112) > at org.apache.hadoop.fs.FSOutputSummer.write(FSOutputSummer.java:86) > at > org.apache.hadoop.fs.FSDataOutputStream$PositionCache.write(FSDataOutputStream.java:49) > at java.io.DataOutputStream.write(DataOutputStream.java:90) > at > org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1073) > at > org.apache.hadoop.io.SequenceFile$Writer.append(SequenceFile.java:1037) > at > org.apache.hadoop.hbase.regionserver.wal.SequenceFileLogWriter.append(SequenceFileLogWriter.java:133) > at > org.apache.hadoop.hbase.regionserver.wal.HLog.doWrite(HLog.java:1129) > at org.apache.hadoop.hbase.regionserver.wal.HLog.append(HLog.java:914) > at > org.apache.hadoop.hbase.regionserver.HRegion.increment(HRegion.java:3038) > at > org.apache.hadoop.hbase.regionserver.HRegionServer.increment(HRegionServer.java:2309) > at sun.reflect.GeneratedMethodAccessor21.invoke(Unknown Source) > at > sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:25) > at java.lang.reflect.Method.invoke(Method.java:597) > at org.apache.hadoop.hbase.ipc.HBaseRPC$Server.call(HBaseRPC.java:561) > at > org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1025) > 2010-11-02 17:36:24,108 WARN org.apache.hadoop.ipc.HBaseServer: IPC Server > handler 69 on 60020 caught: java.nio.channels.ClosedChannelException > at > sun.nio.ch.SocketChannelImpl.ensureWriteOpen(SocketChannelImpl.java:126) > at sun.nio.ch.SocketChannelImpl.write(SocketChannelImpl.java:324) > at > org.apache.hadoop.hbase.ipc.HBaseServer.channelWrite(HBaseServer.java:1282) > at > org.apache.hadoop.hbase.ipc.HBaseServer$Responder.processResponse(HBaseServer.java:713) > at > org.apache.hadoop.hbase.ipc.HBaseServer$Responder.doRespond(HBaseServer.java:778) > at > org.apache.hadoop.hbase.ipc.HBaseServer$Handler.run(HBaseServer.java:1053) > 2010-11-02 17:36:24,113 INFO org.apache.hadoop.ipc.HBaseServer: IPC Server > handler 69 on 60020: exiting > === > If all the IPC Server handlers exit for whatever reason then the Region > Server should cleanly exit. I think my clients are getting stuck in > locateRegionInMeta() because of this stuck region server. > === -- This message is automatically generated by JIRA. - You can reply to this email to add a comment to the issue online.