[jira] [Updated] (HDFS-7429) DomainSocketWatcher.kick stuck
[ https://issues.apache.org/jira/browse/HDFS-7429?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] zhaoyunjiong updated HDFS-7429: --- Description: I found that some of our DataNodes hit the "exceeds the limit of concurrent xciever" error; the limit is 4K. After checking the stack traces, I suspect that org.apache.hadoop.net.unix.DomainSocket.writeArray0, which is called by DomainSocketWatcher.kick, is stuck: {quote} "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation #1]" daemon prio=10 tid=0x7f55c5576000 nid=0x385d waiting on condition [0x7f558d5d4000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x000740df9c90> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:867) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1197) at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:214) at java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:290) at org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:286) at org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) -- "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation #1]" daemon prio=10 tid=0x7f7de034c800 
nid=0x7b7 runnable [0x7f7db06c5000] java.lang.Thread.State: RUNNABLE at org.apache.hadoop.net.unix.DomainSocket.writeArray0(Native Method) at org.apache.hadoop.net.unix.DomainSocket.access$300(DomainSocket.java:45) at org.apache.hadoop.net.unix.DomainSocket$DomainOutputStream.write(DomainSocket.java:589) at org.apache.hadoop.net.unix.DomainSocketWatcher.kick(DomainSocketWatcher.java:350) at org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:303) at org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) at java.lang.Thread.run(Thread.java:745) "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation #1]" daemon prio=10 tid=0x7f55c5574000 nid=0x377a waiting on condition [0x7f558d7d6000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x000740df9cb0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043) at org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:306) at org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) at 
org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) at java.lang.Thread.run(Thread.java:745) "Thread-163852" daemon prio=10 tid=0x7f55c811c800 nid=0x6757 runnable [0x7f55aef6e000] java.lang.Thread.State: RUNNABLE at org.apache.hadoop.net.unix.DomainSocketWatcher.doPoll0(Native Method) at org.apache.hadoop.net.unix.DomainSocketWatcher.access$800(DomainSocketWatcher.java:52) at org.apache.hadoop.net.unix.DomainSocketWatcher$1.run(DomainSocketWatcher.java:457) at java.lang.Thr
[jira] [Updated] (HDFS-7429) DomainSocketWatcher.kick stuck
[ https://issues.apache.org/jira/browse/HDFS-7429?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] zhaoyunjiong updated HDFS-7429: --- Description: I found that some of our DataNodes hit the "exceeds the limit of concurrent xciever" error; the limit is 4K. After checking the stack traces, I suspect that org.apache.hadoop.net.unix.DomainSocket.writeArray0, which is called by DomainSocketWatcher.kick, is stuck: {quote} "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation #1]" daemon prio=10 tid=0x7f55c5576000 nid=0x385d waiting on condition [0x7f558d5d4000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x000740df9c90> (a java.util.concurrent.locks.ReentrantLock$NonfairSync) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:867) at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1197) at java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:214) at java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:290) at org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:286) at org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) -- "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation #1]" daemon prio=10 tid=0x7f55c5575000 
nid=0x37b3 runnable [0x7f558d3d2000] java.lang.Thread.State: RUNNABLE at org.apache.hadoop.net.unix.DomainSocket.writeArray0(Native Method) at org.apache.hadoop.net.unix.DomainSocket.access$300(DomainSocket.java:45) at org.apache.hadoop.net.unix.DomainSocket$DomainOutputStream.write(DomainSocket.java:589) at org.apache.hadoop.net.unix.DomainSocketWatcher.kick(DomainSocketWatcher.java:350) at org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:303) at org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation #1]" daemon prio=10 tid=0x7f55c5574000 nid=0x377a waiting on condition [0x7f558d7d6000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x000740df9cb0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043) at org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:306) at org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) at org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) at org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) at 
org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) at java.lang.Thread.run(Thread.java:745) "Thread-163852" daemon prio=10 tid=0x7f55c811c800 nid=0x6757 runnable [0x7f55aef6e000] java.lang.Thread.State: RUNNABLE at org.apache.hadoop.net.unix.DomainSocketWatcher.doPoll0(Native Method) at org.apache.hadoop.net.unix.DomainSocketWatcher.access$800(DomainSocketWatcher.java:52) at org.apache.hadoop.net.unix.DomainSocketWatcher$1.run(DomainSocketWatcher.java:457) at java.lang.Thread.run(Thread.java:745) {quote} was: I found
[jira] [Updated] (HDFS-7429) DomainSocketWatcher.kick stuck
[ https://issues.apache.org/jira/browse/HDFS-7429?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ] zhaoyunjiong updated HDFS-7429: --- Summary: DomainSocketWatcher.kick stuck (was: DomainSocketWatcher.doPoll0 stuck) > DomainSocketWatcher.kick stuck > -- > > Key: HDFS-7429 > URL: https://issues.apache.org/jira/browse/HDFS-7429 > Project: Hadoop HDFS > Issue Type: Bug > Reporter: zhaoyunjiong > Attachments: 11241021, 11241023, 11241025 > > > I found that some of our DataNodes hit the "exceeds the limit of concurrent > xciever" error; the limit is 4K. > After checking the stack traces, I suspect that DomainSocketWatcher.doPoll0 is stuck: > {quote} > "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation > #1]" daemon prio=10 tid=0x7f55c5576000 nid=0x385d waiting on condition > [0x7f558d5d4000] >java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x000740df9c90> (a > java.util.concurrent.locks.ReentrantLock$NonfairSync) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:834) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireQueued(AbstractQueuedSynchronizer.java:867) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer.acquire(AbstractQueuedSynchronizer.java:1197) > at > java.util.concurrent.locks.ReentrantLock$NonfairSync.lock(ReentrantLock.java:214) > at > java.util.concurrent.locks.ReentrantLock.lock(ReentrantLock.java:290) > at > org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:286) > at > org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) > at > org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) > at > org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) > at 
> org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) > at > org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) > -- > "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation > #1]" daemon prio=10 tid=0x7f55c5575000 nid=0x37b3 runnable > [0x7f558d3d2000] >java.lang.Thread.State: RUNNABLE > at org.apache.hadoop.net.unix.DomainSocket.writeArray0(Native Method) > at > org.apache.hadoop.net.unix.DomainSocket.access$300(DomainSocket.java:45) > at > org.apache.hadoop.net.unix.DomainSocket$DomainOutputStream.write(DomainSocket.java:589) > at > org.apache.hadoop.net.unix.DomainSocketWatcher.kick(DomainSocketWatcher.java:350) > at > org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:303) > at > org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) > at > org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) > at > org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) > at > org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) > at > org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) > "DataXceiver for client unix:/var/run/hadoop-hdfs/dn [Waiting for operation > #1]" daemon prio=10 tid=0x7f55c5574000 nid=0x377a waiting on condition > [0x7f558d7d6000] >java.lang.Thread.State: WAITING (parking) > at sun.misc.Unsafe.park(Native Method) > - parking to wait for <0x000740df9cb0> (a > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) > at java.util.concurrent.locks.LockSupport.park(LockSupport.java:186) > at > java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2043) > at > org.apache.hadoop.net.unix.DomainSocketWatcher.add(DomainSocketWatcher.java:306) > at > 
org.apache.hadoop.hdfs.server.datanode.ShortCircuitRegistry.createNewMemorySegment(ShortCircuitRegistry.java:283) > at > org.apache.hadoop.hdfs.server.datanode.DataXceiver.requestShortCircuitShm(DataXceiver.java:413) > at > org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.opRequestShortCircuitShm(Receiver.java:172) > at > org.apache.hadoop.hdfs.protocol.datatransfer.Receiver.processOp(Receiver.java:92) > at > org.apache.hadoop.hdfs.server.datanode.DataXceiver.run(DataXceiver.java:232) > at java.lang.Thread.run(Thread.java:745) >