Hello dev team, We are using Hadoop, Accumulo & Zookeeper in our environment, after enabling TLS for ZK we noticed that starting Accumulo master service hangs in an intermediate process as shown below and require to kill the process in-order for Accumulo master to start.
[user1@host1 ~]$ jps -m 23314 JournalNode 23011 NameNode 23539 DFSZKFailoverController *84118 Main org.apache.accumulo.master.state.SetGoalState NORMAL* 22590 QuorumPeerMain 89790 Jps -m [user1@host1 ~]$ *kill -9 84118* [user1@host1 ~]$ jps -m 23314 JournalNode 23011 NameNode 23539 DFSZKFailoverController 89892 Jps -m *89847 Main master* 22590 QuorumPeerMain [user1@host1 ~]$ Jstack collected during the hang shows 2 non-daemon threads (#27 & #30) while the rest are daemon threads. Would like to check with the dev team if "nioEventLoopGroup" threads are expected to be non-daemon? If so, any thoughts on what else might be causing the issue? I have copied only a portion of the jstack output, let me know in-case you need the full output. Fyi, I'm using Apache Zookeeper 3.5.7, Hadoop 3.2.1 & Accumulo 2.0. Let me know if you need any further details? Many thanks "org.apache.accumulo.master.state.SetGoalState-SendThread(host1:2281)" #25 daemon prio=5 os_prio=0 cpu=127.90ms elapsed=95.38s tid=0x0000000003a10800 nid=0x1624e waiting on condition [0x00007f5c7bd67000] java.lang.Thread.State: TIMED_WAITING (parking) at jdk.internal.misc.Unsafe.park(java.base@11.0.6/Native Method) - parking to wait for <0x000000070f0acf38> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.parkNanos(java.base@11.0.6 /LockSupport.java:234) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(java.base@11.0.6 /AbstractQueuedSynchronizer.java:2123) at java.util.concurrent.LinkedBlockingDeque.pollFirst(java.base@11.0.6 /LinkedBlockingDeque.java:513) at java.util.concurrent.LinkedBlockingDeque.poll(java.base@11.0.6 /LinkedBlockingDeque.java:675) at org.apache.zookeeper.ClientCnxnSocketNetty.doTransport(ClientCnxnSocketNetty.java:278) at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1223) Locked ownable synchronizers: - None "org.apache.accumulo.master.state.SetGoalState-EventThread" #26 daemon prio=5 os_prio=0 cpu=0.88ms elapsed=95.38s tid=0x0000000003a16800 nid=0x1624f waiting on condition [0x00007f5c7bc66000] java.lang.Thread.State: WAITING (parking) at jdk.internal.misc.Unsafe.park(java.base@11.0.6/Native Method) - parking to wait for <0x000000070f0f8af0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(java.base@11.0.6 /LockSupport.java:194) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(java.base@11.0.6 /AbstractQueuedSynchronizer.java:2081) at java.util.concurrent.LinkedBlockingQueue.take(java.base@11.0.6 /LinkedBlockingQueue.java:433) at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:506) Locked ownable synchronizers: - None "*nioEventLoopGroup-2-1*" #27 prio=10 os_prio=0 cpu=718.22ms elapsed=95.28s tid=0x0000000003cd2800 nid=0x16250 runnable [0x00007f5c7d6b9000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPoll.wait(java.base@11.0.6/Native Method) at sun.nio.ch.EPollSelectorImpl.doSelect(java.base@11.0.6 /EPollSelectorImpl.java:120) at sun.nio.ch.SelectorImpl.lockAndDoSelect(java.base@11.0.6 /SelectorImpl.java:124) - locked <0x000000070f079a28> (a io.netty.channel.nio.SelectedSelectionKeySet) - locked <0x000000070f06ca80> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(java.base@11.0.6/SelectorImpl.java:141) at io.netty.channel.nio.SelectedSelectionKeySetSelector.select(SelectedSelectionKeySetSelector.java:68) at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:803) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:457) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.lang.Thread.run(java.base@11.0.6/Thread.java:834) Locked ownable synchronizers: - None "org.apache.accumulo.master.state.SetGoalState-SendThread(host2:2281)" #28 daemon prio=5 os_prio=0 cpu=8.27ms elapsed=94.31s tid=0x0000000005942800 nid=0x16259 waiting on condition [0x00007f5c7b145000] java.lang.Thread.State: TIMED_WAITING (parking) at jdk.internal.misc.Unsafe.park(java.base@11.0.6/Native Method) - parking to wait for <0x000000070f2acbc0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.parkNanos(java.base@11.0.6 /LockSupport.java:234) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(java.base@11.0.6 /AbstractQueuedSynchronizer.java:2123) at java.util.concurrent.LinkedBlockingDeque.pollFirst(java.base@11.0.6 /LinkedBlockingDeque.java:513) at java.util.concurrent.LinkedBlockingDeque.poll(java.base@11.0.6 /LinkedBlockingDeque.java:675) at org.apache.zookeeper.ClientCnxnSocketNetty.doTransport(ClientCnxnSocketNetty.java:278) at org.apache.zookeeper.ClientCnxn$SendThread.run(ClientCnxn.java:1223) Locked ownable synchronizers: - None "org.apache.accumulo.master.state.SetGoalState-EventThread" #29 daemon prio=5 os_prio=0 cpu=0.25ms elapsed=94.31s tid=0x0000000005943800 nid=0x1625a waiting on condition [0x00007f5c7b044000] java.lang.Thread.State: WAITING (parking) at jdk.internal.misc.Unsafe.park(java.base@11.0.6/Native Method) - parking to wait for <0x000000070f2adff8> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(java.base@11.0.6 /LockSupport.java:194) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(java.base@11.0.6 /AbstractQueuedSynchronizer.java:2081) at java.util.concurrent.LinkedBlockingQueue.take(java.base@11.0.6 /LinkedBlockingQueue.java:433) at org.apache.zookeeper.ClientCnxn$EventThread.run(ClientCnxn.java:506) Locked ownable synchronizers: - None "*nioEventLoopGroup-3-1*" #30 prio=10 os_prio=0 cpu=297.70ms elapsed=94.30s tid=0x00000000044af800 nid=0x1625b runnable [0x00007f5c7b646000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPoll.wait(java.base@11.0.6/Native Method) at sun.nio.ch.EPollSelectorImpl.doSelect(java.base@11.0.6 /EPollSelectorImpl.java:120) at sun.nio.ch.SelectorImpl.lockAndDoSelect(java.base@11.0.6 /SelectorImpl.java:124) - locked <0x000000070f2ab868> (a io.netty.channel.nio.SelectedSelectionKeySet) - locked <0x000000070f2ab640> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(java.base@11.0.6/SelectorImpl.java:141) at io.netty.channel.nio.SelectedSelectionKeySetSelector.select(SelectedSelectionKeySetSelector.java:68) at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:803) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:457) at io.netty.util.concurrent.SingleThreadEventExecutor$4.run(SingleThreadEventExecutor.java:989) at io.netty.util.internal.ThreadExecutorMap$2.run(ThreadExecutorMap.java:74) at io.netty.util.concurrent.FastThreadLocalRunnable.run(FastThreadLocalRunnable.java:30) at java.lang.Thread.run(java.base@11.0.6/Thread.java:834) Locked ownable synchronizers: - None