[ https://issues.apache.org/jira/browse/SPARK-19764?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15889566#comment-15889566 ]
Ari Gesher commented on SPARK-19764: ------------------------------------ And here's the stuck Executor: {noformat} Full thread dump OpenJDK 64-Bit Server VM (25.121-b13 mixed mode): "shuffle-server-1" #30 daemon prio=5 os_prio=0 tid=0x00007fdf9400d800 nid=0xd22 runnable [0x00007fdfc4726000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000c014b990> (a io.netty.channel.nio.SelectedSelectionKeySet) - locked <0x00000000c014da10> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000c014b8f8> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:622) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:310) at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) at java.lang.Thread.run(Thread.java:745) "File appending thread for /home/ubuntu/app-20170228223629-0000/1/stderr" #56 daemon prio=5 os_prio=0 tid=0x00007fdf7803b000 nid=0xce7 runnable [0x00007fdfc4827000] java.lang.Thread.State: RUNNABLE at java.io.FileInputStream.readBytes(Native Method) at java.io.FileInputStream.read(FileInputStream.java:255) at java.io.BufferedInputStream.read1(BufferedInputStream.java:284) at java.io.BufferedInputStream.read(BufferedInputStream.java:345) - locked <0x00000000f16a3010> (a java.lang.UNIXProcess$ProcessPipeInputStream) at java.io.FilterInputStream.read(FilterInputStream.java:107) at org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply$mcV$sp(FileAppender.scala:68) at org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62) at 
org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1310) at org.apache.spark.util.logging.FileAppender.appendStreamToFile(FileAppender.scala:78) at org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply$mcV$sp(FileAppender.scala:39) at org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39) at org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39) at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1953) at org.apache.spark.util.logging.FileAppender$$anon$1.run(FileAppender.scala:38) "File appending thread for /home/ubuntu/app-20170228223629-0000/1/stdout" #55 daemon prio=5 os_prio=0 tid=0x00007fdf78036800 nid=0xcde runnable [0x00007fdfc4e2b000] java.lang.Thread.State: RUNNABLE at java.io.FileInputStream.readBytes(Native Method) at java.io.FileInputStream.read(FileInputStream.java:255) at java.io.BufferedInputStream.read1(BufferedInputStream.java:284) at java.io.BufferedInputStream.read(BufferedInputStream.java:345) - locked <0x00000000f16a0f50> (a java.lang.UNIXProcess$ProcessPipeInputStream) at java.io.FilterInputStream.read(FilterInputStream.java:107) at org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply$mcV$sp(FileAppender.scala:68) at org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62) at org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1310) at org.apache.spark.util.logging.FileAppender.appendStreamToFile(FileAppender.scala:78) at org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply$mcV$sp(FileAppender.scala:39) at org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39) at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39) at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1953) at org.apache.spark.util.logging.FileAppender$$anon$1.run(FileAppender.scala:38) "process reaper" #54 daemon prio=10 os_prio=0 tid=0x00007fdf78032800 nid=0xcdc runnable [0x00007fdfc6de2000] java.lang.Thread.State: RUNNABLE at java.lang.UNIXProcess.waitForProcessExit(Native Method) at java.lang.UNIXProcess.lambda$initStreams$3(UNIXProcess.java:289) at java.lang.UNIXProcess$$Lambda$7/1471086700.run(Unknown Source) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "ExecutorRunner for app-20170228223629-0000/1" #53 daemon prio=5 os_prio=0 tid=0x00007fdfa00af800 nid=0xcda in Object.wait() [0x00007fdfc502d000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) - waiting on <0x00000000f169ec58> (a java.lang.UNIXProcess) at java.lang.Object.wait(Object.java:502) at java.lang.UNIXProcess.waitFor(UNIXProcess.java:395) - locked <0x00000000f169ec58> (a java.lang.UNIXProcess) at org.apache.spark.deploy.worker.ExecutorRunner.org$apache$spark$deploy$worker$ExecutorRunner$$fetchAndRunExecutor(ExecutorRunner.scala:177) at org.apache.spark.deploy.worker.ExecutorRunner$$anon$1.run(ExecutorRunner.scala:73) "threadDeathWatcher-2-1" #51 daemon prio=1 os_prio=0 tid=0x00007fdf6800d800 nid=0xab3 waiting on condition [0x00007fdfc4928000] java.lang.Thread.State: TIMED_WAITING (sleeping) at java.lang.Thread.sleep(Native Method) at io.netty.util.ThreadDeathWatcher$Watcher.run(ThreadDeathWatcher.java:137) at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:137) at java.lang.Thread.run(Thread.java:745) "shuffle-client-0" #21 daemon prio=5 os_prio=0 tid=0x00007fdf70006000 nid=0xab1 
runnable [0x00007fdfc4a29000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000c0235758> (a io.netty.channel.nio.SelectedSelectionKeySet) - locked <0x00000000c028f760> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000c02356c0> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:622) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:310) at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) at java.lang.Thread.run(Thread.java:745) "netty-rpc-env-timeout" #50 daemon prio=5 os_prio=0 tid=0x00007fdf78015800 nid=0xab0 waiting on condition [0x00007fdfc4d2a000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c01d19a8> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1081) at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "worker-forward-message-scheduler" #48 daemon prio=5 os_prio=0 tid=0x00007fdfb819d000 nid=0xaa5 waiting on condition 
[0x00007fdfc4f2c000] java.lang.Thread.State: TIMED_WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c009edb0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1093) at java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809) at java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "WorkerUI-46" #46 daemon prio=5 os_prio=0 tid=0x00007fdfb8116800 nid=0xa9a waiting on condition [0x00007fdfc512e000] java.lang.Thread.State: TIMED_WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000ebad5008> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) at org.spark_project.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:389) at org.spark_project.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:531) at org.spark_project.jetty.util.thread.QueuedThreadPool.access$700(QueuedThreadPool.java:47) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:590) at java.lang.Thread.run(Thread.java:745) "WorkerUI-45" #45 daemon prio=5 os_prio=0 tid=0x00007fdfb8115000 nid=0xa99 waiting on condition [0x00007fdfc522f000] java.lang.Thread.State: 
TIMED_WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000ebad5008> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) at org.spark_project.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:389) at org.spark_project.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:531) at org.spark_project.jetty.util.thread.QueuedThreadPool.access$700(QueuedThreadPool.java:47) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:590) at java.lang.Thread.run(Thread.java:745) "WorkerUI-44" #44 daemon prio=5 os_prio=0 tid=0x00007fdfb8113000 nid=0xa98 waiting on condition [0x00007fdfc5330000] java.lang.Thread.State: TIMED_WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000ebad5008> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) at org.spark_project.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:389) at org.spark_project.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:531) at org.spark_project.jetty.util.thread.QueuedThreadPool.access$700(QueuedThreadPool.java:47) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:590) at java.lang.Thread.run(Thread.java:745) "WorkerUI-43-acceptor-0@410ad31d-ServerConnector@4fcd354c{HTTP/1.1}{172.31.20.156:8081}" #43 daemon prio=5 os_prio=0 tid=0x00007fdfb8111800 nid=0xa97 runnable [0x00007fdfc5431000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.ServerSocketChannelImpl.accept0(Native Method) at 
sun.nio.ch.ServerSocketChannelImpl.accept(ServerSocketChannelImpl.java:422) at sun.nio.ch.ServerSocketChannelImpl.accept(ServerSocketChannelImpl.java:250) - locked <0x00000000ebdaf3b0> (a java.lang.Object) at org.spark_project.jetty.server.ServerConnector.accept(ServerConnector.java:377) at org.spark_project.jetty.server.AbstractConnector$Acceptor.run(AbstractConnector.java:500) at org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) at java.lang.Thread.run(Thread.java:745) "WorkerUI-42-selector-ServerConnectorManager@19274cfc/3" #42 daemon prio=5 os_prio=0 tid=0x00007fdfb8110000 nid=0xa96 runnable [0x00007fdfc5532000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000ebe19238> (a sun.nio.ch.Util$3) - locked <0x00000000ebe19228> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000ebe19110> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101) at org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601) at org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550) at org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52) at org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) at java.lang.Thread.run(Thread.java:745) "WorkerUI-41-selector-ServerConnectorManager@19274cfc/2" #41 daemon prio=5 os_prio=0 tid=0x00007fdfb810e000 nid=0xa95 runnable [0x00007fdfc5633000] 
java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000ebe08420> (a sun.nio.ch.Util$3) - locked <0x00000000ebe08410> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000ebe082f8> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101) at org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601) at org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550) at org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52) at org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) at java.lang.Thread.run(Thread.java:745) "WorkerUI-40-selector-ServerConnectorManager@19274cfc/1" #40 daemon prio=5 os_prio=0 tid=0x00007fdfb810d000 nid=0xa94 runnable [0x00007fdfc5734000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000ebdf7608> (a sun.nio.ch.Util$3) - locked <0x00000000ebdf75f8> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000ebdf74e0> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101) at org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601) at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550) at org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52) at org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) at java.lang.Thread.run(Thread.java:745) "WorkerUI-39-selector-ServerConnectorManager@19274cfc/0" #39 daemon prio=5 os_prio=0 tid=0x00007fdfb8104800 nid=0xa93 runnable [0x00007fdfc5835000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93) at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000ebddedf8> (a sun.nio.ch.Util$3) - locked <0x00000000ebddede8> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000ebddecd0> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101) at org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601) at org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550) at org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52) at org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635) at org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555) at java.lang.Thread.run(Thread.java:745) "shuffle-server-0" #29 daemon prio=5 os_prio=0 tid=0x00007fe021198800 nid=0xa8f runnable [0x00007fdfc5d36000] java.lang.Thread.State: RUNNABLE at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method) at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269) at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93) at 
sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86) - locked <0x00000000c0025160> (a io.netty.channel.nio.SelectedSelectionKeySet) - locked <0x00000000c0025180> (a java.util.Collections$UnmodifiableSet) - locked <0x00000000c0025118> (a sun.nio.ch.EPollSelectorImpl) at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97) at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:622) at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:310) at io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-7" #20 daemon prio=5 os_prio=0 tid=0x00007fe020f9d000 nid=0xa8b waiting on condition [0x00007fdfc624a000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-6" #19 daemon prio=5 os_prio=0 tid=0x00007fe020f9b000 nid=0xa8a waiting on condition [0x00007fdfc63a1000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-5" #18 daemon prio=5 os_prio=0 tid=0x00007fe020f99800 nid=0xa87 waiting on condition [0x00007fdfc64a2000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-4" #17 daemon prio=5 os_prio=0 tid=0x00007fe020f97800 nid=0xa86 waiting on condition [0x00007fdfc65a3000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-3" #16 daemon prio=5 os_prio=0 tid=0x00007fe020f96000 nid=0xa84 waiting on condition [0x00007fdfc66a4000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-2" #15 daemon prio=5 os_prio=0 tid=0x00007fe020f8b800 nid=0xa81 waiting on condition [0x00007fdfc67a5000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-1" #14 daemon prio=5 os_prio=0 tid=0x00007fe020f8a800 nid=0xa7f waiting on condition 
[0x00007fdfc68a6000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "dispatcher-event-loop-0" #13 daemon prio=5 os_prio=0 tid=0x00007fe020f89800 nid=0xa7d waiting on condition [0x00007fdfc6da9000] java.lang.Thread.State: WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c00156d0> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039) at java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) "Service Thread" #9 daemon prio=9 os_prio=0 tid=0x00007fe0200d2000 nid=0xa6d runnable [0x0000000000000000] java.lang.Thread.State: RUNNABLE "C1 CompilerThread3" #8 daemon prio=9 os_prio=0 tid=0x00007fe0200c5000 nid=0xa67 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "C2 CompilerThread2" #7 daemon prio=9 os_prio=0 
tid=0x00007fe0200c0800 nid=0xa66 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "C2 CompilerThread1" #6 daemon prio=9 os_prio=0 tid=0x00007fe0200be800 nid=0xa65 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "C2 CompilerThread0" #5 daemon prio=9 os_prio=0 tid=0x00007fe0200bb800 nid=0xa64 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "Signal Dispatcher" #4 daemon prio=9 os_prio=0 tid=0x00007fe0200b9800 nid=0xa63 waiting on condition [0x0000000000000000] java.lang.Thread.State: RUNNABLE "Finalizer" #3 daemon prio=8 os_prio=0 tid=0x00007fe020092800 nid=0xa61 in Object.wait() [0x00007fdfedcfb000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) - waiting on <0x00000000c001df40> (a java.lang.ref.ReferenceQueue$Lock) at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:143) - locked <0x00000000c001df40> (a java.lang.ref.ReferenceQueue$Lock) at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:164) at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:209) "Reference Handler" #2 daemon prio=10 os_prio=0 tid=0x00007fe02008e000 nid=0xa5f in Object.wait() [0x00007fdfeddfc000] java.lang.Thread.State: WAITING (on object monitor) at java.lang.Object.wait(Native Method) - waiting on <0x00000000c00351e8> (a java.lang.ref.Reference$Lock) at java.lang.Object.wait(Object.java:502) at java.lang.ref.Reference.tryHandlePending(Reference.java:191) - locked <0x00000000c00351e8> (a java.lang.ref.Reference$Lock) at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:153) "main" #1 prio=5 os_prio=0 tid=0x00007fe020010800 nid=0xa4c waiting on condition [0x00007fe029d71000] java.lang.Thread.State: TIMED_WAITING (parking) at sun.misc.Unsafe.park(Native Method) - parking to wait for <0x00000000c01c5b70> (a java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject) at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215) at java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078) at java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1465) at org.apache.spark.rpc.netty.Dispatcher.awaitTermination(Dispatcher.scala:180) at org.apache.spark.rpc.netty.NettyRpcEnv.awaitTermination(NettyRpcEnv.scala:273) at org.apache.spark.deploy.worker.Worker$.main(Worker.scala:696) at org.apache.spark.deploy.worker.Worker.main(Worker.scala) "VM Thread" os_prio=0 tid=0x00007fe020086000 nid=0xa5e runnable "GC task thread#0 (ParallelGC)" os_prio=0 tid=0x00007fe020025800 nid=0xa4d runnable "GC task thread#1 (ParallelGC)" os_prio=0 tid=0x00007fe020027800 nid=0xa4f runnable "GC task thread#2 (ParallelGC)" os_prio=0 tid=0x00007fe020029000 nid=0xa51 runnable "GC task thread#3 (ParallelGC)" os_prio=0 tid=0x00007fe02002b000 nid=0xa53 runnable "GC task thread#4 (ParallelGC)" os_prio=0 tid=0x00007fe02002c800 nid=0xa55 runnable "GC task thread#5 (ParallelGC)" os_prio=0 tid=0x00007fe02002e800 nid=0xa57 runnable "GC task thread#6 (ParallelGC)" os_prio=0 tid=0x00007fe020030000 nid=0xa59 runnable "GC task thread#7 (ParallelGC)" os_prio=0 tid=0x00007fe020032000 nid=0xa5b runnable "VM Periodic Task Thread" os_prio=0 tid=0x00007fe0200d4800 nid=0xa6e waiting on condition JNI global references: 265 Heap PSYoungGen total 286720K, used 138020K [0x00000000eab00000, 0x00000000feb00000, 0x0000000100000000) eden space 245760K, 56% used [0x00000000eab00000,0x00000000f31c90b0,0x00000000f9b00000) from space 40960K, 0% used [0x00000000f9b00000,0x00000000f9b00000,0x00000000fc300000) to space 40960K, 0% used [0x00000000fc300000,0x00000000fc300000,0x00000000feb00000) ParOldGen total 263168K, used 16900K [0x00000000c0000000, 0x00000000d0100000, 0x00000000eab00000) object space 263168K, 6% used [0x00000000c0000000,0x00000000c10812e8,0x00000000d0100000) Metaspace used 25730K, 
capacity 25990K, committed 26112K, reserved 1071104K class space used 3462K, capacity 3533K, committed 3584K, reserved 1048576K {noformat} > Executors hang with supposedly running task that are really finished. > --------------------------------------------------------------------- > > Key: SPARK-19764 > URL: https://issues.apache.org/jira/browse/SPARK-19764 > Project: Spark > Issue Type: Bug > Components: PySpark, Spark Core > Affects Versions: 2.0.2 > Environment: Ubuntu 16.04 LTS > OpenJDK Runtime Environment (build 1.8.0_121-8u121-b13-0ubuntu1.16.04.2-b13) > Spark 2.0.2 - Spark Cluster Manager > Reporter: Ari Gesher > Attachments: driver-log-stderr.log, executor-2.log, netty-6153.jpg, > SPARK-19764.tgz > > > We've come across a job that won't finish. Running on a six-node cluster, > each of the executors ends up with 5-7 tasks that are never marked as > completed. > Here's an excerpt from the web UI: > ||Index ▴||ID||Attempt||Status||Locality Level||Executor ID / Host||Launch > Time||Duration||Scheduler Delay||Task Deserialization Time||GC Time||Result > Serialization Time||Getting Result Time||Peak Execution Memory||Shuffle Read > Size / Records||Errors|| > |105 | 1131 | 0 | SUCCESS |PROCESS_LOCAL |4 / 172.31.24.171 | > 2017/02/27 22:51:36 | 1.9 min | 9 ms | 4 ms | 0.7 s | 2 ms| 6 ms| > 384.1 MB| 90.3 MB / 572 | | > |106| 1168| 0| RUNNING |ANY| 2 / 172.31.16.112| 2017/02/27 > 22:53:25| 6.5 h |0 ms| 0 ms| 1 s |0 ms| 0 ms| |384.1 MB > |98.7 MB / 624 | | > However, the Executor reports the task as finished: > {noformat} > 17/02/27 22:53:25 INFO Executor: Running task 106.0 in stage 5.0 (TID 1168) > 17/02/27 22:55:29 INFO Executor: Finished task 106.0 in stage 5.0 (TID 1168). 
> 2633558 bytes result sent via BlockManager) > {noformat} > Full log from this executor and the {{stderr}} from > {{app-20170227223614-0001/2/stderr}} are attached. -- This message was sent by Atlassian JIRA (v6.3.15#6346) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org