[ 
https://issues.apache.org/jira/browse/SPARK-19764?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15889566#comment-15889566
 ] 

Ari Gesher commented on SPARK-19764:
------------------------------------

And here's the stuck Executor:

{noformat}
Full thread dump OpenJDK 64-Bit Server VM (25.121-b13 mixed mode):

"shuffle-server-1" #30 daemon prio=5 os_prio=0 tid=0x00007fdf9400d800 nid=0xd22 
runnable [0x00007fdfc4726000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked <0x00000000c014b990> (a 
io.netty.channel.nio.SelectedSelectionKeySet)
        - locked <0x00000000c014da10> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000c014b8f8> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:622)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:310)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
        at java.lang.Thread.run(Thread.java:745)

"File appending thread for /home/ubuntu/app-20170228223629-0000/1/stderr" #56 
daemon prio=5 os_prio=0 tid=0x00007fdf7803b000 nid=0xce7 runnable 
[0x00007fdfc4827000]
   java.lang.Thread.State: RUNNABLE
        at java.io.FileInputStream.readBytes(Native Method)
        at java.io.FileInputStream.read(FileInputStream.java:255)
        at java.io.BufferedInputStream.read1(BufferedInputStream.java:284)
        at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
        - locked <0x00000000f16a3010> (a 
java.lang.UNIXProcess$ProcessPipeInputStream)
        at java.io.FilterInputStream.read(FilterInputStream.java:107)
        at 
org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply$mcV$sp(FileAppender.scala:68)
        at 
org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62)
        at 
org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1310)
        at 
org.apache.spark.util.logging.FileAppender.appendStreamToFile(FileAppender.scala:78)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply$mcV$sp(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1953)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1.run(FileAppender.scala:38)

"File appending thread for /home/ubuntu/app-20170228223629-0000/1/stdout" #55 
daemon prio=5 os_prio=0 tid=0x00007fdf78036800 nid=0xcde runnable 
[0x00007fdfc4e2b000]
   java.lang.Thread.State: RUNNABLE
        at java.io.FileInputStream.readBytes(Native Method)
        at java.io.FileInputStream.read(FileInputStream.java:255)
        at java.io.BufferedInputStream.read1(BufferedInputStream.java:284)
        at java.io.BufferedInputStream.read(BufferedInputStream.java:345)
        - locked <0x00000000f16a0f50> (a 
java.lang.UNIXProcess$ProcessPipeInputStream)
        at java.io.FilterInputStream.read(FilterInputStream.java:107)
        at 
org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply$mcV$sp(FileAppender.scala:68)
        at 
org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62)
        at 
org.apache.spark.util.logging.FileAppender$$anonfun$appendStreamToFile$1.apply(FileAppender.scala:62)
        at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1310)
        at 
org.apache.spark.util.logging.FileAppender.appendStreamToFile(FileAppender.scala:78)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply$mcV$sp(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1$$anonfun$run$1.apply(FileAppender.scala:39)
        at org.apache.spark.util.Utils$.logUncaughtExceptions(Utils.scala:1953)
        at 
org.apache.spark.util.logging.FileAppender$$anon$1.run(FileAppender.scala:38)

"process reaper" #54 daemon prio=10 os_prio=0 tid=0x00007fdf78032800 nid=0xcdc 
runnable [0x00007fdfc6de2000]
   java.lang.Thread.State: RUNNABLE
        at java.lang.UNIXProcess.waitForProcessExit(Native Method)
        at java.lang.UNIXProcess.lambda$initStreams$3(UNIXProcess.java:289)
        at java.lang.UNIXProcess$$Lambda$7/1471086700.run(Unknown Source)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"ExecutorRunner for app-20170228223629-0000/1" #53 daemon prio=5 os_prio=0 
tid=0x00007fdfa00af800 nid=0xcda in Object.wait() [0x00007fdfc502d000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000000f169ec58> (a java.lang.UNIXProcess)
        at java.lang.Object.wait(Object.java:502)
        at java.lang.UNIXProcess.waitFor(UNIXProcess.java:395)
        - locked <0x00000000f169ec58> (a java.lang.UNIXProcess)
        at 
org.apache.spark.deploy.worker.ExecutorRunner.org$apache$spark$deploy$worker$ExecutorRunner$$fetchAndRunExecutor(ExecutorRunner.scala:177)
        at 
org.apache.spark.deploy.worker.ExecutorRunner$$anon$1.run(ExecutorRunner.scala:73)

"threadDeathWatcher-2-1" #51 daemon prio=1 os_prio=0 tid=0x00007fdf6800d800 
nid=0xab3 waiting on condition [0x00007fdfc4928000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at 
io.netty.util.ThreadDeathWatcher$Watcher.run(ThreadDeathWatcher.java:137)
        at 
io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:137)
        at java.lang.Thread.run(Thread.java:745)

"shuffle-client-0" #21 daemon prio=5 os_prio=0 tid=0x00007fdf70006000 nid=0xab1 
runnable [0x00007fdfc4a29000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked <0x00000000c0235758> (a 
io.netty.channel.nio.SelectedSelectionKeySet)
        - locked <0x00000000c028f760> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000c02356c0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:622)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:310)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
        at java.lang.Thread.run(Thread.java:745)

"netty-rpc-env-timeout" #50 daemon prio=5 os_prio=0 tid=0x00007fdf78015800 
nid=0xab0 waiting on condition [0x00007fdfc4d2a000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c01d19a8> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1081)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"worker-forward-message-scheduler" #48 daemon prio=5 os_prio=0 
tid=0x00007fdfb819d000 nid=0xaa5 waiting on condition [0x00007fdfc4f2c000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c009edb0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:1093)
        at 
java.util.concurrent.ScheduledThreadPoolExecutor$DelayedWorkQueue.take(ScheduledThreadPoolExecutor.java:809)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1067)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1127)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-46" #46 daemon prio=5 os_prio=0 tid=0x00007fdfb8116800 nid=0xa9a 
waiting on condition [0x00007fdfc512e000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000ebad5008> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at 
org.spark_project.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:389)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:531)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.access$700(QueuedThreadPool.java:47)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:590)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-45" #45 daemon prio=5 os_prio=0 tid=0x00007fdfb8115000 nid=0xa99 
waiting on condition [0x00007fdfc522f000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000ebad5008> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at 
org.spark_project.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:389)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:531)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.access$700(QueuedThreadPool.java:47)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:590)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-44" #44 daemon prio=5 os_prio=0 tid=0x00007fdfb8113000 nid=0xa98 
waiting on condition [0x00007fdfc5330000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000ebad5008> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at 
org.spark_project.jetty.util.BlockingArrayQueue.poll(BlockingArrayQueue.java:389)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.idleJobPoll(QueuedThreadPool.java:531)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.access$700(QueuedThreadPool.java:47)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:590)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-43-acceptor-0@410ad31d-ServerConnector@4fcd354c{HTTP/1.1}{172.31.20.156:8081}"
 #43 daemon prio=5 os_prio=0 tid=0x00007fdfb8111800 nid=0xa97 runnable 
[0x00007fdfc5431000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.ServerSocketChannelImpl.accept0(Native Method)
        at 
sun.nio.ch.ServerSocketChannelImpl.accept(ServerSocketChannelImpl.java:422)
        at 
sun.nio.ch.ServerSocketChannelImpl.accept(ServerSocketChannelImpl.java:250)
        - locked <0x00000000ebdaf3b0> (a java.lang.Object)
        at 
org.spark_project.jetty.server.ServerConnector.accept(ServerConnector.java:377)
        at 
org.spark_project.jetty.server.AbstractConnector$Acceptor.run(AbstractConnector.java:500)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-42-selector-ServerConnectorManager@19274cfc/3" #42 daemon prio=5 
os_prio=0 tid=0x00007fdfb8110000 nid=0xa96 runnable [0x00007fdfc5532000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked <0x00000000ebe19238> (a sun.nio.ch.Util$3)
        - locked <0x00000000ebe19228> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000ebe19110> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550)
        at 
org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-41-selector-ServerConnectorManager@19274cfc/2" #41 daemon prio=5 
os_prio=0 tid=0x00007fdfb810e000 nid=0xa95 runnable [0x00007fdfc5633000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked <0x00000000ebe08420> (a sun.nio.ch.Util$3)
        - locked <0x00000000ebe08410> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000ebe082f8> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550)
        at 
org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-40-selector-ServerConnectorManager@19274cfc/1" #40 daemon prio=5 
os_prio=0 tid=0x00007fdfb810d000 nid=0xa94 runnable [0x00007fdfc5734000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked <0x00000000ebdf7608> (a sun.nio.ch.Util$3)
        - locked <0x00000000ebdf75f8> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000ebdf74e0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550)
        at 
org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
        at java.lang.Thread.run(Thread.java:745)

"WorkerUI-39-selector-ServerConnectorManager@19274cfc/0" #39 daemon prio=5 
os_prio=0 tid=0x00007fdfb8104800 nid=0xa93 runnable [0x00007fdfc5835000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked <0x00000000ebddedf8> (a sun.nio.ch.Util$3)
        - locked <0x00000000ebddede8> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000ebddecd0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:101)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.select(SelectorManager.java:601)
        at 
org.spark_project.jetty.io.SelectorManager$ManagedSelector.run(SelectorManager.java:550)
        at 
org.spark_project.jetty.util.thread.NonBlockingThread.run(NonBlockingThread.java:52)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:635)
        at 
org.spark_project.jetty.util.thread.QueuedThreadPool$3.run(QueuedThreadPool.java:555)
        at java.lang.Thread.run(Thread.java:745)

"shuffle-server-0" #29 daemon prio=5 os_prio=0 tid=0x00007fe021198800 nid=0xa8f 
runnable [0x00007fdfc5d36000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:93)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:86)
        - locked <0x00000000c0025160> (a 
io.netty.channel.nio.SelectedSelectionKeySet)
        - locked <0x00000000c0025180> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000000c0025118> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:97)
        at io.netty.channel.nio.NioEventLoop.select(NioEventLoop.java:622)
        at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:310)
        at 
io.netty.util.concurrent.SingleThreadEventExecutor$2.run(SingleThreadEventExecutor.java:111)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-7" #20 daemon prio=5 os_prio=0 tid=0x00007fe020f9d000 
nid=0xa8b waiting on condition [0x00007fdfc624a000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-6" #19 daemon prio=5 os_prio=0 tid=0x00007fe020f9b000 
nid=0xa8a waiting on condition [0x00007fdfc63a1000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-5" #18 daemon prio=5 os_prio=0 tid=0x00007fe020f99800 
nid=0xa87 waiting on condition [0x00007fdfc64a2000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-4" #17 daemon prio=5 os_prio=0 tid=0x00007fe020f97800 
nid=0xa86 waiting on condition [0x00007fdfc65a3000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-3" #16 daemon prio=5 os_prio=0 tid=0x00007fe020f96000 
nid=0xa84 waiting on condition [0x00007fdfc66a4000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-2" #15 daemon prio=5 os_prio=0 tid=0x00007fe020f8b800 
nid=0xa81 waiting on condition [0x00007fdfc67a5000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-1" #14 daemon prio=5 os_prio=0 tid=0x00007fe020f8a800 
nid=0xa7f waiting on condition [0x00007fdfc68a6000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"dispatcher-event-loop-0" #13 daemon prio=5 os_prio=0 tid=0x00007fe020f89800 
nid=0xa7d waiting on condition [0x00007fdfc6da9000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c00156d0> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.await(AbstractQueuedSynchronizer.java:2039)
        at 
java.util.concurrent.LinkedBlockingQueue.take(LinkedBlockingQueue.java:442)
        at 
org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:207)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)

"Service Thread" #9 daemon prio=9 os_prio=0 tid=0x00007fe0200d2000 nid=0xa6d 
runnable [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C1 CompilerThread3" #8 daemon prio=9 os_prio=0 tid=0x00007fe0200c5000 
nid=0xa67 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread2" #7 daemon prio=9 os_prio=0 tid=0x00007fe0200c0800 
nid=0xa66 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread1" #6 daemon prio=9 os_prio=0 tid=0x00007fe0200be800 
nid=0xa65 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread0" #5 daemon prio=9 os_prio=0 tid=0x00007fe0200bb800 
nid=0xa64 waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Signal Dispatcher" #4 daemon prio=9 os_prio=0 tid=0x00007fe0200b9800 nid=0xa63 
waiting on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Finalizer" #3 daemon prio=8 os_prio=0 tid=0x00007fe020092800 nid=0xa61 in 
Object.wait() [0x00007fdfedcfb000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000000c001df40> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:143)
        - locked <0x00000000c001df40> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:164)
        at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:209)

"Reference Handler" #2 daemon prio=10 os_prio=0 tid=0x00007fe02008e000 
nid=0xa5f in Object.wait() [0x00007fdfeddfc000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000000c00351e8> (a java.lang.ref.Reference$Lock)
        at java.lang.Object.wait(Object.java:502)
        at java.lang.ref.Reference.tryHandlePending(Reference.java:191)
        - locked <0x00000000c00351e8> (a java.lang.ref.Reference$Lock)
        at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:153)

"main" #1 prio=5 os_prio=0 tid=0x00007fe020010800 nid=0xa4c waiting on 
condition [0x00007fe029d71000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000000c01c5b70> (a 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:215)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2078)
        at 
java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1465)
        at 
org.apache.spark.rpc.netty.Dispatcher.awaitTermination(Dispatcher.scala:180)
        at 
org.apache.spark.rpc.netty.NettyRpcEnv.awaitTermination(NettyRpcEnv.scala:273)
        at org.apache.spark.deploy.worker.Worker$.main(Worker.scala:696)
        at org.apache.spark.deploy.worker.Worker.main(Worker.scala)

"VM Thread" os_prio=0 tid=0x00007fe020086000 nid=0xa5e runnable 

"GC task thread#0 (ParallelGC)" os_prio=0 tid=0x00007fe020025800 nid=0xa4d 
runnable 

"GC task thread#1 (ParallelGC)" os_prio=0 tid=0x00007fe020027800 nid=0xa4f 
runnable 

"GC task thread#2 (ParallelGC)" os_prio=0 tid=0x00007fe020029000 nid=0xa51 
runnable 

"GC task thread#3 (ParallelGC)" os_prio=0 tid=0x00007fe02002b000 nid=0xa53 
runnable 

"GC task thread#4 (ParallelGC)" os_prio=0 tid=0x00007fe02002c800 nid=0xa55 
runnable 

"GC task thread#5 (ParallelGC)" os_prio=0 tid=0x00007fe02002e800 nid=0xa57 
runnable 

"GC task thread#6 (ParallelGC)" os_prio=0 tid=0x00007fe020030000 nid=0xa59 
runnable 

"GC task thread#7 (ParallelGC)" os_prio=0 tid=0x00007fe020032000 nid=0xa5b 
runnable 

"VM Periodic Task Thread" os_prio=0 tid=0x00007fe0200d4800 nid=0xa6e waiting on 
condition 

JNI global references: 265

Heap
 PSYoungGen      total 286720K, used 138020K [0x00000000eab00000, 
0x00000000feb00000, 0x0000000100000000)
  eden space 245760K, 56% used 
[0x00000000eab00000,0x00000000f31c90b0,0x00000000f9b00000)
  from space 40960K, 0% used 
[0x00000000f9b00000,0x00000000f9b00000,0x00000000fc300000)
  to   space 40960K, 0% used 
[0x00000000fc300000,0x00000000fc300000,0x00000000feb00000)
 ParOldGen       total 263168K, used 16900K [0x00000000c0000000, 
0x00000000d0100000, 0x00000000eab00000)
  object space 263168K, 6% used 
[0x00000000c0000000,0x00000000c10812e8,0x00000000d0100000)
 Metaspace       used 25730K, capacity 25990K, committed 26112K, reserved 
1071104K
  class space    used 3462K, capacity 3533K, committed 3584K, reserved 1048576K

{noformat}

> Executors hang with supposedly running task that are really finished.
> ---------------------------------------------------------------------
>
>                 Key: SPARK-19764
>                 URL: https://issues.apache.org/jira/browse/SPARK-19764
>             Project: Spark
>          Issue Type: Bug
>          Components: PySpark, Spark Core
>    Affects Versions: 2.0.2
>         Environment: Ubuntu 16.04 LTS
> OpenJDK Runtime Environment (build 1.8.0_121-8u121-b13-0ubuntu1.16.04.2-b13)
> Spark 2.0.2 - Spark Cluster Manager
>            Reporter: Ari Gesher
>         Attachments: driver-log-stderr.log, executor-2.log, netty-6153.jpg, 
> SPARK-19764.tgz
>
>
> We've come across a job that won't finish.  Running on a six-node cluster, 
> each of the executors ends up with 5-7 tasks that are never marked as 
> completed.
> Here's an excerpt from the web UI:
> ||Index  ▴||ID||Attempt||Status||Locality Level||Executor ID / Host||Launch Time||Duration||Scheduler Delay||Task Deserialization Time||GC Time||Result Serialization Time||Getting Result Time||Peak Execution Memory||Shuffle Read Size / Records||Errors||
> |105  | 1131  | 0     | SUCCESS       |PROCESS_LOCAL  |4 / 172.31.24.171 |    2017/02/27 22:51:36 |   1.9 min |       9 ms |  4 ms |  0.7 s | 2 ms|   6 ms|   384.1 MB|       90.3 MB / 572   | |
> |106| 1168|   0|      RUNNING |ANY|   2 / 172.31.16.112|      2017/02/27 22:53:25|    6.5 h   |0 ms|  0 ms|   1 s     |0 ms|  0 ms|   |384.1 MB       |98.7 MB / 624 | |
> However, the Executor reports the task as finished: 
> {noformat}
> 17/02/27 22:53:25 INFO Executor: Running task 106.0 in stage 5.0 (TID 1168)
> 17/02/27 22:55:29 INFO Executor: Finished task 106.0 in stage 5.0 (TID 1168). 2633558 bytes result sent via BlockManager)
> {noformat}
> As does the driver log:
> {noformat}
> 17/02/27 22:53:25 INFO Executor: Running task 106.0 in stage 5.0 (TID 1168)
> 17/02/27 22:55:29 INFO Executor: Finished task 106.0 in stage 5.0 (TID 1168). 2633558 bytes result sent via BlockManager)
> {noformat}
> The full log from this executor and the {{stderr}} from 
> {{app-20170227223614-0001/2/stderr}} are attached.



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to