[ 
https://issues.apache.org/jira/browse/SPARK-3687?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14156236#comment-14156236
 ] 

Ziv Huang commented on SPARK-3687:
----------------------------------

The following is the jstack dump of one CoarseGrainedExecutorBackend when the 
job hangs (the spark version is 1.1.0):

"Attach Listener" daemon prio=10 tid=0x00007fded0001000 nid=0x7836 waiting on 
condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Hashed wheel timer #1" daemon prio=10 tid=0x00007fde9c001000 nid=0x7811 
waiting on condition [0x00007fdf26a84000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at 
org.jboss.netty.util.HashedWheelTimer$Worker.waitForNextTick(HashedWheelTimer.java:503)
        at 
org.jboss.netty.util.HashedWheelTimer$Worker.run(HashedWheelTimer.java:401)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at java.lang.Thread.run(Thread.java:745)

"New I/O server boss #6" daemon prio=10 tid=0x00007fdeb4084000 nid=0x7810 
runnable [0x00007fdf26b85000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000007db53acc0> (a sun.nio.ch.Util$2)
        - locked <0x00000007db53acb0> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000007db53ab98> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:102)
        at 
org.jboss.netty.channel.socket.nio.NioServerBoss.select(NioServerBoss.java:163)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.NioServerBoss.run(NioServerBoss.java:42)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #5" daemon prio=10 tid=0x00007fdeb4037000 nid=0x780f runnable 
[0x00007fdf26c86000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000007db529f98> (a sun.nio.ch.Util$2)
        - locked <0x00000007db529f88> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000007db529e70> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #4" daemon prio=10 tid=0x00007fdeb4032800 nid=0x780e runnable 
[0x00007fdf26d87000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000007db528610> (a sun.nio.ch.Util$2)
        - locked <0x00000007db528600> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000007db5284e8> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O boss #3" daemon prio=10 tid=0x00007fdeb4035000 nid=0x780d runnable 
[0x00007fdf26e88000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000007db4f0d20> (a sun.nio.ch.Util$2)
        - locked <0x00000007db4f0d10> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000007db4f0bf8> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.NioClientBoss.run(NioClientBoss.java:42)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #2" daemon prio=10 tid=0x00007fdeb402c000 nid=0x780c runnable 
[0x00007fdf26f88000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000007db4334b8> (a sun.nio.ch.Util$2)
        - locked <0x00000007db4334a8> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000007db433390> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"New I/O worker #1" daemon prio=10 tid=0x00007fdeb4024800 nid=0x780b runnable 
[0x00007fdf27089000]
   java.lang.Thread.State: RUNNABLE
        at sun.nio.ch.EPollArrayWrapper.epollWait(Native Method)
        at sun.nio.ch.EPollArrayWrapper.poll(EPollArrayWrapper.java:269)
        at sun.nio.ch.EPollSelectorImpl.doSelect(EPollSelectorImpl.java:79)
        at sun.nio.ch.SelectorImpl.lockAndDoSelect(SelectorImpl.java:87)
        - locked <0x00000007db3fdb18> (a sun.nio.ch.Util$2)
        - locked <0x00000007db3fda98> (a java.util.Collections$UnmodifiableSet)
        - locked <0x00000007db3fd8a0> (a sun.nio.ch.EPollSelectorImpl)
        at sun.nio.ch.SelectorImpl.select(SelectorImpl.java:98)
        at 
org.jboss.netty.channel.socket.nio.SelectorUtil.select(SelectorUtil.java:64)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.select(AbstractNioSelector.java:409)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioSelector.run(AbstractNioSelector.java:206)
        at 
org.jboss.netty.channel.socket.nio.AbstractNioWorker.run(AbstractNioWorker.java:90)
        at org.jboss.netty.channel.socket.nio.NioWorker.run(NioWorker.java:178)
        at 
org.jboss.netty.util.ThreadRenamingRunnable.run(ThreadRenamingRunnable.java:108)
        at 
org.jboss.netty.util.internal.DeadLockProofWorker$1.run(DeadLockProofWorker.java:42)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"driverPropsFetcher-akka.actor.default-dispatcher-6" daemon prio=10 
tid=0x00007fdeb0017800 nid=0x780a waiting on condition [0x00007fdf2718b000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007d9fd8e78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"driverPropsFetcher-akka.actor.default-dispatcher-5" daemon prio=10 
tid=0x00007fdeb4004000 nid=0x7809 waiting on condition [0x00007fdf2728c000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007d9fd8e78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"driverPropsFetcher-akka.actor.default-dispatcher-4" daemon prio=10 
tid=0x00007fdeb0015800 nid=0x7808 waiting on condition [0x00007fdf2738d000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007d9fd8e78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"driverPropsFetcher-akka.actor.default-dispatcher-3" daemon prio=10 
tid=0x00007fdf30568800 nid=0x7807 waiting on condition [0x00007fdf2748e000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007d9fd8e78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at 
scala.concurrent.forkjoin.ForkJoinPool.idleAwaitWork(ForkJoinPool.java:2135)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2067)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"driverPropsFetcher-akka.actor.default-dispatcher-2" daemon prio=10 
tid=0x00007fdf30562000 nid=0x7806 waiting on condition [0x00007fdf2758f000]
   java.lang.Thread.State: WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007d9fd8e78> (a 
akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinPool)
        at scala.concurrent.forkjoin.ForkJoinPool.scan(ForkJoinPool.java:2075)
        at 
scala.concurrent.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
        at 
scala.concurrent.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)

"driverPropsFetcher-scheduler-1" daemon prio=10 tid=0x00007fdf304ed800 
nid=0x7805 waiting on condition [0x00007fdf27891000]
   java.lang.Thread.State: TIMED_WAITING (sleeping)
        at java.lang.Thread.sleep(Native Method)
        at akka.actor.LightArrayRevolverScheduler.waitNanos(Scheduler.scala:226)
        at 
akka.actor.LightArrayRevolverScheduler$$anon$12.nextTick(Scheduler.scala:393)
        at 
akka.actor.LightArrayRevolverScheduler$$anon$12.run(Scheduler.scala:363)
        at java.lang.Thread.run(Thread.java:745)

"process reaper" daemon prio=10 tid=0x00007fdf303f6800 nid=0x7803 waiting on 
condition [0x00007fdf2c0f9000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007d75ff268> (a 
java.util.concurrent.SynchronousQueue$TransferStack)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:226)
        at 
java.util.concurrent.SynchronousQueue$TransferStack.awaitFulfill(SynchronousQueue.java:460)
        at 
java.util.concurrent.SynchronousQueue$TransferStack.transfer(SynchronousQueue.java:359)
        at java.util.concurrent.SynchronousQueue.poll(SynchronousQueue.java:942)
        at 
java.util.concurrent.ThreadPoolExecutor.getTask(ThreadPoolExecutor.java:1068)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1130)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
        at java.lang.Thread.run(Thread.java:745)

"Service Thread" daemon prio=10 tid=0x00007fdf300a6000 nid=0x7800 runnable 
[0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread1" daemon prio=10 tid=0x00007fdf300a3800 nid=0x77ff waiting 
on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"C2 CompilerThread0" daemon prio=10 tid=0x00007fdf300a0800 nid=0x77fe waiting 
on condition [0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Signal Dispatcher" daemon prio=10 tid=0x00007fdf3009f000 nid=0x77fd runnable 
[0x0000000000000000]
   java.lang.Thread.State: RUNNABLE

"Finalizer" daemon prio=10 tid=0x00007fdf30077800 nid=0x77fc in Object.wait() 
[0x00007fdf2cf2c000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000007d5505630> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:135)
        - locked <0x00000007d5505630> (a java.lang.ref.ReferenceQueue$Lock)
        at java.lang.ref.ReferenceQueue.remove(ReferenceQueue.java:151)
        at java.lang.ref.Finalizer$FinalizerThread.run(Finalizer.java:209)

"Reference Handler" daemon prio=10 tid=0x00007fdf30075800 nid=0x77fb in 
Object.wait() [0x00007fdf2d02d000]
   java.lang.Thread.State: WAITING (on object monitor)
        at java.lang.Object.wait(Native Method)
        - waiting on <0x00000007d55051b8> (a java.lang.ref.Reference$Lock)
        at java.lang.Object.wait(Object.java:503)
        at java.lang.ref.Reference$ReferenceHandler.run(Reference.java:133)
        - locked <0x00000007d55051b8> (a java.lang.ref.Reference$Lock)

"main" prio=10 tid=0x00007fdf30009000 nid=0x77f1 waiting on condition 
[0x00007fdf37e67000]
   java.lang.Thread.State: TIMED_WAITING (parking)
        at sun.misc.Unsafe.park(Native Method)
        - parking to wait for  <0x00000007da431b50> (a 
scala.concurrent.impl.Promise$CompletionLatch)
        at 
java.util.concurrent.locks.LockSupport.parkNanos(LockSupport.java:226)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireSharedNanos(AbstractQueuedSynchronizer.java:1033)
        at 
java.util.concurrent.locks.AbstractQueuedSynchronizer.tryAcquireSharedNanos(AbstractQueuedSynchronizer.java:1326)
        at 
scala.concurrent.impl.Promise$DefaultPromise.tryAwait(Promise.scala:208)
        at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:218)
        at 
scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
        at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107)
        at 
scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
        at scala.concurrent.Await$.result(package.scala:107)
        at 
org.apache.spark.executor.CoarseGrainedExecutorBackend$$anonfun$run$1.apply$mcV$sp(CoarseGrainedExecutorBackend.scala:125)
        at 
org.apache.spark.deploy.SparkHadoopUtil$$anon$1.run(SparkHadoopUtil.scala:53)
        at 
org.apache.spark.deploy.SparkHadoopUtil$$anon$1.run(SparkHadoopUtil.scala:52)
        at java.security.AccessController.doPrivileged(Native Method)
        at javax.security.auth.Subject.doAs(Subject.java:415)
        at 
org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1548)
        at 
org.apache.spark.deploy.SparkHadoopUtil.runAsSparkUser(SparkHadoopUtil.scala:52)
        at 
org.apache.spark.executor.CoarseGrainedExecutorBackend$.run(CoarseGrainedExecutorBackend.scala:113)
        at 
org.apache.spark.executor.CoarseGrainedExecutorBackend$.main(CoarseGrainedExecutorBackend.scala:156)
        at 
org.apache.spark.executor.CoarseGrainedExecutorBackend.main(CoarseGrainedExecutorBackend.scala)

"VM Thread" prio=10 tid=0x00007fdf30071000 nid=0x77fa runnable 

"GC task thread#0 (ParallelGC)" prio=10 tid=0x00007fdf3001e800 nid=0x77f2 
runnable 

"GC task thread#1 (ParallelGC)" prio=10 tid=0x00007fdf30020800 nid=0x77f3 
runnable 

"GC task thread#2 (ParallelGC)" prio=10 tid=0x00007fdf30022800 nid=0x77f4 
runnable 

"GC task thread#3 (ParallelGC)" prio=10 tid=0x00007fdf30024000 nid=0x77f5 
runnable 

"GC task thread#4 (ParallelGC)" prio=10 tid=0x00007fdf30026000 nid=0x77f6 
runnable 

"GC task thread#5 (ParallelGC)" prio=10 tid=0x00007fdf30028000 nid=0x77f7 
runnable 

"GC task thread#6 (ParallelGC)" prio=10 tid=0x00007fdf30029800 nid=0x77f8 
runnable 

"GC task thread#7 (ParallelGC)" prio=10 tid=0x00007fdf3002b800 nid=0x77f9 
runnable 

"VM Periodic Task Thread" prio=10 tid=0x00007fdf300b0800 nid=0x7801 waiting on 
condition 

JNI global references: 253


> Spark hang while processing more than 100 sequence files
> --------------------------------------------------------
>
>                 Key: SPARK-3687
>                 URL: https://issues.apache.org/jira/browse/SPARK-3687
>             Project: Spark
>          Issue Type: Bug
>          Components: Spark Core
>    Affects Versions: 1.0.2, 1.1.0
>            Reporter: Ziv Huang
>
> In my application, I read more than 100 sequence files to a JavaPairRDD, 
> perform flatmap to get another JavaRDD, and then use takeOrdered to get the 
> result.
> It is quite often (but not always) that the spark hangs while the executing 
> some of 120th-150th tasks.
> In 1.0.2, the job can hang for several hours, maybe forever (I can't wait for 
> its completion).
> When the spark job hangs,  I can't kill the job from web UI.
> In 1.1.0, the job hangs for couple mins (3.x mins actually),
> and then web UI of spark master shows that the job is finished with state 
> "FAILED".
> In addition, the job stage web UI still hangs, and execution duration time is 
> still accumulating.
> For both 1.0.2 and 1.1.0, the job hangs with no error messages in anywhere.
> The current workaround is to use coalesce to reduce the number of partitions 
> to be processed.
> I never get a job hanged if the number of partitions to be processed is no 
> greater than 100.



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org

Reply via email to