[ https://issues.apache.org/jira/browse/SPARK-15905?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15328608#comment-15328608 ]
Tejas Patil commented on SPARK-15905: ------------------------------------- Another instance but this time not via console progress bar. This job has been stuck for 15+ hours. {noformat} "dispatcher-event-loop-23" #60 daemon prio=5 os_prio=0 tid=0x00007f981e206000 nid=0x685f8 runnable [0x00007f8c0f1ef000] java.lang.Thread.State: RUNNABLE at java.io.FileOutputStream.writeBytes(Native Method) at java.io.FileOutputStream.write(FileOutputStream.java:326) at java.io.BufferedOutputStream.write(BufferedOutputStream.java:122) - locked <0x00007f8d48167058> (a java.io.BufferedOutputStream) at java.io.PrintStream.write(PrintStream.java:480) - locked <0x00007f8d48167020> (a java.io.PrintStream) at sun.nio.cs.StreamEncoder.writeBytes(StreamEncoder.java:221) at sun.nio.cs.StreamEncoder.implFlushBuffer(StreamEncoder.java:291) at sun.nio.cs.StreamEncoder.implFlush(StreamEncoder.java:295) at sun.nio.cs.StreamEncoder.flush(StreamEncoder.java:141) - locked <0x00007f8d48237680> (a java.io.OutputStreamWriter) at java.io.OutputStreamWriter.flush(OutputStreamWriter.java:229) at org.apache.log4j.helpers.QuietWriter.flush(QuietWriter.java:59) at org.apache.log4j.WriterAppender.subAppend(WriterAppender.java:324) at org.apache.log4j.WriterAppender.append(WriterAppender.java:162) at org.apache.log4j.AppenderSkeleton.doAppend(AppenderSkeleton.java:251) - locked <0x00007f8d48235ee0> (a org.apache.log4j.ConsoleAppender) at org.apache.log4j.helpers.AppenderAttachableImpl.appendLoopOnAppenders(AppenderAttachableImpl.java:66) at org.apache.log4j.Category.callAppenders(Category.java:206) - locked <0x00007f8d481bf1e8> (a org.apache.log4j.spi.RootLogger) at org.apache.log4j.Category.forcedLog(Category.java:391) at org.apache.log4j.Category.log(Category.java:856) at org.slf4j.impl.Log4jLoggerAdapter.warn(Log4jLoggerAdapter.java:400) at org.apache.spark.Logging$class.logWarning(Logging.scala:70) at org.apache.spark.scheduler.TaskSetManager.logWarning(TaskSetManager.scala:52) at org.apache.spark.scheduler.TaskSetManager.handleFailedTask(TaskSetManager.scala:721) at org.apache.spark.scheduler.TaskSetManager$$anonfun$executorLost$6.apply(TaskSetManager.scala:813) at org.apache.spark.scheduler.TaskSetManager$$anonfun$executorLost$6.apply(TaskSetManager.scala:807) at scala.collection.TraversableLike$WithFilter$$anonfun$foreach$1.apply(TraversableLike.scala:772) at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98) at scala.collection.mutable.HashMap$$anonfun$foreach$1.apply(HashMap.scala:98) at scala.collection.mutable.HashTable$class.foreachEntry(HashTable.scala:226) at scala.collection.mutable.HashMap.foreachEntry(HashMap.scala:39) at scala.collection.mutable.HashMap.foreach(HashMap.scala:98) at scala.collection.TraversableLike$WithFilter.foreach(TraversableLike.scala:771) at org.apache.spark.scheduler.TaskSetManager.executorLost(TaskSetManager.scala:807) at org.apache.spark.scheduler.Pool$$anonfun$executorLost$1.apply(Pool.scala:87) at org.apache.spark.scheduler.Pool$$anonfun$executorLost$1.apply(Pool.scala:87) at scala.collection.Iterator$class.foreach(Iterator.scala:727) at scala.collection.AbstractIterator.foreach(Iterator.scala:1157) at scala.collection.IterableLike$class.foreach(IterableLike.scala:72) at scala.collection.AbstractIterable.foreach(Iterable.scala:54) at org.apache.spark.scheduler.Pool.executorLost(Pool.scala:87) at org.apache.spark.scheduler.TaskSchedulerImpl.removeExecutor(TaskSchedulerImpl.scala:536) at org.apache.spark.scheduler.TaskSchedulerImpl.executorLost(TaskSchedulerImpl.scala:474) - locked <0x00007f8d5850e1e0> (a org.apache.spark.scheduler.TaskSchedulerImpl) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint.removeExecutor(CoarseGrainedSchedulerBackend.scala:263) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint$$anonfun$onDisconnected$1.apply(CoarseGrainedSchedulerBackend.scala:202) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint$$anonfun$onDisconnected$1.apply(CoarseGrainedSchedulerBackend.scala:202) at scala.Option.foreach(Option.scala:236) at org.apache.spark.scheduler.cluster.CoarseGrainedSchedulerBackend$DriverEndpoint.onDisconnected(CoarseGrainedSchedulerBackend.scala:202) at org.apache.spark.rpc.netty.Inbox$$anonfun$process$1.apply$mcV$sp(Inbox.scala:142) at org.apache.spark.rpc.netty.Inbox.safelyCall(Inbox.scala:204) at org.apache.spark.rpc.netty.Inbox.process(Inbox.scala:100) at org.apache.spark.rpc.netty.Dispatcher$MessageLoop.run(Dispatcher.scala:215) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) {noformat} > Driver hung while writing to console progress bar > ------------------------------------------------- > > Key: SPARK-15905 > URL: https://issues.apache.org/jira/browse/SPARK-15905 > Project: Spark > Issue Type: Bug > Components: Spark Core > Affects Versions: 1.6.1 > Reporter: Tejas Patil > Priority: Minor > > This leads to driver being not able to get heartbeats from its executors and > job being stuck. After looking at the locking dependency amongst the driver > threads per the jstack, this is where the driver seems to be stuck. > {noformat} > "refresh progress" #113 daemon prio=5 os_prio=0 tid=0x00007f7986cbc800 > nid=0x7887d runnable [0x00007f6d3507a000] > java.lang.Thread.State: RUNNABLE > at java.io.FileOutputStream.writeBytes(Native Method) > at java.io.FileOutputStream.write(FileOutputStream.java:326) > at > java.io.BufferedOutputStream.flushBuffer(BufferedOutputStream.java:82) > at java.io.BufferedOutputStream.flush(BufferedOutputStream.java:140) > - locked <0x00007f6eb81dd290> (a java.io.BufferedOutputStream) > at java.io.PrintStream.write(PrintStream.java:482) > - locked <0x00007f6eb81dd258> (a java.io.PrintStream) > at sun.nio.cs.StreamEncoder.writeBytes(StreamEncoder.java:221) > at sun.nio.cs.StreamEncoder.implFlushBuffer(StreamEncoder.java:291) > at sun.nio.cs.StreamEncoder.flushBuffer(StreamEncoder.java:104) > - locked <0x00007f6eb81dd400> (a java.io.OutputStreamWriter) > at java.io.OutputStreamWriter.flushBuffer(OutputStreamWriter.java:185) > at java.io.PrintStream.write(PrintStream.java:527) > - locked <0x00007f6eb81dd258> (a java.io.PrintStream) > at java.io.PrintStream.print(PrintStream.java:669) > at > org.apache.spark.ui.ConsoleProgressBar.show(ConsoleProgressBar.scala:99) > at > org.apache.spark.ui.ConsoleProgressBar.org$apache$spark$ui$ConsoleProgressBar$$refresh(ConsoleProgressBar.scala:69) > - locked <0x00007f6ed33b48a0> (a > org.apache.spark.ui.ConsoleProgressBar) > at > org.apache.spark.ui.ConsoleProgressBar$$anon$1.run(ConsoleProgressBar.scala:53) > at java.util.TimerThread.mainLoop(Timer.java:555) > at java.util.TimerThread.run(Timer.java:505) > {noformat} -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org