[ https://issues.apache.org/jira/browse/SPARK-24687?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16725892#comment-16725892 ]
ASF GitHub Bot commented on SPARK-24687: ---------------------------------------- srowen closed pull request #21664: [SPARK-24687][CORE] Avoid job hanging when generate task binary causes fatal error URL: https://github.com/apache/spark/pull/21664 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala index f74425d73b392..6ee15fde5c71f 100644 --- a/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala +++ b/core/src/main/scala/org/apache/spark/scheduler/DAGScheduler.scala @@ -1045,9 +1045,11 @@ class DAGScheduler( // Abort execution return - case NonFatal(e) => + case e: Throwable => abortStage(stage, s"Task serialization failed: $e\n${Utils.exceptionString(e)}", Some(e)) runningStages -= stage + + // Abort execution return } ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > When NoClassDefError thrown during task serialization will cause job hang > ------------------------------------------------------------------------- > > Key: SPARK-24687 > URL: https://issues.apache.org/jira/browse/SPARK-24687 > Project: Spark > Issue Type: Bug > Components: Spark Core > Affects Versions: 2.1.0, 2.1.1 > Reporter: zhoukang > Assignee: zhoukang > Priority: Major > Fix For: 2.3.3, 2.4.1, 3.0.0 > > Attachments: hanging-960.png > > > When below exception thrown: > {code:java} > Exception in thread "dag-scheduler-event-loop" > java.lang.NoClassDefFoundError: > Lcom/xxx/data/recommend/aggregator/queue/QueueName; > at java.lang.Class.getDeclaredFields0(Native Method) > at java.lang.Class.privateGetDeclaredFields(Class.java:2436) > at java.lang.Class.getDeclaredField(Class.java:1946) > at > java.io.ObjectStreamClass.getDeclaredSUID(ObjectStreamClass.java:1659) > at java.io.ObjectStreamClass.access$700(ObjectStreamClass.java:72) > at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:480) > at java.io.ObjectStreamClass$2.run(ObjectStreamClass.java:468) > at java.security.AccessController.doPrivileged(Native Method) > at java.io.ObjectStreamClass.<init>(ObjectStreamClass.java:468) > at java.io.ObjectStreamClass.lookup(ObjectStreamClass.java:365) > at java.io.ObjectOutputStream.writeClass(ObjectOutputStream.java:1212) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1119) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > at java.io.ObjectOutputStream.writeArray(ObjectOutputStream.java:1377) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1173) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > at java.io.ObjectOutputStream.writeArray(ObjectOutputStream.java:1377) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1173) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > at > java.io.ObjectOutputStream.defaultWriteFields(ObjectOutputStream.java:1547) > at > java.io.ObjectOutputStream.writeSerialData(ObjectOutputStream.java:1508) > at > java.io.ObjectOutputStream.writeOrdinaryObject(ObjectOutputStream.java:1431) > at java.io.ObjectOutputStream.writeObject0(ObjectOutputStream.java:1177) > {code} > Stage will always hang.Not abort. > !hanging-960.png! > It is because NoClassDefError will no be catch up below. > {code} > var taskBinary: Broadcast[Array[Byte]] = null > try { > // For ShuffleMapTask, serialize and broadcast (rdd, shuffleDep). > // For ResultTask, serialize and broadcast (rdd, func). > val taskBinaryBytes: Array[Byte] = stage match { > case stage: ShuffleMapStage => > JavaUtils.bufferToArray( > closureSerializer.serialize((stage.rdd, stage.shuffleDep): > AnyRef)) > case stage: ResultStage => > JavaUtils.bufferToArray(closureSerializer.serialize((stage.rdd, > stage.func): AnyRef)) > } > taskBinary = sc.broadcast(taskBinaryBytes) > } catch { > // In the case of a failure during serialization, abort the stage. > case e: NotSerializableException => > abortStage(stage, "Task not serializable: " + e.toString, Some(e)) > runningStages -= stage > // Abort execution > return > case NonFatal(e) => > abortStage(stage, s"Task serialization failed: > $e\n${Utils.exceptionString(e)}", Some(e)) > runningStages -= stage > return > } > {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org