[ https://issues.apache.org/jira/browse/SPARK-44478 ]
Dongjoon Hyun deleted comment on SPARK-44478: --------------------------------------- was (Author: dongjoon): Ya, it's difficult to reproduce. BTW, could you try 3.4.2, [~juhai], because SPARK-44630 is in Apache Spark 3.4.2 only. We didn't revert SPARK-43043 in Apache Spark 3.5.x. > Executor decommission causes stage failure > ------------------------------------------ > > Key: SPARK-44478 > URL: https://issues.apache.org/jira/browse/SPARK-44478 > Project: Spark > Issue Type: Bug > Components: Scheduler > Affects Versions: 3.4.0, 3.4.1 > Reporter: Dale Huettenmoser > Priority: Minor > > During spark execution, save fails due to executor decommissioning. Issue not > present in 3.3.0 > Sample error: > > {code:java} > An error occurred while calling o8948.save. > : org.apache.spark.SparkException: Job aborted due to stage failure: > Authorized committer (attemptNumber=0, stage=170, partition=233) failed; but > task commit success, data duplication may happen. > reason=ExecutorLostFailure(1,false,Some(Executor decommission: Executor 1 is > decommissioned.)) > at > org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2785) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2721) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2720) > at > scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) > at > scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2720) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleStageFailed$1(DAGScheduler.scala:1199) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleStageFailed$1$adapted(DAGScheduler.scala:1199) > at scala.Option.foreach(Option.scala:407) > at > org.apache.spark.scheduler.DAGScheduler.handleStageFailed(DAGScheduler.scala:1199) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2981) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2923) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2912) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) > at > org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:971) > at org.apache.spark.SparkContext.runJob(SparkContext.scala:2263) > at > org.apache.spark.sql.execution.datasources.FileFormatWriter$.$anonfun$executeWrite$4(FileFormatWriter.scala:307) > at > org.apache.spark.sql.execution.datasources.FileFormatWriter$.writeAndCommit(FileFormatWriter.scala:271) > at > org.apache.spark.sql.execution.datasources.FileFormatWriter$.executeWrite(FileFormatWriter.scala:304) > at > org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:190) > at > org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:190) > at > org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:113) > at > org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:111) > at > org.apache.spark.sql.execution.command.DataWritingCommandExec.executeCollect(commands.scala:125) > at > org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.$anonfun$applyOrElse$1(QueryExecution.scala:98) > at > org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:118) > at > org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195) > at > org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103) > at > org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827) > at > org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65) > at > org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98) > at > org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94) > at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31) > at > org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267) > at > org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31) > at > org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488) > at > org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94) > at > org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81) > at > org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79) > at > org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133) > at > org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856) > at > org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387) > at > org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360) > at > org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239) > at jdk.internal.reflect.GeneratedMethodAccessor497.invoke(Unknown > Source) > at > java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(Unknown > Source) > at java.base/java.lang.reflect.Method.invoke(Unknown Source) > at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) > at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374) > at py4j.Gateway.invoke(Gateway.java:282) > at > py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) > at py4j.commands.CallCommand.execute(CallCommand.java:79) > at > py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182) > at py4j.ClientServerConnection.run(ClientServerConnection.java:106) > at java.base/java.lang.Thread.run(Unknown Source) > {code} > > > This occurred while running our production k8s spark jobs (spark 3.3.0) in a > duplicate test environment, with the only change being the image used was > spark 3.4.0 and 3.4.1, and the only changes in jar versions were the > requisite dependencies. > Current workaround is to retry the job, but this can cause substantial > slowdowns if it occurs during a longer job. Possibly related to > https://issues.apache.org/jira/browse/SPARK-44389 ? -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org