[ https://issues.apache.org/jira/browse/SPARK-39612?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Hyukjin Kwon updated SPARK-39612: --------------------------------- Priority: Critical (was: Major) > The dataframe returned by exceptAll() can no longer perform operations such > as count() or isEmpty(), or an exception will be thrown. > ------------------------------------------------------------------------------------------------------------------------------------ > > Key: SPARK-39612 > URL: https://issues.apache.org/jira/browse/SPARK-39612 > Project: Spark > Issue Type: Bug > Components: PySpark > Affects Versions: 3.3.0 > Environment: OS: centos stream 8 > {code:java} > $ uname -a > Linux thomaszhu1.fyre.ibm.com 4.18.0-348.7.1.el8_5.x86_64 #1 SMP Wed Dec 22 > 13:25:12 UTC 2021 x86_64 x86_64 x86_64 GNU/Linux > $ python --version > Python 3.8.13 > $ pyspark --version > Welcome to > ____ __ > / __/__ ___ _____/ /__ > _\ \/ _ \/ _ `/ __/ '_/ > /___/ .__/\_,_/_/ /_/\_\ version 3.3.0 > /_/ > > Using Scala version 2.12.15, OpenJDK 64-Bit Server VM, 11.0.11 > Branch HEAD > Compiled by user ubuntu on 2022-06-09T19:58:58Z > Revision f74867bddfbcdd4d08076db36851e88b15e66556 > Url https://github.com/apache/spark > Type --help for more information. > $ java --version > openjdk 11.0.11 2021-04-20 > OpenJDK Runtime Environment AdoptOpenJDK-11.0.11+9 (build 11.0.11+9) > OpenJDK 64-Bit Server VM AdoptOpenJDK-11.0.11+9 (build 11.0.11+9, mixed mode) > {code} > > Reporter: Zhu JunYong > Priority: Critical > > As I said, the dataframe returned by `exceptAll()` can no longer perform > operations such as `count()` or `isEmpty()`, or an exception will be thrown. 
> > > {code:java} > >>> d1 = spark.createDataFrame([("a")], 'STRING') > >>> d1.show() > +-----+ > |value| > +-----+ > | a| > +-----+ > >>> d2 = d1.exceptAll(d1) > >>> d2.show() > +-----+ > |value| > +-----+ > +-----+ > >>> d2.count() > 22/06/27 11:22:15 ERROR Executor: Exception in task 0.0 in stage 113.0 (TID > 525) > java.lang.IllegalStateException: Couldn't find value#465 in [sum#494L] > at > org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:80) > at > org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:73) > at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584) > at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$3(TreeNode.scala:589) > at scala.collection.immutable.List.map(List.scala:297) > at > org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:698) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:589) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:528) > at > org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:73) > at > org.apache.spark.sql.execution.GenerateExec.boundGenerator$lzycompute(GenerateExec.scala:75) > at > org.apache.spark.sql.execution.GenerateExec.boundGenerator(GenerateExec.scala:75) > at > org.apache.spark.sql.execution.GenerateExec.$anonfun$doExecute$10(GenerateExec.scala:114) > at > org.apache.spark.sql.execution.LazyIterator.results$lzycompute(GenerateExec.scala:36) > at > 
org.apache.spark.sql.execution.LazyIterator.results(GenerateExec.scala:36) > at > org.apache.spark.sql.execution.LazyIterator.hasNext(GenerateExec.scala:37) > at scala.collection.Iterator$ConcatIterator.advance(Iterator.scala:199) > at scala.collection.Iterator$ConcatIterator.hasNext(Iterator.scala:227) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.hashAgg_doAggregateWithoutKey_0$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) > at > org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) > at > org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52) > at org.apache.spark.scheduler.Task.run(Task.scala:136) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:829) > 22/06/27 11:22:15 ERROR TaskSetManager: Task 0 in stage 113.0 failed 1 times; > aborting job > Traceback (most recent call last): > File "<stdin>", line 1, in <module> > 
File > "/opt/downloads/spark-3.3.0-bin-hadoop3/python/pyspark/sql/dataframe.py", > line 804, in count > return int(self._jdf.count()) > File "/root/miniconda3/lib/python3.8/site-packages/py4j/java_gateway.py", > line 1304, in __call__ > return_value = get_return_value( > File "/opt/downloads/spark-3.3.0-bin-hadoop3/python/pyspark/sql/utils.py", > line 190, in deco > return f(*a, **kw) > File "/root/miniconda3/lib/python3.8/site-packages/py4j/protocol.py", line > 326, in get_return_value > raise Py4JJavaError( > py4j.protocol.Py4JJavaError: An error occurred while calling o253.count. > : org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 > in stage 113.0 failed 1 times, most recent failure: Lost task 0.0 in stage > 113.0 (TID 525) (thomaszhu1.fyre.ibm.com executor driver): > java.lang.IllegalStateException: Couldn't find value#465 in [sum#494L] > at > org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:80) > at > org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:73) > at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584) > at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$3(TreeNode.scala:589) > at scala.collection.immutable.List.map(List.scala:297) > at > org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:698) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:589) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:528) > at > 
org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:73) > at > org.apache.spark.sql.execution.GenerateExec.boundGenerator$lzycompute(GenerateExec.scala:75) > at > org.apache.spark.sql.execution.GenerateExec.boundGenerator(GenerateExec.scala:75) > at > org.apache.spark.sql.execution.GenerateExec.$anonfun$doExecute$10(GenerateExec.scala:114) > at > org.apache.spark.sql.execution.LazyIterator.results$lzycompute(GenerateExec.scala:36) > at > org.apache.spark.sql.execution.LazyIterator.results(GenerateExec.scala:36) > at > org.apache.spark.sql.execution.LazyIterator.hasNext(GenerateExec.scala:37) > at scala.collection.Iterator$ConcatIterator.advance(Iterator.scala:199) > at scala.collection.Iterator$ConcatIterator.hasNext(Iterator.scala:227) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.hashAgg_doAggregateWithoutKey_0$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.processNext(Unknown > Source) > at > org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) > at > org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) > at > org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52) > at org.apache.spark.scheduler.Task.run(Task.scala:136) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504) > at 
org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:829) > Driver stacktrace: > at > org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607) > at > scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) > at > scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) > at > org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182) > at > org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182) > at scala.Option.foreach(Option.scala:407) > at > org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802) > at > org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791) > at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49) > Caused by: java.lang.IllegalStateException: Couldn't find value#465 in > [sum#494L] > at > org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:80) > at > org.apache.spark.sql.catalyst.expressions.BindReferences$$anonfun$bindReference$1.applyOrElse(BoundAttribute.scala:73) 
> at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:584) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:176) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:584) > at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$3(TreeNode.scala:589) > at scala.collection.immutable.List.map(List.scala:297) > at > org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:698) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:589) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:560) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:528) > at > org.apache.spark.sql.catalyst.expressions.BindReferences$.bindReference(BoundAttribute.scala:73) > at > org.apache.spark.sql.execution.GenerateExec.boundGenerator$lzycompute(GenerateExec.scala:75) > at > org.apache.spark.sql.execution.GenerateExec.boundGenerator(GenerateExec.scala:75) > at > org.apache.spark.sql.execution.GenerateExec.$anonfun$doExecute$10(GenerateExec.scala:114) > at > org.apache.spark.sql.execution.LazyIterator.results$lzycompute(GenerateExec.scala:36) > at > org.apache.spark.sql.execution.LazyIterator.results(GenerateExec.scala:36) > at > org.apache.spark.sql.execution.LazyIterator.hasNext(GenerateExec.scala:37) > at scala.collection.Iterator$ConcatIterator.advance(Iterator.scala:199) > at scala.collection.Iterator$ConcatIterator.hasNext(Iterator.scala:227) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.hashAgg_doAggregateWithoutKey_0$(Unknown > Source) > at > org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage5.processNext(Unknown > Source) > at > 
org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43) > at > org.apache.spark.sql.execution.WholeStageCodegenExec$$anon$1.hasNext(WholeStageCodegenExec.scala:760) > at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460) > at > org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:140) > at > org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99) > at > org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52) > at org.apache.spark.scheduler.Task.run(Task.scala:136) > at > org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548) > at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504) > at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551) > at > java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) > at > java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) > at java.base/java.lang.Thread.run(Thread.java:829) > {code} > > -- This message was sent by Atlassian Jira (v8.20.10#820010) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org