David Lindelöf created SPARK-32280: --------------------------------------
Summary: AnalysisException thrown when query contains several JOINs Key: SPARK-32280 URL: https://issues.apache.org/jira/browse/SPARK-32280 Project: Spark Issue Type: Bug Components: PySpark Affects Versions: 2.4.5 Reporter: David Lindelöf I've come across a curious {{AnalysisException}} thrown in one of my SQL queries, even though the SQL appears legitimate. I was able to reduce it to this example: {code:python} from pyspark.sql import SparkSession spark = SparkSession.builder.getOrCreate() spark.sql('SELECT 1 AS id').createOrReplaceTempView('A') spark.sql(''' SELECT id, 'foo' AS kind FROM A''').createOrReplaceTempView('B') spark.sql(''' SELECT l.id FROM B AS l JOIN B AS r ON l.kind = r.kind''').createOrReplaceTempView('C') spark.sql(''' SELECT 0 FROM ( SELECT * FROM B JOIN C USING (id)) JOIN ( SELECT * FROM B JOIN C USING (id)) USING (id)''') {code} Running this yields the following error: {code} py4j.protocol.Py4JJavaError: An error occurred while calling o20.sql. : org.apache.spark.sql.AnalysisException: Resolved attribute(s) kind#11 missing from id#10,kind#2,id#7,kind#5 in operator !Join Inner, (kind#11 = kind#5). Attribute(s) with the same name appear in the operation: kind. 
Please check if the right attribute(s) are used.;; Project [0 AS 0#15] +- Project [id#0, kind#2, kind#11] +- Join Inner, (id#0 = id#14) :- SubqueryAlias `__auto_generated_subquery_name` : +- Project [id#0, kind#2] : +- Project [id#0, kind#2] : +- Join Inner, (id#0 = id#9) : :- SubqueryAlias `b` : : +- Project [id#0, foo AS kind#2] : : +- SubqueryAlias `a` : : +- Project [1 AS id#0] : : +- OneRowRelation : +- SubqueryAlias `c` : +- Project [id#9] : +- Join Inner, (kind#2 = kind#5) : :- SubqueryAlias `l` : : +- SubqueryAlias `b` : : +- Project [id#9, foo AS kind#2] : : +- SubqueryAlias `a` : : +- Project [1 AS id#9] : : +- OneRowRelation : +- SubqueryAlias `r` : +- SubqueryAlias `b` : +- Project [id#7, foo AS kind#5] : +- SubqueryAlias `a` : +- Project [1 AS id#7] : +- OneRowRelation +- SubqueryAlias `__auto_generated_subquery_name` +- Project [id#14, kind#11] +- Project [id#14, kind#11] +- Join Inner, (id#14 = id#10) :- SubqueryAlias `b` : +- Project [id#14, foo AS kind#11] : +- SubqueryAlias `a` : +- Project [1 AS id#14] : +- OneRowRelation +- SubqueryAlias `c` +- Project [id#10] +- !Join Inner, (kind#11 = kind#5) :- SubqueryAlias `l` : +- SubqueryAlias `b` : +- Project [id#10, foo AS kind#2] : +- SubqueryAlias `a` : +- Project [1 AS id#10] : +- OneRowRelation +- SubqueryAlias `r` +- SubqueryAlias `b` +- Project [id#7, foo AS kind#5] +- SubqueryAlias `a` +- Project [1 AS id#7] +- OneRowRelation at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.failAnalysis(CheckAnalysis.scala:43) at org.apache.spark.sql.catalyst.analysis.Analyzer.failAnalysis(Analyzer.scala:95) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:369) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$checkAnalysis$1.apply(CheckAnalysis.scala:86) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:126) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at 
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at 
org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$foreachUp$1.apply(TreeNode.scala:125) at scala.collection.immutable.List.foreach(List.scala:392) at org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:125) at org.apache.spark.sql.catalyst.analysis.CheckAnalysis$class.checkAnalysis(CheckAnalysis.scala:86) at org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:95) at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:108) at org.apache.spark.sql.catalyst.analysis.Analyzer$$anonfun$executeAndCheck$1.apply(Analyzer.scala:105) at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:201) at org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:105) at org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:58) at org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:56) at org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:48) at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:78) at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:642) at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) 
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) at java.lang.reflect.Method.invoke(Method.java:498) at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357) at py4j.Gateway.invoke(Gateway.java:282) at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) at py4j.commands.CallCommand.execute(CallCommand.java:79) at py4j.GatewayConnection.run(GatewayConnection.java:238) at java.lang.Thread.run(Thread.java:748) {code} -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@spark.apache.org For additional commands, e-mail: issues-help@spark.apache.org