peter-toth commented on code in PR #45343: URL: https://github.com/apache/spark/pull/45343#discussion_r1520078411
########## sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala: ########## @@ -498,4 +559,70 @@ class DataFrameSelfJoinSuite extends QueryTest with SharedSparkSession { assert(df1.join(df2, $"t1.i" === $"t2.i").cache().count() == 1) } } + + test("SPARK_47217: deduplication of project causes ambiguity in resolution") { + val df = Seq((1, 2)).toDF("a", "b") + val df2 = df.select(df("a").as("aa"), df("b").as("bb")) + val df3 = df2.join(df, df2("bb") === df("b")).select(df2("aa"), df("a")) + checkAnswer( + df3, + Row(1, 1) :: Nil) + } + + test("SPARK-47217. deduplication in nested joins with join attribute aliased") { + val df1 = Seq((1, 2)).toDF("a", "b") + val df2 = Seq((1, 2)).toDF("aa", "bb") + val df1Joindf2 = df1.join(df2, df1("a") === df2("aa")).select(df1("a").as("aaa"), + df2("aa"), df1("b")) + + assertCorrectResolution(df1Joindf2.join(df1, df1Joindf2("aaa") === df1("a")), + Resolution.LeftConditionToLeftLeg, Resolution.RightConditionToRightLeg) + + assertCorrectResolution(df1.join(df1Joindf2, df1Joindf2("aaa") === df1("a")), + Resolution.LeftConditionToRightLeg, Resolution.RightConditionToLeftLeg) + + val proj1 = df1Joindf2.join(df1, df1Joindf2("aaa") === df1("a")).select(df1Joindf2("aa"), + df1("a")).queryExecution.analyzed.asInstanceOf[Project] + val join1 = proj1.child.asInstanceOf[Join] + assert(proj1.projectList(0).references.subsetOf(join1.left.outputSet)) + assert(proj1.projectList(1).references.subsetOf(join1.right.outputSet)) + + val proj2 = df1.join(df1Joindf2, df1Joindf2("aaa") === df1("a")).select(df1Joindf2("aa"), + df1("a")).queryExecution.analyzed.asInstanceOf[Project] + val join2 = proj2.child.asInstanceOf[Join] + assert(proj2.projectList(0).references.subsetOf(join2.right.outputSet)) + assert(proj2.projectList(1).references.subsetOf(join2.left.outputSet)) + } + + test("SPARK-47217. 
deduplication in nested joins without join attribute aliased") { + val df1 = Seq((1, 2)).toDF("a", "b") + val df2 = Seq((1, 2)).toDF("aa", "bb") + val df1Joindf2 = df1.join(df2, df1("a") === df2("aa")).select(df1("a"), df2("aa"), df1("b")) + + assertCorrectResolution(df1Joindf2.join(df1, df1Joindf2("a") === df1("a")), + Resolution.LeftConditionToLeftLeg, Resolution.RightConditionToRightLeg) + + assertCorrectResolution(df1.join(df1Joindf2, df1Joindf2("a") === df1("a")), + Resolution.LeftConditionToRightLeg, Resolution.RightConditionToLeftLeg) + + val proj1 = df1Joindf2.join(df1, df1Joindf2("a") === df1("a")).select(df1Joindf2("a"), + df1("a")).queryExecution.analyzed.asInstanceOf[Project] Review Comment: Shouldn't selecting `df1("a")` be ambiguous here? It is not ambiguous in the join condition because `df1Joindf2("a")` can come from only one side, so `df1("a")` must come from the other side. But after the join, I'm not sure why it shouldn't be ambiguous. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org