ahshahid commented on code in PR #45343:
URL: https://github.com/apache/spark/pull/45343#discussion_r1520323348


##########
sql/core/src/test/scala/org/apache/spark/sql/DataFrameSelfJoinSuite.scala:
##########
@@ -498,4 +559,70 @@ class DataFrameSelfJoinSuite extends QueryTest with 
SharedSparkSession {
       assert(df1.join(df2, $"t1.i" === $"t2.i").cache().count() == 1)
     }
   }
+
+  test("SPARK_47217: deduplication of project causes ambiguity in resolution") 
{
+    val df = Seq((1, 2)).toDF("a", "b")
+    val df2 = df.select(df("a").as("aa"), df("b").as("bb"))
+    val df3 = df2.join(df, df2("bb") === df("b")).select(df2("aa"), df("a"))
+    checkAnswer(
+      df3,
+      Row(1, 1) :: Nil)
+  }
+
+  test("SPARK-47217. deduplication in nested joins with join attribute 
aliased") {
+    val df1 = Seq((1, 2)).toDF("a", "b")
+    val df2 = Seq((1, 2)).toDF("aa", "bb")
+    val df1Joindf2 = df1.join(df2, df1("a") === 
df2("aa")).select(df1("a").as("aaa"),
+      df2("aa"), df1("b"))
+
+    assertCorrectResolution(df1Joindf2.join(df1, df1Joindf2("aaa") === 
df1("a")),
+      Resolution.LeftConditionToLeftLeg, Resolution.RightConditionToRightLeg)
+
+    assertCorrectResolution(df1.join(df1Joindf2, df1Joindf2("aaa") === 
df1("a")),
+      Resolution.LeftConditionToRightLeg, Resolution.RightConditionToLeftLeg)
+
+    val proj1 = df1Joindf2.join(df1, df1Joindf2("aaa") === 
df1("a")).select(df1Joindf2("aa"),
+      df1("a")).queryExecution.analyzed.asInstanceOf[Project]
+    val join1 = proj1.child.asInstanceOf[Join]
+    assert(proj1.projectList(0).references.subsetOf(join1.left.outputSet))
+    assert(proj1.projectList(1).references.subsetOf(join1.right.outputSet))
+
+    val proj2 = df1.join(df1Joindf2, df1Joindf2("aaa") === 
df1("a")).select(df1Joindf2("aa"),
+      df1("a")).queryExecution.analyzed.asInstanceOf[Project]
+    val join2 = proj2.child.asInstanceOf[Join]
+    assert(proj2.projectList(0).references.subsetOf(join2.right.outputSet))
+    assert(proj2.projectList(1).references.subsetOf(join2.left.outputSet))
+  }
+
+  test("SPARK-47217. deduplication in nested joins without join attribute 
aliased") {
+    val df1 = Seq((1, 2)).toDF("a", "b")
+    val df2 = Seq((1, 2)).toDF("aa", "bb")
+    val df1Joindf2 = df1.join(df2, df1("a") === df2("aa")).select(df1("a"), 
df2("aa"), df1("b"))
+
+    assertCorrectResolution(df1Joindf2.join(df1, df1Joindf2("a") === df1("a")),
+      Resolution.LeftConditionToLeftLeg, Resolution.RightConditionToRightLeg)
+
+    assertCorrectResolution(df1.join(df1Joindf2, df1Joindf2("a") === df1("a")),
+      Resolution.LeftConditionToRightLeg, Resolution.RightConditionToLeftLeg)
+
+    val proj1 = df1Joindf2.join(df1, df1Joindf2("a") === 
df1("a")).select(df1Joindf2("a"),
+      df1("a")).queryExecution.analyzed.asInstanceOf[Project]

Review Comment:
   "Shouldn't selecting df1("a") be ambiguous here? It is not ambiguous in the 
join condition because df1Joindf2("a") can come from only one side so the 
df1("a") must come from the other side. But after the join I'm not sure we can 
follow the same logic."
   
   As I envisage , mentally, for the Join with select as 
                                            Project
                                                   |
                                               Filter    
                                                   |
                                                Join
   
   So what is true for filter in terms of attribute refs should also be true 
for Project.
   
   But I am not basing my view on above ( i.e in terms of ExprIDs).
   I am looking at it from Point of View of user, that is one is coming from 
df1("a") and another from  df1Joindf2("a"), so there is no ambiguity either in 
Join or Select



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to