This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new d5b9fc119cc [SPARK-41178][SQL] Fix parser rule precedence between JOIN and comma
d5b9fc119cc is described below

commit d5b9fc119cc4eedff35134ecb64b3123b70a689d
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Fri Nov 18 11:26:19 2022 +0800

    [SPARK-41178][SQL] Fix parser rule precedence between JOIN and comma
    
    ### What changes were proposed in this pull request?
    
    This PR fixes a long-standing parser bug in Spark: `JOIN` should take precedence over comma when combining relations. For example, `FROM t1, t2 JOIN t3` should result in `t1 X (t2 X t3)`, but today it is parsed as `(t1 X t2) X t3`.
    
    You can easily verify this behavior in other databases by running the query `SELECT * FROM t1, t2 JOIN t3 ON t1.c and t3.c`. It should fail because `t1.c` is not available when joining `t2` and `t3`. I tested MySQL, Oracle, and PostgreSQL: all of them fail, but Spark can run this query due to the wrong join order.
    
    Fixing this bug, however, has a large impact: it changes the join order of existing queries, which can lead to worse performance or unexpected analysis errors. To be safe, this PR hides the fix behind a new ANSI sub-config.
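
    As a rough sketch of how a user would opt in once this lands (config keys taken from the diff below; `spark` is an assumed `SparkSession`):

    ```scala
    // Both flags are required: spark.sql.ansi.relationPrecedence only takes
    // effect when ANSI mode is on (see the SQLConf change in this commit).
    spark.conf.set("spark.sql.ansi.enabled", "true")
    spark.conf.set("spark.sql.ansi.relationPrecedence", "true")

    // From here on, `FROM t1, t2 JOIN t3` is parsed as t1 X (t2 X t3).
    ```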
    
    ### Why are the changes needed?
    
    bug fix
    
    ### Does this PR introduce _any_ user-facing change?
    
    No, as the fix is turned off by default.
    
    ### How was this patch tested?
    
    new tests
    
    Closes #38691 from cloud-fan/join.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/parser/AstBuilder.scala     |  8 +++++--
 .../org/apache/spark/sql/internal/SQLConf.scala    | 14 ++++++++++--
 .../sql/catalyst/parser/PlanParserSuite.scala      | 26 ++++++++++++++++++++++
 3 files changed, 44 insertions(+), 4 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index af2097b5d0f..4adb70bc390 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -914,7 +914,11 @@ class AstBuilder extends SqlBaseParserBaseVisitor[AnyRef] with SQLConfHelper wit
    */
   override def visitFromClause(ctx: FromClauseContext): LogicalPlan = withOrigin(ctx) {
     val from = ctx.relation.asScala.foldLeft(null: LogicalPlan) { (left, relation) =>
-      val right = plan(relation.relationPrimary)
+      val right = if (conf.ansiRelationPrecedence) {
+        visitRelation(relation)
+      } else {
+        plan(relation.relationPrimary)
+      }
       val join = right.optionalMap(left) { (left, right) =>
         if (relation.LATERAL != null) {
           if (!relation.relationPrimary.isInstanceOf[AliasedQueryContext]) {
@@ -925,7 +929,7 @@ class AstBuilder extends SqlBaseParserBaseVisitor[AnyRef] with SQLConfHelper wit
           Join(left, right, Inner, None, JoinHint.NONE)
         }
       }
-      withJoinRelations(join, relation)
+      if (conf.ansiRelationPrecedence) join else withJoinRelations(join, relation)
     }
     if (ctx.pivotClause() != null) {
       if (ctx.unpivotClause() != null) {
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index f6c3f902409..2f4b03cf591 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -2984,8 +2984,16 @@ object SQLConf {
     .createWithDefault(false)
 
   val DOUBLE_QUOTED_IDENTIFIERS = buildConf("spark.sql.ansi.doubleQuotedIdentifiers")
-    .doc("When true, Spark SQL reads literals enclosed in double quoted (\") as identifiers. " +
-      "When false they are read as string literals.")
+    .doc(s"When true and '${ANSI_ENABLED.key}' is true, Spark SQL reads literals enclosed in " +
+      "double quoted (\") as identifiers. When false they are read as string literals.")
+    .version("3.4.0")
+    .booleanConf
+    .createWithDefault(false)
+
+  val ANSI_RELATION_PRECEDENCE = buildConf("spark.sql.ansi.relationPrecedence")
+    .doc(s"When true and '${ANSI_ENABLED.key}' is true, JOIN takes precedence over comma when " +
+      "combining relation. For example, `t1, t2 JOIN t3` should result to `t1 X (t2 X t3)`. If " +
+      "the config is false, the result is `(t1 X t2) X t3`.")
     .version("3.4.0")
     .booleanConf
     .createWithDefault(false)
@@ -4684,6 +4692,8 @@ class SQLConf extends Serializable with Logging {
 
   def doubleQuotedIdentifiers: Boolean = ansiEnabled && getConf(DOUBLE_QUOTED_IDENTIFIERS)
 
+  def ansiRelationPrecedence: Boolean = ansiEnabled && getConf(ANSI_RELATION_PRECEDENCE)
+
   def timestampType: AtomicType = getConf(TIMESTAMP_TYPE) match {
     case "TIMESTAMP_LTZ" =>
       // For historical reason, the TimestampType maps to TIMESTAMP WITH LOCAL TIME ZONE
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index 968e2227234..11590e465c2 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -704,6 +704,32 @@ class PlanParserSuite extends AnalysisTest {
         .join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2"))
         .select(star()))
 
+    assertEqual(
+      "select * from t1 JOIN t2, t3 join t2 on t1.col1 = t2.col2",
+      table("t1")
+        .join(table("t2"))
+        .join(table("t3"))
+        .join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2"))
+        .select(star()))
+
+    // Implicit joins - ANSI mode
+    withSQLConf(
+      SQLConf.ANSI_ENABLED.key -> "true",
+      SQLConf.ANSI_RELATION_PRECEDENCE.key -> "true") {
+
+      assertEqual(
+        "select * from t1, t3 join t2 on t1.col1 = t2.col2",
+        table("t1").join(
+          table("t3").join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2")))
+          .select(star()))
+
+      assertEqual(
+        "select * from t1 JOIN t2, t3 join t2 on t1.col1 = t2.col2",
+        table("t1").join(table("t2")).join(
+          table("t3").join(table("t2"), Inner, Option($"t1.col1" === $"t2.col2")))
+          .select(star()))
+    }
+
     // Test lateral join with join conditions
     assertEqual(
       s"select * from t join lateral (select * from u) uu on true",

