This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 493493d6c5bb [SPARK-48173][SQL] CheckAnalysis should see the entire query plan
493493d6c5bb is described below

commit 493493d6c5bbbaa0b04f5548ac1ccd9502e8b8fa
Author: Wenchen Fan <wenc...@databricks.com>
AuthorDate: Tue May 7 08:02:25 2024 -0700

    [SPARK-48173][SQL] CheckAnalysis should see the entire query plan
    
    ### What changes were proposed in this pull request?
    
    This is a follow-up of https://github.com/apache/spark/pull/38029 . Some custom check rules need to see the entire query plan tree to get some context, but https://github.com/apache/spark/pull/38029 breaks this because it checks the query plans of dangling CTE relations recursively, in isolation from the main plan.
    
    This PR fixes it by putting the dangling CTE relations back into the main query plan and then checking the main query plan.
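
    For illustration only (not part of this commit), a query with a dangling CTE that exercises this path could look like the sketch below; `spark` is assumed to be an active `SparkSession`:

    ```scala
    // Hypothetical example: the CTE `unused` is never referenced, so InlineCTE
    // drops it from the inlined plan. With this change, CheckAnalysis collects
    // such dangling CTERelationDefs and re-attaches them to the inlined plan via
    // WithCTE, so checkAnalysis0 (and custom check rules) still see them inside
    // one complete query plan.
    spark.sql(
      """
        |WITH unused AS (SELECT 1 AS a)
        |SELECT 2 AS b
        |""".stripMargin)
    ```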
    
    ### Why are the changes needed?
    
    Reverts the breaking change to custom check rules.
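
    As a hedged sketch (not from this PR), a custom check rule of the kind affected might be registered via `SparkSessionExtensions.injectCheckRule` roughly as below; the class name and the check logic are made up for illustration:

    ```scala
    import org.apache.spark.sql.SparkSessionExtensions
    import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, WithCTE}

    // Hypothetical extension: the injected check rule receives the query plan and
    // may want to inspect surrounding context, e.g. an enclosing WithCTE node.
    // Before this fix, dangling CTE definitions were checked one by one outside
    // the main plan, so such context was not visible to the rule.
    class MyCheckRuleExtension extends (SparkSessionExtensions => Unit) {
      override def apply(ext: SparkSessionExtensions): Unit = {
        ext.injectCheckRule { _ => (plan: LogicalPlan) =>
          plan.foreach {
            case _: WithCTE => // e.g. validate the CTE definitions in context here
            case _ =>
          }
        }
      }
    }
    ```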
    
    ### Does this PR introduce _any_ user-facing change?
    
    No for most users. This restores the behavior of Spark 3.3 and earlier for custom check rules.
    
    ### How was this patch tested?
    
    Existing tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #46439 from cloud-fan/check.
    
    Authored-by: Wenchen Fan <wenc...@databricks.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../sql/catalyst/analysis/CheckAnalysis.scala      | 39 +++++++++++-----------
 1 file changed, 20 insertions(+), 19 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
index d1b336b08955..e55f23b6aa86 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala
@@ -145,15 +145,16 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB
   private def checkUnreferencedCTERelations(
       cteMap: mutable.Map[Long, (CTERelationDef, Int, mutable.Map[Long, Int])],
       visited: mutable.Map[Long, Boolean],
+      danglingCTERelations: mutable.ArrayBuffer[CTERelationDef],
       cteId: Long): Unit = {
     if (visited(cteId)) {
       return
     }
     val (cteDef, _, refMap) = cteMap(cteId)
     refMap.foreach { case (id, _) =>
-      checkUnreferencedCTERelations(cteMap, visited, id)
+      checkUnreferencedCTERelations(cteMap, visited, danglingCTERelations, id)
     }
-    checkAnalysis0(cteDef.child)
+    danglingCTERelations.append(cteDef)
     visited(cteId) = true
   }
 
@@ -161,35 +162,35 @@ trait CheckAnalysis extends PredicateHelper with LookupCatalog with QueryErrorsB
     val inlineCTE = InlineCTE(alwaysInline = true)
     val cteMap = mutable.HashMap.empty[Long, (CTERelationDef, Int, mutable.Map[Long, Int])]
     inlineCTE.buildCTEMap(plan, cteMap)
+    val danglingCTERelations = mutable.ArrayBuffer.empty[CTERelationDef]
     val visited: mutable.Map[Long, Boolean] = mutable.Map.empty.withDefaultValue(false)
-    cteMap.foreach { case (cteId, (relation, refCount, _)) =>
-      // If a CTE relation is never used, it will disappear after inline. Here we explicitly check
-      // analysis for it, to make sure the entire query plan is valid.
-      try {
-        // If a CTE relation ref count is 0, the other CTE relations that reference it
-        // should also be checked by checkAnalysis0. This code will also guarantee the leaf
-        // relations that do not reference any others are checked first.
-        if (refCount == 0) {
-          checkUnreferencedCTERelations(cteMap, visited, cteId)
-        }
-      } catch {
-        case e: AnalysisException =>
-          throw new ExtendedAnalysisException(e, relation.child)
+    // If a CTE relation is never used, it will disappear after inline. Here we explicitly collect
+    // these dangling CTE relations, and put them back in the main query, to make sure the entire
+    // query plan is valid.
+    cteMap.foreach { case (cteId, (_, refCount, _)) =>
+      // If a CTE relation ref count is 0, the other CTE relations that reference it should also be
+      // collected. This code will also guarantee the leaf relations that do not reference
+      // any others are collected first.
+      if (refCount == 0) {
+        checkUnreferencedCTERelations(cteMap, visited, danglingCTERelations, cteId)
       }
     }
     // Inline all CTEs in the plan to help check query plan structures in subqueries.
-    var inlinedPlan: Option[LogicalPlan] = None
+    var inlinedPlan: LogicalPlan = plan
     try {
-      inlinedPlan = Some(inlineCTE(plan))
+      inlinedPlan = inlineCTE(plan)
     } catch {
       case e: AnalysisException =>
         throw new ExtendedAnalysisException(e, plan)
     }
+    if (danglingCTERelations.nonEmpty) {
+      inlinedPlan = WithCTE(inlinedPlan, danglingCTERelations.toSeq)
+    }
     try {
-      checkAnalysis0(inlinedPlan.get)
+      checkAnalysis0(inlinedPlan)
     } catch {
       case e: AnalysisException =>
-        throw new ExtendedAnalysisException(e, inlinedPlan.get)
+        throw new ExtendedAnalysisException(e, inlinedPlan)
     }
     plan.setAnalyzed()
   }

