This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new f659f8d1b019 [SPARK-47319][SQL] Improve missingInput calculation
f659f8d1b019 is described below

commit f659f8d1b019385ad95673205386b6cbe8f89a49
Author: Peter Toth <peter.t...@gmail.com>
AuthorDate: Fri Mar 8 13:15:35 2024 +0800

    [SPARK-47319][SQL] Improve missingInput calculation
    
    ### What changes were proposed in this pull request?
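    This PR improves the `QueryPlan.missingInput` calculation: it returns
    `AttributeSet.empty` immediately when `references` is empty, and
    `AttributeSet.--` now short-circuits when either operand is empty.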
    
    ### Why are the changes needed?
    This seems to be the root cause of `DeduplicateRelations` slowness in some
    cases.
    
    ### Does this PR introduce _any_ user-facing change?
    No.
    
    ### How was this patch tested?
    Existing UTs.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No.
    
    Closes #45424 from peter-toth/fix-missinginput.
    
    Authored-by: Peter Toth <peter.t...@gmail.com>
    Signed-off-by: Kent Yao <y...@apache.org>
---
 .../sql/catalyst/expressions/AttributeSet.scala      | 20 +++++++++++++-------
 .../apache/spark/sql/catalyst/plans/QueryPlan.scala  |  8 +++++++-
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala
index 2628afd8923c..236380b2c030 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/AttributeSet.scala
@@ -104,13 +104,19 @@ class AttributeSet private (private val baseSet: mutable.LinkedHashSet[Attribute
    * in `other`.
    */
   def --(other: Iterable[NamedExpression]): AttributeSet = {
-    other match {
-      // SPARK-32755: `--` method behave differently under scala 2.12 and 2.13,
-      // use a Scala 2.12 based code to maintains the insertion order in Scala 2.13
-      case otherSet: AttributeSet =>
-        new AttributeSet(baseSet.clone() --= otherSet.baseSet)
-      case _ =>
-        new AttributeSet(baseSet.clone() --= other.map(a => new AttributeEquals(a.toAttribute)))
+    if (isEmpty) {
+      AttributeSet.empty
+    } else if (other.isEmpty) {
+      this
+    } else {
+      other match {
+        // SPARK-32755: `--` method behave differently under scala 2.12 and 2.13,
+        // use a Scala 2.12 based code to maintains the insertion order in Scala 2.13
+        case otherSet: AttributeSet =>
+          new AttributeSet(baseSet.clone() --= otherSet.baseSet)
+        case _ =>
+          new AttributeSet(baseSet.clone() --= other.map(a => new AttributeEquals(a.toAttribute)))
+      }
     }
   }
 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
index 2a62ea1feb03..0f049103542e 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/QueryPlan.scala
@@ -102,7 +102,13 @@ abstract class QueryPlan[PlanType <: QueryPlan[PlanType]]
   /**
    * Attributes that are referenced by expressions but not provided by this node's children.
    */
-  final def missingInput: AttributeSet = references -- inputSet
+  final def missingInput: AttributeSet = {
+    if (references.isEmpty) {
+      AttributeSet.empty
+    } else {
+      references -- inputSet
+    }
+  }
 
   /**
    * Runs [[transformExpressionsDown]] with `rule` on all expressions present


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to