(spark) 01/01: [SPARK-XXXX][SQL] Address review comments

yao Wed, 04 Feb 2026 01:27:33 -0800

This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch feature/crossjoin-array-contains-benchmark
in repository https://gitbox.apache.org/repos/asf/spark.git


commit 0e22ceb620997941f1b7a9e7a6776f7e91c8a4d7
Author: Kent Yao <[email protected]>
AuthorDate: Wed Feb 4 09:25:04 2026 +0000

    [SPARK-XXXX][SQL] Address review comments
    
    - Register CrossJoinArrayContainsToInnerJoin in Optimizer.scala batches
    - Update benchmark to exclude rule for accurate unoptimized baseline
    - Remove Presto references from code comments
---
 .../sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala | 4 +---
 .../scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala  | 1 +
 .../benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala     | 7 +++++--
 3 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala
index b74f08319e63..10ed24468202 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala
@@ -41,12 +41,10 @@ import org.apache.spark.sql.types._
  * }}}
  *
  * This avoids the O(N*M) cross join by using unnesting and equi-join.
- *
- * Ported from Presto's CrossJoinWithArrayContainsToInnerJoin optimizer rule.
  */
 object CrossJoinArrayContainsToInnerJoin extends Rule[LogicalPlan] with PredicateHelper {
 
-  // Supported element types for the optimization (matching Presto's supported types)
+  // Supported element types for the optimization
   private val supportedTypes: Set[DataType] = Set(
     IntegerType, LongType, StringType, DateType
   )
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
index fe15819bd44a..0a018bfe08a5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
@@ -261,6 +261,7 @@ abstract class Optimizer(catalogManager: CatalogManager)
     Batch("Optimize One Row Plan", fixedPoint, OptimizeOneRowPlan),
     // The following batch should be executed after batch "Join Reorder" and "LocalRelation".
     Batch("Check Cartesian Products", Once,
+      CrossJoinArrayContainsToInnerJoin,
       CheckCartesianProducts),
     Batch("RewriteSubquery", Once,
       RewritePredicateSubquery,
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala
index 1b9a6439d8cf..2ac64d9099dc 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala
@@ -73,8 +73,11 @@ object CrossJoinArrayContainsToInnerJoinBenchmark extends SqlBasedBenchmark {
     items.createOrReplaceTempView("items")
 
     benchmark.addCase("Cross join + array_contains filter (unoptimized)", numIters = 3) { _ =>
-      // Disable the optimization to simulate unoptimized behavior
-      withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") {
+      // Disable the optimization to measure the true cross-join+filter baseline
+      withSQLConf(
+        SQLConf.CROSS_JOINS_ENABLED.key -> "true",
+        SQLConf.OPTIMIZER_EXCLUDED_RULES.key ->
+          "org.apache.spark.sql.catalyst.optimizer.CrossJoinArrayContainsToInnerJoin") {
         // This query would be a cross join with filter without optimization
         val df = spark.sql(
           """


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(spark) 01/01: [SPARK-XXXX][SQL] Address review comments

Reply via email to