This is an automated email from the ASF dual-hosted git repository. yao pushed a commit to branch feature/crossjoin-array-contains-benchmark in repository https://gitbox.apache.org/repos/asf/spark.git
commit 0e22ceb620997941f1b7a9e7a6776f7e91c8a4d7 Author: Kent Yao <[email protected]> AuthorDate: Wed Feb 4 09:25:04 2026 +0000 [SPARK-XXXX][SQL] Address review comments - Register CrossJoinArrayContainsToInnerJoin in Optimizer.scala batches - Update benchmark to exclude rule for accurate unoptimized baseline - Remove Presto references from code comments --- .../sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala | 4 +--- .../scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala | 1 + .../benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala | 7 +++++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala index b74f08319e63..10ed24468202 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/CrossJoinArrayContainsToInnerJoin.scala @@ -41,12 +41,10 @@ import org.apache.spark.sql.types._ * }}} * * This avoids the O(N*M) cross join by using unnesting and equi-join. - * - * Ported from Presto's CrossJoinWithArrayContainsToInnerJoin optimizer rule. */ object CrossJoinArrayContainsToInnerJoin extends Rule[LogicalPlan] with PredicateHelper { - // Supported element types for the optimization (matching Presto's supported types) + // Supported element types for the optimization private val supportedTypes: Set[DataType] = Set( IntegerType, LongType, StringType, DateType ) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala index fe15819bd44a..0a018bfe08a5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala @@ -261,6 +261,7 @@ abstract class Optimizer(catalogManager: CatalogManager) Batch("Optimize One Row Plan", fixedPoint, OptimizeOneRowPlan), // The following batch should be executed after batch "Join Reorder" and "LocalRelation". Batch("Check Cartesian Products", Once, + CrossJoinArrayContainsToInnerJoin, CheckCartesianProducts), Batch("RewriteSubquery", Once, RewritePredicateSubquery, diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala index 1b9a6439d8cf..2ac64d9099dc 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CrossJoinArrayContainsToInnerJoinBenchmark.scala @@ -73,8 +73,11 @@ object CrossJoinArrayContainsToInnerJoinBenchmark extends SqlBasedBenchmark { items.createOrReplaceTempView("items") benchmark.addCase("Cross join + array_contains filter (unoptimized)", numIters = 3) { _ => - // Disable the optimization to simulate unoptimized behavior - withSQLConf(SQLConf.CROSS_JOINS_ENABLED.key -> "true") { + // Disable the optimization to measure the true cross-join+filter baseline + withSQLConf( + SQLConf.CROSS_JOINS_ENABLED.key -> "true", + SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> + "org.apache.spark.sql.catalyst.optimizer.CrossJoinArrayContainsToInnerJoin") { // This query would be a cross join with filter without optimization val df = spark.sql( """ --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
