[ https://issues.apache.org/jira/browse/SPARK-31705?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17110836#comment-17110836 ]
Apache Spark commented on SPARK-31705: -------------------------------------- User 'wangyum' has created a pull request for this issue: https://github.com/apache/spark/pull/28575 > Rewrite join condition to conjunctive normal form > ------------------------------------------------- > > Key: SPARK-31705 > URL: https://issues.apache.org/jira/browse/SPARK-31705 > Project: Spark > Issue Type: Improvement > Components: SQL > Affects Versions: 3.1.0 > Reporter: Yuming Wang > Assignee: Yuming Wang > Priority: Major > > Rewrite join condition to [conjunctive normal > form|https://en.wikipedia.org/wiki/Conjunctive_normal_form] to push more > conditions to filter. > PostgreSQL: > {code:sql} > CREATE TABLE lineitem (l_orderkey BIGINT, l_partkey BIGINT, l_suppkey BIGINT, > > l_linenumber INT, l_quantity DECIMAL(10,0), l_extendedprice DECIMAL(10,0), > > l_discount DECIMAL(10,0), l_tax DECIMAL(10,0), l_returnflag varchar(255), > > l_linestatus varchar(255), l_shipdate DATE, l_commitdate DATE, l_receiptdate > DATE, > l_shipinstruct varchar(255), l_shipmode varchar(255), l_comment varchar(255)); > > CREATE TABLE orders ( > o_orderkey BIGINT, o_custkey BIGINT, o_orderstatus varchar(255), > o_totalprice DECIMAL(10,0), o_orderdate DATE, o_orderpriority varchar(255), > o_clerk varchar(255), o_shippriority INT, o_comment varchar(255)); > EXPLAIN > SELECT Count(*) > FROM lineitem, > orders > WHERE l_orderkey = o_orderkey > AND ( ( l_suppkey > 3 > AND o_custkey > 13 ) > OR ( l_suppkey > 1 > AND o_custkey > 11 ) ) > AND l_partkey > 19; > EXPLAIN > SELECT Count(*) > FROM lineitem > JOIN orders > ON l_orderkey = o_orderkey > AND ( ( l_suppkey > 3 > AND o_custkey > 13 ) > OR ( l_suppkey > 1 > AND o_custkey > 11 ) ) > AND l_partkey > 19; > EXPLAIN > SELECT Count(*) > FROM lineitem, > orders > WHERE l_orderkey = o_orderkey > AND NOT ( ( l_suppkey > 3 > AND ( l_suppkey > 2 > OR o_custkey > 13 ) ) > OR ( l_suppkey > 1 > AND o_custkey > 11 ) ) > AND l_partkey > 19; > {code} > {noformat} > postgres=# EXPLAIN > postgres-# SELECT Count(*) > postgres-# FROM lineitem, > postgres-# orders > postgres-# WHERE l_orderkey = o_orderkey > postgres-# AND ( ( l_suppkey > 3 > postgres(# AND o_custkey > 13 ) > postgres(# OR ( l_suppkey > 1 > postgres(# AND o_custkey > 11 ) ) > postgres-# AND l_partkey > 19; > QUERY PLAN > ----------------------------------------------------------------------------------------------------------------------------------------- > Aggregate (cost=21.18..21.19 rows=1 width=8) > -> Hash Join (cost=10.60..21.17 rows=2 width=0) > Hash Cond: (orders.o_orderkey = lineitem.l_orderkey) > Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) > OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11))) > -> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16) > Filter: ((o_custkey > 13) OR (o_custkey > 11)) > -> Hash (cost=10.53..10.53 rows=6 width=16) > -> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16) > Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR > (l_suppkey > 1))) > (9 rows) > postgres=# EXPLAIN > postgres-# SELECT Count(*) > postgres-# FROM lineitem > postgres-# JOIN orders > postgres-# ON l_orderkey = o_orderkey > postgres-# AND ( ( l_suppkey > 3 > postgres(# AND o_custkey > 13 ) > postgres(# OR ( l_suppkey > 1 > postgres(# AND o_custkey > 11 ) ) > postgres-# AND l_partkey > 19; > QUERY PLAN > ----------------------------------------------------------------------------------------------------------------------------------------- > Aggregate (cost=21.18..21.19 rows=1 width=8) > -> Hash Join (cost=10.60..21.17 rows=2 width=0) > Hash Cond: (orders.o_orderkey = lineitem.l_orderkey) > Join Filter: (((lineitem.l_suppkey > 3) AND (orders.o_custkey > 13)) > OR ((lineitem.l_suppkey > 1) AND (orders.o_custkey > 11))) > -> Seq Scan on orders (cost=0.00..10.45 rows=17 width=16) > Filter: ((o_custkey > 13) OR (o_custkey > 11)) > -> Hash (cost=10.53..10.53 rows=6 width=16) > -> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16) > Filter: ((l_partkey > 19) AND ((l_suppkey > 3) OR > (l_suppkey > 1))) > (9 rows) > postgres=# EXPLAIN > postgres-# SELECT Count(*) > postgres-# FROM lineitem, > postgres-# orders > postgres-# WHERE l_orderkey = o_orderkey > postgres-# AND NOT ( ( l_suppkey > 3 > postgres(# AND ( l_suppkey > 2 > postgres(# OR o_custkey > 13 ) ) > postgres(# OR ( l_suppkey > 1 > postgres(# AND o_custkey > 11 ) ) > postgres-# AND l_partkey > 19; > > QUERY PLAN > ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------- > Aggregate (cost=21.15..21.16 rows=1 width=8) > -> Hash Join (cost=10.60..21.15 rows=2 width=0) > Hash Cond: (orders.o_orderkey = lineitem.l_orderkey) > Join Filter: (((lineitem.l_suppkey <= 1) OR (orders.o_custkey <= > 11)) AND ((lineitem.l_suppkey <= 3) OR ((lineitem.l_suppkey <= 2) AND > (orders.o_custkey <= 13)))) > -> Seq Scan on orders (cost=0.00..10.30 rows=30 width=16) > -> Hash (cost=10.53..10.53 rows=6 width=16) > -> Seq Scan on lineitem (cost=0.00..10.53 rows=6 width=16) > Filter: ((l_partkey > 19) AND ((l_suppkey <= 3) OR > (l_suppkey <= 2))) > (8 rows) > {noformat} > https://docs.teradata.com/reader/i_VlYHwN0b8knh6AEWrv1Q/Bh~37Qcc2~24P_jn2~0w6w -- This message was sent by Atlassian Jira (v8.3.4#803005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org