[ https://issues.apache.org/jira/browse/SPARK-36162?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Yuming Wang updated SPARK-36162:
--------------------------------
    Description: 
sql("select * from date_dim join item on d_date_sk = i_item_sk").explain("cost")
{noformat}
== Optimized Logical Plan ==
Join Inner, (d_date_sk#0 = i_item_sk#28), Statistics(sizeInBytes=1.0 B, rowCount=0)
:- Relation default.date_dim[d_date_sk#0,d_date_id#1,d_date#2,d_month_seq#3,d_week_seq#4,d_quarter_seq#5,d_year#6,d_dow#7,d_moy#8,d_dom#9,d_qoy#10,d_fy_year#11,d_fy_quarter_seq#12,d_fy_week_seq#13,d_day_name#14,d_quarter_name#15,d_holiday#16,d_weekend#17,d_following_holiday#18,d_first_dom#19,d_last_dom#20,d_same_day_ly#21,d_same_day_lq#22,d_current_day#23,... 4 more fields] parquet, Statistics(sizeInBytes=17.6 MiB, rowCount=7.30E+4)
+- Relation default.item[i_item_sk#28,i_item_id#29,i_rec_start_date#30,i_rec_end_date#31,i_item_desc#32,i_current_price#33,i_wholesale_cost#34,i_brand_id#35,i_brand#36,i_class_id#37,i_class#38,i_category_id#39,i_category#40,i_manufact_id#41,i_manufact#42,i_size#43,i_formulation#44,i_color#45,i_units#46,i_container#47,i_manager_id#48,i_product_name#49] parquet, Statistics(sizeInBytes=85.2 MiB, rowCount=2.04E+5)
{noformat}

sql("select * from date_dim join item on d_date_sk <=> 
i_item_sk").explain("cost")
{noformat}
== Optimized Logical Plan ==
Join Inner, (d_date_sk#0 <=> i_item_sk#28), Statistics(sizeInBytes=9.2 TiB, rowCount=1.49E+10)
:- Relation default.date_dim[d_date_sk#0,d_date_id#1,d_date#2,d_month_seq#3,d_week_seq#4,d_quarter_seq#5,d_year#6,d_dow#7,d_moy#8,d_dom#9,d_qoy#10,d_fy_year#11,d_fy_quarter_seq#12,d_fy_week_seq#13,d_day_name#14,d_quarter_name#15,d_holiday#16,d_weekend#17,d_following_holiday#18,d_first_dom#19,d_last_dom#20,d_same_day_ly#21,d_same_day_lq#22,d_current_day#23,... 4 more fields] parquet, Statistics(sizeInBytes=17.6 MiB, rowCount=7.30E+4)
+- Relation default.item[i_item_sk#28,i_item_id#29,i_rec_start_date#30,i_rec_end_date#31,i_item_desc#32,i_current_price#33,i_wholesale_cost#34,i_brand_id#35,i_brand#36,i_class_id#37,i_class#38,i_category_id#39,i_category#40,i_manufact_id#41,i_manufact#42,i_size#43,i_formulation#44,i_color#45,i_units#46,i_container#47,i_manager_id#48,i_product_name#49] parquet, Statistics(sizeInBytes=85.2 MiB, rowCount=2.04E+5)
{noformat}
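
The <=> plan gets no benefit from the column statistics: with = the estimator uses the min/max of both join keys (and here estimates zero matching rows, presumably because the d_date_sk and i_item_sk ranges do not overlap), while with <=> it appears to fall back to the cartesian-product estimate, i.e. the product of the two input row counts:
{noformat}
7.30E+4 * 2.04E+5 = 1.4892E+10 ≈ 1.49E+10 rows
{noformat}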

https://github.com/apache/spark/blob/d6a68e0b67ff7de58073c176dd097070e88ac831/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala#L329-L339
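
For reference, the collect in extractJoinKeysWithColStats only matches plain AttributeReference pairs, while ExtractEquiJoinKeys rewrites a <=> b into coalesce(a, <type default>) = coalesce(b, <type default>), so null-safe join keys never reach the key-based estimation. A rough sketch of one possible direction is below; it is not a tested patch, the helper names (columnStatsWithCountsExist, leftStats, rightStats) follow the linked file, and proper handling of the null bucket (e.g. via nullCount in the column stats) is left out.
{noformat}
private def extractJoinKeysWithColStats(
    leftKeys: Seq[Expression],
    rightKeys: Seq[Expression]): Seq[(AttributeReference, AttributeReference)] = {
  leftKeys.zip(rightKeys).collect {
    // Existing case: plain equi-join keys with column stats on both sides.
    case (lk: AttributeReference, rk: AttributeReference)
      if columnStatsWithCountsExist((leftStats, lk), (rightStats, rk)) => (lk, rk)
    // Illustrative addition: keys produced by ExtractEquiJoinKeys for EqualNullSafe,
    // i.e. coalesce(attr, <type default literal>) on both sides. The underlying
    // attributes still need column stats for the estimation to proceed.
    case (Coalesce(Seq(lk: AttributeReference, _: Literal)),
          Coalesce(Seq(rk: AttributeReference, _: Literal)))
      if columnStatsWithCountsExist((leftStats, lk), (rightStats, rk)) => (lk, rk)
  }
}
{noformat}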


  was:
sql("select * from date_dim join item on d_date_sk = i_item_sk").explain("cost")
{noformat}
== Optimized Logical Plan ==
Join Inner, (d_date_sk#0 <=> i_item_sk#28), Statistics(sizeInBytes=9.2 TiB, rowCount=1.49E+10)
:- Relation default.date_dim[d_date_sk#0,d_date_id#1,d_date#2,d_month_seq#3,d_week_seq#4,d_quarter_seq#5,d_year#6,d_dow#7,d_moy#8,d_dom#9,d_qoy#10,d_fy_year#11,d_fy_quarter_seq#12,d_fy_week_seq#13,d_day_name#14,d_quarter_name#15,d_holiday#16,d_weekend#17,d_following_holiday#18,d_first_dom#19,d_last_dom#20,d_same_day_ly#21,d_same_day_lq#22,d_current_day#23,... 4 more fields] parquet, Statistics(sizeInBytes=17.6 MiB, rowCount=7.30E+4)
+- Relation default.item[i_item_sk#28,i_item_id#29,i_rec_start_date#30,i_rec_end_date#31,i_item_desc#32,i_current_price#33,i_wholesale_cost#34,i_brand_id#35,i_brand#36,i_class_id#37,i_class#38,i_category_id#39,i_category#40,i_manufact_id#41,i_manufact#42,i_size#43,i_formulation#44,i_color#45,i_units#46,i_container#47,i_manager_id#48,i_product_name#49] parquet, Statistics(sizeInBytes=85.2 MiB, rowCount=2.04E+5)
{noformat}

sql("select * from date_dim join item on d_date_sk <=> 
i_item_sk").explain("cost")
{noformat}
== Optimized Logical Plan ==
Join Inner, (d_date_sk#0 <=> i_item_sk#28), Statistics(sizeInBytes=9.2 TiB, rowCount=1.49E+10)
:- Relation default.date_dim[d_date_sk#0,d_date_id#1,d_date#2,d_month_seq#3,d_week_seq#4,d_quarter_seq#5,d_year#6,d_dow#7,d_moy#8,d_dom#9,d_qoy#10,d_fy_year#11,d_fy_quarter_seq#12,d_fy_week_seq#13,d_day_name#14,d_quarter_name#15,d_holiday#16,d_weekend#17,d_following_holiday#18,d_first_dom#19,d_last_dom#20,d_same_day_ly#21,d_same_day_lq#22,d_current_day#23,... 4 more fields] parquet, Statistics(sizeInBytes=17.6 MiB, rowCount=7.30E+4)
+- Relation default.item[i_item_sk#28,i_item_id#29,i_rec_start_date#30,i_rec_end_date#31,i_item_desc#32,i_current_price#33,i_wholesale_cost#34,i_brand_id#35,i_brand#36,i_class_id#37,i_class#38,i_category_id#39,i_category#40,i_manufact_id#41,i_manufact#42,i_size#43,i_formulation#44,i_color#45,i_units#46,i_container#47,i_manager_id#48,i_product_name#49] parquet, Statistics(sizeInBytes=85.2 MiB, rowCount=2.04E+5)
{noformat}

https://github.com/apache/spark/blob/d6a68e0b67ff7de58073c176dd097070e88ac831/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala#L329-L339



> extractJoinKeysWithColStats support EqualNullSafe
> -------------------------------------------------
>
>                 Key: SPARK-36162
>                 URL: https://issues.apache.org/jira/browse/SPARK-36162
>             Project: Spark
>          Issue Type: Improvement
>          Components: SQL
>    Affects Versions: 3.3.0
>            Reporter: Yuming Wang
>            Priority: Major
>
> sql("select * from date_dim join item on d_date_sk = 
> i_item_sk").explain("cost")
> {noformat}
> == Optimized Logical Plan ==
> Join Inner, (d_date_sk#0 = i_item_sk#28), Statistics(sizeInBytes=1.0 B, 
> rowCount=0)
> :- Relation 
> default.date_dim[d_date_sk#0,d_date_id#1,d_date#2,d_month_seq#3,d_week_seq#4,d_quarter_seq#5,d_year#6,d_dow#7,d_moy#8,d_dom#9,d_qoy#10,d_fy_year#11,d_fy_quarter_seq#12,d_fy_week_seq#13,d_day_name#14,d_quarter_name#15,d_holiday#16,d_weekend#17,d_following_holiday#18,d_first_dom#19,d_last_dom#20,d_same_day_ly#21,d_same_day_lq#22,d_current_day#23,...
>  4 more fields] parquet, Statistics(sizeInBytes=17.6 MiB, rowCount=7.30E+4)
> +- Relation 
> default.item[i_item_sk#28,i_item_id#29,i_rec_start_date#30,i_rec_end_date#31,i_item_desc#32,i_current_price#33,i_wholesale_cost#34,i_brand_id#35,i_brand#36,i_class_id#37,i_class#38,i_category_id#39,i_category#40,i_manufact_id#41,i_manufact#42,i_size#43,i_formulation#44,i_color#45,i_units#46,i_container#47,i_manager_id#48,i_product_name#49]
>  parquet, Statistics(sizeInBytes=85.2 MiB, rowCount=2.04E+5)
> {noformat}
> sql("select * from date_dim join item on d_date_sk <=> 
> i_item_sk").explain("cost")
> {noformat}
> == Optimized Logical Plan ==
> Join Inner, (d_date_sk#0 <=> i_item_sk#28), Statistics(sizeInBytes=9.2 TiB, 
> rowCount=1.49E+10)
> :- Relation 
> default.date_dim[d_date_sk#0,d_date_id#1,d_date#2,d_month_seq#3,d_week_seq#4,d_quarter_seq#5,d_year#6,d_dow#7,d_moy#8,d_dom#9,d_qoy#10,d_fy_year#11,d_fy_quarter_seq#12,d_fy_week_seq#13,d_day_name#14,d_quarter_name#15,d_holiday#16,d_weekend#17,d_following_holiday#18,d_first_dom#19,d_last_dom#20,d_same_day_ly#21,d_same_day_lq#22,d_current_day#23,...
>  4 more fields] parquet, Statistics(sizeInBytes=17.6 MiB, rowCount=7.30E+4)
> +- Relation 
> default.item[i_item_sk#28,i_item_id#29,i_rec_start_date#30,i_rec_end_date#31,i_item_desc#32,i_current_price#33,i_wholesale_cost#34,i_brand_id#35,i_brand#36,i_class_id#37,i_class#38,i_category_id#39,i_category#40,i_manufact_id#41,i_manufact#42,i_size#43,i_formulation#44,i_color#45,i_units#46,i_container#47,i_manager_id#48,i_product_name#49]
>  parquet, Statistics(sizeInBytes=85.2 MiB, rowCount=2.04E+5)
> {noformat}
> https://github.com/apache/spark/blob/d6a68e0b67ff7de58073c176dd097070e88ac831/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/JoinEstimation.scala#L329-L339



--
This message was sent by Atlassian Jira
(v8.3.4#803005)
