Mostafa Mokhtar created HIVE-8526:
-------------------------------------

             Summary: Hive : CBO incorrect join order in TPC-DS Q45 as self 
join selectivity has incorrect CE
                 Key: HIVE-8526
                 URL: https://issues.apache.org/jira/browse/HIVE-8526
             Project: Hive
          Issue Type: Bug
          Components: CBO
    Affects Versions: 0.14.0
            Reporter: Mostafa Mokhtar
            Assignee: Harish Butani
             Fix For: 0.14.0


The join order has Item joined last where it should be joined first

Query 
{code}
select  ca_zip, ca_county, sum(ws_sales_price)
 from
    web_sales
    JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
    JOIN customer_address ON customer.c_current_addr_sk = 
customer_address.ca_address_sk 
    JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
    JOIN item ON web_sales.ws_item_sk = item.i_item_sk 
 where
        ( item.i_item_id in (select i_item_id
                             from item i2
                             where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 
23, 29)
                             )
            )
        and d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

Plan
{code}
2014-10-20 18:43:16,521 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12330)) - HiveSortRel(fetch=[100]): rowcount = 
1.710158597922807E7, cumulative cost = {7.169080587598123E10 rows, 
3.420317295845614E7 cpu, 0.0 io}, id = 579
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 
1.710158597922807E7, cumulative cost = {6.827294821015483E10 rows, 
1.710158697922807E7 cpu, 0.0 io}, id = 577
    HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 
1.710158597922807E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 
io}, id = 575
      HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 
1.710158597922807E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 0.0 
io}, id = 573
        HiveProjectRel($f0=[$2], $f1=[$1], $f2=[$0]): rowcount = 
6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 571
          HiveProjectRel(ws_sales_price=[$2], ca_county=[$7], ca_zip=[$8]): 
rowcount = 6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 
1.0 cpu, 0.0 io}, id = 569
            HiveFilterRel(condition=[AND(=($11, 2), =($10, 2000))]): rowcount = 
6.0197670310147226E7, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 567
              SemiJoinRel(condition=[=($13, $14)], joinType=[inner]): rowcount 
= 3.371069537368245E10, cumulative cost = {6.485509054432843E10 rows, 1.0 cpu, 
0.0 io}, id = 565
                HiveProjectRel(ws_item_sk=[$0], ws_bill_customer_sk=[$1], 
ws_sales_price=[$2], ws_sold_date_sk=[$3], c_customer_sk=[$9], 
c_current_addr_sk=[$10], ca_address_sk=[$11], ca_county=[$12], ca_zip=[$13], 
d_date_sk=[$6], d_year=[$7], d_qoy=[$8], i_item_sk=[$4], i_item_id=[$5]): 
rowcount = 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 
0.0 cpu, 0.0 io}, id = 669
                  HiveJoinRel(condition=[=($1, $9)], joinType=[inner]): 
rowcount = 3.371069537368245E10, cumulative cost = {6.485509054332843E10 rows, 
0.0 cpu, 0.0 io}, id = 667
                    HiveJoinRel(condition=[=($3, $6)], joinType=[inner]): 
rowcount = 2.1594638446E10, cumulative cost = {4.3189811941E10 rows, 0.0 cpu, 
0.0 io}, id = 664
                      HiveJoinRel(condition=[=($0, $4)], joinType=[inner]): 
rowcount = 2.1594638446E10, cumulative cost = {2.1595100446E10 rows, 0.0 cpu, 
0.0 io}, id = 601
                        HiveProjectRel(ws_item_sk=[$2], 
ws_bill_customer_sk=[$3], ws_sales_price=[$20], ws_sold_date_sk=[$33]): 
rowcount = 2.1594638446E10, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 
497
                          
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]): rowcount 
= 2.1594638446E10, cumulative cost = {0}, id = 341
                        HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): 
rowcount = 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 555
                          
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 340
                      HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): 
rowcount = 73049.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 551
                        
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 342
                    HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): 
rowcount = 7.064015632843196E7, cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 
io}, id = 598
                      HiveProjectRel(c_customer_sk=[$0], 
c_current_addr_sk=[$4]): rowcount = 8.0E7, cumulative cost = {0.0 rows, 0.0 
cpu, 0.0 io}, id = 500
                        
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]): rowcount 
= 8.0E7, cumulative cost = {0}, id = 343
                      HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], 
ca_zip=[$9]): rowcount = 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 547
                        
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]): 
rowcount = 4.0E7, cumulative cost = {0}, id = 339
                HiveProjectRel(i_item_id=[$1]): rowcount = 1.05119214745814, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 563
                  HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount = 
1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 561
                    HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 17, 19, 
23, 29)]): rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 
0.0 io}, id = 559
                      
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 340
{code}


Then I rewrote the query trying to force CBO to generate the correct join order
{code}
with items as (select i_item_sk from 
item  where
        ( item.i_item_id in (select i_item_id
                             from item i2
                             where i2.i_item_sk in (2, 3, 5, 7, 11, 13, 17, 19, 
23, 29)
                             )
            )
)

select  ca_zip, ca_county, sum(ws_sales_price)
 from
    web_sales
    JOIN items ON web_sales.ws_item_sk = items.i_item_sk 
    JOIN customer ON web_sales.ws_bill_customer_sk = customer.c_customer_sk
    JOIN customer_address ON customer.c_current_addr_sk = 
customer_address.ca_address_sk 
    JOIN date_dim ON web_sales.ws_sold_date_sk = date_dim.d_date_sk
 where
 d_qoy = 2 and d_year = 2000
 group by ca_zip, ca_county
 order by ca_zip, ca_county
 limit 100
{code}

But the correct join order wasn't generated because CE for item x item + filter 
has a selectivity of 1.
{code}
2014-10-20 18:46:27,120 DEBUG [main]: parse.SemanticAnalyzer 
(SemanticAnalyzer.java:apply(12330)) - HiveSortRel(fetch=[100]): rowcount = 
1.6595391288544238E7, cumulative cost = {2.8364280421639153E10 rows, 
3.3190782577088475E7 cpu, 0.0 io}, id = 1291
  HiveSortRel(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC]): rowcount = 
1.6595391288544238E7, cumulative cost = {2.505357243157397E10 rows, 
1.6595391288544238E7 cpu, 0.0 io}, id = 1289
    HiveProjectRel(ca_zip=[$0], ca_county=[$1], _o__c2=[$2]): rowcount = 
1.6595391288544238E7, cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 
0.0 io}, id = 1287
      HiveAggregateRel(group=[{0, 1}], agg#0=[sum($2)]): rowcount = 
1.6595391288544238E7, cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 
0.0 io}, id = 1285
        HiveProjectRel($f0=[$9], $f1=[$8], $f2=[$2]): rowcount = 
6.019767031014723E7, cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 
io}, id = 1283
          HiveProjectRel(ws_item_sk=[$5], ws_bill_customer_sk=[$6], 
ws_sales_price=[$7], ws_sold_date_sk=[$8], i_item_sk=[$12], c_customer_sk=[$0], 
c_current_addr_sk=[$1], ca_address_sk=[$2], ca_county=[$3], ca_zip=[$4], 
d_date_sk=[$9], d_year=[$10], d_qoy=[$11]): rowcount = 6.019767031014723E7, 
cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 io}, id = 1380
            HiveJoinRel(condition=[=($6, $0)], joinType=[inner]): rowcount = 
6.019767031014723E7, cumulative cost = {2.174286444150879E10 rows, 0.0 cpu, 0.0 
io}, id = 1378
              HiveJoinRel(condition=[=($1, $2)], joinType=[inner]): rowcount = 
7.064015632843196E7, cumulative cost = {1.2E8 rows, 0.0 cpu, 0.0 io}, id = 1309
                HiveProjectRel(c_customer_sk=[$0], c_current_addr_sk=[$4]): 
rowcount = 8.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1269
                  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer]]): rowcount 
= 8.0E7, cumulative cost = {0}, id = 1035
                HiveProjectRel(ca_address_sk=[$0], ca_county=[$7], 
ca_zip=[$9]): rowcount = 4.0E7, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 1273
                  
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.customer_address]]): 
rowcount = 4.0E7, cumulative cost = {0}, id = 1032
              HiveJoinRel(condition=[=($0, $7)], joinType=[inner]): rowcount = 
3.856185436785714E7, cumulative cost = {2.16336624308125E10 rows, 0.0 cpu, 0.0 
io}, id = 1376
                HiveJoinRel(condition=[=($3, $4)], joinType=[inner]): rowcount 
= 3.856185436785714E7, cumulative cost = {2.159463857644464E10 rows, 0.0 cpu, 
0.0 io}, id = 1316
                  HiveProjectRel(ws_item_sk=[$2], ws_bill_customer_sk=[$3], 
ws_sales_price=[$20], ws_sold_date_sk=[$33]): rowcount = 2.1594638446E10, 
cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1205
                    
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.web_sales]]): rowcount 
= 2.1594638446E10, cumulative cost = {0}, id = 1033
                  HiveProjectRel(d_date_sk=[$0], d_year=[$6], d_qoy=[$10]): 
rowcount = 130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 1279
                    HiveFilterRel(condition=[AND(=($10, 2), =($6, 2000))]): 
rowcount = 130.44464285714287, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, 
id = 1277
                      
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.date_dim]]): rowcount 
= 73049.0, cumulative cost = {0}, id = 1034
                HiveProjectRel(i_item_sk=[$0]): rowcount = 462000.0, cumulative 
cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 1265
                  HiveFilterRel(condition=[=(1, 1)]): rowcount = 462000.0, 
cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 1263
                    SemiJoinRel(condition=[=($1, $2)], joinType=[inner]): 
rowcount = 462000.0, cumulative cost = {1.0 rows, 1.0 cpu, 0.0 io}, id = 1261
                      HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): rowcount 
= 462000.0, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1253
                        
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 1024
                      HiveProjectRel(i_item_id=[$1]): rowcount = 
1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id = 1259
                        HiveProjectRel(i_item_sk=[$0], i_item_id=[$1]): 
rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 0.0 cpu, 0.0 io}, id 
= 1257
                          HiveFilterRel(condition=[in($0, 2, 3, 5, 7, 11, 13, 
17, 19, 23, 29)]): rowcount = 1.05119214745814, cumulative cost = {0.0 rows, 
0.0 cpu, 0.0 io}, id = 1255
                            
HiveTableScanRel(table=[[tpcds_bin_partitioned_orc_30000.item]]): rowcount = 
462000.0, cumulative cost = {0}, id = 1024
{code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

Reply via email to