Rajesh Balamohan created HIVE-26008:
---------------------------------------

             Summary: Dynamic partition pruning not sending right partitions 
with subqueries
                 Key: HIVE-26008
                 URL: https://issues.apache.org/jira/browse/HIVE-26008
             Project: Hive
          Issue Type: Improvement
          Components: HiveServer2
            Reporter: Rajesh Balamohan


DPP isn't working fine when there are subqueries involved. Here is an example 
query (q83). 

Note that "date_dim" has another query involved. Due to this, DPP operator ends 
up sending entire "date_dim" to the fact tables. 

Because of this, data scanned for fact tables are way higher and query runtime 
is increased.

For context, on a very small cluster, this query ran for 265 seconds and with 
the rewritten query it finished in 11 seconds!. Fact table scan was 10MB vs 10 
GB.

{noformat}
HiveJoin(condition=[=($2, $5)], joinType=[inner])
                HiveJoin(condition=[=($0, $3)], joinType=[inner])
                  HiveProject(cr_item_sk=[$1], cr_return_quantity=[$16], 
cr_returned_date_sk=[$26])
                    HiveFilter(condition=[AND(IS NOT NULL($26), IS NOT 
NULL($1))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
catalog_returns]], table:alias=[catalog_returns])
                  HiveProject(i_item_sk=[$0], i_item_id=[$1])
                    HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT 
NULL($0))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
item]], table:alias=[item])
                HiveProject(d_date_sk=[$0], d_date=[$2])
                  HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
              HiveProject(d_date=[$0])
                HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
                  HiveProject(d_date=[$2], d_week_seq=[$4])
                    HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT 
NULL($2))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
                  HiveProject(d_week_seq=[$4])
                    HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE, 
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
{noformat}

*Original Query & Plan: *

{noformat}
explain cbo with sr_items as
(select i_item_id item_id,
sum(sr_return_quantity) sr_item_qty
from store_returns,
item,
date_dim
where sr_item_sk = i_item_sk
and   d_date    in
(select d_date
from date_dim
where d_week_seq in
(select d_week_seq
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10')))
and   sr_returned_date_sk   = d_date_sk
group by i_item_id),
cr_items as
(select i_item_id item_id,
sum(cr_return_quantity) cr_item_qty
from catalog_returns,
item,
date_dim
where cr_item_sk = i_item_sk
and   d_date    in
(select d_date
from date_dim
where d_week_seq in
(select d_week_seq
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10')))
and   cr_returned_date_sk   = d_date_sk
group by i_item_id),
wr_items as
(select i_item_id item_id,
sum(wr_return_quantity) wr_item_qty
from web_returns,
item,
date_dim
where wr_item_sk = i_item_sk
and   d_date    in
(select d_date
from date_dim
where d_week_seq in
(select d_week_seq
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10')))
and   wr_returned_date_sk   = d_date_sk
group by i_item_id)
select  sr_items.item_id
,sr_item_qty
,sr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 sr_dev
,cr_item_qty
,cr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 cr_dev
,wr_item_qty
,wr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 wr_dev
,(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 average
from sr_items
,cr_items
,wr_items
where sr_items.item_id=cr_items.item_id
and sr_items.item_id=wr_items.item_id
order by sr_items.item_id
,sr_item_qty
limit 100
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20220307055109_88ad0cbd-bd40-45bc-92ae-ab15fa6b1da4); Time 
taken: 0.973 seconds
INFO  : OK
Explain
CBO PLAN:
HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
  HiveProject(item_id=[$0], sr_item_qty=[$4], sr_dev=[*(/(/($5, CAST(+(+($4, 
$1), $7)):DOUBLE), 3), 100)], cr_item_qty=[$1], cr_dev=[*(/(/($2, CAST(+(+($4, 
$1), $7)):DOUBLE), 3), 100)], wr_item_qty=[$7], wr_dev=[*(/(/($8, CAST(+(+($4, 
$1), $7)):DOUBLE), 3), 100)], average=[/(CAST(+(+($4, $1), $7)):DECIMAL(19, 0), 
3:DECIMAL(1, 0))])
    HiveJoin(condition=[=($0, $6)], joinType=[inner])
      HiveJoin(condition=[=($3, $0)], joinType=[inner])
        HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
          HiveAggregate(group=[{4}], agg#0=[sum($1)])
            HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
              HiveJoin(condition=[=($2, $5)], joinType=[inner])
                HiveJoin(condition=[=($0, $3)], joinType=[inner])
                  HiveProject(cr_item_sk=[$1], cr_return_quantity=[$16], 
cr_returned_date_sk=[$26])
                    HiveFilter(condition=[AND(IS NOT NULL($26), IS NOT 
NULL($1))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
catalog_returns]], table:alias=[catalog_returns])
                  HiveProject(i_item_sk=[$0], i_item_id=[$1])
                    HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT 
NULL($0))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
item]], table:alias=[item])
                HiveProject(d_date_sk=[$0], d_date=[$2])
                  HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
              HiveProject(d_date=[$0])
                HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
                  HiveProject(d_date=[$2], d_week_seq=[$4])
                    HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT 
NULL($2))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
                  HiveProject(d_week_seq=[$4])
                    HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE, 
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
        HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
          HiveAggregate(group=[{4}], agg#0=[sum($1)])
            HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
              HiveJoin(condition=[=($2, $5)], joinType=[inner])
                HiveJoin(condition=[=($0, $3)], joinType=[inner])
                  HiveProject(sr_item_sk=[$1], sr_return_quantity=[$9], 
sr_returned_date_sk=[$19])
                    HiveFilter(condition=[AND(IS NOT NULL($19), IS NOT 
NULL($1))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
store_returns]], table:alias=[store_returns])
                  HiveProject(i_item_sk=[$0], i_item_id=[$1])
                    HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT 
NULL($0))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
item]], table:alias=[item])
                HiveProject(d_date_sk=[$0], d_date=[$2])
                  HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
              HiveProject(d_date=[$0])
                HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
                  HiveProject(d_date=[$2], d_week_seq=[$4])
                    HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT 
NULL($4))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
                  HiveProject(d_week_seq=[$4])
                    HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE, 
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
      HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
        HiveAggregate(group=[{4}], agg#0=[sum($1)])
          HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
            HiveJoin(condition=[=($2, $5)], joinType=[inner])
              HiveJoin(condition=[=($0, $3)], joinType=[inner])
                HiveProject(wr_item_sk=[$1], wr_return_quantity=[$13], 
wr_returned_date_sk=[$23])
                  HiveFilter(condition=[AND(IS NOT NULL($23), IS NOT NULL($1))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
web_returns]], table:alias=[web_returns])
                HiveProject(i_item_sk=[$0], i_item_id=[$1])
                  HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT NULL($0))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
item]], table:alias=[item])
              HiveProject(d_date_sk=[$0], d_date=[$2])
                HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
                  HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
            HiveProject(d_date=[$0])
              HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
                HiveProject(d_date=[$2], d_week_seq=[$4])
                  HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
                HiveProject(d_week_seq=[$4])
                  HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE, 
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])

{noformat}

 

*Modified Query & Plan:*

{noformat}
explain cbo
with sr_items as
(select i_item_id item_id,
sum(sr_return_quantity) sr_item_qty
from store_returns,
item,
date_dim
where sr_item_sk = i_item_sk
and   d_date    in
(select d_date
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10'))
and   sr_returned_date_sk   = d_date_sk
group by i_item_id),
cr_items as
(select i_item_id item_id,
sum(cr_return_quantity) cr_item_qty
from catalog_returns,
item,
date_dim
where cr_item_sk = i_item_sk
and   d_date    in
(select d_date
from date_dim
where  d_date in ('1998-01-02','1998-10-15','1998-11-10'))
and   cr_returned_date_sk   = d_date_sk
group by i_item_id),
wr_items as
(select i_item_id item_id,
sum(wr_return_quantity) wr_item_qty
from web_returns,
item,
date_dim
where wr_item_sk = i_item_sk
and   d_date    in
(select d_date
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10'))
and   wr_returned_date_sk   = d_date_sk
group by i_item_id)
select  sr_items.item_id
,sr_item_qty
,sr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 sr_dev
,cr_item_qty
,cr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 cr_dev
,wr_item_qty
,wr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 wr_dev
,(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 average
from sr_items
,cr_items
,wr_items
where sr_items.item_id=cr_items.item_id
and sr_items.item_id=wr_items.item_id
order by sr_items.item_id
,sr_item_qty
limit 100
INFO  : Starting task [Stage-3:EXPLAIN] in serial mode
INFO  : Completed executing 
command(queryId=hive_20220307062043_2847c12d-9c22-452e-aa84-3200a3b9018b); Time 
taken: 0.827 seconds
INFO  : OK
Explain
CBO PLAN:
HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
  HiveProject(item_id=[$0], sr_item_qty=[$4], sr_dev=[*(/(/($5, CAST(+(+($4, 
$1), $7)):DOUBLE), 3), 100)], cr_item_qty=[$1], cr_dev=[*(/(/($2, CAST(+(+($4, 
$1), $7)):DOUBLE), 3), 100)], wr_item_qty=[$7], wr_dev=[*(/(/($8, CAST(+(+($4, 
$1), $7)):DOUBLE), 3), 100)], average=[/(CAST(+(+($4, $1), $7)):DECIMAL(19, 0), 
3:DECIMAL(1, 0))])
    HiveJoin(condition=[=($0, $6)], joinType=[inner])
      HiveJoin(condition=[=($3, $0)], joinType=[inner])
        HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
          HiveAggregate(group=[{4}], agg#0=[sum($1)])
            HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
              HiveJoin(condition=[=($2, $5)], joinType=[inner])
                HiveJoin(condition=[=($0, $3)], joinType=[inner])
                  HiveProject(cr_item_sk=[$1], cr_return_quantity=[$16], 
cr_returned_date_sk=[$26])
                    HiveFilter(condition=[AND(IS NOT NULL($26), IS NOT 
NULL($1))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
catalog_returns]], table:alias=[catalog_returns])
                  HiveProject(i_item_sk=[$0], i_item_id=[$1])
                    HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT 
NULL($0))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
item]], table:alias=[item])
                HiveProject(d_date_sk=[$0], d_date=[$2])
                  HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE, 
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($0))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
              HiveProject(d_date=[$2])
                HiveFilter(condition=[IN($2, 1998-01-02:DATE, 1998-10-15:DATE, 
1998-11-10:DATE)])
                  HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
        HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
          HiveAggregate(group=[{4}], agg#0=[sum($1)])
            HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
              HiveJoin(condition=[=($2, $5)], joinType=[inner])
                HiveJoin(condition=[=($0, $3)], joinType=[inner])
                  HiveProject(sr_item_sk=[$1], sr_return_quantity=[$9], 
sr_returned_date_sk=[$19])
                    HiveFilter(condition=[AND(IS NOT NULL($19), IS NOT 
NULL($1))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
store_returns]], table:alias=[store_returns])
                  HiveProject(i_item_sk=[$0], i_item_id=[$1])
                    HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT 
NULL($0))])
                      HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
item]], table:alias=[item])
                HiveProject(d_date_sk=[$0], d_date=[$2])
                  HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE, 
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($0))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
              HiveProject(d_date=[$2])
                HiveFilter(condition=[IN($2, 1998-01-02:DATE, 1998-10-15:DATE, 
1998-11-10:DATE)])
                  HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
      HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
        HiveAggregate(group=[{4}], agg#0=[sum($1)])
          HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
            HiveJoin(condition=[=($2, $5)], joinType=[inner])
              HiveJoin(condition=[=($0, $3)], joinType=[inner])
                HiveProject(wr_item_sk=[$1], wr_return_quantity=[$13], 
wr_returned_date_sk=[$23])
                  HiveFilter(condition=[AND(IS NOT NULL($23), IS NOT NULL($1))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
web_returns]], table:alias=[web_returns])
                HiveProject(i_item_sk=[$0], i_item_id=[$1])
                  HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT NULL($0))])
                    HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
item]], table:alias=[item])
              HiveProject(d_date_sk=[$0], d_date=[$2])
                HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE, 
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($0))])
                  HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])
            HiveProject(d_date=[$2])
              HiveFilter(condition=[IN($2, 1998-01-02:DATE, 1998-10-15:DATE, 
1998-11-10:DATE)])
                HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000, 
date_dim]], table:alias=[date_dim])


 {noformat}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to