Rajesh Balamohan created HIVE-26008:
---------------------------------------
Summary: Dynamic partition pruning not sending right partitions
with subqueries
Key: HIVE-26008
URL: https://issues.apache.org/jira/browse/HIVE-26008
Project: Hive
Issue Type: Improvement
Components: HiveServer2
Reporter: Rajesh Balamohan
DPP isn't working fine when there are subqueries involved. Here is an example
query (q83).
Note that "date_dim" has another query involved. Due to this, DPP operator ends
up sending entire "date_dim" to the fact tables.
Because of this, data scanned for fact tables are way higher and query runtime
is increased.
For context, on a very small cluster, this query ran for 265 seconds and with
the rewritten query it finished in 11 seconds!. Fact table scan was 10MB vs 10
GB.
{noformat}
HiveJoin(condition=[=($2, $5)], joinType=[inner])
HiveJoin(condition=[=($0, $3)], joinType=[inner])
HiveProject(cr_item_sk=[$1], cr_return_quantity=[$16],
cr_returned_date_sk=[$26])
HiveFilter(condition=[AND(IS NOT NULL($26), IS NOT
NULL($1))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
catalog_returns]], table:alias=[catalog_returns])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT
NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
item]], table:alias=[item])
HiveProject(d_date_sk=[$0], d_date=[$2])
HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT
NULL($2))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE,
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
{noformat}
*Original Query & Plan: *
{noformat}
explain cbo with sr_items as
(select i_item_id item_id,
sum(sr_return_quantity) sr_item_qty
from store_returns,
item,
date_dim
where sr_item_sk = i_item_sk
and d_date in
(select d_date
from date_dim
where d_week_seq in
(select d_week_seq
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10')))
and sr_returned_date_sk = d_date_sk
group by i_item_id),
cr_items as
(select i_item_id item_id,
sum(cr_return_quantity) cr_item_qty
from catalog_returns,
item,
date_dim
where cr_item_sk = i_item_sk
and d_date in
(select d_date
from date_dim
where d_week_seq in
(select d_week_seq
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10')))
and cr_returned_date_sk = d_date_sk
group by i_item_id),
wr_items as
(select i_item_id item_id,
sum(wr_return_quantity) wr_item_qty
from web_returns,
item,
date_dim
where wr_item_sk = i_item_sk
and d_date in
(select d_date
from date_dim
where d_week_seq in
(select d_week_seq
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10')))
and wr_returned_date_sk = d_date_sk
group by i_item_id)
select sr_items.item_id
,sr_item_qty
,sr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 sr_dev
,cr_item_qty
,cr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 cr_dev
,wr_item_qty
,wr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 wr_dev
,(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 average
from sr_items
,cr_items
,wr_items
where sr_items.item_id=cr_items.item_id
and sr_items.item_id=wr_items.item_id
order by sr_items.item_id
,sr_item_qty
limit 100
INFO : Starting task [Stage-3:EXPLAIN] in serial mode
INFO : Completed executing
command(queryId=hive_20220307055109_88ad0cbd-bd40-45bc-92ae-ab15fa6b1da4); Time
taken: 0.973 seconds
INFO : OK
Explain
CBO PLAN:
HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveProject(item_id=[$0], sr_item_qty=[$4], sr_dev=[*(/(/($5, CAST(+(+($4,
$1), $7)):DOUBLE), 3), 100)], cr_item_qty=[$1], cr_dev=[*(/(/($2, CAST(+(+($4,
$1), $7)):DOUBLE), 3), 100)], wr_item_qty=[$7], wr_dev=[*(/(/($8, CAST(+(+($4,
$1), $7)):DOUBLE), 3), 100)], average=[/(CAST(+(+($4, $1), $7)):DECIMAL(19, 0),
3:DECIMAL(1, 0))])
HiveJoin(condition=[=($0, $6)], joinType=[inner])
HiveJoin(condition=[=($3, $0)], joinType=[inner])
HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
HiveAggregate(group=[{4}], agg#0=[sum($1)])
HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
HiveJoin(condition=[=($2, $5)], joinType=[inner])
HiveJoin(condition=[=($0, $3)], joinType=[inner])
HiveProject(cr_item_sk=[$1], cr_return_quantity=[$16],
cr_returned_date_sk=[$26])
HiveFilter(condition=[AND(IS NOT NULL($26), IS NOT
NULL($1))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
catalog_returns]], table:alias=[catalog_returns])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT
NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
item]], table:alias=[item])
HiveProject(d_date_sk=[$0], d_date=[$2])
HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT
NULL($2))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE,
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
HiveAggregate(group=[{4}], agg#0=[sum($1)])
HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
HiveJoin(condition=[=($2, $5)], joinType=[inner])
HiveJoin(condition=[=($0, $3)], joinType=[inner])
HiveProject(sr_item_sk=[$1], sr_return_quantity=[$9],
sr_returned_date_sk=[$19])
HiveFilter(condition=[AND(IS NOT NULL($19), IS NOT
NULL($1))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
store_returns]], table:alias=[store_returns])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT
NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
item]], table:alias=[item])
HiveProject(d_date_sk=[$0], d_date=[$2])
HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT
NULL($4))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE,
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
HiveAggregate(group=[{4}], agg#0=[sum($1)])
HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
HiveJoin(condition=[=($2, $5)], joinType=[inner])
HiveJoin(condition=[=($0, $3)], joinType=[inner])
HiveProject(wr_item_sk=[$1], wr_return_quantity=[$13],
wr_returned_date_sk=[$23])
HiveFilter(condition=[AND(IS NOT NULL($23), IS NOT NULL($1))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
web_returns]], table:alias=[web_returns])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
item]], table:alias=[item])
HiveProject(d_date_sk=[$0], d_date=[$2])
HiveFilter(condition=[AND(IS NOT NULL($2), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$0])
HiveSemiJoin(condition=[=($1, $2)], joinType=[semi])
HiveProject(d_date=[$2], d_week_seq=[$4])
HiveFilter(condition=[AND(IS NOT NULL($4), IS NOT NULL($2))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_week_seq=[$4])
HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE,
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($4))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
{noformat}
*Modified Query & Plan:*
{noformat}
explain cbo
with sr_items as
(select i_item_id item_id,
sum(sr_return_quantity) sr_item_qty
from store_returns,
item,
date_dim
where sr_item_sk = i_item_sk
and d_date in
(select d_date
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10'))
and sr_returned_date_sk = d_date_sk
group by i_item_id),
cr_items as
(select i_item_id item_id,
sum(cr_return_quantity) cr_item_qty
from catalog_returns,
item,
date_dim
where cr_item_sk = i_item_sk
and d_date in
(select d_date
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10'))
and cr_returned_date_sk = d_date_sk
group by i_item_id),
wr_items as
(select i_item_id item_id,
sum(wr_return_quantity) wr_item_qty
from web_returns,
item,
date_dim
where wr_item_sk = i_item_sk
and d_date in
(select d_date
from date_dim
where d_date in ('1998-01-02','1998-10-15','1998-11-10'))
and wr_returned_date_sk = d_date_sk
group by i_item_id)
select sr_items.item_id
,sr_item_qty
,sr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 sr_dev
,cr_item_qty
,cr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 cr_dev
,wr_item_qty
,wr_item_qty/(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 * 100 wr_dev
,(sr_item_qty+cr_item_qty+wr_item_qty)/3.0 average
from sr_items
,cr_items
,wr_items
where sr_items.item_id=cr_items.item_id
and sr_items.item_id=wr_items.item_id
order by sr_items.item_id
,sr_item_qty
limit 100
INFO : Starting task [Stage-3:EXPLAIN] in serial mode
INFO : Completed executing
command(queryId=hive_20220307062043_2847c12d-9c22-452e-aa84-3200a3b9018b); Time
taken: 0.827 seconds
INFO : OK
Explain
CBO PLAN:
HiveSortLimit(sort0=[$0], sort1=[$1], dir0=[ASC], dir1=[ASC], fetch=[100])
HiveProject(item_id=[$0], sr_item_qty=[$4], sr_dev=[*(/(/($5, CAST(+(+($4,
$1), $7)):DOUBLE), 3), 100)], cr_item_qty=[$1], cr_dev=[*(/(/($2, CAST(+(+($4,
$1), $7)):DOUBLE), 3), 100)], wr_item_qty=[$7], wr_dev=[*(/(/($8, CAST(+(+($4,
$1), $7)):DOUBLE), 3), 100)], average=[/(CAST(+(+($4, $1), $7)):DECIMAL(19, 0),
3:DECIMAL(1, 0))])
HiveJoin(condition=[=($0, $6)], joinType=[inner])
HiveJoin(condition=[=($3, $0)], joinType=[inner])
HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
HiveAggregate(group=[{4}], agg#0=[sum($1)])
HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
HiveJoin(condition=[=($2, $5)], joinType=[inner])
HiveJoin(condition=[=($0, $3)], joinType=[inner])
HiveProject(cr_item_sk=[$1], cr_return_quantity=[$16],
cr_returned_date_sk=[$26])
HiveFilter(condition=[AND(IS NOT NULL($26), IS NOT
NULL($1))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
catalog_returns]], table:alias=[catalog_returns])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT
NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
item]], table:alias=[item])
HiveProject(d_date_sk=[$0], d_date=[$2])
HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE,
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$2])
HiveFilter(condition=[IN($2, 1998-01-02:DATE, 1998-10-15:DATE,
1998-11-10:DATE)])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
HiveAggregate(group=[{4}], agg#0=[sum($1)])
HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
HiveJoin(condition=[=($2, $5)], joinType=[inner])
HiveJoin(condition=[=($0, $3)], joinType=[inner])
HiveProject(sr_item_sk=[$1], sr_return_quantity=[$9],
sr_returned_date_sk=[$19])
HiveFilter(condition=[AND(IS NOT NULL($19), IS NOT
NULL($1))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
store_returns]], table:alias=[store_returns])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT
NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
item]], table:alias=[item])
HiveProject(d_date_sk=[$0], d_date=[$2])
HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE,
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$2])
HiveFilter(condition=[IN($2, 1998-01-02:DATE, 1998-10-15:DATE,
1998-11-10:DATE)])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject($f0=[$0], $f1=[$1], EXPR$0=[CAST($1):DOUBLE])
HiveAggregate(group=[{4}], agg#0=[sum($1)])
HiveSemiJoin(condition=[=($6, $7)], joinType=[semi])
HiveJoin(condition=[=($2, $5)], joinType=[inner])
HiveJoin(condition=[=($0, $3)], joinType=[inner])
HiveProject(wr_item_sk=[$1], wr_return_quantity=[$13],
wr_returned_date_sk=[$23])
HiveFilter(condition=[AND(IS NOT NULL($23), IS NOT NULL($1))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
web_returns]], table:alias=[web_returns])
HiveProject(i_item_sk=[$0], i_item_id=[$1])
HiveFilter(condition=[AND(IS NOT NULL($1), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
item]], table:alias=[item])
HiveProject(d_date_sk=[$0], d_date=[$2])
HiveFilter(condition=[AND(IN($2, 1998-01-02:DATE,
1998-10-15:DATE, 1998-11-10:DATE), IS NOT NULL($0))])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
HiveProject(d_date=[$2])
HiveFilter(condition=[IN($2, 1998-01-02:DATE, 1998-10-15:DATE,
1998-11-10:DATE)])
HiveTableScan(table=[[tpcds_bin_partitioned_orc_10000,
date_dim]], table:alias=[date_dim])
{noformat}
--
This message was sent by Atlassian Jira
(v8.20.1#820001)