HIVE-17216: Additional qtests for HoS DPP (Sahil Takiar, reviewed by Sergio Pena)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/642acdf7 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/642acdf7 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/642acdf7 Branch: refs/heads/hive-14535 Commit: 642acdf76a46d7bb5bd036510d5bcdaa85e9f613 Parents: 3573549 Author: Sahil Takiar <takiar.sa...@gmail.com> Authored: Thu Aug 31 09:05:16 2017 -0700 Committer: Sahil Takiar <stak...@cloudera.com> Committed: Thu Aug 31 09:06:10 2017 -0700 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 1 + .../clientpositive/spark_constprog_dpp.q | 17 + .../spark_dynamic_partition_pruning.q | 40 +- .../spark_dynamic_partition_pruning_3.q | 236 ++- .../spark/spark_constprog_dpp.q.out | 138 ++ .../spark/spark_dynamic_partition_pruning.q.out | 351 ++-- .../spark_dynamic_partition_pruning_3.q.out | 1781 ++++++++++++++++-- 7 files changed, 2272 insertions(+), 292 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/642acdf7/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index fa6a2aa..6f2efa7 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -1403,6 +1403,7 @@ miniSparkOnYarn.only.query.files=spark_combine_equivalent_work.q,\ spark_dynamic_partition_pruning_2.q,\ spark_dynamic_partition_pruning_3.q,\ spark_dynamic_partition_pruning_mapjoin_only.q,\ + spark_constprog_dpp.q,\ dynamic_rdd_cache.q, \ spark_multi_insert_parallel_orderby.q,\ spark_explainuser_1.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/642acdf7/ql/src/test/queries/clientpositive/spark_constprog_dpp.q 
---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/spark_constprog_dpp.q b/ql/src/test/queries/clientpositive/spark_constprog_dpp.q new file mode 100644 index 0000000..cbe6aa2 --- /dev/null +++ b/ql/src/test/queries/clientpositive/spark_constprog_dpp.q @@ -0,0 +1,17 @@ +set hive.mapred.mode=nonstrict; +set hive.optimize.constant.propagation=true; +set hive.spark.dynamic.partition.pruning=true; + +drop table if exists tb1; +create table tb1 (id int); + +drop table if exists tb2; +create table tb2 (id smallint); + +explain +select a.id from tb1 a +left outer join +(select id from tb2 +union all +select 2 as id from tb2 limit 1) b +on a.id=b.id; http://git-wip-us.apache.org/repos/asf/hive/blob/642acdf7/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning.q b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning.q index 78e7515..71a7399 100644 --- a/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning.q +++ b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning.q @@ -16,7 +16,7 @@ create table srcpart_hour as select hr as hr, hr as hour from srcpart group by h create table srcpart_date_hour as select ds as ds, ds as `date`, hr as hr, hr as hour from srcpart group by ds, hr; create table srcpart_double_hour as select (hr*2) as hr, hr as hour from srcpart group by hr; --- single column, single key +-- single column, single key -- join a partitioned table to a non-partitioned table, static filter on the non-partitioned table EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = '2008-04-08'; select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = '2008-04-08'; set 
hive.spark.dynamic.partition.pruning=false; @@ -41,8 +41,8 @@ select count(*) from srcpart join srcpart_date on abs(negative(cast(concat(cast( EXPLAIN select count(*) from srcpart join srcpart_date on cast(day(srcpart.ds) as smallint) = cast(day(srcpart_date.ds) as decimal) where srcpart_date.`date` = '2008-04-08'; select count(*) from srcpart join srcpart_date on cast(day(srcpart.ds) as smallint) = cast(day(srcpart_date.ds) as decimal) where srcpart_date.`date` = '2008-04-08'; --- multiple sources, single key -EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) +-- multiple sources, single key -- filter partitioned table on both partitioned columns via join with non-partitioned table +EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) where srcpart_date.`date` = '2008-04-08' and srcpart_hour.hour = 11; select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) where srcpart_date.`date` = '2008-04-08' and srcpart_hour.hour = 11; @@ -54,7 +54,7 @@ where srcpart_date.`date` = '2008-04-08' and srcpart_hour.hour = 11; set hive.spark.dynamic.partition.pruning=true; select count(*) from srcpart where hr = 11 and ds = '2008-04-08'; --- multiple columns single source +-- multiple columns single source -- filter partitioned table on both partitioned columns via join with non-partitioned table, filter non-partitioned table EXPLAIN select count(*) from srcpart join srcpart_date_hour on (srcpart.ds = srcpart_date_hour.ds and srcpart.hr = srcpart_date_hour.hr) where srcpart_date_hour.`date` = '2008-04-08' and srcpart_date_hour.hour = 11; select count(*) from srcpart join srcpart_date_hour on (srcpart.ds = srcpart_date_hour.ds and srcpart.hr = srcpart_date_hour.hr) where srcpart_date_hour.`date` = '2008-04-08' and 
srcpart_date_hour.hour = 11; set hive.spark.dynamic.partition.pruning=false; @@ -63,7 +63,7 @@ select count(*) from srcpart join srcpart_date_hour on (srcpart.ds = srcpart_dat set hive.spark.dynamic.partition.pruning=true; select count(*) from srcpart where ds = '2008-04-08' and hr = 11; --- empty set +-- empty set -- join a partitioned table to a non-partitioned table, static filter on the non-partitioned table that doesn't match anything EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = 'I DONT EXIST'; select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = 'I DONT EXIST'; set hive.spark.dynamic.partition.pruning=false; @@ -72,7 +72,7 @@ select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) set hive.spark.dynamic.partition.pruning=true; select count(*) from srcpart where ds = 'I DONT EXIST'; --- expressions +-- expressions -- triggers DPP with various expressions - e.g. 
cast, multiplication, division EXPLAIN select count(*) from srcpart join srcpart_double_hour on (srcpart.hr = cast(srcpart_double_hour.hr/2 as int)) where srcpart_double_hour.hour = 11; select count(*) from srcpart join srcpart_double_hour on (srcpart.hr = cast(srcpart_double_hour.hr/2 as int)) where srcpart_double_hour.hour = 11; EXPLAIN select count(*) from srcpart join srcpart_double_hour on (srcpart.hr*2 = srcpart_double_hour.hr) where srcpart_double_hour.hour = 11; @@ -89,8 +89,7 @@ select count(*) from srcpart join srcpart_double_hour on (cast(srcpart.hr*2 as s set hive.spark.dynamic.partition.pruning=true; select count(*) from srcpart where cast(hr as string) = 11; - --- parent is reduce tasks +-- parent is reduce tasks -- join a partitioned table to a non-partitioned table, where the non-partitioned table is a subquery, static filter on the subquery EXPLAIN select count(*) from srcpart join (select ds as ds, ds as `date` from srcpart group by ds) s on (srcpart.ds = s.ds) where s.`date` = '2008-04-08'; select count(*) from srcpart join (select ds as ds, ds as `date` from srcpart group by ds) s on (srcpart.ds = s.ds) where s.`date` = '2008-04-08'; select count(*) from srcpart where ds = '2008-04-08'; @@ -155,8 +154,7 @@ select count(*) from srcpart where ds = '2008-04-08' and hr = 11; -- empty set EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = 'I DONT EXIST'; --- Disabled until TEZ-1486 is fixed --- select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = 'I DONT EXIST'; +select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = 'I DONT EXIST'; -- expressions EXPLAIN select count(*) from srcpart join srcpart_double_hour on (srcpart.hr = cast(srcpart_double_hour.hr/2 as int)) where srcpart_double_hour.hour = 11; @@ -170,10 +168,6 @@ EXPLAIN select count(*) from srcpart join (select ds as 
ds, ds as `date` from sr select count(*) from srcpart join (select ds as ds, ds as `date` from srcpart group by ds) s on (srcpart.ds = s.ds) where s.`date` = '2008-04-08'; select count(*) from srcpart where ds = '2008-04-08'; --- single column, single key, udf with typechange -EXPLAIN select count(*) from srcpart join srcpart_date on (day(srcpart.ds) = day(srcpart_date.ds)) where srcpart_date.`date` = '2008-04-08'; -select count(*) from srcpart join srcpart_date on (day(srcpart.ds) = day(srcpart_date.ds)) where srcpart_date.`date` = '2008-04-08'; - -- left join EXPLAIN select count(*) from srcpart left join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = '2008-04-08'; EXPLAIN select count(*) from srcpart_date left join srcpart on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = '2008-04-08'; @@ -188,16 +182,26 @@ select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = '2008-04-08' and srcpart_hour.hour = 11 and srcpart.hr = 11; EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) where srcpart_date.`date` = '2008-04-08' and srcpart.hr = 13; --- Disabled until TEZ-1486 is fixed --- select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) --- where srcpart_date.`date` = '2008-04-08' and srcpart.hr = 13; +select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) +where srcpart_date.`date` = '2008-04-08' and srcpart.hr = 13; -- union + subquery EXPLAIN select distinct(ds) from srcpart where srcpart.ds in (select max(srcpart.ds) from srcpart union all select min(srcpart.ds) from srcpart); select distinct(ds) from srcpart where srcpart.ds in (select max(srcpart.ds) from srcpart union all select min(srcpart.ds) from srcpart); +-- different file 
format +create table srcpart_parquet (key int, value string) partitioned by (ds string, hr int) stored as parquet; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.vectorized.execution.enabled=false; +set hive.exec.max.dynamic.partitions=1000; + +insert into table srcpart_parquet partition (ds, hr) select key, value, ds, hr from srcpart; +EXPLAIN select count(*) from srcpart_parquet join srcpart_date_hour on (srcpart_parquet.ds = srcpart_date_hour.ds and srcpart_parquet.hr = srcpart_date_hour.hr) where srcpart_date_hour.hour = 11 and (srcpart_date_hour.`date` = '2008-04-08' or srcpart_date_hour.`date` = '2008-04-09'); +select count(*) from srcpart_parquet join srcpart_date_hour on (srcpart_parquet.ds = srcpart_date_hour.ds and srcpart_parquet.hr = srcpart_date_hour.hr) where srcpart_date_hour.hour = 11 and (srcpart_date_hour.`date` = '2008-04-08' or srcpart_date_hour.`date` = '2008-04-09'); +select count(*) from srcpart where (ds = '2008-04-08' or ds = '2008-04-09') and hr = 11; -drop table srcpart_orc; +drop table srcpart_parquet; drop table srcpart_date; drop table srcpart_hour; drop table srcpart_date_hour; http://git-wip-us.apache.org/repos/asf/hive/blob/642acdf7/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q index ac4ff93..8863cf4 100644 --- a/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q +++ b/ql/src/test/queries/clientpositive/spark_dynamic_partition_pruning_3.q @@ -1,16 +1,228 @@ -set hive.spark.dynamic.partition.pruning=true; -set hive.auto.convert.join=true; +CREATE TABLE partitioned_table1 (col int) PARTITIONED BY (part_col int); +CREATE TABLE partitioned_table2 (col int) PARTITIONED BY (part_col int); +CREATE TABLE partitioned_table3 (col int) PARTITIONED BY (part_col 
int); +CREATE TABLE partitioned_table4 (col int) PARTITIONED BY (part_col1 int, part_col2 int); +CREATE TABLE partitioned_table5 (col int) PARTITIONED BY (part_col1 int, part_col2 int); -create table partitioned_table1 (col int) partitioned by (part_col int); -create table partitioned_table2 (col int) partitioned by (part_col int); -create table regular_table (col int); -insert into table regular_table values (1); +CREATE TABLE regular_table1 (col1 int, col2 int); +CREATE TABLE regular_table2 (col1 int, col2 int); -alter table partitioned_table1 add partition (part_col = 1); -insert into table partitioned_table1 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10); +ALTER TABLE partitioned_table1 ADD PARTITION (part_col = 1); +ALTER TABLE partitioned_table1 ADD PARTITION (part_col = 2); +ALTER TABLE partitioned_table1 ADD PARTITION (part_col = 3); -alter table partitioned_table2 add partition (part_col = 1); -insert into table partitioned_table2 partition (part_col = 1) values (1), (2), (3), (4), (5), (6), (7), (8), (9), (10); +ALTER TABLE partitioned_table2 ADD PARTITION (part_col = 1); +ALTER TABLE partitioned_table2 ADD PARTITION (part_col = 2); +ALTER TABLE partitioned_table2 ADD PARTITION (part_col = 3); -explain select * from partitioned_table1, partitioned_table2 where partitioned_table1.part_col = partitioned_table2.part_col; -explain select * from partitioned_table1 where partitioned_table1.part_col in (select regular_table.col from regular_table join partitioned_table2 on regular_table.col = partitioned_table2.part_col); \ No newline at end of file +ALTER TABLE partitioned_table3 ADD PARTITION (part_col = 1); +ALTER TABLE partitioned_table3 ADD PARTITION (part_col = 2); +ALTER TABLE partitioned_table3 ADD PARTITION (part_col = 3); + +ALTER TABLE partitioned_table4 ADD PARTITION (part_col1 = 1, part_col2 = 1); +ALTER TABLE partitioned_table4 ADD PARTITION (part_col1 = 2, part_col2 = 2); +ALTER TABLE partitioned_table4 ADD 
PARTITION (part_col1 = 3, part_col2 = 3); + +ALTER TABLE partitioned_table5 ADD PARTITION (part_col1 = 1, part_col2 = 1); +ALTER TABLE partitioned_table5 ADD PARTITION (part_col1 = 2, part_col2 = 2); +ALTER TABLE partitioned_table5 ADD PARTITION (part_col1 = 3, part_col2 = 3); + +INSERT INTO TABLE regular_table1 VALUES (0, 0), (1, 1), (2, 2); +INSERT INTO TABLE regular_table2 VALUES (0, 0), (1, 1), (2, 2); + +INSERT INTO TABLE partitioned_table1 PARTITION (part_col = 1) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table1 PARTITION (part_col = 2) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table1 PARTITION (part_col = 3) VALUES (1), (2), (3), (4), (5), (6); + +INSERT INTO TABLE partitioned_table2 PARTITION (part_col = 1) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table2 PARTITION (part_col = 2) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table2 PARTITION (part_col = 3) VALUES (1), (2), (3), (4), (5), (6); + +INSERT INTO TABLE partitioned_table3 PARTITION (part_col = 1) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table3 PARTITION (part_col = 2) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table3 PARTITION (part_col = 3) VALUES (1), (2), (3), (4), (5), (6); + +INSERT INTO TABLE partitioned_table4 PARTITION (part_col1 = 1, part_col2 = 1) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table4 PARTITION (part_col1 = 2, part_col2 = 2) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table4 PARTITION (part_col1 = 3, part_col2 = 3) VALUES (1), (2), (3), (4), (5), (6); + +INSERT INTO TABLE partitioned_table5 PARTITION (part_col1 = 1, part_col2 = 1) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table5 PARTITION (part_col1 = 2, part_col2 = 2) VALUES (1), (2), (3), (4), (5), (6); +INSERT INTO TABLE partitioned_table5 PARTITION (part_col1 = 3, part_col2 = 3) VALUES (1), (2), 
(3), (4), (5), (6); + +SET hive.spark.dynamic.partition.pruning.map.join.only=true; +SET hive.strict.checks.cartesian.product=false; +SET hive.auto.convert.join=true; +SET hive.auto.convert.join.noconditionaltask.size=15; -- ensure the partitioned tables are treated as big tables + +-- left semi join where the subquery is a join between a partitioned and a non-partitioned table +EXPLAIN SELECT count(*) FROM partitioned_table1 WHERE partitioned_table1.part_col IN ( +SELECT regular_table1.col1 FROM regular_table1 JOIN partitioned_table2 ON +regular_table1.col1 = partitioned_table2.part_col AND partitioned_table2.col > 3 AND regular_table1.col1 > 1); + +SELECT count(*) +FROM partitioned_table1 +WHERE partitioned_table1.part_col IN (SELECT regular_table1.col1 + FROM regular_table1 + JOIN partitioned_table2 + ON + regular_table1.col1 = partitioned_table2.part_col AND partitioned_table2.col > 3 AND regular_table1.col1 > 1); + +-- three-table join with dpp against one partitioned table +EXPLAIN SELECT count(*) FROM partitioned_table1, regular_table1 rt1, +regular_table1 rt2 WHERE rt1.col1 = partitioned_table1.part_col AND rt2.col1 = +partitioned_table1.part_col AND rt1.col2 > 0 AND rt2.col2 > 1; + +SELECT count(*) +FROM partitioned_table1, + regular_table1 rt1, + regular_table1 rt2 +WHERE rt1.col1 = partitioned_table1.part_col + AND rt2.col1 = partitioned_table1.part_col + AND rt1.col2 > 0 + AND rt2.col2 > 1; + +-- four-table join with dpp against two separate partitioned tables +EXPLAIN SELECT count(*) FROM partitioned_table1, partitioned_table2, regular_table1 rt1 +, regular_table1 rt2 WHERE rt1.col1 = partitioned_table1.part_col AND rt2.col1 = +partitioned_table2.part_col AND rt1.col2 > 0 AND rt2.col2 > 1; + +SELECT count(*) +FROM partitioned_table1, + partitioned_table2, + regular_table1 rt1, + regular_table1 rt2 +WHERE rt1.col1 = partitioned_table1.part_col + AND rt2.col1 = partitioned_table2.part_col + AND rt1.col2 > 0 + AND rt2.col2 > 1; + +-- dpp with between 
filter +EXPLAIN SELECT count(*) FROM regular_table1, partitioned_table1 WHERE regular_table1.col1 += partitioned_table1.part_col AND regular_table1.col2 BETWEEN 1 AND 3; + +SELECT count(*) +FROM regular_table1, + partitioned_table1 +WHERE regular_table1.col1 = partitioned_table1.part_col + AND regular_table1.col2 BETWEEN 1 AND 3; + +-- dpp with cte +EXPLAIN WITH q1 AS (SELECT regular_table1.col1 AS col FROM regular_table1 WHERE +regular_table1.col2 > 1), q2 AS (SELECT partitioned_table1.part_col AS col FROM +partitioned_table1 WHERE partitioned_table1.col > 1) SELECT count(*) FROM q1 JOIN q2 ON +q1.col = q2.col; + +WITH q1 + AS (SELECT regular_table1.col1 AS col + FROM regular_table1 + WHERE regular_table1.col2 > 1), + q2 + AS (SELECT partitioned_table1.part_col AS col + FROM partitioned_table1 + WHERE partitioned_table1.col > 1) +SELECT count(*) +FROM q1 + JOIN q2 + ON q1.col = q2.col; + +-- join two partitioned tables with a filter +EXPLAIN SELECT count(*) FROM partitioned_table1, partitioned_table2 WHERE +partitioned_table1.part_col = partitioned_table2.part_col AND partitioned_table2.col > 1; + +SELECT count(*) +FROM partitioned_table1, + partitioned_table2 +WHERE partitioned_table1.part_col = partitioned_table2.part_col + AND partitioned_table2.col > 1; + +-- dpp between two partitioned tables, both with multiple partition columns +SET hive.auto.convert.join.noconditionaltask.size=150; -- set auto convert size to a higher value so map-joins are triggered for the partitioned tables + +EXPLAIN SELECT count(*) FROM partitioned_table4, partitioned_table5 WHERE +partitioned_table4.part_col1 = partitioned_table5.part_col1 AND +partitioned_table4.part_col2 = partitioned_table5.part_col2; + +SELECT count(*) +FROM partitioned_table4, + partitioned_table5 +WHERE partitioned_table4.part_col1 = partitioned_table5.part_col1 + AND partitioned_table4.part_col2 = partitioned_table5.part_col2; + +-- dpp is pushed through multiple levels of joins +EXPLAIN SELECT count(*) FROM 
partitioned_table1 JOIN regular_table1 ON +partitioned_table1.part_col = regular_table1.col1 JOIN regular_table2 ON +regular_table1.col1 = regular_table2.col1; + +SELECT count(*) +FROM partitioned_table1 + JOIN regular_table1 + ON partitioned_table1.part_col = regular_table1.col1 + JOIN regular_table2 + ON regular_table1.col1 = regular_table2.col1; + +SET hive.auto.convert.join.noconditionaltask.size=15; -- reset auto convert size to previous value + +-- three-way join where the partitioned table is the smallest table +-- disabled until HIVE-17225 is fixed +-- EXPLAIN SELECT * FROM partitioned_table1, regular_table1 rt1, regular_table2 rt2 +-- WHERE rt1.col1 = partitioned_table1.part_col AND rt2.col1 = +-- partitioned_table1.part_col AND partitioned_table1.col > 3; + +-- SELECT * +-- FROM partitioned_table1, +-- regular_table1 rt1, +-- regular_table2 rt2 +-- WHERE rt1.col1 = partitioned_table1.part_col +-- AND rt2.col1 = partitioned_table1.part_col AND partitioned_table1.col > 3; + +-- dpp is pushed to partition columns that are added to each other +-- disabled until HIVE-17244 is fixed +-- EXPLAIN SELECT count(*) FROM partitioned_table4 pt4 JOIN regular_table1 rt1 +-- ON pt4.part_col1 + pt4.part_col2 = rt1.col1 + 1; + +-- SELECT count(*) +-- FROM partitioned_table4 pt4 +-- JOIN regular_table1 rt1 +-- ON pt4.part_col1 + pt4.part_col2 = rt1.col1 + 1; + +-- dpp pushed to all union operands +-- disabled until HIVE-17239 is fixed +-- EXPLAIN SELECT count(*) FROM (SELECT part_col FROM partitioned_table1 UNION ALL SELECT +-- part_col FROM partitioned_table2) q1 JOIN regular_table1 JOIN regular_table2 +-- WHERE q1.part_col = regular_table1.col1 AND q1.part_col = regular_table2.col1; + +-- SELECT count(*) +-- FROM ( +-- SELECT part_col +-- FROM partitioned_table1 +-- UNION ALL +-- SELECT part_col +-- FROM partitioned_table2) q1 +-- JOIN regular_table1 +-- JOIN regular_table2 +-- where q1.part_col = regular_table1.col1 +-- AND q1.part_col = regular_table2.col1; + +-- left 
semi join where the subquery is a join between a partitioned and a partitioned table +-- disabled until HIVE-17238 is fixed +-- EXPLAIN SELECT count(*) FROM partitioned_table1 WHERE partitioned_table1.part_col IN ( +-- SELECT partitioned_table2.col FROM partitioned_table2 JOIN partitioned_table3 ON +-- partitioned_table3.col = partitioned_table2.part_col AND partitioned_table2.col > 1); + +-- SELECT count(*) +-- FROM partitioned_table1 +-- WHERE partitioned_table1.part_col IN (SELECT partitioned_table2.col +-- FROM partitioned_table2 +-- JOIN partitioned_table3 +-- ON +-- partitioned_table3.col = partitioned_table2.part_col +-- AND partitioned_table2.col > 1); + +DROP TABLE partitioned_table1; +DROP TABLE partitioned_table2; +DROP TABLE partitioned_table3; +DROP TABLE partitioned_table4; +DROP TABLE partitioned_table5; +DROP TABLE regular_table1; +DROP TABLE regular_table2; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/642acdf7/ql/src/test/results/clientpositive/spark/spark_constprog_dpp.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/spark_constprog_dpp.q.out b/ql/src/test/results/clientpositive/spark/spark_constprog_dpp.q.out new file mode 100644 index 0000000..79bbbdf --- /dev/null +++ b/ql/src/test/results/clientpositive/spark/spark_constprog_dpp.q.out @@ -0,0 +1,138 @@ +PREHOOK: query: drop table if exists tb1 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists tb1 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table tb1 (id int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tb1 +POSTHOOK: query: create table tb1 (id int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tb1 +PREHOOK: query: drop table if exists tb2 +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists tb2 +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table tb2 (id 
smallint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@tb2 +POSTHOOK: query: create table tb2 (id smallint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@tb2 +PREHOOK: query: explain +select a.id from tb1 a +left outer join +(select id from tb2 +union all +select 2 as id from tb2 limit 1) b +on a.id=b.id +PREHOOK: type: QUERY +POSTHOOK: query: explain +select a.id from tb1 a +left outer join +(select id from tb2 +union all +select 2 as id from tb2 limit 1) b +on a.id=b.id +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (PARTITION-LEVEL SORT, 4), Reducer 4 (PARTITION-LEVEL SORT, 4) + Reducer 4 <- Map 3 (GROUP, 1), Map 5 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: a + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: id (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Map 3 + Map Operator Tree: + TableScan + alias: tb2 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Select Operator + expressions: UDFToInteger(id) (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + value expressions: _col0 (type: 
int) + Map 5 + Map Operator Tree: + TableScan + alias: tb2 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Select Operator + expressions: 2 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + TopN Hash Memory Usage: 0.1 + value expressions: _col0 (type: int) + Reducer 2 + Reduce Operator Tree: + Join Operator + condition map: + Left Outer Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Limit + Number of rows: 1 + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 1 Data size: 0 Basic stats: PARTIAL Column stats: COMPLETE + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + http://git-wip-us.apache.org/repos/asf/hive/blob/642acdf7/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out ---------------------------------------------------------------------- 
diff --git a/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out index 63a9548..6c21f1a 100644 --- a/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out +++ b/ql/src/test/results/clientpositive/spark/spark_dynamic_partition_pruning.q.out @@ -976,10 +976,10 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 POSTHOOK: Input: default@srcpart_date #### A masked pattern was here #### 1000 -PREHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) +PREHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) where srcpart_date.`date` = '2008-04-08' and srcpart_hour.hour = 11 PREHOOK: type: QUERY -POSTHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) +POSTHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) where srcpart_date.`date` = '2008-04-08' and srcpart_hour.hour = 11 POSTHOOK: type: QUERY STAGE DEPENDENCIES: @@ -5107,6 +5107,25 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = 'I DONT EXIST' +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +PREHOOK: Input: default@srcpart_date +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart join srcpart_date on (srcpart.ds = 
srcpart_date.ds) where srcpart_date.`date` = 'I DONT EXIST' +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: Input: default@srcpart_date +#### A masked pattern was here #### +0 PREHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_double_hour on (srcpart.hr = cast(srcpart_double_hour.hr/2 as int)) where srcpart_double_hour.hour = 11 PREHOOK: type: QUERY POSTHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_double_hour on (srcpart.hr = cast(srcpart_double_hour.hr/2 as int)) where srcpart_double_hour.hour = 11 @@ -5493,130 +5512,6 @@ POSTHOOK: type: QUERY POSTHOOK: Input: default@srcpart #### A masked pattern was here #### 1000 -PREHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_date on (day(srcpart.ds) = day(srcpart_date.ds)) where srcpart_date.`date` = '2008-04-08' -PREHOOK: type: QUERY -POSTHOOK: query: EXPLAIN select count(*) from srcpart join srcpart_date on (day(srcpart.ds) = day(srcpart_date.ds)) where srcpart_date.`date` = '2008-04-08' -POSTHOOK: type: QUERY -STAGE DEPENDENCIES: - Stage-2 is a root stage - Stage-1 depends on stages: Stage-2 - Stage-0 depends on stages: Stage-1 - -STAGE PLANS: - Stage: Stage-2 - Spark -#### A masked pattern was here #### - Vertices: - Map 3 - Map Operator Tree: - TableScan - alias: srcpart_date - filterExpr: ((date = '2008-04-08') and ds is not null) (type: boolean) - Statistics: Num rows: 2 Data size: 42 Basic stats: COMPLETE Column stats: NONE - Filter Operator - predicate: ((date = '2008-04-08') and ds is not null) (type: boolean) - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: ds (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column 
stats: NONE - Spark HashTable Sink Operator - keys: - 0 day(_col0) (type: int) - 1 day(_col0) (type: int) - Select Operator - expressions: day(_col0) (type: int) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Group By Operator - keys: _col0 (type: int) - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - Spark Partition Pruning Sink Operator - Target column: ds (string) - partition key expr: day(ds) - Statistics: Num rows: 1 Data size: 21 Basic stats: COMPLETE Column stats: NONE - target work: Map 1 - Local Work: - Map Reduce Local Work - - Stage: Stage-1 - Spark - Edges: - Reducer 2 <- Map 1 (GROUP, 1) -#### A masked pattern was here #### - Vertices: - Map 1 - Map Operator Tree: - TableScan - alias: srcpart - filterExpr: ds is not null (type: boolean) - Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE - Select Operator - expressions: ds (type: string) - outputColumnNames: _col0 - Statistics: Num rows: 2000 Data size: 21248 Basic stats: COMPLETE Column stats: NONE - Map Join Operator - condition map: - Inner Join 0 to 1 - keys: - 0 day(_col0) (type: int) - 1 day(_col0) (type: int) - input vertices: - 1 Map 3 - Statistics: Num rows: 2200 Data size: 23372 Basic stats: COMPLETE Column stats: NONE - Group By Operator - aggregations: count() - mode: hash - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - Reduce Output Operator - sort order: - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - value expressions: _col0 (type: bigint) - Local Work: - Map Reduce Local Work - Reducer 2 - Reduce Operator Tree: - Group By Operator - aggregations: count(VALUE._col0) - mode: mergepartial - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - File Output Operator - 
compressed: false - Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE - table: - input format: org.apache.hadoop.mapred.SequenceFileInputFormat - output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat - serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe - - Stage: Stage-0 - Fetch Operator - limit: -1 - Processor Tree: - ListSink - -PREHOOK: query: select count(*) from srcpart join srcpart_date on (day(srcpart.ds) = day(srcpart_date.ds)) where srcpart_date.`date` = '2008-04-08' -PREHOOK: type: QUERY -PREHOOK: Input: default@srcpart -PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 -PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 -PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 -PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 -PREHOOK: Input: default@srcpart_date -#### A masked pattern was here #### -POSTHOOK: query: select count(*) from srcpart join srcpart_date on (day(srcpart.ds) = day(srcpart_date.ds)) where srcpart_date.`date` = '2008-04-08' -POSTHOOK: type: QUERY -POSTHOOK: Input: default@srcpart -POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 -POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 -POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 -POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 -POSTHOOK: Input: default@srcpart_date -#### A masked pattern was here #### -1000 PREHOOK: query: EXPLAIN select count(*) from srcpart left join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = '2008-04-08' PREHOOK: type: QUERY POSTHOOK: query: EXPLAIN select count(*) from srcpart left join srcpart_date on (srcpart.ds = srcpart_date.ds) where srcpart_date.`date` = '2008-04-08' @@ -6201,6 +6096,21 @@ STAGE PLANS: Processor Tree: ListSink +PREHOOK: query: select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) +where srcpart_date.`date` = '2008-04-08' and srcpart.hr = 13 +PREHOOK: type: 
QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart_date +PREHOOK: Input: default@srcpart_hour +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart join srcpart_date on (srcpart.ds = srcpart_date.ds) join srcpart_hour on (srcpart.hr = srcpart_hour.hr) +where srcpart_date.`date` = '2008-04-08' and srcpart.hr = 13 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart_date +POSTHOOK: Input: default@srcpart_hour +#### A masked pattern was here #### +0 PREHOOK: query: EXPLAIN select distinct(ds) from srcpart where srcpart.ds in (select max(srcpart.ds) from srcpart union all select min(srcpart.ds) from srcpart) PREHOOK: type: QUERY POSTHOOK: query: EXPLAIN select distinct(ds) from srcpart where srcpart.ds in (select max(srcpart.ds) from srcpart union all select min(srcpart.ds) from srcpart) @@ -6472,10 +6382,195 @@ POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 #### A masked pattern was here #### 2008-04-08 2008-04-09 -PREHOOK: query: drop table srcpart_orc +PREHOOK: query: create table srcpart_parquet (key int, value string) partitioned by (ds string, hr int) stored as parquet +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@srcpart_parquet +POSTHOOK: query: create table srcpart_parquet (key int, value string) partitioned by (ds string, hr int) stored as parquet +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@srcpart_parquet +PREHOOK: query: insert into table srcpart_parquet partition (ds, hr) select key, value, ds, hr from srcpart +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +PREHOOK: Output: default@srcpart_parquet +POSTHOOK: query: insert into table srcpart_parquet partition 
(ds, hr) select key, value, ds, hr from srcpart +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart@ds=2008-04-09/hr=12 +POSTHOOK: Output: default@srcpart_parquet@ds=2008-04-08/hr=11 +POSTHOOK: Output: default@srcpart_parquet@ds=2008-04-08/hr=12 +POSTHOOK: Output: default@srcpart_parquet@ds=2008-04-09/hr=11 +POSTHOOK: Output: default@srcpart_parquet@ds=2008-04-09/hr=12 +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-08,hr=11).key EXPRESSION [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-08,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-08,hr=12).key EXPRESSION [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-08,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-09,hr=11).key EXPRESSION [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-09,hr=11).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-09,hr=12).key EXPRESSION [(srcpart)srcpart.FieldSchema(name:key, type:string, comment:default), ] +POSTHOOK: Lineage: srcpart_parquet PARTITION(ds=2008-04-09,hr=12).value SIMPLE [(srcpart)srcpart.FieldSchema(name:value, type:string, comment:default), ] +PREHOOK: query: EXPLAIN select count(*) from srcpart_parquet join srcpart_date_hour on (srcpart_parquet.ds = srcpart_date_hour.ds and srcpart_parquet.hr = srcpart_date_hour.hr) where 
srcpart_date_hour.hour = 11 and (srcpart_date_hour.`date` = '2008-04-08' or srcpart_date_hour.`date` = '2008-04-09') +PREHOOK: type: QUERY +POSTHOOK: query: EXPLAIN select count(*) from srcpart_parquet join srcpart_date_hour on (srcpart_parquet.ds = srcpart_date_hour.ds and srcpart_parquet.hr = srcpart_date_hour.hr) where srcpart_date_hour.hour = 11 and (srcpart_date_hour.`date` = '2008-04-08' or srcpart_date_hour.`date` = '2008-04-09') +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-2 is a root stage + Stage-1 depends on stages: Stage-2 + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-2 + Spark +#### A masked pattern was here #### + Vertices: + Map 3 + Map Operator Tree: + TableScan + alias: srcpart_date_hour + filterExpr: (((date = '2008-04-08') or (date = '2008-04-09')) and (UDFToDouble(hour) = 11.0) and ds is not null and hr is not null) (type: boolean) + Statistics: Num rows: 4 Data size: 108 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: (((date = '2008-04-08') or (date = '2008-04-09')) and (UDFToDouble(hour) = 11.0) and ds is not null and hr is not null) (type: boolean) + Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ds (type: string), hr (type: string) + outputColumnNames: _col0, _col2 + Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + Spark HashTable Sink Operator + keys: + 0 _col0 (type: string), UDFToDouble(_col1) (type: double) + 1 _col0 (type: string), UDFToDouble(_col2) (type: double) + Select Operator + expressions: _col0 (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + Spark Partition Pruning Sink Operator + Target column: ds (string) + partition key expr: ds + 
Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + target work: Map 1 + Select Operator + expressions: UDFToDouble(_col2) (type: double) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: double) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + Spark Partition Pruning Sink Operator + Target column: hr (int) + partition key expr: UDFToDouble(hr) + Statistics: Num rows: 2 Data size: 54 Basic stats: COMPLETE Column stats: NONE + target work: Map 1 + Local Work: + Map Reduce Local Work + + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: srcpart_parquet + Statistics: Num rows: 2000 Data size: 4000 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ds (type: string), hr (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2000 Data size: 4000 Basic stats: COMPLETE Column stats: NONE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: string), UDFToDouble(_col1) (type: double) + 1 _col0 (type: string), UDFToDouble(_col2) (type: double) + input vertices: + 1 Map 3 + Statistics: Num rows: 2200 Data size: 4400 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count() + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Local Work: + Map Reduce Local Work + Reducer 2 + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: 
COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(*) from srcpart_parquet join srcpart_date_hour on (srcpart_parquet.ds = srcpart_date_hour.ds and srcpart_parquet.hr = srcpart_date_hour.hr) where srcpart_date_hour.hour = 11 and (srcpart_date_hour.`date` = '2008-04-08' or srcpart_date_hour.`date` = '2008-04-09') +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart_date_hour +PREHOOK: Input: default@srcpart_parquet +PREHOOK: Input: default@srcpart_parquet@ds=2008-04-08/hr=11 +PREHOOK: Input: default@srcpart_parquet@ds=2008-04-08/hr=12 +PREHOOK: Input: default@srcpart_parquet@ds=2008-04-09/hr=11 +PREHOOK: Input: default@srcpart_parquet@ds=2008-04-09/hr=12 +#### A masked pattern was here #### +POSTHOOK: query: select count(*) from srcpart_parquet join srcpart_date_hour on (srcpart_parquet.ds = srcpart_date_hour.ds and srcpart_parquet.hr = srcpart_date_hour.hr) where srcpart_date_hour.hour = 11 and (srcpart_date_hour.`date` = '2008-04-08' or srcpart_date_hour.`date` = '2008-04-09') +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart_date_hour +POSTHOOK: Input: default@srcpart_parquet +POSTHOOK: Input: default@srcpart_parquet@ds=2008-04-08/hr=11 +POSTHOOK: Input: default@srcpart_parquet@ds=2008-04-08/hr=12 +POSTHOOK: Input: default@srcpart_parquet@ds=2008-04-09/hr=11 +POSTHOOK: Input: default@srcpart_parquet@ds=2008-04-09/hr=12 +#### A masked pattern was here #### +1000 +PREHOOK: query: select count(*) from srcpart where (ds = '2008-04-08' or ds = '2008-04-09') and hr = 11 +PREHOOK: type: QUERY +PREHOOK: Input: default@srcpart +#### A masked pattern was 
here #### +POSTHOOK: query: select count(*) from srcpart where (ds = '2008-04-08' or ds = '2008-04-09') and hr = 11 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@srcpart +#### A masked pattern was here #### +1000 +PREHOOK: query: drop table srcpart_parquet PREHOOK: type: DROPTABLE -POSTHOOK: query: drop table srcpart_orc +PREHOOK: Input: default@srcpart_parquet +PREHOOK: Output: default@srcpart_parquet +POSTHOOK: query: drop table srcpart_parquet POSTHOOK: type: DROPTABLE +POSTHOOK: Input: default@srcpart_parquet +POSTHOOK: Output: default@srcpart_parquet PREHOOK: query: drop table srcpart_date PREHOOK: type: DROPTABLE PREHOOK: Input: default@srcpart_date