Repository: hive Updated Branches: refs/heads/master 6ef4a990d -> 5ade74060
HIVE-20069: Fix reoptimization in case of DPP and Semijoin optimization (Zoltan Haindrich reviewed by Ashutosh Chauhan, Zoltan Haindrich) Signed-off-by: Zoltan Haindrich <k...@rxd.hu> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/cac971b4 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/cac971b4 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/cac971b4 Branch: refs/heads/master Commit: cac971b43a05a161c2be6ea220dbdf1317b07017 Parents: 6ef4a99 Author: Zoltan Haindrich <k...@rxd.hu> Authored: Wed Jul 11 14:48:25 2018 +0200 Committer: Zoltan Haindrich <k...@rxd.hu> Committed: Thu Jul 12 08:50:28 2018 +0200 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 2 + .../hadoop/hive/ql/parse/TezCompiler.java | 72 ++++ .../hive/ql/plan/mapper/StatsSources.java | 9 +- .../hadoop/hive/ql/stats/OperatorStats.java | 7 + ql/src/test/queries/clientpositive/reopt_dpp.q | 62 +++ .../queries/clientpositive/reopt_semijoin.q | 76 ++++ .../results/clientpositive/llap/reopt_dpp.q.out | 259 ++++++++++++ .../clientpositive/llap/reopt_semijoin.q.out | 420 +++++++++++++++++++ 8 files changed, 905 insertions(+), 2 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 72dd144..4001b9f 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -456,6 +456,8 @@ minillaplocal.query.files=\ bucketmapjoin6.q,\ bucketmapjoin7.q,\ bucketpruning1.q,\ + reopt_dpp.q,\ + reopt_semijoin.q,\ retry_failure.q,\ retry_failure_stat_changes.q,\ retry_failure_oom.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java index dfd7908..119aa92 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java @@ -93,6 +93,7 @@ import org.apache.hadoop.hive.ql.optimizer.physical.StageIDsRearranger; import org.apache.hadoop.hive.ql.optimizer.physical.Vectorizer; import org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics; import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.AppMasterEventDesc; import org.apache.hadoop.hive.ql.plan.BaseWork; import org.apache.hadoop.hive.ql.plan.ColStatistics; import org.apache.hadoop.hive.ql.plan.DynamicPruningEventDesc; @@ -105,8 +106,10 @@ import org.apache.hadoop.hive.ql.plan.MoveWork; import org.apache.hadoop.hive.ql.plan.OperatorDesc; import org.apache.hadoop.hive.ql.plan.Statistics; import org.apache.hadoop.hive.ql.plan.TezWork; +import org.apache.hadoop.hive.ql.plan.mapper.PlanMapper; import org.apache.hadoop.hive.ql.session.SessionState; import org.apache.hadoop.hive.ql.session.SessionState.LogHelper; +import org.apache.hadoop.hive.ql.stats.OperatorStats; import org.apache.hadoop.hive.ql.stats.StatsUtils; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFBloomFilter.GenericUDAFBloomFilterEvaluator; import org.slf4j.Logger; @@ -211,6 +214,10 @@ public class TezCompiler extends TaskCompiler { } perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "Shared scans optimization"); + perfLogger.PerfLogBegin(this.getClass().getName(), PerfLogger.TEZ_COMPILER); + markOperatorsWithUnstableRuntimeStats(procCtx); + perfLogger.PerfLogEnd(this.getClass().getName(), PerfLogger.TEZ_COMPILER, "markOperatorsWithUnstableRuntimeStats"); + // need a new run of the constant folding because we might have created lots // of "and true and true" conditions. // Rather than run the full constant folding just need to shortcut AND/OR expressions @@ -1006,6 +1013,71 @@ public class TezCompiler extends TaskCompiler { ogw.startWalking(topNodes, null); } + private static class MarkRuntimeStatsAsIncorrect implements NodeProcessor { + + private PlanMapper planMapper; + + @Override + public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) + throws SemanticException { + ParseContext pCtx = ((OptimizeTezProcContext) procCtx).parseContext; + planMapper = pCtx.getContext().getPlanMapper(); + if (nd instanceof ReduceSinkOperator) { + ReduceSinkOperator rs = (ReduceSinkOperator) nd; + SemiJoinBranchInfo sjInfo = pCtx.getRsToSemiJoinBranchInfo().get(rs); + if (sjInfo == null) { + return null; + } + walkSubtree(sjInfo.getTsOp()); + } + if (nd instanceof AppMasterEventOperator) { + AppMasterEventOperator ame = (AppMasterEventOperator) nd; + AppMasterEventDesc c = ame.getConf(); + if (c instanceof DynamicPruningEventDesc) { + DynamicPruningEventDesc dped = (DynamicPruningEventDesc) c; + mark(dped.getTableScan()); + } + } + return null; + } + + private void walkSubtree(Operator<?> root) { + Deque<Operator<?>> deque = new LinkedList<>(); + deque.add(root); + while (!deque.isEmpty()) { + Operator<?> op = deque.pollLast(); + mark(op); + if (op instanceof ReduceSinkOperator) { + // Done with this branch + } else { + deque.addAll(op.getChildOperators()); + } + } + } + + private void mark(Operator<?> op) { + planMapper.link(op, new OperatorStats.IncorrectRuntimeStatsMarker()); + } + + } + + private void markOperatorsWithUnstableRuntimeStats(OptimizeTezProcContext procCtx) throws SemanticException { + Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); + opRules.put( + new RuleRegExp("R1", + ReduceSinkOperator.getOperatorName() + "%"), + new MarkRuntimeStatsAsIncorrect()); + opRules.put( + new RuleRegExp("R2", + AppMasterEventOperator.getOperatorName() + "%"), + new MarkRuntimeStatsAsIncorrect()); + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, procCtx); + List<Node> topNodes = new ArrayList<Node>(); + topNodes.addAll(procCtx.parseContext.getTopOps().values()); + GraphWalker ogw = new PreOrderOnceWalker(disp); + ogw.startWalking(topNodes, null); + } + private boolean findParallelSemiJoinBranch(Operator<?> mapjoin, TableScanOperator bigTableTS, ParseContext parseContext, Map<ReduceSinkOperator, TableScanOperator> semijoins) { http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/ql/src/java/org/apache/hadoop/hive/ql/plan/mapper/StatsSources.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/mapper/StatsSources.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/mapper/StatsSources.java index 5a62046..823cb87 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/mapper/StatsSources.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/mapper/StatsSources.java @@ -95,9 +95,14 @@ public class StatsSources { } LOG.debug(sb.toString()); } - if (stat.size() >= 1 && sig.size() >= 1) { - map.put(sig.get(0), stat.get(0)); + if (stat.size() < 1 || sig.size() < 1) { + continue; } + if (e.getAll(OperatorStats.IncorrectRuntimeStatsMarker.class).size() > 0) { + LOG.debug("Ignoring {}, marked with OperatorStats.IncorrectRuntimeStatsMarker", sig.get(0)); + continue; + } + map.put(sig.get(0), stat.get(0)); } return map.build(); } http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/ql/src/java/org/apache/hadoop/hive/ql/stats/OperatorStats.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/stats/OperatorStats.java b/ql/src/java/org/apache/hadoop/hive/ql/stats/OperatorStats.java index d70bb82..0c56c82 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/stats/OperatorStats.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/stats/OperatorStats.java @@ -23,6 +23,11 @@ import com.google.common.base.Objects; * Holds information an operator's statistics. */ public final class OperatorStats { + + /** Marker class to help with plan elements which will collect invalid statistics */ + public static class IncorrectRuntimeStatsMarker { + } + private String operatorId; private long outputRecords; @@ -67,4 +72,6 @@ public final class OperatorStats { return Objects.equal(operatorId, o.operatorId) && Objects.equal(outputRecords, o.outputRecords); } + + } http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/ql/src/test/queries/clientpositive/reopt_dpp.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/reopt_dpp.q b/ql/src/test/queries/clientpositive/reopt_dpp.q new file mode 100644 index 0000000..952dcbe --- /dev/null +++ b/ql/src/test/queries/clientpositive/reopt_dpp.q @@ -0,0 +1,62 @@ +set hive.explain.user=true; +set hive.optimize.index.filter=true; +set hive.auto.convert.join=true; +set hive.vectorized.execution.enabled=true; + +drop table if exists x1_store_sales; +drop table if exists x1_date_dim; +drop table if exists x1_item; + +create table x1_store_sales +( + ss_item_sk int +) +partitioned by (ss_sold_date_sk int) +stored as orc; + +create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc; + + +insert into x1_date_dim values (1,1,2000,2), + (2,2,2001,2); +insert into x1_store_sales partition (ss_sold_date_sk=1) values (1); +insert into x1_store_sales partition (ss_sold_date_sk=2) values (2); + +alter table x1_store_sales partition (ss_sold_date_sk=1) update statistics set( +'numRows'='123456', +'rawDataSize'='1234567'); + +alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449'); + + +-- the following query is designed to produce a DPP plan +explain +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000; + +-- tablescan of s should be 2 or 123456 rows; but never 1 +-- and it should not be a mapjoin :) +explain reoptimization +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000; http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/ql/src/test/queries/clientpositive/reopt_semijoin.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/reopt_semijoin.q b/ql/src/test/queries/clientpositive/reopt_semijoin.q new file mode 100644 index 0000000..0eacb8a --- /dev/null +++ b/ql/src/test/queries/clientpositive/reopt_semijoin.q @@ -0,0 +1,76 @@ +set hive.explain.user=true; +set hive.optimize.index.filter=true; +set hive.auto.convert.join=true; +set hive.vectorized.execution.enabled=true; + +drop table if exists x1_store_sales; +drop table if exists x1_date_dim; + +create table x1_store_sales +( + ss_sold_date_sk int, + ss_item_sk int +) +stored as orc; + +create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc; + +insert into x1_date_dim values (1,1,2000,1), + (2,2,2001,2), + (3,2,2001,3), + (4,2,2001,4), + (5,2,2001,5), + (6,2,2001,6), + (7,2,2001,7), + (8,2,2001,8); + +insert into x1_store_sales values (1,1),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(9,9),(10,10),(11,11); + +alter table x1_store_sales update statistics set( +'numRows'='123456', +'rawDataSize'='1234567'); + +alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449'); + + +set hive.auto.convert.join.noconditionaltask.size=1; +set hive.tez.dynamic.partition.pruning=true; +set hive.tez.dynamic.semijoin.reduction=true; +set hive.optimize.index.filter=true; +set hive.tez.bigtable.minsize.semijoin.reduction=1; +set hive.tez.min.bloom.filter.entries=1; +set hive.tez.bloom.filter.factor=1.0f; +set hive.explain.user=false; + +explain +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +; + +explain reoptimization +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +; + + http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/ql/src/test/results/clientpositive/llap/reopt_dpp.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/reopt_dpp.q.out b/ql/src/test/results/clientpositive/llap/reopt_dpp.q.out new file mode 100644 index 0000000..31726f6 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/reopt_dpp.q.out @@ -0,0 +1,259 @@ +PREHOOK: query: drop table if exists x1_store_sales +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_store_sales +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists x1_date_dim +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_date_dim +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists x1_item +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_item +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table x1_store_sales +( + ss_item_sk int +) +partitioned by (ss_sold_date_sk int) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x1_store_sales +POSTHOOK: query: create table x1_store_sales +( + ss_item_sk int +) +partitioned by (ss_sold_date_sk int) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x1_store_sales +PREHOOK: query: create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x1_date_dim +PREHOOK: query: insert into x1_date_dim values (1,1,2000,2), + (2,2,2001,2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: insert into x1_date_dim values (1,1,2000,2), + (2,2,2001,2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_date_dim +POSTHOOK: Lineage: x1_date_dim.d_date_sk SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_month_seq SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_moy SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_year SCRIPT [] +PREHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=1) values (1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=1) values (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: Lineage: x1_store_sales PARTITION(ss_sold_date_sk=1).ss_item_sk SCRIPT [] +PREHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=2) values (2) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_store_sales@ss_sold_date_sk=2 +POSTHOOK: query: insert into x1_store_sales partition (ss_sold_date_sk=2) values (2) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_store_sales@ss_sold_date_sk=2 +POSTHOOK: Lineage: x1_store_sales PARTITION(ss_sold_date_sk=2).ss_item_sk SCRIPT [] +PREHOOK: query: alter table x1_store_sales partition (ss_sold_date_sk=1) update statistics set( +'numRows'='123456', +'rawDataSize'='1234567') +PREHOOK: type: ALTERTABLE_UPDATEPARTSTATS +PREHOOK: Input: default@x1_store_sales +PREHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: query: alter table x1_store_sales partition (ss_sold_date_sk=1) update statistics set( +'numRows'='123456', +'rawDataSize'='1234567') +POSTHOOK: type: ALTERTABLE_UPDATEPARTSTATS +POSTHOOK: Input: default@x1_store_sales +POSTHOOK: Input: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: Output: default@x1_store_sales@ss_sold_date_sk=1 +PREHOOK: query: alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@x1_date_dim +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@x1_date_dim +POSTHOOK: Output: default@x1_date_dim +PREHOOK: query: explain +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000 +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Map 1 <- Map 3 (BROADCAST_EDGE) +Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_35] + Group By Operator [GBY_34] (rows=1 width=8) + Output:["_col0"],aggregations:["count(VALUE._col0)"] + <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap + PARTITION_ONLY_SHUFFLE [RS_33] + Group By Operator [GBY_32] (rows=1 width=8) + Output:["_col0"],aggregations:["count()"] + Map Join Operator [MAPJOIN_31] (rows=1728398 width=8) + Conds:SEL_30._col0=RS_26._col0(Inner) + <-Map 3 [BROADCAST_EDGE] vectorized, llap + BROADCAST [RS_26] + PartitionCols:_col0 + Select Operator [SEL_25] (rows=28 width=8) + Output:["_col0"] + Filter Operator [FIL_24] (rows=28 width=8) + predicate:((d_year = 2000) and d_date_sk is not null) + TableScan [TS_3] (rows=56 width=8) + default@x1_date_dim,d,Tbl:COMPLETE,Col:COMPLETE,Output:["d_date_sk","d_year"] + Dynamic Partitioning Event Operator [EVENT_29] (rows=1 width=8) + Group By Operator [GBY_28] (rows=1 width=8) + Output:["_col0"],keys:_col0 + Select Operator [SEL_27] (rows=28 width=8) + Output:["_col0"] + Please refer to the previous Select Operator [SEL_25] + <-Select Operator [SEL_30] (rows=123457 width=4) + Output:["_col0"] + TableScan [TS_0] (rows=123457 width=14) + default@x1_store_sales,s,Tbl:COMPLETE,Col:COMPLETE + +PREHOOK: query: explain reoptimization +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000 +PREHOOK: type: QUERY +PREHOOK: Input: default@x1_date_dim +PREHOOK: Input: default@x1_store_sales +PREHOOK: Input: default@x1_store_sales@ss_sold_date_sk=1 +PREHOOK: Input: default@x1_store_sales@ss_sold_date_sk=2 +#### A masked pattern was here #### +POSTHOOK: query: explain reoptimization +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x1_date_dim +POSTHOOK: Input: default@x1_store_sales +POSTHOOK: Input: default@x1_store_sales@ss_sold_date_sk=1 +POSTHOOK: Input: default@x1_store_sales@ss_sold_date_sk=2 +#### A masked pattern was here #### +PREHOOK: query: explain reoptimization +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000 +PREHOOK: type: QUERY +POSTHOOK: query: explain reoptimization +select count(*) cnt + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk = d.d_date_sk + and d.d_year=2000 +POSTHOOK: type: QUERY +Plan optimized by CBO. + +Vertex dependency in root stage +Map 1 <- Map 3 (BROADCAST_EDGE) +Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) + +Stage-0 + Fetch Operator + limit:-1 + Stage-1 + Reducer 2 vectorized, llap + File Output Operator [FS_35] + Group By Operator [GBY_34] (runtime: rows=1 width=8) + Output:["_col0"],aggregations:["count(VALUE._col0)"] + <-Map 1 [CUSTOM_SIMPLE_EDGE] vectorized, llap + PARTITION_ONLY_SHUFFLE [RS_33] + Group By Operator [GBY_32] (runtime: rows=1 width=8) + Output:["_col0"],aggregations:["count()"] + Map Join Operator [MAPJOIN_31] (runtime: rows=1 width=8) + Conds:SEL_30._col0=RS_26._col0(Inner) + <-Map 3 [BROADCAST_EDGE] vectorized, llap + BROADCAST [RS_26] + PartitionCols:_col0 + Select Operator [SEL_25] (runtime: rows=1 width=8) + Output:["_col0"] + Filter Operator [FIL_24] (runtime: rows=1 width=8) + predicate:((d_year = 2000) and d_date_sk is not null) + TableScan [TS_3] (runtime: rows=2 width=8) + default@x1_date_dim,d,Tbl:COMPLETE,Col:COMPLETE,Output:["d_date_sk","d_year"] + Dynamic Partitioning Event Operator [EVENT_29] (runtime: rows=1 width=8) + Group By Operator [GBY_28] (runtime: rows=1 width=8) + Output:["_col0"],keys:_col0 + Select Operator [SEL_27] (runtime: rows=1 width=8) + Output:["_col0"] + Please refer to the previous Select Operator [SEL_25] + <-Select Operator [SEL_30] (runtime: rows=1 width=4) + Output:["_col0"] + TableScan [TS_0] (rows=123457 width=14) + default@x1_store_sales,s,Tbl:COMPLETE,Col:COMPLETE + http://git-wip-us.apache.org/repos/asf/hive/blob/cac971b4/ql/src/test/results/clientpositive/llap/reopt_semijoin.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/reopt_semijoin.q.out b/ql/src/test/results/clientpositive/llap/reopt_semijoin.q.out new file mode 100644 index 0000000..e60b207 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/reopt_semijoin.q.out @@ -0,0 +1,420 @@ +PREHOOK: query: drop table if exists x1_store_sales +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_store_sales +POSTHOOK: type: DROPTABLE +PREHOOK: query: drop table if exists x1_date_dim +PREHOOK: type: DROPTABLE +POSTHOOK: query: drop table if exists x1_date_dim +POSTHOOK: type: DROPTABLE +PREHOOK: query: create table x1_store_sales +( + ss_sold_date_sk int, + ss_item_sk int +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x1_store_sales +POSTHOOK: query: create table x1_store_sales +( + ss_sold_date_sk int, + ss_item_sk int +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x1_store_sales +PREHOOK: query: create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: create table x1_date_dim +( + d_date_sk int, + d_month_seq int, + d_year int, + d_moy int +) +stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@x1_date_dim +PREHOOK: query: insert into x1_date_dim values (1,1,2000,1), + (2,2,2001,2), + (3,2,2001,3), + (4,2,2001,4), + (5,2,2001,5), + (6,2,2001,6), + (7,2,2001,7), + (8,2,2001,8) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: insert into x1_date_dim values (1,1,2000,1), + (2,2,2001,2), + (3,2,2001,3), + (4,2,2001,4), + (5,2,2001,5), + (6,2,2001,6), + (7,2,2001,7), + (8,2,2001,8) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_date_dim +POSTHOOK: Lineage: x1_date_dim.d_date_sk SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_month_seq SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_moy SCRIPT [] +POSTHOOK: Lineage: x1_date_dim.d_year SCRIPT [] +PREHOOK: query: insert into x1_store_sales values (1,1),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(9,9),(10,10),(11,11) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@x1_store_sales +POSTHOOK: query: insert into x1_store_sales values (1,1),(3,3),(4,4),(5,5),(6,6),(7,7),(8,8),(9,9),(10,10),(11,11) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@x1_store_sales +POSTHOOK: Lineage: x1_store_sales.ss_item_sk SCRIPT [] +POSTHOOK: Lineage: x1_store_sales.ss_sold_date_sk SCRIPT [] +PREHOOK: query: alter table x1_store_sales update statistics set( +'numRows'='123456', +'rawDataSize'='1234567') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@x1_store_sales +PREHOOK: Output: default@x1_store_sales +POSTHOOK: query: alter table x1_store_sales update statistics set( +'numRows'='123456', +'rawDataSize'='1234567') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@x1_store_sales +POSTHOOK: Output: default@x1_store_sales +PREHOOK: query: alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@x1_date_dim +PREHOOK: Output: default@x1_date_dim +POSTHOOK: query: alter table x1_date_dim update statistics set( +'numRows'='56', +'rawDataSize'='81449') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@x1_date_dim +POSTHOOK: Output: default@x1_date_dim +PREHOOK: query: explain +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: s + filterExpr: (ss_sold_date_sk is not null and (ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter)))) (type: boolean) + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter))) and ss_sold_date_sk is not null) (type: boolean) + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ss_sold_date_sk (type: int), ss_item_sk (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: int) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: d + filterExpr: ((d_moy = 3) and d_date_sk is not null) (type: boolean) + Statistics: Num rows: 56 Data size: 448 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((d_moy = 3) and d_date_sk is not null) (type: boolean) + Statistics: Num rows: 7 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_date_sk (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 7 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 7 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 7 Data size: 28 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1 + Statistics: Num rows: 86419 Data size: 345676 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: sum(_col1) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: explain reoptimization +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +PREHOOK: type: QUERY +PREHOOK: Input: default@x1_date_dim +PREHOOK: Input: default@x1_store_sales +#### A masked pattern was here #### +POSTHOOK: query: explain reoptimization +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@x1_date_dim +POSTHOOK: Input: default@x1_store_sales +#### A masked pattern was here #### +PREHOOK: query: explain reoptimization +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +PREHOOK: type: QUERY +POSTHOOK: query: explain reoptimization +select sum(s.ss_item_sk) + from + x1_store_sales s + ,x1_date_dim d + where + 1=1 + and s.ss_sold_date_sk=d.d_date_sk + and d.d_moy=3 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Reducer 5 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (CUSTOM_SIMPLE_EDGE) + Reducer 5 <- Map 4 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: s + filterExpr: (ss_sold_date_sk is not null and (ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter)))) (type: boolean) + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((ss_sold_date_sk BETWEEN DynamicValue(RS_7_d_d_date_sk_min) AND DynamicValue(RS_7_d_d_date_sk_max) and in_bloom_filter(ss_sold_date_sk, DynamicValue(RS_7_d_d_date_sk_bloom_filter))) and ss_sold_date_sk is not null) (type: boolean) + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: ss_sold_date_sk (type: int), ss_item_sk (type: int) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 123456 Data size: 987648 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: int) + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 4 + Map Operator Tree: + TableScan + alias: d + filterExpr: ((d_moy = 3) and d_date_sk is not null) (type: boolean) + Statistics: (RUNTIME) Num rows: 8 Data size: 64 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((d_moy = 3) and d_date_sk is not null) (type: boolean) + Statistics: (RUNTIME) Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: d_date_sk (type: int) + outputColumnNames: _col0 + Statistics: (RUNTIME) Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: (RUNTIME) Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int) + outputColumnNames: _col0 + Statistics: (RUNTIME) Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: min(_col0), max(_col0), bloom_filter(_col0, expectedEntries=1) + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: (RUNTIME) Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: (RUNTIME) Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col0 (type: int) + 1 _col0 (type: int) + outputColumnNames: _col1 + Statistics: (RUNTIME) Num rows: 1 Data size: 4 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: sum(_col1) + mode: hash + outputColumnNames: _col0 + Statistics: (RUNTIME) Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: (RUNTIME) Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: bigint) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: (RUNTIME) Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + aggregations: min(VALUE._col0), max(VALUE._col1), bloom_filter(VALUE._col2, expectedEntries=1) + mode: final + outputColumnNames: _col0, _col1, _col2 + Statistics: (RUNTIME) Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + sort order: + Statistics: (RUNTIME) Num rows: 1 Data size: 12 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: int), _col2 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +