HIVE-10607 : Combination of ReducesinkDedup + TopN optimization yields incorrect result if there are multiple GBY in reducer (Ashutosh Chauhan via Sergey Shelukhin)
Signed-off-by: Ashutosh Chauhan <hashut...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/c0116739 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/c0116739 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/c0116739 Branch: refs/heads/beeline-cli Commit: c0116739972bcffcc65498eb721f6b8c1b8e305d Parents: eefb071 Author: Ashutosh Chauhan <hashut...@apache.org> Authored: Mon May 4 22:25:12 2015 -0700 Committer: Ashutosh Chauhan <hashut...@apache.org> Committed: Tue May 5 23:52:38 2015 -0700 ---------------------------------------------------------------------- .../ql/optimizer/LimitPushdownOptimizer.java | 9 +- .../queries/clientpositive/limit_pushdown.q | 4 + .../results/clientpositive/limit_pushdown.q.out | 88 ++++++++++++++++++ .../clientpositive/spark/limit_pushdown.q.out | 94 ++++++++++++++++++++ .../clientpositive/tez/limit_pushdown.q.out | 94 ++++++++++++++++++++ 5 files changed, 288 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java index f80941e..e850550 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java @@ -28,6 +28,7 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.exec.GroupByOperator; import org.apache.hadoop.hive.ql.exec.LimitOperator; import org.apache.hadoop.hive.ql.exec.Operator; +import org.apache.hadoop.hive.ql.exec.OperatorUtils; import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; @@ -86,6 +87,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException; */ public class LimitPushdownOptimizer implements Transform { + @Override public ParseContext transform(ParseContext pctx) throws SemanticException { Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); opRules.put(new RuleRegExp("R1", @@ -105,6 +107,7 @@ public class LimitPushdownOptimizer implements Transform { private static class TopNReducer implements NodeProcessor { + @Override public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException { ReduceSinkOperator rs = null; @@ -122,6 +125,10 @@ public class LimitPushdownOptimizer implements Transform { } } if (rs != null) { + if (OperatorUtils.findOperators(rs, GroupByOperator.class).size() > 1){ + // Not safe to continue for RS-GBY-GBY-LIM kind of pipelines. See HIVE-10607 for more. + return false; + } LimitOperator limit = (LimitOperator) nd; rs.getConf().setTopN(limit.getConf().getLimit()); rs.getConf().setTopNMemoryUsage(((LimitPushdownContext) procCtx).threshold); @@ -135,7 +142,7 @@ public class LimitPushdownOptimizer implements Transform { private static class LimitPushdownContext implements NodeProcessorCtx { - private float threshold; + private final float threshold; public LimitPushdownContext(HiveConf conf) throws SemanticException { threshold = conf.getFloatVar(HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE); http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/queries/clientpositive/limit_pushdown.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/limit_pushdown.q b/ql/src/test/queries/clientpositive/limit_pushdown.q index d93e246..3940564 100644 --- a/ql/src/test/queries/clientpositive/limit_pushdown.q +++ b/ql/src/test/queries/clientpositive/limit_pushdown.q @@ -31,6 +31,10 @@ explain select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20; select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20; +explain +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20; +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20; + -- multi distinct explain select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20; http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/limit_pushdown.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/limit_pushdown.q.out b/ql/src/test/results/clientpositive/limit_pushdown.q.out index c7ab7b3..6ace047 100644 --- a/ql/src/test/results/clientpositive/limit_pushdown.q.out +++ b/ql/src/test/results/clientpositive/limit_pushdown.q.out @@ -504,6 +504,94 @@ POSTHOOK: Input: default@alltypesorc -63 19 -64 24 NULL 2932 +PREHOOK: query: explain +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint), cdouble (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: tinyint), _col1 (type: double) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: double) + sort order: ++ + Map-reduce partition columns: _col0 (type: tinyint) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: tinyint), KEY._col1 (type: double) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: tinyint) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 20 + Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 20 + Processor Tree: + ListSink + +PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +-46 24 +-47 22 +-48 29 +-49 26 +-50 30 +-51 21 +-52 33 +-53 22 +-54 26 +-55 29 +-56 36 +-57 35 +-58 23 +-59 31 +-60 27 +-61 25 +-62 27 +-63 19 +-64 24 +NULL 2932 PREHOOK: query: -- multi distinct explain select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20 http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out index 01106a4..40af253 100644 --- a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out +++ b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out @@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc -63 19 -64 24 NULL 2932 +PREHOOK: query: explain +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Spark + Edges: + Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint), cdouble (type: double) + outputColumnNames: ctinyint, cdouble + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: ctinyint (type: tinyint), cdouble (type: double) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: double) + sort order: ++ + Map-reduce partition columns: _col0 (type: tinyint) + Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE + Reducer 2 + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: tinyint), KEY._col1 (type: double) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6144 Data size: 188618 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: tinyint) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3072 Data size: 94309 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 20 + Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 20 + Processor Tree: + ListSink + +PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +-46 24 +-47 22 +-48 29 +-49 26 +-50 30 +-51 21 +-52 33 +-53 22 +-54 26 +-55 29 +-56 36 +-57 35 +-58 23 +-59 31 +-60 27 +-61 25 +-62 27 +-63 19 +-64 24 +NULL 2932 PREHOOK: query: -- multi distinct explain select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20 http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out index 952d7ff..7038b4d 100644 --- a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out +++ b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out @@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc -63 19 -64 24 NULL 2932 +PREHOOK: query: explain +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +PREHOOK: type: QUERY +POSTHOOK: query: explain +select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: alltypesorc + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: ctinyint (type: tinyint), cdouble (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: _col0 (type: tinyint), _col1 (type: double) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: tinyint), _col1 (type: double) + sort order: ++ + Map-reduce partition columns: _col0 (type: tinyint) + Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE + Reducer 2 + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: tinyint), KEY._col1 (type: double) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1) + keys: _col0 (type: tinyint) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE + Limit + Number of rows: 20 + Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.TextInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: 20 + Processor Tree: + ListSink + +PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +#### A masked pattern was here #### +-46 24 +-47 22 +-48 29 +-49 26 +-50 30 +-51 21 +-52 33 +-53 22 +-54 26 +-55 29 +-56 36 +-57 35 +-58 23 +-59 31 +-60 27 +-61 25 +-62 27 +-63 19 +-64 24 +NULL 2932 PREHOOK: query: -- multi distinct explain select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20