HIVE-10607 : Combination of ReducesinkDedup + TopN optimization yields incorrect result if there are multiple GBY in reducer (Ashutosh Chauhan via Sergey Shelukhin)

Signed-off-by: Ashutosh Chauhan <hashut...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/c0116739
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/c0116739
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/c0116739

Branch: refs/heads/beeline-cli
Commit: c0116739972bcffcc65498eb721f6b8c1b8e305d
Parents: eefb071
Author: Ashutosh Chauhan <hashut...@apache.org>
Authored: Mon May 4 22:25:12 2015 -0700
Committer: Ashutosh Chauhan <hashut...@apache.org>
Committed: Tue May 5 23:52:38 2015 -0700

----------------------------------------------------------------------
 .../ql/optimizer/LimitPushdownOptimizer.java    |  9 +-
 .../queries/clientpositive/limit_pushdown.q     |  4 +
 .../results/clientpositive/limit_pushdown.q.out | 88 ++++++++++++++++++
 .../clientpositive/spark/limit_pushdown.q.out   | 94 ++++++++++++++++++++
 .../clientpositive/tez/limit_pushdown.q.out     | 94 ++++++++++++++++++++
 5 files changed, 288 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
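For context, the new test below compiles to an RS-GBY-GBY-LIM pipeline: the ReduceSink keys on (ctinyint, cdouble), while the second GroupBy re-aggregates on ctinyint alone. If TopN were still pushed into the ReduceSink, only the first N composite keys would survive the shuffle and the outer count would be wrong. The following is a minimal, self-contained sketch (hypothetical data and class names, not Hive code) of why top-N retention on the full sort key under-counts in that shape; the fix simply refuses the pushdown, which is why the new plans below show a plain Reduce Output Operator with no TopN entries.

    import java.util.*;

    // Hypothetical illustration: rows keyed by (group, value), as in the
    // (ctinyint, cdouble) test query below. "Top-N at the ReduceSink" keeps only
    // the first N rows by the full sort key, but the second aggregation counts
    // values per group, so groups near the N-row cutoff lose rows.
    public class TopNPrefixAggregation {
      public static void main(String[] args) {
        int limit = 3;
        // (group, value) pairs, i.e. the output of the first GROUP BY
        List<int[]> rows = Arrays.asList(
            new int[]{1, 10}, new int[]{1, 11}, new int[]{1, 12}, new int[]{1, 13},
            new int[]{2, 20}, new int[]{2, 21});

        System.out.println("correct counts: " + countPerGroup(rows));
        // Simulate top-N retention on the composite key before the second aggregation
        List<int[]> sorted = new ArrayList<>(rows);
        sorted.sort(Comparator.<int[]>comparingInt(r -> r[0]).thenComparingInt(r -> r[1]));
        List<int[]> topN = sorted.subList(0, Math.min(limit, sorted.size()));
        System.out.println("after topN=" + limit + ": " + countPerGroup(topN));
        // correct counts: {1=4, 2=2}
        // after topN=3:   {1=3}   -> group 1 is under-counted, group 2 is lost
      }

      // Count values per group, like the second GROUP BY on the key prefix
      static Map<Integer, Integer> countPerGroup(List<int[]> rows) {
        Map<Integer, Integer> counts = new TreeMap<>();
        for (int[] r : rows) {
          counts.merge(r[0], 1, Integer::sum);
        }
        return counts;
      }
    }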


http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
index f80941e..e850550 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/LimitPushdownOptimizer.java
@@ -28,6 +28,7 @@ import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.ql.exec.GroupByOperator;
 import org.apache.hadoop.hive.ql.exec.LimitOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorUtils;
 import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
 import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
 import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
@@ -86,6 +87,7 @@ import org.apache.hadoop.hive.ql.parse.SemanticException;
  */
 public class LimitPushdownOptimizer implements Transform {
 
+  @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
     Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>();
     opRules.put(new RuleRegExp("R1",
@@ -105,6 +107,7 @@ public class LimitPushdownOptimizer implements Transform {
 
   private static class TopNReducer implements NodeProcessor {
 
+    @Override
     public Object process(Node nd, Stack<Node> stack,
         NodeProcessorCtx procCtx, Object... nodeOutputs) throws SemanticException {
       ReduceSinkOperator rs = null;
@@ -122,6 +125,10 @@ public class LimitPushdownOptimizer implements Transform {
         }
       }
       if (rs != null) {
+        if (OperatorUtils.findOperators(rs, GroupByOperator.class).size() > 1){
+          // Not safe to continue for RS-GBY-GBY-LIM kind of pipelines. See HIVE-10607 for more.
+          return false;
+        }
         LimitOperator limit = (LimitOperator) nd;
         rs.getConf().setTopN(limit.getConf().getLimit());
         rs.getConf().setTopNMemoryUsage(((LimitPushdownContext) procCtx).threshold);
@@ -135,7 +142,7 @@ public class LimitPushdownOptimizer implements Transform {
 
   private static class LimitPushdownContext implements NodeProcessorCtx {
 
-    private float threshold;
+    private final float threshold;
 
     public LimitPushdownContext(HiveConf conf) throws SemanticException {
       threshold = conf.getFloatVar(HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE);
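The guard above relies on OperatorUtils.findOperators(rs, GroupByOperator.class) returning every GroupByOperator reachable from the ReduceSink; if more than one is found, the TopN pushdown is skipped, while single-GroupBy plans keep it. A rough, self-contained sketch of that kind of check, using toy operator classes rather than the real org.apache.hadoop.hive.ql.exec API:

    import java.util.*;

    // Toy operator DAG used only to illustrate the shape of the check; the real
    // classes live in org.apache.hadoop.hive.ql.exec.
    class Op {
      final String name;
      final List<Op> children = new ArrayList<>();
      Op(String name) { this.name = name; }
      Op child(Op c) { children.add(c); return c; }
    }
    class GroupByOp extends Op { GroupByOp(String n) { super(n); } }

    public class TopNGuardSketch {
      // Breadth-first search for operators of a given type starting at (and
      // including) the given node, analogous to OperatorUtils.findOperators.
      static <T extends Op> List<T> findOperators(Op start, Class<T> clazz) {
        List<T> found = new ArrayList<>();
        Deque<Op> queue = new ArrayDeque<>(Collections.singleton(start));
        while (!queue.isEmpty()) {
          Op op = queue.poll();
          if (clazz.isInstance(op)) {
            found.add(clazz.cast(op));
          }
          queue.addAll(op.children);
        }
        return found;
      }

      public static void main(String[] args) {
        // RS -> GBY (mergepartial) -> GBY (complete) -> LIM, the pipeline the fix refuses
        Op rs = new Op("RS");
        rs.child(new GroupByOp("GBY1")).child(new GroupByOp("GBY2")).child(new Op("LIM"));

        boolean safeToPushTopN = findOperators(rs, GroupByOp.class).size() <= 1;
        System.out.println("push TopN into ReduceSink? " + safeToPushTopN); // false
      }
    }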

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/queries/clientpositive/limit_pushdown.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/limit_pushdown.q b/ql/src/test/queries/clientpositive/limit_pushdown.q
index d93e246..3940564 100644
--- a/ql/src/test/queries/clientpositive/limit_pushdown.q
+++ b/ql/src/test/queries/clientpositive/limit_pushdown.q
@@ -31,6 +31,10 @@ explain
 select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
 select ctinyint, count(distinct(cdouble)) from alltypesorc group by ctinyint order by ctinyint limit 20;
 
+explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20;
+
 -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20;

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/limit_pushdown.q.out b/ql/src/test/results/clientpositive/limit_pushdown.q.out
index c7ab7b3..6ace047 100644
--- a/ql/src/test/results/clientpositive/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/limit_pushdown.q.out
@@ -504,6 +504,94 @@ POSTHOOK: Input: default@alltypesorc
 -63    19
 -64    24
 NULL   2932
+PREHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: alltypesorc
+            Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+            Select Operator
+              expressions: ctinyint (type: tinyint), cdouble (type: double)
+              outputColumnNames: _col0, _col1
+              Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+              Group By Operator
+                keys: _col0 (type: tinyint), _col1 (type: double)
+                mode: hash
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: tinyint), _col1 (type: double)
+                  sort order: ++
+                  Map-reduce partition columns: _col0 (type: tinyint)
+                  Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+      Reduce Operator Tree:
+        Group By Operator
+          keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+          Group By Operator
+            aggregations: count(_col1)
+            keys: _col0 (type: tinyint)
+            mode: complete
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+            Limit
+              Number of rows: 20
+              Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+              File Output Operator
+                compressed: false
+                Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                table:
+                    input format: org.apache.hadoop.mapred.TextInputFormat
+                    output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                    serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46    24
+-47    22
+-48    29
+-49    26
+-50    30
+-51    21
+-52    33
+-53    22
+-54    26
+-55    29
+-56    36
+-57    35
+-58    23
+-59    31
+-60    27
+-61    25
+-62    27
+-63    19
+-64    24
+NULL   2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
index 01106a4..40af253 100644
--- a/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/spark/limit_pushdown.q.out
@@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc
 -63    19
 -64    24
 NULL   2932
+PREHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Spark
+      Edges:
+        Reducer 2 <- Map 1 (GROUP PARTITION-LEVEL SORT, 1)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: alltypesorc
+                  Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: ctinyint (type: tinyint), cdouble (type: double)
+                    outputColumnNames: ctinyint, cdouble
+                    Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: ctinyint (type: tinyint), cdouble (type: double)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: tinyint), _col1 (type: double)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: tinyint)
+                        Statistics: Num rows: 12288 Data size: 377237 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 6144 Data size: 188618 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(_col1)
+                  keys: _col0 (type: tinyint)
+                  mode: complete
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 3072 Data size: 94309 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
+                    Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 20 Data size: 600 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46    24
+-47    22
+-48    29
+-49    26
+-50    30
+-51    21
+-52    33
+-53    22
+-54    26
+-55    29
+-56    36
+-57    35
+-58    23
+-59    31
+-60    27
+-61    25
+-62    27
+-63    19
+-64    24
+NULL   2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20

http://git-wip-us.apache.org/repos/asf/hive/blob/c0116739/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
index 952d7ff..7038b4d 100644
--- a/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
+++ b/ql/src/test/results/clientpositive/tez/limit_pushdown.q.out
@@ -540,6 +540,100 @@ POSTHOOK: Input: default@alltypesorc
 -63    19
 -64    24
 NULL   2932
+PREHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+POSTHOOK: query: explain 
+select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: alltypesorc
+                  Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                  Select Operator
+                    expressions: ctinyint (type: tinyint), cdouble (type: double)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: _col0 (type: tinyint), _col1 (type: double)
+                      mode: hash
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: tinyint), _col1 (type: double)
+                        sort order: ++
+                        Map-reduce partition columns: _col0 (type: tinyint)
+                        Statistics: Num rows: 12288 Data size: 2641964 Basic stats: COMPLETE Column stats: NONE
+        Reducer 2 
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: tinyint), KEY._col1 (type: double)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 6144 Data size: 1320982 Basic stats: COMPLETE Column stats: NONE
+                Group By Operator
+                  aggregations: count(_col1)
+                  keys: _col0 (type: tinyint)
+                  mode: complete
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 3072 Data size: 660491 Basic stats: COMPLETE Column stats: NONE
+                  Limit
+                    Number of rows: 20
+                    Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                    File Output Operator
+                      compressed: false
+                      Statistics: Num rows: 20 Data size: 4300 Basic stats: COMPLETE Column stats: NONE
+                      table:
+                          input format: org.apache.hadoop.mapred.TextInputFormat
+                          output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                          serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: 20
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select ctinyint, count(cdouble) from (select ctinyint, cdouble from alltypesorc group by ctinyint, cdouble) t1 group by ctinyint order by ctinyint limit 20
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+-46    24
+-47    22
+-48    29
+-49    26
+-50    30
+-51    21
+-52    33
+-53    22
+-54    26
+-55    29
+-56    36
+-57    35
+-58    23
+-59    31
+-60    27
+-61    25
+-62    27
+-63    19
+-64    24
+NULL   2932
 PREHOOK: query: -- multi distinct
 explain
 select ctinyint, count(distinct(cstring1)), count(distinct(cstring2)) from alltypesorc group by ctinyint order by ctinyint limit 20
