[14/50] [abbrv] hive git commit: HIVE-15848: count or sum distinct incorrect when hive.optimize.reducededuplication set to true (Zoltan Haindrich reviewed by Ashutosh Chauhan)

sershe Tue, 07 Mar 2017 19:29:40 -0800

HIVE-15848: count or sum distinct incorrect when 
hive.optimize.reducededuplication set to true (Zoltan Haindrich reviewed by 
Ashutosh Chauhan)



Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/b9ad6dc3
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/b9ad6dc3
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/b9ad6dc3

Branch: refs/heads/hive-14535
Commit: b9ad6dc3867efce4dd833e519c788aab280dabd9
Parents: 3e94fb2
Author: Haindrich Zoltán (kirk) <k...@rxd.hu>
Authored: Wed Mar 1 22:30:13 2017 +0100
Committer: Haindrich Zoltán (kirk) <k...@rxd.hu>
Committed: Wed Mar 1 22:35:49 2017 +0100

----------------------------------------------------------------------
 .../test/resources/testconfiguration.properties |   1 +
 .../correlation/ReduceSinkDeDuplication.java    |   4 +
 .../reduce_deduplicate_distinct.q               |  54 +++
 .../llap/reduce_deduplicate_distinct.q.out      | 483 +++++++++++++++++++
 4 files changed, 542 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/b9ad6dc3/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index 807b124..9c6a069 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -413,6 +413,7 @@ minillap.query.files=acid_bucket_pruning.q,\
   llap_udf.q,\
   llapdecider.q,\
   reduce_deduplicate.q,\
+  reduce_deduplicate_distinct.q, \
   remote_script.q,\
   tez_aggr_part_stats.q,\
   tez_union_view.q,\

http://git-wip-us.apache.org/repos/asf/hive/blob/b9ad6dc3/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
index d53efbf..2b075be 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/correlation/ReduceSinkDeDuplication.java
@@ -360,6 +360,10 @@ public class ReduceSinkDeDuplication extends Transform {
       if (moveRSOrderTo == null) {
         return null;
       }
+      // if cRS is being used for distinct - the two reduce sinks are incompatible
+      if (cConf.getDistinctColumnIndices().size() >= 2) {
+        return null;
+      }
       Integer moveReducerNumTo = checkNumReducer(cConf.getNumReducers(), pConf.getNumReducers());
       if (moveReducerNumTo == null ||
           moveReducerNumTo > 0 && cConf.getNumReducers() < minReducer) {

http://git-wip-us.apache.org/repos/asf/hive/blob/b9ad6dc3/ql/src/test/queries/clientpositive/reduce_deduplicate_distinct.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/reduce_deduplicate_distinct.q b/ql/src/test/queries/clientpositive/reduce_deduplicate_distinct.q
new file mode 100644
index 0000000..840025c
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/reduce_deduplicate_distinct.q
@@ -0,0 +1,54 @@
+create table count_distinct_test(id int,key int,name int);
+
+insert into count_distinct_test values (1,1,2),(1,2,3),(1,3,2),(1,4,2),(1,5,3);
+
+-- simple case; no need for opt
+explain select id,count(distinct key),count(distinct name)
+from count_distinct_test
+group by id;
+
+select id,count(distinct key),count(distinct name)
+from count_distinct_test
+group by id;
+
+-- dedup on
+set hive.optimize.reducededuplication=true;
+
+-- candidate1
+explain select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id;
+
+select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id;
+
+-- candidate2
+explain select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id;
+
+select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id;
+
+-- deduplication off
+set hive.optimize.reducededuplication=false;
+
+-- candidate1
+explain select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id;
+
+select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id;
+
+-- candidate2
+explain select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id;
+
+select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id;

http://git-wip-us.apache.org/repos/asf/hive/blob/b9ad6dc3/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out b/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out
new file mode 100644
index 0000000..e5b8d11
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/reduce_deduplicate_distinct.q.out
@@ -0,0 +1,483 @@
+PREHOOK: query: create table count_distinct_test(id int,key int,name int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@count_distinct_test
+POSTHOOK: query: create table count_distinct_test(id int,key int,name int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@count_distinct_test
+PREHOOK: query: insert into count_distinct_test values 
(1,1,2),(1,2,3),(1,3,2),(1,4,2),(1,5,3)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@count_distinct_test
+POSTHOOK: query: insert into count_distinct_test values 
(1,1,2),(1,2,3),(1,3,2),(1,4,2),(1,5,3)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@count_distinct_test
+POSTHOOK: Lineage: count_distinct_test.id EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, 
type:string, comment:), ]
+POSTHOOK: Lineage: count_distinct_test.key EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, 
type:string, comment:), ]
+POSTHOOK: Lineage: count_distinct_test.name EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, 
type:string, comment:), ]
+PREHOOK: query: explain select id,count(distinct key),count(distinct name)
+from count_distinct_test
+group by id
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select id,count(distinct key),count(distinct name)
+from count_distinct_test
+group by id
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: count_distinct_test
+                  Statistics: Num rows: 5 Data size: 25 Basic stats: COMPLETE 
Column stats: NONE
+                  Select Operator
+                    expressions: id (type: int), key (type: int), name (type: 
int)
+                    outputColumnNames: id, key, name
+                    Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                    Group By Operator
+                      aggregations: count(DISTINCT key), count(DISTINCT name)
+                      keys: id (type: int), key (type: int), name (type: int)
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                      Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                        sort order: +++
+                        Map-reduce partition columns: _col0 (type: int)
+                        Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), 
count(DISTINCT KEY._col1:1._col0)
+                keys: KEY._col0 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select id,count(distinct key),count(distinct name)
+from count_distinct_test
+group by id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+POSTHOOK: query: select id,count(distinct key),count(distinct name)
+from count_distinct_test
+group by id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+1      5       2
+PREHOOK: query: explain select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: count_distinct_test
+                  Statistics: Num rows: 5 Data size: 25 Basic stats: COMPLETE 
Column stats: NONE
+                  Select Operator
+                    expressions: id (type: int), key (type: int), name (type: 
int)
+                    outputColumnNames: id, key, name
+                    Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: id (type: int), key (type: int), name (type: int)
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                        sort order: +++
+                        Map-reduce partition columns: _col0 (type: int), _col1 
(type: int), _col2 (type: int)
+                        Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 
(type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                Group By Operator
+                  aggregations: count(DISTINCT _col1), count(DISTINCT _col2)
+                  keys: _col0 (type: int), _col1 (type: int), _col2 (type: int)
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                  Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                    sort order: +++
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 10 Basic stats: 
COMPLETE Column stats: NONE
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), 
count(DISTINCT KEY._col1:1._col0)
+                keys: KEY._col0 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+POSTHOOK: query: select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+1      5       2
+PREHOOK: query: explain select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: count_distinct_test
+                  Statistics: Num rows: 5 Data size: 25 Basic stats: COMPLETE 
Column stats: NONE
+                  Select Operator
+                    expressions: id (type: int), key (type: int), name (type: 
int)
+                    outputColumnNames: id, key, name
+                    Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: id (type: int), key (type: int), name (type: int)
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                        sort order: +++
+                        Map-reduce partition columns: _col0 (type: int), _col1 
(type: int), _col2 (type: int)
+                        Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 
(type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                Group By Operator
+                  aggregations: count(DISTINCT _col2), count(DISTINCT _col1)
+                  keys: _col0 (type: int), _col2 (type: int), _col1 (type: int)
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                  Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                    sort order: +++
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 10 Basic stats: 
COMPLETE Column stats: NONE
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), 
count(DISTINCT KEY._col1:1._col0)
+                keys: KEY._col0 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+POSTHOOK: query: select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+1      2       5
+PREHOOK: query: explain select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: count_distinct_test
+                  Statistics: Num rows: 5 Data size: 25 Basic stats: COMPLETE 
Column stats: NONE
+                  Select Operator
+                    expressions: id (type: int), key (type: int), name (type: 
int)
+                    outputColumnNames: id, key, name
+                    Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: id (type: int), key (type: int), name (type: int)
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                        sort order: +++
+                        Map-reduce partition columns: _col0 (type: int), _col1 
(type: int), _col2 (type: int)
+                        Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 
(type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                Group By Operator
+                  aggregations: count(DISTINCT _col1), count(DISTINCT _col2)
+                  keys: _col0 (type: int), _col1 (type: int), _col2 (type: int)
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                  Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                    sort order: +++
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 10 Basic stats: 
COMPLETE Column stats: NONE
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), 
count(DISTINCT KEY._col1:1._col0)
+                keys: KEY._col0 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+POSTHOOK: query: select id,count(Distinct key),count(Distinct name)
+from (select id,key,name from count_distinct_test group by id,key,name)m
+group by id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+1      5       2
+PREHOOK: query: explain select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Tez
+#### A masked pattern was here ####
+      Edges:
+        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+      Vertices:
+        Map 1 
+            Map Operator Tree:
+                TableScan
+                  alias: count_distinct_test
+                  Statistics: Num rows: 5 Data size: 25 Basic stats: COMPLETE 
Column stats: NONE
+                  Select Operator
+                    expressions: id (type: int), key (type: int), name (type: 
int)
+                    outputColumnNames: id, key, name
+                    Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                    Group By Operator
+                      keys: id (type: int), key (type: int), name (type: int)
+                      mode: hash
+                      outputColumnNames: _col0, _col1, _col2
+                      Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+                      Reduce Output Operator
+                        key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                        sort order: +++
+                        Map-reduce partition columns: _col0 (type: int), _col1 
(type: int), _col2 (type: int)
+                        Statistics: Num rows: 5 Data size: 25 Basic stats: 
COMPLETE Column stats: NONE
+            Execution mode: llap
+            LLAP IO: no inputs
+        Reducer 2 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2 
(type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                Group By Operator
+                  aggregations: count(DISTINCT _col2), count(DISTINCT _col1)
+                  keys: _col0 (type: int), _col2 (type: int), _col1 (type: int)
+                  mode: hash
+                  outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                  Statistics: Num rows: 2 Data size: 10 Basic stats: COMPLETE 
Column stats: NONE
+                  Reduce Output Operator
+                    key expressions: _col0 (type: int), _col1 (type: int), 
_col2 (type: int)
+                    sort order: +++
+                    Map-reduce partition columns: _col0 (type: int)
+                    Statistics: Num rows: 2 Data size: 10 Basic stats: 
COMPLETE Column stats: NONE
+        Reducer 3 
+            Execution mode: llap
+            Reduce Operator Tree:
+              Group By Operator
+                aggregations: count(DISTINCT KEY._col1:0._col0), 
count(DISTINCT KEY._col1:1._col0)
+                keys: KEY._col0 (type: int)
+                mode: mergepartial
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 1 Data size: 5 Basic stats: COMPLETE 
Column stats: NONE
+                  table:
+                      input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                      serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+POSTHOOK: query: select id,count(Distinct name),count(Distinct key)
+from (select id,key,name from count_distinct_test group by id,name,key)m
+group by id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@count_distinct_test
+#### A masked pattern was here ####
+1      2       5

[14/50] [abbrv] hive git commit: HIVE-15848: count or sum distinct incorrect when hive.optimize.reducededuplication set to true (Zoltan Haindrich reviewed by Ashutosh Chauhan)

Reply via email to