Author: rhbutani
Date: Fri Apr 11 15:10:42 2014
New Revision: 1586678

URL: http://svn.apache.org/r1586678
Log:
HIVE-6873 DISTINCT clause in aggregates is handled incorrectly by vectorized 
execution (Jitendra, Remus via Ashutosh)

Added:
    
hive/branches/branch-0.13/ql/src/test/queries/clientpositive/vectorized_distinct_gby.q
    
hive/branches/branch-0.13/ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out
Modified:
    
hive/branches/branch-0.13/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java

Modified: 
hive/branches/branch-0.13/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
URL: 
http://svn.apache.org/viewvc/hive/branches/branch-0.13/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java?rev=1586678&r1=1586677&r2=1586678&view=diff
==============================================================================
--- 
hive/branches/branch-0.13/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
 (original)
+++ 
hive/branches/branch-0.13/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupByOptimizer.java
 Fri Apr 11 15:10:42 2014
@@ -210,7 +210,7 @@ public class GroupByOptimizer implements
       if (removeReduceSink) {
         convertGroupByMapSideSortedGroupBy(hiveConf, groupByOp, depth);
       }
-      else if (optimizeDistincts) {
+      else if (optimizeDistincts && !HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_VECTORIZATION_ENABLED)) {
         // In test mode, dont change the query plan. However, setup a query property
         pGraphContext.getQueryProperties().setHasMapGroupBy(true);
         if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT_TESTMODE)) {

Added: 
hive/branches/branch-0.13/ql/src/test/queries/clientpositive/vectorized_distinct_gby.q
URL: 
http://svn.apache.org/viewvc/hive/branches/branch-0.13/ql/src/test/queries/clientpositive/vectorized_distinct_gby.q?rev=1586678&view=auto
==============================================================================
--- 
hive/branches/branch-0.13/ql/src/test/queries/clientpositive/vectorized_distinct_gby.q
 (added)
+++ 
hive/branches/branch-0.13/ql/src/test/queries/clientpositive/vectorized_distinct_gby.q
 Fri Apr 11 15:10:42 2014
@@ -0,0 +1,12 @@
+SET hive.vectorized.execution.enabled=true;
+
+SET hive.map.groupby.sorted=true;
+
+create table dtest(a int, b int) clustered by (a) sorted by (a) into 1 buckets stored as orc;
+insert into table dtest select c,b from (select array(300,300,300,300,300) as a, 1 as b from src limit 1) y lateral view  explode(a) t1 as c;
+
+explain select sum(distinct a), count(distinct a) from dtest;
+select sum(distinct a), count(distinct a) from dtest;
+
+explain select sum(distinct cint), count(distinct cint), avg(distinct cint), std(distinct cint) from alltypesorc;
+select sum(distinct cint), count(distinct cint), avg(distinct cint), std(distinct cint) from alltypesorc;

Added: 
hive/branches/branch-0.13/ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out
URL: 
http://svn.apache.org/viewvc/hive/branches/branch-0.13/ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out?rev=1586678&view=auto
==============================================================================
--- 
hive/branches/branch-0.13/ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out
 (added)
+++ 
hive/branches/branch-0.13/ql/src/test/results/clientpositive/vectorized_distinct_gby.q.out
 Fri Apr 11 15:10:42 2014
@@ -0,0 +1,150 @@
+PREHOOK: query: create table dtest(a int, b int) clustered by (a) sorted by 
(a) into 1 buckets stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+POSTHOOK: query: create table dtest(a int, b int) clustered by (a) sorted by 
(a) into 1 buckets stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@dtest
+PREHOOK: query: insert into table dtest select c,b from (select 
array(300,300,300,300,300) as a, 1 as b from src limit 1) y lateral view  
explode(a) t1 as c
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@dtest
+POSTHOOK: query: insert into table dtest select c,b from (select 
array(300,300,300,300,300) as a, 1 as b from src limit 1) y lateral view  
explode(a) t1 as c
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@dtest
+POSTHOOK: Lineage: dtest.a SIMPLE []
+POSTHOOK: Lineage: dtest.b EXPRESSION []
+PREHOOK: query: explain select sum(distinct a), count(distinct a) from dtest
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select sum(distinct a), count(distinct a) from dtest
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: dtest.a SIMPLE []
+POSTHOOK: Lineage: dtest.b EXPRESSION []
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: dtest
+            Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE Column 
stats: NONE
+            Select Operator
+              expressions: a (type: int)
+              outputColumnNames: a
+              Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE 
Column stats: NONE
+              Group By Operator
+                aggregations: sum(DISTINCT a), count(DISTINCT a)
+                bucketGroup: true
+                keys: a (type: int)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2
+                Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE 
Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Statistics: Num rows: 5 Data size: 40 Basic stats: COMPLETE 
Column stats: NONE
+                  value expressions: _col1 (type: bigint), _col2 (type: bigint)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: sum(DISTINCT KEY._col0:0._col0), count(DISTINCT 
KEY._col0:1._col0)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1
+          Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column 
stats: NONE
+          Select Operator
+            expressions: _col0 (type: bigint), _col1 (type: bigint)
+            outputColumnNames: _col0, _col1
+            Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE Column 
stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 24 Basic stats: COMPLETE 
Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+PREHOOK: query: select sum(distinct a), count(distinct a) from dtest
+PREHOOK: type: QUERY
+PREHOOK: Input: default@dtest
+#### A masked pattern was here ####
+POSTHOOK: query: select sum(distinct a), count(distinct a) from dtest
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@dtest
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dtest.a SIMPLE []
+POSTHOOK: Lineage: dtest.b EXPRESSION []
+300    1
+PREHOOK: query: explain select sum(distinct cint), count(distinct cint), 
avg(distinct cint), std(distinct cint) from alltypesorc
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select sum(distinct cint), count(distinct cint), 
avg(distinct cint), std(distinct cint) from alltypesorc
+POSTHOOK: type: QUERY
+POSTHOOK: Lineage: dtest.a SIMPLE []
+POSTHOOK: Lineage: dtest.b EXPRESSION []
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: alltypesorc
+            Statistics: Num rows: 94309 Data size: 377237 Basic stats: 
COMPLETE Column stats: NONE
+            Select Operator
+              expressions: cint (type: int)
+              outputColumnNames: cint
+              Statistics: Num rows: 94309 Data size: 377237 Basic stats: 
COMPLETE Column stats: NONE
+              Group By Operator
+                aggregations: sum(DISTINCT cint), count(DISTINCT cint), 
avg(DISTINCT cint), std(DISTINCT cint)
+                keys: cint (type: int)
+                mode: hash
+                outputColumnNames: _col0, _col1, _col2, _col3, _col4
+                Statistics: Num rows: 94309 Data size: 377237 Basic stats: 
COMPLETE Column stats: NONE
+                Reduce Output Operator
+                  key expressions: _col0 (type: int)
+                  sort order: +
+                  Statistics: Num rows: 94309 Data size: 377237 Basic stats: 
COMPLETE Column stats: NONE
+                  value expressions: _col1 (type: bigint), _col2 (type: 
bigint), _col3 (type: struct<count:bigint,sum:double,input:int>), _col4 (type: 
struct<count:bigint,sum:double,variance:double>)
+      Execution mode: vectorized
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: sum(DISTINCT KEY._col0:0._col0), count(DISTINCT 
KEY._col0:1._col0), avg(DISTINCT KEY._col0:2._col0), std(DISTINCT 
KEY._col0:3._col0)
+          mode: mergepartial
+          outputColumnNames: _col0, _col1, _col2, _col3
+          Statistics: Num rows: 1 Data size: 32 Basic stats: COMPLETE Column 
stats: NONE
+          Select Operator
+            expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 
(type: double), _col3 (type: double)
+            outputColumnNames: _col0, _col1, _col2, _col3
+            Statistics: Num rows: 1 Data size: 32 Basic stats: COMPLETE Column 
stats: NONE
+            File Output Operator
+              compressed: false
+              Statistics: Num rows: 1 Data size: 32 Basic stats: COMPLETE 
Column stats: NONE
+              table:
+                  input format: org.apache.hadoop.mapred.TextInputFormat
+                  output format: 
org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+                  serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+
+PREHOOK: query: select sum(distinct cint), count(distinct cint), avg(distinct 
cint), std(distinct cint) from alltypesorc
+PREHOOK: type: QUERY
+PREHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: query: select sum(distinct cint), count(distinct cint), avg(distinct 
cint), std(distinct cint) from alltypesorc
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@alltypesorc
+#### A masked pattern was here ####
+POSTHOOK: Lineage: dtest.a SIMPLE []
+POSTHOOK: Lineage: dtest.b EXPRESSION []
+-3482841611    6082    -572647.4204209142      6.153814687328991E8


Reply via email to