HIVE-16654: Optimize a combination of avg(), sum(), count(distinct) etc (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/b560f492
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/b560f492
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/b560f492

Branch: refs/heads/master
Commit: b560f492ea08a9a83005f166e8ed5ef0fda6592d
Parents: d7ab32f
Author: Pengcheng Xiong <pxi...@hortonworks.com>
Authored: Wed May 31 18:17:53 2017 -0700
Committer: Pengcheng Xiong <pxi...@hortonworks.com>
Committed: Wed May 31 18:17:53 2017 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java      |    2 +
 .../test/resources/testconfiguration.properties    |    1 +
 .../apache/hadoop/hive/ql/exec/Operator.java       |   10 +-
 .../ql/optimizer/CountDistinctRewriteProc.java     |  504 ++++++++
 .../hadoop/hive/ql/optimizer/Optimizer.java        |    4 +
 .../apache/hadoop/hive/ql/plan/GroupByDesc.java    |   15 +
 .../queries/clientpositive/count_dist_rewrite.q    |   65 +
 .../clientpositive/count_dist_rewrite.q.out        | 1151 +++++++++++++++++
 .../clientpositive/groupby_sort_11.q.out           |   39 +-
 .../llap/count_dist_rewrite.q.out                  | 1169 ++++++++++++++++++
 .../results/clientpositive/nullgroup4.q.out        |   41 +-
 .../results/clientpositive/perf/query16.q.out      |  318 ++---
 .../results/clientpositive/perf/query28.q.out      |   58 +-
 .../results/clientpositive/perf/query94.q.out      |  318 ++---
 .../results/clientpositive/perf/query95.q.out      |  302 ++---
 .../clientpositive/spark/nullgroup4.q.out          |   31 +-
 .../test/results/clientpositive/udf_count.q.out    |   39 +-
 .../clientpositive/vector_empty_where.q.out        |  300 ++++-
 18 files changed, 3820 insertions(+), 547 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 5344f36..176d36f 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1569,6 +1569,8 @@ public class HiveConf extends Configuration {
         "Whether to transform OR clauses in Filter operators into IN clauses"),
     HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 31,
         "Minimum number of OR clauses needed to transform into IN clauses"),
+    HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true,
+        "Whether to transform count distinct into two stages"),
     HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true,
         "Extract partition columns from IN clauses"),
     // Constant propagation optimizer

http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index f4a53df..e613374 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -130,6 +130,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\
   constprog_semijoin.q,\
   correlationoptimizer1.q,\
   count.q,\
+  count_dist_rewrite.q,\
   create_merge_compressed.q,\
   cross_join.q,\
   cross_product_check_1.q,\
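The rewrite is controlled by the new hive.optimize.countdistinct flag added above (default true); per the Optimizer change later in this patch, the transform is applied when the flag is on and the execution engine is Tez, or always in test mode. A minimal, illustrative session sketch against the stock src test table follows (the flag name and the query come from this patch; the session itself is only an example):

    -- plan with the rewrite enabled: the distinct is computed in two group-by stages
    EXPLAIN SELECT max(key), count(DISTINCT key) FROM src;

    -- opt out if the extra vertex is not worth it for small data
    SET hive.optimize.countdistinct=false;
    EXPLAIN SELECT max(key), count(DISTINCT key) FROM src;

    -- restore the default
    SET hive.optimize.countdistinct=true;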
http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
index ffa5f41..3656842 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
@@ -1164,8 +1164,16 @@ public abstract class Operator<T extends OperatorDesc> implements Serializable,C
     @SuppressWarnings("unchecked")
     T descClone = (T)conf.clone();
     // also clone the colExprMap by default
+    // we need a deep copy
+    ArrayList<ColumnInfo> colInfos = new ArrayList<>();
+    colInfos.addAll(getSchema().getSignature());
+    Map<String, ExprNodeDesc> map = null;
+    if (getColumnExprMap() != null) {
+      map = new HashMap<>();
+      map.putAll(getColumnExprMap());
+    }
     Operator<? extends OperatorDesc> ret = OperatorFactory.getAndMakeChild(
-        cContext, descClone, getSchema(), getColumnExprMap(), parentClones);
+        cContext, descClone, new RowSchema(colInfos), map, parentClones);
     return ret;
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
new file mode 100644
index 0000000..6450cb3
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
@@ -0,0 +1,504 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+
+/**
+ * Queries of the form "select max(c), count(distinct c) from T" generate a plan of the form
+ * TS->mGBy->RS->rGBy->FS. This plan suffers from the limitation that the vertex containing
+ * rGBy->FS must run as a single task, which makes execution slow because that one task
+ * receives all the data. When this optimization applies, it rewrites the plan to
+ * mGby1-rs1-mGby2-mGby3-rs2-rGby1, introducing the extra vertex mGby2-mGby3-rs2. That vertex
+ * can run multiple tasks, and because it performs aggregation its output is necessarily
+ * smaller than its input, so far less data reaches the original rGby->FS vertex, which still
+ * runs as a single task. Note that on the Calcite tree the HiveExpandDistinctAggregatesRule
+ * performs a similar plan transformation, but under different conditions. No costing is done
+ * here, so the rewrite may slow a query down slightly: if the data is small enough to fit in
+ * the single task of the last reducer, injecting an additional vertex into the pipeline adds
+ * overhead. If that happens, users can turn the rewrite off with the configuration
+ * hive.optimize.countdistinct.
+ */ +public class CountDistinctRewriteProc extends Transform { + + private static final Logger LOG = LoggerFactory.getLogger(CountDistinctRewriteProc.class + .getName()); + + public CountDistinctRewriteProc() { + } + + @Override + public ParseContext transform(ParseContext pctx) throws SemanticException { + + Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); + // process group-by pattern + opRules + .put( + new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + GroupByOperator.getOperatorName() + + "%"), getCountDistinctProc(pctx)); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topop nodes + List<Node> topNodes = new ArrayList<Node>(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + + return pctx; + } + + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + return null; + } + }; + } + + private NodeProcessor getCountDistinctProc(ParseContext pctx) { + return new CountDistinctProcessor(pctx); + } + + /** + * CountDistinctProcessor. + * + */ + public class CountDistinctProcessor implements NodeProcessor { + + protected ParseContext pGraphContext; + + public CountDistinctProcessor(ParseContext pGraphContext) { + this.pGraphContext = pGraphContext; + } + + // Position of distinct column in aggregator list of map Gby before rewrite. + int indexOfDist = -1; + + // Check if we can process it or not + protected boolean checkCountDistinct(GroupByOperator mGby, ReduceSinkOperator rs, + GroupByOperator rGby) { + ArrayList<ExprNodeDesc> keys = mGby.getConf().getKeys(); + if (!(mGby.getConf().getMode() == GroupByDesc.Mode.HASH + && !mGby.getConf().isGroupingSetsPresent() && rs.getConf().getKeyCols().size() == 1 + && rs.getConf().getPartitionCols().size() == 0 + && rs.getConf().getDistinctColumnIndices().size() == 1 + && rGby.getConf().getMode() == GroupByDesc.Mode.MERGEPARTIAL && keys.size() == 1 + && rGby.getConf().getKeys().size() == 0 && mGby.getConf().getOutputColumnNames().size() == mGby + .getConf().getAggregators().size() + 1)) { + return false; + } + for (int pos = 0; pos < mGby.getConf().getAggregators().size(); pos++) { + AggregationDesc aggr = mGby.getConf().getAggregators().get(pos); + if (aggr.getDistinct()) { + if (indexOfDist != -1 || !aggr.getGenericUDAFName().equalsIgnoreCase("count")) { + // there are 2 or more distincts, or distinct is not on count + // TODO: may be the same count(distinct key), count(distinct key) + // TODO: deal with duplicate count distinct key + return false; + } + indexOfDist = pos; + if (!(aggr.getParameters().size() == 1 + && aggr.getParameters().get(0) instanceof ExprNodeColumnDesc && mGby.getConf() + .getKeys().get(0) instanceof ExprNodeColumnDesc)) { + return false; + } else { + ExprNodeColumnDesc agg = (ExprNodeColumnDesc) aggr.getParameters().get(0); + ExprNodeColumnDesc key = (ExprNodeColumnDesc) mGby.getConf().getKeys().get(0); + if (!agg.isSame(key)) { + return false; + } + } + } + } + if (indexOfDist == -1) { + return false; + } + // check if it is potential to trigger nullscan + if 
(pGraphContext.getConf().getBoolVar(HiveConf.ConfVars.HIVEMETADATAONLYQUERIES)) { + for (TableScanOperator tsOp : pGraphContext.getTopOps().values()) { + List<Integer> colIDs = tsOp.getNeededColumnIDs(); + TableScanDesc desc = tsOp.getConf(); + boolean noColNeeded = (colIDs == null) || (colIDs.isEmpty()); + // VC is still here and it will be pruned by column pruner + // boolean noVCneeded = (desc == null) || (desc.getVirtualCols() == null) + // || (desc.getVirtualCols().isEmpty()); + boolean isSkipHF = desc.isNeedSkipHeaderFooters(); + if (noColNeeded && !isSkipHF) { + // it is possible that nullscan can fire, we skip this rule. + return false; + } + } + } + return true; + } + + /* + * We will transform GB-RS-GBY to mGby1-rs1-mGby2-mGby3-rs2-rGby1 + */ + @SuppressWarnings("unchecked") + protected void processGroupBy(GroupByOperator mGby, ReduceSinkOperator rs, GroupByOperator rGby) + throws SemanticException, CloneNotSupportedException { + // remove count(distinct) in map-side gby + List<Operator<? extends OperatorDesc>> parents = mGby.getParentOperators(); + List<Operator<? extends OperatorDesc>> children = rGby.getChildOperators(); + mGby.removeParents(); + rs.removeParents(); + rGby.removeParents(); + + GroupByOperator mGby1 = genMapGroupby1(mGby, indexOfDist); + ReduceSinkOperator rs1 = genReducesink1(mGby1, rs, indexOfDist); + GroupByOperator mGby2 = genMapGroupby2(rs1, mGby); + GroupByOperator mGby3 = genMapGroupby3(mGby2, mGby); + ReduceSinkOperator rs2 = genReducesink2(mGby3, rs); + GroupByOperator rGby1 = genReduceGroupby(rs2, rGby, indexOfDist); + for (Operator<? extends OperatorDesc> parent : parents) { + OperatorFactory.makeChild(parent, mGby1); + } + OperatorFactory.makeChild(mGby1, rs1); + OperatorFactory.makeChild(rs1, mGby2); + OperatorFactory.makeChild(mGby2, mGby3); + OperatorFactory.makeChild(mGby3, rs2); + OperatorFactory.makeChild(rs2, rGby1); + for (Operator<? extends OperatorDesc> child : children) { + child.removeParents(); + OperatorFactory.makeChild(rGby1, child); + } + } + + // mGby1 ---already contains group by key, we need to remove distinct column + private GroupByOperator genMapGroupby1(Operator<? extends OperatorDesc> mGby, int indexOfDist) + throws CloneNotSupportedException { + GroupByOperator mGby1 = (GroupByOperator) mGby.clone(); + // distinct is at lost position. + String fieldString = mGby1.getConf().getOutputColumnNames().get(indexOfDist + 1); + mGby1.getColumnExprMap().remove(fieldString); + mGby1.getConf().getOutputColumnNames().remove(indexOfDist + 1); + mGby1.getConf().getAggregators().remove(indexOfDist); + mGby1.getConf().setDistinct(false); + mGby1.getSchema().getColumnNames().remove(indexOfDist + 1); + mGby1.getSchema().getSignature().remove(indexOfDist + 1); + return mGby1; + } + + // rs1 --- remove distinctColIndices, set #reducer as -1, reset keys, + // values, colexpmap and rowschema + private ReduceSinkOperator genReducesink1(GroupByOperator mGby1, + Operator<? 
extends OperatorDesc> rs, int indexOfDist) throws CloneNotSupportedException, + SemanticException { + ReduceSinkOperator rs1 = (ReduceSinkOperator) rs.clone(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); + ArrayList<String> outputKeyColumnNames = new ArrayList<String>(); + ArrayList<String> outputValueColumnNames = new ArrayList<String>(); + ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>(); + ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>(); + List<String> internalNames = new ArrayList<>(); + for (int index = 0; index < mGby1.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby1.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + // index==0 means this is key + if (index == 0) { + reduceKeys.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index); + outputKeyColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.KEY.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } else { + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index - 1); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } + } + List<List<Integer>> distinctColIndices = new ArrayList<>(); + rs1.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 1, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, 1, -1, + AcidUtils.Operation.NOT_ACID)); + rs1.setColumnExprMap(colExprMap); + + rs1.getSchema().getColumnNames().remove(indexOfDist + 1); + rs1.getSchema().getSignature().remove(indexOfDist + 1); + // KEY._col0:0._col0 => KEY._col0 + + for (int i = 0; i < rs1.getSchema().getSignature().size(); i++) { + rs1.getSchema().getSignature().get(i).setInternalName(internalNames.get(i)); + rs1.getSchema().getColumnNames().set(i, internalNames.get(i)); + } + return rs1; + } + + // mGby2 ---already contains key, remove distinct and change all the others + private GroupByOperator genMapGroupby2(ReduceSinkOperator rs1, + Operator<? 
extends OperatorDesc> mGby) throws CloneNotSupportedException, SemanticException { + GroupByOperator mGby2 = (GroupByOperator) mGby.clone(); + ArrayList<ColumnInfo> rowSchema = new ArrayList<>(); + ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>(); + ArrayList<String> outputColumnNames = new ArrayList<String>(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); + + ColumnInfo exprInfo = rs1.getSchema().getSignature().get(0); + ExprNodeDesc key = new ExprNodeColumnDesc(exprInfo); + groupByKeys.add(key); + String field = SemanticAnalyzer.getColumnInternalName(0); + outputColumnNames.add(field); + ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false); + colExprMap.put(field, key); + rowSchema.add(oColInfo); + + ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>(); + for (int index = 0; index < mGby2.getConf().getAggregators().size(); index++) { + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + if (index != indexOfDist) { + AggregationDesc desc = mGby2.getConf().getAggregators().get(index); + ColumnInfo paraExprInfo = null; + // for example, original it is max 0, dist 1, min 2 + // rs1's schema is key 0, max 1, min 2 + if (index < indexOfDist) { + paraExprInfo = rs1.getSchema().getSignature().get(index + 1); + } else { + paraExprInfo = rs1.getSchema().getSignature().get(index); + } + + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + + // for all the other aggregations, we set the mode to PARTIAL2 + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false); + GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator(); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + aggregations.add(new AggregationDesc(desc.getGenericUDAFName(), + udaf.genericUDAFEvaluator, udaf.convertedParameters, false, amode)); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + } + mGby2.getConf().setMode(GroupByDesc.Mode.PARTIAL2); + mGby2.getConf().setOutputColumnNames(outputColumnNames); + mGby2.getConf().getKeys().clear(); + mGby2.getConf().getKeys().addAll(groupByKeys); + mGby2.getConf().getAggregators().clear(); + mGby2.getConf().getAggregators().addAll(aggregations); + mGby2.getConf().setDistinct(false); + mGby2.setSchema(new RowSchema(rowSchema)); + mGby2.setColumnExprMap(colExprMap); + return mGby2; + } + + // mGby3 is a follow up of mGby2. Here we start to count(key). + private GroupByOperator genMapGroupby3(GroupByOperator mGby2, + Operator<? 
extends OperatorDesc> mGby) throws CloneNotSupportedException, SemanticException { + GroupByOperator mGby3 = (GroupByOperator) mGby.clone(); + ArrayList<ColumnInfo> rowSchema = new ArrayList<>(); + ArrayList<String> outputColumnNames = new ArrayList<String>(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); + + // exprInfo is the key + ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>(); + for (int index = 0; index <= mGby2.getConf().getAggregators().size(); index++) { + if (index == indexOfDist) { + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + // add count(KEY._col0) to replace distinct + ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(0); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH, false); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator( + "count", aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + aggregations.add(newDesc); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + if (index == mGby2.getConf().getAggregators().size()) { + break; + } + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + AggregationDesc desc = mGby2.getConf().getAggregators().get(index); + ColumnInfo paraExprInfo = null; + // for example, original it is max 0, dist 1, min 2 + // rs1's schema is key 0, max 1, min 2 + paraExprInfo = mGby2.getSchema().getSignature().get(index + 1); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + + // for all the other aggregations, we set the mode to PARTIAL2 + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false); + GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator(); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + aggregations.add(new AggregationDesc(desc.getGenericUDAFName(), udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode)); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + mGby3.getConf().setMode(GroupByDesc.Mode.PARTIAL2); + mGby3.getConf().setOutputColumnNames(outputColumnNames); + mGby3.getConf().getKeys().clear(); + mGby3.getConf().getAggregators().clear(); + mGby3.getConf().getAggregators().addAll(aggregations); + mGby3.getConf().setDistinct(false); + mGby3.setSchema(new RowSchema(rowSchema)); + mGby3.setColumnExprMap(colExprMap); + return mGby3; + } + + // #reducer is already 1 + private ReduceSinkOperator genReducesink2(GroupByOperator mGby2, + Operator<? 
extends OperatorDesc> rs) throws SemanticException, CloneNotSupportedException { + ReduceSinkOperator rs2 = (ReduceSinkOperator) rs.clone(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<>(); + + ArrayList<String> outputKeyColumnNames = new ArrayList<String>(); + ArrayList<String> outputValueColumnNames = new ArrayList<String>(); + ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>(); + for (int index = 0; index < mGby2.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + } + List<List<Integer>> distinctColIndices = new ArrayList<>(); + ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<>(); + rs2.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 0, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, false, -1, 0, 1, + AcidUtils.Operation.NOT_ACID)); + rs2.setColumnExprMap(colExprMap); + rs2.getSchema().getSignature().remove(0); + return rs2; + } + + // replace the distinct with the count aggregation + private GroupByOperator genReduceGroupby(ReduceSinkOperator rs2, + Operator<? extends OperatorDesc> rGby, int indexOfDist) throws SemanticException, + CloneNotSupportedException { + GroupByOperator rGby1 = (GroupByOperator) rGby.clone(); + ColumnInfo paraExprInfo = rs2.getSchema().getSignature().get(indexOfDist); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo + .getTabAlias(), paraExprInfo.getIsVirtualCol())); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator("count", + aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.MERGEPARTIAL, false); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + rGby1.getConf().getAggregators().set(indexOfDist, newDesc); + rGby1.getConf().setDistinct(false); + return rGby1; + } + + @Override + public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + GroupByOperator mGby = (GroupByOperator) stack.get(stack.size() - 3); + ReduceSinkOperator rs = (ReduceSinkOperator) stack.get(stack.size() - 2); + GroupByOperator rGby = (GroupByOperator) stack.get(stack.size() - 1); + if (checkCountDistinct(mGby, rs, rGby)) { + LOG.info("trigger count distinct rewrite"); + try { + processGroupBy(mGby, rs, rGby); + } catch (CloneNotSupportedException e) { + throw new SemanticException(e.getMessage()); + } + } + return null; + } + + } + +} http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index 7dace90..781e088 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -143,6 +143,10 @@ public class Optimizer { HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT)) { transformations.add(new GroupByOptimizer()); } + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVECOUNTDISTINCTOPTIMIZER) + && (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_IN_TEST) || isTezExecEngine)) { + transformations.add(new CountDistinctRewriteProc()); + } transformations.add(new ColumnPruner()); if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_OPTIMIZE_SKEWJOIN_COMPILETIME)) { if (!isTezExecEngine) { http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java index 38a9ef2..fe91ee7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java @@ -367,4 +367,19 @@ public class GroupByDesc extends AbstractOperatorDesc { } return new GroupByOperatorExplainVectorization(this, vectorDesc); } + + @Override + public Object clone() { + ArrayList<java.lang.String> outputColumnNames = new ArrayList<>(); + outputColumnNames.addAll(this.outputColumnNames); + ArrayList<ExprNodeDesc> keys = new ArrayList<>(); + keys.addAll(this.keys); + ArrayList<org.apache.hadoop.hive.ql.plan.AggregationDesc> aggregators = new ArrayList<>(); + aggregators.addAll(this.aggregators); + List<Integer> listGroupingSets = new ArrayList<>(); + listGroupingSets.addAll(this.listGroupingSets); + return new GroupByDesc(this.mode, outputColumnNames, keys, aggregators, + this.groupByMemoryUsage, this.memoryThreshold, listGroupingSets, this.groupingSetsPresent, + this.groupingSetPosition, this.isDistinct); + } } http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/test/queries/clientpositive/count_dist_rewrite.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/count_dist_rewrite.q b/ql/src/test/queries/clientpositive/count_dist_rewrite.q new file mode 100644 index 0000000..0b1bc66 --- /dev/null +++ b/ql/src/test/queries/clientpositive/count_dist_rewrite.q @@ -0,0 +1,65 @@ +explain select count(distinct key) from src; + +select count(distinct key) from src; + +explain select max(key), count(distinct key) B1_CNTD from src; + +select max(key), count(distinct key) B1_CNTD 
from src; + +explain select max(key), count(distinct key), min(key) from src; + +select max(key), count(distinct key), min(key) from src; + +explain select max(key), count(distinct key), min(key), avg(key) from src; + +select max(key), count(distinct key), min(key), avg(key) from src; + +explain select count(1), count(distinct key) from src; + +select count(1), count(distinct key) from src; + +explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src; +select count(1), count(distinct key), cast(STDDEV(key) as int) from src; +select count(distinct key), count(1), cast(STDDEV(key) as int) from src; + +explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; + +SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; + +explain select max(key), count(distinct key), min(key), avg(key) from src group by value; + +select max(key), count(distinct key), min(key), avg(key) from src group by value; http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/test/results/clientpositive/count_dist_rewrite.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out new file mode 100644 index 0000000..ceda918 --- /dev/null +++ b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out @@ -0,0 +1,1151 @@ +PREHOOK: query: explain select count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: key (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column 
stats: NONE + Group By Operator + aggregations: count(_col0) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 368 Basic 
stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + File Output Operator + 
compressed: false + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct<count:bigint,sum:double,input:string>) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2), avg(_col3) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string), _col3 (type: struct<count:bigint,sum:double,input:string>) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src 
+POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col2), count(_col0), max(_col3), max(_col4) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as 
not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col3 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col0), stddev(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: bigint), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142 +PREHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142 +PREHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: substr(value, 5) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col0), avg(_col0), max(_col0), min(_col0), std(_col0), stddev_samp(_col0), variance(_col0), var_samp(_col0) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: double), _col2 (type: struct<count:bigint,sum:double,input:string>), _col4 (type: string), _col5 (type: string), _col6 (type: struct<count:bigint,sum:double,variance:double>), _col7 (type: struct<count:bigint,sum:double,variance:double>), _col8 (type: struct<count:bigint,sum:double,variance:double>), _col9 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), max(VALUE._col2), min(VALUE._col3), std(VALUE._col4), stddev_samp(VALUE._col5), variance(VALUE._col6), var_samp(VALUE._col7) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num 
rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1), avg(_col2), count(_col0), max(_col3), min(_col4), std(_col5), stddev_samp(_col6), variance(_col7), var_samp(_col8) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: double), _col1 (type: struct<count:bigint,sum:double,input:string>), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), _col5 (type: struct<count:bigint,sum:double,variance:double>), _col6 (type: struct<count:bigint,sum:double,variance:double>), _col7 (type: struct<count:bigint,sum:double,variance:double>), _col8 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), max(VALUE._col3), min(VALUE._col4), std(VALUE._col5), stddev_samp(VALUE._col6), variance(VALUE._col7), var_samp(VALUE._col8) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: double), _col1 (type: double), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), UDFToInteger(_col5) (type: int), UDFToInteger(_col6) (type: int), UDFToInteger(_col7) (type: int), UDFToInteger(_col8) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +130091.0 260.182 309 98 0 142 143 20428 20469 +PREHOOK: query: explain select 
max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), count(DISTINCT key), min(key), avg(key) + keys: value (type: string), key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: string), _col4 (type: string), _col5 (type: struct<count:bigint,sum:double,input:string>) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(DISTINCT KEY._col1:0._col0), min(VALUE._col2), avg(VALUE._col3) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: double) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 1 0 0.0 +10 1 10 10.0 +100 1 100 100.0 +103 1 103 103.0 +104 1 104 104.0 +105 1 105 105.0 +11 1 11 11.0 +111 1 111 111.0 +113 1 113 113.0 +114 1 114 114.0 +116 1 116 116.0 +118 1 118 118.0 +119 1 119 119.0 +12 1 12 12.0 +120 1 120 120.0 +125 1 125 125.0 +126 1 126 126.0 +128 1 128 128.0 +129 1 129 129.0 +131 1 131 131.0 +133 1 133 133.0 +134 1 134 134.0 +136 1 136 136.0 +137 1 137 137.0 +138 1 138 138.0 +143 1 143 143.0 +145 1 145 145.0 +146 1 146 146.0 +149 1 149 149.0 +15 1 15 15.0 +150 1 150 150.0 +152 1 152 152.0 +153 1 153 153.0 +155 1 155 155.0 +156 1 156 156.0 +157 1 157 157.0 +158 1 158 158.0 +160 1 160 160.0 +162 1 162 162.0 +163 1 163 163.0 +164 1 164 164.0 +165 1 165 165.0 +166 1 166 166.0 +167 1 167 167.0 +168 1 168 168.0 +169 1 169 169.0 +17 1 17 17.0 +170 1 170 170.0 +172 1 172 172.0 +174 1 174 174.0 
+175 1 175 175.0 +176 1 176 176.0 +177 1 177 177.0 +178 1 178 178.0 +179 1 179 179.0 +18 1 18 18.0 +180 1 180 180.0 +181 1 181 181.0 +183 1 183 183.0 +186 1 186 186.0 +187 1 187 187.0 +189 1 189 189.0 +19 1 19 19.0 +190 1 190 190.0 +191 1 191 191.0 +192 1 192 192.0 +193 1 193 193.0 +194 1 194 194.0 +195 1 195 195.0 +196 1 196 196.0 +197 1 197 197.0 +199 1 199 199.0 +2 1 2 2.0 +20 1 20 20.0 +200 1 200 200.0 +201 1 201 201.0 +202 1 202 202.0 +203 1 203 203.0 +205 1 205 205.0 +207 1 207 207.0 +208 1 208 208.0 +209 1 209 209.0 +213 1 213 213.0 +214 1 214 214.0 +216 1 216 216.0 +217 1 217 217.0 +218 1 218 218.0 +219 1 219 219.0 +221 1 221 221.0 +222 1 222 222.0 +223 1 223 223.0 +224 1 224 224.0 +226 1 226 226.0 +228 1 228 228.0 +229 1 229 229.0 +230 1 230 230.0 +233 1 233 233.0 +235 1 235 235.0 +237 1 237 237.0 +238 1 238 238.0 +239 1 239 239.0 +24 1 24 24.0 +241 1 241 241.0 +242 1 242 242.0 +244 1 244 244.0 +247 1 247 247.0 +248 1 248 248.0 +249 1 249 249.0 +252 1 252 252.0 +255 1 255 255.0 +256 1 256 256.0 +257 1 257 257.0 +258 1 258 258.0 +26 1 26 26.0 +260 1 260 260.0 +262 1 262 262.0 +263 1 263 263.0 +265 1 265 265.0 +266 1 266 266.0 +27 1 27 27.0 +272 1 272 272.0 +273 1 273 273.0 +274 1 274 274.0 +275 1 275 275.0 +277 1 277 277.0 +278 1 278 278.0 +28 1 28 28.0 +280 1 280 280.0 +281 1 281 281.0 +282 1 282 282.0 +283 1 283 283.0 +284 1 284 284.0 +285 1 285 285.0 +286 1 286 286.0 +287 1 287 287.0 +288 1 288 288.0 +289 1 289 289.0 +291 1 291 291.0 +292 1 292 292.0 +296 1 296 296.0 +298 1 298 298.0 +30 1 30 30.0 +302 1 302 302.0 +305 1 305 305.0 +306 1 306 306.0 +307 1 307 307.0 +308 1 308 308.0 +309 1 309 309.0 +310 1 310 310.0 +311 1 311 311.0 +315 1 315 315.0 +316 1 316 316.0 +317 1 317 317.0 +318 1 318 318.0 +321 1 321 321.0 +322 1 322 322.0 +323 1 323 323.0 +325 1 325 325.0 +327 1 327 327.0 +33 1 33 33.0 +331 1 331 331.0 +332 1 332 332.0 +333 1 333 333.0 +335 1 335 335.0 +336 1 336 336.0 +338 1 338 338.0 +339 1 339 339.0 +34 1 34 34.0 +341 1 341 341.0 +342 1 342 342.0 +344 1 344 344.0 +345 1 345 345.0 +348 1 348 348.0 +35 1 35 35.0 +351 1 351 351.0 +353 1 353 353.0 +356 1 356 356.0 +360 1 360 360.0 +362 1 362 362.0 +364 1 364 364.0 +365 1 365 365.0 +366 1 366 366.0 +367 1 367 367.0 +368 1 368 368.0 +369 1 369 369.0 +37 1 37 37.0 +373 1 373 373.0 +374 1 374 374.0 +375 1 375 375.0 +377 1 377 377.0 +378 1 378 378.0 +379 1 379 379.0 +382 1 382 382.0 +384 1 384 384.0 +386 1 386 386.0 +389 1 389 389.0 +392 1 392 392.0 +393 1 393 393.0 +394 1 394 394.0 +395 1 395 395.0 +396 1 396 396.0 +397 1 397 397.0 +399 1 399 399.0 +4 1 4 4.0 +400 1 400 400.0 +401 1 401 401.0 +402 1 402 402.0 +403 1 403 403.0 +404 1 404 404.0 +406 1 406 406.0 +407 1 407 407.0 +409 1 409 409.0 +41 1 41 41.0 +411 1 411 411.0 +413 1 413 413.0 +414 1 414 414.0 +417 1 417 417.0 +418 1 418 418.0 +419 1 419 419.0 +42 1 42 42.0 +421 1 421 421.0 +424 1 424 424.0 +427 1 427 427.0 +429 1 429 429.0 +43 1 43 43.0 +430 1 430 430.0 +431 1 431 431.0 +432 1 432 432.0 +435 1 435 435.0 +436 1 436 436.0 +437 1 437 437.0 +438 1 438 438.0 +439 1 439 439.0 +44 1 44 44.0 +443 1 443 443.0 +444 1 444 444.0 +446 1 446 446.0 +448 1 448 448.0 +449 1 449 449.0 +452 1 452 452.0 +453 1 453 453.0 +454 1 454 454.0 +455 1 455 455.0 +457 1 457 457.0 +458 1 458 458.0 +459 1 459 459.0 +460 1 460 460.0 +462 1 462 462.0 +463 1 463 463.0 +466 1 466 466.0 +467 1 467 467.0 +468 1 468 468.0 +469 1 469 469.0 +47 1 47 47.0 +470 1 470 470.0 +472 1 472 472.0 +475 1 475 475.0 +477 1 477 477.0 +478 1 478 478.0 +479 1 479 479.0 +480 1 480 480.0 +481 1 481 481.0 +482 1 482 
482.0 +483 1 483 483.0 +484 1 484 484.0 +485 1 485 485.0 +487 1 487 487.0 +489 1 489 489.0 +490 1 490 490.0 +491 1 491 491.0 +492 1 492 492.0 +493 1 493 493.0 +494 1 494 494.0 +495 1 495 495.0 +496 1 496 496.0 +497 1 497 497.0 +498 1 498 498.0 +5 1 5 5.0 +51 1 51 51.0 +53 1 53 53.0 +54 1 54 54.0 +57 1 57 57.0 +58 1 58 58.0 +64 1 64 64.0 +65 1 65 65.0 +66 1 66 66.0 +67 1 67 67.0 +69 1 69 69.0 +70 1 70 70.0 +72 1 72 72.0 +74 1 74 74.0 +76 1 76 76.0 +77 1 77 77.0 +78 1 78 78.0 +8 1 8 8.0 +80 1 80 80.0 +82 1 82 82.0 +83 1 83 83.0 +84 1 84 84.0 +85 1 85 85.0 +86 1 86 86.0 +87 1 87 87.0 +9 1 9 9.0 +90 1 90 90.0 +92 1 92 92.0 +95 1 95 95.0 +96 1 96 96.0 +97 1 97 97.0 +98 1 98 98.0 http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/test/results/clientpositive/groupby_sort_11.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/groupby_sort_11.q.out b/ql/src/test/results/clientpositive/groupby_sort_11.q.out index 2b3bf4a..fe6bbb3 100644 --- a/ql/src/test/results/clientpositive/groupby_sort_11.q.out +++ b/ql/src/test/results/clientpositive/groupby_sort_11.q.out @@ -292,7 +292,8 @@ POSTHOOK: query: EXPLAIN select count(distinct key+key) from T1 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -306,24 +307,50 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col0) keys: _col0 (type: double) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: double) sort order: + + Map-reduce partition columns: _col0 (type: double) Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + keys: KEY._col0 (type: double) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
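
The two-stage plans above show the shape the count-distinct rewrite gives to global aggregates: Stage-1 groups on the distinct column, so after its reducer each key is represented by a single row that also carries the other partial aggregates, and Stage-2 folds those per-key rows into the final results. As a minimal sketch only, assuming the usual src(key string, value string) test table and ignoring the partial structs the real plan threads through for stddev/avg, the rewritten shape of count(1) together with count(distinct key) corresponds to this hand-written query (an illustration, not the literal plan Hive emits):

    -- Stage 1: group on the distinct column so each key survives exactly once,
    --          carrying count(1) along as a per-key partial aggregate.
    -- Stage 2: count the surviving keys and sum the partials back up.
    SELECT count(t.key) AS distinct_keys,   -- count(distinct key)
           sum(t.cnt)   AS total_rows       -- count(1)
    FROM (
      SELECT key, count(1) AS cnt
      FROM src
      GROUP BY key
    ) t;

For contrast, the plan above for the query with an explicit GROUP BY value keeps the classic single-pass form (count(DISTINCT KEY._col1:0._col0) in a single reduce stage). A hand-written two-stage version of that grouped query would look roughly like the hypothetical sketch below; it is shown only to contrast the two plan shapes and is not something the test output produces:

    -- Hypothetical two-stage form of
    --   select max(key), count(distinct key), min(key), avg(key)
    --   from src group by value
    SELECT max(t.key), count(t.key), min(t.key), sum(t.s) / sum(t.c)
    FROM (
      SELECT value, key, sum(key) AS s, count(key) AS c
      FROM src
      GROUP BY value, key
    ) t
    GROUP BY t.value;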