HIVE-16654: Optimize a combination of avg(), sum(), count(distinct) etc (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/b560f492
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/b560f492
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/b560f492

Branch: refs/heads/master
Commit: b560f492ea08a9a83005f166e8ed5ef0fda6592d
Parents: d7ab32f
Author: Pengcheng Xiong <pxi...@hortonworks.com>
Authored: Wed May 31 18:17:53 2017 -0700
Committer: Pengcheng Xiong <pxi...@hortonworks.com>
Committed: Wed May 31 18:17:53 2017 -0700

----------------------------------------------------------------------
 .../org/apache/hadoop/hive/conf/HiveConf.java      |    2 +
 .../test/resources/testconfiguration.properties    |    1 +
 .../apache/hadoop/hive/ql/exec/Operator.java       |   10 +-
 .../ql/optimizer/CountDistinctRewriteProc.java     |  504 ++++++++
 .../hadoop/hive/ql/optimizer/Optimizer.java        |    4 +
 .../apache/hadoop/hive/ql/plan/GroupByDesc.java    |   15 +
 .../queries/clientpositive/count_dist_rewrite.q    |   65 +
 .../clientpositive/count_dist_rewrite.q.out        | 1151 +++++++++++++++++
 .../clientpositive/groupby_sort_11.q.out           |   39 +-
 .../llap/count_dist_rewrite.q.out                  | 1169 ++++++++++++++++++
 .../results/clientpositive/nullgroup4.q.out        |   41 +-
 .../results/clientpositive/perf/query16.q.out      |  318 ++---
 .../results/clientpositive/perf/query28.q.out      |   58 +-
 .../results/clientpositive/perf/query94.q.out      |  318 ++---
 .../results/clientpositive/perf/query95.q.out      |  302 ++---
 .../clientpositive/spark/nullgroup4.q.out          |   31 +-
 .../test/results/clientpositive/udf_count.q.out    |   39 +-
 .../clientpositive/vector_empty_where.q.out        |  300 ++++-
 18 files changed, 3820 insertions(+), 547 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
----------------------------------------------------------------------
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 5344f36..176d36f 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -1569,6 +1569,8 @@ public class HiveConf extends Configuration {
         "Whether to transform OR clauses in Filter operators into IN clauses"),
     HIVEPOINTLOOKUPOPTIMIZERMIN("hive.optimize.point.lookup.min", 31,
         "Minimum number of OR clauses needed to transform into IN clauses"),
+    HIVECOUNTDISTINCTOPTIMIZER("hive.optimize.countdistinct", true,
+        "Whether to transform count distinct into two stages"),
     HIVEPARTITIONCOLUMNSEPARATOR("hive.optimize.partition.columns.separate", true,
         "Extract partition columns from IN clauses"),
     // Constant propagation optimizer

http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/itests/src/test/resources/testconfiguration.properties
----------------------------------------------------------------------
diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties
index f4a53df..e613374 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -130,6 +130,7 @@ minillaplocal.shared.query.files=alter_merge_2_orc.q,\
   constprog_semijoin.q,\
   correlationoptimizer1.q,\
   count.q,\
+  count_dist_rewrite.q,\
   create_merge_compressed.q,\
   cross_join.q,\
   cross_product_check_1.q,\
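The rewrite is controlled by the new hive.optimize.countdistinct flag added above (default true); per the Optimizer change later in this patch, the transform is applied when the flag is on and the execution engine is Tez, or always in test mode. A minimal, illustrative session sketch against the stock src test table follows (the flag name and the query come from this patch; the session itself is only an example):

    -- plan with the rewrite enabled: the distinct is computed in two group-by stages
    EXPLAIN SELECT max(key), count(DISTINCT key) FROM src;

    -- opt out if the extra vertex is not worth it for small data
    SET hive.optimize.countdistinct=false;
    EXPLAIN SELECT max(key), count(DISTINCT key) FROM src;

    -- restore the default
    SET hive.optimize.countdistinct=true;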
http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
index ffa5f41..3656842 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Operator.java
@@ -1164,8 +1164,16 @@ public abstract class Operator<T extends OperatorDesc> implements Serializable,C
     @SuppressWarnings("unchecked")
     T descClone = (T)conf.clone();
     // also clone the colExprMap by default
+    // we need a deep copy
+    ArrayList<ColumnInfo> colInfos = new ArrayList<>();
+    colInfos.addAll(getSchema().getSignature());
+    Map<String, ExprNodeDesc> map = null;
+    if (getColumnExprMap() != null) {
+      map = new HashMap<>();
+      map.putAll(getColumnExprMap());
+    }
     Operator<? extends OperatorDesc> ret = OperatorFactory.getAndMakeChild(
-        cContext, descClone, getSchema(), getColumnExprMap(), parentClones);
+        cContext, descClone, new RowSchema(colInfos), map, parentClones);
     return ret;
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
new file mode 100644
index 0000000..6450cb3
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/CountDistinctRewriteProc.java
@@ -0,0 +1,504 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer;
+import org.apache.hadoop.hive.ql.parse.SemanticAnalyzer.GenericUDAFInfo;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.GroupByDesc;
+import org.apache.hadoop.hive.ql.plan.OperatorDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.TableScanDesc;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFEvaluator.Mode;
+
+/**
+ * Queries of the form "select max(c), count(distinct c) from T" generate a plan of the form
+ * TS->mGBy->RS->rGBy->FS. This plan suffers from the limitation that the vertex containing
+ * rGBy->FS must run as a single task, which makes execution slow because that one task
+ * receives all the data. When this optimization applies, it rewrites the plan to
+ * mGby1-rs1-mGby2-mGby3-rs2-rGby1, introducing the extra vertex mGby2-mGby3-rs2. That vertex
+ * can run multiple tasks, and because it performs aggregation its output is necessarily
+ * smaller than its input, so far less data reaches the original rGby->FS vertex, which still
+ * runs as a single task. Note that on the Calcite tree the HiveExpandDistinctAggregatesRule
+ * performs a similar plan transformation, but under different conditions. No costing is done
+ * here, so the rewrite may slow a query down slightly: if the data is small enough to fit in
+ * the single task of the last reducer, injecting an additional vertex into the pipeline adds
+ * overhead. If that happens, users can turn the rewrite off with the configuration
+ * hive.optimize.countdistinct.
+ */ +public class CountDistinctRewriteProc extends Transform { + + private static final Logger LOG = LoggerFactory.getLogger(CountDistinctRewriteProc.class + .getName()); + + public CountDistinctRewriteProc() { + } + + @Override + public ParseContext transform(ParseContext pctx) throws SemanticException { + + Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); + // process group-by pattern + opRules + .put( + new RuleRegExp("R1", GroupByOperator.getOperatorName() + "%" + + ReduceSinkOperator.getOperatorName() + "%" + GroupByOperator.getOperatorName() + + "%"), getCountDistinctProc(pctx)); + + // The dispatcher fires the processor corresponding to the closest matching + // rule and passes the context along + Dispatcher disp = new DefaultRuleDispatcher(getDefaultProc(), opRules, null); + GraphWalker ogw = new DefaultGraphWalker(disp); + + // Create a list of topop nodes + List<Node> topNodes = new ArrayList<Node>(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + + return pctx; + } + + private NodeProcessor getDefaultProc() { + return new NodeProcessor() { + @Override + public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + return null; + } + }; + } + + private NodeProcessor getCountDistinctProc(ParseContext pctx) { + return new CountDistinctProcessor(pctx); + } + + /** + * CountDistinctProcessor. + * + */ + public class CountDistinctProcessor implements NodeProcessor { + + protected ParseContext pGraphContext; + + public CountDistinctProcessor(ParseContext pGraphContext) { + this.pGraphContext = pGraphContext; + } + + // Position of distinct column in aggregator list of map Gby before rewrite. + int indexOfDist = -1; + + // Check if we can process it or not + protected boolean checkCountDistinct(GroupByOperator mGby, ReduceSinkOperator rs, + GroupByOperator rGby) { + ArrayList<ExprNodeDesc> keys = mGby.getConf().getKeys(); + if (!(mGby.getConf().getMode() == GroupByDesc.Mode.HASH + && !mGby.getConf().isGroupingSetsPresent() && rs.getConf().getKeyCols().size() == 1 + && rs.getConf().getPartitionCols().size() == 0 + && rs.getConf().getDistinctColumnIndices().size() == 1 + && rGby.getConf().getMode() == GroupByDesc.Mode.MERGEPARTIAL && keys.size() == 1 + && rGby.getConf().getKeys().size() == 0 && mGby.getConf().getOutputColumnNames().size() == mGby + .getConf().getAggregators().size() + 1)) { + return false; + } + for (int pos = 0; pos < mGby.getConf().getAggregators().size(); pos++) { + AggregationDesc aggr = mGby.getConf().getAggregators().get(pos); + if (aggr.getDistinct()) { + if (indexOfDist != -1 || !aggr.getGenericUDAFName().equalsIgnoreCase("count")) { + // there are 2 or more distincts, or distinct is not on count + // TODO: may be the same count(distinct key), count(distinct key) + // TODO: deal with duplicate count distinct key + return false; + } + indexOfDist = pos; + if (!(aggr.getParameters().size() == 1 + && aggr.getParameters().get(0) instanceof ExprNodeColumnDesc && mGby.getConf() + .getKeys().get(0) instanceof ExprNodeColumnDesc)) { + return false; + } else { + ExprNodeColumnDesc agg = (ExprNodeColumnDesc) aggr.getParameters().get(0); + ExprNodeColumnDesc key = (ExprNodeColumnDesc) mGby.getConf().getKeys().get(0); + if (!agg.isSame(key)) { + return false; + } + } + } + } + if (indexOfDist == -1) { + return false; + } + // check if it is potential to trigger nullscan + if 
(pGraphContext.getConf().getBoolVar(HiveConf.ConfVars.HIVEMETADATAONLYQUERIES)) { + for (TableScanOperator tsOp : pGraphContext.getTopOps().values()) { + List<Integer> colIDs = tsOp.getNeededColumnIDs(); + TableScanDesc desc = tsOp.getConf(); + boolean noColNeeded = (colIDs == null) || (colIDs.isEmpty()); + // VC is still here and it will be pruned by column pruner + // boolean noVCneeded = (desc == null) || (desc.getVirtualCols() == null) + // || (desc.getVirtualCols().isEmpty()); + boolean isSkipHF = desc.isNeedSkipHeaderFooters(); + if (noColNeeded && !isSkipHF) { + // it is possible that nullscan can fire, we skip this rule. + return false; + } + } + } + return true; + } + + /* + * We will transform GB-RS-GBY to mGby1-rs1-mGby2-mGby3-rs2-rGby1 + */ + @SuppressWarnings("unchecked") + protected void processGroupBy(GroupByOperator mGby, ReduceSinkOperator rs, GroupByOperator rGby) + throws SemanticException, CloneNotSupportedException { + // remove count(distinct) in map-side gby + List<Operator<? extends OperatorDesc>> parents = mGby.getParentOperators(); + List<Operator<? extends OperatorDesc>> children = rGby.getChildOperators(); + mGby.removeParents(); + rs.removeParents(); + rGby.removeParents(); + + GroupByOperator mGby1 = genMapGroupby1(mGby, indexOfDist); + ReduceSinkOperator rs1 = genReducesink1(mGby1, rs, indexOfDist); + GroupByOperator mGby2 = genMapGroupby2(rs1, mGby); + GroupByOperator mGby3 = genMapGroupby3(mGby2, mGby); + ReduceSinkOperator rs2 = genReducesink2(mGby3, rs); + GroupByOperator rGby1 = genReduceGroupby(rs2, rGby, indexOfDist); + for (Operator<? extends OperatorDesc> parent : parents) { + OperatorFactory.makeChild(parent, mGby1); + } + OperatorFactory.makeChild(mGby1, rs1); + OperatorFactory.makeChild(rs1, mGby2); + OperatorFactory.makeChild(mGby2, mGby3); + OperatorFactory.makeChild(mGby3, rs2); + OperatorFactory.makeChild(rs2, rGby1); + for (Operator<? extends OperatorDesc> child : children) { + child.removeParents(); + OperatorFactory.makeChild(rGby1, child); + } + } + + // mGby1 ---already contains group by key, we need to remove distinct column + private GroupByOperator genMapGroupby1(Operator<? extends OperatorDesc> mGby, int indexOfDist) + throws CloneNotSupportedException { + GroupByOperator mGby1 = (GroupByOperator) mGby.clone(); + // distinct is at lost position. + String fieldString = mGby1.getConf().getOutputColumnNames().get(indexOfDist + 1); + mGby1.getColumnExprMap().remove(fieldString); + mGby1.getConf().getOutputColumnNames().remove(indexOfDist + 1); + mGby1.getConf().getAggregators().remove(indexOfDist); + mGby1.getConf().setDistinct(false); + mGby1.getSchema().getColumnNames().remove(indexOfDist + 1); + mGby1.getSchema().getSignature().remove(indexOfDist + 1); + return mGby1; + } + + // rs1 --- remove distinctColIndices, set #reducer as -1, reset keys, + // values, colexpmap and rowschema + private ReduceSinkOperator genReducesink1(GroupByOperator mGby1, + Operator<? 
extends OperatorDesc> rs, int indexOfDist) throws CloneNotSupportedException, + SemanticException { + ReduceSinkOperator rs1 = (ReduceSinkOperator) rs.clone(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); + ArrayList<String> outputKeyColumnNames = new ArrayList<String>(); + ArrayList<String> outputValueColumnNames = new ArrayList<String>(); + ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>(); + ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>(); + List<String> internalNames = new ArrayList<>(); + for (int index = 0; index < mGby1.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby1.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + // index==0 means this is key + if (index == 0) { + reduceKeys.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index); + outputKeyColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.KEY.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } else { + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index - 1); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + internalNames.add(internalName); + } + } + List<List<Integer>> distinctColIndices = new ArrayList<>(); + rs1.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 1, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, true, -1, 1, -1, + AcidUtils.Operation.NOT_ACID)); + rs1.setColumnExprMap(colExprMap); + + rs1.getSchema().getColumnNames().remove(indexOfDist + 1); + rs1.getSchema().getSignature().remove(indexOfDist + 1); + // KEY._col0:0._col0 => KEY._col0 + + for (int i = 0; i < rs1.getSchema().getSignature().size(); i++) { + rs1.getSchema().getSignature().get(i).setInternalName(internalNames.get(i)); + rs1.getSchema().getColumnNames().set(i, internalNames.get(i)); + } + return rs1; + } + + // mGby2 ---already contains key, remove distinct and change all the others + private GroupByOperator genMapGroupby2(ReduceSinkOperator rs1, + Operator<? 
extends OperatorDesc> mGby) throws CloneNotSupportedException, SemanticException { + GroupByOperator mGby2 = (GroupByOperator) mGby.clone(); + ArrayList<ColumnInfo> rowSchema = new ArrayList<>(); + ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>(); + ArrayList<String> outputColumnNames = new ArrayList<String>(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); + + ColumnInfo exprInfo = rs1.getSchema().getSignature().get(0); + ExprNodeDesc key = new ExprNodeColumnDesc(exprInfo); + groupByKeys.add(key); + String field = SemanticAnalyzer.getColumnInternalName(0); + outputColumnNames.add(field); + ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false); + colExprMap.put(field, key); + rowSchema.add(oColInfo); + + ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>(); + for (int index = 0; index < mGby2.getConf().getAggregators().size(); index++) { + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + if (index != indexOfDist) { + AggregationDesc desc = mGby2.getConf().getAggregators().get(index); + ColumnInfo paraExprInfo = null; + // for example, original it is max 0, dist 1, min 2 + // rs1's schema is key 0, max 1, min 2 + if (index < indexOfDist) { + paraExprInfo = rs1.getSchema().getSignature().get(index + 1); + } else { + paraExprInfo = rs1.getSchema().getSignature().get(index); + } + + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + + // for all the other aggregations, we set the mode to PARTIAL2 + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false); + GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator(); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + aggregations.add(new AggregationDesc(desc.getGenericUDAFName(), + udaf.genericUDAFEvaluator, udaf.convertedParameters, false, amode)); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + } + mGby2.getConf().setMode(GroupByDesc.Mode.PARTIAL2); + mGby2.getConf().setOutputColumnNames(outputColumnNames); + mGby2.getConf().getKeys().clear(); + mGby2.getConf().getKeys().addAll(groupByKeys); + mGby2.getConf().getAggregators().clear(); + mGby2.getConf().getAggregators().addAll(aggregations); + mGby2.getConf().setDistinct(false); + mGby2.setSchema(new RowSchema(rowSchema)); + mGby2.setColumnExprMap(colExprMap); + return mGby2; + } + + // mGby3 is a follow up of mGby2. Here we start to count(key). + private GroupByOperator genMapGroupby3(GroupByOperator mGby2, + Operator<? 
extends OperatorDesc> mGby) throws CloneNotSupportedException, SemanticException { + GroupByOperator mGby3 = (GroupByOperator) mGby.clone(); + ArrayList<ColumnInfo> rowSchema = new ArrayList<>(); + ArrayList<String> outputColumnNames = new ArrayList<String>(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>(); + + // exprInfo is the key + ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>(); + for (int index = 0; index <= mGby2.getConf().getAggregators().size(); index++) { + if (index == indexOfDist) { + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + // add count(KEY._col0) to replace distinct + ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(0); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.HASH, false); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator( + "count", aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + aggregations.add(newDesc); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + if (index == mGby2.getConf().getAggregators().size()) { + break; + } + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + AggregationDesc desc = mGby2.getConf().getAggregators().get(index); + ColumnInfo paraExprInfo = null; + // for example, original it is max 0, dist 1, min 2 + // rs1's schema is key 0, max 1, min 2 + paraExprInfo = mGby2.getSchema().getSignature().get(index + 1); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, + paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol())); + + // for all the other aggregations, we set the mode to PARTIAL2 + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.PARTIAL2, false); + GenericUDAFEvaluator genericUDAFEvaluator = desc.getGenericUDAFEvaluator(); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + String f = SemanticAnalyzer.getColumnInternalName(aggregations.size()); + aggregations.add(new AggregationDesc(desc.getGenericUDAFName(), udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode)); + outputColumnNames.add(f); + rowSchema.add(new ColumnInfo(f, udaf.returnType, "", false)); + } + mGby3.getConf().setMode(GroupByDesc.Mode.PARTIAL2); + mGby3.getConf().setOutputColumnNames(outputColumnNames); + mGby3.getConf().getKeys().clear(); + mGby3.getConf().getAggregators().clear(); + mGby3.getConf().getAggregators().addAll(aggregations); + mGby3.getConf().setDistinct(false); + mGby3.setSchema(new RowSchema(rowSchema)); + mGby3.setColumnExprMap(colExprMap); + return mGby3; + } + + // #reducer is already 1 + private ReduceSinkOperator genReducesink2(GroupByOperator mGby2, + Operator<? 
extends OperatorDesc> rs) throws SemanticException, CloneNotSupportedException { + ReduceSinkOperator rs2 = (ReduceSinkOperator) rs.clone(); + Map<String, ExprNodeDesc> colExprMap = new HashMap<>(); + + ArrayList<String> outputKeyColumnNames = new ArrayList<String>(); + ArrayList<String> outputValueColumnNames = new ArrayList<String>(); + ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>(); + for (int index = 0; index < mGby2.getSchema().getSignature().size(); index++) { + ColumnInfo paraExprInfo = mGby2.getSchema().getSignature().get(index); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(paraExprInfo.getType(), + paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()); + reduceValues.add(exprDesc); + String outputColName = SemanticAnalyzer.getColumnInternalName(index); + outputValueColumnNames.add(outputColName); + String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName; + colExprMap.put(internalName, exprDesc); + } + List<List<Integer>> distinctColIndices = new ArrayList<>(); + ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<>(); + rs2.setConf(PlanUtils.getReduceSinkDesc(reduceKeys, 0, reduceValues, distinctColIndices, + outputKeyColumnNames, outputValueColumnNames, false, -1, 0, 1, + AcidUtils.Operation.NOT_ACID)); + rs2.setColumnExprMap(colExprMap); + rs2.getSchema().getSignature().remove(0); + return rs2; + } + + // replace the distinct with the count aggregation + private GroupByOperator genReduceGroupby(ReduceSinkOperator rs2, + Operator<? extends OperatorDesc> rGby, int indexOfDist) throws SemanticException, + CloneNotSupportedException { + GroupByOperator rGby1 = (GroupByOperator) rGby.clone(); + ColumnInfo paraExprInfo = rs2.getSchema().getSignature().get(indexOfDist); + String paraExpression = paraExprInfo.getInternalName(); + assert (paraExpression != null); + ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>(); + aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo + .getTabAlias(), paraExprInfo.getIsVirtualCol())); + GenericUDAFEvaluator genericUDAFEvaluator = SemanticAnalyzer.getGenericUDAFEvaluator("count", + aggParameters, null, false, false); + assert (genericUDAFEvaluator != null); + Mode amode = SemanticAnalyzer.groupByDescModeToUDAFMode(GroupByDesc.Mode.MERGEPARTIAL, false); + GenericUDAFInfo udaf = SemanticAnalyzer.getGenericUDAFInfo(genericUDAFEvaluator, amode, + aggParameters); + AggregationDesc newDesc = new AggregationDesc("count", udaf.genericUDAFEvaluator, + udaf.convertedParameters, false, amode); + rGby1.getConf().getAggregators().set(indexOfDist, newDesc); + rGby1.getConf().setDistinct(false); + return rGby1; + } + + @Override + public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, + Object... 
nodeOutputs) throws SemanticException { + GroupByOperator mGby = (GroupByOperator) stack.get(stack.size() - 3); + ReduceSinkOperator rs = (ReduceSinkOperator) stack.get(stack.size() - 2); + GroupByOperator rGby = (GroupByOperator) stack.get(stack.size() - 1); + if (checkCountDistinct(mGby, rs, rGby)) { + LOG.info("trigger count distinct rewrite"); + try { + processGroupBy(mGby, rs, rGby); + } catch (CloneNotSupportedException e) { + throw new SemanticException(e.getMessage()); + } + } + return null; + } + + } + +} http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java index 7dace90..781e088 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java @@ -143,6 +143,10 @@ public class Optimizer { HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_MAP_GROUPBY_SORT)) { transformations.add(new GroupByOptimizer()); } + if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVECOUNTDISTINCTOPTIMIZER) + && (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_IN_TEST) || isTezExecEngine)) { + transformations.add(new CountDistinctRewriteProc()); + } transformations.add(new ColumnPruner()); if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVE_OPTIMIZE_SKEWJOIN_COMPILETIME)) { if (!isTezExecEngine) { http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java index 38a9ef2..fe91ee7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/GroupByDesc.java @@ -367,4 +367,19 @@ public class GroupByDesc extends AbstractOperatorDesc { } return new GroupByOperatorExplainVectorization(this, vectorDesc); } + + @Override + public Object clone() { + ArrayList<java.lang.String> outputColumnNames = new ArrayList<>(); + outputColumnNames.addAll(this.outputColumnNames); + ArrayList<ExprNodeDesc> keys = new ArrayList<>(); + keys.addAll(this.keys); + ArrayList<org.apache.hadoop.hive.ql.plan.AggregationDesc> aggregators = new ArrayList<>(); + aggregators.addAll(this.aggregators); + List<Integer> listGroupingSets = new ArrayList<>(); + listGroupingSets.addAll(this.listGroupingSets); + return new GroupByDesc(this.mode, outputColumnNames, keys, aggregators, + this.groupByMemoryUsage, this.memoryThreshold, listGroupingSets, this.groupingSetsPresent, + this.groupingSetPosition, this.isDistinct); + } } http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/test/queries/clientpositive/count_dist_rewrite.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/count_dist_rewrite.q b/ql/src/test/queries/clientpositive/count_dist_rewrite.q new file mode 100644 index 0000000..0b1bc66 --- /dev/null +++ b/ql/src/test/queries/clientpositive/count_dist_rewrite.q @@ -0,0 +1,65 @@ +explain select count(distinct key) from src; + +select count(distinct key) from src; + +explain select max(key), count(distinct key) B1_CNTD from src; + +select max(key), count(distinct key) B1_CNTD 
from src; + +explain select max(key), count(distinct key), min(key) from src; + +select max(key), count(distinct key), min(key) from src; + +explain select max(key), count(distinct key), min(key), avg(key) from src; + +select max(key), count(distinct key), min(key), avg(key) from src; + +explain select count(1), count(distinct key) from src; + +select count(1), count(distinct key) from src; + +explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src; + +explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src; +select count(1), count(distinct key), cast(STDDEV(key) as int) from src; +select count(distinct key), count(1), cast(STDDEV(key) as int) from src; + +explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; + +SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src; + +explain select max(key), count(distinct key), min(key), avg(key) from src group by value; + +select max(key), count(distinct key), min(key), avg(key) from src group by value; http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/test/results/clientpositive/count_dist_rewrite.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/count_dist_rewrite.q.out b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out new file mode 100644 index 0000000..ceda918 --- /dev/null +++ b/ql/src/test/results/clientpositive/count_dist_rewrite.q.out @@ -0,0 +1,1151 @@ +PREHOOK: query: explain select count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + keys: key (type: string) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column 
stats: NONE + Group By Operator + aggregations: count(_col0) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 +PREHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 368 Basic 
stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 368 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key) B1_CNTD from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key) B1_CNTD from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 +PREHOOK: query: explain select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + File Output Operator + 
compressed: false + Statistics: Num rows: 1 Data size: 736 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 +PREHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: key + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), min(key), avg(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: string), _col3 (type: string), _col4 (type: struct<count:bigint,sum:double,input:string>) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), min(VALUE._col1), avg(VALUE._col2) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(_col1), count(_col0), min(_col2), avg(_col3) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: string), _col1 (type: bigint), _col2 (type: string), _col3 (type: struct<count:bigint,sum:double,input:string>) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(VALUE._col1), min(VALUE._col2), avg(VALUE._col3) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1148 Basic stats: COMPLETE Column stats: NONE + table: + input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +98 309 0 260.182 +PREHOOK: query: explain select count(1), count(distinct key) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col0) + mode: partial2 + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key) from src 
+POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 +PREHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(), count(key), max(value), max(key) + keys: key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col2 (type: bigint), _col4 (type: string), _col5 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), max(VALUE._col2), max(VALUE._col3) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col2), count(_col0), max(_col3), max(_col4) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: bigint), _col3 (type: string), _col4 (type: string) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), count(VALUE._col2), max(VALUE._col3), max(VALUE._col4) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 576 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select + count(*) as total, + count(key) as 
not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select + count(*) as total, + count(key) as not_null_total, + count(distinct key) as unique_days, + max(value) as max_ss_store_sk, + max(key) as max_ss_promo_sk +from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 500 309 val_98 98 +PREHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string) + outputColumnNames: _col1 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(1), stddev(_col1) + keys: _col1 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col3 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: bigint), _col3 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), stddev(VALUE._col1) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col1), count(_col0), stddev(_col2) + mode: partial2 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint), _col1 (type: bigint), _col2 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0), count(VALUE._col1), stddev(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: bigint), _col1 (type: bigint), UDFToInteger(_col2) (type: int) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 176 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(1), count(distinct key), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +500 309 142 +PREHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select count(distinct key), count(1), cast(STDDEV(key) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +309 500 142 +PREHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +POSTHOOK: query: explain SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: substr(value, 5) (type: string) + outputColumnNames: _col0 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col0), avg(_col0), max(_col0), min(_col0), std(_col0), stddev_samp(_col0), variance(_col0), var_samp(_col0) + keys: _col0 (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string) + sort order: + + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col1 (type: double), _col2 (type: struct<count:bigint,sum:double,input:string>), _col4 (type: string), _col5 (type: string), _col6 (type: struct<count:bigint,sum:double,variance:double>), _col7 (type: struct<count:bigint,sum:double,variance:double>), _col8 (type: struct<count:bigint,sum:double,variance:double>), _col9 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), max(VALUE._col2), min(VALUE._col3), std(VALUE._col4), stddev_samp(VALUE._col5), variance(VALUE._col6), var_samp(VALUE._col7) + keys: KEY._col0 (type: string) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num 
rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(_col1), avg(_col2), count(_col0), max(_col3), min(_col4), std(_col5), stddev_samp(_col6), variance(_col7), var_samp(_col8) + mode: partial2 + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: double), _col1 (type: struct<count:bigint,sum:double,input:string>), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), _col5 (type: struct<count:bigint,sum:double,variance:double>), _col6 (type: struct<count:bigint,sum:double,variance:double>), _col7 (type: struct<count:bigint,sum:double,variance:double>), _col8 (type: struct<count:bigint,sum:double,variance:double>) + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), max(VALUE._col3), min(VALUE._col4), std(VALUE._col5), stddev_samp(VALUE._col6), variance(VALUE._col7), var_samp(VALUE._col8) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col0 (type: double), _col1 (type: double), _col2 (type: bigint), _col3 (type: string), _col4 (type: string), UDFToInteger(_col5) (type: int), UDFToInteger(_col6) (type: int), UDFToInteger(_col7) (type: int), UDFToInteger(_col8) (type: int) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8 + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1392 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: SELECT + sum(substr(src.value,5)), + avg(substr(src.value,5)), + count(DISTINCT substr(src.value,5)), + max(substr(src.value,5)), + min(substr(src.value,5)), + cast(std(substr(src.value,5)) as int), + cast(stddev_samp(substr(src.value,5)) as int), + cast(variance(substr(src.value,5)) as int), + cast(var_samp(substr(src.value,5)) as int) from src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +130091.0 260.182 309 98 0 142 143 20428 20469 +PREHOOK: query: explain select 
max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: src + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: string), value (type: string) + outputColumnNames: key, value + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: max(key), count(DISTINCT key), min(key), avg(key) + keys: value (type: string), key (type: string) + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5 + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: string), _col1 (type: string) + sort order: ++ + Map-reduce partition columns: _col0 (type: string) + Statistics: Num rows: 500 Data size: 5312 Basic stats: COMPLETE Column stats: NONE + value expressions: _col2 (type: string), _col4 (type: string), _col5 (type: struct<count:bigint,sum:double,input:string>) + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), count(DISTINCT KEY._col1:0._col0), min(VALUE._col2), avg(VALUE._col3) + keys: KEY._col0 (type: string) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: _col1 (type: string), _col2 (type: bigint), _col3 (type: string), _col4 (type: double) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 250 Data size: 2656 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +PREHOOK: type: QUERY +PREHOOK: Input: default@src +#### A masked pattern was here #### +POSTHOOK: query: select max(key), count(distinct key), min(key), avg(key) from src group by value +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +#### A masked pattern was here #### +0 1 0 0.0 +10 1 10 10.0 +100 1 100 100.0 +103 1 103 103.0 +104 1 104 104.0 +105 1 105 105.0 +11 1 11 11.0 +111 1 111 111.0 +113 1 113 113.0 +114 1 114 114.0 +116 1 116 116.0 +118 1 118 118.0 +119 1 119 119.0 +12 1 12 12.0 +120 1 120 120.0 +125 1 125 125.0 +126 1 126 126.0 +128 1 128 128.0 +129 1 129 129.0 +131 1 131 131.0 +133 1 133 133.0 +134 1 134 134.0 +136 1 136 136.0 +137 1 137 137.0 +138 1 138 138.0 +143 1 143 143.0 +145 1 145 145.0 +146 1 146 146.0 +149 1 149 149.0 +15 1 15 15.0 +150 1 150 150.0 +152 1 152 152.0 +153 1 153 153.0 +155 1 155 155.0 +156 1 156 156.0 +157 1 157 157.0 +158 1 158 158.0 +160 1 160 160.0 +162 1 162 162.0 +163 1 163 163.0 +164 1 164 164.0 +165 1 165 165.0 +166 1 166 166.0 +167 1 167 167.0 +168 1 168 168.0 +169 1 169 169.0 +17 1 17 17.0 +170 1 170 170.0 +172 1 172 172.0 +174 1 174 174.0 
+175 1 175 175.0 +176 1 176 176.0 +177 1 177 177.0 +178 1 178 178.0 +179 1 179 179.0 +18 1 18 18.0 +180 1 180 180.0 +181 1 181 181.0 +183 1 183 183.0 +186 1 186 186.0 +187 1 187 187.0 +189 1 189 189.0 +19 1 19 19.0 +190 1 190 190.0 +191 1 191 191.0 +192 1 192 192.0 +193 1 193 193.0 +194 1 194 194.0 +195 1 195 195.0 +196 1 196 196.0 +197 1 197 197.0 +199 1 199 199.0 +2 1 2 2.0 +20 1 20 20.0 +200 1 200 200.0 +201 1 201 201.0 +202 1 202 202.0 +203 1 203 203.0 +205 1 205 205.0 +207 1 207 207.0 +208 1 208 208.0 +209 1 209 209.0 +213 1 213 213.0 +214 1 214 214.0 +216 1 216 216.0 +217 1 217 217.0 +218 1 218 218.0 +219 1 219 219.0 +221 1 221 221.0 +222 1 222 222.0 +223 1 223 223.0 +224 1 224 224.0 +226 1 226 226.0 +228 1 228 228.0 +229 1 229 229.0 +230 1 230 230.0 +233 1 233 233.0 +235 1 235 235.0 +237 1 237 237.0 +238 1 238 238.0 +239 1 239 239.0 +24 1 24 24.0 +241 1 241 241.0 +242 1 242 242.0 +244 1 244 244.0 +247 1 247 247.0 +248 1 248 248.0 +249 1 249 249.0 +252 1 252 252.0 +255 1 255 255.0 +256 1 256 256.0 +257 1 257 257.0 +258 1 258 258.0 +26 1 26 26.0 +260 1 260 260.0 +262 1 262 262.0 +263 1 263 263.0 +265 1 265 265.0 +266 1 266 266.0 +27 1 27 27.0 +272 1 272 272.0 +273 1 273 273.0 +274 1 274 274.0 +275 1 275 275.0 +277 1 277 277.0 +278 1 278 278.0 +28 1 28 28.0 +280 1 280 280.0 +281 1 281 281.0 +282 1 282 282.0 +283 1 283 283.0 +284 1 284 284.0 +285 1 285 285.0 +286 1 286 286.0 +287 1 287 287.0 +288 1 288 288.0 +289 1 289 289.0 +291 1 291 291.0 +292 1 292 292.0 +296 1 296 296.0 +298 1 298 298.0 +30 1 30 30.0 +302 1 302 302.0 +305 1 305 305.0 +306 1 306 306.0 +307 1 307 307.0 +308 1 308 308.0 +309 1 309 309.0 +310 1 310 310.0 +311 1 311 311.0 +315 1 315 315.0 +316 1 316 316.0 +317 1 317 317.0 +318 1 318 318.0 +321 1 321 321.0 +322 1 322 322.0 +323 1 323 323.0 +325 1 325 325.0 +327 1 327 327.0 +33 1 33 33.0 +331 1 331 331.0 +332 1 332 332.0 +333 1 333 333.0 +335 1 335 335.0 +336 1 336 336.0 +338 1 338 338.0 +339 1 339 339.0 +34 1 34 34.0 +341 1 341 341.0 +342 1 342 342.0 +344 1 344 344.0 +345 1 345 345.0 +348 1 348 348.0 +35 1 35 35.0 +351 1 351 351.0 +353 1 353 353.0 +356 1 356 356.0 +360 1 360 360.0 +362 1 362 362.0 +364 1 364 364.0 +365 1 365 365.0 +366 1 366 366.0 +367 1 367 367.0 +368 1 368 368.0 +369 1 369 369.0 +37 1 37 37.0 +373 1 373 373.0 +374 1 374 374.0 +375 1 375 375.0 +377 1 377 377.0 +378 1 378 378.0 +379 1 379 379.0 +382 1 382 382.0 +384 1 384 384.0 +386 1 386 386.0 +389 1 389 389.0 +392 1 392 392.0 +393 1 393 393.0 +394 1 394 394.0 +395 1 395 395.0 +396 1 396 396.0 +397 1 397 397.0 +399 1 399 399.0 +4 1 4 4.0 +400 1 400 400.0 +401 1 401 401.0 +402 1 402 402.0 +403 1 403 403.0 +404 1 404 404.0 +406 1 406 406.0 +407 1 407 407.0 +409 1 409 409.0 +41 1 41 41.0 +411 1 411 411.0 +413 1 413 413.0 +414 1 414 414.0 +417 1 417 417.0 +418 1 418 418.0 +419 1 419 419.0 +42 1 42 42.0 +421 1 421 421.0 +424 1 424 424.0 +427 1 427 427.0 +429 1 429 429.0 +43 1 43 43.0 +430 1 430 430.0 +431 1 431 431.0 +432 1 432 432.0 +435 1 435 435.0 +436 1 436 436.0 +437 1 437 437.0 +438 1 438 438.0 +439 1 439 439.0 +44 1 44 44.0 +443 1 443 443.0 +444 1 444 444.0 +446 1 446 446.0 +448 1 448 448.0 +449 1 449 449.0 +452 1 452 452.0 +453 1 453 453.0 +454 1 454 454.0 +455 1 455 455.0 +457 1 457 457.0 +458 1 458 458.0 +459 1 459 459.0 +460 1 460 460.0 +462 1 462 462.0 +463 1 463 463.0 +466 1 466 466.0 +467 1 467 467.0 +468 1 468 468.0 +469 1 469 469.0 +47 1 47 47.0 +470 1 470 470.0 +472 1 472 472.0 +475 1 475 475.0 +477 1 477 477.0 +478 1 478 478.0 +479 1 479 479.0 +480 1 480 480.0 +481 1 481 481.0 +482 1 482 
482.0 +483 1 483 483.0 +484 1 484 484.0 +485 1 485 485.0 +487 1 487 487.0 +489 1 489 489.0 +490 1 490 490.0 +491 1 491 491.0 +492 1 492 492.0 +493 1 493 493.0 +494 1 494 494.0 +495 1 495 495.0 +496 1 496 496.0 +497 1 497 497.0 +498 1 498 498.0 +5 1 5 5.0 +51 1 51 51.0 +53 1 53 53.0 +54 1 54 54.0 +57 1 57 57.0 +58 1 58 58.0 +64 1 64 64.0 +65 1 65 65.0 +66 1 66 66.0 +67 1 67 67.0 +69 1 69 69.0 +70 1 70 70.0 +72 1 72 72.0 +74 1 74 74.0 +76 1 76 76.0 +77 1 77 77.0 +78 1 78 78.0 +8 1 8 8.0 +80 1 80 80.0 +82 1 82 82.0 +83 1 83 83.0 +84 1 84 84.0 +85 1 85 85.0 +86 1 86 86.0 +87 1 87 87.0 +9 1 9 9.0 +90 1 90 90.0 +92 1 92 92.0 +95 1 95 95.0 +96 1 96 96.0 +97 1 97 97.0 +98 1 98 98.0 http://git-wip-us.apache.org/repos/asf/hive/blob/b560f492/ql/src/test/results/clientpositive/groupby_sort_11.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/groupby_sort_11.q.out b/ql/src/test/results/clientpositive/groupby_sort_11.q.out index 2b3bf4a..fe6bbb3 100644 --- a/ql/src/test/results/clientpositive/groupby_sort_11.q.out +++ b/ql/src/test/results/clientpositive/groupby_sort_11.q.out @@ -292,7 +292,8 @@ POSTHOOK: query: EXPLAIN select count(distinct key+key) from T1 POSTHOOK: type: QUERY STAGE DEPENDENCIES: Stage-1 is a root stage - Stage-0 depends on stages: Stage-1 + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 STAGE PLANS: Stage: Stage-1 @@ -306,24 +307,50 @@ STAGE PLANS: outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Group By Operator - aggregations: count(DISTINCT _col0) keys: _col0 (type: double) mode: hash - outputColumnNames: _col0, _col1 + outputColumnNames: _col0 Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Output Operator key expressions: _col0 (type: double) sort order: + + Map-reduce partition columns: _col0 (type: double) Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE Reduce Operator Tree: Group By Operator - aggregations: count(DISTINCT KEY._col0:0._col0) + keys: KEY._col0 (type: double) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 10 Data size: 70 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: count(_col0) + mode: partial2 + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazybinary.LazyBinarySerDe + + Stage: Stage-2 + Map Reduce + Map Operator Tree: + TableScan + Reduce Output Operator + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + value expressions: _col0 (type: bigint) + Reduce Operator Tree: + Group By Operator + aggregations: count(VALUE._col0) mode: mergepartial outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE File Output Operator compressed: false - Statistics: Num rows: 1 Data size: 16 Basic stats: COMPLETE Column stats: NONE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
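
The two-stage plans above show the shape the count-distinct rewrite gives to global aggregates: Stage-1 groups on the distinct column, so after its reducer each key is represented by a single row that also carries the other partial aggregates, and Stage-2 folds those per-key rows into the final results. As a minimal sketch only, assuming the usual src(key string, value string) test table and ignoring the partial structs the real plan threads through for stddev/avg, the rewritten shape of count(1) together with count(distinct key) corresponds to this hand-written query (an illustration, not the literal plan Hive emits):

    -- Stage 1: group on the distinct column so each key survives exactly once,
    --          carrying count(1) along as a per-key partial aggregate.
    -- Stage 2: count the surviving keys and sum the partials back up.
    SELECT count(t.key) AS distinct_keys,   -- count(distinct key)
           sum(t.cnt)   AS total_rows       -- count(1)
    FROM (
      SELECT key, count(1) AS cnt
      FROM src
      GROUP BY key
    ) t;

For contrast, the plan above for the query with an explicit GROUP BY value keeps the classic single-pass form (count(DISTINCT KEY._col1:0._col0) in a single reduce stage). A hand-written two-stage version of that grouped query would look roughly like the hypothetical sketch below; it is shown only to contrast the two plan shapes and is not something the test output produces:

    -- Hypothetical two-stage form of
    --   select max(key), count(distinct key), min(key), avg(key)
    --   from src group by value
    SELECT max(t.key), count(t.key), min(t.key), sum(t.s) / sum(t.c)
    FROM (
      SELECT value, key, sum(key) AS s, count(key) AS c
      FROM src
      GROUP BY value, key
    ) t
    GROUP BY t.value;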