This is an automated email from the ASF dual-hosted git repository.
okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 6e9c8283a94 HIVE-28489: Partition the input data of GroupBy with GroupingSet (Seonggon Namgung, reviewed by PLASH SPEED, Shohei Okumiya)
6e9c8283a94 is described below
commit 6e9c8283a94391050f1073629f0a04d3370a7cf2
Author: seonggon <[email protected]>
AuthorDate: Thu Nov 28 00:35:22 2024 +0900
HIVE-28489: Partition the input data of GroupBy with GroupingSet (Seonggon Namgung, reviewed by PLASH SPEED, Shohei Okumiya)
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 4 +
.../hive/ql/optimizer/GroupingSetOptimizer.java | 379 ++++++++++++++++
.../apache/hadoop/hive/ql/parse/TezCompiler.java | 5 +
.../groupingset_optimize_hive_28489.q | 30 ++
.../llap/groupingset_optimize_hive_28489.q.out | 479 +++++++++++++++++++++
.../perf/tpcds30tb/tez/query22.q.out | 55 ++-
.../perf/tpcds30tb/tez/query67.q.out | 61 +--
7 files changed, 968 insertions(+), 45 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 125d6acf072..cf564da18d4 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2048,6 +2048,10 @@ public class HiveConf extends Configuration {
"assumption that the original group by will reduce the data size."),
HIVE_GROUPBY_LIMIT_EXTRASTEP("hive.groupby.limit.extrastep", true, "This
parameter decides if Hive should \n" +
"create new MR job for sorting final output"),
+ HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD("hive.optimize.grouping.set.threshold", 1_000_000_000L,
+ "If the estimated number of rows emitted by a GroupBy operator with grouping sets is larger than the configured value, " +
+ "then the optimizer inserts an extra shuffle to partition the input data.\n" +
+ "Setting a negative number disables the optimization."),
// Max file num and size used to do a single copy (after that, distcp is
used)
HIVE_EXEC_COPYFILE_MAXNUMFILES("hive.exec.copyfile.maxnumfiles", 1L,
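For context, a minimal sketch of how the new threshold is consumed (assuming the Hive jars are on the classpath; the row estimate below is a made-up number, not something read from a real plan): a value of zero or less leaves plans untouched, while a GroupBy with grouping sets whose estimated output reaches the threshold gets an extra partitioning shuffle inserted in front of it.

import org.apache.hadoop.hive.conf.HiveConf;

public class GroupingSetThresholdCheck {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // Same default as the new ConfVars entry above.
    conf.setLongVar(HiveConf.ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD, 1_000_000_000L);
    long threshold = conf.getLongVar(HiveConf.ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD);

    long estimatedRows = 1_500_000_000L; // hypothetical estimate from the GroupBy's Statistics
    // TezCompiler only runs the optimizer when the threshold is positive, and the
    // optimizer itself skips GroupBys whose row estimate is below the threshold.
    boolean rewrite = threshold > 0 && estimatedRows >= threshold;
    System.out.println("insert extra shuffle: " + rewrite);
  }
}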
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
new file mode 100644
index 00000000000..2ebbf048905
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
@@ -0,0 +1,379 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.lib.SemanticDispatcher;
+import org.apache.hadoop.hive.ql.lib.SemanticGraphWalker;
+import org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor;
+import org.apache.hadoop.hive.ql.lib.SemanticRule;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Comparator;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+import java.util.stream.Collectors;
+
+public class GroupingSetOptimizer extends Transform {
+ private static final Logger LOG =
LoggerFactory.getLogger(GroupingSetOptimizer.class);
+
+ private static class GroupingSetProcessorContext implements NodeProcessorCtx
{
+ public final long bytesPerReducer;
+ public final int maxReducers;
+ public final long groupingSetThreshold;
+
+ public GroupingSetProcessorContext(HiveConf hiveConf) {
+ bytesPerReducer =
hiveConf.getLongVar(HiveConf.ConfVars.BYTES_PER_REDUCER);
+ maxReducers = hiveConf.getIntVar(HiveConf.ConfVars.MAX_REDUCERS);
+ groupingSetThreshold =
hiveConf.getLongVar(HiveConf.ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD);
+ }
+ }
+
+ private static class GroupingSetProcessor implements SemanticNodeProcessor {
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ GroupingSetProcessorContext context = (GroupingSetProcessorContext)
procCtx;
+ GroupByOperator gby = (GroupByOperator) nd;
+ if (!isGroupByFeasible(gby, context)) {
+ return null;
+ }
+
+ Operator<?> parentOp = gby.getParentOperators().get(0);
+ if (!isParentOpFeasible(parentOp)) {
+ return null;
+ }
+
+ String partitionCol = selectPartitionColumn(gby, parentOp);
+ if (partitionCol == null) {
+ return null;
+ }
+
+ LOG.info("Applying GroupingSetOptimization: partitioning the input data
of {} by {}",
+ gby, partitionCol);
+
+ ReduceSinkOperator rs = createReduceSink(parentOp, partitionCol,
context);
+
+ parentOp.removeChild(gby);
+ // gby.setParentOperators(Arrays.asList(rs));
+ // NOTE: The above expression does not work because GBY refers to _colN
+ // while input columns are VALUE._colN. Therefore, we should either modify GBY expressions
+ // or insert a new SEL that renames columns. The following code implements the latter
+ // as it is easier to implement.
+
+ SelectOperator sel = createSelect(parentOp.getSchema().getSignature(),
partitionCol, rs);
+
+ sel.setChildOperators(Arrays.asList(gby));
+ gby.setParentOperators(Arrays.asList(sel));
+
+ return null;
+ }
+
+ private boolean isGroupByFeasible(GroupByOperator gby,
GroupingSetProcessorContext context) {
+ if (!gby.getConf().isGroupingSetsPresent() || gby.getStatistics() ==
null) {
+ return false;
+ }
+
+ if (gby.getStatistics().getNumRows() < context.groupingSetThreshold) {
+ LOG.debug("Skip grouping-set optimization on a small operator: {}",
gby);
+ return false;
+ }
+
+ if (gby.getParentOperators().size() != 1) {
+ LOG.debug("Skip grouping-set optimization on a operator with multiple
parent operators: {}", gby);
+ return false;
+ }
+
+ return true;
+ }
+
+ private boolean isParentOpFeasible(Operator<?> parentOp) {
+ ReduceSinkOperator rs = null;
+
+ Operator<?> curOp = parentOp;
+ while (true) {
+ if (curOp instanceof ReduceSinkOperator) {
+ rs = (ReduceSinkOperator) curOp;
+ break;
+ }
+
+ if (curOp.getParentOperators() == null) {
+ break;
+ }
+
+ if (curOp.getParentOperators().size() == 1) {
+ curOp = curOp.getParentOperators().get(0);
+ } else if (curOp instanceof AbstractMapJoinOperator) {
+ MapJoinDesc desc = ((AbstractMapJoinOperator<?>) curOp).getConf();
+ curOp = curOp.getParentOperators().get(desc.getPosBigTable());
+ } else {
+ break;
+ }
+ }
+
+ if (rs == null) {
+ // No partitioning precedes this parentOp. Continue with the optimization.
+ return true;
+ }
+
+ if (rs.getConf().getPartitionCols() != null &&
rs.getConf().getPartitionCols().size() > 0) {
+ // This rs might be irrelevant to the target GroupBy operator. For
example, the following query:
+ // SELECT a, b, sum(c) FROM (SELECT a, b, c FROM tbl DISTRIBUTE BY
c) z GROUP BY rollup(a, b)
+ // won't be optimized although 'DISTRIBUTE BY c' is irrelevant to the
key columns of GroupBy.
+ LOG.debug("Skip grouping-set optimization in order not to introduce
possibly redundant shuffle.");
+ return false;
+ } else {
+ // No partitioning. Continue optimization.
+ return true;
+ }
+ }
+
+ private String selectPartitionColumn(GroupByOperator gby, Operator<?>
parentOp) {
+ if (parentOp.getSchema() == null || parentOp.getSchema().getSignature()
== null) {
+ LOG.debug("Skip grouping-set optimization as the parent operator {}
does not provide signature",
+ parentOp);
+ return null;
+ }
+
+ if (parentOp.getStatistics() == null ||
+ parentOp.getStatistics().getNumRows() <= 0 ||
+ parentOp.getStatistics().getColumnStats() == null) {
+ LOG.debug("Skip grouping-set optimization as the parent operator {}
does not provide statistics",
+ parentOp);
+ return null;
+ }
+
+ if (parentOp.getStatistics().getNumRows() >
gby.getStatistics().getNumRows()) {
+ LOG.debug("Skip grouping-set optimization as the parent operator {}
emits more rows than {}",
+ parentOp, gby);
+ return null;
+ }
+
+ List<String> colNamesInSignature = new ArrayList<>();
+ for (ColumnInfo pColInfo: parentOp.getSchema().getSignature()) {
+ colNamesInSignature.add(pColInfo.getInternalName());
+ }
+
+ List<Integer> groupingSetKeys =
listGroupingSetKeyPositions(gby.getConf().getListGroupingSets());
+ Set<String> candidates = new HashSet<>();
+ for (Integer groupingSetKeyPosition: groupingSetKeys) {
+ ExprNodeDesc key = gby.getConf().getKeys().get(groupingSetKeyPosition);
+
+ if (key instanceof ExprNodeColumnDesc) {
+ candidates.add(((ExprNodeColumnDesc) key).getColumn());
+ }
+ }
+ candidates.retainAll(colNamesInSignature);
+
+ List<ColStatistics> columnStatistics =
+ new ArrayList<>(parentOp.getStatistics().getColumnStats()).stream()
+ .filter(cs -> cs.getCountDistint() > 0)
+
.sorted(Comparator.comparingLong(ColStatistics::getCountDistint).reversed())
+ .collect(Collectors.toList());
+
+ String partitionCol = null;
+ for (ColStatistics col: columnStatistics) {
+ String colName = col.getColumnName();
+ if (parentOp.getColumnExprMap().containsKey(colName) &&
candidates.contains(colName)) {
+ partitionCol = colName;
+ break;
+ }
+ }
+
+ if (partitionCol == null) {
+ LOG.debug("Skip grouping-set optimization as there is no feasible
column in parent operator {}.",
+ parentOp);
+ }
+
+ return partitionCol;
+ }
+
+ private ReduceSinkOperator createReduceSink(Operator<?> parentOp, String
partitionColName,
+ GroupingSetProcessorContext context) {
+ Map<String, ExprNodeDesc> colExprMap = new HashMap<>();
+ List<ExprNodeDesc> keyColumns = new ArrayList<>();
+ List<String> keyColumnNames = new ArrayList<>();
+ List<ExprNodeDesc> valueColumns = new ArrayList<>();
+ List<String> valueColumnNames = new ArrayList<>();
+ List<ColumnInfo> signature = new ArrayList<>();
+ List<ExprNodeDesc> partCols = new ArrayList<>();
+
+ for (ColumnInfo pColInfo: parentOp.getSchema().getSignature()) {
+ ColumnInfo cColInfo = new ColumnInfo(pColInfo);
+ String pColName = pColInfo.getInternalName();
+
+ if (pColName.equals(partitionColName)) {
+ keyColumnNames.add(pColName);
+
+ String cColName = Utilities.ReduceField.KEY + "." + pColName;
+ cColInfo.setInternalName(cColName);
+ signature.add(cColInfo);
+
+ ExprNodeDesc keyExpr = new ExprNodeColumnDesc(pColInfo);
+ keyColumns.add(keyExpr);
+ colExprMap.put(cColName, keyExpr);
+
+ partCols.add(keyExpr);
+ } else {
+ valueColumnNames.add(pColName);
+
+ String cColName = Utilities.ReduceField.VALUE + "." + pColName;
+ cColInfo.setInternalName(cColName);
+ signature.add(cColInfo);
+
+ ExprNodeDesc valueExpr = new ExprNodeColumnDesc(pColInfo);
+ valueColumns.add(valueExpr);
+ colExprMap.put(cColName, valueExpr);
+ }
+ }
+
+ List<FieldSchema> valueFields =
+ PlanUtils.getFieldSchemasFromColumnList(valueColumns,
valueColumnNames, 0, "");
+ TableDesc valueTable = PlanUtils.getReduceValueTableDesc(valueFields);
+
+ List<FieldSchema> keyFields =
+ PlanUtils.getFieldSchemasFromColumnList(keyColumns, keyColumnNames,
0, "");
+ TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(keyFields, "+",
"z");
+ List<List<Integer>> distinctColumnIndices = new ArrayList<>();
+
+ // If we run SetReducerParallelism after this optimization, then we
don't have to compute numReducers.
+ int numReducers = Utilities.estimateReducers(
+ parentOp.getStatistics().getDataSize(), context.bytesPerReducer,
context.maxReducers, false);
+
+ ReduceSinkDesc rsConf = new ReduceSinkDesc(keyColumns,
keyColumns.size(), valueColumns,
+ keyColumnNames, distinctColumnIndices, valueColumnNames, -1,
partCols, numReducers, keyTable,
+ valueTable, AcidUtils.Operation.NOT_ACID);
+
+ ReduceSinkOperator rs =
+ (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsConf, new
RowSchema(signature), parentOp);
+ rs.setColumnExprMap(colExprMap);
+
+ // If we run SetReducerParallelism after this optimization, the
following code becomes unnecessary.
+ rsConf.setReducerTraits(EnumSet.of(ReducerTraits.UNIFORM,
ReducerTraits.AUTOPARALLEL));
+
+ return rs;
+ }
+
+ private SelectOperator createSelect(List<ColumnInfo> signature, String
partitionColName,
+ Operator<?> parentOp) {
+ List<String> selColNames = new ArrayList<>();
+ List<ExprNodeDesc> selColumns = new ArrayList<>();
+ List<ColumnInfo> selSignature = new ArrayList<>();
+ Map<String, ExprNodeDesc> colExprMap = new HashMap<>();
+
+ for (ColumnInfo pColInfo: signature) {
+ String origColName = pColInfo.getInternalName();
+ String rsColName;
+
+ if (origColName.equals(partitionColName)) {
+ rsColName = Utilities.ReduceField.KEY + "." + origColName;
+ } else {
+ rsColName = Utilities.ReduceField.VALUE + "." + origColName;
+ }
+
+ ColumnInfo selColInfo = new ColumnInfo(pColInfo);
+
+ ExprNodeDesc selExpr = new ExprNodeColumnDesc(pColInfo.getType(),
rsColName, null, false);
+
+ selSignature.add(selColInfo);
+ selColumns.add(selExpr);
+ selColNames.add(origColName);
+ colExprMap.put(origColName, selExpr);
+ }
+
+ SelectDesc selConf = new SelectDesc(selColumns, selColNames);
+ SelectOperator sel =
+ (SelectOperator) OperatorFactory.getAndMakeChild(selConf, new
RowSchema(selSignature), parentOp);
+ sel.setColumnExprMap(colExprMap);
+
+ return sel;
+ }
+
+ private List<Integer> listGroupingSetKeyPositions(List<Long> groupingSets)
{
+ long acc = 0L;
+ for (Long groupingSet: groupingSets) {
+ acc |= groupingSet;
+ }
+
+ BitSet bitset = BitSet.valueOf(new long[]{acc});
+ List<Integer> ret = new ArrayList<>();
+ for (int i = bitset.nextSetBit(0); i >= 0; i = bitset.nextSetBit(i + 1))
{
+ ret.add(i);
+ }
+
+ return ret;
+ }
+ }
+
+ @Override
+ public ParseContext transform(ParseContext pCtx) throws SemanticException {
+ Map<SemanticRule, SemanticNodeProcessor> testRules = new LinkedHashMap<>();
+ testRules.put(new RuleRegExp("GBY", GroupByOperator.getOperatorName() +
"%"),
+ new GroupingSetProcessor()
+ );
+
+ SemanticDispatcher disp =
+ new DefaultRuleDispatcher(null, testRules, new
GroupingSetProcessorContext(pCtx.getConf()));
+ SemanticGraphWalker ogw = new DefaultGraphWalker(disp);
+
+ List<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pCtx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+
+ return pCtx;
+ }
+}
+
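As a quick illustration of listGroupingSetKeyPositions above (a standalone sketch, not Hive code): for ROLLUP(col0, col1, col2) the plan carries grouping sets 0, 1, 3 and 7; OR-ing them yields 7 (binary 111), so key positions 0, 1 and 2 are the candidate partition columns.

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

public class GroupingSetKeyPositionsDemo {
  // Same algorithm as GroupingSetOptimizer.listGroupingSetKeyPositions.
  static List<Integer> listGroupingSetKeyPositions(List<Long> groupingSets) {
    long acc = 0L;
    for (Long groupingSet : groupingSets) {
      acc |= groupingSet;
    }
    BitSet bitset = BitSet.valueOf(new long[]{acc});
    List<Integer> ret = new ArrayList<>();
    for (int i = bitset.nextSetBit(0); i >= 0; i = bitset.nextSetBit(i + 1)) {
      ret.add(i);
    }
    return ret;
  }

  public static void main(String[] args) {
    // Grouping sets of ROLLUP(col0, col1, col2), as seen in the q.out plans below.
    System.out.println(listGroupingSetKeyPositions(List.of(0L, 1L, 3L, 7L))); // prints [0, 1, 2]
  }
}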
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index c9990a9f69f..5aafe93cc58 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -88,6 +88,7 @@ import
org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcCtx.ConstantProp
import org.apache.hadoop.hive.ql.optimizer.ConvertJoinMapJoin;
import org.apache.hadoop.hive.ql.optimizer.DynamicPartitionPruningOptimization;
import org.apache.hadoop.hive.ql.optimizer.FiltertagAppenderProc;
+import org.apache.hadoop.hive.ql.optimizer.GroupingSetOptimizer;
import org.apache.hadoop.hive.ql.optimizer.MergeJoinProc;
import org.apache.hadoop.hive.ql.optimizer.NonBlockingOpDeDupProc;
import org.apache.hadoop.hive.ql.optimizer.ParallelEdgeFixer;
@@ -498,6 +499,10 @@ public class TezCompiler extends TaskCompiler {
topNodes.addAll(procCtx.parseContext.getTopOps().values());
SemanticGraphWalker ogw = new ForwardWalker(disp);
ogw.startWalking(topNodes, null);
+
+ if (procCtx.conf.getLongVar(ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD)
> 0) {
+ new GroupingSetOptimizer().transform(procCtx.parseContext);
+ }
}
private void extendParentReduceSinkOfMapJoin(OptimizeTezProcContext procCtx)
throws SemanticException {
diff --git
a/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
new file mode 100644
index 00000000000..a8e332808d2
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
@@ -0,0 +1,30 @@
+-- SORT_QUERY_RESULTS
+
+create table grp_set_test (key string, value string, col0 int, col1 int, col2
int, col3 int);
+insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1,
1, 1, 2, 3, 100);
+
+-- Should not be optimized
+set hive.optimize.grouping.set.threshold=-1;
+explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+
+set hive.optimize.grouping.set.threshold=1;
+explain
+select col0, col1, col2, sum(col3) from (select * from grp_set_test distribute
by col0)d group by rollup(col0, col1, col2);
+
+set hive.optimize.grouping.set.threshold=1000000000;
+explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+
+-- Should be optimized
+set hive.optimize.grouping.set.threshold=1;
+explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+
+-- Should be optimized, and the selected partition key should not be col3.
+alter table grp_set_test update statistics for column col3
set('numDVs'='10000','numNulls'='10000');
+explain
+select col0, col1, col2, count(distinct col3) from grp_set_test group by
rollup(col0, col1, col2);
+
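The last test case above inflates col3's distinct-value count, and the expected outcome is that col3 is still not picked as the partition key. A minimal sketch of why, mirroring selectPartitionColumn in GroupingSetOptimizer (standalone code with simplified stand-in types and roughly the statistics of the test table; requires Java 16+ for records): columns are ranked by distinct-value count, but only grouping-set key columns are eligible, so the aggregated column col3 is skipped.

import java.util.Comparator;
import java.util.List;
import java.util.Set;

public class PartitionColumnChoiceDemo {
  // Simplified stand-in for Hive's ColStatistics: a column name and its distinct-value count.
  record ColStats(String name, long ndv) {}

  // Among the grouping-set key columns, pick the one with the highest distinct-value count.
  static String selectPartitionColumn(List<ColStats> stats, Set<String> groupingSetKeys) {
    return stats.stream()
        .filter(cs -> cs.ndv() > 0)
        .sorted(Comparator.comparingLong(ColStats::ndv).reversed())
        .map(ColStats::name)
        .filter(groupingSetKeys::contains)
        .findFirst()
        .orElse(null);
  }

  public static void main(String[] args) {
    // col3 now has the largest NDV (10000) but is only aggregated, not a ROLLUP key,
    // so the most selective key column is chosen instead.
    List<ColStats> stats = List.of(
        new ColStats("col3", 10000), new ColStats("col2", 3),
        new ColStats("col1", 2), new ColStats("col0", 1));
    System.out.println(selectPartitionColumn(stats, Set.of("col0", "col1", "col2"))); // col2
  }
}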
diff --git
a/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
new file mode 100644
index 00000000000..c7741dbe3dc
--- /dev/null
+++
b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
@@ -0,0 +1,479 @@
+PREHOOK: query: create table grp_set_test (key string, value string, col0 int,
col1 int, col2 int, col3 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@grp_set_test
+POSTHOOK: query: create table grp_set_test (key string, value string, col0
int, col1 int, col2 int, col3 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@grp_set_test
+PREHOOK: query: insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1,
2, 2, 10), (1, 1, 1, 2, 3, 100)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@grp_set_test
+POSTHOOK: query: insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1,
2, 2, 10), (1, 1, 1, 2, 3, 100)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@grp_set_test
+POSTHOOK: Lineage: grp_set_test.col0 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.col1 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.col2 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.col3 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.key SCRIPT []
+POSTHOOK: Lineage: grp_set_test.value SCRIPT []
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type:
int), 0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from (select * from grp_set_test distribute
by col0)d group by rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from (select * from grp_set_test distribute
by col0)d group by rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: _col2, _col3, _col4, _col5
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ null sort order:
+ sort order:
+ Map-reduce partition columns: _col2 (type: int)
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: int), _col3 (type: int),
_col4 (type: int), _col5 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col2 (type: int), VALUE._col3 (type: int),
VALUE._col4 (type: int), VALUE._col5 (type: int)
+ outputColumnNames: _col2, _col3, _col4, _col5
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(_col5)
+ keys: _col2 (type: int), _col3 (type: int), _col4 (type:
int), 0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type:
int), 0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+1 1 1 1
+1 1 NULL 1
+1 2 2 10
+1 2 3 100
+1 2 NULL 110
+1 NULL NULL 111
+NULL NULL NULL 111
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: col2 (type: int)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: col2 (type: int)
+ value expressions: col0 (type: int), col1 (type: int),
col3 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE.col0 (type: int), VALUE.col1 (type: int),
KEY.col2 (type: int), VALUE.col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Group By Operator
+ aggregations: sum(col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type: int),
0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+1 1 1 1
+1 1 NULL 1
+1 2 2 10
+1 2 3 100
+1 2 NULL 110
+1 NULL NULL 111
+NULL NULL NULL 111
+PREHOOK: query: alter table grp_set_test update statistics for column col3
set('numDVs'='10000','numNulls'='10000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@grp_set_test
+PREHOOK: Output: default@grp_set_test
+POSTHOOK: query: alter table grp_set_test update statistics for column col3
set('numDVs'='10000','numNulls'='10000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@grp_set_test
+POSTHOOK: Output: default@grp_set_test
+PREHOOK: query: explain
+select col0, col1, col2, count(distinct col3) from grp_set_test group by
rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, count(distinct col3) from grp_set_test group by
rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 36 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: col2 (type: int)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: col2 (type: int)
+ value expressions: col0 (type: int), col1 (type: int),
col3 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE.col0 (type: int), VALUE.col1 (type: int),
KEY.col2 (type: int), VALUE.col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Group By Operator
+ aggregations: count(DISTINCT col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type: int),
0L (type: bigint), col3 (type: int)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Statistics: Num rows: 6 Data size: 172 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint), _col4 (type: int)
+ null sort order: zzzzz
+ sort order: +++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 172 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(DISTINCT KEY._col4:0._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
diff --git
a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
index 6c5e258b368..592b285f409 100644
--- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
+++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
@@ -7,9 +7,10 @@ STAGE PLANS:
Tez
#### A masked pattern was here ####
Edges:
- Map 1 <- Map 4 (BROADCAST_EDGE), Map 5 (BROADCAST_EDGE)
+ Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+ Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -30,7 +31,7 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col1, _col2
input vertices:
- 1 Map 4
+ 1 Map 5
Statistics: Num rows: 321094889 Data size: 3527549756
Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
condition map:
@@ -40,26 +41,17 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col2, _col5, _col6, _col7, _col8
input vertices:
- 1 Map 5
+ 1 Map 6
Statistics: Num rows: 321094889 Data size:
125864702465 Basic stats: COMPLETE Column stats: COMPLETE
- Group By Operator
- aggregations: sum(_col2), count(_col2)
- keys: _col5 (type: char(50)), _col6 (type:
char(50)), _col7 (type: char(50)), _col8 (type: char(50)), 0L (type: bigint)
- grouping sets: 0, 2, 6, 14, 15
- minReductionHashAggr: 0.83334786
- mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3,
_col4, _col5, _col6
- Statistics: Num rows: 1605474445 Data size:
663060945785 Basic stats: COMPLETE Column stats: COMPLETE
- Reduce Output Operator
- key expressions: _col0 (type: char(50)), _col1
(type: char(50)), _col2 (type: char(50)), _col3 (type: char(50)), _col4 (type:
bigint)
- null sort order: zzzzz
- sort order: +++++
- Map-reduce partition columns: _col0 (type:
char(50)), _col1 (type: char(50)), _col2 (type: char(50)), _col3 (type:
char(50)), _col4 (type: bigint)
- Statistics: Num rows: 1605474445 Data size:
663060945785 Basic stats: COMPLETE Column stats: COMPLETE
- value expressions: _col5 (type: bigint), _col6
(type: bigint)
+ Reduce Output Operator
+ key expressions: _col8 (type: char(50))
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col8 (type: char(50))
+ value expressions: _col2 (type: int), _col5 (type:
char(50)), _col6 (type: char(50)), _col7 (type: char(50))
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 4
+ Map 5
Map Operator Tree:
TableScan
alias: date_dim
@@ -80,7 +72,7 @@ STAGE PLANS:
Statistics: Num rows: 359 Data size: 2872 Basic stats:
COMPLETE Column stats: COMPLETE
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 5
+ Map 6
Map Operator Tree:
TableScan
alias: item
@@ -99,6 +91,27 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col2 (type: int), VALUE._col5 (type:
char(50)), VALUE._col6 (type: char(50)), VALUE._col7 (type: char(50)),
KEY._col8 (type: char(50))
+ outputColumnNames: _col2, _col5, _col6, _col7, _col8
+ Group By Operator
+ aggregations: sum(_col2), count(_col2)
+ keys: _col5 (type: char(50)), _col6 (type: char(50)), _col7
(type: char(50)), _col8 (type: char(50)), 0L (type: bigint)
+ grouping sets: 0, 2, 6, 14, 15
+ minReductionHashAggr: 0.83334786
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6
+ Statistics: Num rows: 1605474445 Data size: 663060945785
Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: char(50)), _col1 (type:
char(50)), _col2 (type: char(50)), _col3 (type: char(50)), _col4 (type: bigint)
+ null sort order: zzzzz
+ sort order: +++++
+ Map-reduce partition columns: _col0 (type: char(50)),
_col1 (type: char(50)), _col2 (type: char(50)), _col3 (type: char(50)), _col4
(type: bigint)
+ Statistics: Num rows: 1605474445 Data size: 663060945785
Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col5 (type: bigint), _col6 (type:
bigint)
+ Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
@@ -123,7 +136,7 @@ STAGE PLANS:
null sort order: zzzzz
sort order: +++++
Statistics: Num rows: 1605474445 Data size: 637373354665
Basic stats: COMPLETE Column stats: COMPLETE
- Reducer 3
+ Reducer 4
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
diff --git
a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
index 2ce3245886c..374699407bc 100644
--- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
+++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
@@ -7,10 +7,11 @@ STAGE PLANS:
Tez
#### A masked pattern was here ####
Edges:
- Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7
(BROADCAST_EDGE)
+ Map 1 <- Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE), Map 8
(BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
+ Reducer 5 <- Reducer 4 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -35,7 +36,7 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col0, _col1, _col3, _col5, _col6,
_col7
input vertices:
- 1 Map 5
+ 1 Map 6
Statistics: Num rows: 15840066266 Data size:
2202441686776 Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
condition map:
@@ -45,7 +46,7 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col0, _col3, _col5, _col6,
_col7, _col9
input vertices:
- 1 Map 6
+ 1 Map 7
Statistics: Num rows: 15840066266 Data size:
3674895373712 Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
condition map:
@@ -55,26 +56,17 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col3, _col5, _col6, _col7,
_col9, _col11, _col12, _col13, _col14
input vertices:
- 1 Map 7
+ 1 Map 8
Statistics: Num rows: 15840066266 Data size:
9709960621058 Basic stats: COMPLETE Column stats: COMPLETE
- Group By Operator
- aggregations: sum(_col3)
- keys: _col5 (type: int), _col6 (type: int),
_col7 (type: int), _col9 (type: string), _col11 (type: char(50)), _col12 (type:
char(50)), _col13 (type: char(50)), _col14 (type: char(50)), 0L (type: bigint)
- grouping sets: 0, 16, 80, 112, 240, 241, 249,
253, 255
- minReductionHashAggr: 0.9867937
- mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3,
_col4, _col5, _col6, _col7, _col8, _col9
- Statistics: Num rows: 142560596394 Data size:
88530130360674 Basic stats: COMPLETE Column stats: COMPLETE
- Reduce Output Operator
- key expressions: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: string), _col4 (type: char(50)),
_col5 (type: char(50)), _col6 (type: char(50)), _col7 (type: char(50)), _col8
(type: bigint)
- null sort order: zzzzzzzzz
- sort order: +++++++++
- Map-reduce partition columns: _col0 (type:
int), _col1 (type: int), _col2 (type: int), _col3 (type: string), _col4 (type:
char(50)), _col5 (type: char(50)), _col6 (type: char(50)), _col7 (type:
char(50)), _col8 (type: bigint)
- Statistics: Num rows: 142560596394 Data size:
88530130360674 Basic stats: COMPLETE Column stats: COMPLETE
- value expressions: _col9 (type: decimal(28,2))
+ Reduce Output Operator
+ key expressions: _col14 (type: char(50))
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col14 (type:
char(50))
+ value expressions: _col3 (type: decimal(18,2)),
_col5 (type: int), _col6 (type: int), _col7 (type: int), _col9 (type: string),
_col11 (type: char(50)), _col12 (type: char(50)), _col13 (type: char(50))
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 5
+ Map 6
Map Operator Tree:
TableScan
alias: date_dim
@@ -112,7 +104,7 @@ STAGE PLANS:
Target Vertex: Map 1
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 6
+ Map 7
Map Operator Tree:
TableScan
alias: store
@@ -130,7 +122,7 @@ STAGE PLANS:
value expressions: _col1 (type: string)
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 7
+ Map 8
Map Operator Tree:
TableScan
alias: item
@@ -149,6 +141,27 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col3 (type: decimal(18,2)), VALUE._col5
(type: int), VALUE._col6 (type: int), VALUE._col7 (type: int), VALUE._col9
(type: string), VALUE._col11 (type: char(50)), VALUE._col12 (type: char(50)),
VALUE._col13 (type: char(50)), KEY._col14 (type: char(50))
+ outputColumnNames: _col3, _col5, _col6, _col7, _col9, _col11,
_col12, _col13, _col14
+ Group By Operator
+ aggregations: sum(_col3)
+ keys: _col5 (type: int), _col6 (type: int), _col7 (type:
int), _col9 (type: string), _col11 (type: char(50)), _col12 (type: char(50)),
_col13 (type: char(50)), _col14 (type: char(50)), 0L (type: bigint)
+ grouping sets: 0, 16, 80, 112, 240, 241, 249, 253, 255
+ minReductionHashAggr: 0.9867937
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9
+ Statistics: Num rows: 142560596394 Data size: 88530130360674
Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: string), _col4 (type: char(50)), _col5 (type:
char(50)), _col6 (type: char(50)), _col7 (type: char(50)), _col8 (type: bigint)
+ null sort order: zzzzzzzzz
+ sort order: +++++++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: string), _col4 (type: char(50)),
_col5 (type: char(50)), _col6 (type: char(50)), _col7 (type: char(50)), _col8
(type: bigint)
+ Statistics: Num rows: 142560596394 Data size:
88530130360674 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col9 (type: decimal(28,2))
+ Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
@@ -176,7 +189,7 @@ STAGE PLANS:
Map-reduce partition columns: _col6 (type: char(50))
Statistics: Num rows: 142560596394 Data size:
87389645589522 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: string), _col4 (type: char(50)), _col5 (type:
char(50)), _col7 (type: char(50))
- Reducer 3
+ Reducer 4
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
@@ -222,7 +235,7 @@ STAGE PLANS:
null sort order: zzzzzzzzzz
sort order: ++++++++++
Statistics: Num rows: 47520198798 Data size:
29319962658366 Basic stats: COMPLETE Column stats: COMPLETE
- Reducer 4
+ Reducer 5
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator