This is an automated email from the ASF dual-hosted git repository.
okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new 6e9c8283a94 HIVE-28489: Partition the input data of GroupBy with GroupingSet (Seonggon Namgung, reviewed by PLASH SPEED, Shohei Okumiya)
6e9c8283a94 is described below
commit 6e9c8283a94391050f1073629f0a04d3370a7cf2
Author: seonggon <[email protected]>
AuthorDate: Thu Nov 28 00:35:22 2024 +0900
HIVE-28489: Partition the input data of GroupBy with GroupingSet (Seonggon Namgung, reviewed by PLASH SPEED, Shohei Okumiya)
---
.../java/org/apache/hadoop/hive/conf/HiveConf.java | 4 +
.../hive/ql/optimizer/GroupingSetOptimizer.java | 379 ++++++++++++++++
.../apache/hadoop/hive/ql/parse/TezCompiler.java | 5 +
.../groupingset_optimize_hive_28489.q | 30 ++
.../llap/groupingset_optimize_hive_28489.q.out | 479 +++++++++++++++++++++
.../perf/tpcds30tb/tez/query22.q.out | 55 ++-
.../perf/tpcds30tb/tez/query67.q.out | 61 +--
7 files changed, 968 insertions(+), 45 deletions(-)
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 125d6acf072..cf564da18d4 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2048,6 +2048,10 @@ public class HiveConf extends Configuration {
"assumption that the original group by will reduce the data size."),
HIVE_GROUPBY_LIMIT_EXTRASTEP("hive.groupby.limit.extrastep", true, "This
parameter decides if Hive should \n" +
"create new MR job for sorting final output"),
+ HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD("hive.optimize.grouping.set.threshold", 1_000_000_000L,
+ "If the estimated number of rows emitted by a GroupBy operator with grouping sets is larger than the configured value, " +
+ "then the optimizer inserts an extra shuffle to partition the input data.\n" +
+ "Setting a negative number disables the optimization."),
// Max file num and size used to do a single copy (after that, distcp is
used)
HIVE_EXEC_COPYFILE_MAXNUMFILES("hive.exec.copyfile.maxnumfiles", 1L,
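For context, a minimal sketch of how the new threshold is consumed (assuming the Hive jars are on the classpath; the row estimate below is a made-up number, not something read from a real plan): a value of zero or less leaves plans untouched, while a GroupBy with grouping sets whose estimated output reaches the threshold gets an extra partitioning shuffle inserted in front of it.

import org.apache.hadoop.hive.conf.HiveConf;

public class GroupingSetThresholdCheck {
  public static void main(String[] args) {
    HiveConf conf = new HiveConf();
    // Same default as the new ConfVars entry above.
    conf.setLongVar(HiveConf.ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD, 1_000_000_000L);
    long threshold = conf.getLongVar(HiveConf.ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD);

    long estimatedRows = 1_500_000_000L; // hypothetical estimate from the GroupBy's Statistics
    // TezCompiler only runs the optimizer when the threshold is positive, and the
    // optimizer itself skips GroupBys whose row estimate is below the threshold.
    boolean rewrite = threshold > 0 && estimatedRows >= threshold;
    System.out.println("insert extra shuffle: " + rewrite);
  }
}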
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
new file mode 100644
index 00000000000..2ebbf048905
--- /dev/null
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java
@@ -0,0 +1,379 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.lib.SemanticDispatcher;
+import org.apache.hadoop.hive.ql.lib.SemanticGraphWalker;
+import org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor;
+import org.apache.hadoop.hive.ql.lib.SemanticRule;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc.ReducerTraits;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Comparator;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+import java.util.stream.Collectors;
+
+public class GroupingSetOptimizer extends Transform {
+ private static final Logger LOG =
LoggerFactory.getLogger(GroupingSetOptimizer.class);
+
+ private static class GroupingSetProcessorContext implements NodeProcessorCtx
{
+ public final long bytesPerReducer;
+ public final int maxReducers;
+ public final long groupingSetThreshold;
+
+ public GroupingSetProcessorContext(HiveConf hiveConf) {
+ bytesPerReducer =
hiveConf.getLongVar(HiveConf.ConfVars.BYTES_PER_REDUCER);
+ maxReducers = hiveConf.getIntVar(HiveConf.ConfVars.MAX_REDUCERS);
+ groupingSetThreshold =
hiveConf.getLongVar(HiveConf.ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD);
+ }
+ }
+
+ private static class GroupingSetProcessor implements SemanticNodeProcessor {
+ @Override
+ public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+ Object... nodeOutputs) throws SemanticException {
+ GroupingSetProcessorContext context = (GroupingSetProcessorContext)
procCtx;
+ GroupByOperator gby = (GroupByOperator) nd;
+ if (!isGroupByFeasible(gby, context)) {
+ return null;
+ }
+
+ Operator<?> parentOp = gby.getParentOperators().get(0);
+ if (!isParentOpFeasible(parentOp)) {
+ return null;
+ }
+
+ String partitionCol = selectPartitionColumn(gby, parentOp);
+ if (partitionCol == null) {
+ return null;
+ }
+
+ LOG.info("Applying GroupingSetOptimization: partitioning the input data
of {} by {}",
+ gby, partitionCol);
+
+ ReduceSinkOperator rs = createReduceSink(parentOp, partitionCol,
context);
+
+ parentOp.removeChild(gby);
+ // gby.setParentOperators(Arrays.asList(rs));
+ // NOTE: The above expression does not work because GBY refers to _colN
+ // while input columns are VALUE._colN. Therefore, we should either modify GBY expressions
+ // or insert a new SEL that renames columns. The following code implements the latter
+ // as it is easier to implement.
+
+ SelectOperator sel = createSelect(parentOp.getSchema().getSignature(),
partitionCol, rs);
+
+ sel.setChildOperators(Arrays.asList(gby));
+ gby.setParentOperators(Arrays.asList(sel));
+
+ return null;
+ }
+
+ private boolean isGroupByFeasible(GroupByOperator gby,
GroupingSetProcessorContext context) {
+ if (!gby.getConf().isGroupingSetsPresent() || gby.getStatistics() ==
null) {
+ return false;
+ }
+
+ if (gby.getStatistics().getNumRows() < context.groupingSetThreshold) {
+ LOG.debug("Skip grouping-set optimization on a small operator: {}",
gby);
+ return false;
+ }
+
+ if (gby.getParentOperators().size() != 1) {
+ LOG.debug("Skip grouping-set optimization on a operator with multiple
parent operators: {}", gby);
+ return false;
+ }
+
+ return true;
+ }
+
+ private boolean isParentOpFeasible(Operator<?> parentOp) {
+ ReduceSinkOperator rs = null;
+
+ Operator<?> curOp = parentOp;
+ while (true) {
+ if (curOp instanceof ReduceSinkOperator) {
+ rs = (ReduceSinkOperator) curOp;
+ break;
+ }
+
+ if (curOp.getParentOperators() == null) {
+ break;
+ }
+
+ if (curOp.getParentOperators().size() == 1) {
+ curOp = curOp.getParentOperators().get(0);
+ } else if (curOp instanceof AbstractMapJoinOperator) {
+ MapJoinDesc desc = ((AbstractMapJoinOperator<?>) curOp).getConf();
+ curOp = curOp.getParentOperators().get(desc.getPosBigTable());
+ } else {
+ break;
+ }
+ }
+
+ if (rs == null) {
+ // No partitioning precedes this parentOp. Continue with the optimization.
+ return true;
+ }
+
+ if (rs.getConf().getPartitionCols() != null &&
rs.getConf().getPartitionCols().size() > 0) {
+ // This rs might be irrelevant to the target GroupBy operator. For
example, the following query:
+ // SELECT a, b, sum(c) FROM (SELECT a, b, c FROM tbl DISTRIBUTE BY
c) z GROUP BY rollup(a, b)
+ // won't be optimized although 'DISTRIBUTE BY c' is irrelevant to the
key columns of GroupBy.
+ LOG.debug("Skip grouping-set optimization in order not to introduce
possibly redundant shuffle.");
+ return false;
+ } else {
+ // No partitioning. Continue optimization.
+ return true;
+ }
+ }
+
+ private String selectPartitionColumn(GroupByOperator gby, Operator<?>
parentOp) {
+ if (parentOp.getSchema() == null || parentOp.getSchema().getSignature()
== null) {
+ LOG.debug("Skip grouping-set optimization as the parent operator {}
does not provide signature",
+ parentOp);
+ return null;
+ }
+
+ if (parentOp.getStatistics() == null ||
+ parentOp.getStatistics().getNumRows() <= 0 ||
+ parentOp.getStatistics().getColumnStats() == null) {
+ LOG.debug("Skip grouping-set optimization as the parent operator {}
does not provide statistics",
+ parentOp);
+ return null;
+ }
+
+ if (parentOp.getStatistics().getNumRows() >
gby.getStatistics().getNumRows()) {
+ LOG.debug("Skip grouping-set optimization as the parent operator {}
emits more rows than {}",
+ parentOp, gby);
+ return null;
+ }
+
+ List<String> colNamesInSignature = new ArrayList<>();
+ for (ColumnInfo pColInfo: parentOp.getSchema().getSignature()) {
+ colNamesInSignature.add(pColInfo.getInternalName());
+ }
+
+ List<Integer> groupingSetKeys =
listGroupingSetKeyPositions(gby.getConf().getListGroupingSets());
+ Set<String> candidates = new HashSet<>();
+ for (Integer groupingSetKeyPosition: groupingSetKeys) {
+ ExprNodeDesc key = gby.getConf().getKeys().get(groupingSetKeyPosition);
+
+ if (key instanceof ExprNodeColumnDesc) {
+ candidates.add(((ExprNodeColumnDesc) key).getColumn());
+ }
+ }
+ candidates.retainAll(colNamesInSignature);
+
+ List<ColStatistics> columnStatistics =
+ new ArrayList<>(parentOp.getStatistics().getColumnStats()).stream()
+ .filter(cs -> cs.getCountDistint() > 0)
+
.sorted(Comparator.comparingLong(ColStatistics::getCountDistint).reversed())
+ .collect(Collectors.toList());
+
+ String partitionCol = null;
+ for (ColStatistics col: columnStatistics) {
+ String colName = col.getColumnName();
+ if (parentOp.getColumnExprMap().containsKey(colName) &&
candidates.contains(colName)) {
+ partitionCol = colName;
+ break;
+ }
+ }
+
+ if (partitionCol == null) {
+ LOG.debug("Skip grouping-set optimization as there is no feasible
column in parent operator {}.",
+ parentOp);
+ }
+
+ return partitionCol;
+ }
+
+ private ReduceSinkOperator createReduceSink(Operator<?> parentOp, String
partitionColName,
+ GroupingSetProcessorContext context) {
+ Map<String, ExprNodeDesc> colExprMap = new HashMap<>();
+ List<ExprNodeDesc> keyColumns = new ArrayList<>();
+ List<String> keyColumnNames = new ArrayList<>();
+ List<ExprNodeDesc> valueColumns = new ArrayList<>();
+ List<String> valueColumnNames = new ArrayList<>();
+ List<ColumnInfo> signature = new ArrayList<>();
+ List<ExprNodeDesc> partCols = new ArrayList<>();
+
+ for (ColumnInfo pColInfo: parentOp.getSchema().getSignature()) {
+ ColumnInfo cColInfo = new ColumnInfo(pColInfo);
+ String pColName = pColInfo.getInternalName();
+
+ if (pColName.equals(partitionColName)) {
+ keyColumnNames.add(pColName);
+
+ String cColName = Utilities.ReduceField.KEY + "." + pColName;
+ cColInfo.setInternalName(cColName);
+ signature.add(cColInfo);
+
+ ExprNodeDesc keyExpr = new ExprNodeColumnDesc(pColInfo);
+ keyColumns.add(keyExpr);
+ colExprMap.put(cColName, keyExpr);
+
+ partCols.add(keyExpr);
+ } else {
+ valueColumnNames.add(pColName);
+
+ String cColName = Utilities.ReduceField.VALUE + "." + pColName;
+ cColInfo.setInternalName(cColName);
+ signature.add(cColInfo);
+
+ ExprNodeDesc valueExpr = new ExprNodeColumnDesc(pColInfo);
+ valueColumns.add(valueExpr);
+ colExprMap.put(cColName, valueExpr);
+ }
+ }
+
+ List<FieldSchema> valueFields =
+ PlanUtils.getFieldSchemasFromColumnList(valueColumns,
valueColumnNames, 0, "");
+ TableDesc valueTable = PlanUtils.getReduceValueTableDesc(valueFields);
+
+ List<FieldSchema> keyFields =
+ PlanUtils.getFieldSchemasFromColumnList(keyColumns, keyColumnNames,
0, "");
+ TableDesc keyTable = PlanUtils.getReduceKeyTableDesc(keyFields, "+",
"z");
+ List<List<Integer>> distinctColumnIndices = new ArrayList<>();
+
+ // If we run SetReducerParallelism after this optimization, then we
don't have to compute numReducers.
+ int numReducers = Utilities.estimateReducers(
+ parentOp.getStatistics().getDataSize(), context.bytesPerReducer,
context.maxReducers, false);
+
+ ReduceSinkDesc rsConf = new ReduceSinkDesc(keyColumns,
keyColumns.size(), valueColumns,
+ keyColumnNames, distinctColumnIndices, valueColumnNames, -1,
partCols, numReducers, keyTable,
+ valueTable, AcidUtils.Operation.NOT_ACID);
+
+ ReduceSinkOperator rs =
+ (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsConf, new
RowSchema(signature), parentOp);
+ rs.setColumnExprMap(colExprMap);
+
+ // If we run SetReducerParallelism after this optimization, the
following code becomes unnecessary.
+ rsConf.setReducerTraits(EnumSet.of(ReducerTraits.UNIFORM,
ReducerTraits.AUTOPARALLEL));
+
+ return rs;
+ }
+
+ private SelectOperator createSelect(List<ColumnInfo> signature, String
partitionColName,
+ Operator<?> parentOp) {
+ List<String> selColNames = new ArrayList<>();
+ List<ExprNodeDesc> selColumns = new ArrayList<>();
+ List<ColumnInfo> selSignature = new ArrayList<>();
+ Map<String, ExprNodeDesc> colExprMap = new HashMap<>();
+
+ for (ColumnInfo pColInfo: signature) {
+ String origColName = pColInfo.getInternalName();
+ String rsColName;
+
+ if (origColName.equals(partitionColName)) {
+ rsColName = Utilities.ReduceField.KEY + "." + origColName;
+ } else {
+ rsColName = Utilities.ReduceField.VALUE + "." + origColName;
+ }
+
+ ColumnInfo selColInfo = new ColumnInfo(pColInfo);
+
+ ExprNodeDesc selExpr = new ExprNodeColumnDesc(pColInfo.getType(),
rsColName, null, false);
+
+ selSignature.add(selColInfo);
+ selColumns.add(selExpr);
+ selColNames.add(origColName);
+ colExprMap.put(origColName, selExpr);
+ }
+
+ SelectDesc selConf = new SelectDesc(selColumns, selColNames);
+ SelectOperator sel =
+ (SelectOperator) OperatorFactory.getAndMakeChild(selConf, new
RowSchema(selSignature), parentOp);
+ sel.setColumnExprMap(colExprMap);
+
+ return sel;
+ }
+
+ private List<Integer> listGroupingSetKeyPositions(List<Long> groupingSets)
{
+ long acc = 0L;
+ for (Long groupingSet: groupingSets) {
+ acc |= groupingSet;
+ }
+
+ BitSet bitset = BitSet.valueOf(new long[]{acc});
+ List<Integer> ret = new ArrayList<>();
+ for (int i = bitset.nextSetBit(0); i >= 0; i = bitset.nextSetBit(i + 1))
{
+ ret.add(i);
+ }
+
+ return ret;
+ }
+ }
+
+ @Override
+ public ParseContext transform(ParseContext pCtx) throws SemanticException {
+ Map<SemanticRule, SemanticNodeProcessor> testRules = new LinkedHashMap<>();
+ testRules.put(new RuleRegExp("GBY", GroupByOperator.getOperatorName() +
"%"),
+ new GroupingSetProcessor()
+ );
+
+ SemanticDispatcher disp =
+ new DefaultRuleDispatcher(null, testRules, new
GroupingSetProcessorContext(pCtx.getConf()));
+ SemanticGraphWalker ogw = new DefaultGraphWalker(disp);
+
+ List<Node> topNodes = new ArrayList<Node>();
+ topNodes.addAll(pCtx.getTopOps().values());
+ ogw.startWalking(topNodes, null);
+
+ return pCtx;
+ }
+}
+
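As a quick illustration of listGroupingSetKeyPositions above (a standalone sketch, not Hive code): for ROLLUP(col0, col1, col2) the plan carries grouping sets 0, 1, 3 and 7; OR-ing them yields 7 (binary 111), so key positions 0, 1 and 2 are the candidate partition columns.

import java.util.ArrayList;
import java.util.BitSet;
import java.util.List;

public class GroupingSetKeyPositionsDemo {
  // Same algorithm as GroupingSetOptimizer.listGroupingSetKeyPositions.
  static List<Integer> listGroupingSetKeyPositions(List<Long> groupingSets) {
    long acc = 0L;
    for (Long groupingSet : groupingSets) {
      acc |= groupingSet;
    }
    BitSet bitset = BitSet.valueOf(new long[]{acc});
    List<Integer> ret = new ArrayList<>();
    for (int i = bitset.nextSetBit(0); i >= 0; i = bitset.nextSetBit(i + 1)) {
      ret.add(i);
    }
    return ret;
  }

  public static void main(String[] args) {
    // Grouping sets of ROLLUP(col0, col1, col2), as seen in the q.out plans below.
    System.out.println(listGroupingSetKeyPositions(List.of(0L, 1L, 3L, 7L))); // prints [0, 1, 2]
  }
}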
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
index c9990a9f69f..5aafe93cc58 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/TezCompiler.java
@@ -88,6 +88,7 @@ import
org.apache.hadoop.hive.ql.optimizer.ConstantPropagateProcCtx.ConstantProp
import org.apache.hadoop.hive.ql.optimizer.ConvertJoinMapJoin;
import org.apache.hadoop.hive.ql.optimizer.DynamicPartitionPruningOptimization;
import org.apache.hadoop.hive.ql.optimizer.FiltertagAppenderProc;
+import org.apache.hadoop.hive.ql.optimizer.GroupingSetOptimizer;
import org.apache.hadoop.hive.ql.optimizer.MergeJoinProc;
import org.apache.hadoop.hive.ql.optimizer.NonBlockingOpDeDupProc;
import org.apache.hadoop.hive.ql.optimizer.ParallelEdgeFixer;
@@ -498,6 +499,10 @@ public class TezCompiler extends TaskCompiler {
topNodes.addAll(procCtx.parseContext.getTopOps().values());
SemanticGraphWalker ogw = new ForwardWalker(disp);
ogw.startWalking(topNodes, null);
+
+ if (procCtx.conf.getLongVar(ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD)
> 0) {
+ new GroupingSetOptimizer().transform(procCtx.parseContext);
+ }
}
private void extendParentReduceSinkOfMapJoin(OptimizeTezProcContext procCtx)
throws SemanticException {
diff --git
a/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
new file mode 100644
index 00000000000..a8e332808d2
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/groupingset_optimize_hive_28489.q
@@ -0,0 +1,30 @@
+-- SORT_QUERY_RESULTS
+
+create table grp_set_test (key string, value string, col0 int, col1 int, col2
int, col3 int);
+insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1, 2, 2, 10), (1,
1, 1, 2, 3, 100);
+
+-- Should not be optimized
+set hive.optimize.grouping.set.threshold=-1;
+explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+
+set hive.optimize.grouping.set.threshold=1;
+explain
+select col0, col1, col2, sum(col3) from (select * from grp_set_test distribute
by col0)d group by rollup(col0, col1, col2);
+
+set hive.optimize.grouping.set.threshold=1000000000;
+explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+
+-- Should be optimized
+set hive.optimize.grouping.set.threshold=1;
+explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2);
+
+-- Should be optimized, and the selected partition key should not be col3.
+alter table grp_set_test update statistics for column col3
set('numDVs'='10000','numNulls'='10000');
+explain
+select col0, col1, col2, count(distinct col3) from grp_set_test group by
rollup(col0, col1, col2);
+
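The last test case above inflates col3's distinct-value count, and the expected outcome is that col3 is still not picked as the partition key. A minimal sketch of why, mirroring selectPartitionColumn in GroupingSetOptimizer (standalone code with simplified stand-in types and roughly the statistics of the test table; requires Java 16+ for records): columns are ranked by distinct-value count, but only grouping-set key columns are eligible, so the aggregated column col3 is skipped.

import java.util.Comparator;
import java.util.List;
import java.util.Set;

public class PartitionColumnChoiceDemo {
  // Simplified stand-in for Hive's ColStatistics: a column name and its distinct-value count.
  record ColStats(String name, long ndv) {}

  // Among the grouping-set key columns, pick the one with the highest distinct-value count.
  static String selectPartitionColumn(List<ColStats> stats, Set<String> groupingSetKeys) {
    return stats.stream()
        .filter(cs -> cs.ndv() > 0)
        .sorted(Comparator.comparingLong(ColStats::ndv).reversed())
        .map(ColStats::name)
        .filter(groupingSetKeys::contains)
        .findFirst()
        .orElse(null);
  }

  public static void main(String[] args) {
    // col3 now has the largest NDV (10000) but is only aggregated, not a ROLLUP key,
    // so the most selective key column is chosen instead.
    List<ColStats> stats = List.of(
        new ColStats("col3", 10000), new ColStats("col2", 3),
        new ColStats("col1", 2), new ColStats("col0", 1));
    System.out.println(selectPartitionColumn(stats, Set.of("col0", "col1", "col2"))); // col2
  }
}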
diff --git
a/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
new file mode 100644
index 00000000000..c7741dbe3dc
--- /dev/null
+++
b/ql/src/test/results/clientpositive/llap/groupingset_optimize_hive_28489.q.out
@@ -0,0 +1,479 @@
+PREHOOK: query: create table grp_set_test (key string, value string, col0 int,
col1 int, col2 int, col3 int)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@grp_set_test
+POSTHOOK: query: create table grp_set_test (key string, value string, col0
int, col1 int, col2 int, col3 int)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@grp_set_test
+PREHOOK: query: insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1,
2, 2, 10), (1, 1, 1, 2, 3, 100)
+PREHOOK: type: QUERY
+PREHOOK: Input: _dummy_database@_dummy_table
+PREHOOK: Output: default@grp_set_test
+POSTHOOK: query: insert into grp_set_test values (1, 1, 1, 1, 1, 1), (1, 1, 1,
2, 2, 10), (1, 1, 1, 2, 3, 100)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: _dummy_database@_dummy_table
+POSTHOOK: Output: default@grp_set_test
+POSTHOOK: Lineage: grp_set_test.col0 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.col1 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.col2 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.col3 SCRIPT []
+POSTHOOK: Lineage: grp_set_test.key SCRIPT []
+POSTHOOK: Lineage: grp_set_test.value SCRIPT []
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type:
int), 0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from (select * from grp_set_test distribute
by col0)d group by rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from (select * from grp_set_test distribute
by col0)d group by rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: _col2, _col3, _col4, _col5
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ null sort order:
+ sort order:
+ Map-reduce partition columns: _col2 (type: int)
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col2 (type: int), _col3 (type: int),
_col4 (type: int), _col5 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col2 (type: int), VALUE._col3 (type: int),
VALUE._col4 (type: int), VALUE._col5 (type: int)
+ outputColumnNames: _col2, _col3, _col4, _col5
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(_col5)
+ keys: _col2 (type: int), _col3 (type: int), _col4 (type:
int), 0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Group By Operator
+ aggregations: sum(col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type:
int), 0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+1 1 1 1
+1 1 NULL 1
+1 2 2 10
+1 2 3 100
+1 2 NULL 110
+1 NULL NULL 111
+NULL NULL NULL 111
+PREHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, sum(col3) from grp_set_test group by rollup(col0,
col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 48 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 48 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: col2 (type: int)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: col2 (type: int)
+ value expressions: col0 (type: int), col1 (type: int),
col3 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE.col0 (type: int), VALUE.col1 (type: int),
KEY.col2 (type: int), VALUE.col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Group By Operator
+ aggregations: sum(col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type: int),
0L (type: bigint)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint)
+ null sort order: zzzz
+ sort order: ++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 168 Basic stats:
COMPLETE Column stats: COMPLETE
+ value expressions: _col4 (type: bigint)
+ Reducer 3
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: sum(VALUE._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
+PREHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: select col0, col1, col2, sum(col3) from grp_set_test group by
rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+1 1 1 1
+1 1 NULL 1
+1 2 2 10
+1 2 3 100
+1 2 NULL 110
+1 NULL NULL 111
+NULL NULL NULL 111
+PREHOOK: query: alter table grp_set_test update statistics for column col3
set('numDVs'='10000','numNulls'='10000')
+PREHOOK: type: ALTERTABLE_UPDATETABLESTATS
+PREHOOK: Input: default@grp_set_test
+PREHOOK: Output: default@grp_set_test
+POSTHOOK: query: alter table grp_set_test update statistics for column col3
set('numDVs'='10000','numNulls'='10000')
+POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS
+POSTHOOK: Input: default@grp_set_test
+POSTHOOK: Output: default@grp_set_test
+PREHOOK: query: explain
+select col0, col1, col2, count(distinct col3) from grp_set_test group by
rollup(col0, col1, col2)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+POSTHOOK: query: explain
+select col0, col1, col2, count(distinct col3) from grp_set_test group by
rollup(col0, col1, col2)
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@grp_set_test
+#### A masked pattern was here ####
+STAGE DEPENDENCIES:
+ Stage-1 is a root stage
+ Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+ Stage: Stage-1
+ Tez
+#### A masked pattern was here ####
+ Edges:
+ Reducer 2 <- Map 1 (SIMPLE_EDGE)
+ Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+#### A masked pattern was here ####
+ Vertices:
+ Map 1
+ Map Operator Tree:
+ TableScan
+ alias: grp_set_test
+ Statistics: Num rows: 3 Data size: 36 Basic stats: COMPLETE
Column stats: COMPLETE
+ Select Operator
+ expressions: col0 (type: int), col1 (type: int), col2
(type: int), col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Statistics: Num rows: 3 Data size: 36 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: col2 (type: int)
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: col2 (type: int)
+ value expressions: col0 (type: int), col1 (type: int),
col3 (type: int)
+ Execution mode: vectorized, llap
+ LLAP IO: all inputs
+ Reducer 2
+ Execution mode: llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE.col0 (type: int), VALUE.col1 (type: int),
KEY.col2 (type: int), VALUE.col3 (type: int)
+ outputColumnNames: col0, col1, col2, col3
+ Group By Operator
+ aggregations: count(DISTINCT col3)
+ keys: col0 (type: int), col1 (type: int), col2 (type: int),
0L (type: bigint), col3 (type: int)
+ grouping sets: 0, 1, 3, 7
+ minReductionHashAggr: 0.4
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5
+ Statistics: Num rows: 6 Data size: 172 Basic stats: COMPLETE
Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: bigint), _col4 (type: int)
+ null sort order: zzzzz
+ sort order: +++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: bigint)
+ Statistics: Num rows: 6 Data size: 172 Basic stats:
COMPLETE Column stats: COMPLETE
+ Reducer 3
+ Execution mode: llap
+ Reduce Operator Tree:
+ Group By Operator
+ aggregations: count(DISTINCT KEY._col4:0._col0)
+ keys: KEY._col0 (type: int), KEY._col1 (type: int), KEY._col2
(type: int), KEY._col3 (type: bigint)
+ mode: mergepartial
+ outputColumnNames: _col0, _col1, _col2, _col4
+ Statistics: Num rows: 6 Data size: 168 Basic stats: COMPLETE
Column stats: COMPLETE
+ pruneGroupingSetId: true
+ Select Operator
+ expressions: _col0 (type: int), _col1 (type: int), _col2
(type: int), _col4 (type: bigint)
+ outputColumnNames: _col0, _col1, _col2, _col3
+ Statistics: Num rows: 6 Data size: 120 Basic stats: COMPLETE
Column stats: COMPLETE
+ File Output Operator
+ compressed: false
+ Statistics: Num rows: 6 Data size: 120 Basic stats:
COMPLETE Column stats: COMPLETE
+ table:
+ input format:
org.apache.hadoop.mapred.SequenceFileInputFormat
+ output format:
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+ serde:
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+ Stage: Stage-0
+ Fetch Operator
+ limit: -1
+ Processor Tree:
+ ListSink
+
diff --git
a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
index 6c5e258b368..592b285f409 100644
--- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
+++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query22.q.out
@@ -7,9 +7,10 @@ STAGE PLANS:
Tez
#### A masked pattern was here ####
Edges:
- Map 1 <- Map 4 (BROADCAST_EDGE), Map 5 (BROADCAST_EDGE)
+ Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
+ Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -30,7 +31,7 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col1, _col2
input vertices:
- 1 Map 4
+ 1 Map 5
Statistics: Num rows: 321094889 Data size: 3527549756
Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
condition map:
@@ -40,26 +41,17 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col2, _col5, _col6, _col7, _col8
input vertices:
- 1 Map 5
+ 1 Map 6
Statistics: Num rows: 321094889 Data size:
125864702465 Basic stats: COMPLETE Column stats: COMPLETE
- Group By Operator
- aggregations: sum(_col2), count(_col2)
- keys: _col5 (type: char(50)), _col6 (type:
char(50)), _col7 (type: char(50)), _col8 (type: char(50)), 0L (type: bigint)
- grouping sets: 0, 2, 6, 14, 15
- minReductionHashAggr: 0.83334786
- mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3,
_col4, _col5, _col6
- Statistics: Num rows: 1605474445 Data size:
663060945785 Basic stats: COMPLETE Column stats: COMPLETE
- Reduce Output Operator
- key expressions: _col0 (type: char(50)), _col1
(type: char(50)), _col2 (type: char(50)), _col3 (type: char(50)), _col4 (type:
bigint)
- null sort order: zzzzz
- sort order: +++++
- Map-reduce partition columns: _col0 (type:
char(50)), _col1 (type: char(50)), _col2 (type: char(50)), _col3 (type:
char(50)), _col4 (type: bigint)
- Statistics: Num rows: 1605474445 Data size:
663060945785 Basic stats: COMPLETE Column stats: COMPLETE
- value expressions: _col5 (type: bigint), _col6
(type: bigint)
+ Reduce Output Operator
+ key expressions: _col8 (type: char(50))
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col8 (type: char(50))
+ value expressions: _col2 (type: int), _col5 (type:
char(50)), _col6 (type: char(50)), _col7 (type: char(50))
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 4
+ Map 5
Map Operator Tree:
TableScan
alias: date_dim
@@ -80,7 +72,7 @@ STAGE PLANS:
Statistics: Num rows: 359 Data size: 2872 Basic stats:
COMPLETE Column stats: COMPLETE
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 5
+ Map 6
Map Operator Tree:
TableScan
alias: item
@@ -99,6 +91,27 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col2 (type: int), VALUE._col5 (type:
char(50)), VALUE._col6 (type: char(50)), VALUE._col7 (type: char(50)),
KEY._col8 (type: char(50))
+ outputColumnNames: _col2, _col5, _col6, _col7, _col8
+ Group By Operator
+ aggregations: sum(_col2), count(_col2)
+ keys: _col5 (type: char(50)), _col6 (type: char(50)), _col7
(type: char(50)), _col8 (type: char(50)), 0L (type: bigint)
+ grouping sets: 0, 2, 6, 14, 15
+ minReductionHashAggr: 0.83334786
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6
+ Statistics: Num rows: 1605474445 Data size: 663060945785
Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: char(50)), _col1 (type:
char(50)), _col2 (type: char(50)), _col3 (type: char(50)), _col4 (type: bigint)
+ null sort order: zzzzz
+ sort order: +++++
+ Map-reduce partition columns: _col0 (type: char(50)),
_col1 (type: char(50)), _col2 (type: char(50)), _col3 (type: char(50)), _col4
(type: bigint)
+ Statistics: Num rows: 1605474445 Data size: 663060945785
Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col5 (type: bigint), _col6 (type:
bigint)
+ Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
@@ -123,7 +136,7 @@ STAGE PLANS:
null sort order: zzzzz
sort order: +++++
Statistics: Num rows: 1605474445 Data size: 637373354665
Basic stats: COMPLETE Column stats: COMPLETE
- Reducer 3
+ Reducer 4
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
diff --git
a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
index 2ce3245886c..374699407bc 100644
--- a/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
+++ b/ql/src/test/results/clientpositive/perf/tpcds30tb/tez/query67.q.out
@@ -7,10 +7,11 @@ STAGE PLANS:
Tez
#### A masked pattern was here ####
Edges:
- Map 1 <- Map 5 (BROADCAST_EDGE), Map 6 (BROADCAST_EDGE), Map 7
(BROADCAST_EDGE)
+ Map 1 <- Map 6 (BROADCAST_EDGE), Map 7 (BROADCAST_EDGE), Map 8
(BROADCAST_EDGE)
Reducer 2 <- Map 1 (SIMPLE_EDGE)
Reducer 3 <- Reducer 2 (SIMPLE_EDGE)
Reducer 4 <- Reducer 3 (SIMPLE_EDGE)
+ Reducer 5 <- Reducer 4 (SIMPLE_EDGE)
#### A masked pattern was here ####
Vertices:
Map 1
@@ -35,7 +36,7 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col0, _col1, _col3, _col5, _col6,
_col7
input vertices:
- 1 Map 5
+ 1 Map 6
Statistics: Num rows: 15840066266 Data size:
2202441686776 Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
condition map:
@@ -45,7 +46,7 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col0, _col3, _col5, _col6,
_col7, _col9
input vertices:
- 1 Map 6
+ 1 Map 7
Statistics: Num rows: 15840066266 Data size:
3674895373712 Basic stats: COMPLETE Column stats: COMPLETE
Map Join Operator
condition map:
@@ -55,26 +56,17 @@ STAGE PLANS:
1 _col0 (type: bigint)
outputColumnNames: _col3, _col5, _col6, _col7,
_col9, _col11, _col12, _col13, _col14
input vertices:
- 1 Map 7
+ 1 Map 8
Statistics: Num rows: 15840066266 Data size:
9709960621058 Basic stats: COMPLETE Column stats: COMPLETE
- Group By Operator
- aggregations: sum(_col3)
- keys: _col5 (type: int), _col6 (type: int),
_col7 (type: int), _col9 (type: string), _col11 (type: char(50)), _col12 (type:
char(50)), _col13 (type: char(50)), _col14 (type: char(50)), 0L (type: bigint)
- grouping sets: 0, 16, 80, 112, 240, 241, 249,
253, 255
- minReductionHashAggr: 0.9867937
- mode: hash
- outputColumnNames: _col0, _col1, _col2, _col3,
_col4, _col5, _col6, _col7, _col8, _col9
- Statistics: Num rows: 142560596394 Data size:
88530130360674 Basic stats: COMPLETE Column stats: COMPLETE
- Reduce Output Operator
- key expressions: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: string), _col4 (type: char(50)),
_col5 (type: char(50)), _col6 (type: char(50)), _col7 (type: char(50)), _col8
(type: bigint)
- null sort order: zzzzzzzzz
- sort order: +++++++++
- Map-reduce partition columns: _col0 (type:
int), _col1 (type: int), _col2 (type: int), _col3 (type: string), _col4 (type:
char(50)), _col5 (type: char(50)), _col6 (type: char(50)), _col7 (type:
char(50)), _col8 (type: bigint)
- Statistics: Num rows: 142560596394 Data size:
88530130360674 Basic stats: COMPLETE Column stats: COMPLETE
- value expressions: _col9 (type: decimal(28,2))
+ Reduce Output Operator
+ key expressions: _col14 (type: char(50))
+ null sort order: z
+ sort order: +
+ Map-reduce partition columns: _col14 (type:
char(50))
+ value expressions: _col3 (type: decimal(18,2)),
_col5 (type: int), _col6 (type: int), _col7 (type: int), _col9 (type: string),
_col11 (type: char(50)), _col12 (type: char(50)), _col13 (type: char(50))
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 5
+ Map 6
Map Operator Tree:
TableScan
alias: date_dim
@@ -112,7 +104,7 @@ STAGE PLANS:
Target Vertex: Map 1
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 6
+ Map 7
Map Operator Tree:
TableScan
alias: store
@@ -130,7 +122,7 @@ STAGE PLANS:
value expressions: _col1 (type: string)
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
- Map 7
+ Map 8
Map Operator Tree:
TableScan
alias: item
@@ -149,6 +141,27 @@ STAGE PLANS:
Execution mode: vectorized, llap
LLAP IO: may be used (ACID table)
Reducer 2
+ Execution mode: vectorized, llap
+ Reduce Operator Tree:
+ Select Operator
+ expressions: VALUE._col3 (type: decimal(18,2)), VALUE._col5
(type: int), VALUE._col6 (type: int), VALUE._col7 (type: int), VALUE._col9
(type: string), VALUE._col11 (type: char(50)), VALUE._col12 (type: char(50)),
VALUE._col13 (type: char(50)), KEY._col14 (type: char(50))
+ outputColumnNames: _col3, _col5, _col6, _col7, _col9, _col11,
_col12, _col13, _col14
+ Group By Operator
+ aggregations: sum(_col3)
+ keys: _col5 (type: int), _col6 (type: int), _col7 (type:
int), _col9 (type: string), _col11 (type: char(50)), _col12 (type: char(50)),
_col13 (type: char(50)), _col14 (type: char(50)), 0L (type: bigint)
+ grouping sets: 0, 16, 80, 112, 240, 241, 249, 253, 255
+ minReductionHashAggr: 0.9867937
+ mode: hash
+ outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5,
_col6, _col7, _col8, _col9
+ Statistics: Num rows: 142560596394 Data size: 88530130360674
Basic stats: COMPLETE Column stats: COMPLETE
+ Reduce Output Operator
+ key expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: string), _col4 (type: char(50)), _col5 (type:
char(50)), _col6 (type: char(50)), _col7 (type: char(50)), _col8 (type: bigint)
+ null sort order: zzzzzzzzz
+ sort order: +++++++++
+ Map-reduce partition columns: _col0 (type: int), _col1
(type: int), _col2 (type: int), _col3 (type: string), _col4 (type: char(50)),
_col5 (type: char(50)), _col6 (type: char(50)), _col7 (type: char(50)), _col8
(type: bigint)
+ Statistics: Num rows: 142560596394 Data size:
88530130360674 Basic stats: COMPLETE Column stats: COMPLETE
+ value expressions: _col9 (type: decimal(28,2))
+ Reducer 3
Execution mode: vectorized, llap
Reduce Operator Tree:
Group By Operator
@@ -176,7 +189,7 @@ STAGE PLANS:
Map-reduce partition columns: _col6 (type: char(50))
Statistics: Num rows: 142560596394 Data size:
87389645589522 Basic stats: COMPLETE Column stats: COMPLETE
value expressions: _col0 (type: int), _col1 (type: int),
_col2 (type: int), _col3 (type: string), _col4 (type: char(50)), _col5 (type:
char(50)), _col7 (type: char(50))
- Reducer 3
+ Reducer 4
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator
@@ -222,7 +235,7 @@ STAGE PLANS:
null sort order: zzzzzzzzzz
sort order: ++++++++++
Statistics: Num rows: 47520198798 Data size:
29319962658366 Basic stats: COMPLETE Column stats: COMPLETE
- Reducer 4
+ Reducer 5
Execution mode: vectorized, llap
Reduce Operator Tree:
Select Operator