Re: [PR] HIVE-28489: Partition the input data of GroupBy with GroupingSet [hive]

via GitHub Fri, 22 Nov 2024 03:27:44 -0800


ngsg commented on code in PR #5424:
URL: https://github.com/apache/hive/pull/5424#discussion_r1853766901



##########
ql/src/java/org/apache/hadoop/hive/ql/optimizer/GroupingSetOptimizer.java:
##########
@@ -0,0 +1,362 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.optimizer;
+
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.metastore.api.FieldSchema;
+import org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.Operator;
+import org.apache.hadoop.hive.ql.exec.OperatorFactory;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.RowSchema;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.Utilities;
+import org.apache.hadoop.hive.ql.io.AcidUtils;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.lib.SemanticDispatcher;
+import org.apache.hadoop.hive.ql.lib.SemanticGraphWalker;
+import org.apache.hadoop.hive.ql.lib.SemanticNodeProcessor;
+import org.apache.hadoop.hive.ql.lib.SemanticRule;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.ColStatistics;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.MapJoinDesc;
+import org.apache.hadoop.hive.ql.plan.PlanUtils;
+import org.apache.hadoop.hive.ql.plan.ReduceSinkDesc;
+import org.apache.hadoop.hive.ql.plan.SelectDesc;
+import org.apache.hadoop.hive.ql.plan.TableDesc;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.BitSet;
+import java.util.Comparator;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+public class GroupingSetOptimizer extends Transform {
+  private static final Logger LOG = 
LoggerFactory.getLogger(GroupingSetOptimizer.class);
+
+  private static class GroupingSetProcessorContext implements NodeProcessorCtx 
{
+    public final long bytesPerReducer;
+    public final int maxReducers;
+    public final long groupingSetThreshold;
+
+    public GroupingSetProcessorContext(HiveConf hiveConf) {
+      bytesPerReducer = 
hiveConf.getLongVar(HiveConf.ConfVars.BYTES_PER_REDUCER);
+      maxReducers = hiveConf.getIntVar(HiveConf.ConfVars.MAX_REDUCERS);
+      groupingSetThreshold = 
hiveConf.getLongVar(HiveConf.ConfVars.HIVE_OPTIMIZE_GROUPING_SET_THRESHOLD);
+    }
+  }
+
+  private static class GroupingSetProcessor implements SemanticNodeProcessor {
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+      GroupingSetProcessorContext context = (GroupingSetProcessorContext) 
procCtx;
+      GroupByOperator gby = (GroupByOperator) nd;
+      if (!isGroupByFeasible(gby, context)) {
+        return null;
+      }
+
+      Operator<?> parentOp = gby.getParentOperators().get(0);
+      if (!isParentOpFeasible(parentOp)) {
+        return null;
+      }
+
+      String partitionCol = selectPartitionColumn(gby, parentOp);
+      if (partitionCol == null) {
+        return null;
+      }
+
+      LOG.info("Applying GroupingSetOptimization: partitioning the input data 
of {} by {}",
+          gby, partitionCol);
+
+      ReduceSinkOperator rs = createReduceSink(parentOp, partitionCol, 
context);
+
+      parentOp.removeChild(gby);
+      // gby.setParentOperators(Arrays.asList(rs));
+      // NOTE: The above expression does not work because GBY refers to _colN
+      //  while input columns are VALUE._colN. Therefore, we should either 
modify GBY expressions
+      //  or insert a new SEL that renames columns. The following code 
implements the latter one as it is
+      //  easier to implement.
+
+      SelectOperator sel = createSelect(parentOp.getSchema().getSignature(), 
rs);
+
+      sel.setChildOperators(Arrays.asList(gby));
+      gby.setParentOperators(Arrays.asList(sel));
+
+      return null;
+    }
+
+    private boolean isGroupByFeasible(GroupByOperator gby, 
GroupingSetProcessorContext context) {
+      if (!gby.getConf().isGroupingSetsPresent() || gby.getStatistics() == 
null) {
+        return false;
+      }
+
+      if (gby.getStatistics().getNumRows() < context.groupingSetThreshold) {
+        LOG.debug("Skip grouping-set optimization on a small operator: {}", 
gby);
+        return false;
+      }
+
+      if (gby.getParentOperators().size() != 1) {
+        LOG.debug("Skip grouping-set optimization on a operator with multiple 
parent operators: {}", gby);
+        return false;
+      }
+
+      return true;
+    }
+
+    private boolean isParentOpFeasible(Operator<?> parentOp) {
+      ReduceSinkOperator rs = null;
+
+      Operator<?> curOp = parentOp;
+      while (true) {
+        if (curOp instanceof ReduceSinkOperator) {
+          rs = (ReduceSinkOperator) curOp;
+          break;
+        }
+
+        if (curOp.getParentOperators() == null) {
+          break;
+        }
+
+        if (curOp.getParentOperators().size() == 1) {
+          curOp = curOp.getParentOperators().get(0);
+        } else if (curOp instanceof AbstractMapJoinOperator) {
+          MapJoinDesc desc = ((AbstractMapJoinOperator<?>) curOp).getConf();
+          curOp = curOp.getParentOperators().get(desc.getPosBigTable());
+        } else {
+          break;
+        }
+      }
+
+      if (rs == null) {
+        // No ReduceSink (partitioning) was found upstream of this parentOp. Continue 
optimization.
+        return true;
+      }
+
+      if (rs.getConf().getPartitionCols() != null && 
rs.getConf().getPartitionCols().size() > 0) {
+        // This rs might be irrelevant to the target GroupBy operator. For 
example, the following query:
+        //   SELECT a, b, sum(c) FROM (SELECT a, b, c FROM tbl DISTRIBUTE BY 
c) z GROUP BY rollup(a, b)
+        // won't be optimized although 'DISTRIBUTE BY c' is irrelevant to the 
key columns of GroupBy.
+        LOG.debug("Skip grouping-set optimization in order not to introduce 
possibly redundant shuffle.");
+        return false;
+      } else {
+        // No partitioning. Continue optimization.
+        return true;
+      }
+    }
+
+    private String selectPartitionColumn(GroupByOperator gby, Operator<?> 
parentOp) {
+      if (!(parentOp.getSchema() != null && 
parentOp.getSchema().getSignature() != null)) {

Review Comment:
   fixed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] HIVE-28489: Partition the input data of GroupBy with GroupingSet [hive]

Reply via email to