This is an automated email from the ASF dual-hosted git repository.

zabetak pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/master by this push:
     new 5ffcdb71658 HIVE-28582: OOM when compiling query with many GROUP BY 
columns aliased multiple times (Stamatis Zampetakis reviewed by Krisztian Kasa)
5ffcdb71658 is described below

commit 5ffcdb716581861f415964d46546ab092e551f45
Author: Stamatis Zampetakis <[email protected]>
AuthorDate: Thu Dec 19 13:24:23 2024 +0100

    HIVE-28582: OOM when compiling query with many GROUP BY columns aliased 
multiple times (Stamatis Zampetakis reviewed by Krisztian Kasa)
    
    The RelMdUniqueKeys handler has some logic that leads to an exponential 
increase of results (unique keys) when certain query/plan patterns appear.
    The problem has been addressed by CALCITE-6640 and CALCITE-6704 but those 
are not available in current Calcite version.
    
    Copy RelMdUniqueKeys#getProjectUniqueKeys
    
(https://github.com/apache/calcite/blob/648a832e0b3abc0f1cd4887847bdef7c133cb383/core/src/main/java/org/apache/calcite/rel/metadata/RelMdUniqueKeys.java#L162)
    to HiveRelMdUniqueKeys handler to overcome the OOM till we upgrade to a 
Calcite version with the afforementioned fixes.
    The copy is almost exact minus a small adaptation in Util#transform that 
transforms the colMask bitset to a list.
    
    Close apache/hive#5583
---
 .../hadoop/hive/ql/optimizer/calcite/Bug.java      |   5 +
 .../calcite/stats/HiveRelMdUniqueKeys.java         |  95 +++++++++++++++++++
 .../queries/clientpositive/cbo_unique_keys_oom.q   |  44 +++++++++
 .../clientpositive/llap/cbo_unique_keys_oom.q.out  | 105 +++++++++++++++++++++
 4 files changed, 249 insertions(+)

diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/Bug.java 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/Bug.java
index ab79129ae33..c1564ee0d3a 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/Bug.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/Bug.java
@@ -98,4 +98,9 @@ public final class Bug {
    * Whether <a 
href="https://issues.apache.org/jira/browse/CALCITE-6513";>CALCITE-6513</a> is 
fixed.
    */
   public static final boolean CALCITE_6513_FIXED = false;
+
+  /**
+   * Whether <a 
href="https://issues.apache.org/jira/browse/CALCITE-6704";>CALCITE-6704</a> is 
fixed.
+   */
+  public static final boolean CALCITE_6704_FIXED = false;
 }
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java
index 7772335ac5a..cb89b01e552 100644
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdUniqueKeys.java
@@ -21,28 +21,123 @@ import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
+import org.apache.calcite.linq4j.Linq4j;
+import org.apache.calcite.rel.SingleRel;
+import org.apache.calcite.rel.core.Project;
 import org.apache.calcite.rel.metadata.BuiltInMetadata;
 import org.apache.calcite.rel.metadata.MetadataDef;
 import org.apache.calcite.rel.metadata.MetadataHandler;
 import org.apache.calcite.rel.metadata.ReflectiveRelMetadataProvider;
 import org.apache.calcite.rel.metadata.RelMetadataProvider;
 import org.apache.calcite.rel.metadata.RelMetadataQuery;
+import org.apache.calcite.rex.RexInputRef;
+import org.apache.calcite.rex.RexNode;
 import org.apache.calcite.util.BuiltInMethod;
 import org.apache.calcite.util.ImmutableBitSet;
+import org.apache.calcite.util.Util;
+import org.apache.hadoop.hive.ql.optimizer.calcite.Bug;
 import org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable;
 import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan;
 
+import com.google.common.collect.ImmutableMultimap;
+import com.google.common.collect.ImmutableSet;
+import com.google.common.collect.Multimap;
+
 public class HiveRelMdUniqueKeys implements 
MetadataHandler<BuiltInMetadata.UniqueKeys> {
 
   public static final RelMetadataProvider SOURCE =
       ReflectiveRelMetadataProvider.reflectiveSource(
           BuiltInMethod.UNIQUE_KEYS.method, new HiveRelMdUniqueKeys());
+  /**
+   * A limit about the number of unique keys returned by the handler.
+   * The limit must be in the range [0, Integer.MAX_VALUE].
+   */
+  private final int limit;
+
+  private HiveRelMdUniqueKeys() {
+    if (Bug.CALCITE_6704_FIXED) {
+      throw new IllegalStateException("Remove constructor and limit once we 
upgrade to version with CALCITE-6704");
+    }
+    this.limit = 1000;
+  }
 
   @Override
   public MetadataDef<BuiltInMetadata.UniqueKeys> getDef() {
     return BuiltInMetadata.UniqueKeys.DEF;
   }
 
+  public Set<ImmutableBitSet> getUniqueKeys(Project rel, RelMetadataQuery mq,
+      boolean ignoreNulls) {
+    return getProjectUniqueKeys(rel, mq, ignoreNulls, rel.getProjects());
+  }
+
+  private Set<ImmutableBitSet> getProjectUniqueKeys(SingleRel rel, 
RelMetadataQuery mq,
+      boolean ignoreNulls, List<RexNode> projExprs) {
+    if (Bug.CALCITE_6704_FIXED) {
+      throw new IllegalStateException("Method is redundant once we upgrade to 
version with CALCITE-6704");
+
+    }
+    // LogicalProject maps a set of rows to a different set;
+    // Without knowledge of the mapping function(whether it
+    // preserves uniqueness), it is only safe to derive uniqueness
+    // info from the child of a project when the mapping is f(a) => a.
+    //
+    // Further more, the unique bitset coming from the child needs
+    // to be mapped to match the output of the project.
+
+    // Single input can be mapped to multiple outputs
+    ImmutableMultimap.Builder<Integer, Integer> inToOutPosBuilder = 
ImmutableMultimap.builder();
+    ImmutableBitSet.Builder mappedInColumnsBuilder = ImmutableBitSet.builder();
+
+    // Build an input to output position map.
+    for (int i = 0; i < projExprs.size(); i++) {
+      RexNode projExpr = projExprs.get(i);
+      if (projExpr instanceof RexInputRef) {
+        int inputIndex = ((RexInputRef) projExpr).getIndex();
+        inToOutPosBuilder.put(inputIndex, i);
+        mappedInColumnsBuilder.set(inputIndex);
+      }
+    }
+    ImmutableBitSet inColumnsUsed = mappedInColumnsBuilder.build();
+
+    if (inColumnsUsed.isEmpty()) {
+      // if there's no RexInputRef in the projected expressions
+      // return empty set.
+      return ImmutableSet.of();
+    }
+
+    Set<ImmutableBitSet> childUniqueKeySet =
+        mq.getUniqueKeys(rel.getInput(), ignoreNulls);
+
+    if (childUniqueKeySet == null) {
+      return ImmutableSet.of();
+    }
+
+    Multimap<Integer, Integer> mapInToOutPos = inToOutPosBuilder.build();
+
+    Set<ImmutableBitSet> resultBuilder = new HashSet<>();
+    // Now add to the projUniqueKeySet the child keys that are fully
+    // projected.
+    outerLoop:
+    for (ImmutableBitSet colMask : childUniqueKeySet) {
+      if (!inColumnsUsed.contains(colMask)) {
+        // colMask contains a column that is not projected as RexInput => the 
key is not unique
+        continue;
+      }
+      // colMask is mapped to output project, however, the column can be 
mapped more than once:
+      // select key1, key1, val1, val2, key2 from ...
+      // the resulting unique keys would be {{0},{4}}, {{1},{4}}
+
+      Iterable<List<Integer>> product = 
Linq4j.product(Util.transform(colMask.toList(), mapInToOutPos::get));
+      for (List<Integer> passKey : product) {
+        if (resultBuilder.size() == limit) {
+          break outerLoop;
+        }
+        resultBuilder.add(ImmutableBitSet.of(passKey));
+      }
+    }
+    return resultBuilder;
+  }
 
   public Set<ImmutableBitSet> getUniqueKeys(HiveTableScan rel, 
RelMetadataQuery mq,
                                             boolean ignoreNulls) {
diff --git a/ql/src/test/queries/clientpositive/cbo_unique_keys_oom.q 
b/ql/src/test/queries/clientpositive/cbo_unique_keys_oom.q
new file mode 100644
index 00000000000..a13fca711bc
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/cbo_unique_keys_oom.q
@@ -0,0 +1,44 @@
+CREATE TABLE t (
+    c1 string,
+    c2 string,
+    c3 string,
+    c4 string,
+    c5 string,
+    c6 string,
+    c7 string,
+    c8 string,
+    c9 string
+);
+
+EXPLAIN CBO
+SELECT a0
+FROM 
+(SELECT c1 as a0,
+        c1 as a1,
+        c1 as a2,
+        c2 as a3,
+        c2 as a4,
+        c2 as a5,
+        c3 as a6,
+        c3 as a7,
+        c3 as a8,
+        c4 as a9,
+        c4 as a10,
+        c4 as a11,
+        c5 as a12,
+        c5 as a13,
+        c5 as a14,
+        c6 as a15,
+        c6 as a16,
+        c6 as a17,
+        c7 as a18,
+        c7 as a19,
+        c7 as a20,
+        c8 as a21,
+        c8 as a22,
+        c8 as a23,
+        c9 as a24,
+        c9 as a25,
+        c9 as a26
+FROM t GROUP BY c1,c2,c3,c4,c5,c6,c7,c8,c9) t1
+GROUP BY a0, a4
diff --git a/ql/src/test/results/clientpositive/llap/cbo_unique_keys_oom.q.out 
b/ql/src/test/results/clientpositive/llap/cbo_unique_keys_oom.q.out
new file mode 100644
index 00000000000..d580e8539d7
--- /dev/null
+++ b/ql/src/test/results/clientpositive/llap/cbo_unique_keys_oom.q.out
@@ -0,0 +1,105 @@
+PREHOOK: query: CREATE TABLE t (
+    c1 string,
+    c2 string,
+    c3 string,
+    c4 string,
+    c5 string,
+    c6 string,
+    c7 string,
+    c8 string,
+    c9 string
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@t
+POSTHOOK: query: CREATE TABLE t (
+    c1 string,
+    c2 string,
+    c3 string,
+    c4 string,
+    c5 string,
+    c6 string,
+    c7 string,
+    c8 string,
+    c9 string
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@t
+PREHOOK: query: EXPLAIN CBO
+SELECT a0
+FROM 
+(SELECT c1 as a0,
+        c1 as a1,
+        c1 as a2,
+        c2 as a3,
+        c2 as a4,
+        c2 as a5,
+        c3 as a6,
+        c3 as a7,
+        c3 as a8,
+        c4 as a9,
+        c4 as a10,
+        c4 as a11,
+        c5 as a12,
+        c5 as a13,
+        c5 as a14,
+        c6 as a15,
+        c6 as a16,
+        c6 as a17,
+        c7 as a18,
+        c7 as a19,
+        c7 as a20,
+        c8 as a21,
+        c8 as a22,
+        c8 as a23,
+        c9 as a24,
+        c9 as a25,
+        c9 as a26
+FROM t GROUP BY c1,c2,c3,c4,c5,c6,c7,c8,c9) t1
+GROUP BY a0, a4
+PREHOOK: type: QUERY
+PREHOOK: Input: default@t
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN CBO
+SELECT a0
+FROM 
+(SELECT c1 as a0,
+        c1 as a1,
+        c1 as a2,
+        c2 as a3,
+        c2 as a4,
+        c2 as a5,
+        c3 as a6,
+        c3 as a7,
+        c3 as a8,
+        c4 as a9,
+        c4 as a10,
+        c4 as a11,
+        c5 as a12,
+        c5 as a13,
+        c5 as a14,
+        c6 as a15,
+        c6 as a16,
+        c6 as a17,
+        c7 as a18,
+        c7 as a19,
+        c7 as a20,
+        c8 as a21,
+        c8 as a22,
+        c8 as a23,
+        c9 as a24,
+        c9 as a25,
+        c9 as a26
+FROM t GROUP BY c1,c2,c3,c4,c5,c6,c7,c8,c9) t1
+GROUP BY a0, a4
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@t
+#### A masked pattern was here ####
+CBO PLAN:
+HiveProject(a0=[$0])
+  HiveAggregate(group=[{0, 1}])
+    HiveProject(c1=[$0], c2=[$1], c3=[$2], c4=[$3], c5=[$4], c6=[$5], c7=[$6], 
c8=[$7], c9=[$8])
+      HiveAggregate(group=[{0, 1, 2, 3, 4, 5, 6, 7, 8}])
+        HiveTableScan(table=[[default, t]], table:alias=[t])
+

Reply via email to