[hive] branch master updated: HIVE-23434: Add option to rewrite PERCENTILE_DISC to sketch functions (Zoltan Haindrich reviewed by Jesus Camacho Rodriguez)

kgyrtkirk Thu, 21 May 2020 23:55:54 -0700

This is an automated email from the ASF dual-hosted git repository.

kgyrtkirk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git



The following commit(s) were added to refs/heads/master by this push:
     new 716f1f9  HIVE-23434: Add option to rewrite PERCENTILE_DISC to sketch 
functions (Zoltan Haindrich reviewed by Jesus Camacho Rodriguez)
716f1f9 is described below

commit 716f1f9a945a9a11e6702754667660d27e0a5cf4
Author: Zoltan Haindrich <k...@rxd.hu>
AuthorDate: Fri May 22 06:54:20 2020 +0000

    HIVE-23434: Add option to rewrite PERCENTILE_DISC to sketch functions 
(Zoltan Haindrich reviewed by Jesus Camacho Rodriguez)
    
    Signed-off-by: Zoltan Haindrich <k...@rxd.hu>
---
 .../java/org/apache/hadoop/hive/conf/HiveConf.java |   9 +-
 .../test/resources/testconfiguration.properties    |   4 +-
 .../hadoop/hive/ql/exec/DataSketchesFunctions.java |  20 +-
 .../HiveRewriteCountDistinctToDataSketches.java    | 175 ----------
 .../rules/HiveRewriteToDataSketchesRules.java      | 371 +++++++++++++++++++++
 .../hadoop/hive/ql/parse/CalcitePlanner.java       |  14 +-
 .../sketches_materialized_view_percentile_disc.q   |  54 +++
 ...rewrite.q => sketches_rewrite_count_distinct.q} |   0
 ...ewrite.q => sketches_rewrite_percentile_disc.q} |   9 +-
 ...etches_materialized_view_percentile_disc.q.out} | 280 ++++++++--------
 .../llap/sketches_materialized_view_rollup2.q.out  |   8 +-
 .../llap/sketches_materialized_view_safety.q.out   |   2 +-
 ...q.out => sketches_rewrite_count_distinct.q.out} |   2 +-
 ....out => sketches_rewrite_percentile_disc.q.out} |  64 ++--
 14 files changed, 643 insertions(+), 369 deletions(-)

diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index bd884a9..a00d907 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -2492,12 +2492,19 @@ public class HiveConf extends Configuration {
     
HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED("hive.optimize.bi.rewrite.countdistinct.enabled",
         true,
         "Enables to rewrite COUNT(DISTINCT(X)) queries to be rewritten to use 
sketch functions."),
-
     HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH(
         "hive.optimize.bi.rewrite.countdistinct.sketch", "hll",
         new StringSet("hll"),
         "Defines which sketch type to use when rewriting COUNT(DISTINCT(X)) 
expressions. "
             + "Distinct counting can be done with: hll"),
+    
HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_ENABLED("hive.optimize.bi.rewrite.percentile_disc.enabled",
+        true,
+        "Enables to rewrite PERCENTILE_DISC(X) queries to be rewritten to use 
sketch functions."),
+    HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH(
+        "hive.optimize.bi.rewrite.percentile_disc.sketch", "kll",
+        new StringSet("kll"),
+        "Defines which sketch type to use when rewriting PERCENTILE_DISC 
expressions. Options: kll"),
+
 
     // Statistics
     HIVE_STATS_ESTIMATE_STATS("hive.stats.estimate", true,
diff --git a/itests/src/test/resources/testconfiguration.properties 
b/itests/src/test/resources/testconfiguration.properties
index e7c3e43..0d06d02 100644
--- a/itests/src/test/resources/testconfiguration.properties
+++ b/itests/src/test/resources/testconfiguration.properties
@@ -872,9 +872,11 @@ minillaplocal.query.files=\
   schq_ingest.q,\
   sketches_hll.q,\
   sketches_theta.q,\
-  sketches_rewrite.q,\
+  sketches_rewrite_count_distinct.q,\
+  sketches_rewrite_percentile_disc.q,\
   sketches_materialized_view_rollup.q,\
   sketches_materialized_view_rollup2.q,\
+  sketches_materialized_view_percentile_disc.q,\
   sketches_materialized_view_safety.q,\
   table_access_keys_stats.q,\
   temp_table_llap_partitioned.q,\
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java
index 8865380..cc48d5b 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java
@@ -65,7 +65,7 @@ public final class DataSketchesFunctions implements 
HiveUDFPlugin {
   private static final String GET_CDF = "cdf";
   private static final String GET_PMF = "pmf";
   private static final String GET_QUANTILES = "quantiles";
-  private static final String GET_QUANTILE = "quantile";
+  public static final String GET_QUANTILE = "quantile";
   private static final String GET_RANK = "rank";
   private static final String INTERSECT_SKETCH = "intersect";
   private static final String INTERSECT_SKETCH1 = "intersect_f";
@@ -109,7 +109,8 @@ public final class DataSketchesFunctions implements 
HiveUDFPlugin {
     }
     SketchDescriptor sc = sketchClasses.get(className);
     if (!sc.fnMap.containsKey(function)) {
-      throw new IllegalArgumentException(String.format("The Sketch-class '%s' 
doesn't have a '%s' method", function));
+      throw new IllegalArgumentException(
+          String.format("The Sketch-class '%s' doesn't have a '%s' method", 
className, function));
     }
     return sketchClasses.get(className).fnMap.get(function);
   }
@@ -128,6 +129,7 @@ public final class DataSketchesFunctions implements 
HiveUDFPlugin {
       SketchFunctionDescriptor sketchSFD = sd.fnMap.get(DATA_TO_SKETCH);
       SketchFunctionDescriptor unionSFD = sd.fnMap.get(UNION_SKETCH);
       SketchFunctionDescriptor estimateSFD = sd.fnMap.get(SKETCH_TO_ESTIMATE);
+      SketchFunctionDescriptor quantileSFD = sd.fnMap.get(GET_QUANTILE);
 
       if (sketchSFD == null || unionSFD == null) {
         continue;
@@ -163,6 +165,20 @@ public final class DataSketchesFunctions implements 
HiveUDFPlugin {
 
         estimateSFD.setCalciteFunction(estimateFn);
       }
+
+      if (quantileSFD != null && 
quantileSFD.getReturnRelDataType().isPresent()) {
+        SqlFunction quantileFn = new HiveSqlFunction(quantileSFD.name,
+            SqlKind.OTHER_FUNCTION,
+            
ReturnTypes.explicit(quantileSFD.getReturnRelDataType().get().getSqlTypeName()),
+            InferTypes.ANY_NULLABLE,
+            OperandTypes.family(),
+            SqlFunctionCategory.USER_DEFINED_FUNCTION,
+            true,
+            false);
+
+        quantileSFD.setCalciteFunction(quantileFn);
+
+      }
     }
   }
 
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java
deleted file mode 100644
index c23e2c4..0000000
--- 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java
+++ /dev/null
@@ -1,175 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to you under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import org.apache.calcite.plan.RelOptRule;
-import org.apache.calcite.plan.RelOptRuleCall;
-import org.apache.calcite.rel.RelCollation;
-import org.apache.calcite.rel.RelNode;
-import org.apache.calcite.rel.core.Aggregate;
-import org.apache.calcite.rel.core.AggregateCall;
-import org.apache.calcite.rel.core.RelFactories.ProjectFactory;
-import org.apache.calcite.rel.type.RelDataType;
-import org.apache.calcite.rex.RexBuilder;
-import org.apache.calcite.rex.RexNode;
-import org.apache.calcite.sql.SqlAggFunction;
-import org.apache.calcite.sql.SqlOperator;
-import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions;
-import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
-import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
-import org.apache.hive.plugin.api.HiveUDFPlugin.UDFDescriptor;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import com.google.common.collect.ImmutableList;
-
-/**
- * This rule could rewrite {@code count(distinct(x))} calls to be calculated 
using sketch based functions.
- *
- * The transformation here works on Aggregate nodes; the operations done are 
the following:
- *
- * 1. Identify candidate {@code count(distinct)} aggregate calls
- * 2. A new Aggregate is created in which the aggregation is done by the 
sketch function
- * 3. A new Project is inserted on top of the Aggregate; which unwraps the 
resulting
- *    count-distinct estimation from the sketch representation
- */
-public final class HiveRewriteCountDistinctToDataSketches extends RelOptRule {
-
-  protected static final Logger LOG = 
LoggerFactory.getLogger(HiveRewriteCountDistinctToDataSketches.class);
-  private final String sketchClass;
-  private final ProjectFactory projectFactory;
-
-  public HiveRewriteCountDistinctToDataSketches(String sketchClass) {
-    super(operand(HiveAggregate.class, any()));
-    this.sketchClass = sketchClass;
-    projectFactory = HiveRelFactories.HIVE_PROJECT_FACTORY;
-  }
-
-  @Override
-  public void onMatch(RelOptRuleCall call) {
-    final Aggregate aggregate = call.rel(0);
-
-    if (aggregate.getGroupSets().size() != 1) {
-      // not yet supported
-      return;
-    }
-
-    List<AggregateCall> newAggCalls = new ArrayList<AggregateCall>();
-
-    VBuilder vb = new VBuilder(aggregate);
-
-    if (aggregate.getAggCallList().equals(vb.newAggCalls)) {
-      // rule didn't made any changes
-      return;
-    }
-
-    newAggCalls = vb.newAggCalls;
-    RelNode newAgg = aggregate.copy(aggregate.getTraitSet(), 
aggregate.getInput(), aggregate.getGroupSet(),
-        aggregate.getGroupSets(), newAggCalls);
-
-    RelNode newProject = projectFactory.createProject(newAgg, vb.newProjects, 
aggregate.getRowType().getFieldNames());
-
-    call.transformTo(newProject);
-    return;
-  }
-
-  /**
-   * Helper class to help in building a new Aggregate and Project.
-   */
-  // NOTE: methods in this class are not re-entrant; drop-to-frame to 
constructor during debugging
-  class VBuilder {
-
-    private Aggregate aggregate;
-    private List<AggregateCall> newAggCalls;
-    private List<RexNode> newProjects;
-    private final RexBuilder rexBuilder;
-
-    public VBuilder(Aggregate aggregate) {
-      this.aggregate = aggregate;
-      newAggCalls = new ArrayList<AggregateCall>();
-      newProjects = new ArrayList<RexNode>();
-      rexBuilder = aggregate.getCluster().getRexBuilder();
-
-      // add non-aggregated fields - as identity projections
-      addGroupFields();
-
-      for (AggregateCall aggCall : aggregate.getAggCallList()) {
-        processAggCall(aggCall);
-      }
-    }
-
-    private void addGroupFields() {
-      for (int i = 0; i < aggregate.getGroupCount(); i++) {
-        newProjects.add(rexBuilder.makeInputRef(aggregate, 0));
-      }
-    }
-
-    private void processAggCall(AggregateCall aggCall) {
-      if (isSimpleCountDistinct(aggCall)) {
-        rewriteCountDistinct(aggCall);
-        return;
-      }
-      appendAggCall(aggCall, null);
-    }
-
-    private void appendAggCall(AggregateCall aggCall, SqlOperator 
projectOperator) {
-      RelDataType origType = 
aggregate.getRowType().getFieldList().get(newProjects.size()).getType();
-      RexNode projRex = rexBuilder.makeInputRef(aggCall.getType(), 
newProjects.size());
-      if (projectOperator != null) {
-        projRex = rexBuilder.makeCall(projectOperator, 
ImmutableList.of(projRex));
-        projRex = rexBuilder.makeCast(origType, projRex);
-      }
-      newAggCalls.add(aggCall);
-      newProjects.add(projRex);
-    }
-
-    private boolean isSimpleCountDistinct(AggregateCall aggCall) {
-      return aggCall.isDistinct() && aggCall.getArgList().size() == 1
-          && aggCall.getAggregation().getName().equalsIgnoreCase("count") && 
!aggCall.hasFilter();
-    }
-
-    private void rewriteCountDistinct(AggregateCall aggCall) {
-      SqlAggFunction aggFunction = (SqlAggFunction) 
getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH);
-      boolean distinct = false;
-      boolean approximate = true;
-      boolean ignoreNulls = aggCall.ignoreNulls();
-      List<Integer> argList = aggCall.getArgList();
-      int filterArg = aggCall.filterArg;
-      RelCollation collation = aggCall.getCollation();
-      int groupCount = aggregate.getGroupCount();
-      RelNode input = aggregate.getInput();
-      RelDataType type = rexBuilder.deriveReturnType(aggFunction, 
Collections.emptyList());
-      String name = aggFunction.getName();
-
-      AggregateCall ret = AggregateCall.create(aggFunction, distinct, 
approximate, ignoreNulls, argList, filterArg,
-          collation, groupCount, input, type, name);
-
-      appendAggCall(ret, 
getSqlOperator(DataSketchesFunctions.SKETCH_TO_ESTIMATE));
-    }
-
-    private SqlOperator getSqlOperator(String fnName) {
-      UDFDescriptor fn = 
DataSketchesFunctions.INSTANCE.getSketchFunction(sketchClass, fnName);
-      if (!fn.getCalciteFunction().isPresent()) {
-        throw new RuntimeException(fn.toString() + " doesn't have a Calcite 
function associated with it");
-      }
-      return fn.getCalciteFunction().get();
-    }
-  }
-}
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java
 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java
new file mode 100644
index 0000000..0123137
--- /dev/null
+++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java
@@ -0,0 +1,371 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to you under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hive.ql.optimizer.calcite.rules;
+
+import java.math.BigDecimal;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import org.apache.calcite.plan.RelOptRule;
+import org.apache.calcite.plan.RelOptRuleCall;
+import org.apache.calcite.plan.RelOptRuleOperand;
+import org.apache.calcite.rel.RelCollation;
+import org.apache.calcite.rel.RelNode;
+import org.apache.calcite.rel.core.Aggregate;
+import org.apache.calcite.rel.core.AggregateCall;
+import org.apache.calcite.rel.core.Project;
+import org.apache.calcite.rel.core.RelFactories.ProjectFactory;
+import org.apache.calcite.rel.type.RelDataType;
+import org.apache.calcite.rel.type.RelDataTypeFactory;
+import org.apache.calcite.rex.RexBuilder;
+import org.apache.calcite.rex.RexLiteral;
+import org.apache.calcite.rex.RexNode;
+import org.apache.calcite.sql.SqlAggFunction;
+import org.apache.calcite.sql.SqlKind;
+import org.apache.calcite.sql.SqlOperator;
+import org.apache.calcite.sql.fun.SqlStdOperatorTable;
+import org.apache.calcite.sql.type.SqlTypeName;
+import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions;
+import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate;
+import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject;
+import org.apache.hive.plugin.api.HiveUDFPlugin.UDFDescriptor;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.google.common.collect.ImmutableList;
+import com.google.common.collect.Lists;
+
+/**
+ * This rule could rewrite aggregate calls to be calculated using sketch based 
functions.
+ *
+ * <br/>
+ * Currently it can rewrite:
+ * <ul>
+ *  <li>{@code count(distinct(x))} to distinct counting sketches
+ *    <pre>
+ *     SELECT COUNT(DISTINCT id) FROM sketch_input;
+ *       ⇒ SELECT ROUND(ds_hll_estimate(ds_hll_sketch(id))) FROM sketch_input;
+ *    </pre>
+ *  </li>
+ *  <li>{@code percentile_disc(0.2) within group (order by id)}
+ *    <pre>
+ *     SELECT PERCENTILE_DISC(0.2) WITHIN GROUP(ORDER BY ID) FROM sketch_input;
+ *       ⇒ SELECT ds_kll_quantile(ds_kll_sketch(CAST(id AS FLOAT)), 0.2) FROM 
sketch_input;
+ *    </pre>
+ *  </li>
+ *  </ul>
+ *
+ * <p>
+ *   The transformation here works on Aggregate nodes; the operations done are 
the following:
+ * </p>
+ * <ol>
+ * <li>Identify candidate aggregate calls</li>
+ * <li>A new Project is inserted below the Aggregate; to help with data 
pre-processing</li>
+ * <li>A new Aggregate is created in which the aggregation is done by the 
sketch function</li>
+ * <li>A new Project is inserted on top of the Aggregate; which unwraps the 
resulting
+ *    estimate (count-distinct or quantile) from the sketch representation</li>
+ * </ol>
+ */
+public final class HiveRewriteToDataSketchesRules {
+
+  protected static final Logger LOG = 
LoggerFactory.getLogger(HiveRewriteToDataSketchesRules.class);
+
+  /**
+   * Generic support for rewriting an Aggregate into a chain of 
Project->Aggregate->Project.
+   */
+  private static abstract class AggregateToProjectAggregateProject extends 
RelOptRule {
+
+    private final ProjectFactory projectFactory;
+
+    public AggregateToProjectAggregateProject(RelOptRuleOperand operand) {
+      super(operand);
+      projectFactory = HiveRelFactories.HIVE_PROJECT_FACTORY;
+    }
+
+    @Override
+    public void onMatch(RelOptRuleCall call) {
+      VbuilderPAP vb = processCall(call);
+      if (vb == null) {
+        return;
+      }
+
+      Aggregate aggregate = vb.aggregate;
+      if (aggregate.getAggCallList().equals(vb.newAggCalls)) {
+        // rule didn't make any changes
+        return;
+      }
+
+      List<AggregateCall> newAggCalls = vb.newAggCalls;
+      List<String> fieldNames = new ArrayList<String>();
+      for (int i = 0; i < vb.newProjectsBelow.size(); i++) {
+        fieldNames.add("ff_" + i);
+      }
+      RelNode newProjectBelow = 
projectFactory.createProject(aggregate.getInput(), vb.newProjectsBelow, 
fieldNames);
+
+      RelNode newAgg = aggregate.copy(aggregate.getTraitSet(), 
newProjectBelow, aggregate.getGroupSet(),
+          aggregate.getGroupSets(), newAggCalls);
+
+      RelNode newProject =
+          projectFactory.createProject(newAgg, vb.newProjectsAbove, 
aggregate.getRowType().getFieldNames());
+
+      call.transformTo(newProject);
+      return;
+
+    }
+
+    protected abstract VbuilderPAP processCall(RelOptRuleCall call);
+
+    private static abstract class VbuilderPAP {
+      protected final RexBuilder rexBuilder;
+
+      /** The original aggregate RelNode */
+      protected final Aggregate aggregate;
+      /** The list of the new aggregations */
+      protected final List<AggregateCall> newAggCalls;
+      /**
+       * The new projections expressions inserted above the aggregate
+       *
+       *  These projections should do the necessary conversions to behave 
like the original aggregate.
+       *  Most important here is to CAST the final result to the same type as 
the original aggregate was producing.
+       */
+      protected final List<RexNode> newProjectsAbove;
+      /** The new projections expressions inserted below the aggregate
+       *
+       * These projections could be used to preprocess the incoming data stream.
+       * For example a CAST might need to be injected.
+       */
+      protected final List<RexNode> newProjectsBelow;
+
+      private final String sketchClass;
+
+      protected VbuilderPAP(Aggregate aggregate, String sketchClass) {
+        this.aggregate = aggregate;
+        this.sketchClass = sketchClass;
+        newAggCalls = new ArrayList<AggregateCall>();
+        newProjectsAbove = new ArrayList<RexNode>();
+        newProjectsBelow = new ArrayList<RexNode>();
+        rexBuilder = aggregate.getCluster().getRexBuilder();
+      }
+
+      protected final void processAggregate() {
+        // add identity projections
+        addProjectedFields();
+
+        for (AggregateCall aggCall : aggregate.getAggCallList()) {
+          processAggCall(aggCall);
+        }
+      }
+
+      private final void addProjectedFields() {
+        for (int i = 0; i < aggregate.getGroupCount(); i++) {
+          newProjectsAbove.add(rexBuilder.makeInputRef(aggregate, i));
+        }
+        int numInputFields = aggregate.getInput().getRowType().getFieldCount();
+        for (int i = 0; i < numInputFields; i++) {
+          newProjectsBelow.add(rexBuilder.makeInputRef(aggregate.getInput(), 
i));
+        }
+      }
+
+      private final void processAggCall(AggregateCall aggCall) {
+        if (isApplicable(aggCall)) {
+          rewrite(aggCall);
+        } else {
+          appendAggCall(aggCall);
+        }
+      }
+
+      private final void appendAggCall(AggregateCall aggCall) {
+        RexNode projRex = rexBuilder.makeInputRef(aggCall.getType(), 
newProjectsAbove.size());
+
+        newAggCalls.add(aggCall);
+        newProjectsAbove.add(projRex);
+      }
+
+      protected final SqlOperator getSqlOperator(String fnName) {
+        UDFDescriptor fn = 
DataSketchesFunctions.INSTANCE.getSketchFunction(sketchClass, fnName);
+        if (!fn.getCalciteFunction().isPresent()) {
+          throw new RuntimeException(fn.toString() + " doesn't have a Calcite 
function associated with it");
+        }
+        return fn.getCalciteFunction().get();
+      }
+
+      abstract void rewrite(AggregateCall aggCall);
+
+      abstract boolean isApplicable(AggregateCall aggCall);
+
+    }
+
+  };
+
+  public static class CountDistinctRewrite extends 
AggregateToProjectAggregateProject {
+
+    private final String sketchType;
+
+    public CountDistinctRewrite(String sketchType) {
+      super(operand(HiveAggregate.class, any()));
+      this.sketchType = sketchType;
+    }
+
+    @Override
+    protected VBuilderPAP processCall(RelOptRuleCall call) {
+      final Aggregate aggregate = call.rel(0);
+
+      if (aggregate.getGroupSets().size() != 1) {
+        // not yet supported
+        return null;
+      }
+
+      return new VBuilderPAP(aggregate, sketchType);
+    }
+
+    private static class VBuilderPAP extends 
AggregateToProjectAggregateProject.VbuilderPAP {
+
+      protected VBuilderPAP(Aggregate aggregate, String sketchClass) {
+        super(aggregate, sketchClass);
+        processAggregate();
+      }
+
+      @Override
+      boolean isApplicable(AggregateCall aggCall) {
+        return aggCall.isDistinct() && aggCall.getArgList().size() == 1
+            && aggCall.getAggregation().getKind() == SqlKind.COUNT && 
!aggCall.hasFilter();
+      }
+
+      @Override
+      void rewrite(AggregateCall aggCall) {
+        RelDataType origType = 
aggregate.getRowType().getFieldList().get(newProjectsAbove.size()).getType();
+
+        Integer argIndex = aggCall.getArgList().get(0);
+        RexNode call = rexBuilder.makeInputRef(aggregate.getInput(), argIndex);
+        newProjectsBelow.add(call);
+
+        SqlAggFunction aggFunction = (SqlAggFunction) 
getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH);
+        boolean distinct = false;
+        boolean approximate = true;
+        boolean ignoreNulls = true;
+        List<Integer> argList = Lists.newArrayList(newProjectsBelow.size() - 
1);
+        int filterArg = aggCall.filterArg;
+        RelCollation collation = aggCall.getCollation();
+        RelDataType type = rexBuilder.deriveReturnType(aggFunction, 
Collections.emptyList());
+        String name = aggFunction.getName();
+
+        AggregateCall newAgg = AggregateCall.create(aggFunction, distinct, 
approximate, ignoreNulls, argList, filterArg,
+            collation, type, name);
+
+        SqlOperator projectOperator = 
getSqlOperator(DataSketchesFunctions.SKETCH_TO_ESTIMATE);
+        RexNode projRex = rexBuilder.makeInputRef(newAgg.getType(), 
newProjectsAbove.size());
+        projRex = rexBuilder.makeCall(projectOperator, 
ImmutableList.of(projRex));
+        projRex = rexBuilder.makeCall(SqlStdOperatorTable.ROUND, 
ImmutableList.of(projRex));
+        projRex = rexBuilder.makeCast(origType, projRex);
+
+        newAggCalls.add(newAgg);
+        newProjectsAbove.add(projRex);
+      }
+    }
+  }
+
+  public static class PercentileDiscRewrite extends 
AggregateToProjectAggregateProject {
+
+    private final String sketchType;
+
+    public PercentileDiscRewrite(String sketchType) {
+      super(operand(HiveAggregate.class, operand(HiveProject.class, any())));
+      this.sketchType = sketchType;
+    }
+
+    @Override
+    protected VBuilderPAP processCall(RelOptRuleCall call) {
+      final Aggregate aggregate = call.rel(0);
+      final Project project = call.rel(1);
+
+      if (aggregate.getGroupSets().size() != 1) {
+        // not yet supported
+        return null;
+      }
+
+      return new VBuilderPAP(aggregate, project, sketchType);
+    }
+
+    private static class VBuilderPAP extends 
AggregateToProjectAggregateProject.VbuilderPAP {
+
+      private final Project aggInput;
+
+      protected VBuilderPAP(Aggregate aggregate, Project project, String 
sketchClass) {
+        super(aggregate, sketchClass);
+        aggInput = project;
+        processAggregate();
+      }
+
+      @Override
+      boolean isApplicable(AggregateCall aggCall) {
+        if ((aggInput instanceof Project)
+            && !aggCall.isDistinct() && aggCall.getArgList().size() == 4
+            && 
aggCall.getAggregation().getName().equalsIgnoreCase("percentile_disc")
+            && !aggCall.hasFilter()) {
+          List<Integer> argList = aggCall.getArgList();
+          RexNode orderLiteral = aggInput.getChildExps().get(argList.get(2));
+          if (orderLiteral.isA(SqlKind.LITERAL)) {
+            RexLiteral lit = (RexLiteral) orderLiteral;
+            return BigDecimal.valueOf(1).equals(lit.getValue());
+          }
+        }
+        return false;
+      }
+
+      @Override
+      void rewrite(AggregateCall aggCall) {
+        RelDataType origType = 
aggregate.getRowType().getFieldList().get(newProjectsAbove.size()).getType();
+
+        Integer argIndex = aggCall.getArgList().get(1);
+        RexNode call = rexBuilder.makeInputRef(aggregate.getInput(), argIndex);
+
+        RelDataTypeFactory typeFactory = rexBuilder.getTypeFactory();
+        RelDataType notNullFloatType = 
typeFactory.createSqlType(SqlTypeName.FLOAT);
+        RelDataType floatType = 
typeFactory.createTypeWithNullability(notNullFloatType, true);
+
+        call = rexBuilder.makeCast(floatType, call);
+        newProjectsBelow.add(call);
+
+        SqlAggFunction aggFunction = (SqlAggFunction) 
getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH);
+        boolean distinct = false;
+        boolean approximate = true;
+        boolean ignoreNulls = true;
+        List<Integer> argList = Lists.newArrayList(newProjectsBelow.size() - 
1);
+        int filterArg = aggCall.filterArg;
+        RelCollation collation = aggCall.getCollation();
+        RelDataType type = rexBuilder.deriveReturnType(aggFunction, 
Collections.emptyList());
+        String name = aggFunction.getName();
+
+        AggregateCall newAgg = AggregateCall.create(aggFunction, distinct, 
approximate, ignoreNulls, argList, filterArg,
+            collation, type, name);
+
+        Integer origFractionIdx = aggCall.getArgList().get(0);
+        RexNode fraction = aggInput.getChildExps().get(origFractionIdx);
+        fraction = rexBuilder.makeCast(floatType, fraction);
+
+        SqlOperator projectOperator = 
getSqlOperator(DataSketchesFunctions.GET_QUANTILE);
+        RexNode projRex = rexBuilder.makeInputRef(newAgg.getType(), 
newProjectsAbove.size());
+        projRex = rexBuilder.makeCall(projectOperator, 
ImmutableList.of(projRex, fraction));
+        projRex = rexBuilder.makeCast(origType, projRex);
+
+        newAggCalls.add(newAgg);
+        newProjectsAbove.add(projRex);
+      }
+    }
+  }
+}
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java 
b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
index 32ad4c1..377e828 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java
@@ -92,7 +92,6 @@ import 
org.apache.calcite.rel.metadata.ChainedRelMetadataProvider;
 import org.apache.calcite.rel.metadata.JaninoRelMetadataProvider;
 import org.apache.calcite.rel.metadata.RelMetadataProvider;
 import org.apache.calcite.rel.metadata.RelMetadataQuery;
-import org.apache.calcite.rel.rules.FilterMergeRule;
 import org.apache.calcite.rel.rules.JoinToMultiJoinRule;
 import org.apache.calcite.rel.rules.LoptOptimizeJoinRule;
 import org.apache.calcite.rel.rules.ProjectMergeRule;
@@ -238,7 +237,7 @@ import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelDecorrelator;
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelFieldTrimmer;
 import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRemoveGBYSemiJoinRule;
 import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRemoveSqCountCheck;
-import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRewriteCountDistinctToDataSketches;
+import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRewriteToDataSketchesRules;
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry;
 import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSemiJoinRule;
 import 
org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule;
@@ -1975,9 +1974,14 @@ public class CalcitePlanner extends SemanticAnalyzer {
       if (!isMaterializedViewMaintenance() && 
conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_ENABLED)) {
         // Rewrite to datasketches if enabled
         if 
(conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED)) {
-          String sketchClass = 
conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH);
-          generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN,
-              new HiveRewriteCountDistinctToDataSketches(sketchClass));
+          String countDistinctSketchType = 
conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH);
+          RelOptRule rule = new 
HiveRewriteToDataSketchesRules.CountDistinctRewrite(countDistinctSketchType);
+          generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule);
+        }
+        if 
(conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_ENABLED)) {
+          String percentileDiscSketchType = 
conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH);
+          RelOptRule rule = new 
HiveRewriteToDataSketchesRules.PercentileDiscRewrite(percentileDiscSketchType);
+          generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule);
         }
       }
       // Run this optimization early, since it is expanding the operator 
pipeline.
diff --git 
a/ql/src/test/queries/clientpositive/sketches_materialized_view_percentile_disc.q
 
b/ql/src/test/queries/clientpositive/sketches_materialized_view_percentile_disc.q
new file mode 100644
index 0000000..4578f4f
--- /dev/null
+++ 
b/ql/src/test/queries/clientpositive/sketches_materialized_view_percentile_disc.q
@@ -0,0 +1,54 @@
+--! qt:transactional
+set hive.fetch.task.conversion=none;
+
+create table sketch_input (id int, category char(1))
+STORED AS ORC
+TBLPROPERTIES ('transactional'='true');
+
+insert into table sketch_input values
+  (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 
'a'), (8, 'a'), (9, 'a'), (10, 'a'),
+  (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 
'b'), (13, 'b'), (14, 'b'), (15, 'b')
+; 
+
+-- create an mv for the intermediate results
+create  materialized view mv_1 as
+  select category, ds_kll_sketch(cast(id as float)) from sketch_input group by 
category;
+
+-- bi mode on
+set hive.optimize.bi.enabled=true;
+
+explain
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+
+set hive.optimize.bi.enabled=false;
+
+explain
+select 'no rewrite; no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+select 'no rewrite; no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+
+set hive.optimize.bi.enabled=true;
+
+insert into table sketch_input values
+  (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 
'a'), (8, 'a'), (9, 'a'), (10, 'a'),
+  (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 
'b'), (13, 'b'), (14, 'b'), (15, 'b')
+;
+
+explain
+select 'rewrite; but no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+select 'rewrite; but no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+
+explain
+alter materialized view mv_1 rebuild;
+alter materialized view mv_1 rebuild;
+
+explain
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category;
+
+-- rewrite+mv matching with rollup
+explain
+select 'rewrite;mv matching with rollup',percentile_disc(0.2) within group 
(order by id) from sketch_input;
+select 'rewrite;mv matching with rollup',percentile_disc(0.2) within group 
(order by id) from sketch_input;
+
+drop materialized view mv_1;
diff --git a/ql/src/test/queries/clientpositive/sketches_rewrite.q 
b/ql/src/test/queries/clientpositive/sketches_rewrite_count_distinct.q
similarity index 100%
copy from ql/src/test/queries/clientpositive/sketches_rewrite.q
copy to ql/src/test/queries/clientpositive/sketches_rewrite_count_distinct.q
diff --git a/ql/src/test/queries/clientpositive/sketches_rewrite.q 
b/ql/src/test/queries/clientpositive/sketches_rewrite_percentile_disc.q
similarity index 68%
rename from ql/src/test/queries/clientpositive/sketches_rewrite.q
rename to ql/src/test/queries/clientpositive/sketches_rewrite_percentile_disc.q
index 0420d62..aade878 100644
--- a/ql/src/test/queries/clientpositive/sketches_rewrite.q
+++ b/ql/src/test/queries/clientpositive/sketches_rewrite_percentile_disc.q
@@ -1,6 +1,5 @@
 --! qt:transactional
 
-set hive.optimize.bi.enabled=true;
 
 create table sketch_input (id int, category char(1))
 STORED AS ORC
@@ -11,9 +10,13 @@ insert into table sketch_input values
   (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 
'b'), (13, 'b'), (14, 'b'), (15, 'b')
 ; 
 
+select percentile_disc(0.3) within group(order by id) from sketch_input;
+
+set hive.optimize.bi.enabled=true;
+
 -- see if rewrite happens
 explain
-select category, count(distinct id) from sketch_input group by category;
+select percentile_disc(0.3) within group(order by id) from sketch_input;
 
-select category, count(distinct id) from sketch_input group by category;
+select percentile_disc(0.3) within group(order by id) from sketch_input;
 
diff --git 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out
 
b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_percentile_disc.q.out
similarity index 66%
copy from 
ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out
copy to 
ql/src/test/results/clientpositive/llap/sketches_materialized_view_percentile_disc.q.out
index e7b3c0e..1752199 100644
--- 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out
+++ 
b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_percentile_disc.q.out
@@ -25,25 +25,25 @@ POSTHOOK: Output: default@sketch_input
 POSTHOOK: Lineage: sketch_input.category SCRIPT []
 POSTHOOK: Lineage: sketch_input.id SCRIPT []
 PREHOOK: query: create  materialized view mv_1 as
-  select category, ds_hll_sketch(id),count(id) from sketch_input group by 
category
+  select category, ds_kll_sketch(cast(id as float)) from sketch_input group by 
category
 PREHOOK: type: CREATE_MATERIALIZED_VIEW
 PREHOOK: Input: default@sketch_input
 PREHOOK: Output: database:default
 PREHOOK: Output: default@mv_1
 POSTHOOK: query: create  materialized view mv_1 as
-  select category, ds_hll_sketch(id),count(id) from sketch_input group by 
category
+  select category, ds_kll_sketch(cast(id as float)) from sketch_input group by 
category
 POSTHOOK: type: CREATE_MATERIALIZED_VIEW
 POSTHOOK: Input: default@sketch_input
 POSTHOOK: Output: database:default
 POSTHOOK: Output: default@mv_1
 PREHOOK: query: explain
-select 'rewrite; mv matching', category, count(distinct id) from sketch_input 
group by category
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@mv_1
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
 POSTHOOK: query: explain
-select 'rewrite; mv matching', category, count(distinct id) from sketch_input 
group by category
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@mv_1
 POSTHOOK: Input: default@sketch_input
@@ -61,9 +61,9 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: default.mv_1
-                  Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 410 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToLong(ds_hll_estimate(_c1)) (type: bigint)
+                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToDouble(ds_kll_quantile(_c1, 0.2)) (type: double)
                     outputColumnNames: _col0, _col1, _col2
                     Statistics: Num rows: 2 Data size: 394 Basic stats: 
COMPLETE Column stats: COMPLETE
                     File Output Operator
@@ -82,25 +82,25 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-PREHOOK: query: select 'rewrite; mv matching', category, count(distinct id) 
from sketch_input group by category
+PREHOOK: query: select 'rewrite; mv matching', category, percentile_disc(0.2) 
within group (order by id) from sketch_input group by category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@mv_1
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-POSTHOOK: query: select 'rewrite; mv matching', category, count(distinct id) 
from sketch_input group by category
+POSTHOOK: query: select 'rewrite; mv matching', category, percentile_disc(0.2) 
within group (order by id) from sketch_input group by category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@mv_1
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-rewrite; mv matching   a       10
-rewrite; mv matching   b       10
+rewrite; mv matching   a       2.0
+rewrite; mv matching   b       7.0
 PREHOOK: query: explain
-select 'no rewrite; no mv usage', category, count(distinct id) from 
sketch_input group by category
+select 'no rewrite; no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
 POSTHOOK: query: explain
-select 'no rewrite; no mv usage', category, count(distinct id) from 
sketch_input group by category
+select 'no rewrite; no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
@@ -122,52 +122,45 @@ STAGE PLANS:
                   alias: sketch_input
                   Statistics: Num rows: 22 Data size: 1958 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Select Operator
-                    expressions: id (type: int), category (type: char(1))
-                    outputColumnNames: id, category
+                    expressions: category (type: char(1)), id (type: int)
+                    outputColumnNames: _col0, _col2
                     Statistics: Num rows: 22 Data size: 1958 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Group By Operator
-                      keys: category (type: char(1)), id (type: int)
-                      minReductionHashAggr: 0.3181818
+                      aggregations: percentile_disc(0.2, _col2, 1, 0)
+                      keys: _col0 (type: char(1))
+                      minReductionHashAggr: 0.9090909
                       mode: hash
                       outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 11 Data size: 979 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 2 Data size: 1746 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
-                        key expressions: _col0 (type: char(1)), _col1 (type: 
int)
-                        null sort order: zz
-                        sort order: ++
+                        key expressions: _col0 (type: char(1))
+                        null sort order: z
+                        sort order: +
                         Map-reduce partition columns: _col0 (type: char(1))
-                        Statistics: Num rows: 11 Data size: 979 Basic stats: 
COMPLETE Column stats: COMPLETE
-            Execution mode: vectorized, llap
+                        Statistics: Num rows: 2 Data size: 1746 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col1 (type: 
struct<counts:map<bigint,bigint>,percentiles:array<double>,isascending:boolean>)
+            Execution mode: llap
             LLAP IO: may be used (ACID table)
         Reducer 2 
-            Execution mode: vectorized, llap
+            Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                keys: KEY._col0 (type: char(1)), KEY._col1 (type: int)
+                aggregations: percentile_disc(VALUE._col0)
+                keys: KEY._col0 (type: char(1))
                 mode: mergepartial
                 outputColumnNames: _col0, _col1
-                Statistics: Num rows: 11 Data size: 979 Basic stats: COMPLETE 
Column stats: COMPLETE
+                Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: _col1 (type: int), _col0 (type: char(1))
-                  outputColumnNames: _col0, _col1
-                  Statistics: Num rows: 11 Data size: 979 Basic stats: 
COMPLETE Column stats: COMPLETE
-                  Group By Operator
-                    aggregations: count(_col0)
-                    keys: _col1 (type: char(1))
-                    mode: complete
-                    outputColumnNames: _col0, _col1
-                    Statistics: Num rows: 2 Data size: 186 Basic stats: 
COMPLETE Column stats: COMPLETE
-                    Select Operator
-                      expressions: 'no rewrite; no mv usage' (type: string), 
_col0 (type: char(1)), _col1 (type: bigint)
-                      outputColumnNames: _col0, _col1, _col2
-                      Statistics: Num rows: 2 Data size: 400 Basic stats: 
COMPLETE Column stats: COMPLETE
-                      File Output Operator
-                        compressed: false
-                        Statistics: Num rows: 2 Data size: 400 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        table:
-                            input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
-                            output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
-                            serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+                  expressions: 'no rewrite; no mv usage' (type: string), _col0 
(type: char(1)), _col1 (type: double)
+                  outputColumnNames: _col0, _col1, _col2
+                  Statistics: Num rows: 2 Data size: 400 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  File Output Operator
+                    compressed: false
+                    Statistics: Num rows: 2 Data size: 400 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    table:
+                        input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
+                        output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                        serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
 
   Stage: Stage-0
     Fetch Operator
@@ -175,16 +168,16 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-PREHOOK: query: select 'no rewrite; no mv usage', category, count(distinct id) 
from sketch_input group by category
+PREHOOK: query: select 'no rewrite; no mv usage', category, 
percentile_disc(0.2) within group (order by id) from sketch_input group by 
category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-POSTHOOK: query: select 'no rewrite; no mv usage', category, count(distinct 
id) from sketch_input group by category
+POSTHOOK: query: select 'no rewrite; no mv usage', category, 
percentile_disc(0.2) within group (order by id) from sketch_input group by 
category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-no rewrite; no mv usage        a       10
-no rewrite; no mv usage        b       10
+no rewrite; no mv usage        a       2.0
+no rewrite; no mv usage        b       7.0
 PREHOOK: query: insert into table sketch_input values
   (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 
'a'), (8, 'a'), (9, 'a'), (10, 'a'),
   (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 
'b'), (13, 'b'), (14, 'b'), (15, 'b')
@@ -200,12 +193,12 @@ POSTHOOK: Output: default@sketch_input
 POSTHOOK: Lineage: sketch_input.category SCRIPT []
 POSTHOOK: Lineage: sketch_input.id SCRIPT []
 PREHOOK: query: explain
-select 'rewrite; but no mv usage', category, count(distinct id) from 
sketch_input group by category
+select 'rewrite; but no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
 POSTHOOK: query: explain
-select 'rewrite; but no mv usage', category, count(distinct id) from 
sketch_input group by category
+select 'rewrite; but no mv usage', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
@@ -227,36 +220,36 @@ STAGE PLANS:
                   alias: sketch_input
                   Statistics: Num rows: 44 Data size: 3916 Basic stats: 
COMPLETE Column stats: COMPLETE
                   Select Operator
-                    expressions: id (type: int), category (type: char(1))
-                    outputColumnNames: id, category
+                    expressions: category (type: char(1)), UDFToFloat(id) 
(type: float)
+                    outputColumnNames: _col0, _col1
                     Statistics: Num rows: 44 Data size: 3916 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Group By Operator
-                      aggregations: ds_hll_sketch(id)
-                      keys: category (type: char(1))
+                      aggregations: ds_kll_sketch(_col1)
+                      keys: _col0 (type: char(1))
                       minReductionHashAggr: 0.95454544
                       mode: hash
                       outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 2 Data size: 946 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 2 Data size: 458 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
                         key expressions: _col0 (type: char(1))
                         null sort order: z
                         sort order: +
                         Map-reduce partition columns: _col0 (type: char(1))
-                        Statistics: Num rows: 2 Data size: 946 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        value expressions: _col1 (type: 
struct<lgk:int,type:string,sketch:binary>)
+                        Statistics: Num rows: 2 Data size: 458 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col1 (type: binary)
             Execution mode: llap
             LLAP IO: may be used (ACID table)
         Reducer 2 
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: ds_hll_sketch(VALUE._col0)
+                aggregations: ds_kll_sketch(VALUE._col0)
                 keys: KEY._col0 (type: char(1))
                 mode: mergepartial
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: 'rewrite; but no mv usage' (type: string), 
_col0 (type: char(1)), UDFToLong(ds_hll_estimate(_col1)) (type: bigint)
+                  expressions: 'rewrite; but no mv usage' (type: string), 
_col0 (type: char(1)), UDFToDouble(ds_kll_quantile(_col1, 0.2)) (type: double)
                   outputColumnNames: _col0, _col1, _col2
                   Statistics: Num rows: 2 Data size: 402 Basic stats: COMPLETE 
Column stats: COMPLETE
                   File Output Operator
@@ -273,16 +266,16 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-PREHOOK: query: select 'rewrite; but no mv usage', category, count(distinct 
id) from sketch_input group by category
+PREHOOK: query: select 'rewrite; but no mv usage', category, 
percentile_disc(0.2) within group (order by id) from sketch_input group by 
category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-POSTHOOK: query: select 'rewrite; but no mv usage', category, count(distinct 
id) from sketch_input group by category
+POSTHOOK: query: select 'rewrite; but no mv usage', category, 
percentile_disc(0.2) within group (order by id) from sketch_input group by 
category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-rewrite; but no mv usage       a       10
-rewrite; but no mv usage       b       10
+rewrite; but no mv usage       a       2.0
+rewrite; but no mv usage       b       7.0
 PREHOOK: query: explain
 alter materialized view mv_1 rebuild
 PREHOOK: type: QUERY
@@ -323,120 +316,116 @@ STAGE PLANS:
                     predicate: (ROW__ID.writeid > 1L) (type: boolean)
                     Statistics: Num rows: 14 Data size: 1246 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Select Operator
-                      expressions: id (type: int), category (type: char(1))
-                      outputColumnNames: id, category
+                      expressions: category (type: char(1)), UDFToFloat(id) 
(type: float)
+                      outputColumnNames: _col0, _col1
                       Statistics: Num rows: 14 Data size: 1246 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Group By Operator
-                        aggregations: ds_hll_sketch(id), count(id)
-                        keys: category (type: char(1))
+                        aggregations: ds_kll_sketch(_col1)
+                        keys: _col0 (type: char(1))
                         minReductionHashAggr: 0.85714287
                         mode: hash
-                        outputColumnNames: _col0, _col1, _col2
-                        Statistics: Num rows: 2 Data size: 962 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        outputColumnNames: _col0, _col1
+                        Statistics: Num rows: 2 Data size: 458 Basic stats: 
COMPLETE Column stats: COMPLETE
                         Reduce Output Operator
                           key expressions: _col0 (type: char(1))
                           null sort order: z
                           sort order: +
                           Map-reduce partition columns: _col0 (type: char(1))
-                          Statistics: Num rows: 2 Data size: 962 Basic stats: 
COMPLETE Column stats: COMPLETE
-                          value expressions: _col1 (type: 
struct<lgk:int,type:string,sketch:binary>), _col2 (type: bigint)
+                          Statistics: Num rows: 2 Data size: 458 Basic stats: 
COMPLETE Column stats: COMPLETE
+                          value expressions: _col1 (type: binary)
             Execution mode: llap
             LLAP IO: may be used (ACID table)
         Map 6 
             Map Operator Tree:
                 TableScan
                   alias: default.mv_1
-                  Statistics: Num rows: 2 Data size: 378 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 410 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: category (type: char(1)), _c1 (type: binary), 
_c2 (type: bigint)
-                    outputColumnNames: _col0, _col1, _col2
-                    Statistics: Num rows: 2 Data size: 378 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    expressions: category (type: char(1)), _c1 (type: binary)
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 2 Data size: 410 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Group By Operator
-                      aggregations: ds_hll_union(_col1), sum(_col2)
+                      aggregations: ds_kll_union(_col1)
                       keys: _col0 (type: char(1))
                       minReductionHashAggr: 0.5
                       mode: hash
-                      outputColumnNames: _col0, _col1, _col2
-                      Statistics: Num rows: 2 Data size: 962 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      outputColumnNames: _col0, _col1
+                      Statistics: Num rows: 2 Data size: 458 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
                         key expressions: _col0 (type: char(1))
                         null sort order: z
                         sort order: +
                         Map-reduce partition columns: _col0 (type: char(1))
-                        Statistics: Num rows: 2 Data size: 962 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        value expressions: _col1 (type: 
struct<lgk:int,type:string,sketch:binary>), _col2 (type: bigint)
+                        Statistics: Num rows: 2 Data size: 458 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col1 (type: binary)
             Execution mode: llap
             LLAP IO: all inputs
         Reducer 2 
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: ds_hll_sketch(VALUE._col0), count(VALUE._col1)
+                aggregations: ds_kll_sketch(VALUE._col0)
                 keys: KEY._col0 (type: char(1))
                 mode: mergepartial
-                outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE 
Column stats: COMPLETE
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Group By Operator
-                  aggregations: ds_hll_union(_col1), sum(_col2)
+                  aggregations: ds_kll_union(_col1)
                   keys: _col0 (type: char(1))
                   minReductionHashAggr: 0.5
                   mode: hash
-                  outputColumnNames: _col0, _col1, _col2
-                  Statistics: Num rows: 2 Data size: 962 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  outputColumnNames: _col0, _col1
+                  Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Reduce Output Operator
                     key expressions: _col0 (type: char(1))
                     null sort order: z
                     sort order: +
                     Map-reduce partition columns: _col0 (type: char(1))
-                    Statistics: Num rows: 2 Data size: 962 Basic stats: 
COMPLETE Column stats: COMPLETE
-                    value expressions: _col1 (type: 
struct<lgk:int,type:string,sketch:binary>), _col2 (type: bigint)
+                    Statistics: Num rows: 2 Data size: 458 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    value expressions: _col1 (type: binary)
         Reducer 4 
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: ds_hll_union(VALUE._col0), sum(VALUE._col1)
+                aggregations: ds_kll_union(VALUE._col0)
                 keys: KEY._col0 (type: char(1))
                 mode: mergepartial
-                outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE 
Column stats: COMPLETE
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
+                File Output Operator
+                  compressed: false
+                  Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  table:
+                      input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+                      output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+                      serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+                      name: default.mv_1
                 Select Operator
-                  expressions: _col0 (type: char(1)), _col1 (type: binary), 
COALESCE(_col2,0L) (type: bigint)
-                  outputColumnNames: _col0, _col1, _col2
-                  Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE 
Column stats: COMPLETE
-                  File Output Operator
-                    compressed: false
-                    Statistics: Num rows: 2 Data size: 474 Basic stats: 
COMPLETE Column stats: COMPLETE
-                    table:
-                        input format: 
org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
-                        output format: 
org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
-                        serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
-                        name: default.mv_1
-                  Select Operator
-                    expressions: _col0 (type: char(1)), _col1 (type: binary), 
_col2 (type: bigint)
-                    outputColumnNames: category, _c1, _c2
-                    Statistics: Num rows: 2 Data size: 474 Basic stats: 
COMPLETE Column stats: COMPLETE
-                    Group By Operator
-                      aggregations: compute_stats(category, 'hll'), 
compute_stats(_c1, 'hll'), compute_stats(_c2, 'hll')
-                      minReductionHashAggr: 0.5
-                      mode: hash
-                      outputColumnNames: _col0, _col1, _col2
-                      Statistics: Num rows: 1 Data size: 1152 Basic stats: 
COMPLETE Column stats: COMPLETE
-                      Reduce Output Operator
-                        null sort order: 
-                        sort order: 
-                        Statistics: Num rows: 1 Data size: 1152 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        value expressions: _col0 (type: 
struct<columntype:string,maxlength:bigint,sumlength:bigint,count:bigint,countnulls:bigint,bitvector:binary>),
 _col1 (type: 
struct<columntype:string,maxlength:bigint,sumlength:bigint,count:bigint,countnulls:bigint>),
 _col2 (type: 
struct<columntype:string,min:bigint,max:bigint,countnulls:bigint,bitvector:binary>)
+                  expressions: _col0 (type: char(1)), _col1 (type: binary)
+                  outputColumnNames: category, _c1
+                  Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Group By Operator
+                    aggregations: compute_stats(category, 'hll'), 
compute_stats(_c1, 'hll')
+                    minReductionHashAggr: 0.5
+                    mode: hash
+                    outputColumnNames: _col0, _col1
+                    Statistics: Num rows: 1 Data size: 728 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Reduce Output Operator
+                      null sort order: 
+                      sort order: 
+                      Statistics: Num rows: 1 Data size: 728 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      value expressions: _col0 (type: 
struct<columntype:string,maxlength:bigint,sumlength:bigint,count:bigint,countnulls:bigint,bitvector:binary>),
 _col1 (type: 
struct<columntype:string,maxlength:bigint,sumlength:bigint,count:bigint,countnulls:bigint>)
         Reducer 5 
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: compute_stats(VALUE._col0), 
compute_stats(VALUE._col1), compute_stats(VALUE._col2)
+                aggregations: compute_stats(VALUE._col0), 
compute_stats(VALUE._col1)
                 mode: mergepartial
-                outputColumnNames: _col0, _col1, _col2
-                Statistics: Num rows: 1 Data size: 1152 Basic stats: COMPLETE 
Column stats: COMPLETE
+                outputColumnNames: _col0, _col1
+                Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE 
Column stats: COMPLETE
                 File Output Operator
                   compressed: false
-                  Statistics: Num rows: 1 Data size: 1152 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 1 Data size: 712 Basic stats: COMPLETE 
Column stats: COMPLETE
                   table:
                       input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                       output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -461,8 +450,8 @@ STAGE PLANS:
     Stats Work
       Basic Stats Work:
       Column Stats Desc:
-          Columns: category, _c1, _c2
-          Column Types: char(1), binary, bigint
+          Columns: category, _c1
+          Column Types: char(1), binary
           Table: default.mv_1
 
   Stage: Stage-4
@@ -481,16 +470,15 @@ POSTHOOK: Input: default@mv_1
 POSTHOOK: Input: default@sketch_input
 POSTHOOK: Output: default@mv_1
 POSTHOOK: Lineage: mv_1._c1 EXPRESSION 
[(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), 
(mv_1)default.mv_1.FieldSchema(name:_c1, type:binary, comment:null), ]
-POSTHOOK: Lineage: mv_1._c2 EXPRESSION 
[(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), 
(mv_1)default.mv_1.FieldSchema(name:_c2, type:bigint, comment:null), ]
 POSTHOOK: Lineage: mv_1.category EXPRESSION 
[(sketch_input)sketch_input.FieldSchema(name:category, type:char(1), 
comment:null), (mv_1)default.mv_1.FieldSchema(name:category, type:char(1), 
comment:null), ]
 PREHOOK: query: explain
-select 'rewrite; mv matching', category, count(distinct id) from sketch_input 
group by category
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@mv_1
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
 POSTHOOK: query: explain
-select 'rewrite; mv matching', category, count(distinct id) from sketch_input 
group by category
+select 'rewrite; mv matching', category, percentile_disc(0.2) within group 
(order by id) from sketch_input group by category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@mv_1
 POSTHOOK: Input: default@sketch_input
@@ -508,9 +496,9 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: default.mv_1
-                  Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 490 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToLong(ds_hll_estimate(_c1)) (type: bigint)
+                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToDouble(ds_kll_quantile(_c1, 0.2)) (type: double)
                     outputColumnNames: _col0, _col1, _col2
                     Statistics: Num rows: 2 Data size: 394 Basic stats: 
COMPLETE Column stats: COMPLETE
                     File Output Operator
@@ -529,26 +517,26 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-PREHOOK: query: select 'rewrite; mv matching', category, count(distinct id) 
from sketch_input group by category
+PREHOOK: query: select 'rewrite; mv matching', category, percentile_disc(0.2) 
within group (order by id) from sketch_input group by category
 PREHOOK: type: QUERY
 PREHOOK: Input: default@mv_1
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-POSTHOOK: query: select 'rewrite; mv matching', category, count(distinct id) 
from sketch_input group by category
+POSTHOOK: query: select 'rewrite; mv matching', category, percentile_disc(0.2) 
within group (order by id) from sketch_input group by category
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@mv_1
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-rewrite; mv matching   a       10
-rewrite; mv matching   b       10
+rewrite; mv matching   a       2.0
+rewrite; mv matching   b       7.0
 PREHOOK: query: explain
-select 'rewrite;mv matching with rollup',count(distinct id) from sketch_input
+select 'rewrite;mv matching with rollup',percentile_disc(0.2) within group 
(order by id) from sketch_input
 PREHOOK: type: QUERY
 PREHOOK: Input: default@mv_1
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
 POSTHOOK: query: explain
-select 'rewrite;mv matching with rollup',count(distinct id) from sketch_input
+select 'rewrite;mv matching with rollup',percentile_disc(0.2) within group 
(order by id) from sketch_input
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@mv_1
 POSTHOOK: Input: default@sketch_input
@@ -569,34 +557,34 @@ STAGE PLANS:
             Map Operator Tree:
                 TableScan
                   alias: default.mv_1
-                  Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  Statistics: Num rows: 2 Data size: 320 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
                     expressions: _c1 (type: binary)
                     outputColumnNames: _c1
-                    Statistics: Num rows: 2 Data size: 192 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 2 Data size: 320 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Group By Operator
-                      aggregations: ds_hll_union(_c1)
+                      aggregations: ds_kll_union(_c1)
                       minReductionHashAggr: 0.5
                       mode: hash
                       outputColumnNames: _col0
-                      Statistics: Num rows: 1 Data size: 388 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      Statistics: Num rows: 1 Data size: 144 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
                         null sort order: 
                         sort order: 
-                        Statistics: Num rows: 1 Data size: 388 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        value expressions: _col0 (type: 
struct<lgk:int,type:string,sketch:binary>)
+                        Statistics: Num rows: 1 Data size: 144 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col0 (type: binary)
             Execution mode: llap
             LLAP IO: all inputs
         Reducer 2 
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: ds_hll_union(VALUE._col0)
+                aggregations: ds_kll_union(VALUE._col0)
                 mode: mergepartial
                 outputColumnNames: _col0
                 Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: 'rewrite;mv matching with rollup' (type: 
string), UDFToLong(ds_hll_estimate(_col0)) (type: bigint)
+                  expressions: 'rewrite;mv matching with rollup' (type: 
string), UDFToDouble(ds_kll_quantile(_col0, 0.2)) (type: double)
                   outputColumnNames: _col0, _col1
                   Statistics: Num rows: 1 Data size: 123 Basic stats: COMPLETE 
Column stats: COMPLETE
                   File Output Operator
@@ -613,17 +601,17 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-PREHOOK: query: select 'rewrite;mv matching with rollup',count(distinct id) 
from sketch_input
+PREHOOK: query: select 'rewrite;mv matching with rollup',percentile_disc(0.2) 
within group (order by id) from sketch_input
 PREHOOK: type: QUERY
 PREHOOK: Input: default@mv_1
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-POSTHOOK: query: select 'rewrite;mv matching with rollup',count(distinct id) 
from sketch_input
+POSTHOOK: query: select 'rewrite;mv matching with rollup',percentile_disc(0.2) 
within group (order by id) from sketch_input
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@mv_1
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-rewrite;mv matching with rollup        15
+rewrite;mv matching with rollup        4.0
 PREHOOK: query: drop materialized view mv_1
 PREHOOK: type: DROP_MATERIALIZED_VIEW
 PREHOOK: Input: default@mv_1
diff --git 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out
 
b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out
index e7b3c0e..d9b72b0 100644
--- 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out
+++ 
b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out
@@ -63,7 +63,7 @@ STAGE PLANS:
                   alias: default.mv_1
                   Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToLong(ds_hll_estimate(_c1)) (type: bigint)
+                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToLong(round(ds_hll_estimate(_c1))) (type: bigint)
                     outputColumnNames: _col0, _col1, _col2
                     Statistics: Num rows: 2 Data size: 394 Basic stats: 
COMPLETE Column stats: COMPLETE
                     File Output Operator
@@ -256,7 +256,7 @@ STAGE PLANS:
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: 'rewrite; but no mv usage' (type: string), 
_col0 (type: char(1)), UDFToLong(ds_hll_estimate(_col1)) (type: bigint)
+                  expressions: 'rewrite; but no mv usage' (type: string), 
_col0 (type: char(1)), UDFToLong(round(ds_hll_estimate(_col1))) (type: bigint)
                   outputColumnNames: _col0, _col1, _col2
                   Statistics: Num rows: 2 Data size: 402 Basic stats: COMPLETE 
Column stats: COMPLETE
                   File Output Operator
@@ -510,7 +510,7 @@ STAGE PLANS:
                   alias: default.mv_1
                   Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToLong(ds_hll_estimate(_c1)) (type: bigint)
+                    expressions: 'rewrite; mv matching' (type: string), 
category (type: char(1)), UDFToLong(round(ds_hll_estimate(_c1))) (type: bigint)
                     outputColumnNames: _col0, _col1, _col2
                     Statistics: Num rows: 2 Data size: 394 Basic stats: 
COMPLETE Column stats: COMPLETE
                     File Output Operator
@@ -596,7 +596,7 @@ STAGE PLANS:
                 outputColumnNames: _col0
                 Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: 'rewrite;mv matching with rollup' (type: 
string), UDFToLong(ds_hll_estimate(_col0)) (type: bigint)
+                  expressions: 'rewrite;mv matching with rollup' (type: 
string), UDFToLong(round(ds_hll_estimate(_col0))) (type: bigint)
                   outputColumnNames: _col0, _col1
                   Statistics: Num rows: 1 Data size: 123 Basic stats: COMPLETE 
Column stats: COMPLETE
                   File Output Operator
diff --git 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_safety.q.out
 
b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_safety.q.out
index 959edfc..126aee6 100644
--- 
a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_safety.q.out
+++ 
b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_safety.q.out
@@ -399,7 +399,7 @@ STAGE PLANS:
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: 'rewritten;mv not used' (type: string), _col0 
(type: char(1)), UDFToLong(ds_hll_estimate(_col1)) (type: bigint)
+                  expressions: 'rewritten;mv not used' (type: string), _col0 
(type: char(1)), UDFToLong(round(ds_hll_estimate(_col1))) (type: bigint)
                   outputColumnNames: _col0, _col1, _col2
                   Statistics: Num rows: 2 Data size: 396 Basic stats: COMPLETE 
Column stats: COMPLETE
                   File Output Operator
diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out 
b/ql/src/test/results/clientpositive/llap/sketches_rewrite_count_distinct.q.out
similarity index 99%
copy from ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out
copy to 
ql/src/test/results/clientpositive/llap/sketches_rewrite_count_distinct.q.out
index dedcff9..8c556ef 100644
--- a/ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out
+++ 
b/ql/src/test/results/clientpositive/llap/sketches_rewrite_count_distinct.q.out
@@ -81,7 +81,7 @@ STAGE PLANS:
                 outputColumnNames: _col0, _col1
                 Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: _col0 (type: char(1)), 
UDFToLong(ds_hll_estimate(_col1)) (type: bigint)
+                  expressions: _col0 (type: char(1)), 
UDFToLong(round(ds_hll_estimate(_col1))) (type: bigint)
                   outputColumnNames: _col0, _col1
                   Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE 
Column stats: COMPLETE
                   File Output Operator
diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out 
b/ql/src/test/results/clientpositive/llap/sketches_rewrite_percentile_disc.q.out
similarity index 62%
rename from ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out
rename to 
ql/src/test/results/clientpositive/llap/sketches_rewrite_percentile_disc.q.out
index dedcff9..d6c6911 100644
--- a/ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out
+++ 
b/ql/src/test/results/clientpositive/llap/sketches_rewrite_percentile_disc.q.out
@@ -24,13 +24,22 @@ POSTHOOK: Input: _dummy_database@_dummy_table
 POSTHOOK: Output: default@sketch_input
 POSTHOOK: Lineage: sketch_input.category SCRIPT []
 POSTHOOK: Lineage: sketch_input.id SCRIPT []
+PREHOOK: query: select percentile_disc(0.3) within group(order by id) from 
sketch_input
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sketch_input
+#### A masked pattern was here ####
+POSTHOOK: query: select percentile_disc(0.3) within group(order by id) from 
sketch_input
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@sketch_input
+#### A masked pattern was here ####
+6.0
 PREHOOK: query: explain
-select category, count(distinct id) from sketch_input group by category
+select percentile_disc(0.3) within group(order by id) from sketch_input
 PREHOOK: type: QUERY
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
 POSTHOOK: query: explain
-select category, count(distinct id) from sketch_input group by category
+select percentile_disc(0.3) within group(order by id) from sketch_input
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
@@ -43,50 +52,46 @@ STAGE PLANS:
     Tez
 #### A masked pattern was here ####
       Edges:
-        Reducer 2 <- Map 1 (SIMPLE_EDGE)
+        Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE)
 #### A masked pattern was here ####
       Vertices:
         Map 1 
             Map Operator Tree:
                 TableScan
                   alias: sketch_input
-                  Statistics: Num rows: 22 Data size: 1958 Basic stats: 
COMPLETE Column stats: COMPLETE
+                  Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE 
Column stats: COMPLETE
                   Select Operator
-                    expressions: id (type: int), category (type: char(1))
-                    outputColumnNames: id, category
-                    Statistics: Num rows: 22 Data size: 1958 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    expressions: UDFToFloat(id) (type: float)
+                    outputColumnNames: _col0
+                    Statistics: Num rows: 22 Data size: 88 Basic stats: 
COMPLETE Column stats: COMPLETE
                     Group By Operator
-                      aggregations: ds_hll_sketch(id)
-                      keys: category (type: char(1))
-                      minReductionHashAggr: 0.9090909
+                      aggregations: ds_kll_sketch(_col0)
+                      minReductionHashAggr: 0.95454544
                       mode: hash
-                      outputColumnNames: _col0, _col1
-                      Statistics: Num rows: 2 Data size: 946 Basic stats: 
COMPLETE Column stats: COMPLETE
+                      outputColumnNames: _col0
+                      Statistics: Num rows: 1 Data size: 144 Basic stats: 
COMPLETE Column stats: COMPLETE
                       Reduce Output Operator
-                        key expressions: _col0 (type: char(1))
-                        null sort order: z
-                        sort order: +
-                        Map-reduce partition columns: _col0 (type: char(1))
-                        Statistics: Num rows: 2 Data size: 946 Basic stats: 
COMPLETE Column stats: COMPLETE
-                        value expressions: _col1 (type: 
struct<lgk:int,type:string,sketch:binary>)
+                        null sort order: 
+                        sort order: 
+                        Statistics: Num rows: 1 Data size: 144 Basic stats: 
COMPLETE Column stats: COMPLETE
+                        value expressions: _col0 (type: binary)
             Execution mode: llap
             LLAP IO: may be used (ACID table)
         Reducer 2 
             Execution mode: llap
             Reduce Operator Tree:
               Group By Operator
-                aggregations: ds_hll_sketch(VALUE._col0)
-                keys: KEY._col0 (type: char(1))
+                aggregations: ds_kll_sketch(VALUE._col0)
                 mode: mergepartial
-                outputColumnNames: _col0, _col1
-                Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE 
Column stats: COMPLETE
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE 
Column stats: COMPLETE
                 Select Operator
-                  expressions: _col0 (type: char(1)), 
UDFToLong(ds_hll_estimate(_col1)) (type: bigint)
-                  outputColumnNames: _col0, _col1
-                  Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE 
Column stats: COMPLETE
+                  expressions: UDFToDouble(ds_kll_quantile(_col0, 0.3)) (type: 
double)
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: COMPLETE
                   File Output Operator
                     compressed: false
-                    Statistics: Num rows: 2 Data size: 186 Basic stats: 
COMPLETE Column stats: COMPLETE
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE 
Column stats: COMPLETE
                     table:
                         input format: 
org.apache.hadoop.mapred.SequenceFileInputFormat
                         output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
@@ -98,13 +103,12 @@ STAGE PLANS:
       Processor Tree:
         ListSink
 
-PREHOOK: query: select category, count(distinct id) from sketch_input group by 
category
+PREHOOK: query: select percentile_disc(0.3) within group(order by id) from 
sketch_input
 PREHOOK: type: QUERY
 PREHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-POSTHOOK: query: select category, count(distinct id) from sketch_input group 
by category
+POSTHOOK: query: select percentile_disc(0.3) within group(order by id) from 
sketch_input
 POSTHOOK: type: QUERY
 POSTHOOK: Input: default@sketch_input
 #### A masked pattern was here ####
-a      10
-b      10
+6.0

[hive] branch master updated: HIVE-23434: Add option to rewrite PERCENTILE_DISC to sketch functions (Zoltan Haindrich reviewed by Jesus Camacho Rodriguez)

Reply via email to