This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new dc6e477 HIVE-23031: Add option to enable transparent rewrite of count(distinct) into sketch functions (Zoltan Haindrich reviewed by Jesus Camacho Rodriguez) dc6e477 is described below commit dc6e4771ba2160263490b0fc708da51f1a8c628d Author: Zoltan Haindrich <k...@rxd.hu> AuthorDate: Thu Apr 30 16:02:27 2020 +0000 HIVE-23031: Add option to enable transparent rewrite of count(distinct) into sketch functions (Zoltan Haindrich reviewed by Jesus Camacho Rodriguez) Signed-off-by: Zoltan Haindrich <zhaindr...@cloudera.com> --- .../java/org/apache/hadoop/hive/conf/HiveConf.java | 13 + .../test/resources/testconfiguration.properties | 2 + .../hadoop/hive/ql/exec/DataSketchesFunctions.java | 110 +++- .../HiveRewriteCountDistinctToDataSketches.java | 175 ++++++ .../hadoop/hive/ql/parse/CalcitePlanner.java | 11 +- .../sketches_materialized_view_rollup.q | 7 +- .../sketches_materialized_view_rollup2.q | 54 ++ .../test/queries/clientpositive/sketches_rewrite.q | 19 + .../llap/sketches_materialized_view_rollup2.q.out | 634 +++++++++++++++++++++ .../clientpositive/llap/sketches_rewrite.q.out | 110 ++++ 10 files changed, 1109 insertions(+), 26 deletions(-) diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index b3faf05..829791e 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2465,6 +2465,19 @@ public class HiveConf extends Configuration { "If the number of references to a CTE clause exceeds this threshold, Hive will materialize it\n" + "before executing the main query block. 
-1 will disable this feature."), + HIVE_OPTIMIZE_BI_ENABLED("hive.optimize.bi.enabled", false, + "Enables query rewrites based on approximate functions (sketches)."), + + HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED("hive.optimize.bi.rewrite.countdistinct.enabled", + true, + "Enables rewriting COUNT(DISTINCT(X)) queries to use sketch functions."), + + HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH( + "hive.optimize.bi.rewrite.countdistinct.sketch", "hll", + new StringSet("hll"), + "Defines which sketch type to use when rewriting COUNT(DISTINCT(X)) expressions. " + + "Distinct counting can be done with: hll"), + + // Statistics HIVE_STATS_ESTIMATE_STATS("hive.stats.estimate", true, "Estimate statistics in absence of statistics."), diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 48ecc35..c966392 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -820,7 +820,9 @@ minillaplocal.query.files=\ schq_ingest.q,\ sketches_hll.q,\ sketches_theta.q,\ + sketches_rewrite.q,\ sketches_materialized_view_rollup.q,\ + sketches_materialized_view_rollup2.q,\ table_access_keys_stats.q,\ temp_table_llap_partitioned.q,\ tez_bmj_schema_evolution.q,\ diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java index eec90c6..8865380 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java @@ -18,21 +18,28 @@ package org.apache.hadoop.hive.ql.exec; +import java.lang.reflect.Method; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; + +import org.apache.calcite.jdbc.JavaTypeFactoryImpl; +import 
org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeImpl; import org.apache.calcite.rel.type.RelProtoDataType; import org.apache.calcite.sql.SqlFunction; +import org.apache.calcite.sql.SqlFunctionCategory; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.type.InferTypes; import org.apache.calcite.sql.type.OperandTypes; import org.apache.calcite.sql.type.ReturnTypes; import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveTypeSystemImpl; import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveMergeableAggregate; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveSqlFunction; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver2; import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; import org.apache.hive.plugin.api.HiveUDFPlugin; @@ -48,9 +55,9 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { private static final String DATASKETCHES_PREFIX = "ds"; - private static final String DATA_TO_SKETCH = "sketch"; + public static final String DATA_TO_SKETCH = "sketch"; + public static final String SKETCH_TO_ESTIMATE = "estimate"; private static final String SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS = "estimate_bounds"; - private static final String SKETCH_TO_ESTIMATE = "estimate"; private static final String SKETCH_TO_STRING = "stringify"; private static final String UNION_SKETCH = "union"; private static final String UNION_SKETCH1 = "union_f"; @@ -73,12 +80,12 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { private static final String SKETCH_TO_VARIANCES = "variances"; private static final String SKETCH_TO_PERCENTILE = "percentile"; - private final List<SketchDescriptor> sketchClasses; + private final Map<String, SketchDescriptor> sketchClasses; private final ArrayList<UDFDescriptor> descriptors; private DataSketchesFunctions() { - this.sketchClasses = new ArrayList<SketchDescriptor>(); - 
this.descriptors = new ArrayList<HiveUDFPlugin.UDFDescriptor>(); + this.sketchClasses = new HashMap<>(); + this.descriptors = new ArrayList<>(); registerHll(); registerCpc(); registerKll(); @@ -96,19 +103,31 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { return descriptors; } + public SketchFunctionDescriptor getSketchFunction(String className, String function) { + if (!sketchClasses.containsKey(className)) { + throw new IllegalArgumentException(String.format("Sketch-class '%s' doesn't exist", className)); + } + SketchDescriptor sc = sketchClasses.get(className); + if (!sc.fnMap.containsKey(function)) { + throw new IllegalArgumentException(String.format("The Sketch-class '%s' doesn't have a '%s' method", className, function)); + } + return sketchClasses.get(className).fnMap.get(function); + } + private void buildDescritors() { - for (SketchDescriptor sketchDescriptor : sketchClasses) { + for (SketchDescriptor sketchDescriptor : sketchClasses.values()) { descriptors.addAll(sketchDescriptor.fnMap.values()); } } private void buildCalciteFns() { - for (SketchDescriptor sd : sketchClasses) { + for (SketchDescriptor sd : sketchClasses.values()) { // Mergeability is exposed to Calcite, which enables using it during rollup. 
RelProtoDataType sketchType = RelDataTypeImpl.proto(SqlTypeName.BINARY, true); SketchFunctionDescriptor sketchSFD = sd.fnMap.get(DATA_TO_SKETCH); SketchFunctionDescriptor unionSFD = sd.fnMap.get(UNION_SKETCH); + SketchFunctionDescriptor estimateSFD = sd.fnMap.get(SKETCH_TO_ESTIMATE); if (sketchSFD == null || unionSFD == null) { continue; @@ -128,14 +147,27 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { OperandTypes.family(), unionFn); + unionSFD.setCalciteFunction(unionFn); sketchSFD.setCalciteFunction(sketchFn); + if (estimateSFD != null && estimateSFD.getReturnRelDataType().isPresent()) { + + SqlFunction estimateFn = new HiveSqlFunction(estimateSFD.name, + SqlKind.OTHER_FUNCTION, + ReturnTypes.explicit(estimateSFD.getReturnRelDataType().get().getSqlTypeName()), + InferTypes.ANY_NULLABLE, + OperandTypes.family(), + SqlFunctionCategory.USER_DEFINED_FUNCTION, + true, + false); + + estimateSFD.setCalciteFunction(estimateFn); + } } } - private void registerHiveFunctionsInternal(Registry system) { - for (SketchDescriptor sketchDescriptor : sketchClasses) { + for (SketchDescriptor sketchDescriptor : sketchClasses.values()) { Collection<SketchFunctionDescriptor> functions = sketchDescriptor.fnMap.values(); for (SketchFunctionDescriptor fn : functions) { if (UDF.class.isAssignableFrom(fn.udfClass)) { @@ -165,6 +197,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { String name; Class<?> udfClass; private SqlFunction calciteFunction; + private Class<?> returnType; public SketchFunctionDescriptor(String name, Class<?> udfClass) { this.name = name; @@ -181,6 +214,19 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { return name; } + public Optional<RelDataType> getReturnRelDataType() { + if (returnType == null) { + return Optional.empty(); + } else { + JavaTypeFactoryImpl typeFactory = new JavaTypeFactoryImpl(new HiveTypeSystemImpl()); + return Optional.of(typeFactory.createType(returnType)); + } + } + + 
public void setReturnType(Class<?> returnType) { + this.returnType = returnType; + } + @Override public Optional<SqlFunction> getCalciteFunction() { return Optional.ofNullable(calciteFunction); @@ -189,6 +235,11 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { public void setCalciteFunction(SqlFunction calciteFunction) { this.calciteFunction = calciteFunction; } + + @Override + public String toString() { + return getClass().getCanonicalName() + "[" + name + "]"; + } } private static class SketchDescriptor { @@ -201,7 +252,28 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { } private void register(String name, Class<?> clazz) { - fnMap.put(name, new SketchFunctionDescriptor(functionPrefix + name, clazz)); + SketchFunctionDescriptor value = new SketchFunctionDescriptor(functionPrefix + name, clazz); + if (UDF.class.isAssignableFrom(clazz)) { + Optional<Method> evaluateMethod = getEvaluateMethod(clazz); + if (evaluateMethod.isPresent()) { + value.setReturnType(evaluateMethod.get().getReturnType()); + } + } + fnMap.put(name, value); + } + + private Optional<Method> getEvaluateMethod(Class<?> clazz) { + List<Method> evaluateMethods = new ArrayList<Method>(); + for (Method method : clazz.getMethods()) { + if ("evaluate".equals(method.getName())) { + evaluateMethods.add(method); + } + } + if (evaluateMethods.size() > 0) { + return Optional.of(evaluateMethods.get(0)); + } else { + return Optional.empty(); + } } } @@ -214,7 +286,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { sd.register(SKETCH_TO_STRING, org.apache.datasketches.hive.hll.SketchToStringUDF.class); sd.register(UNION_SKETCH1, org.apache.datasketches.hive.hll.UnionSketchUDF.class); sd.register(UNION_SKETCH, org.apache.datasketches.hive.hll.UnionSketchUDAF.class); - sketchClasses.add(sd); + sketchClasses.put("hll", sd); } private void registerCpc() { @@ -228,7 +300,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { 
sd.register(SKETCH_TO_STRING, org.apache.datasketches.hive.cpc.SketchToStringUDF.class); sd.register(UNION_SKETCH1, org.apache.datasketches.hive.cpc.UnionSketchUDF.class); sd.register(UNION_SKETCH, org.apache.datasketches.hive.cpc.UnionSketchUDAF.class); - sketchClasses.add(sd); + sketchClasses.put("cpc", sd); } private void registerKll() { @@ -244,7 +316,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { sd.register(GET_QUANTILES, org.apache.datasketches.hive.kll.GetQuantilesUDF.class); sd.register(GET_QUANTILE, org.apache.datasketches.hive.kll.GetQuantileUDF.class); sd.register(GET_RANK, org.apache.datasketches.hive.kll.GetRankUDF.class); - sketchClasses.add(sd); + sketchClasses.put("kll", sd); } private void registerTheta() { @@ -258,7 +330,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { sd.register(INTERSECT_SKETCH, org.apache.datasketches.hive.theta.IntersectSketchUDAF.class); sd.register(SKETCH_TO_ESTIMATE, org.apache.datasketches.hive.theta.EstimateSketchUDF.class); sd.register(EXCLUDE_SKETCH, org.apache.datasketches.hive.theta.ExcludeSketchUDF.class); - sketchClasses.add(sd); + sketchClasses.put("theta", sd); } @@ -284,7 +356,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToQuantilesSketchUDF.class); sd.register(SKETCH_TO_VALUES, org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToValuesUDTF.class); sd.register(SKETCH_TO_VARIANCES, org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToVariancesUDF.class); - sketchClasses.add(sd); + sketchClasses.put("tuple_arrayofdouble", sd); } private void registerTupleDoubleSummary() { @@ -294,7 +366,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { sd.register(UNION_SKETCH, org.apache.datasketches.hive.tuple.UnionDoubleSummarySketchUDAF.class); sd.register(SKETCH_TO_ESTIMATE, org.apache.datasketches.hive.tuple.DoubleSummarySketchToEstimatesUDF.class); 
sd.register(SKETCH_TO_PERCENTILE, org.apache.datasketches.hive.tuple.DoubleSummarySketchToPercentileUDF.class); - sketchClasses.add(sd); + sketchClasses.put("tuple_doublesummary", sd); } private void registerQuantiles() { @@ -312,7 +384,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { sd.register(UNION_SKETCH, org.apache.datasketches.hive.frequencies.UnionStringsSketchUDAF.class); sd.register(GET_FREQUENT_ITEMS, org.apache.datasketches.hive.frequencies.GetFrequentItemsFromStringsSketchUDTF.class); - sketchClasses.add(sd); + sketchClasses.put("freq", sd); } private void registerQuantilesString() { @@ -327,7 +399,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { sd.register(GET_PMF, org.apache.datasketches.hive.quantiles.GetPmfFromStringsSketchUDF.class); sd.register(GET_QUANTILE, org.apache.datasketches.hive.quantiles.GetQuantileFromStringsSketchUDF.class); sd.register(GET_QUANTILES, org.apache.datasketches.hive.quantiles.GetQuantilesFromStringsSketchUDF.class); - sketchClasses.add(sd); + sketchClasses.put("quantile_strings", sd); } private void registerQuantilesDoubles() { @@ -342,7 +414,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { sd.register(GET_PMF, org.apache.datasketches.hive.quantiles.GetPmfFromDoublesSketchUDF.class); sd.register(GET_QUANTILE, org.apache.datasketches.hive.quantiles.GetQuantileFromDoublesSketchUDF.class); sd.register(GET_QUANTILES, org.apache.datasketches.hive.quantiles.GetQuantilesFromDoublesSketchUDF.class); - sketchClasses.add(sd); + sketchClasses.put("quantile_doubles", sd); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java new file mode 100644 index 0000000..c23e2c4 --- /dev/null +++ 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteCountDistinctToDataSketches.java @@ -0,0 +1,175 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to you under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.optimizer.calcite.rules; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import org.apache.calcite.plan.RelOptRule; +import org.apache.calcite.plan.RelOptRuleCall; +import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelNode; +import org.apache.calcite.rel.core.Aggregate; +import org.apache.calcite.rel.core.AggregateCall; +import org.apache.calcite.rel.core.RelFactories.ProjectFactory; +import org.apache.calcite.rel.type.RelDataType; +import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexNode; +import org.apache.calcite.sql.SqlAggFunction; +import org.apache.calcite.sql.SqlOperator; +import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories; +import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; +import org.apache.hive.plugin.api.HiveUDFPlugin.UDFDescriptor; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import 
com.google.common.collect.ImmutableList; + +/** + * This rule could rewrite {@code count(distinct(x))} calls to be calculated using sketch based functions. + * + * The transformation here works on Aggregate nodes; the operations done are the following: + * + * 1. Identify candidate {@code count(distinct)} aggregate calls + * 2. A new Aggregate is created in which the aggregation is done by the sketch function + * 3. A new Project is inserted on top of the Aggregate; which unwraps the resulting + * count-distinct estimation from the sketch representation + */ +public final class HiveRewriteCountDistinctToDataSketches extends RelOptRule { + + protected static final Logger LOG = LoggerFactory.getLogger(HiveRewriteCountDistinctToDataSketches.class); + private final String sketchClass; + private final ProjectFactory projectFactory; + + public HiveRewriteCountDistinctToDataSketches(String sketchClass) { + super(operand(HiveAggregate.class, any())); + this.sketchClass = sketchClass; + projectFactory = HiveRelFactories.HIVE_PROJECT_FACTORY; + } + + @Override + public void onMatch(RelOptRuleCall call) { + final Aggregate aggregate = call.rel(0); + + if (aggregate.getGroupSets().size() != 1) { + // not yet supported + return; + } + + List<AggregateCall> newAggCalls = new ArrayList<AggregateCall>(); + + VBuilder vb = new VBuilder(aggregate); + + if (aggregate.getAggCallList().equals(vb.newAggCalls)) { + // rule didn't make any changes + return; + } + + newAggCalls = vb.newAggCalls; + RelNode newAgg = aggregate.copy(aggregate.getTraitSet(), aggregate.getInput(), aggregate.getGroupSet(), + aggregate.getGroupSets(), newAggCalls); + + RelNode newProject = projectFactory.createProject(newAgg, vb.newProjects, aggregate.getRowType().getFieldNames()); + + call.transformTo(newProject); + return; + } + + /** + * Helper class to help in building a new Aggregate and Project. 
+ */ + // NOTE: methods in this class are not re-entrant; drop-to-frame to constructor during debugging + class VBuilder { + + private Aggregate aggregate; + private List<AggregateCall> newAggCalls; + private List<RexNode> newProjects; + private final RexBuilder rexBuilder; + + public VBuilder(Aggregate aggregate) { + this.aggregate = aggregate; + newAggCalls = new ArrayList<AggregateCall>(); + newProjects = new ArrayList<RexNode>(); + rexBuilder = aggregate.getCluster().getRexBuilder(); + + // add non-aggregated fields - as identity projections + addGroupFields(); + + for (AggregateCall aggCall : aggregate.getAggCallList()) { + processAggCall(aggCall); + } + } + + private void addGroupFields() { + for (int i = 0; i < aggregate.getGroupCount(); i++) { + newProjects.add(rexBuilder.makeInputRef(aggregate, i)); + } + } + + private void processAggCall(AggregateCall aggCall) { + if (isSimpleCountDistinct(aggCall)) { + rewriteCountDistinct(aggCall); + return; + } + appendAggCall(aggCall, null); + } + + private void appendAggCall(AggregateCall aggCall, SqlOperator projectOperator) { + RelDataType origType = aggregate.getRowType().getFieldList().get(newProjects.size()).getType(); + RexNode projRex = rexBuilder.makeInputRef(aggCall.getType(), newProjects.size()); + if (projectOperator != null) { + projRex = rexBuilder.makeCall(projectOperator, ImmutableList.of(projRex)); + projRex = rexBuilder.makeCast(origType, projRex); + } + newAggCalls.add(aggCall); + newProjects.add(projRex); + } + + private boolean isSimpleCountDistinct(AggregateCall aggCall) { + return aggCall.isDistinct() && aggCall.getArgList().size() == 1 + && aggCall.getAggregation().getName().equalsIgnoreCase("count") && !aggCall.hasFilter(); + } + + private void rewriteCountDistinct(AggregateCall aggCall) { + SqlAggFunction aggFunction = (SqlAggFunction) getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH); + boolean distinct = false; + boolean approximate = true; + boolean ignoreNulls = aggCall.ignoreNulls(); 
+ List<Integer> argList = aggCall.getArgList(); + int filterArg = aggCall.filterArg; + RelCollation collation = aggCall.getCollation(); + int groupCount = aggregate.getGroupCount(); + RelNode input = aggregate.getInput(); + RelDataType type = rexBuilder.deriveReturnType(aggFunction, Collections.emptyList()); + String name = aggFunction.getName(); + + AggregateCall ret = AggregateCall.create(aggFunction, distinct, approximate, ignoreNulls, argList, filterArg, + collation, groupCount, input, type, name); + + appendAggCall(ret, getSqlOperator(DataSketchesFunctions.SKETCH_TO_ESTIMATE)); + } + + private SqlOperator getSqlOperator(String fnName) { + UDFDescriptor fn = DataSketchesFunctions.INSTANCE.getSketchFunction(sketchClass, fnName); + if (!fn.getCalciteFunction().isPresent()) { + throw new RuntimeException(fn.toString() + " doesn't have a Calcite function associated with it"); + } + return fn.getCalciteFunction().get(); + } + } +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 1cfd04e..7b34e91 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -236,6 +236,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelDecorrelator; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRelFieldTrimmer; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRemoveGBYSemiJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRemoveSqCountCheck; +import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRewriteCountDistinctToDataSketches; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveRulesRegistry; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSemiJoinRule; import org.apache.hadoop.hive.ql.optimizer.calcite.rules.HiveSortJoinReduceRule; @@ -1968,6 +1969,14 @@ public class CalcitePlanner extends 
SemanticAnalyzer { HiveExceptRewriteRule.INSTANCE); //1. Distinct aggregate rewrite + if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_ENABLED)) { + // Rewrite to datasketches if enabled + if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED)) { + String sketchClass = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH); + generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, + new HiveRewriteCountDistinctToDataSketches(sketchClass)); + } + } // Run this optimization early, since it is expanding the operator pipeline. if (!conf.getVar(HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr") && conf.getBoolVar(HiveConf.ConfVars.HIVEOPTIMIZEDISTINCTREWRITE)) { @@ -2278,7 +2287,7 @@ public class CalcitePlanner extends SemanticAnalyzer { RelMetadataQuery.THREAD_PROVIDERS.set(JaninoRelMetadataProvider.DEFAULT); RelMetadataQuery mq = RelMetadataQuery.instance(); RelOptCost costOriginalPlan = mq.getCumulativeCost(calcitePreMVRewritingPlan); - final double factorSelectivity = (double) HiveConf.getFloatVar( + final double factorSelectivity = HiveConf.getFloatVar( conf, HiveConf.ConfVars.HIVE_MATERIALIZED_VIEW_REBUILD_INCREMENTAL_FACTOR); RelOptCost costRebuildPlan = mq.getCumulativeCost(basePlan).multiplyBy(factorSelectivity); if (costOriginalPlan.isLe(costRebuildPlan)) { diff --git a/ql/src/test/queries/clientpositive/sketches_materialized_view_rollup.q b/ql/src/test/queries/clientpositive/sketches_materialized_view_rollup.q index e1a2054..3beea62 100644 --- a/ql/src/test/queries/clientpositive/sketches_materialized_view_rollup.q +++ b/ql/src/test/queries/clientpositive/sketches_materialized_view_rollup.q @@ -1,9 +1,4 @@ - -set hive.support.concurrency=true; -set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; -set hive.strict.checks.cartesian.product=false; -set hive.stats.fetch.column.stats=true; -set hive.materializedview.rewriting=true; +--! 
qt:transactional set hive.fetch.task.conversion=none; create table sketch_input (id int, category char(1)) diff --git a/ql/src/test/queries/clientpositive/sketches_materialized_view_rollup2.q b/ql/src/test/queries/clientpositive/sketches_materialized_view_rollup2.q new file mode 100644 index 0000000..39c7ac4 --- /dev/null +++ b/ql/src/test/queries/clientpositive/sketches_materialized_view_rollup2.q @@ -0,0 +1,54 @@ +--! qt:transactional +set hive.fetch.task.conversion=none; + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +-- create an mv for the intermediate results +create materialized view mv_1 as + select category, ds_hll_sketch(id),count(id) from sketch_input group by category; + +-- bi mode on +set hive.optimize.bi.enabled=true; + +explain +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category; +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category; + +set hive.optimize.bi.enabled=false; + +explain +select 'no rewrite; no mv usage', category, count(distinct id) from sketch_input group by category; +select 'no rewrite; no mv usage', category, count(distinct id) from sketch_input group by category; + +set hive.optimize.bi.enabled=true; + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +explain +select 'rewrite; but no mv usage', category, count(distinct id) from sketch_input group by category; +select 'rewrite; but no mv usage', 
category, count(distinct id) from sketch_input group by category; + +explain +alter materialized view mv_1 rebuild; +alter materialized view mv_1 rebuild; + +explain +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category; +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category; + +-- rewrite+mv matching with rollup +explain +select 'rewrite;mv matching with rollup',count(distinct id) from sketch_input; +select 'rewrite;mv matching with rollup',count(distinct id) from sketch_input; + +drop materialized view mv_1; diff --git a/ql/src/test/queries/clientpositive/sketches_rewrite.q b/ql/src/test/queries/clientpositive/sketches_rewrite.q new file mode 100644 index 0000000..0420d62 --- /dev/null +++ b/ql/src/test/queries/clientpositive/sketches_rewrite.q @@ -0,0 +1,19 @@ +--! qt:transactional + +set hive.optimize.bi.enabled=true; + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +-- see if rewrite happens +explain +select category, count(distinct id) from sketch_input group by category; + +select category, count(distinct id) from sketch_input group by category; + diff --git a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out new file mode 100644 index 0000000..e7b3c0e --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_rollup2.q.out @@ -0,0 +1,634 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: 
Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: create materialized view mv_1 as + select category, ds_hll_sketch(id),count(id) from sketch_input group by category +PREHOOK: type: CREATE_MATERIALIZED_VIEW +PREHOOK: Input: default@sketch_input +PREHOOK: Output: database:default +PREHOOK: Output: default@mv_1 +POSTHOOK: query: create materialized view mv_1 as + select category, ds_hll_sketch(id),count(id) from sketch_input group by category +POSTHOOK: type: CREATE_MATERIALIZED_VIEW +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: database:default +POSTHOOK: Output: default@mv_1 +PREHOOK: query: explain +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: 
query: explain +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'rewrite; mv matching' (type: string), category (type: char(1)), UDFToLong(ds_hll_estimate(_c1)) (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 394 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 394 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; mv matching a 10 +rewrite; mv matching b 10 +PREHOOK: query: explain +select 'no rewrite; no mv usage', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY 
+PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'no rewrite; no mv usage', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 1958 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), category (type: char(1)) + outputColumnNames: id, category + Statistics: Num rows: 22 Data size: 1958 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + keys: category (type: char(1)), id (type: int) + minReductionHashAggr: 0.3181818 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 979 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)), _col1 (type: int) + null sort order: zz + sort order: ++ + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 11 Data size: 979 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: vectorized, llap + Reduce Operator Tree: + Group By Operator + keys: KEY._col0 (type: char(1)), KEY._col1 (type: int) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 979 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: int), _col0 (type: char(1)) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 11 Data size: 979 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: 
count(_col0) + keys: _col1 (type: char(1)) + mode: complete + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'no rewrite; no mv usage' (type: string), _col0 (type: char(1)), _col1 (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 400 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 400 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'no rewrite; no mv usage', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'no rewrite; no mv usage', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +no rewrite; no mv usage a 10 +no rewrite; no mv usage b 10 +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 
'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: explain +select 'rewrite; but no mv usage', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; but no mv usage', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 44 Data size: 3916 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), category (type: char(1)) + outputColumnNames: id, category + Statistics: Num rows: 44 Data size: 3916 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_hll_sketch(id) + keys: category (type: char(1)) + minReductionHashAggr: 0.95454544 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: struct<lgk:int,type:string,sketch:binary>) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: 
ds_hll_sketch(VALUE._col0) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'rewrite; but no mv usage' (type: string), _col0 (type: char(1)), UDFToLong(ds_hll_estimate(_col1)) (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 402 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 402 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'rewrite; but no mv usage', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; but no mv usage', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; but no mv usage a 10 +rewrite; but no mv usage b 10 +PREHOOK: query: explain +alter materialized view mv_1 rebuild +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@mv_1 +POSTHOOK: query: explain +alter materialized view mv_1 rebuild +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@mv_1 +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### 
A masked pattern was here #### + Edges: + Map 6 <- Union 3 (CONTAINS) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Union 3 (CONTAINS) + Reducer 4 <- Union 3 (SIMPLE_EDGE) + Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + filterExpr: (ROW__ID.writeid > 1L) (type: boolean) + Statistics: Num rows: 44 Data size: 3916 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (ROW__ID.writeid > 1L) (type: boolean) + Statistics: Num rows: 14 Data size: 1246 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), category (type: char(1)) + outputColumnNames: id, category + Statistics: Num rows: 14 Data size: 1246 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_hll_sketch(id), count(id) + keys: category (type: char(1)) + minReductionHashAggr: 0.85714287 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 962 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 962 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: struct<lgk:int,type:string,sketch:binary>), _col2 (type: bigint) + Execution mode: llap + LLAP IO: may be used (ACID table) + Map 6 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 378 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: category (type: char(1)), _c1 (type: binary), _c2 (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 378 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_hll_union(_col1), sum(_col2) + keys: _col0 (type: char(1)) + minReductionHashAggr: 0.5 + mode: 
hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 962 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 962 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: struct<lgk:int,type:string,sketch:binary>), _col2 (type: bigint) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_hll_sketch(VALUE._col0), count(VALUE._col1) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_hll_union(_col1), sum(_col2) + keys: _col0 (type: char(1)) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 962 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 962 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: struct<lgk:int,type:string,sketch:binary>), _col2 (type: bigint) + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_hll_union(VALUE._col0), sum(VALUE._col1) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: char(1)), _col1 (type: binary), COALESCE(_col2,0L) (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE Column 
stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.mv_1 + Select Operator + expressions: _col0 (type: char(1)), _col1 (type: binary), _col2 (type: bigint) + outputColumnNames: category, _c1, _c2 + Statistics: Num rows: 2 Data size: 474 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: compute_stats(category, 'hll'), compute_stats(_c1, 'hll'), compute_stats(_c2, 'hll') + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 1152 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 1152 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: struct<columntype:string,maxlength:bigint,sumlength:bigint,count:bigint,countnulls:bigint,bitvector:binary>), _col1 (type: struct<columntype:string,maxlength:bigint,sumlength:bigint,count:bigint,countnulls:bigint>), _col2 (type: struct<columntype:string,min:bigint,max:bigint,countnulls:bigint,bitvector:binary>) + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: compute_stats(VALUE._col0), compute_stats(VALUE._col1), compute_stats(VALUE._col2) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 1 Data size: 1152 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 1152 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: 
org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Union 3 + Vertex: Union 3 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.mv_1 + + Stage: Stage-3 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: category, _c1, _c2 + Column Types: char(1), binary, bigint + Table: default.mv_1 + + Stage: Stage-4 + Materialized View Update + name: default.mv_1 + update creation metadata: true + +PREHOOK: query: alter materialized view mv_1 rebuild +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@mv_1 +POSTHOOK: query: alter materialized view mv_1 rebuild +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@mv_1 +POSTHOOK: Lineage: mv_1._c1 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), (mv_1)default.mv_1.FieldSchema(name:_c1, type:binary, comment:null), ] +POSTHOOK: Lineage: mv_1._c2 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), (mv_1)default.mv_1.FieldSchema(name:_c2, type:bigint, comment:null), ] +POSTHOOK: Lineage: mv_1.category EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:category, type:char(1), comment:null), (mv_1)default.mv_1.FieldSchema(name:category, type:char(1), comment:null), ] +PREHOOK: query: explain +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: 
Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 362 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'rewrite; mv matching' (type: string), category (type: char(1)), UDFToLong(ds_hll_estimate(_c1)) (type: bigint) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 2 Data size: 394 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 394 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized, llap + LLAP IO: all inputs + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; mv matching', category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; mv matching a 10 +rewrite; mv matching b 10 +PREHOOK: query: explain +select 'rewrite;mv matching with rollup',count(distinct id) from sketch_input +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite;mv matching with 
rollup',count(distinct id) from sketch_input +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _c1 (type: binary) + outputColumnNames: _c1 + Statistics: Num rows: 2 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_hll_union(_c1) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 388 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: struct<lgk:int,type:string,sketch:binary>) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_hll_union(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'rewrite;mv matching with rollup' (type: string), UDFToLong(ds_hll_estimate(_col0)) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 123 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 123 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: 
org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'rewrite;mv matching with rollup',count(distinct id) from sketch_input +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite;mv matching with rollup',count(distinct id) from sketch_input +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite;mv matching with rollup 15 +PREHOOK: query: drop materialized view mv_1 +PREHOOK: type: DROP_MATERIALIZED_VIEW +PREHOOK: Input: default@mv_1 +PREHOOK: Output: default@mv_1 +POSTHOOK: query: drop materialized view mv_1 +POSTHOOK: type: DROP_MATERIALIZED_VIEW +POSTHOOK: Input: default@mv_1 +POSTHOOK: Output: default@mv_1 diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out new file mode 100644 index 0000000..dedcff9 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite.q.out @@ -0,0 +1,110 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') 
+PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: explain +select category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 1958 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), category (type: char(1)) + outputColumnNames: id, category + Statistics: Num rows: 22 Data size: 1958 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_hll_sketch(id) + keys: category (type: char(1)) + minReductionHashAggr: 0.9090909 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: 
char(1)) + Statistics: Num rows: 2 Data size: 946 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: struct<lgk:int,type:string,sketch:binary>) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_hll_sketch(VALUE._col0) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: char(1)), UDFToLong(ds_hll_estimate(_col1)) (type: bigint) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 186 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select category, count(distinct id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select category, count(distinct id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +a 10 +b 10