This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 160f467 HIVE-23462: Add option to rewrite CUME_DIST to sketch functions (#1031) 160f467 is described below commit 160f467011a489bb27fdd17d228b9120e18f50e6 Author: Zoltan Haindrich <zhaindr...@cloudera.com> AuthorDate: Wed Jun 10 13:40:33 2020 +0200 HIVE-23462: Add option to rewrite CUME_DIST to sketch functions (#1031) --- .../java/org/apache/hadoop/hive/conf/HiveConf.java | 13 +- .../hadoop/hive/ql/exec/DataSketchesFunctions.java | 72 +- .../hive/ql/optimizer/calcite/HiveRelBuilder.java | 7 + .../rules/HiveRewriteToDataSketchesRules.java | 290 +++++- .../calcite/translator/SqlFunctionConverter.java | 3 +- .../hadoop/hive/ql/parse/CalcitePlanner.java | 13 +- .../hive/ql/parse/type/HiveFunctionHelper.java | 3 +- .../hive/ql/exec/TestDataSketchesFunctions.java | 38 + .../sketches_materialized_view_cume_dist.q | 54 + .../clientpositive/sketches_rewrite_cume_dist.q | 47 + .../sketches_rewrite_cume_dist_partition_by.q | 27 + .../clientpositive/llap/cbo_rp_windowing_2.q.out | 104 +- .../sketches_materialized_view_cume_dist.q.out | 1054 ++++++++++++++++++++ .../llap/sketches_rewrite_cume_dist.q.out | 775 ++++++++++++++ .../sketches_rewrite_cume_dist_partition_by.q.out | 258 +++++ 15 files changed, 2635 insertions(+), 123 deletions(-) diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 085ab4a..8cdb2eb 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -2495,19 +2495,22 @@ public class HiveConf extends Configuration { HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED("hive.optimize.bi.rewrite.countdistinct.enabled", true, "Enables to rewrite COUNT(DISTINCT(X)) queries to be rewritten to use sketch functions."), - HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH( - "hive.optimize.bi.rewrite.countdistinct.sketch", "hll", + HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH("hive.optimize.bi.rewrite.countdistinct.sketch", "hll", new StringSet("hll"), "Defines which sketch type to use when rewriting COUNT(DISTINCT(X)) expressions. " + "Distinct counting can be done with: hll"), HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_ENABLED("hive.optimize.bi.rewrite.percentile_disc.enabled", true, "Enables to rewrite PERCENTILE_DISC(X) queries to be rewritten to use sketch functions."), - HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH( - "hive.optimize.bi.rewrite.percentile_disc.sketch", "kll", + HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH("hive.optimize.bi.rewrite.percentile_disc.sketch", "kll", new StringSet("kll"), "Defines which sketch type to use when rewriting PERCENTILE_DISC expressions. Options: kll"), - + HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_ENABLED("hive.optimize.bi.rewrite.cume_dist.enabled", + true, + "Enables to rewrite CUME_DIST(X) queries to be rewritten to use sketch functions."), + HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_SKETCH("hive.optimize.bi.rewrite.cume_dist.sketch", "kll", + new StringSet("kll"), + "Defines which sketch type to use when rewriting CUME_DIST expressions. Options: kll"), // Statistics HIVE_STATS_ESTIMATE_STATS("hive.stats.estimate", true, diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java index cc48d5b..3a450a9 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java @@ -19,6 +19,8 @@ package org.apache.hadoop.hive.ql.exec; import java.lang.reflect.Method; +import java.lang.reflect.ParameterizedType; +import java.lang.reflect.Type; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -61,8 +63,8 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { private static final String SKETCH_TO_STRING = "stringify"; private static final String UNION_SKETCH = "union"; private static final String UNION_SKETCH1 = "union_f"; - private static final String GET_N = "n"; - private static final String GET_CDF = "cdf"; + public static final String GET_N = "n"; + public static final String GET_CDF = "cdf"; private static final String GET_PMF = "pmf"; private static final String GET_QUANTILES = "quantiles"; public static final String GET_QUANTILE = "quantile"; @@ -123,13 +125,17 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { private void buildCalciteFns() { for (SketchDescriptor sd : sketchClasses.values()) { + + registerAsHiveFunction(sd.fnMap.get(SKETCH_TO_ESTIMATE)); + registerAsHiveFunction(sd.fnMap.get(GET_QUANTILE)); + registerAsHiveFunction(sd.fnMap.get(GET_CDF)); + registerAsHiveFunction(sd.fnMap.get(GET_N)); + // Mergability is exposed to Calcite; which enables to use it during rollup. RelProtoDataType sketchType = RelDataTypeImpl.proto(SqlTypeName.BINARY, true); SketchFunctionDescriptor sketchSFD = sd.fnMap.get(DATA_TO_SKETCH); SketchFunctionDescriptor unionSFD = sd.fnMap.get(UNION_SKETCH); - SketchFunctionDescriptor estimateSFD = sd.fnMap.get(SKETCH_TO_ESTIMATE); - SketchFunctionDescriptor quantileSFD = sd.fnMap.get(GET_QUANTILE); if (sketchSFD == null || unionSFD == null) { continue; @@ -152,33 +158,24 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { unionSFD.setCalciteFunction(unionFn); sketchSFD.setCalciteFunction(sketchFn); - if (estimateSFD != null && estimateSFD.getReturnRelDataType().isPresent()) { - - SqlFunction estimateFn = new HiveSqlFunction(estimateSFD.name, - SqlKind.OTHER_FUNCTION, - ReturnTypes.explicit(estimateSFD.getReturnRelDataType().get().getSqlTypeName()), - InferTypes.ANY_NULLABLE, - OperandTypes.family(), - SqlFunctionCategory.USER_DEFINED_FUNCTION, - true, - false); - - estimateSFD.setCalciteFunction(estimateFn); - } - if (quantileSFD != null && quantileSFD.getReturnRelDataType().isPresent()) { - SqlFunction quantileFn = new HiveSqlFunction(quantileSFD.name, - SqlKind.OTHER_FUNCTION, - ReturnTypes.explicit(quantileSFD.getReturnRelDataType().get().getSqlTypeName()), - InferTypes.ANY_NULLABLE, - OperandTypes.family(), - SqlFunctionCategory.USER_DEFINED_FUNCTION, - true, - false); - quantileSFD.setCalciteFunction(quantileFn); + } + } - } + private void registerAsHiveFunction(SketchFunctionDescriptor sfd) { + if (sfd != null && sfd.getReturnRelDataType().isPresent()) { + SqlFunction cdfFn = + new HiveSqlFunction(sfd.name, + SqlKind.OTHER_FUNCTION, + ReturnTypes.explicit(sfd.getReturnRelDataType().get()), + InferTypes.ANY_NULLABLE, + OperandTypes.family(), + SqlFunctionCategory.USER_DEFINED_FUNCTION, + true, + false); + + sfd.setCalciteFunction(cdfFn); } } @@ -209,11 +206,11 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { } - private static class SketchFunctionDescriptor implements HiveUDFPlugin.UDFDescriptor { + static class SketchFunctionDescriptor implements HiveUDFPlugin.UDFDescriptor { String name; Class<?> udfClass; private SqlFunction calciteFunction; - private Class<?> returnType; + private Type returnType; public SketchFunctionDescriptor(String name, Class<?> udfClass) { this.name = name; @@ -235,12 +232,21 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { return Optional.empty(); } else { JavaTypeFactoryImpl typeFactory = new JavaTypeFactoryImpl(new HiveTypeSystemImpl()); + if (returnType instanceof ParameterizedType) { + ParameterizedType parameterizedType = (ParameterizedType) returnType; + if (parameterizedType.getRawType() == List.class) { + final RelDataType componentRelType = typeFactory.createType(parameterizedType.getActualTypeArguments()[0]); + return Optional + .of(typeFactory.createArrayType(typeFactory.createTypeWithNullability(componentRelType, true), -1)); + } + return Optional.empty(); + } return Optional.of(typeFactory.createType(returnType)); } } - public void setReturnType(Class<?> returnType) { - this.returnType = returnType; + public void setReturnType(Type type) { + this.returnType = type; } @Override @@ -272,7 +278,7 @@ public final class DataSketchesFunctions implements HiveUDFPlugin { if (UDF.class.isAssignableFrom(clazz)) { Optional<Method> evaluateMethod = getEvaluateMethod(clazz); if (evaluateMethod.isPresent()) { - value.setReturnType(evaluateMethod.get().getReturnType()); + value.setReturnType(evaluateMethod.get().getGenericReturnType()); } } fnMap.put(name, value); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java index 184a026..4e184dc 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/HiveRelBuilder.java @@ -42,6 +42,7 @@ import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlSumAggFuncti import org.apache.hadoop.hive.ql.optimizer.calcite.functions.HiveSqlSumEmptyIsZeroAggFunction; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFloorDate; +import com.google.common.collect.ImmutableList; /** * Builder for relational expressions in Hive. @@ -165,4 +166,10 @@ public class HiveRelBuilder extends RelBuilder { return false; } + /** Make the method visible */ + @Override + public AggCall aggregateCall(SqlAggFunction aggFunction, boolean distinct, boolean approximate, boolean ignoreNulls, + RexNode filter, ImmutableList<RexNode> orderKeys, String alias, ImmutableList<RexNode> operands) { + return super.aggregateCall(aggFunction, distinct, approximate, ignoreNulls, filter, orderKeys, alias, operands); + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java index 0123137..6cc638b 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveRewriteToDataSketchesRules.java @@ -20,29 +20,43 @@ import java.math.BigDecimal; import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.stream.Collectors; + +import org.apache.calcite.linq4j.Ord; import org.apache.calcite.plan.RelOptRule; import org.apache.calcite.plan.RelOptRuleCall; import org.apache.calcite.plan.RelOptRuleOperand; import org.apache.calcite.rel.RelCollation; +import org.apache.calcite.rel.RelFieldCollation.NullDirection; import org.apache.calcite.rel.RelNode; import org.apache.calcite.rel.core.Aggregate; import org.apache.calcite.rel.core.AggregateCall; +import org.apache.calcite.rel.core.JoinRelType; import org.apache.calcite.rel.core.Project; import org.apache.calcite.rel.core.RelFactories.ProjectFactory; import org.apache.calcite.rel.type.RelDataType; import org.apache.calcite.rel.type.RelDataTypeFactory; import org.apache.calcite.rex.RexBuilder; +import org.apache.calcite.rex.RexFieldCollation; +import org.apache.calcite.rex.RexInputRef; import org.apache.calcite.rex.RexLiteral; import org.apache.calcite.rex.RexNode; +import org.apache.calcite.rex.RexOver; +import org.apache.calcite.rex.RexShuttle; +import org.apache.calcite.rex.RexWindow; import org.apache.calcite.sql.SqlAggFunction; import org.apache.calcite.sql.SqlKind; import org.apache.calcite.sql.SqlOperator; import org.apache.calcite.sql.fun.SqlStdOperatorTable; import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.tools.RelBuilder; +import org.apache.calcite.tools.RelBuilder.AggCall; import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions; +import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelBuilder; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRelFactories; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveAggregate; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveProject; +import org.apache.hadoop.hive.ql.optimizer.calcite.translator.SqlFunctionConverter; import org.apache.hive.plugin.api.HiveUDFPlugin.UDFDescriptor; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -56,30 +70,13 @@ import com.google.common.collect.Lists; * <br/> * Currently it can rewrite: * <ul> - * <li>{@code count(distinct(x))} to distinct counting sketches - * <pre> - * SELECT COUNT(DISTINCT id) FROM sketch_input; - * ⇒ SELECT ROUND(ds_hll_estimate(ds_hll_sketch(id))) FROM sketch_input; - * </pre> + * <li>{@code count(distinct(x))} using {@code CountDistinctRewrite} + * </li> + * <li>{@code percentile_disc(0.2) within group (order by id)} using {@code PercentileDiscRewrite} * </li> - * <li>{@code percentile_disc(0.2) within group (order by id)} - * <pre> - * SELECT PERCENTILE_DISC(0.2) WITHIN GROUP(ORDER BY ID) FROM sketch_input; - * ⇒ SELECT ds_kll_quantile(ds_kll_sketch(CAST(id AS FLOAT)), 0.2) FROM sketch_input; - * </pre> + * <li>{@code cume_dist() over (order by id)} using {@code CumeDistRewrite} * </li> * </ul> - * - * <p> - * The transformation here works on Aggregate nodes; the operations done are the following: - * </p> - * <ol> - * <li>Identify candidate aggregate calls</li> - * <li>A new Project is inserted below the Aggregate; to help with data pre-processing</li> - * <li>A new Aggregate is created in which the aggregation is done by the sketch function</li> - * <li>A new Project is inserted on top of the Aggregate; which unwraps the resulting - * count-distinct estimation from the sketch representation</li> - * </ol> */ public final class HiveRewriteToDataSketchesRules { @@ -87,6 +84,15 @@ public final class HiveRewriteToDataSketchesRules { /** * Generic support for rewriting an Aggregate into a chain of Project->Aggregate->Project. + * <p> + * The transformation here works on Aggregate nodes; the operations done are the following: + * </p> + * <ol> + * <li>Identify candidate aggregate calls</li> + * <li>A new Project is inserted below the Aggregate; to help with data pre-processing</li> + * <li>A new Aggregate is created in which the aggregation is done by the sketch function</li> + * <li>A new Project is inserted on top of the Aggregate; which unwraps the resulting estimation from the sketch representation</li> + * </ol> */ private static abstract class AggregateToProjectAggregateProject extends RelOptRule { @@ -204,14 +210,21 @@ public final class HiveRewriteToDataSketchesRules { return fn.getCalciteFunction().get(); } - abstract void rewrite(AggregateCall aggCall); - abstract boolean isApplicable(AggregateCall aggCall); - } + abstract void rewrite(AggregateCall aggCall); + } }; + /** + * Rewrites {@code count(distinct(x))} to distinct counting sketches. + * + * <pre> + * SELECT COUNT(DISTINCT id) FROM sketch_input; + * ⇒ SELECT ROUND(ds_hll_estimate(ds_hll_sketch(id))) FROM sketch_input; + * </pre> + */ public static class CountDistinctRewrite extends AggregateToProjectAggregateProject { private final String sketchType; @@ -256,7 +269,7 @@ public final class HiveRewriteToDataSketchesRules { SqlAggFunction aggFunction = (SqlAggFunction) getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH); boolean distinct = false; - boolean approximate = true; + boolean approximate = false; boolean ignoreNulls = true; List<Integer> argList = Lists.newArrayList(newProjectsBelow.size() - 1); int filterArg = aggCall.filterArg; @@ -279,6 +292,14 @@ public final class HiveRewriteToDataSketchesRules { } } + /** + * Rewrites {@code percentile_disc(0.2) within group (order by id)}. + * + * <pre> + * SELECT PERCENTILE_DISC(0.2) WITHIN GROUP(ORDER BY ID) FROM sketch_input; + * ⇒ SELECT ds_kll_quantile(ds_kll_sketch(CAST(id AS FLOAT)), 0.2) FROM sketch_input; + * </pre> + */ public static class PercentileDiscRewrite extends AggregateToProjectAggregateProject { private final String sketchType; @@ -343,7 +364,7 @@ public final class HiveRewriteToDataSketchesRules { SqlAggFunction aggFunction = (SqlAggFunction) getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH); boolean distinct = false; - boolean approximate = true; + boolean approximate = false; boolean ignoreNulls = true; List<Integer> argList = Lists.newArrayList(newProjectsBelow.size() - 1); int filterArg = aggCall.filterArg; @@ -368,4 +389,221 @@ public final class HiveRewriteToDataSketchesRules { } } } + + /** + * Generic support for rewriting Windowing expression into a different form usually using joins. + */ + private static abstract class WindowingToProjectAggregateJoinProject extends RelOptRule { + + protected final String sketchType; + + public WindowingToProjectAggregateJoinProject(String sketchType) { + super(operand(HiveProject.class, any()), HiveRelFactories.HIVE_BUILDER, null); + this.sketchType = sketchType; + } + + @Override + public void onMatch(RelOptRuleCall call) { + final Project project = call.rel(0); + + VbuilderPAP vb = buildProcessor(call); + RelNode newProject = vb.processProject(project); + + if (newProject == project) { + return; + } else { + call.transformTo(newProject); + } + } + + protected abstract VbuilderPAP buildProcessor(RelOptRuleCall call); + + protected static abstract class VbuilderPAP { + private final String sketchClass; + protected final RelBuilder relBuilder; + protected final RexBuilder rexBuilder; + + protected VbuilderPAP(String sketchClass, RelBuilder relBuilder) { + this.sketchClass = sketchClass; + this.relBuilder = relBuilder; + rexBuilder = relBuilder.getRexBuilder(); + } + + final class ProcessShuttle extends RexShuttle { + public RexNode visitOver(RexOver over) { + return processCall(over); + } + }; + + protected final RelNode processProject(Project project) { + RelNode origInput = project.getInput(); + relBuilder.push(origInput); + RexShuttle shuttle = new ProcessShuttle(); + List<RexNode> newProjects = new ArrayList<RexNode>(); + for (RexNode expr : project.getChildExps()) { + newProjects.add(expr.accept(shuttle)); + } + if (relBuilder.peek() == origInput) { + relBuilder.clear(); + return project; + } + relBuilder.project(newProjects); + return relBuilder.build(); + } + + private final RexNode processCall(RexNode expr) { + if (expr instanceof RexOver) { + RexOver over = (RexOver) expr; + if (isApplicable(over)) { + return rewrite(over); + } + } + return expr; + } + + protected final SqlOperator getSqlOperator(String fnName) { + UDFDescriptor fn = DataSketchesFunctions.INSTANCE.getSketchFunction(sketchClass, fnName); + if (!fn.getCalciteFunction().isPresent()) { + throw new RuntimeException(fn.toString() + " doesn't have a Calcite function associated with it"); + } + return fn.getCalciteFunction().get(); + } + + /** + * Do the rewrite for the given expression. + * + * When this method is invoked the {@link #relBuilder} will only contain the current input. + * Expectation is to leave the new input there after the method finishes. + */ + abstract RexNode rewrite(RexOver expr); + + abstract boolean isApplicable(RexOver expr); + + } + } + + /** + * Rewrites {@code cume_dist() over (order by id)}. + * + * <pre> + * SELECT id, CUME_DIST() OVER (ORDER BY id) FROM sketch_input; + * ⇒ SELECT id, 1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT) )[0] + * FROM sketch_input JOIN ( + * SELECT ds_kll_sketch(CAST(-id AS FLOAT)) AS ds FROM sketch_input + * ) q; + * </pre> + */ + public static class CumeDistRewrite extends WindowingToProjectAggregateJoinProject { + + public CumeDistRewrite(String sketchType) { + super(sketchType); + } + + @Override + protected VbuilderPAP buildProcessor(RelOptRuleCall call) { + return new VB(sketchType, call.builder()); + } + + private static class VB extends VbuilderPAP { + + protected VB(String sketchClass, RelBuilder relBuilder) { + super(sketchClass, relBuilder); + } + + @Override + boolean isApplicable(RexOver over) { + SqlAggFunction aggOp = over.getAggOperator(); + RexWindow window = over.getWindow(); + if (aggOp.getName().equalsIgnoreCase("cume_dist") && window.orderKeys.size() == 1 + && window.getLowerBound().isUnbounded() && window.getUpperBound().isUnbounded()) { + return true; + } + return false; + } + + @Override + RexNode rewrite(RexOver over) { + RexWindow w = over.getWindow(); + RexFieldCollation orderKey = w.orderKeys.get(0); + // we don't really support nulls in aggregate/etc...they are actually ignored + // so some hack will be needed for NULLs anyway.. + ImmutableList<RexNode> partitionKeys = w.partitionKeys; + + relBuilder.push(relBuilder.peek()); + // the CDF function utilizes the '<' operator; + // negating the input will mirror the values on the x axis + // by using 1-CDF(-x) we could get a <= operator + RexNode key = orderKey.getKey(); + key = rexBuilder.makeCall(SqlStdOperatorTable.UNARY_MINUS, key); + key = rexBuilder.makeCast(getFloatType(), key); + + AggCall aggCall = ((HiveRelBuilder) relBuilder).aggregateCall( + (SqlAggFunction) getSqlOperator(DataSketchesFunctions.DATA_TO_SKETCH), + /* distinct */ false, + /* approximate */ false, + /* ignoreNulls */ true, + null, + ImmutableList.of(), + null, + ImmutableList.of(key)); + + relBuilder.aggregate(relBuilder.groupKey(partitionKeys), aggCall); + + List<RexNode> joinConditions; + joinConditions = Ord.zip(partitionKeys).stream().map(o -> { + RexNode f = relBuilder.field(2, 1, o.i); + return rexBuilder.makeCall(SqlStdOperatorTable.IS_NOT_DISTINCT_FROM, o.e, f); + }).collect(Collectors.toList()); + relBuilder.join(JoinRelType.INNER, joinConditions); + + int sketchFieldIndex = relBuilder.peek().getRowType().getFieldCount() - 1; + RexInputRef sketchInputRef = relBuilder.field(sketchFieldIndex); + SqlOperator projectOperator = getSqlOperator(DataSketchesFunctions.GET_CDF); + + // NULLs will be replaced by this value - to be before / after the other values + // note: the sketch will ignore NULLs entirely but they will be placed at 0.0 or 1.0 + final RexNode nullReplacement = + relBuilder.literal(orderKey.getNullDirection() == NullDirection.FIRST ? Float.MAX_VALUE : -Float.MAX_VALUE); + + // long story short: CAST(1.0f-CDF(CAST(COALESCE(-X, nullReplacement) AS FLOAT))[0] AS targetType) + RexNode projRex = key; + projRex = rexBuilder.makeCall(SqlStdOperatorTable.COALESCE, key, nullReplacement); + projRex = rexBuilder.makeCast(getFloatType(), projRex); + projRex = rexBuilder.makeCall(projectOperator, ImmutableList.of(sketchInputRef, projRex)); + projRex = makeItemCall(projRex, relBuilder.literal(0)); + projRex = rexBuilder.makeCall(SqlStdOperatorTable.MINUS, relBuilder.literal(1.0f), projRex); + projRex = rexBuilder.makeCast(over.getType(), projRex); + + return projRex; + } + + private RexNode makeItemCall(RexNode arr, RexNode offset) { + if(getClass().desiredAssertionStatus()) { + try { + SqlKind.class.getField("ITEM"); + throw new RuntimeException("bind SqlKind.ITEM instead of this workaround - C1.23 a02155a70a"); + } catch(NoSuchFieldException e) { + // ignore + } + } + + try { + SqlOperator indexFn = SqlFunctionConverter.getCalciteFn("index", + ImmutableList.of(arr.getType(),offset.getType()), + arr.getType().getComponentType(), true, false); + RexNode call = rexBuilder.makeCall(indexFn, arr, offset); + return call; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private RelDataType getFloatType() { + RelDataTypeFactory typeFactory = rexBuilder.getTypeFactory(); + RelDataType notNullFloatType = typeFactory.createSqlType(SqlTypeName.FLOAT); + RelDataType floatType = typeFactory.createTypeWithNullability(notNullFloatType, true); + return floatType; + } + } + } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java index 9819f4a..f4608c5 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/translator/SqlFunctionConverter.java @@ -392,10 +392,11 @@ public class SqlFunctionConverter { registerFunction("istrue", SqlStdOperatorTable.IS_TRUE, hToken(HiveParser.Identifier, "istrue")); registerFunction("isnotfalse", SqlStdOperatorTable.IS_NOT_FALSE, hToken(HiveParser.Identifier, "isnotfalse")); registerFunction("isfalse", SqlStdOperatorTable.IS_FALSE, hToken(HiveParser.Identifier, "isfalse")); - registerFunction("is not distinct from", SqlStdOperatorTable.IS_NOT_DISTINCT_FROM, hToken(HiveParser.EQUAL_NS, "<=>")); + registerFunction("<=>", SqlStdOperatorTable.IS_NOT_DISTINCT_FROM, hToken(HiveParser.EQUAL_NS, "<=>")); registerFunction("when", SqlStdOperatorTable.CASE, hToken(HiveParser.Identifier, "when")); registerDuplicateFunction("case", SqlStdOperatorTable.CASE, hToken(HiveParser.Identifier, "when")); registerFunction("coalesce", SqlStdOperatorTable.COALESCE, hToken(HiveParser.Identifier, "coalesce")); + // timebased registerFunction("year", HiveExtractDate.YEAR, hToken(HiveParser.Identifier, "year")); diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java index 377e828..2396641 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/CalcitePlanner.java @@ -1974,13 +1974,18 @@ public class CalcitePlanner extends SemanticAnalyzer { if (!isMaterializedViewMaintenance() && conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_ENABLED)) { // Rewrite to datasketches if enabled if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNTDISTINCT_ENABLED)) { - String countDistinctSketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH); - RelOptRule rule = new HiveRewriteToDataSketchesRules.CountDistinctRewrite(countDistinctSketchType); + String sketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_COUNT_DISTINCT_SKETCH); + RelOptRule rule = new HiveRewriteToDataSketchesRules.CountDistinctRewrite(sketchType); generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule); } if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_ENABLED)) { - String percentileDiscSketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH); - RelOptRule rule = new HiveRewriteToDataSketchesRules.PercentileDiscRewrite(percentileDiscSketchType); + String sketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_PERCENTILE_DISC_SKETCH); + RelOptRule rule = new HiveRewriteToDataSketchesRules.PercentileDiscRewrite(sketchType); + generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule); + } + if (conf.getBoolVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_ENABLED)) { + String sketchType = conf.getVar(ConfVars.HIVE_OPTIMIZE_BI_REWRITE_CUME_DIST_SKETCH); + RelOptRule rule = new HiveRewriteToDataSketchesRules.CumeDistRewrite(sketchType); generatePartialProgram(program, true, HepMatchOrder.TOP_DOWN, rule); } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java b/ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java index 07813b9..c91eb31 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/parse/type/HiveFunctionHelper.java @@ -41,7 +41,6 @@ import org.apache.hadoop.hive.ql.exec.FunctionInfo; import org.apache.hadoop.hive.ql.exec.FunctionRegistry; import org.apache.hadoop.hive.ql.metadata.Hive; import org.apache.hadoop.hive.ql.metadata.HiveException; -import org.apache.hadoop.hive.ql.optimizer.calcite.HiveCalciteUtil; import org.apache.hadoop.hive.ql.optimizer.calcite.HiveRexExecutorImpl; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveExtractDate; import org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveFloorDate; @@ -389,7 +388,7 @@ public class HiveFunctionHelper implements FunctionHelper { if (FunctionRegistry.isRankingFunction(aggregateName)) { // Rank functions type is 'int'/'double' - if (aggregateName.equalsIgnoreCase("percent_rank")) { + if (aggregateName.equalsIgnoreCase("percent_rank") || aggregateName.equalsIgnoreCase("cume_dist")) { returnType = TypeInfoFactory.doubleTypeInfo; } else { returnType = TypeInfoFactory.intTypeInfo; diff --git a/ql/src/test/org/apache/hadoop/hive/ql/exec/TestDataSketchesFunctions.java b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestDataSketchesFunctions.java new file mode 100644 index 0000000..d446105 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/exec/TestDataSketchesFunctions.java @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import static org.junit.Assert.assertTrue; + +import java.util.Optional; + +import org.apache.calcite.rel.type.RelDataType; +import org.apache.hadoop.hive.ql.exec.DataSketchesFunctions.SketchFunctionDescriptor; +import org.junit.Test; + +public final class TestDataSketchesFunctions { + + @Test + public void testKllGetCdfReturnType() { + SketchFunctionDescriptor cf = + DataSketchesFunctions.INSTANCE.getSketchFunction("kll", DataSketchesFunctions.GET_CDF); + Optional<RelDataType> retType = cf.getReturnRelDataType(); + assertTrue(retType.get().getComponentType() != null); + } +} diff --git a/ql/src/test/queries/clientpositive/sketches_materialized_view_cume_dist.q b/ql/src/test/queries/clientpositive/sketches_materialized_view_cume_dist.q new file mode 100644 index 0000000..e8e1795 --- /dev/null +++ b/ql/src/test/queries/clientpositive/sketches_materialized_view_cume_dist.q @@ -0,0 +1,54 @@ +--! qt:transactional +set hive.fetch.task.conversion=none; + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +-- create an mv for the intermediate results +create materialized view mv_1 as + select category,ds_kll_sketch(cast(-id as float)) from sketch_input group by category; + +-- bi mode on +set hive.optimize.bi.enabled=true; + +explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +set hive.optimize.bi.enabled=false; + +explain +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +set hive.optimize.bi.enabled=true; + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +explain +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +explain +alter materialized view mv_1 rebuild; +alter materialized view mv_1 rebuild; + +explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id; + +-- rewrite+mv matching with rollup +explain +select 'rewrite; mv matching', category, id, cume_dist() over (partition by category order by id) from sketch_input order by category,id; +select 'rewrite; mv matching', category, id, cume_dist() over (partition by category order by id) from sketch_input order by category,id; + +drop materialized view mv_1; diff --git a/ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist.q b/ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist.q new file mode 100644 index 0000000..c7b0837 --- /dev/null +++ b/ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist.q @@ -0,0 +1,47 @@ +--! qt:transactional + + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +; + +select id,cume_dist() over (order by id) from sketch_input; + +select id,cume_dist() over (order by id),1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT) )[0] +from sketch_input +join ( select ds_kll_sketch(cast(-id as float)) as ds from sketch_input ) q +order by id; + +set hive.optimize.bi.enabled=true; + +-- see if rewrite happens +explain +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id; + +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id; + +-- see if rewrite happens in nested expressions +explain +select id,'rewrite',count(id) over ()*cume_dist() over (order by id) from sketch_input order by id; + +select id,'rewrite',count(id) over ()*cume_dist() over (order by id) from sketch_input order by id; + + +insert into sketch_input values (null,'a'),(null,'b'); + +explain +select id,'rewrite',cume_dist() over (order by id nulls first) from sketch_input order by id nulls first; + +select id,'rewrite',cume_dist() over (order by id nulls first) from sketch_input order by id nulls first; + +explain +select id,'rewrite',cume_dist() over (order by id nulls last) from sketch_input order by id nulls last; + +select id,'rewrite',cume_dist() over (order by id nulls last) from sketch_input order by id nulls last; + +select id,cume_dist() over (order by id) from sketch_input order by id; diff --git a/ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist_partition_by.q b/ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist_partition_by.q new file mode 100644 index 0000000..7931a59 --- /dev/null +++ b/ql/src/test/queries/clientpositive/sketches_rewrite_cume_dist_partition_by.q @@ -0,0 +1,27 @@ +--! qt:transactional + +create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true'); + +insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'), + (1,null),(2,null),(10,null),(13,null); +; + +select id,category,cume_dist() over (partition by category order by id) from sketch_input order by category,id; + +select id,category,cume_dist() over (partition by category order by id),1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT))[0] +from sketch_input +join ( select category as c,ds_kll_sketch(cast(-id as float)) as ds from sketch_input group by category) q on (q.c=category) +order by category,id; + +set hive.optimize.bi.enabled=true; + +-- see if rewrite happens +explain +select id,'rewrite',cume_dist() over (partition by category order by id) from sketch_input order by category,id; + +select id,'rewrite',cume_dist() over (partition by category order by id) from sketch_input order by category,id; + diff --git a/ql/src/test/results/clientpositive/llap/cbo_rp_windowing_2.q.out b/ql/src/test/results/clientpositive/llap/cbo_rp_windowing_2.q.out index 2ff927c..6ec2d60 100644 --- a/ql/src/test/results/clientpositive/llap/cbo_rp_windowing_2.q.out +++ b/ql/src/test/results/clientpositive/llap/cbo_rp_windowing_2.q.out @@ -625,32 +625,32 @@ window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 precedi POSTHOOK: type: QUERY POSTHOOK: Input: default@part #### A masked pattern was here #### -Manufacturer#1 almond antique burnished rose metallic 2 1 1 0 0.0 1 2 2.0 0.0 2 2 2 -Manufacturer#1 almond antique burnished rose metallic 2 1 1 0 0.0 1 2 2.0 0.0 2 2 2 -Manufacturer#1 almond antique chartreuse lavender yellow 34 3 2 0 0.4 2 3 12.666666666666666 15.084944665313014 2 34 2 -Manufacturer#1 almond antique salmon chartreuse burlywood 6 4 3 0 0.6 2 4 11.0 13.379088160259652 2 6 2 -Manufacturer#1 almond aquamarine burnished black steel 28 5 4 0 0.8 3 5 14.4 13.763720427268202 2 28 34 -Manufacturer#1 almond aquamarine pink moccasin thistle 42 6 5 1 1.0 3 6 19.0 16.237815945091466 2 42 6 -Manufacturer#2 almond antique violet chocolate turquoise 14 1 1 0 0.0 1 1 14.0 0.0 4 14 14 -Manufacturer#2 almond antique violet turquoise frosted 40 2 2 0 0.25 1 2 27.0 13.0 4 40 14 -Manufacturer#2 almond aquamarine midnight light salmon 2 3 3 0 0.5 2 3 18.666666666666668 15.86050300449376 4 2 14 -Manufacturer#2 almond aquamarine rose maroon antique 25 4 4 0 0.75 2 4 20.25 14.00669482783144 4 25 40 -Manufacturer#2 almond aquamarine sandy cyan gainsboro 18 5 5 1 1.0 3 5 19.8 12.560254774486067 4 18 2 -Manufacturer#3 almond antique chartreuse khaki white 17 1 1 0 0.0 1 1 17.0 0.0 2 17 17 -Manufacturer#3 almond antique forest lavender goldenrod 14 2 2 0 0.25 1 2 15.5 1.5 2 14 17 -Manufacturer#3 almond antique metallic orange dim 19 3 3 0 0.5 2 3 16.666666666666668 2.0548046676563256 2 19 17 -Manufacturer#3 almond antique misty red olive 1 4 4 0 0.75 2 4 12.75 7.013380069552769 2 1 14 -Manufacturer#3 almond antique olive coral navajo 45 5 5 1 1.0 3 5 19.2 14.344336861632886 2 45 19 -Manufacturer#4 almond antique gainsboro frosted violet 10 1 1 0 0.0 1 1 10.0 0.0 0 10 10 -Manufacturer#4 almond antique violet mint lemon 39 2 2 0 0.25 1 2 24.5 14.5 0 39 10 -Manufacturer#4 almond aquamarine floral ivory bisque 27 3 3 0 0.5 2 3 25.333333333333332 11.897712198383164 0 27 10 -Manufacturer#4 almond aquamarine yellow dodger mint 7 4 4 0 0.75 2 4 20.75 13.007209539328564 0 7 39 -Manufacturer#4 almond azure aquamarine papaya violet 12 5 5 1 1.0 3 5 19.0 12.149074038789951 0 12 27 -Manufacturer#5 almond antique blue firebrick mint 31 1 1 0 0.0 1 1 31.0 0.0 1 31 31 -Manufacturer#5 almond antique medium spring khaki 6 2 2 0 0.25 1 2 18.5 12.5 1 6 31 -Manufacturer#5 almond antique sky peru orange 2 3 3 0 0.5 2 3 13.0 12.832251036613439 1 2 31 -Manufacturer#5 almond aquamarine dodger light gainsboro 46 4 4 0 0.75 2 4 21.25 18.102140757380052 1 46 6 -Manufacturer#5 almond azure blanched chiffon midnight 23 5 5 1 1.0 3 5 21.6 16.206171663906314 1 23 2 +Manufacturer#1 almond antique burnished rose metallic 2 1 1 0.3333333333333333 0.0 1 2 2.0 0.0 2 2 2 +Manufacturer#1 almond antique burnished rose metallic 2 1 1 0.3333333333333333 0.0 1 2 2.0 0.0 2 2 2 +Manufacturer#1 almond antique chartreuse lavender yellow 34 3 2 0.5 0.4 2 3 12.666666666666666 15.084944665313014 2 34 2 +Manufacturer#1 almond antique salmon chartreuse burlywood 6 4 3 0.6666666666666666 0.6 2 4 11.0 13.379088160259652 2 6 2 +Manufacturer#1 almond aquamarine burnished black steel 28 5 4 0.8333333333333334 0.8 3 5 14.4 13.763720427268202 2 28 34 +Manufacturer#1 almond aquamarine pink moccasin thistle 42 6 5 1.0 1.0 3 6 19.0 16.237815945091466 2 42 6 +Manufacturer#2 almond antique violet chocolate turquoise 14 1 1 0.2 0.0 1 1 14.0 0.0 4 14 14 +Manufacturer#2 almond antique violet turquoise frosted 40 2 2 0.4 0.25 1 2 27.0 13.0 4 40 14 +Manufacturer#2 almond aquamarine midnight light salmon 2 3 3 0.6 0.5 2 3 18.666666666666668 15.86050300449376 4 2 14 +Manufacturer#2 almond aquamarine rose maroon antique 25 4 4 0.8 0.75 2 4 20.25 14.00669482783144 4 25 40 +Manufacturer#2 almond aquamarine sandy cyan gainsboro 18 5 5 1.0 1.0 3 5 19.8 12.560254774486067 4 18 2 +Manufacturer#3 almond antique chartreuse khaki white 17 1 1 0.2 0.0 1 1 17.0 0.0 2 17 17 +Manufacturer#3 almond antique forest lavender goldenrod 14 2 2 0.4 0.25 1 2 15.5 1.5 2 14 17 +Manufacturer#3 almond antique metallic orange dim 19 3 3 0.6 0.5 2 3 16.666666666666668 2.0548046676563256 2 19 17 +Manufacturer#3 almond antique misty red olive 1 4 4 0.8 0.75 2 4 12.75 7.013380069552769 2 1 14 +Manufacturer#3 almond antique olive coral navajo 45 5 5 1.0 1.0 3 5 19.2 14.344336861632886 2 45 19 +Manufacturer#4 almond antique gainsboro frosted violet 10 1 1 0.2 0.0 1 1 10.0 0.0 0 10 10 +Manufacturer#4 almond antique violet mint lemon 39 2 2 0.4 0.25 1 2 24.5 14.5 0 39 10 +Manufacturer#4 almond aquamarine floral ivory bisque 27 3 3 0.6 0.5 2 3 25.333333333333332 11.897712198383164 0 27 10 +Manufacturer#4 almond aquamarine yellow dodger mint 7 4 4 0.8 0.75 2 4 20.75 13.007209539328564 0 7 39 +Manufacturer#4 almond azure aquamarine papaya violet 12 5 5 1.0 1.0 3 5 19.0 12.149074038789951 0 12 27 +Manufacturer#5 almond antique blue firebrick mint 31 1 1 0.2 0.0 1 1 31.0 0.0 1 31 31 +Manufacturer#5 almond antique medium spring khaki 6 2 2 0.4 0.25 1 2 18.5 12.5 1 6 31 +Manufacturer#5 almond antique sky peru orange 2 3 3 0.6 0.5 2 3 13.0 12.832251036613439 1 2 31 +Manufacturer#5 almond aquamarine dodger light gainsboro 46 4 4 0.8 0.75 2 4 21.25 18.102140757380052 1 46 6 +Manufacturer#5 almond azure blanched chiffon midnight 23 5 5 1.0 1.0 3 5 21.6 16.206171663906314 1 23 2 PREHOOK: query: select p_mfgr,p_name, p_size, rank() over(distribute by p_mfgr sort by p_name) as r, dense_rank() over(distribute by p_mfgr sort by p_name) as dr, @@ -675,32 +675,32 @@ window w1 as (distribute by p_mfgr sort by p_mfgr, p_name rows between 2 precedi POSTHOOK: type: QUERY POSTHOOK: Input: default@part #### A masked pattern was here #### -Manufacturer#1 almond antique burnished rose metallic 2 1 1 0 4 4 2 -Manufacturer#1 almond antique burnished rose metallic 2 1 1 0 4 4 2 -Manufacturer#1 almond antique chartreuse lavender yellow 34 3 2 0 38 34 2 -Manufacturer#1 almond antique salmon chartreuse burlywood 6 4 3 0 44 10 2 -Manufacturer#1 almond aquamarine burnished black steel 28 5 4 0 72 28 34 -Manufacturer#1 almond aquamarine pink moccasin thistle 42 6 5 1 114 42 6 -Manufacturer#2 almond antique violet chocolate turquoise 14 1 1 0 14 14 14 -Manufacturer#2 almond antique violet turquoise frosted 40 2 2 0 54 40 14 -Manufacturer#2 almond aquamarine midnight light salmon 2 3 3 0 56 2 14 -Manufacturer#2 almond aquamarine rose maroon antique 25 4 4 0 81 25 40 -Manufacturer#2 almond aquamarine sandy cyan gainsboro 18 5 5 1 99 32 2 -Manufacturer#3 almond antique chartreuse khaki white 17 1 1 0 17 31 17 -Manufacturer#3 almond antique forest lavender goldenrod 14 2 2 0 31 14 17 -Manufacturer#3 almond antique metallic orange dim 19 3 3 0 50 50 17 -Manufacturer#3 almond antique misty red olive 1 4 4 0 51 1 14 -Manufacturer#3 almond antique olive coral navajo 45 5 5 1 96 45 19 -Manufacturer#4 almond antique gainsboro frosted violet 10 1 1 0 10 17 10 -Manufacturer#4 almond antique violet mint lemon 39 2 2 0 49 39 10 -Manufacturer#4 almond aquamarine floral ivory bisque 27 3 3 0 76 27 10 -Manufacturer#4 almond aquamarine yellow dodger mint 7 4 4 0 83 7 39 -Manufacturer#4 almond azure aquamarine papaya violet 12 5 5 1 95 29 27 -Manufacturer#5 almond antique blue firebrick mint 31 1 1 0 31 31 31 -Manufacturer#5 almond antique medium spring khaki 6 2 2 0 37 8 31 -Manufacturer#5 almond antique sky peru orange 2 3 3 0 39 2 31 -Manufacturer#5 almond aquamarine dodger light gainsboro 46 4 4 0 85 46 6 -Manufacturer#5 almond azure blanched chiffon midnight 23 5 5 1 108 23 2 +Manufacturer#1 almond antique burnished rose metallic 2 1 1 0.3333333333333333 4 4 2 +Manufacturer#1 almond antique burnished rose metallic 2 1 1 0.3333333333333333 4 4 2 +Manufacturer#1 almond antique chartreuse lavender yellow 34 3 2 0.5 38 34 2 +Manufacturer#1 almond antique salmon chartreuse burlywood 6 4 3 0.6666666666666666 44 10 2 +Manufacturer#1 almond aquamarine burnished black steel 28 5 4 0.8333333333333334 72 28 34 +Manufacturer#1 almond aquamarine pink moccasin thistle 42 6 5 1.0 114 42 6 +Manufacturer#2 almond antique violet chocolate turquoise 14 1 1 0.2 14 14 14 +Manufacturer#2 almond antique violet turquoise frosted 40 2 2 0.4 54 40 14 +Manufacturer#2 almond aquamarine midnight light salmon 2 3 3 0.6 56 2 14 +Manufacturer#2 almond aquamarine rose maroon antique 25 4 4 0.8 81 25 40 +Manufacturer#2 almond aquamarine sandy cyan gainsboro 18 5 5 1.0 99 32 2 +Manufacturer#3 almond antique chartreuse khaki white 17 1 1 0.2 17 31 17 +Manufacturer#3 almond antique forest lavender goldenrod 14 2 2 0.4 31 14 17 +Manufacturer#3 almond antique metallic orange dim 19 3 3 0.6 50 50 17 +Manufacturer#3 almond antique misty red olive 1 4 4 0.8 51 1 14 +Manufacturer#3 almond antique olive coral navajo 45 5 5 1.0 96 45 19 +Manufacturer#4 almond antique gainsboro frosted violet 10 1 1 0.2 10 17 10 +Manufacturer#4 almond antique violet mint lemon 39 2 2 0.4 49 39 10 +Manufacturer#4 almond aquamarine floral ivory bisque 27 3 3 0.6 76 27 10 +Manufacturer#4 almond aquamarine yellow dodger mint 7 4 4 0.8 83 7 39 +Manufacturer#4 almond azure aquamarine papaya violet 12 5 5 1.0 95 29 27 +Manufacturer#5 almond antique blue firebrick mint 31 1 1 0.2 31 31 31 +Manufacturer#5 almond antique medium spring khaki 6 2 2 0.4 37 8 31 +Manufacturer#5 almond antique sky peru orange 2 3 3 0.6 39 2 31 +Manufacturer#5 almond aquamarine dodger light gainsboro 46 4 4 0.8 85 46 6 +Manufacturer#5 almond azure blanched chiffon midnight 23 5 5 1.0 108 23 2 PREHOOK: query: select p_mfgr,p_name, p_size, count(*) over(distribute by p_mfgr sort by p_name ) as c, count(p_size) over(distribute by p_mfgr sort by p_name) as ca, diff --git a/ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out new file mode 100644 index 0000000..ae5c3a6 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/sketches_materialized_view_cume_dist.q.out @@ -0,0 +1,1054 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: create materialized view mv_1 as + select category,ds_kll_sketch(cast(-id as float)) from sketch_input group by category +PREHOOK: type: CREATE_MATERIALIZED_VIEW +PREHOOK: Input: default@sketch_input +PREHOOK: Output: database:default +PREHOOK: Output: default@mv_1 +POSTHOOK: query: create materialized view mv_1 as + select category,ds_kll_sketch(cast(-id as float)) from sketch_input group by category +POSTHOOK: type: CREATE_MATERIALIZED_VIEW +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: database:default +POSTHOOK: Output: default@mv_1 +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 5 <- Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Map 4 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 240 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _c1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 240 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col0) + keys: true (type: boolean) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 3344 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (1.0D - ds_kll_cdf(_col2, _col1)[0]) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'rewrite; mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2552 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 2552 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_union(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; mv matching 1 0.09090909090909094 +rewrite; mv matching 1 0.09090909090909094 +rewrite; mv matching 2 0.13636363636363635 +rewrite; mv matching 3 0.18181818181818177 +rewrite; mv matching 4 0.2272727272727273 +rewrite; mv matching 5 0.2727272727272727 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 7 0.5 +rewrite; mv matching 7 0.5 +rewrite; mv matching 8 0.5909090909090908 +rewrite; mv matching 8 0.5909090909090908 +rewrite; mv matching 9 0.6818181818181819 +rewrite; mv matching 9 0.6818181818181819 +rewrite; mv matching 10 0.7727272727272727 +rewrite; mv matching 10 0.7727272727272727 +rewrite; mv matching 11 0.8181818181818181 +rewrite; mv matching 12 0.8636363636363636 +rewrite; mv matching 13 0.9090909090909091 +rewrite; mv matching 14 0.9545454545454546 +rewrite; mv matching 15 1.0 +PREHOOK: query: explain +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: 0 (type: int), id (type: int) + null sort order: az + sort order: ++ + Map-reduce partition columns: 0 (type: int) + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 22 Data size: 5984 Basic stats: COMPLETE Column stats: COMPLETE + PTF Operator + Function definitions: + Input definition + input alias: ptf_0 + output shape: _col0: int + type: WINDOWING + Windowing table definition + input alias: ptf_1 + name: windowingtablefunction + order by: _col0 ASC NULLS LAST + partition by: 0 + raw input shape: + window functions: + window function definition + alias: cume_dist_window_0 + arguments: _col0 + name: cume_dist + window function: GenericUDAFCumeDistEvaluator + window frame: ROWS PRECEDING(MAX)~FOLLOWING(MAX) + isPivotResult: true + Statistics: Num rows: 22 Data size: 5984 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), cume_dist_window_0 (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'no rewrite; no mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2684 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 2684 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'no rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +no rewrite; no mv matching 1 0.09090909090909091 +no rewrite; no mv matching 1 0.09090909090909091 +no rewrite; no mv matching 2 0.13636363636363635 +no rewrite; no mv matching 3 0.18181818181818182 +no rewrite; no mv matching 4 0.22727272727272727 +no rewrite; no mv matching 5 0.2727272727272727 +no rewrite; no mv matching 6 0.4090909090909091 +no rewrite; no mv matching 6 0.4090909090909091 +no rewrite; no mv matching 6 0.4090909090909091 +no rewrite; no mv matching 7 0.5 +no rewrite; no mv matching 7 0.5 +no rewrite; no mv matching 8 0.5909090909090909 +no rewrite; no mv matching 8 0.5909090909090909 +no rewrite; no mv matching 9 0.6818181818181818 +no rewrite; no mv matching 9 0.6818181818181818 +no rewrite; no mv matching 10 0.7727272727272727 +no rewrite; no mv matching 10 0.7727272727272727 +no rewrite; no mv matching 11 0.8181818181818182 +no rewrite; no mv matching 12 0.8636363636363636 +no rewrite; no mv matching 13 0.9090909090909091 +no rewrite; no mv matching 14 0.9545454545454546 +no rewrite; no mv matching 15 1.0 +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 4 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Select Operator + expressions: UDFToFloat((- id)) (type: float) + outputColumnNames: _col1 + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col1) + keys: true (type: boolean) + minReductionHashAggr: 0.97727275 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 6688 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (1.0D - ds_kll_cdf(_col2, _col1)[0]) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'rewrite; no mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 5236 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 44 Data size: 5236 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; no mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; no mv matching 1 0.09090909090909094 +rewrite; no mv matching 1 0.09090909090909094 +rewrite; no mv matching 1 0.09090909090909094 +rewrite; no mv matching 1 0.09090909090909094 +rewrite; no mv matching 2 0.13636363636363635 +rewrite; no mv matching 2 0.13636363636363635 +rewrite; no mv matching 3 0.18181818181818177 +rewrite; no mv matching 3 0.18181818181818177 +rewrite; no mv matching 4 0.2272727272727273 +rewrite; no mv matching 4 0.2272727272727273 +rewrite; no mv matching 5 0.2727272727272727 +rewrite; no mv matching 5 0.2727272727272727 +rewrite; no mv matching 6 0.40909090909090906 +rewrite; no mv matching 6 0.40909090909090906 +rewrite; no mv matching 6 0.40909090909090906 +rewrite; no mv matching 6 0.40909090909090906 +rewrite; no mv matching 6 0.40909090909090906 +rewrite; no mv matching 6 0.40909090909090906 +rewrite; no mv matching 7 0.5 +rewrite; no mv matching 7 0.5 +rewrite; no mv matching 7 0.5 +rewrite; no mv matching 7 0.5 +rewrite; no mv matching 8 0.5909090909090908 +rewrite; no mv matching 8 0.5909090909090908 +rewrite; no mv matching 8 0.5909090909090908 +rewrite; no mv matching 8 0.5909090909090908 +rewrite; no mv matching 9 0.6818181818181819 +rewrite; no mv matching 9 0.6818181818181819 +rewrite; no mv matching 9 0.6818181818181819 +rewrite; no mv matching 9 0.6818181818181819 +rewrite; no mv matching 10 0.7727272727272727 +rewrite; no mv matching 10 0.7727272727272727 +rewrite; no mv matching 10 0.7727272727272727 +rewrite; no mv matching 10 0.7727272727272727 +rewrite; no mv matching 11 0.8181818181818181 +rewrite; no mv matching 11 0.8181818181818181 +rewrite; no mv matching 12 0.8636363636363636 +rewrite; no mv matching 12 0.8636363636363636 +rewrite; no mv matching 13 0.9090909090909091 +rewrite; no mv matching 13 0.9090909090909091 +rewrite; no mv matching 14 0.9545454545454546 +rewrite; no mv matching 14 0.9545454545454546 +rewrite; no mv matching 15 1.0 +rewrite; no mv matching 15 1.0 +PREHOOK: query: explain +alter materialized view mv_1 rebuild +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@mv_1 +POSTHOOK: query: explain +alter materialized view mv_1 rebuild +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@mv_1 +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-2 depends on stages: Stage-1 + Stage-0 depends on stages: Stage-2 + Stage-3 depends on stages: Stage-0 + Stage-4 depends on stages: Stage-3 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 6 <- Union 3 (CONTAINS) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Union 3 (CONTAINS) + Reducer 4 <- Union 3 (SIMPLE_EDGE) + Reducer 5 <- Reducer 4 (CUSTOM_SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + filterExpr: (ROW__ID.writeid > 1L) (type: boolean) + Statistics: Num rows: 44 Data size: 3916 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: (ROW__ID.writeid > 1L) (type: boolean) + Statistics: Num rows: 14 Data size: 1246 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: category (type: char(1)), UDFToFloat((- id)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 14 Data size: 1246 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col1) + keys: _col0 (type: char(1)) + minReductionHashAggr: 0.85714287 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Map 6 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 410 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: category (type: char(1)), _c1 (type: binary) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 410 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col1) + keys: _col0 (type: char(1)) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col1) + keys: _col0 (type: char(1)) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_union(VALUE._col0) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.mv_1 + Select Operator + expressions: _col0 (type: char(1)), _col1 (type: binary) + outputColumnNames: category, _c1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: max(length(category)), avg(COALESCE(length(category),0)), count(1), count(category), compute_bit_vector(category, 'hll'), max(length(_c1)), avg(COALESCE(length(_c1),0)), count(_c1) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 + Statistics: Num rows: 1 Data size: 328 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 328 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: struct<count:bigint,sum:double,input:int>), _col2 (type: bigint), _col3 (type: bigint), _col4 (type: binary), _col5 (type: int), _col6 (type: struct<count:bigint,sum:double,input:int>), _col7 (type: bigint) + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: max(VALUE._col0), avg(VALUE._col1), count(VALUE._col2), count(VALUE._col3), compute_bit_vector(VALUE._col4), max(VALUE._col5), avg(VALUE._col6), count(VALUE._col7) + mode: mergepartial + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7 + Statistics: Num rows: 1 Data size: 192 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: 'STRING' (type: string), UDFToLong(COALESCE(_col0,0)) (type: bigint), COALESCE(_col1,0) (type: double), (_col2 - _col3) (type: bigint), COALESCE(ndv_compute_bit_vector(_col4),0) (type: bigint), _col4 (type: binary), 'BINARY' (type: string), UDFToLong(COALESCE(_col5,0)) (type: bigint), COALESCE(_col6,0) (type: double), (_col2 - _col7) (type: bigint) + outputColumnNames: _col0, _col1, _col2, _col3, _col4, _col5, _col6, _col7, _col8, _col9 + Statistics: Num rows: 1 Data size: 380 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 380 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Union 3 + Vertex: Union 3 + + Stage: Stage-2 + Dependency Collection + + Stage: Stage-0 + Move Operator + tables: + replace: true + table: + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.mv_1 + + Stage: Stage-3 + Stats Work + Basic Stats Work: + Column Stats Desc: + Columns: category, _c1 + Column Types: char(1), binary + Table: default.mv_1 + + Stage: Stage-4 + Materialized View Update + name: default.mv_1 + update creation metadata: true + +PREHOOK: query: alter materialized view mv_1 rebuild +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@mv_1 +POSTHOOK: query: alter materialized view mv_1 rebuild +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@mv_1 +POSTHOOK: Lineage: mv_1._c1 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), (mv_1)default.mv_1.FieldSchema(name:_c1, type:binary, comment:null), ] +POSTHOOK: Lineage: mv_1.category EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:category, type:char(1), comment:null), (mv_1)default.mv_1.FieldSchema(name:category, type:char(1), comment:null), ] +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 5 <- Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 44 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 44 Data size: 352 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Map 4 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 320 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _c1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 2 Data size: 320 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_union(_col0) + keys: true (type: boolean) + minReductionHashAggr: 0.5 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 6688 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (1.0D - ds_kll_cdf(_col2, _col1)[0]) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 44 Data size: 528 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'rewrite; mv matching' (type: string), KEY.reducesinkkey0 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 5104 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 44 Data size: 5104 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_union(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; mv matching', id, cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; mv matching 1 0.09090909090909094 +rewrite; mv matching 1 0.09090909090909094 +rewrite; mv matching 1 0.09090909090909094 +rewrite; mv matching 1 0.09090909090909094 +rewrite; mv matching 2 0.13636363636363635 +rewrite; mv matching 2 0.13636363636363635 +rewrite; mv matching 3 0.18181818181818177 +rewrite; mv matching 3 0.18181818181818177 +rewrite; mv matching 4 0.2272727272727273 +rewrite; mv matching 4 0.2272727272727273 +rewrite; mv matching 5 0.2727272727272727 +rewrite; mv matching 5 0.2727272727272727 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 6 0.40909090909090906 +rewrite; mv matching 7 0.5 +rewrite; mv matching 7 0.5 +rewrite; mv matching 7 0.5 +rewrite; mv matching 7 0.5 +rewrite; mv matching 8 0.5909090909090908 +rewrite; mv matching 8 0.5909090909090908 +rewrite; mv matching 8 0.5909090909090908 +rewrite; mv matching 8 0.5909090909090908 +rewrite; mv matching 9 0.6818181818181819 +rewrite; mv matching 9 0.6818181818181819 +rewrite; mv matching 9 0.6818181818181819 +rewrite; mv matching 9 0.6818181818181819 +rewrite; mv matching 10 0.7727272727272727 +rewrite; mv matching 10 0.7727272727272727 +rewrite; mv matching 10 0.7727272727272727 +rewrite; mv matching 10 0.7727272727272727 +rewrite; mv matching 11 0.8181818181818181 +rewrite; mv matching 11 0.8181818181818181 +rewrite; mv matching 12 0.8636363636363636 +rewrite; mv matching 12 0.8636363636363636 +rewrite; mv matching 13 0.9090909090909091 +rewrite; mv matching 13 0.9090909090909091 +rewrite; mv matching 14 0.9545454545454546 +rewrite; mv matching 14 0.9545454545454546 +rewrite; mv matching 15 1.0 +rewrite; mv matching 15 1.0 +PREHOOK: query: explain +select 'rewrite; mv matching', category, id, cume_dist() over (partition by category order by id) from sketch_input order by category,id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select 'rewrite; mv matching', category, id, cume_dist() over (partition by category order by id) from sketch_input order by category,id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 44 Data size: 3916 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), category (type: char(1)), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 4092 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col1 (type: char(1)) + Statistics: Num rows: 44 Data size: 4092 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col2 (type: float) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Map 4 + Map Operator Tree: + TableScan + alias: default.mv_1 + Statistics: Num rows: 2 Data size: 490 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: category (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: category (type: char(1)) + Statistics: Num rows: 2 Data size: 490 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _c1 (type: binary) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: char(1)) + 1 category (type: char(1)) + nullSafes: [true] + outputColumnNames: _col0, _col1, _col2, _col4 + Statistics: Num rows: 44 Data size: 11132 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: char(1)), _col0 (type: int), (1.0D - ds_kll_cdf(_col4, _col2)[0]) (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 44 Data size: 4268 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)), _col1 (type: int) + null sort order: zz + sort order: ++ + Statistics: Num rows: 44 Data size: 4268 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: 'rewrite; mv matching' (type: string), KEY.reducesinkkey0 (type: char(1)), KEY.reducesinkkey1 (type: int), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2, _col3 + Statistics: Num rows: 44 Data size: 8844 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 44 Data size: 8844 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select 'rewrite; mv matching', category, id, cume_dist() over (partition by category order by id) from sketch_input order by category,id +PREHOOK: type: QUERY +PREHOOK: Input: default@mv_1 +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select 'rewrite; mv matching', category, id, cume_dist() over (partition by category order by id) from sketch_input order by category,id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@mv_1 +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +rewrite; mv matching a 1 0.18181818181818177 +rewrite; mv matching a 1 0.18181818181818177 +rewrite; mv matching a 1 0.18181818181818177 +rewrite; mv matching a 1 0.18181818181818177 +rewrite; mv matching a 2 0.2727272727272727 +rewrite; mv matching a 2 0.2727272727272727 +rewrite; mv matching a 3 0.36363636363636365 +rewrite; mv matching a 3 0.36363636363636365 +rewrite; mv matching a 4 0.4545454545454546 +rewrite; mv matching a 4 0.4545454545454546 +rewrite; mv matching a 5 0.5454545454545454 +rewrite; mv matching a 5 0.5454545454545454 +rewrite; mv matching a 6 0.6363636363636364 +rewrite; mv matching a 6 0.6363636363636364 +rewrite; mv matching a 7 0.7272727272727273 +rewrite; mv matching a 7 0.7272727272727273 +rewrite; mv matching a 8 0.8181818181818181 +rewrite; mv matching a 8 0.8181818181818181 +rewrite; mv matching a 9 0.9090909090909091 +rewrite; mv matching a 9 0.9090909090909091 +rewrite; mv matching a 10 1.0 +rewrite; mv matching a 10 1.0 +rewrite; mv matching b 6 0.18181818181818177 +rewrite; mv matching b 6 0.18181818181818177 +rewrite; mv matching b 6 0.18181818181818177 +rewrite; mv matching b 6 0.18181818181818177 +rewrite; mv matching b 7 0.2727272727272727 +rewrite; mv matching b 7 0.2727272727272727 +rewrite; mv matching b 8 0.36363636363636365 +rewrite; mv matching b 8 0.36363636363636365 +rewrite; mv matching b 9 0.4545454545454546 +rewrite; mv matching b 9 0.4545454545454546 +rewrite; mv matching b 10 0.5454545454545454 +rewrite; mv matching b 10 0.5454545454545454 +rewrite; mv matching b 11 0.6363636363636364 +rewrite; mv matching b 11 0.6363636363636364 +rewrite; mv matching b 12 0.7272727272727273 +rewrite; mv matching b 12 0.7272727272727273 +rewrite; mv matching b 13 0.8181818181818181 +rewrite; mv matching b 13 0.8181818181818181 +rewrite; mv matching b 14 0.9090909090909091 +rewrite; mv matching b 14 0.9090909090909091 +rewrite; mv matching b 15 1.0 +rewrite; mv matching b 15 1.0 +PREHOOK: query: drop materialized view mv_1 +PREHOOK: type: DROP_MATERIALIZED_VIEW +PREHOOK: Input: default@mv_1 +PREHOOK: Output: default@mv_1 +POSTHOOK: query: drop materialized view mv_1 +POSTHOOK: type: DROP_MATERIALIZED_VIEW +POSTHOOK: Input: default@mv_1 +POSTHOOK: Output: default@mv_1 diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out new file mode 100644 index 0000000..372a0b0 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist.q.out @@ -0,0 +1,775 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: select id,cume_dist() over (order by id) from sketch_input +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,cume_dist() over (order by id) from sketch_input +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 0.09090909090909091 +1 0.09090909090909091 +2 0.13636363636363635 +3 0.18181818181818182 +4 0.22727272727272727 +5 0.2727272727272727 +6 0.4090909090909091 +6 0.4090909090909091 +6 0.4090909090909091 +7 0.5 +7 0.5 +8 0.5909090909090909 +8 0.5909090909090909 +9 0.6818181818181818 +9 0.6818181818181818 +10 0.7727272727272727 +10 0.7727272727272727 +11 0.8181818181818182 +12 0.8636363636363636 +13 0.9090909090909091 +14 0.9545454545454546 +15 1.0 +Warning: Shuffle Join MERGEJOIN[20][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select id,cume_dist() over (order by id),1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT) )[0] +from sketch_input +join ( select ds_kll_sketch(cast(-id as float)) as ds from sketch_input ) q +order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,cume_dist() over (order by id),1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT) )[0] +from sketch_input +join ( select ds_kll_sketch(cast(-id as float)) as ds from sketch_input ) q +order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 0.09090909090909091 0.09090909090909094 +1 0.09090909090909091 0.09090909090909094 +2 0.13636363636363635 0.13636363636363635 +3 0.18181818181818182 0.18181818181818177 +4 0.22727272727272727 0.2272727272727273 +5 0.2727272727272727 0.2727272727272727 +6 0.4090909090909091 0.40909090909090906 +6 0.4090909090909091 0.40909090909090906 +6 0.4090909090909091 0.40909090909090906 +7 0.5 0.5 +7 0.5 0.5 +8 0.5909090909090909 0.5909090909090908 +8 0.5909090909090909 0.5909090909090908 +9 0.6818181818181818 0.6818181818181819 +9 0.6818181818181818 0.6818181818181819 +10 0.7727272727272727 0.7727272727272727 +10 0.7727272727272727 0.7727272727272727 +11 0.8181818181818182 0.8181818181818181 +12 0.8636363636363636 0.8636363636363636 +13 0.9090909090909091 0.9090909090909091 +14 0.9545454545454546 0.9545454545454546 +15 1.0 1.0 +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 4 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Select Operator + expressions: UDFToFloat((- id)) (type: float) + outputColumnNames: _col1 + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col1) + keys: true (type: boolean) + minReductionHashAggr: 0.95454544 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 3344 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (1.0D - ds_kll_cdf(_col2, _col1)[0]) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), 'rewrite' (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2266 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 2266 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,'rewrite',cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 rewrite 0.09090909090909094 +1 rewrite 0.09090909090909094 +2 rewrite 0.13636363636363635 +3 rewrite 0.18181818181818177 +4 rewrite 0.2272727272727273 +5 rewrite 0.2727272727272727 +6 rewrite 0.40909090909090906 +6 rewrite 0.40909090909090906 +6 rewrite 0.40909090909090906 +7 rewrite 0.5 +7 rewrite 0.5 +8 rewrite 0.5909090909090908 +8 rewrite 0.5909090909090908 +9 rewrite 0.6818181818181819 +9 rewrite 0.6818181818181819 +10 rewrite 0.7727272727272727 +10 rewrite 0.7727272727272727 +11 rewrite 0.8181818181818181 +12 rewrite 0.8636363636363636 +13 rewrite 0.9090909090909091 +14 rewrite 0.9545454545454546 +15 rewrite 1.0 +Warning: Shuffle Join MERGEJOIN[21][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select id,'rewrite',count(id) over ()*cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select id,'rewrite',count(id) over ()*cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 5 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Reducer 3 (SIMPLE_EDGE) + Reducer 5 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 22 Data size: 176 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Select Operator + expressions: UDFToFloat((- id)) (type: float) + outputColumnNames: _col1 + Statistics: Num rows: 22 Data size: 88 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col1) + keys: true (type: boolean) + minReductionHashAggr: 0.95454544 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 3344 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: 0 (type: int) + null sort order: a + sort order: + + Map-reduce partition columns: 0 (type: int) + Statistics: Num rows: 22 Data size: 3344 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float), _col2 (type: binary) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: VALUE._col0 (type: int), VALUE._col1 (type: float), VALUE._col2 (type: binary) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 3344 Basic stats: COMPLETE Column stats: COMPLETE + PTF Operator + Function definitions: + Input definition + input alias: ptf_0 + output shape: _col0: int, _col1: float, _col2: binary + type: WINDOWING + Windowing table definition + input alias: ptf_1 + name: windowingtablefunction + order by: 0 ASC NULLS FIRST + partition by: 0 + raw input shape: + window functions: + window function definition + alias: count_window_0 + arguments: _col0 + name: count + window function: GenericUDAFCountEvaluator + window frame: ROWS PRECEDING(MAX)~FOLLOWING(MAX) + Statistics: Num rows: 22 Data size: 3344 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (UDFToDouble(count_window_0) * (1.0D - ds_kll_cdf(_col2, _col1)[0])) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 22 Data size: 264 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 4 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), 'rewrite' (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 22 Data size: 2266 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 22 Data size: 2266 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[21][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select id,'rewrite',count(id) over ()*cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,'rewrite',count(id) over ()*cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 rewrite 2.000000000000001 +1 rewrite 2.000000000000001 +2 rewrite 3.0 +3 rewrite 3.999999999999999 +4 rewrite 5.0 +5 rewrite 6.0 +6 rewrite 9.0 +6 rewrite 9.0 +6 rewrite 9.0 +7 rewrite 11.0 +7 rewrite 11.0 +8 rewrite 12.999999999999998 +8 rewrite 12.999999999999998 +9 rewrite 15.000000000000002 +9 rewrite 15.000000000000002 +10 rewrite 17.0 +10 rewrite 17.0 +11 rewrite 18.0 +12 rewrite 19.0 +13 rewrite 20.0 +14 rewrite 21.0 +15 rewrite 22.0 +PREHOOK: query: insert into sketch_input values (null,'a'),(null,'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into sketch_input values (null,'a'),(null,'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id EXPRESSION [] +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id nulls first) from sketch_input order by id nulls first +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id nulls first) from sketch_input order by id nulls first +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 4 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 24 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(COALESCE(UDFToFloat((- id)),3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 24 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 24 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Select Operator + expressions: UDFToFloat((- id)) (type: float) + outputColumnNames: _col1 + Statistics: Num rows: 24 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col1) + keys: true (type: boolean) + minReductionHashAggr: 0.9583333 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 24 Data size: 3644 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (1.0D - ds_kll_cdf(_col2, _col1)[0]) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 24 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: a + sort order: + + Statistics: Num rows: 24 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), 'rewrite' (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 24 Data size: 2468 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 24 Data size: 2468 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select id,'rewrite',cume_dist() over (order by id nulls first) from sketch_input order by id nulls first +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,'rewrite',cume_dist() over (order by id nulls first) from sketch_input order by id nulls first +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +NULL rewrite 0.0 +NULL rewrite 0.0 +1 rewrite 0.09090909090909094 +1 rewrite 0.09090909090909094 +2 rewrite 0.13636363636363635 +3 rewrite 0.18181818181818177 +4 rewrite 0.2272727272727273 +5 rewrite 0.2727272727272727 +6 rewrite 0.40909090909090906 +6 rewrite 0.40909090909090906 +6 rewrite 0.40909090909090906 +7 rewrite 0.5 +7 rewrite 0.5 +8 rewrite 0.5909090909090908 +8 rewrite 0.5909090909090908 +9 rewrite 0.6818181818181819 +9 rewrite 0.6818181818181819 +10 rewrite 0.7727272727272727 +10 rewrite 0.7727272727272727 +11 rewrite 0.8181818181818181 +12 rewrite 0.8636363636363636 +13 rewrite 0.9090909090909091 +14 rewrite 0.9545454545454546 +15 rewrite 1.0 +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id nulls last) from sketch_input order by id nulls last +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select id,'rewrite',cume_dist() over (order by id nulls last) from sketch_input order by id nulls last +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (XPROD_EDGE), Reducer 4 (XPROD_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 4 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 24 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 24 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 24 Data size: 188 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col1 (type: float) + Select Operator + expressions: UDFToFloat((- id)) (type: float) + outputColumnNames: _col1 + Statistics: Num rows: 24 Data size: 92 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col1) + keys: true (type: boolean) + minReductionHashAggr: 0.9583333 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: boolean) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: boolean) + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 + 1 + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 24 Data size: 3644 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (1.0D - ds_kll_cdf(_col2, _col1)[0]) (type: double) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 24 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Statistics: Num rows: 24 Data size: 284 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: int), 'rewrite' (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 24 Data size: 2468 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 24 Data size: 2468 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 4 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: boolean) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 148 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col1 (type: binary) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 144 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +Warning: Shuffle Join MERGEJOIN[17][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select id,'rewrite',cume_dist() over (order by id nulls last) from sketch_input order by id nulls last +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,'rewrite',cume_dist() over (order by id nulls last) from sketch_input order by id nulls last +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 rewrite 0.09090909090909094 +1 rewrite 0.09090909090909094 +2 rewrite 0.13636363636363635 +3 rewrite 0.18181818181818177 +4 rewrite 0.2272727272727273 +5 rewrite 0.2727272727272727 +6 rewrite 0.40909090909090906 +6 rewrite 0.40909090909090906 +6 rewrite 0.40909090909090906 +7 rewrite 0.5 +7 rewrite 0.5 +8 rewrite 0.5909090909090908 +8 rewrite 0.5909090909090908 +9 rewrite 0.6818181818181819 +9 rewrite 0.6818181818181819 +10 rewrite 0.7727272727272727 +10 rewrite 0.7727272727272727 +11 rewrite 0.8181818181818181 +12 rewrite 0.8636363636363636 +13 rewrite 0.9090909090909091 +14 rewrite 0.9545454545454546 +15 rewrite 1.0 +NULL rewrite 1.0 +NULL rewrite 1.0 +Warning: Shuffle Join MERGEJOIN[16][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: select id,cume_dist() over (order by id) from sketch_input order by id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,cume_dist() over (order by id) from sketch_input order by id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 0.09090909090909094 +1 0.09090909090909094 +2 0.13636363636363635 +3 0.18181818181818177 +4 0.2272727272727273 +5 0.2727272727272727 +6 0.40909090909090906 +6 0.40909090909090906 +6 0.40909090909090906 +7 0.5 +7 0.5 +8 0.5909090909090908 +8 0.5909090909090908 +9 0.6818181818181819 +9 0.6818181818181819 +10 0.7727272727272727 +10 0.7727272727272727 +11 0.8181818181818181 +12 0.8636363636363636 +13 0.9090909090909091 +14 0.9545454545454546 +15 1.0 +NULL 1.0 +NULL 1.0 diff --git a/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist_partition_by.q.out b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist_partition_by.q.out new file mode 100644 index 0000000..19f9dbe --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/sketches_rewrite_cume_dist_partition_by.q.out @@ -0,0 +1,258 @@ +PREHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create table sketch_input (id int, category char(1)) +STORED AS ORC +TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'), + (1,null),(2,null),(10,null),(13,null) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1,'a'),(1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6,'b'),(6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'), + (1,null),(2,null),(10,null),(13,null) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: select id,category,cume_dist() over (partition by category order by id) from sketch_input order by category,id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,category,cume_dist() over (partition by category order by id) from sketch_input order by category,id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 a 0.18181818181818182 +1 a 0.18181818181818182 +2 a 0.2727272727272727 +3 a 0.36363636363636365 +4 a 0.45454545454545453 +5 a 0.5454545454545454 +6 a 0.6363636363636364 +7 a 0.7272727272727273 +8 a 0.8181818181818182 +9 a 0.9090909090909091 +10 a 1.0 +6 b 0.18181818181818182 +6 b 0.18181818181818182 +7 b 0.2727272727272727 +8 b 0.36363636363636365 +9 b 0.45454545454545453 +10 b 0.5454545454545454 +11 b 0.6363636363636364 +12 b 0.7272727272727273 +13 b 0.8181818181818182 +14 b 0.9090909090909091 +15 b 1.0 +1 NULL 0.25 +2 NULL 0.5 +10 NULL 0.75 +13 NULL 1.0 +PREHOOK: query: select id,category,cume_dist() over (partition by category order by id),1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT))[0] +from sketch_input +join ( select category as c,ds_kll_sketch(cast(-id as float)) as ds from sketch_input group by category) q on (q.c=category) +order by category,id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,category,cume_dist() over (partition by category order by id),1.0-ds_kll_cdf(ds, CAST(-id AS FLOAT))[0] +from sketch_input +join ( select category as c,ds_kll_sketch(cast(-id as float)) as ds from sketch_input group by category) q on (q.c=category) +order by category,id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 a 0.18181818181818182 0.18181818181818177 +1 a 0.18181818181818182 0.18181818181818177 +2 a 0.2727272727272727 0.2727272727272727 +3 a 0.36363636363636365 0.36363636363636365 +4 a 0.45454545454545453 0.4545454545454546 +5 a 0.5454545454545454 0.5454545454545454 +6 a 0.6363636363636364 0.6363636363636364 +7 a 0.7272727272727273 0.7272727272727273 +8 a 0.8181818181818182 0.8181818181818181 +9 a 0.9090909090909091 0.9090909090909091 +10 a 1.0 1.0 +6 b 0.18181818181818182 0.18181818181818177 +6 b 0.18181818181818182 0.18181818181818177 +7 b 0.2727272727272727 0.2727272727272727 +8 b 0.36363636363636365 0.36363636363636365 +9 b 0.45454545454545453 0.4545454545454546 +10 b 0.5454545454545454 0.5454545454545454 +11 b 0.6363636363636364 0.6363636363636364 +12 b 0.7272727272727273 0.7272727272727273 +13 b 0.8181818181818182 0.8181818181818181 +14 b 0.9090909090909091 0.9090909090909091 +15 b 1.0 1.0 +PREHOOK: query: explain +select id,'rewrite',cume_dist() over (partition by category order by id) from sketch_input order by category,id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: explain +select id,'rewrite',cume_dist() over (partition by category order by id) from sketch_input order by category,id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE), Reducer 5 (SIMPLE_EDGE) + Reducer 3 <- Reducer 2 (SIMPLE_EDGE) + Reducer 5 <- Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 26 Data size: 2059 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: id (type: int), category (type: char(1)), UDFToFloat(COALESCE(UDFToFloat((- id)),-3.4028234663852886E38D)) (type: float) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 26 Data size: 2163 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col1 (type: char(1)) + Statistics: Num rows: 26 Data size: 2163 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: int), _col2 (type: float) + Execution mode: vectorized, llap + LLAP IO: may be used (ACID table) + Map 4 + Map Operator Tree: + TableScan + alias: sketch_input + Statistics: Num rows: 26 Data size: 2059 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: category (type: char(1)), UDFToFloat((- id)) (type: float) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 26 Data size: 2059 Basic stats: COMPLETE Column stats: COMPLETE + Group By Operator + aggregations: ds_kll_sketch(_col1) + keys: _col0 (type: char(1)) + minReductionHashAggr: 0.9230769 + mode: hash + outputColumnNames: _col0, _col1 + Statistics: Num rows: 3 Data size: 687 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 3 Data size: 687 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + Execution mode: llap + LLAP IO: may be used (ACID table) + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: char(1)) + 1 _col0 (type: char(1)) + nullSafes: [true] + outputColumnNames: _col0, _col1, _col2, _col4 + Statistics: Num rows: 26 Data size: 5907 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col0 (type: int), (1.0D - ds_kll_cdf(_col4, _col2)[0]) (type: double), _col1 (type: char(1)) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 26 Data size: 2267 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col2 (type: char(1)), _col0 (type: int) + null sort order: zz + sort order: ++ + Statistics: Num rows: 26 Data size: 2267 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: double) + Reducer 3 + Execution mode: vectorized, llap + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey1 (type: int), 'rewrite' (type: string), VALUE._col0 (type: double) + outputColumnNames: _col0, _col1, _col2 + Statistics: Num rows: 26 Data size: 2678 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 26 Data size: 2678 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Reducer 5 + Execution mode: llap + Reduce Operator Tree: + Group By Operator + aggregations: ds_kll_sketch(VALUE._col0) + keys: KEY._col0 (type: char(1)) + mode: mergepartial + outputColumnNames: _col0, _col1 + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col0 (type: char(1)) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: char(1)) + Statistics: Num rows: 2 Data size: 458 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: binary) + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select id,'rewrite',cume_dist() over (partition by category order by id) from sketch_input order by category,id +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +#### A masked pattern was here #### +POSTHOOK: query: select id,'rewrite',cume_dist() over (partition by category order by id) from sketch_input order by category,id +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +#### A masked pattern was here #### +1 rewrite 0.18181818181818177 +1 rewrite 0.18181818181818177 +2 rewrite 0.2727272727272727 +3 rewrite 0.36363636363636365 +4 rewrite 0.4545454545454546 +5 rewrite 0.5454545454545454 +6 rewrite 0.6363636363636364 +7 rewrite 0.7272727272727273 +8 rewrite 0.8181818181818181 +9 rewrite 0.9090909090909091 +10 rewrite 1.0 +6 rewrite 0.18181818181818177 +6 rewrite 0.18181818181818177 +7 rewrite 0.2727272727272727 +8 rewrite 0.36363636363636365 +9 rewrite 0.4545454545454546 +10 rewrite 0.5454545454545454 +11 rewrite 0.6363636363636364 +12 rewrite 0.7272727272727273 +13 rewrite 0.8181818181818181 +14 rewrite 0.9090909090909091 +15 rewrite 1.0 +1 rewrite 0.25 +2 rewrite 0.5 +10 rewrite 0.75 +13 rewrite 1.0