This is an automated email from the ASF dual-hosted git repository. kgyrtkirk pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 2105c66 HIVE-22940: Make the datasketches functions available as predefined functions (Zoltan Haindrich reviewed by Jesus Camacho Rodriguez) 2105c66 is described below commit 2105c6617ef9609dd8b2f712f596c2f9cc6d972e Author: Zoltan Haindrich <k...@rxd.hu> AuthorDate: Mon Mar 23 07:58:54 2020 +0000 HIVE-22940: Make the datasketches functions available as predefined functions (Zoltan Haindrich reviewed by Jesus Camacho Rodriguez) Signed-off-by: Zoltan Haindrich <k...@rxd.hu> --- .../test/resources/testconfiguration.properties | 2 + pom.xml | 1 + ql/pom.xml | 10 + .../hadoop/hive/ql/exec/DataSketchesFunctions.java | 221 +++++++++++++++++++++ .../hadoop/hive/ql/exec/FunctionRegistry.java | 3 +- ql/src/test/queries/clientpositive/sketches_hll.q | 16 ++ .../test/queries/clientpositive/sketches_theta.q | 33 +++ .../results/clientpositive/llap/sketches_hll.q.out | 59 ++++++ .../clientpositive/llap/sketches_theta.q.out | 120 +++++++++++ .../results/clientpositive/show_functions.q.out | 136 +++++++++++++ 10 files changed, 599 insertions(+), 2 deletions(-) diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index f71ed3d..3510016 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -818,6 +818,8 @@ minillaplocal.query.files=\ schq_materialized.q,\ schq_analyze.q,\ schq_ingest.q,\ + sketches_hll.q,\ + sketches_theta.q,\ table_access_keys_stats.q,\ temp_table_llap_partitioned.q,\ tez_bmj_schema_evolution.q,\ diff --git a/pom.xml b/pom.xml index af70972..579e745 100644 --- a/pom.xml +++ b/pom.xml @@ -228,6 +228,7 @@ <json-path.version>2.4.0</json-path.version> <janino.version>3.0.11</janino.version> <snakeyaml.version>1.23</snakeyaml.version> + <datasketches.version>1.0.0-incubating</datasketches.version> </properties> <repositories> diff --git a/ql/pom.xml b/ql/pom.xml index 161a527..9b45d31 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -313,6 +313,11 @@ <scope>test</scope> </dependency> <dependency> + <groupId>org.apache.datasketches</groupId> + <artifactId>datasketches-hive</artifactId> + <version>${datasketches.version}</version> + </dependency> + <dependency> <groupId>com.lmax</groupId> <artifactId>disruptor</artifactId> <version>${disruptor.version}</version> @@ -1007,6 +1012,7 @@ <include>io.dropwizard.metrics:metrics-jvm</include> <include>io.dropwizard.metrics:metrics-json</include> <include>com.zaxxer:HikariCP</include> + <include>org.apache.datasketches:*</include> <include>org.apache.calcite:*</include> <include>org.apache.calcite.avatica:avatica</include> </includes> @@ -1040,6 +1046,10 @@ <pattern>com.google.thirdparty.publicsuffix</pattern> <shadedPattern>org.apache.hive.com.google.thirdparty.publicsuffix</shadedPattern> </relocation> + <relocation> + <pattern>org.apache.datasketches</pattern> + <shadedPattern>org.apache.hive.org.apache.datasketches</shadedPattern> + </relocation> </relocations> </configuration> </execution> diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java new file mode 100644 index 0000000..b9d265f --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/DataSketchesFunctions.java @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.exec; + +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver2; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDTF; + +/** + * Registers functions from the DataSketches library as builtin functions. + * + * In an effort to show a more consistent + */ +public class DataSketchesFunctions { + + private static final String DATA_TO_SKETCH = "sketch"; + private static final String SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS = "estimate_bounds"; + private static final String SKETCH_TO_ESTIMATE = "estimate"; + private static final String SKETCH_TO_STRING = "stringify"; + private static final String UNION_SKETCH = "union"; + private static final String UNION_SKETCH1 = "union_f"; + private static final String GET_N = "n"; + private static final String GET_CDF = "cdf"; + private static final String GET_PMF = "pmf"; + private static final String GET_QUANTILES = "quantiles"; + private static final String GET_QUANTILE = "quantile"; + private static final String GET_RANK = "rank"; + private static final String INTERSECT_SKETCH = "intersect"; + private static final String INTERSECT_SKETCH1 = "intersect_f"; + private static final String EXCLUDE_SKETCH = "exclude"; + private static final String GET_K = "k"; + private static final String GET_FREQUENT_ITEMS = "frequent_items"; + private static final String T_TEST = "ttest"; + private static final String SKETCH_TO_MEANS = "means"; + private static final String SKETCH_TO_NUMBER_OF_RETAINED_ENTRIES = "n_retained"; + private static final String SKETCH_TO_QUANTILES_SKETCH = "quantiles_sketch"; + private static final String SKETCH_TO_VALUES = "values"; + private static final String SKETCH_TO_VARIANCES = "variances"; + private static final String SKETCH_TO_PERCENTILE = "percentile"; + + private final Registry system; + + public DataSketchesFunctions(Registry system) { + this.system = system; + } + + public static void register(Registry system) { + DataSketchesFunctions dsf = new DataSketchesFunctions(system); + String prefix = "ds"; + dsf.registerHll(prefix); + dsf.registerCpc(prefix); + dsf.registerKll(prefix); + dsf.registerTheta(prefix); + dsf.registerTuple(prefix); + dsf.registerQuantiles(prefix); + dsf.registerFrequencies(prefix); + } + + private void registerHll(String prefix) { + String p = prefix + "_hll_"; + registerUDAF(org.apache.datasketches.hive.hll.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.hll.SketchToEstimateAndErrorBoundsUDF.class, + p + SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS); + registerUDF(org.apache.datasketches.hive.hll.SketchToEstimateUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.hll.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDF(org.apache.datasketches.hive.hll.UnionSketchUDF.class, p + UNION_SKETCH1); + registerUDAF(org.apache.datasketches.hive.hll.UnionSketchUDAF.class, p + UNION_SKETCH); + } + + private void registerCpc(String prefix) { + String p = prefix + "_cpc_"; + registerUDAF(org.apache.datasketches.hive.cpc.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: normalize GetEstimateAndErrorBoundsUDF vs SketchToEstimateAndErrorBoundsUDF + registerUDF(org.apache.datasketches.hive.cpc.GetEstimateAndErrorBoundsUDF.class, + p + SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS); + // FIXME: normalize GetEstimateUDF vs SketchToEstimateUDF + registerUDF(org.apache.datasketches.hive.cpc.GetEstimateUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.cpc.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDF(org.apache.datasketches.hive.cpc.UnionSketchUDF.class, p + UNION_SKETCH1); + registerUDAF(org.apache.datasketches.hive.cpc.UnionSketchUDAF.class, p + UNION_SKETCH); + } + + private void registerKll(String prefix) { + String p = prefix + "_kll_"; + registerUDAF(org.apache.datasketches.hive.kll.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.kll.SketchToStringUDF.class, p + SKETCH_TO_STRING); + // registerUDF(org.apache.datasketches.hive.kll.UnionSketchUDF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.kll.UnionSketchUDAF.class, p + UNION_SKETCH); + + registerUDF(org.apache.datasketches.hive.kll.GetNUDF.class, p + GET_N); + registerUDF(org.apache.datasketches.hive.kll.GetCdfUDF.class, p + GET_CDF); + registerUDF(org.apache.datasketches.hive.kll.GetPmfUDF.class, p + GET_PMF); + registerUDF(org.apache.datasketches.hive.kll.GetQuantilesUDF.class, p + GET_QUANTILES); + registerUDF(org.apache.datasketches.hive.kll.GetQuantileUDF.class, p + GET_QUANTILE); + registerUDF(org.apache.datasketches.hive.kll.GetRankUDF.class, p + GET_RANK); + } + + private void registerTheta(String prefix) { + String p = prefix + "_theta_"; + registerUDAF(org.apache.datasketches.hive.theta.DataToSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.theta.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDF(org.apache.datasketches.hive.theta.UnionSketchUDF.class, p + UNION_SKETCH1); + registerUDAF(org.apache.datasketches.hive.theta.UnionSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.theta.IntersectSketchUDF.class, p + INTERSECT_SKETCH1); + registerUDAF(org.apache.datasketches.hive.theta.IntersectSketchUDAF.class, p + INTERSECT_SKETCH); + registerUDF(org.apache.datasketches.hive.theta.EstimateSketchUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.theta.ExcludeSketchUDF.class, p + EXCLUDE_SKETCH); + + } + + private void registerTuple(String prefix) { + registerTupleArrayOfDoubles(prefix + "_tuple_arrayofdouble"); + registerTupleDoubleSummary(prefix + "_tuple_doublesummary"); + } + + private void registerTupleArrayOfDoubles(String string) { + String p = string + "_"; + registerUDAF(org.apache.datasketches.hive.tuple.DataToArrayOfDoublesSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.theta.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDAF(org.apache.datasketches.hive.tuple.UnionArrayOfDoublesSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchesTTestUDF.class, p + T_TEST); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToEstimatesUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToEstimateAndErrorBoundsUDF.class, + p + SKETCH_TO_ESTIMATE_WITH_ERROR_BOUNDS); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToMeansUDF.class, p + SKETCH_TO_MEANS); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToNumberOfRetainedEntriesUDF.class, + p + SKETCH_TO_NUMBER_OF_RETAINED_ENTRIES); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToQuantilesSketchUDF.class, + p + SKETCH_TO_QUANTILES_SKETCH); + registerUDTF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToValuesUDTF.class, p + SKETCH_TO_VALUES); + registerUDF(org.apache.datasketches.hive.tuple.ArrayOfDoublesSketchToVariancesUDF.class, p + SKETCH_TO_VARIANCES); + } + + private void registerTupleDoubleSummary(String string) { + String p = string + "_"; + registerUDAF(org.apache.datasketches.hive.tuple.DataToDoubleSummarySketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.theta.SketchToStringUDF.class, p + SKETCH_TO_STRING); + registerUDAF(org.apache.datasketches.hive.tuple.UnionDoubleSummarySketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.tuple.DoubleSummarySketchToEstimatesUDF.class, p + SKETCH_TO_ESTIMATE); + registerUDF(org.apache.datasketches.hive.tuple.DoubleSummarySketchToPercentileUDF.class, p + SKETCH_TO_PERCENTILE); + } + + private void registerQuantiles(String prefix) { + registerQuantilesString(prefix + "_quantile"); + registerQuantilesDoubles(prefix + "_quantile"); + } + + private void registerFrequencies(String prefix) { + String p = prefix + "_freq_"; + registerUDAF(org.apache.datasketches.hive.frequencies.DataToStringsSketchUDAF.class, p + DATA_TO_SKETCH); + // FIXME: missing? + //registerUDF(org.apache.datasketches.hive.frequencies.DoublesSketchToStringUDF.class, p + SKETCH_TO_STRING); + //registerUDF(org.apache.datasketches.hive.quantiles.UnionItemsSketchUDAF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.frequencies.UnionStringsSketchUDAF.class, p + UNION_SKETCH); + registerUDTF(org.apache.datasketches.hive.frequencies.GetFrequentItemsFromStringsSketchUDTF.class, + p + GET_FREQUENT_ITEMS); + } + + private void registerQuantilesString(String prefix) { + String p = prefix + "_strings_"; + registerUDAF(org.apache.datasketches.hive.quantiles.DataToStringsSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.StringsSketchToStringUDF.class, p + SKETCH_TO_STRING); + //registerUDF(org.apache.datasketches.hive.quantiles.UnionItemsSketchUDAF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.quantiles.UnionStringsSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.GetNFromStringsSketchUDF.class, p + GET_N); + registerUDF(org.apache.datasketches.hive.quantiles.GetKFromStringsSketchUDF.class, p + GET_K); + registerUDF(org.apache.datasketches.hive.quantiles.GetCdfFromStringsSketchUDF.class, p + GET_CDF); + registerUDF(org.apache.datasketches.hive.quantiles.GetPmfFromStringsSketchUDF.class, p + GET_PMF); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantileFromStringsSketchUDF.class, p + GET_QUANTILE); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantilesFromStringsSketchUDF.class, p + GET_QUANTILES); + } + + private void registerQuantilesDoubles(String prefix) { + String p = prefix + "_doubles_"; + registerUDAF(org.apache.datasketches.hive.quantiles.DataToDoublesSketchUDAF.class, p + DATA_TO_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.DoublesSketchToStringUDF.class, p + SKETCH_TO_STRING); + //registerUDF(org.apache.datasketches.hive.quantiles.UnionItemsSketchUDAF.class, p + UNION_SKETCH); + registerUDAF(org.apache.datasketches.hive.quantiles.UnionDoublesSketchUDAF.class, p + UNION_SKETCH); + registerUDF(org.apache.datasketches.hive.quantiles.GetNFromDoublesSketchUDF.class, p + GET_N); + registerUDF(org.apache.datasketches.hive.quantiles.GetKFromDoublesSketchUDF.class, p + GET_K); + registerUDF(org.apache.datasketches.hive.quantiles.GetCdfFromDoublesSketchUDF.class, p + GET_CDF); + registerUDF(org.apache.datasketches.hive.quantiles.GetPmfFromDoublesSketchUDF.class, p + GET_PMF); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantileFromDoublesSketchUDF.class, p + GET_QUANTILE); + registerUDF(org.apache.datasketches.hive.quantiles.GetQuantilesFromDoublesSketchUDF.class, p + GET_QUANTILES); + } + + private void registerUDF(Class<? extends UDF> udfClass, String name) { + system.registerUDF(name, udfClass, false); + } + + private void registerUDAF(Class<? extends GenericUDAFResolver2> udafClass, String name) { + try { + system.registerGenericUDAF(name, udafClass.newInstance()); + } catch (InstantiationException | IllegalAccessException e) { + throw new RuntimeException("Unable to register: " + name, e); + } + } + + private void registerUDTF(Class<? extends GenericUDTF> udtfClass, String name) { + system.registerGenericUDTF(name, udtfClass); + } + +} diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java index db5ee8d..dc3781a 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/FunctionRegistry.java @@ -140,8 +140,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.Pr import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorUtils.PrimitiveGrouping; import org.apache.hadoop.hive.serde2.typeinfo.HiveDecimalUtils; -import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo; -import org.apache.hadoop.hive.serde2.typeinfo.MapTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.StructTypeInfo; import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo; @@ -475,6 +473,7 @@ public final class FunctionRegistry { system.registerGenericUDAF("percentile_cont", new GenericUDAFPercentileCont()); system.registerGenericUDAF("percentile_disc", new GenericUDAFPercentileDisc()); + DataSketchesFunctions.register(system); // Generic UDFs system.registerGenericUDF("reflect", GenericUDFReflect.class); diff --git a/ql/src/test/queries/clientpositive/sketches_hll.q b/ql/src/test/queries/clientpositive/sketches_hll.q new file mode 100644 index 0000000..56467a6 --- /dev/null +++ b/ql/src/test/queries/clientpositive/sketches_hll.q @@ -0,0 +1,16 @@ +-- prepare input data +create temporary table sketch_input (id int, category char(1)); +insert into table sketch_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'); + +-- build sketches per category +create temporary table sketch_intermediate (category char(1), sketch binary); +insert into sketch_intermediate select category, ds_hll_sketch(id) from sketch_input group by category; + +-- get unique count estimates per category +select category, ds_hll_estimate(sketch) from sketch_intermediate; + + +-- union sketches across categories and get overall unique count estimate +select ds_hll_estimate(ds_hll_union(sketch)) from sketch_intermediate; diff --git a/ql/src/test/queries/clientpositive/sketches_theta.q b/ql/src/test/queries/clientpositive/sketches_theta.q new file mode 100644 index 0000000..6ab7278 --- /dev/null +++ b/ql/src/test/queries/clientpositive/sketches_theta.q @@ -0,0 +1,33 @@ +-- see here: https://datasketches.apache.org/docs/Theta/ThetaHiveUDFs.html + +create temporary table theta_input (id int, category char(1)); +insert into table theta_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b'); + +create temporary table sketch_intermediate (category char(1), sketch binary); +insert into sketch_intermediate select category, ds_theta_sketch(id) from theta_input group by category; + +select category, ds_theta_estimate(sketch) from sketch_intermediate; + +select ds_theta_estimate(ds_theta_union(sketch)) from sketch_intermediate; + + + +create temporary table sketch_input (id1 int, id2 int); +insert into table sketch_input values + (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20); + +create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary); + +insert into sketch_intermediate2 select ds_theta_sketch(id1), ds_theta_sketch(id2) from sketch_input; + +select + ds_theta_estimate(sketch1), + ds_theta_estimate(sketch2), + ds_theta_estimate(ds_theta_union_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_intersect_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch2, sketch1)) +from sketch_intermediate2; + diff --git a/ql/src/test/results/clientpositive/llap/sketches_hll.q.out b/ql/src/test/results/clientpositive/llap/sketches_hll.q.out new file mode 100644 index 0000000..9ebce86 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/sketches_hll.q.out @@ -0,0 +1,59 @@ +PREHOOK: query: create temporary table sketch_input (id int, category char(1)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create temporary table sketch_input (id int, category char(1)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.category SCRIPT [] +POSTHOOK: Lineage: sketch_input.id SCRIPT [] +PREHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_intermediate +PREHOOK: query: insert into sketch_intermediate select category, ds_hll_sketch(id) from sketch_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: insert into sketch_intermediate select category, ds_hll_sketch(id) from sketch_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@sketch_intermediate +POSTHOOK: Lineage: sketch_intermediate.category SIMPLE [(sketch_input)sketch_input.FieldSchema(name:category, type:char(1), comment:null), ] +POSTHOOK: Lineage: sketch_intermediate.sketch EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id, type:int, comment:null), ] +PREHOOK: query: select category, ds_hll_estimate(sketch) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select category, ds_hll_estimate(sketch) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +a 10.000000223517425 +b 10.000000223517425 +PREHOOK: query: select ds_hll_estimate(ds_hll_union(sketch)) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select ds_hll_estimate(ds_hll_union(sketch)) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +15.000000521540663 diff --git a/ql/src/test/results/clientpositive/llap/sketches_theta.q.out b/ql/src/test/results/clientpositive/llap/sketches_theta.q.out new file mode 100644 index 0000000..b3ea64d --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/sketches_theta.q.out @@ -0,0 +1,120 @@ +PREHOOK: query: create temporary table theta_input (id int, category char(1)) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@theta_input +POSTHOOK: query: create temporary table theta_input (id int, category char(1)) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@theta_input +PREHOOK: query: insert into table theta_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@theta_input +POSTHOOK: query: insert into table theta_input values + (1, 'a'), (2, 'a'), (3, 'a'), (4, 'a'), (5, 'a'), (6, 'a'), (7, 'a'), (8, 'a'), (9, 'a'), (10, 'a'), + (6, 'b'), (7, 'b'), (8, 'b'), (9, 'b'), (10, 'b'), (11, 'b'), (12, 'b'), (13, 'b'), (14, 'b'), (15, 'b') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@theta_input +POSTHOOK: Lineage: theta_input.category SCRIPT [] +POSTHOOK: Lineage: theta_input.id SCRIPT [] +PREHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: create temporary table sketch_intermediate (category char(1), sketch binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_intermediate +PREHOOK: query: insert into sketch_intermediate select category, ds_theta_sketch(id) from theta_input group by category +PREHOOK: type: QUERY +PREHOOK: Input: default@theta_input +PREHOOK: Output: default@sketch_intermediate +POSTHOOK: query: insert into sketch_intermediate select category, ds_theta_sketch(id) from theta_input group by category +POSTHOOK: type: QUERY +POSTHOOK: Input: default@theta_input +POSTHOOK: Output: default@sketch_intermediate +POSTHOOK: Lineage: sketch_intermediate.category SIMPLE [(theta_input)theta_input.FieldSchema(name:category, type:char(1), comment:null), ] +POSTHOOK: Lineage: sketch_intermediate.sketch EXPRESSION [(theta_input)theta_input.FieldSchema(name:id, type:int, comment:null), ] +PREHOOK: query: select category, ds_theta_estimate(sketch) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select category, ds_theta_estimate(sketch) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +a 10.0 +b 10.0 +PREHOOK: query: select ds_theta_estimate(ds_theta_union(sketch)) from sketch_intermediate +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +POSTHOOK: query: select ds_theta_estimate(ds_theta_union(sketch)) from sketch_intermediate +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate +#### A masked pattern was here #### +15.0 +PREHOOK: query: create temporary table sketch_input (id1 int, id2 int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_input +POSTHOOK: query: create temporary table sketch_input (id1 int, id2 int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_input +PREHOOK: query: insert into table sketch_input values + (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@sketch_input +POSTHOOK: query: insert into table sketch_input values + (1, 2), (2, 4), (3, 6), (4, 8), (5, 10), (6, 12), (7, 14), (8, 16), (9, 18), (10, 20) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@sketch_input +POSTHOOK: Lineage: sketch_input.id1 SCRIPT [] +POSTHOOK: Lineage: sketch_input.id2 SCRIPT [] +PREHOOK: query: create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@sketch_intermediate2 +POSTHOOK: query: create temporary table sketch_intermediate2 (sketch1 binary, sketch2 binary) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@sketch_intermediate2 +PREHOOK: query: insert into sketch_intermediate2 select ds_theta_sketch(id1), ds_theta_sketch(id2) from sketch_input +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_input +PREHOOK: Output: default@sketch_intermediate2 +POSTHOOK: query: insert into sketch_intermediate2 select ds_theta_sketch(id1), ds_theta_sketch(id2) from sketch_input +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_input +POSTHOOK: Output: default@sketch_intermediate2 +POSTHOOK: Lineage: sketch_intermediate2.sketch1 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id1, type:int, comment:null), ] +POSTHOOK: Lineage: sketch_intermediate2.sketch2 EXPRESSION [(sketch_input)sketch_input.FieldSchema(name:id2, type:int, comment:null), ] +PREHOOK: query: select + ds_theta_estimate(sketch1), + ds_theta_estimate(sketch2), + ds_theta_estimate(ds_theta_union_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_intersect_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch2, sketch1)) +from sketch_intermediate2 +PREHOOK: type: QUERY +PREHOOK: Input: default@sketch_intermediate2 +#### A masked pattern was here #### +POSTHOOK: query: select + ds_theta_estimate(sketch1), + ds_theta_estimate(sketch2), + ds_theta_estimate(ds_theta_union_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_intersect_f(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch1, sketch2)), + ds_theta_estimate(ds_theta_exclude(sketch2, sketch1)) +from sketch_intermediate2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@sketch_intermediate2 +#### A masked pattern was here #### +10.0 10.0 15.0 5.0 5.0 5.0 diff --git a/ql/src/test/results/clientpositive/show_functions.q.out b/ql/src/test/results/clientpositive/show_functions.q.out index 0453400..4b38cfb 100644 --- a/ql/src/test/results/clientpositive/show_functions.q.out +++ b/ql/src/test/results/clientpositive/show_functions.q.out @@ -105,6 +105,69 @@ decode degrees dense_rank div +ds_cpc_estimate +ds_cpc_estimate_bounds +ds_cpc_sketch +ds_cpc_stringify +ds_cpc_union +ds_cpc_union_f +ds_freq_frequent_items +ds_freq_sketch +ds_freq_union +ds_hll_estimate +ds_hll_estimate_bounds +ds_hll_sketch +ds_hll_stringify +ds_hll_union +ds_hll_union_f +ds_kll_cdf +ds_kll_n +ds_kll_pmf +ds_kll_quantile +ds_kll_quantiles +ds_kll_rank +ds_kll_sketch +ds_kll_stringify +ds_kll_union +ds_quantile_doubles_cdf +ds_quantile_doubles_k +ds_quantile_doubles_n +ds_quantile_doubles_pmf +ds_quantile_doubles_quantile +ds_quantile_doubles_quantiles +ds_quantile_doubles_sketch +ds_quantile_doubles_stringify +ds_quantile_doubles_union +ds_quantile_strings_cdf +ds_quantile_strings_k +ds_quantile_strings_n +ds_quantile_strings_pmf +ds_quantile_strings_quantile +ds_quantile_strings_quantiles +ds_quantile_strings_sketch +ds_quantile_strings_stringify +ds_quantile_strings_union +ds_theta_estimate +ds_theta_exclude +ds_theta_intersect +ds_theta_intersect_f +ds_theta_sketch +ds_theta_union +ds_theta_union_f +ds_tuple_arrayofdouble_estimate +ds_tuple_arrayofdouble_estimate_bounds +ds_tuple_arrayofdouble_means +ds_tuple_arrayofdouble_n_retained +ds_tuple_arrayofdouble_quantiles_sketch +ds_tuple_arrayofdouble_sketch +ds_tuple_arrayofdouble_ttest +ds_tuple_arrayofdouble_union +ds_tuple_arrayofdouble_values +ds_tuple_arrayofdouble_variances +ds_tuple_doublesummary_estimate +ds_tuple_doublesummary_percentile +ds_tuple_doublesummary_sketch +ds_tuple_doublesummary_union e elt encode @@ -392,6 +455,16 @@ coalesce current_database current_date decode +ds_cpc_estimate +ds_hll_estimate +ds_kll_quantile +ds_quantile_doubles_quantile +ds_quantile_strings_quantile +ds_theta_estimate +ds_theta_exclude +ds_tuple_arrayofdouble_estimate +ds_tuple_doublesummary_estimate +ds_tuple_doublesummary_percentile e encode explode @@ -540,6 +613,69 @@ decode degrees dense_rank div +ds_cpc_estimate +ds_cpc_estimate_bounds +ds_cpc_sketch +ds_cpc_stringify +ds_cpc_union +ds_cpc_union_f +ds_freq_frequent_items +ds_freq_sketch +ds_freq_union +ds_hll_estimate +ds_hll_estimate_bounds +ds_hll_sketch +ds_hll_stringify +ds_hll_union +ds_hll_union_f +ds_kll_cdf +ds_kll_n +ds_kll_pmf +ds_kll_quantile +ds_kll_quantiles +ds_kll_rank +ds_kll_sketch +ds_kll_stringify +ds_kll_union +ds_quantile_doubles_cdf +ds_quantile_doubles_k +ds_quantile_doubles_n +ds_quantile_doubles_pmf +ds_quantile_doubles_quantile +ds_quantile_doubles_quantiles +ds_quantile_doubles_sketch +ds_quantile_doubles_stringify +ds_quantile_doubles_union +ds_quantile_strings_cdf +ds_quantile_strings_k +ds_quantile_strings_n +ds_quantile_strings_pmf +ds_quantile_strings_quantile +ds_quantile_strings_quantiles +ds_quantile_strings_sketch +ds_quantile_strings_stringify +ds_quantile_strings_union +ds_theta_estimate +ds_theta_exclude +ds_theta_intersect +ds_theta_intersect_f +ds_theta_sketch +ds_theta_union +ds_theta_union_f +ds_tuple_arrayofdouble_estimate +ds_tuple_arrayofdouble_estimate_bounds +ds_tuple_arrayofdouble_means +ds_tuple_arrayofdouble_n_retained +ds_tuple_arrayofdouble_quantiles_sketch +ds_tuple_arrayofdouble_sketch +ds_tuple_arrayofdouble_ttest +ds_tuple_arrayofdouble_union +ds_tuple_arrayofdouble_values +ds_tuple_arrayofdouble_variances +ds_tuple_doublesummary_estimate +ds_tuple_doublesummary_percentile +ds_tuple_doublesummary_sketch +ds_tuple_doublesummary_union e elt encode