This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch sum_estimate in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit dada74bb5fa9f160c17e953cf1d93ff6d74a0baf Author: AlexanderSaydakov <[email protected]> AuthorDate: Mon Oct 7 16:32:33 2024 -0700 function to get sum of the summary column estimate with bounds, updated engagement example --- ...e_sketch_int64_get_sum_estimate_and_bounds.sqlx | 45 ++++++++++++++++++ ...tch_int64_get_sum_estimate_and_bounds_seed.sqlx | 53 ++++++++++++++++++++++ .../test/tuple_sketch_int64_engagement_example.sql | 27 +++++++---- tuple/tuple_sketch_int64.cpp | 11 +++++ 4 files changed, 126 insertions(+), 10 deletions(-) diff --git a/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds.sqlx b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds.sqlx new file mode 100644 index 0000000..4775551 --- /dev/null +++ b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds.sqlx @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +config { hasOutput: true } + +CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, num_std_devs BYTEINT) +RETURNS STRUCT<sum_estimate FLOAT64, sum_lower_bound FLOAT64, sum_upper_bound FLOAT64> +OPTIONS ( + description = '''Returns the estimate and bounds for the sum of the INT64 summary column +scaled to the original population from the given Tuple Sketch. +Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. +This function only applies to Tuple Sketches with an INT64 summary column. + +Param sketch: the given Tuple Sketch. This may not be NULL. +Param num_std_devs: The returned bounds will be based on the statistical confidence interval + determined by the given number of standard deviations from the returned estimate. + This number may be one of {1,2,3}, where 1 represents 68% confidence, + 2 represents 95% confidence and 3 represents 99.7% confidence. + For example, if the given num_std_devs = 2 and the returned values are {1000, 990, 1010} + that means that with 95% confidence, the true value lies within the range [990, 1010]. +Defaults: seed = 9001. +Returns: a STRUCT with three FLOAT64 values as {sum_estimate, sum_lower_bound, sum_upper_bound}. + +For more information: + - https://datasketches.apache.org/docs/Tuple/TupleSketches.html +''' +) AS ( + $BQ_DATASET.tuple_sketch_int64_get_sum_estimate_and_bounds_seed(sketch, num_std_devs, NULL) +); diff --git a/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds_seed.sqlx b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds_seed.sqlx new file mode 100644 index 0000000..a90526c --- /dev/null +++ b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds_seed.sqlx @@ -0,0 +1,53 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +config { hasOutput: true } + +CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, num_std_devs BYTEINT, seed INT64) +RETURNS STRUCT<sum_estimate FLOAT64, sum_lower_bound FLOAT64, sum_upper_bound FLOAT64> +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"], + js_parameter_encoding_mode='STANDARD', + description = '''Returns the estimate and bounds for the sum of the INT64 summary column +scaled to the original population from the given Tuple Sketch. +Note that cardinality estimation accuracy, plots, and error tables are the same as the Theta Sketch. +This function only applies to Tuple Sketches with an INT64 summary column. + +Param sketch: the given Tuple Sketch. This may not be NULL. +Param num_std_devs: The returned bounds will be based on the statistical confidence interval + determined by the given number of standard deviations from the returned estimate. + This number may be one of {1,2,3}, where 1 represents 68% confidence, + 2 represents 95% confidence and 3 represents 99.7% confidence. + For example, if the given num_std_devs = 2 and the returned values are {1000, 990, 1010} + that means that with 95% confidence, the true value lies within the range [990, 1010]. +Param seed: This is used to confirm that the given sketches were configured with the correct seed. A NULL specifies the default seed = 9001. +Returns: a STRUCT with three FLOAT64 values as {sum_estimate, sum_lower_bound, sum_upper_bound}. + +For more information: + - https://datasketches.apache.org/docs/Tuple/TupleSketches.html +''' +) AS R""" +try { + return Module.compact_tuple_sketch_int64.getSumEstimateAndBounds(sketch, Number(num_std_devs), seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED)); +} catch (e) { + if (e.message != null) throw e; + throw new Error(Module.getExceptionMessage(e)); +} +"""; diff --git a/tuple/test/tuple_sketch_int64_engagement_example.sql b/tuple/test/tuple_sketch_int64_engagement_example.sql index a99a05b..3cda3f9 100644 --- a/tuple/test/tuple_sketch_int64_engagement_example.sql +++ b/tuple/test/tuple_sketch_int64_engagement_example.sql @@ -18,6 +18,7 @@ */ # for details see https://datasketches.apache.org/docs/Tuple/TupleEngagementExample.html +declare result_sketch bytes; create temp table raw_data(day int, id int); insert into raw_data values (1, 0), @@ -918,16 +919,22 @@ insert into raw_data values (1, 269), (1, 270); -with daily_agg as ( - select day, $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(id, 1, struct<int, int, float64, string>(8, null, null, "ONE")) as sketch from raw_data group by day -), -result as ( +# this shows how to aggregate per day first and then union across days +set result_sketch = ( + with daily_agg as ( + select day, $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(id, 1, struct<int, int, float64, string>(8, null, null, "ONE")) as sketch from raw_data group by day + ) select $BQ_DATASET.tuple_sketch_int64_agg_union_lgk_seed_mode(sketch, struct<int, int, string>(8, null, "SUM")) as sketch from daily_agg -), -num_days as ( - select n from unnest(generate_array(1, 30)) as n -) +); + +# engagement histogram with bounds select n as days_visited, - $BQ_DATASET.tuple_sketch_int64_get_estimate_and_bounds($BQ_DATASET.tuple_sketch_int64_filter_low_high(sketch, n, n), 2) as visitors -from num_days, result; + $BQ_DATASET.tuple_sketch_int64_get_estimate_and_bounds($BQ_DATASET.tuple_sketch_int64_filter_low_high(result_sketch, n, n), 2) as visitors +from unnest(generate_array(1, 30)) as n; + +# total estimated number of visitors with bounds +select $BQ_DATASET.tuple_sketch_int64_get_estimate_and_bounds(result_sketch, 2); + +# total estimated number of visits with bounds +select $BQ_DATASET.tuple_sketch_int64_get_sum_estimate_and_bounds(result_sketch, 2); diff --git a/tuple/tuple_sketch_int64.cpp b/tuple/tuple_sketch_int64.cpp index fbe1d46..45fd15b 100644 --- a/tuple/tuple_sketch_int64.cpp +++ b/tuple/tuple_sketch_int64.cpp @@ -133,6 +133,17 @@ EMSCRIPTEN_BINDINGS(tuple_sketch_int64) { result.set("upper_bound", sketch.get_upper_bound(num_std_devs)); return result; })) + .class_function("getSumEstimateAndBounds", emscripten::optional_override([](const std::string& sketch_bytes, uint8_t num_std_devs, uint64_t seed) { + const auto sketch = compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed); + uint64_t sum = 0; + for (const auto& entry: sketch) sum += entry.second; + const double sum_estimate = sum / sketch.get_theta(); + auto result = emscripten::val::object(); + result.set("sum_estimate", sum_estimate); + result.set("sum_lower_bound", sketch.get_estimate() > 0 ? (sum_estimate * sketch.get_lower_bound(num_std_devs) / sketch.get_estimate()) : 0); + result.set("sum_upper_bound", sketch.get_estimate() > 0 ? (sum_estimate * sketch.get_upper_bound(num_std_devs) / sketch.get_estimate()) : 0); + return result; + })) .class_function("getTheta", emscripten::optional_override([](const std::string& sketch_bytes, uint64_t seed) { return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), sketch_bytes.size(), seed).get_theta(); })) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
