This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch sum_estimate
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git

commit dada74bb5fa9f160c17e953cf1d93ff6d74a0baf
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Mon Oct 7 16:32:33 2024 -0700

    function to get sum of the summary column estimate with bounds, updated 
engagement example
---
 ...e_sketch_int64_get_sum_estimate_and_bounds.sqlx | 45 ++++++++++++++++++
 ...tch_int64_get_sum_estimate_and_bounds_seed.sqlx | 53 ++++++++++++++++++++++
 .../test/tuple_sketch_int64_engagement_example.sql | 27 +++++++----
 tuple/tuple_sketch_int64.cpp                       | 11 +++++
 4 files changed, 126 insertions(+), 10 deletions(-)

diff --git a/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds.sqlx 
b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds.sqlx
new file mode 100644
index 0000000..4775551
--- /dev/null
+++ b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds.sqlx
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, num_std_devs BYTEINT)
+RETURNS STRUCT<sum_estimate FLOAT64, sum_lower_bound FLOAT64, sum_upper_bound 
FLOAT64>
+OPTIONS (
+  description = '''Returns the estimate and bounds for the sum of the INT64 
summary column
+scaled to the original population from the given Tuple Sketch.
+Note that cardinality estimation accuracy, plots, and error tables are the 
same as the Theta Sketch.
+This function only applies to Tuple Sketches with an INT64 summary column.
+  
+Param sketch: the given Tuple Sketch. This may not be NULL.
+Param num_std_devs: The returned bounds will be based on the statistical 
confidence interval
+  determined by the given number of standard deviations from the returned 
estimate.
+  This number may be one of {1,2,3}, where 1 represents 68% confidence,
+  2 represents 95% confidence and 3 represents 99.7% confidence.
+  For example, if the given num_std_devs = 2 and the returned values are 
{1000, 990, 1010}
+  that means that with 95% confidence, the true value lies within the range 
[990, 1010].
+Defaults: seed = 9001.
+Returns: a STRUCT with three FLOAT64 values as {sum_estimate, sum_lower_bound, 
sum_upper_bound}.
+
+For more information:
+ - https://datasketches.apache.org/docs/Tuple/TupleSketches.html
+'''
+) AS (
+  $BQ_DATASET.tuple_sketch_int64_get_sum_estimate_and_bounds_seed(sketch, 
num_std_devs, NULL)
+);
diff --git 
a/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds_seed.sqlx 
b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds_seed.sqlx
new file mode 100644
index 0000000..a90526c
--- /dev/null
+++ b/tuple/sqlx/tuple_sketch_int64_get_sum_estimate_and_bounds_seed.sqlx
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES, num_std_devs BYTEINT, seed 
INT64)
+RETURNS STRUCT<sum_estimate FLOAT64, sum_lower_bound FLOAT64, sum_upper_bound 
FLOAT64>
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/tuple_sketch_int64.js"],
+  js_parameter_encoding_mode='STANDARD',
+  description = '''Returns the estimate and bounds for the sum of the INT64 
summary column
+scaled to the original population from the given Tuple Sketch.
+Note that cardinality estimation accuracy, plots, and error tables are the 
same as the Theta Sketch.
+This function only applies to Tuple Sketches with an INT64 summary column.
+
+Param sketch: the given Tuple Sketch. This may not be NULL.
+Param num_std_devs: The returned bounds will be based on the statistical 
confidence interval
+  determined by the given number of standard deviations from the returned 
estimate.
+  This number may be one of {1,2,3}, where 1 represents 68% confidence,
+  2 represents 95% confidence and 3 represents 99.7% confidence.
+  For example, if the given num_std_devs = 2 and the returned values are 
{1000, 990, 1010}
+  that means that with 95% confidence, the true value lies within the range 
[990, 1010].
+Param seed: This is used to confirm that the given sketches were configured 
with the correct seed. A NULL specifies the default seed = 9001.
+Returns: a STRUCT with three FLOAT64 values as {sum_estimate, sum_lower_bound, 
sum_upper_bound}.
+
+For more information:
+ - https://datasketches.apache.org/docs/Tuple/TupleSketches.html
+'''
+) AS R"""
+try {
+  return Module.compact_tuple_sketch_int64.getSumEstimateAndBounds(sketch, 
Number(num_std_devs), seed ? BigInt(seed) : BigInt(Module.DEFAULT_SEED));
+} catch (e) {
+  if (e.message != null) throw e;
+  throw new Error(Module.getExceptionMessage(e));
+}
+""";
diff --git a/tuple/test/tuple_sketch_int64_engagement_example.sql 
b/tuple/test/tuple_sketch_int64_engagement_example.sql
index a99a05b..3cda3f9 100644
--- a/tuple/test/tuple_sketch_int64_engagement_example.sql
+++ b/tuple/test/tuple_sketch_int64_engagement_example.sql
@@ -18,6 +18,7 @@
  */
 
 # for details see 
https://datasketches.apache.org/docs/Tuple/TupleEngagementExample.html
+declare result_sketch bytes;
 create temp table raw_data(day int, id int);
 insert into raw_data values
 (1, 0),
@@ -918,16 +919,22 @@ insert into raw_data values
 (1, 269),
 (1, 270);
 
-with daily_agg as (
-  select day, $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(id, 1, 
struct<int, int, float64, string>(8, null, null, "ONE")) as sketch from 
raw_data group by day
-),
-result as (
+# this shows how to aggregate per day first and then union across days
+set result_sketch = (
+  with daily_agg as (
+    select day, $BQ_DATASET.tuple_sketch_int64_agg_int64_lgk_seed_p_mode(id, 
1, struct<int, int, float64, string>(8, null, null, "ONE")) as sketch from 
raw_data group by day
+  )
   select $BQ_DATASET.tuple_sketch_int64_agg_union_lgk_seed_mode(sketch, 
struct<int, int, string>(8, null, "SUM")) as sketch from daily_agg
-),
-num_days as (
-  select n from unnest(generate_array(1, 30)) as n
-)
+);
+
+# engagement histogram with bounds
 select
   n as days_visited,
-  
$BQ_DATASET.tuple_sketch_int64_get_estimate_and_bounds($BQ_DATASET.tuple_sketch_int64_filter_low_high(sketch,
 n, n), 2) as visitors
-from num_days, result;
+  
$BQ_DATASET.tuple_sketch_int64_get_estimate_and_bounds($BQ_DATASET.tuple_sketch_int64_filter_low_high(result_sketch,
 n, n), 2) as visitors
+from unnest(generate_array(1, 30)) as n;
+
+# total estimated number of visitors with bounds
+select $BQ_DATASET.tuple_sketch_int64_get_estimate_and_bounds(result_sketch, 
2);
+
+# total estimated number of visits with bounds
+select 
$BQ_DATASET.tuple_sketch_int64_get_sum_estimate_and_bounds(result_sketch, 2);
diff --git a/tuple/tuple_sketch_int64.cpp b/tuple/tuple_sketch_int64.cpp
index fbe1d46..45fd15b 100644
--- a/tuple/tuple_sketch_int64.cpp
+++ b/tuple/tuple_sketch_int64.cpp
@@ -133,6 +133,17 @@ EMSCRIPTEN_BINDINGS(tuple_sketch_int64) {
       result.set("upper_bound", sketch.get_upper_bound(num_std_devs));
       return result;
     }))
+    .class_function("getSumEstimateAndBounds", 
emscripten::optional_override([](const std::string& sketch_bytes, uint8_t 
num_std_devs, uint64_t seed) {
+      const auto sketch = 
compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), 
sketch_bytes.size(), seed);
+      uint64_t sum = 0;
+      for (const auto& entry: sketch) sum += entry.second;
+      const double sum_estimate = sum / sketch.get_theta();
+      auto result =  emscripten::val::object();
+      result.set("sum_estimate", sum_estimate);
+      result.set("sum_lower_bound", sketch.get_estimate() > 0 ? (sum_estimate 
* sketch.get_lower_bound(num_std_devs) / sketch.get_estimate()) : 0);
+      result.set("sum_upper_bound", sketch.get_estimate() > 0 ? (sum_estimate 
* sketch.get_upper_bound(num_std_devs) / sketch.get_estimate()) : 0);
+      return result;
+    }))
     .class_function("getTheta", emscripten::optional_override([](const 
std::string& sketch_bytes, uint64_t seed) {
       return compact_tuple_sketch_int64::deserialize(sketch_bytes.data(), 
sketch_bytes.size(), seed).get_theta();
     }))


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to