This is an automated email from the ASF dual-hosted git repository.

alsay pushed a commit to branch hll
in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git

commit 24fc3cf68b9be75011968142b7ac0ffbc0cf82aa
Author: AlexanderSaydakov <[email protected]>
AuthorDate: Tue Aug 20 18:55:47 2024 -0700

    HLL sketch
---
 hll/Makefile                     |  44 ++++++++++++
 hll/hll_sketch.cpp               | 102 +++++++++++++++++++++++++++
 hll/hll_sketch_agg_string.sqlx   | 145 +++++++++++++++++++++++++++++++++++++++
 hll/hll_sketch_get_estimate.sqlx |  43 ++++++++++++
 hll/hll_sketch_to_string.sqlx    |  43 ++++++++++++
 5 files changed, 377 insertions(+)

diff --git a/hll/Makefile b/hll/Makefile
new file mode 100644
index 0000000..bd08fe0
--- /dev/null
+++ b/hll/Makefile
@@ -0,0 +1,44 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Emscripten compiler driver and the flags shared by every build flavour.
+EMCC=emcc
+EMCFLAGS=-I../datasketches-cpp/common/include \
+       -I../datasketches-cpp/hll/include \
+       --no-entry \
+       -sWASM_BIGINT=1 \
+       -sEXPORTED_FUNCTIONS=[_malloc,_free] \
+       -sENVIRONMENT=shell \
+       -sTOTAL_MEMORY=1024MB \
+       -O3 \
+       --bind
+
+# Default target: build all three flavours of the HLL sketch library.
+all: hll_sketch.mjs hll_sketch.js hll_sketch.wasm
+
+# this rule creates an es6 module (same command as the .js rule; only the
+# output file extension differs)
+%.mjs: %.cpp
+       $(EMCC) $< $(EMCFLAGS) -sSINGLE_FILE=1 -o $@
+
+# this rule creates a non-es6 loadable library
+%.js: %.cpp
+       $(EMCC) $< $(EMCFLAGS) -sSINGLE_FILE=1 -o $@
+
+# this rule creates a standalone .wasm binary
+%.wasm: %.cpp
+       $(EMCC) $< $(EMCFLAGS) -sSTANDALONE_WASM=1 -o $@
+
+clean:
+       $(RM) *.mjs *.js *.wasm
+
+# Neither target names a real file, so both must always be considered out of date.
+.PHONY: all clean
diff --git a/hll/hll_sketch.cpp b/hll/hll_sketch.cpp
new file mode 100644
index 0000000..845c35f
--- /dev/null
+++ b/hll/hll_sketch.cpp
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <strstream>
+#include <emscripten/bind.h>
+
+#include <hll.hpp>
+
+#include "base64.hpp"
+
+// JavaScript Uint8Array constructor, looked up once and used below to build
+// the byte arrays handed back to JavaScript.
+const emscripten::val Uint8Array = emscripten::val::global("Uint8Array");
+
+// embind registrations for the HLL sketch and HLL union classes, plus a helper
+// that extracts the message of a C++ exception.
+EMSCRIPTEN_BINDINGS(hll_sketch) {
+  namespace em = emscripten;
+  namespace ds = datasketches;
+
+  // Interprets the given integer as the address of a std::exception and
+  // returns its what() text.
+  em::function("getExceptionMessage", em::optional_override([](intptr_t ptr) {
+    const auto* ex = reinterpret_cast<std::exception*>(ptr);
+    return std::string(ex->what());
+  }));
+
+  em::enum_<ds::target_hll_type>("TargetHllType")
+    .value("HLL_4", ds::HLL_4)
+    .value("HLL_6", ds::HLL_6)
+    .value("HLL_8", ds::HLL_8)
+    ;
+
+  em::class_<ds::hll_sketch>("hll_sketch")
+    .constructor(em::optional_override([](uint8_t lg_k, ds::target_hll_type tgt_type) {
+      return new ds::hll_sketch(lg_k, tgt_type);
+    }))
+    .function("updateString", em::select_overload<void(const std::string&)>(&ds::hll_sketch::update))
+    // compact serialized image, copied into a new JavaScript Uint8Array
+    .function("serializeAsUint8Array", em::optional_override([](const ds::hll_sketch& sk) {
+      auto image = sk.serialize_compact();
+      return Uint8Array.new_(em::typed_memory_view(image.size(), image.data()));
+    }))
+    // base64 text -> raw bytes -> sketch
+    .class_function("deserializeFromB64", em::optional_override([](const std::string& b64) {
+      std::vector<char> decoded(b64_dec_len(b64.data(), b64.size()));
+      b64_decode(b64.data(), b64.size(), decoded.data());
+      return new ds::hll_sketch(ds::hll_sketch::deserialize(decoded.data(), decoded.size()));
+    }), em::allow_raw_pointers())
+    // the std::string argument is used as a plain byte container here
+    .class_function("deserializeFromBytes", em::optional_override([](const std::string& raw) {
+      return new ds::hll_sketch(ds::hll_sketch::deserialize(raw.data(), raw.size()));
+    }), em::allow_raw_pointers())
+    .function("getEstimate", &ds::hll_sketch::get_estimate)
+    .function("getLowerBound", &ds::hll_sketch::get_lower_bound)
+    .function("getUpperBound", &ds::hll_sketch::get_upper_bound)
+    .function("toString", em::optional_override([](const ds::hll_sketch& sk) {
+      return sk.to_string();
+    }))
+    .class_function("getMaxSerializedSizeBytes", &ds::hll_sketch::get_max_updatable_serialization_bytes)
+    ;
+
+  em::class_<ds::hll_union>("hll_union")
+    .constructor(em::optional_override([](uint8_t lg_k) {
+      return new ds::hll_union(lg_k);
+    }))
+    .function("updateWithSketch", em::optional_override([](ds::hll_union& u, const ds::hll_sketch& sketch) {
+      u.update(sketch);
+    }), em::allow_raw_pointers())
+    .function("updateWithBytes", em::optional_override([](ds::hll_union& u, const std::string& raw) {
+      u.update(ds::hll_sketch::deserialize(raw.data(), raw.size()));
+    }), em::allow_raw_pointers())
+    .function("updateWithB64", em::optional_override([](ds::hll_union& u, const std::string& b64) {
+      std::vector<char> decoded(b64_dec_len(b64.data(), b64.size()));
+      b64_decode(b64.data(), b64.size(), decoded.data());
+      u.update(ds::hll_sketch::deserialize(decoded.data(), decoded.size()));
+    }), em::allow_raw_pointers())
+    // `bytes` is an address supplied by the caller; `size` is the length read from it
+    .function("updateWithBuffer", em::optional_override([](ds::hll_union& u, intptr_t bytes, size_t size) {
+      u.update(ds::hll_sketch::deserialize(reinterpret_cast<void*>(bytes), size));
+    }))
+    // serializes the union result into the caller-supplied buffer and returns
+    // the stream's write position afterwards
+    .function("getResultStream", em::optional_override([](ds::hll_union& u, intptr_t bytes, size_t size, ds::target_hll_type tgt_type) {
+      std::strstream out(reinterpret_cast<char*>(bytes), size);
+      u.get_result(tgt_type).serialize_compact(out);
+      return static_cast<int>(out.tellp());
+    }))
+    .function("getResultAsUint8Array", em::optional_override([](ds::hll_union& u, ds::target_hll_type tgt_type) {
+      auto image = u.get_result(tgt_type).serialize_compact();
+      return Uint8Array.new_(em::typed_memory_view(image.size(), image.data()));
+    }))
+    .function("getResultB64", em::optional_override([](ds::hll_union& u, ds::target_hll_type tgt_type) {
+      auto image = u.get_result(tgt_type).serialize_compact();
+      std::vector<char> encoded(b64_enc_len(image.size()));
+      b64_encode(reinterpret_cast<const char*>(image.data()), image.size(), encoded.data());
+      return std::string(encoded.data(), encoded.size());
+    }))
+    ;
+
+}
diff --git a/hll/hll_sketch_agg_string.sqlx b/hll/hll_sketch_agg_string.sqlx
new file mode 100644
index 0000000..c08d5cf
--- /dev/null
+++ b/hll/hll_sketch_agg_string.sqlx
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(str STRING, params STRUCT<lg_k 
BYTEINT, tgt_type STRING> NOT AGGREGATE)
+RETURNS BYTES
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/hll_sketch.mjs"],
+  description = '''Creates a sketch that represents the cardinality of the 
given STRING column.
+Param str: the STRING column of identifiers.
+Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 21].
+Param tgt_type: The HLL type to use, if or when the sketch reaches that state
+Returns: an HLL Sketch, as bytes, from which the cardinality can be obtained.
+For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
+) AS R"""
+import ModuleFactory from "gs://$GCS_BUCKET/hll_sketch.mjs";
+var Module = await ModuleFactory();
+const default_lg_k = Number(12);
+const default_tgt_type = Module.TargetHllType.HLL_4;
+
+function destroyState(state) {
+  if (state.sketch) {
+    state.sketch.delete();
+    state.sketch = null;
+  }
+  if (state.union) {
+    state.union.delete();
+    state.union = null;
+  }
+  state.serialized = null;
+}
+
+// UDAF interface
+export function initialState(params) {
+  try {
+    var state = {
+      lg_k: params.lg_k == null ? default_lg_k : Number(params.lg_k),
+    };
+    if (params.tgt_type == null) {
+      state.tgt_type = default_tgt_type;
+    } else if (params.tgt_type == "HLL_4") {
+      state.tgt_type = Module.TargetHllType.HLL_4;
+    } else if (params.tgt_type == "HLL_6") {
+      state.tgt_type = Module.TargetHllType.HLL_6;
+    } else if (params.tgt_type == "HLL_8") {
+      state.tgt_type = Module.TargetHllType.HLL_8;
+    } else {
+      throw new Error("unrecognized HLL type " + params.tgt_type);
+    }
+    state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type);
+    return state;
+  } catch (e) {
+    throw new Error(Module.getExceptionMessage(e));
+  }
+}
+
+export function aggregate(state, str) {
+  try {
+    if (state.sketch == null) { // for transition deserialize-aggregate
+      state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type);
+    }
+    state.sketch.updateString(str);
+  } catch (e) {
+    throw new Error(Module.getExceptionMessage(e));
+  }
+}
+
+export function serialize(state) {
+  if (state.sketch == null) return state; // for transition 
deserialize-serialize
+  try {
+    try {
+      // for prior transition deserialize-aggregate
+      // merge aggregated and serialized state
+      if (state.sketch != null && state.serialized != null) {
+        var u = null;
+        try {
+          u = new Module.hll_union(state.lg_k);
+          u.updateWithSketch(state.sketch);
+          u.updateWithBytes(state.serialized);
+          state.serialized = u.getResultAsUint8Array(state.tgt_type);
+        } finally {
+          if (u != null) u.delete();
+        }
+      } else if (state.sketch != null) {
+        state.serialized = state.sketch.serializeAsUint8Array();
+      } else if (state.union != null) {
+        state.serialized = state.union.getResultAsUint8Array(state.tgt_type);
+      }
+      return {
+        lg_k: state.lg_k,
+        tgt_type: state.tgt_type,
+        serialized: state.serialized
+      };
+    } catch (e) {
+      throw new Error(Module.getExceptionMessage(e));
+    }
+  } finally {
+    destroyState(state);
+  }
+}
+
+// Identity: the state is returned unchanged. Sketch and union objects are not
+// rebuilt here; aggregate() and merge() create them when they find none.
+export function deserialize(state) {
+  return state;
+}
+
+export function merge(state, other_state) {
+  try {
+    if (state.union == null) {
+      state.union = new Module.hll_union(state.lg_k);
+    }
+    if (state.serialized) {
+      state.union.updateWithBytes(state.serialized);
+      state.serialized = null;
+    }
+    if (other_state.serialized) {
+      state.union.updateWithBytes(other_state.serialized);
+      other_state.serialized = null;
+    }
+  } catch (e) {
+    throw new Error(Module.getExceptionMessage(e));
+  }
+}
+
+export function finalize(state) {
+  return serialize(state).serialized
+}
+""";
diff --git a/hll/hll_sketch_get_estimate.sqlx b/hll/hll_sketch_get_estimate.sqlx
new file mode 100644
index 0000000..3c603e9
--- /dev/null
+++ b/hll/hll_sketch_get_estimate.sqlx
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES)
+RETURNS FLOAT64
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/hll_sketch.js"],
+  description = '''Returns the cardinality estimate of the given sketch.
+Param sketch: the given sketch as bytes.
+Returns: the cardinality estimate as FLOAT64 value.
+For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
+) AS R"""
+try {
+  var sketchObject = null;
+  try {
+    sketchObject = Module.hll_sketch.deserializeFromB64(sketch);
+    return sketchObject.getEstimate();
+  } finally {
+    if (sketchObject != null) sketchObject.delete();
+  }
+} catch (e) {
+  throw new Error(Module.getExceptionMessage(e));
+}
+""";
diff --git a/hll/hll_sketch_to_string.sqlx b/hll/hll_sketch_to_string.sqlx
new file mode 100644
index 0000000..5e6e9c0
--- /dev/null
+++ b/hll/hll_sketch_to_string.sqlx
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+config { hasOutput: true }
+
+CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES)
+RETURNS STRING
+LANGUAGE js
+OPTIONS (
+  library=["gs://$GCS_BUCKET/hll_sketch.js"],
+  description = '''Returns a summary string that represents the state of the 
given sketch.
+Param sketch: the given sketch as bytes.
+Returns: a string that represents the state of the given sketch.
+For more details: https://datasketches.apache.org/docs/HLL/HLL.html'''
+) AS R"""
+try {
+  var sketchObject = null;
+  try {
+    sketchObject = Module.hll_sketch.deserializeFromB64(sketch);
+    return sketchObject.toString();
+  } finally {
+    if (sketchObject != null) sketchObject.delete();
+  }
+} catch (e) {
+  throw new Error(Module.getExceptionMessage(e));
+}
+""";


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to