This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch hll in repository https://gitbox.apache.org/repos/asf/datasketches-bigquery.git
commit 24fc3cf68b9be75011968142b7ac0ffbc0cf82aa Author: AlexanderSaydakov <[email protected]> AuthorDate: Tue Aug 20 18:55:47 2024 -0700 HLL sketch --- hll/Makefile | 44 ++++++++++++ hll/hll_sketch.cpp | 102 +++++++++++++++++++++++++++ hll/hll_sketch_agg_string.sqlx | 145 +++++++++++++++++++++++++++++++++++++++ hll/hll_sketch_get_estimate.sqlx | 43 ++++++++++++ hll/hll_sketch_to_string.sqlx | 43 ++++++++++++ 5 files changed, 377 insertions(+) diff --git a/hll/Makefile b/hll/Makefile new file mode 100644 index 0000000..bd08fe0 --- /dev/null +++ b/hll/Makefile @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +EMCC=emcc +EMCFLAGS=-I../datasketches-cpp/common/include \ + -I../datasketches-cpp/hll/include \ + --no-entry \ + -sWASM_BIGINT=1 \ + -sEXPORTED_FUNCTIONS=[_malloc,_free] \ + -sENVIRONMENT=shell \ + -sTOTAL_MEMORY=1024MB \ + -O3 \ + --bind + +all: hll_sketch.mjs hll_sketch.js hll_sketch.wasm + +%.mjs: %.cpp + $(EMCC) $< $(EMCFLAGS) -sSINGLE_FILE=1 -o $@ + +# this rule creates a non-es6 loadable library +%.js: %.cpp + $(EMCC) $< $(EMCFLAGS) -sSINGLE_FILE=1 -o $@ + +%.wasm: %.cpp + $(EMCC) $< $(EMCFLAGS) -sSTANDALONE_WASM=1 -o $@ + +clean: + $(RM) *.mjs *.js *.wasm + +.PHONY: clean diff --git a/hll/hll_sketch.cpp b/hll/hll_sketch.cpp new file mode 100644 index 0000000..845c35f --- /dev/null +++ b/hll/hll_sketch.cpp @@ -0,0 +1,102 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include <strstream> +#include <emscripten/bind.h> + +#include <hll.hpp> + +#include "base64.hpp" + +const emscripten::val Uint8Array = emscripten::val::global("Uint8Array"); + +EMSCRIPTEN_BINDINGS(hll_sketch) { + + emscripten::function("getExceptionMessage", emscripten::optional_override([](intptr_t ptr) { + return std::string(reinterpret_cast<std::exception*>(ptr)->what()); + })); + + emscripten::enum_<datasketches::target_hll_type>("TargetHllType") + .value("HLL_4", datasketches::HLL_4) + .value("HLL_6", datasketches::HLL_6) + .value("HLL_8", datasketches::HLL_8) + ; + + emscripten::class_<datasketches::hll_sketch>("hll_sketch") + .constructor(emscripten::optional_override([](uint8_t lg_k, datasketches::target_hll_type tgt_type) { + return new datasketches::hll_sketch(lg_k, tgt_type); + })) + .function("updateString", emscripten::select_overload<void(const std::string&)>(&datasketches::hll_sketch::update)) + .function("serializeAsUint8Array", emscripten::optional_override([](const datasketches::hll_sketch& self) { + auto bytes = self.serialize_compact(); + return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); + })) + .class_function("deserializeFromB64", emscripten::optional_override([](const std::string& b64) { + std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); + b64_decode(b64.data(), b64.size(), bytes.data()); + return new datasketches::hll_sketch(datasketches::hll_sketch::deserialize(bytes.data(), bytes.size())); + }), emscripten::allow_raw_pointers()) + .class_function("deserializeFromBytes", emscripten::optional_override([](const std::string& bytes) { + return new datasketches::hll_sketch(datasketches::hll_sketch::deserialize(bytes.data(), bytes.size())); + }), emscripten::allow_raw_pointers()) + .function("getEstimate", &datasketches::hll_sketch::get_estimate) + .function("getLowerBound", &datasketches::hll_sketch::get_lower_bound) + .function("getUpperBound", &datasketches::hll_sketch::get_upper_bound) + .function("toString", emscripten::optional_override([](const datasketches::hll_sketch& self) { + return self.to_string(); + })) + .class_function("getMaxSerializedSizeBytes", &datasketches::hll_sketch::get_max_updatable_serialization_bytes) + ; + + emscripten::class_<datasketches::hll_union>("hll_union") + .constructor(emscripten::optional_override([](uint8_t lg_k) { + return new datasketches::hll_union(lg_k); + })) + .function("updateWithSketch", emscripten::optional_override([](datasketches::hll_union& self, const datasketches::hll_sketch& sketch) { + self.update(sketch); + }), emscripten::allow_raw_pointers()) + .function("updateWithBytes", emscripten::optional_override([](datasketches::hll_union& self, const std::string& bytes) { + self.update(datasketches::hll_sketch::deserialize(bytes.data(), bytes.size())); + }), emscripten::allow_raw_pointers()) + .function("updateWithB64", emscripten::optional_override([](datasketches::hll_union& self, const std::string& b64) { + std::vector<char> bytes(b64_dec_len(b64.data(), b64.size())); + b64_decode(b64.data(), b64.size(), bytes.data()); + self.update(datasketches::hll_sketch::deserialize(bytes.data(), bytes.size())); + }), emscripten::allow_raw_pointers()) + .function("updateWithBuffer", emscripten::optional_override([](datasketches::hll_union& self, intptr_t bytes, size_t size) { + self.update(datasketches::hll_sketch::deserialize(reinterpret_cast<void*>(bytes), size)); + })) + .function("getResultStream", emscripten::optional_override([](datasketches::hll_union& self, intptr_t bytes, size_t size, datasketches::target_hll_type tgt_type) { + std::strstream stream(reinterpret_cast<char*>(bytes), size); + self.get_result(tgt_type).serialize_compact(stream); + return (int) stream.tellp(); + })) + .function("getResultAsUint8Array", emscripten::optional_override([](datasketches::hll_union& self, datasketches::target_hll_type tgt_type) { + auto bytes = self.get_result(tgt_type).serialize_compact(); + return Uint8Array.new_(emscripten::typed_memory_view(bytes.size(), bytes.data())); + })) + .function("getResultB64", emscripten::optional_override([](datasketches::hll_union& self, datasketches::target_hll_type tgt_type) { + auto bytes = self.get_result(tgt_type).serialize_compact(); + std::vector<char> b64(b64_enc_len(bytes.size())); + b64_encode((const char*) bytes.data(), bytes.size(), b64.data()); + return std::string(b64.data(), b64.size()); + })) + ; + +} diff --git a/hll/hll_sketch_agg_string.sqlx b/hll/hll_sketch_agg_string.sqlx new file mode 100644 index 0000000..c08d5cf --- /dev/null +++ b/hll/hll_sketch_agg_string.sqlx @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +config { hasOutput: true } + +CREATE OR REPLACE AGGREGATE FUNCTION ${self()}(str STRING, params STRUCT<lg_k BYTEINT, tgt_type STRING> NOT AGGREGATE) +RETURNS BYTES +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/hll_sketch.mjs"], + description = '''Creates a sketch that represents the cardinality of the given STRING column. +Param str: the STRING column of identifiers. +Param lg_k: the sketch accuracy/size parameter as an integer in the range [4, 26]. +Param tgt_type: The HLL type to use, if or when the sketch reaches that state +Returns: an HLL Sketch, as bytes, from which the cardinality can be obtained. +For more details: https://datasketches.apache.org/docs/HLL/HLL.html''' +) AS R""" +import ModuleFactory from "gs://$GCS_BUCKET/hll_sketch.mjs"; +var Module = await ModuleFactory(); +const default_lg_k = Number(12); +const default_tgt_type = Module.TargetHllType.HLL_4; + +function destroyState(state) { + if (state.sketch) { + state.sketch.delete(); + state.sketch = null; + } + if (state.union) { + state.union.delete(); + state.union = null; + } + state.serialized = null; +} + +// UDAF interface +export function initialState(params) { + try { + var state = { + lg_k: params.lg_k == null ? default_lg_k : Number(params.lg_k), + }; + if (params.tgt_type == null) { + state.tgt_type = default_tgt_type; + } else if (params.tgt_type == "HLL_4") { + state.tgt_type = Module.TargetHllType.HLL_4; + } else if (params.tgt_type == "HLL_6") { + state.tgt_type = Module.TargetHllType.HLL_6; + } else if (params.tgt_type == "HLL_8") { + state.tgt_type = Module.TargetHllType.HLL_8; + } else { + throw new Error("unrecognized HLL type " + params.tgt_type); + } + state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type); + return state; + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function aggregate(state, str) { + try { + if (state.sketch == null) { // for transition deserialize-aggregate + state.sketch = new Module.hll_sketch(state.lg_k, state.tgt_type); + } + state.sketch.updateString(str); + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function serialize(state) { + if (state.sketch == null) return state; // for transition deserialize-serialize + try { + try { + // for prior transition deserialize-aggregate + // merge aggregated and serialized state + if (state.sketch != null && state.serialized != null) { + var u = null; + try { + u = new Module.hll_union(state.lg_k); + u.updateWithSketch(state.sketch); + u.updateWithBytes(state.serialized); + state.serialized = u.getResultAsUint8Array(state.tgt_type); + } finally { + if (u != null) u.delete(); + } + } else if (state.sketch != null) { + state.serialized = state.sketch.serializeAsUint8Array(); + } else if (state.union != null) { + state.serialized = state.union.getResultAsUint8Array(state.tgt_type); + } + return { + lg_k: state.lg_k, + tgt_type: state.tgt_type, + serialized: state.serialized + }; + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } + } finally { + destroyState(state); + } +} + +export function deserialize(state) { + return state; +} + +export function merge(state, other_state) { + try { + if (state.union == null) { + state.union = new Module.hll_union(state.lg_k); + } + if (state.serialized) { + state.union.updateWithBytes(state.serialized); + state.serialized = null; + } + if (other_state.serialized) { + state.union.updateWithBytes(other_state.serialized); + other_state.serialized = null; + } + } catch (e) { + throw new Error(Module.getExceptionMessage(e)); + } +} + +export function finalize(state) { + return serialize(state).serialized +} +"""; diff --git a/hll/hll_sketch_get_estimate.sqlx b/hll/hll_sketch_get_estimate.sqlx new file mode 100644 index 0000000..3c603e9 --- /dev/null +++ b/hll/hll_sketch_get_estimate.sqlx @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +config { hasOutput: true } + +CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES) +RETURNS FLOAT64 +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/hll_sketch.js"], + description = '''Returns a summary string that represents the state of the given sketch. +Param sketch: the given sketch as bytes. +Returns: the cardinality estimate as FLOAT64 value. +For more details: https://datasketches.apache.org/docs/HLL/HLL.html''' +) AS R""" +try { + var sketchObject = null; + try { + sketchObject = Module.hll_sketch.deserializeFromB64(sketch); + return sketchObject.getEstimate(); + } finally { + if (sketchObject != null) sketchObject.delete(); + } +} catch (e) { + throw new Error(Module.getExceptionMessage(e)); +} +"""; diff --git a/hll/hll_sketch_to_string.sqlx b/hll/hll_sketch_to_string.sqlx new file mode 100644 index 0000000..5e6e9c0 --- /dev/null +++ b/hll/hll_sketch_to_string.sqlx @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +config { hasOutput: true } + +CREATE OR REPLACE FUNCTION ${self()}(sketch BYTES) +RETURNS STRING +LANGUAGE js +OPTIONS ( + library=["gs://$GCS_BUCKET/hll_sketch.js"], + description = '''Returns a summary string that represents the state of the given sketch. +Param sketch: the given sketch as bytes. +Returns: a string that represents the state of the given sketch. +For more details: https://datasketches.apache.org/docs/HLL/HLL.html''' +) AS R""" +try { + var sketchObject = null; + try { + sketchObject = Module.hll_sketch.deserializeFromB64(sketch); + return sketchObject.toString(); + } finally { + if (sketchObject != null) sketchObject.delete(); + } +} catch (e) { + throw new Error(Module.getExceptionMessage(e)); +} +"""; --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
