This is an automated email from the ASF dual-hosted git repository. alsay pushed a commit to branch tdigest-cpp in repository https://gitbox.apache.org/repos/asf/datasketches-characterization.git
commit caf2cf3a2d85c3430d7bd2fca79428fd1ff8cf97 Author: AlexanderSaydakov <[email protected]> AuthorDate: Tue Oct 22 18:53:50 2024 -0700 tdigest characterization --- cpp/matlab/tdigest_error_vs_rank.m | 45 +++ cpp/matlab/tdigest_memory.m | 53 ++++ cpp/matlab/tdigest_rank_error.m | 127 +++++++++ cpp/matlab/tdigest_size.m | 53 ++++ cpp/matlab/tdigest_update_time.m | 38 +++ cpp/matlab/tdigest_vs_req_error_exp.m | 42 +++ cpp/src/main.cpp | 11 + cpp/src/tdigest_accuracy_profile_impl.hpp | 7 +- cpp/src/tdigest_memory_profile.hpp | 44 +++ cpp/src/tdigest_memory_profile_impl.hpp | 71 +++++ cpp/src/tdigest_merge_accuracy_profile.hpp | 38 +++ ...hpp => tdigest_merge_accuracy_profile_impl.hpp} | 44 ++- cpp/src/tdigest_merge_timing_profile.hpp | 44 +++ ...l.hpp => tdigest_merge_timing_profile_impl.hpp} | 7 +- cpp/src/tdigest_sketch_accuracy_profile_impl.hpp | 22 +- cpp/src/tdigest_timing_profile_impl.hpp | 13 +- .../tdigest/TDigestAccuracyProfile.java | 165 +++++++++++ .../tdigest/TDigestErrorVsRankProfile.java | 311 +++++++++++++++++++++ .../tdigest/TDigestMergeAccuracyProfile.java | 173 ++++++++++++ .../tdigest/TDigestSpeedProfile.java | 150 ++++++++++ src/main/resources/tdigest/TDigestAccuracyJob.conf | 38 +++ .../resources/tdigest/TDigestErrorVsRankJob.conf | 64 +++++ .../resources/tdigest/TDigestMergeAccuracyJob.conf | 40 +++ src/main/resources/tdigest/TDigestSpeedJob.conf | 27 ++ 24 files changed, 1578 insertions(+), 49 deletions(-) diff --git a/cpp/matlab/tdigest_error_vs_rank.m b/cpp/matlab/tdigest_error_vs_rank.m new file mode 100644 index 0000000..e064a86 --- /dev/null +++ b/cpp/matlab/tdigest_error_vs_rank.m @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +clf; + +td_cpp=load('../results/tdigest_error_vs_rank-k100-sl25.tsv'); + +hold on; + +plot(td_cpp(:,1), td_cpp(:,8), 'linewidth', 2); +plot(td_cpp(:,1), td_cpp(:,7), 'linewidth', 2); +plot(td_cpp(:,1), td_cpp(:,6), 'linewidth', 2); +plot(td_cpp(:,1), td_cpp(:,5), 'linewidth', 2); +plot(td_cpp(:,1), td_cpp(:,4), 'linewidth', 2); +plot(td_cpp(:,1), td_cpp(:,3), 'linewidth', 2); +plot(td_cpp(:,1), td_cpp(:,2), 'linewidth', 2); + +set(gca, 'fontsize', 16); +title 'TDigest rank error K=100, N=2^2^5, uniform distribution, 10000 trials' +xlabel 'normalized rank' +ylabel 'rank error' +grid minor on +legend( +'+3SD', +'+2SD', +'+1SD', +'median', +'-1SD', +'-2SD', +'-3SD', +'location', 'northwest'); diff --git a/cpp/matlab/tdigest_memory.m b/cpp/matlab/tdigest_memory.m new file mode 100644 index 0000000..b1d10fa --- /dev/null +++ b/cpp/matlab/tdigest_memory.m @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +clf; +#td100_dbl=load('../results/tdigest_double_memory-100.tsv'); +#td100_dbl_nres=load('../results/tdigest_double_memory-100-no-reserve.tsv'); + +td100_flt=load('../results/tdigest_float_memory-100.tsv'); +td200_flt=load('../results/tdigest_float_memory-200.tsv'); + +req10_flt=load('../results/req_sketch_memory_float_k10.tsv'); +req20_flt=load('../results/req_sketch_memory_float_k20.tsv'); +req30_flt=load('../results/req_sketch_memory_float_k30.tsv'); +req40_flt=load('../results/req_sketch_memory_float_k40.tsv'); + +hold on; + +semilogx(td100_flt(:,1), td100_flt(:,11), 'linewidth', 2); +semilogx(td200_flt(:,1), td200_flt(:,11), 'linewidth', 2); + +semilogx(req10_flt(:,1), req10_flt(:,11), 'linewidth', 2); +semilogx(req20_flt(:,1), req20_flt(:,11), 'linewidth', 2); +semilogx(req30_flt(:,1), req30_flt(:,11), 'linewidth', 2); +semilogx(req40_flt(:,1), req40_flt(:,11), 'linewidth', 2); + +set(gca, 'fontsize', 16); +title 'Memory usage TDigest vs REQ sketch (updates, no transients, C++)' +xlabel 'number of values' +ylabel 'size in memory, bytes' +grid minor on +legend( +'tdigest<float> k=100', +'tdigest<float> k=200', +'req\_sketch<float> k=10', +'req\_sketch<float> k=20', +'req\_sketch<float> k=30', +'req\_sketch<float> k=40', +'location', 'northwest' +); diff --git a/cpp/matlab/tdigest_rank_error.m b/cpp/matlab/tdigest_rank_error.m new file mode 100644 index 0000000..31921ff --- /dev/null +++ b/cpp/matlab/tdigest_rank_error.m @@ -0,0 +1,127 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +clf; + +#td_cpp=load('../results/tdigest_rank_error-100-my_version-true_rank.tsv'); +#td_cpp=load('../results/tdigest_rank_error-100-lb_ub_rank.tsv'); +td100_cpp=load('../results/tdigest_rank_error-100.tsv'); +#td_java=load('../../results/tdigest_rank_error-100.tsv'); +#td_java=load('../../results/tdigest_rank_error-100-new.tsv'); + +td200_cpp=load('../results/tdigest_rank_error-200-float.tsv'); +td200_n_cpp=load('../results/tdigest_rank_error-200-new_sizing.tsv'); + +#td_merge2_cpp=load('../results/tdigest_merge_rank_error-100-2way.tsv'); +#td_merge4_cpp=load('../results/tdigest_merge_rank_error-100-4way.tsv'); +#td_merge32_cpp=load('../results/tdigest_merge_rank_error-100-32way.tsv'); +#td_merge64_cpp=load('../results/tdigest_merge_rank_error-100-64way.tsv'); +#td_merge128_cpp=load('../results/tdigest_merge_rank_error-100-128way.tsv'); + +td200_merge32_cpp=load('../results/tdigest_merge_rank_error-200-32way.tsv'); + +#td_merge2_java=load('../../results/tdigest_merge_rank_error-100-2way.tsv'); +#td_merge4_java=load('../../results/tdigest_merge_rank_error-100-4way.tsv'); +#td_merge32_java=load('../../results/tdigest_merge_rank_error-100-32way.tsv'); +#td_merge64_java=load('../../results/tdigest_merge_rank_error-100-64way.tsv'); +#td_merge128_java=load('../../results/tdigest_merge_rank_error-100-128way.tsv'); + +#req12=load('../results/req_sketch_error-k12-double.tsv'); +req10=load('../results/req_sketch_error-k10-double.tsv'); +#req8=load('../results/req_sketch_error-k8-double.tsv'); + +req20=load('../results/req_sketch_error-k20-double.tsv'); +req30=load('../results/req_sketch_error-k30-double.tsv'); +req40=load('../results/req_sketch_error-k40-double.tsv'); + +hold on; + +semilogx(td100_cpp(:,1), td100_cpp(:,6), 'linewidth', 2); +semilogx(td200_cpp(:,1), td200_cpp(:,6), 'linewidth', 2); +semilogx(td200_n_cpp(:,1), td200_n_cpp(:,6), 'linewidth', 2); + +#semilogx(td_java(:,1), td_java(:,4), 'linewidth', 2); +#semilogx(td_java(:,1), td_java(:,5), 'linewidth', 2); +#semilogx(td_java(:,1), td_java(:,6), 'linewidth', 2); + +#semilogx(td_merge32_cpp(:,1), td_merge32_cpp(:,5), 'linewidth', 2); +#semilogx(td_merge64_cpp(:,1), td_merge64_cpp(:,5), 'linewidth', 2); +#semilogx(td_merge128_cpp(:,1), td_merge128_cpp(:,5), 'linewidth', 2); + +#semilogx(td_merge2_cpp(:,1), td_merge2_cpp(:,6), 'linewidth', 2); +#semilogx(td_merge4_cpp(:,1), td_merge4_cpp(:,6), 'linewidth', 2); +#semilogx(td_merge32_cpp(:,1), td_merge32_cpp(:,6), 'linewidth', 2); +#semilogx(td_merge64_cpp(:,1), td_merge64_cpp(:,6), 'linewidth', 2); +#semilogx(td_merge128_cpp(:,1), td_merge128_cpp(:,6), 'linewidth', 2); + +semilogx(td200_merge32_cpp(:,1), td200_merge32_cpp(:,6), 'linewidth', 2); + +#semilogx(td_merge32_java(:,1), td_merge32_java(:,5), 'linewidth', 2); +#semilogx(td_merge64_java(:,1), td_merge64_java(:,5), 'linewidth', 2); +#semilogx(td_merge128_java(:,1), td_merge128_java(:,5), 'linewidth', 2); + +#semilogx(td_merge2_java(:,1), td_merge2_java(:,6), 'linewidth', 2); +#semilogx(td_merge4_java(:,1), td_merge4_java(:,6), 'linewidth', 2); +#semilogx(td_merge32_java(:,1), td_merge32_java(:,6), 'linewidth', 2); +#semilogx(td_merge64_java(:,1), td_merge64_java(:,6), 'linewidth', 2); +#semilogx(td_merge128_java(:,1), td_merge128_java(:,6), 'linewidth', 2); + +semilogx(req10(:,1), req10(:,6), 'linewidth', 2); +semilogx(req20(:,1), req20(:,6), 'linewidth', 2); +semilogx(req30(:,1), req30(:,6), 'linewidth', 2); +semilogx(req40(:,1), req40(:,6), 'linewidth', 2); + +set(gca, 'fontsize', 16); +title 'Rank Error of TDigest and REQ sketch, uniform distribution, 1000 trials 99 pct' +xlabel 'stream size' +ylabel 'normalized rank error |true - esimate|, %' +grid minor on +legend( +#'TDigest k=100 rank 0.5 C++', +'TDigest k=100 rank 0.99', +#'t-digest k=100 rank 0.99 C++', +'TDigest k=200 rank 0.99', +'TDigest k=200 rank 0.99 new sizing', + +#'t-digest k=100 2-way merge rank 0.99 C++', +#'t-digest k=100 4-way merge rank 0.99 C++', +#'t-digest k=100 32-way merge rank 0.99 C++', +#'t-digest k=100 64-way merge rank 0.95 C++', +#'t-digest k=100 128-way merge rank 0.95 C++', + +'TDigest k=200 32-way merge rank 0.99 C++', + +#'t-digest k=100 32-way merge rank 0.99 Java', +#'t-digest k=100 2-way merge rank 0.99 Java', +#'t-digest k=100 4-way merge rank 0.99 Java', +#'t-digest k=100 64-way merge rank 0.95 Java', +#'t-digest k=100 128-way merge rank 0.95 Java', + +#'req k=12 rank 0.01 C++', +#'req k=12 rank 0.05 C++', +#'req k=12 rank 0.5 C++', +#'REQ k=12 rank 0.95', +#'REQ k=12 rank 0.99', +#'req k=10 rank 0.05 C++', +#'req k=10 rank 0.5 C++', +'REQ k=10 rank 0.99', +#'REQ k=10 rank 0.99', +'REQ k=20 rank 0.99', +'REQ k=30 rank 0.99', +'REQ k=40 rank 0.99', +'location', 'northwest' +); diff --git a/cpp/matlab/tdigest_size.m b/cpp/matlab/tdigest_size.m new file mode 100644 index 0000000..e9ed465 --- /dev/null +++ b/cpp/matlab/tdigest_size.m @@ -0,0 +1,53 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +clf; + +#tdigest_tdunning=load('../../results/tdigest_timing-100.tsv'); +td100_dbl=load('../results/tdigest_timing-100-double-gcc13.tsv'); +td200_dbl=load('../results/tdigest_timing-200-double-gcc13.tsv'); +req10_dbl=load('../results/req_sketch_timing_double_hra_k10_gcc13.tsv'); +req20_dbl=load('../results/req_sketch_timing_double_hra_k20_gcc13.tsv'); +req30_dbl=load('../results/req_sketch_timing_double_hra_k30_gcc13.tsv'); +req40_dbl=load('../results/req_sketch_timing_double_hra_k40_gcc13.tsv'); + +hold on; + +#semilogx(tdigest_tdunning(:,1), tdigest_tdunning(:,11), 'linewidth', 2); + +semilogx(td100_dbl(:,1), td100_dbl(:,5), 'linewidth', 2); +semilogx(td200_dbl(:,1), td200_dbl(:,5), 'linewidth', 2); + +semilogx(req10_dbl(:,1), req10_dbl(:,5), 'linewidth', 2); +semilogx(req20_dbl(:,1), req20_dbl(:,5), 'linewidth', 2); +semilogx(req30_dbl(:,1), req30_dbl(:,5), 'linewidth', 2); +semilogx(req40_dbl(:,1), req40_dbl(:,5), 'linewidth', 2); + +set(gca, 'fontsize', 16); +title 'Serialized size of TDigest and REQ sketch' +xlabel 'stream size' +ylabel 'serialized size, bytes' +grid minor on +legend( +'tdigest<double> k=100', +'tdigest<double> k=200', +'req\_sketch<double> k=10', +'req\_sketch<double> k=20', +'req\_sketch<double> k=30', +'req\_sketch<double> k=40', +'location', 'northwest' +); diff --git a/cpp/matlab/tdigest_update_time.m b/cpp/matlab/tdigest_update_time.m new file mode 100644 index 0000000..f62063a --- /dev/null +++ b/cpp/matlab/tdigest_update_time.m @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +clf; + +td100_java=load('../../results/tdigest_timing-100.tsv'); +td100_dbl_gcc13=load('../results/tdigest_timing-100-double-gcc13.tsv'); +td100_dbl_gcc13_res=load('../results/tdigest_timing-100-double-gcc13-reserve.tsv'); + +hold on; +semilogx(td100_dbl_gcc13(:,1), td100_dbl_gcc13(:,4), 'linewidth', 2); +semilogx(td100_dbl_gcc13_res(:,1), td100_dbl_gcc13_res(:,4), 'linewidth', 2); +semilogx(td100_java(:,1), td100_java(:,4), 'linewidth', 2); + +set(gca, 'fontsize', 16); +title 'Update time of TDigest' +xlabel 'stream size' +legend( +'cmopression=100 double gcc13', +'cmopression=100 double gcc13 with reserve', +'compression=100 Java tdunning', +'location', 'northwest'); +ylabel 'update time, nanoseconds' +grid minor on diff --git a/cpp/matlab/tdigest_vs_req_error_exp.m b/cpp/matlab/tdigest_vs_req_error_exp.m new file mode 100644 index 0000000..1a6c3df --- /dev/null +++ b/cpp/matlab/tdigest_vs_req_error_exp.m @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +clf; + +td200=load('../results/tdigest_rank_error-200-double-exp1.5.tsv'); + +req40_hra=load('../results/req_sketch_error-40-double-exp1.5-hra.tsv'); +req40_lra=load('../results/req_sketch_error-40-double-exp1.5-lra.tsv'); + +hold on; + +semilogx(td200(:,1), td200(:,6), 'linewidth', 2); +semilogx(req40_hra(:,1), req40_hra(:,6), 'linewidth', 2); + +#semilogx(td200(:,1), td200(:,2), 'linewidth', 2); +#semilogx(req40_lra(:,1), req40_lra(:,2), 'linewidth', 2); + +set(gca, 'fontsize', 16); +title 'Rank Error of TDigest and REQ sketch, exponential distribution lambda=1.5, 1000 trials 99 pct' +xlabel 'stream size' +ylabel 'normalized rank error |true - esimate|, %' +grid minor on +legend( +'TDigest k=200 rank 0.99', +'REQ k=40 HRA rank 0.99', +'location', 'northwest' +); diff --git a/cpp/src/main.cpp b/cpp/src/main.cpp index 4bb78ed..3a06be7 100644 --- a/cpp/src/main.cpp +++ b/cpp/src/main.cpp @@ -55,6 +55,8 @@ #include "tdigest_timing_profile.hpp" #include "tdigest_sketch_accuracy_profile.hpp" +#include "tdigest_merge_accuracy_profile.hpp" +#include "tdigest_memory_profile.hpp" #include "cpc_sketch_memory_profile.hpp" #include "hll_sketch_memory_profile.hpp" @@ -65,6 +67,7 @@ #include "req_sketch_timing_profile.hpp" #include "req_merge_timing_profile.hpp" +#include "req_error_vs_rank_profile.hpp" using namespace datasketches; typedef std::unique_ptr<job_profile> job_profile_ptr; @@ -86,6 +89,7 @@ int main(int argc, char **argv) { job_profile::add("fi-merge-timing", job_profile_ptr(new frequent_items_merge_timing_profile())); job_profile::add("req-sketch-timing-float", job_profile_ptr(new req_sketch_timing_profile<float>())); job_profile::add("req-merge-timing-float", job_profile_ptr(new req_merge_timing_profile<float>())); + job_profile::add("req-sketch-timing-double", job_profile_ptr(new req_sketch_timing_profile<double>())); job_profile::add("cpc-sketch-accuracy", job_profile_ptr(new cpc_sketch_accuracy_profile())); job_profile::add("cpc-union-accuracy", job_profile_ptr(new cpc_union_accuracy_profile())); @@ -96,9 +100,16 @@ int main(int argc, char **argv) { job_profile::add("kll-sketch-accuracy", job_profile_ptr(new kll_sketch_accuracy_profile())); job_profile::add("kll-merge-accuracy", job_profile_ptr(new kll_merge_accuracy_profile())); job_profile::add("fi-sketch-accuracy", job_profile_ptr(new frequent_items_sketch_accuracy_profile())); + job_profile::add("req-error-vs-rank-double", job_profile_ptr(new req_error_vs_rank_profile<double>())); job_profile::add("tdigest-timing-double", job_profile_ptr(new tdigest_timing_profile<double>())); job_profile::add("tdigest-sketch-accuracy-double", job_profile_ptr(new tdigest_sketch_accuracy_profile<double>())); + job_profile::add("tdigest-merge-accuracy-double", job_profile_ptr(new tdigest_merge_accuracy_profile<double>())); + job_profile::add("tdigest-timing-float", job_profile_ptr(new tdigest_timing_profile<float>())); + job_profile::add("tdigest-sketch-accuracy-float", job_profile_ptr(new tdigest_sketch_accuracy_profile<float>())); + job_profile::add("tdigest-merge-accuracy-float", job_profile_ptr(new tdigest_merge_accuracy_profile<float>())); + job_profile::add("tdigest-memory-float", job_profile_ptr(new tdigest_memory_profile<float>())); + job_profile::add("tdigest-memory-double", job_profile_ptr(new tdigest_memory_profile<double>())); job_profile::add("cpc-sketch-memory", job_profile_ptr(new cpc_sketch_memory_profile())); job_profile::add("hll-sketch-memory", job_profile_ptr(new hll_sketch_memory_profile())); diff --git a/cpp/src/tdigest_accuracy_profile_impl.hpp b/cpp/src/tdigest_accuracy_profile_impl.hpp index bfa5f7a..91dd78c 100644 --- a/cpp/src/tdigest_accuracy_profile_impl.hpp +++ b/cpp/src/tdigest_accuracy_profile_impl.hpp @@ -32,11 +32,11 @@ template<typename T> void tdigest_accuracy_profile<T>::run() { const unsigned lg_min = 0; const unsigned lg_max = 23; - const unsigned ppo = 16; + const unsigned ppo = 8; const unsigned num_trials = 1000; const unsigned error_pct = 99; - const uint16_t compression = 100; + const uint16_t compression = 200; const std::vector<double> ranks = {0.01, 0.05, 0.5, 0.95, 0.99}; std::vector<std::vector<double>> rank_errors(ranks.size(), std::vector<double>()); @@ -49,7 +49,8 @@ void tdigest_accuracy_profile<T>::run() { std::random_device rd; std::mt19937 gen(rd()); - std::uniform_real_distribution<T> dist(0, 1.0); +// std::uniform_real_distribution<T> dist(0, 1.0); + std::exponential_distribution<T> dist(1.5); const unsigned num_steps = count_points(lg_min, lg_max, ppo); unsigned stream_length = 1 << lg_min; diff --git a/cpp/src/tdigest_memory_profile.hpp b/cpp/src/tdigest_memory_profile.hpp new file mode 100644 index 0000000..e28a8ee --- /dev/null +++ b/cpp/src/tdigest_memory_profile.hpp @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TDIGEST_MEMORY_PROFILE_HPP_ +#define TDIGEST_MEMORY_PROFILE_HPP_ + +#include <random> + +#include "memory_usage_profile.hpp" + +namespace datasketches { + +template<typename T> +class tdigest_memory_profile: public memory_usage_profile { +public: + tdigest_memory_profile(); + void run_trial(size_t lg_min_x, size_t num_points, size_t x_ppo); +private: + std::random_device rd; + std::mt19937 gen; + std::uniform_real_distribution<T> dist; +}; + +} + +#include "tdigest_memory_profile_impl.hpp" + +#endif diff --git a/cpp/src/tdigest_memory_profile_impl.hpp b/cpp/src/tdigest_memory_profile_impl.hpp new file mode 100644 index 0000000..6911b73 --- /dev/null +++ b/cpp/src/tdigest_memory_profile_impl.hpp @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TDIGEST_MEMORY_PROFILE_IMPL_HPP_ +#define TDIGEST_MEMORY_PROFILE_IMPL_HPP_ + +#include <tdigest.hpp> +#include <req_sketch.hpp> + +#include "counting_allocator.hpp" + +namespace datasketches { + +// this relies on a global variable to count total amount of allocated memory +extern long long int total_allocated_memory; + +template<typename T> +tdigest_memory_profile<T>::tdigest_memory_profile(): +rd(), +gen(rd()), +dist(0, 1.0) +{} + +template<typename T> +void tdigest_memory_profile<T>::run_trial(size_t lg_min_x, size_t num_points, size_t x_ppo) { + const size_t k = 40; + +// using tdigest_t = tdigest<T, counting_allocator<T>>; +// tdigest_t* td = new (counting_allocator<tdigest_t>().allocate(1)) tdigest_t(k); + + using req_sketch_t = req_sketch<T, std::less<T>, counting_allocator<T>>; + req_sketch_t* s = new (counting_allocator<req_sketch_t>().allocate(1)) req_sketch_t(k); + + size_t count = 0; + size_t p = 1ULL << lg_min_x; + for (size_t i = 0; i < num_points; ++i) { + const size_t delta = p - count; + for (size_t j = 0; j < delta; ++j) { + s->update(dist(gen)); + } + count += delta; + stats[i].update(total_allocated_memory); + p = pwr_2_law_next(x_ppo, p); + } + +// td->~tdigest_t(); +// counting_allocator<tdigest_t>().deallocate(td, 1); + s->~req_sketch_t(); + counting_allocator<req_sketch_t>().deallocate(s, 1); + if (total_allocated_memory != 0) throw std::runtime_error("total_allocated_memory != 0"); +} + +} + +#endif diff --git a/cpp/src/tdigest_merge_accuracy_profile.hpp b/cpp/src/tdigest_merge_accuracy_profile.hpp new file mode 100644 index 0000000..cbe8c04 --- /dev/null +++ b/cpp/src/tdigest_merge_accuracy_profile.hpp @@ -0,0 +1,38 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TDIGEST_MERGE_ACCURACY_PROFILE_HPP_ +#define TDIGEST_MERGE_ACCURACY_PROFILE_HPP_ + +#include "tdigest_accuracy_profile.hpp" + +namespace datasketches { + +template<typename T> +class tdigest_merge_accuracy_profile: public tdigest_accuracy_profile<T> { +public: + void run_trial(std::vector<T>& values, size_t stream_length, uint16_t k, + const std::vector<double>& ranks, std::vector<std::vector<double>>& rank_errors); +}; + +} + +#include "tdigest_merge_accuracy_profile_impl.hpp" + +#endif diff --git a/cpp/src/tdigest_sketch_accuracy_profile_impl.hpp b/cpp/src/tdigest_merge_accuracy_profile_impl.hpp similarity index 51% copy from cpp/src/tdigest_sketch_accuracy_profile_impl.hpp copy to cpp/src/tdigest_merge_accuracy_profile_impl.hpp index 4c1a6e4..a7f2406 100644 --- a/cpp/src/tdigest_sketch_accuracy_profile_impl.hpp +++ b/cpp/src/tdigest_merge_accuracy_profile_impl.hpp @@ -17,44 +17,40 @@ * under the License. */ -#ifndef TDIGEST_SKETCH_ACCURACY_PROFILE_IMPL_HPP_ -#define TDIGEST_SKETCH_ACCURACY_PROFILE_IMPL_HPP_ +#ifndef TDIGEST_MERGE_ACCURACY_PROFILE_IMPL_HPP_ +#define TDIGEST_MERGE_ACCURACY_PROFILE_IMPL_HPP_ #include <tdigest.hpp> +#include <req_sketch.hpp> + +#include "true_rank.hpp" namespace datasketches { -// assumes sorted values, 0 <= rank <= 1 -// only the first n = size values in the vector should be used template<typename T> -T get_quantile(const std::vector<T>& values, size_t size, double rank) { - return values[(size - 1) * rank]; -} +void tdigest_merge_accuracy_profile<T>::run_trial(std::vector<T>& values, size_t stream_length, uint16_t k, + const std::vector<double>& ranks, std::vector<std::vector<double>>& rank_errors) { -// assumes sorted values, given value is one of the values in the vector -// only the first n = size values in the vector should be used -template<typename T> -double get_rank(const std::vector<T>& values, size_t size, T value) { - auto lower = std::lower_bound(values.begin(), values.begin() + size, value); - const auto d1 = std::distance(values.begin(), lower); - auto upper = std::upper_bound(lower, values.begin() + size, value); - const auto d2 = std::distance(values.begin(), upper); - return (d1 + d2) / 2.0 / size; -} + const size_t num_sketches = 32; + std::vector<tdigest<T>> sketches; + for (size_t i = 0; i < num_sketches; ++i) sketches.push_back(tdigest<T>(k)); -template<typename T> -void tdigest_sketch_accuracy_profile<T>::run_trial(std::vector<T>& values, size_t stream_length, uint16_t k, - const std::vector<double>& ranks, std::vector<std::vector<double>>& rank_errors) { + size_t s = 0; + for (size_t i = 0; i < stream_length; ++i) { + sketches[s].update(values[i]); + ++s; + if (s == num_sketches) s = 0; + } - tdigest<T> sketch(k); - for (size_t i = 0; i < stream_length; ++i) sketch.update(values[i]); + tdigest<T> merge_sketch(k); + for (size_t i = 0; i < num_sketches; ++i) merge_sketch.merge(sketches[i]); std::sort(values.begin(), values.begin() + stream_length); unsigned j = 0; for (const double rank: ranks) { const T quantile = get_quantile(values, stream_length, rank); - const double true_rank = get_rank(values, stream_length, quantile); - rank_errors[j++].push_back(std::abs(sketch.get_rank(quantile) - true_rank)); + const double true_rank = get_rank(values, stream_length, quantile, MIDPOINT); + rank_errors[j++].push_back(std::abs(merge_sketch.get_rank(quantile) - true_rank)); } } diff --git a/cpp/src/tdigest_merge_timing_profile.hpp b/cpp/src/tdigest_merge_timing_profile.hpp new file mode 100644 index 0000000..d0c3ed9 --- /dev/null +++ b/cpp/src/tdigest_merge_timing_profile.hpp @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#ifndef TDIGEST_MERGE_TIMING_PROFILE_HPP_ +#define TDIGEST_MERGE_TIMING_PROFILE_HPP_ + +#include <random> + +#include "job_profile.hpp" + +namespace datasketches { + +template<typename T> +class tdigest_timing_profile: public job_profile { +public: + tdigest_timing_profile(); + void run(); + T sample(); +private: + std::default_random_engine generator; + std::uniform_real_distribution<T> distribution; +}; + +} + +#include "tdigest_merge_timing_profile_impl.hpp" + +#endif diff --git a/cpp/src/tdigest_timing_profile_impl.hpp b/cpp/src/tdigest_merge_timing_profile_impl.hpp similarity index 94% copy from cpp/src/tdigest_timing_profile_impl.hpp copy to cpp/src/tdigest_merge_timing_profile_impl.hpp index 8be06b9..9853deb 100644 --- a/cpp/src/tdigest_timing_profile_impl.hpp +++ b/cpp/src/tdigest_merge_timing_profile_impl.hpp @@ -59,6 +59,7 @@ void tdigest_timing_profile<T>::run() { std::chrono::nanoseconds build_time_ns(0); std::chrono::nanoseconds update_time_ns(0); std::chrono::nanoseconds get_rank_time_ns(0); + size_t size_bytes = 0; const size_t num_trials = get_num_trials(stream_length, lg_min_stream_len, lg_max_stream_len, lg_min_trials, lg_max_trials); for (size_t t = 0; t < num_trials; ++t) { @@ -80,12 +81,16 @@ void tdigest_timing_profile<T>::run() { // } // auto finish_get_rank(std::chrono::high_resolution_clock::now()); // get_rank_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_get_rank - start_get_rank); + + size_bytes += sketch.get_serialized_size_bytes(); } std::cout << stream_length << "\t" << num_trials << "\t" << (double) build_time_ns.count() / num_trials << "\t" << (double) update_time_ns.count() / num_trials / stream_length << "\t" - << (double) get_rank_time_ns.count() / num_trials / num_queries << "\n"; +// << (double) get_rank_time_ns.count() / num_trials / num_queries << "\t" + << (double) size_bytes / num_trials << "\n"; + stream_length = pwr_2_law_next(ppo, stream_length); } } diff --git a/cpp/src/tdigest_sketch_accuracy_profile_impl.hpp b/cpp/src/tdigest_sketch_accuracy_profile_impl.hpp index 4c1a6e4..60e84ba 100644 --- a/cpp/src/tdigest_sketch_accuracy_profile_impl.hpp +++ b/cpp/src/tdigest_sketch_accuracy_profile_impl.hpp @@ -22,25 +22,9 @@ #include <tdigest.hpp> -namespace datasketches { - -// assumes sorted values, 0 <= rank <= 1 -// only the first n = size values in the vector should be used -template<typename T> -T get_quantile(const std::vector<T>& values, size_t size, double rank) { - return values[(size - 1) * rank]; -} +#include "true_rank.hpp" -// assumes sorted values, given value is one of the values in the vector -// only the first n = size values in the vector should be used -template<typename T> -double get_rank(const std::vector<T>& values, size_t size, T value) { - auto lower = std::lower_bound(values.begin(), values.begin() + size, value); - const auto d1 = std::distance(values.begin(), lower); - auto upper = std::upper_bound(lower, values.begin() + size, value); - const auto d2 = std::distance(values.begin(), upper); - return (d1 + d2) / 2.0 / size; -} +namespace datasketches { template<typename T> void tdigest_sketch_accuracy_profile<T>::run_trial(std::vector<T>& values, size_t stream_length, uint16_t k, @@ -53,7 +37,7 @@ void tdigest_sketch_accuracy_profile<T>::run_trial(std::vector<T>& values, size_ unsigned j = 0; for (const double rank: ranks) { const T quantile = get_quantile(values, stream_length, rank); - const double true_rank = get_rank(values, stream_length, quantile); + const double true_rank = get_rank(values, stream_length, quantile, MIDPOINT); rank_errors[j++].push_back(std::abs(sketch.get_rank(quantile) - true_rank)); } } diff --git a/cpp/src/tdigest_timing_profile_impl.hpp b/cpp/src/tdigest_timing_profile_impl.hpp index 8be06b9..644528f 100644 --- a/cpp/src/tdigest_timing_profile_impl.hpp +++ b/cpp/src/tdigest_timing_profile_impl.hpp @@ -26,6 +26,7 @@ #include <sstream> #include <tdigest.hpp> +#include <req_sketch.hpp> namespace datasketches { @@ -46,6 +47,8 @@ void tdigest_timing_profile<T>::run() { const size_t num_queries(20); + const uint16_t k = 200; + std::cout << "Stream\tTrials\tBuild\tUpdate\tRank" << std::endl; std::vector<T> values(1ULL << lg_max_stream_len, 0); @@ -59,13 +62,15 @@ void tdigest_timing_profile<T>::run() { std::chrono::nanoseconds build_time_ns(0); std::chrono::nanoseconds update_time_ns(0); std::chrono::nanoseconds get_rank_time_ns(0); + size_t size_bytes = 0; const size_t num_trials = get_num_trials(stream_length, lg_min_stream_len, lg_max_stream_len, lg_min_trials, lg_max_trials); for (size_t t = 0; t < num_trials; ++t) { std::generate(values.begin(), values.begin() + stream_length, [this] { return this->sample(); }); auto start_build(std::chrono::high_resolution_clock::now()); - tdigest<T> sketch; + tdigest<T> sketch(k); +// req_sketch<T> sketch(40); auto finish_build(std::chrono::high_resolution_clock::now()); build_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_build - start_build); @@ -80,12 +85,16 @@ void tdigest_timing_profile<T>::run() { // } // auto finish_get_rank(std::chrono::high_resolution_clock::now()); // get_rank_time_ns += std::chrono::duration_cast<std::chrono::nanoseconds>(finish_get_rank - start_get_rank); + + size_bytes += sketch.get_serialized_size_bytes(); } std::cout << stream_length << "\t" << num_trials << "\t" << (double) build_time_ns.count() / num_trials << "\t" << (double) update_time_ns.count() / num_trials / stream_length << "\t" - << (double) get_rank_time_ns.count() / num_trials / num_queries << "\n"; +// << (double) get_rank_time_ns.count() / num_trials / num_queries << "\t" + << (double) size_bytes / num_trials << "\n"; + stream_length = pwr_2_law_next(ppo, stream_length); } } diff --git a/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestAccuracyProfile.java b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestAccuracyProfile.java new file mode 100644 index 0000000..0598ca4 --- /dev/null +++ b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestAccuracyProfile.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.characterization.tdigest; + +import static org.apache.datasketches.common.Util.pwr2SeriesNext; + +import java.util.Arrays; +import java.util.Random; + +import org.apache.datasketches.Job; +import org.apache.datasketches.JobProfile; +import org.apache.datasketches.MonotonicPoints; +import org.apache.datasketches.Properties; + +import com.tdunning.math.stats.TDigest; + +public class TDigestAccuracyProfile implements JobProfile { + private Job job; + private Properties prop; + + private Random rand; + private double[] values; + + @Override + public void start(final Job job) { + this.job = job; + prop = job.getProperties(); + final int lgMin = Integer.parseInt(prop.mustGet("LgMin")); + final int lgMax = Integer.parseInt(prop.mustGet("LgMax")); + final int lgDelta = Integer.parseInt(prop.mustGet("LgDelta")); + final int ppo = Integer.parseInt(prop.mustGet("PPO")); + + final int compression = Integer.parseInt(prop.mustGet("Compression")); + final double[] ranks = {0.01, 0.05, 0.5, 0.95, 0.99}; + final double errorPercentile = Double.parseDouble(prop.mustGet("ErrorPercentile")); + + rand = new Random(); + values = new double[1 << lgMax]; + + final int numSteps; + final boolean useppo; + if (lgDelta < 1) { + numSteps = MonotonicPoints.countPoints(lgMin, lgMax, ppo); + useppo = true; + } else { + numSteps = (lgMax - lgMin) / lgDelta + 1; + useppo = false; + } + + int streamLength = 1 << lgMin; + int lgCurSL = lgMin; + for (int step = 0; step < numSteps; step++) { + + doStreamLength(streamLength, compression, ranks, errorPercentile); + + if (useppo) { + streamLength = (int)pwr2SeriesNext(ppo, streamLength); + } else { + lgCurSL += lgDelta; + streamLength = 1 << lgCurSL; + } + } + } + + @Override + public void shutdown() {} + + @Override + public void cleanup() {} + + void doStreamLength(final int streamLength, final int compression, final double[] ranks, final double errorPercentile) { + final int numTrials = 1 << Integer.parseInt(prop.mustGet("LgTrials")); + + double[][] rankErrors = new double[ranks.length][]; + for (int i = 0; i < ranks.length; i++) rankErrors[i] = new double[numTrials]; + + for (int t = 0; t < numTrials; t++) { + runTrial(t, streamLength, compression, ranks, rankErrors); + } + + job.print(streamLength); + for (int i = 0; i < ranks.length; i++) { + Arrays.sort(rankErrors[i]); + final int errPctIndex = (int)(numTrials * errorPercentile / 100); + final double rankErrorAtPct = rankErrors[i][errPctIndex]; + job.print("\t"); + job.print(rankErrorAtPct * 100); + } + job.print("\n"); + } + + void runTrial(final int trial, final int streamLength, final int compression, final double[] ranks, final double[][] rankErrors) { + TDigest td = TDigest.createDigest(compression); + for (int i = 0; i < streamLength; i++) { + values[i] = rand.nextDouble(); + td.add(values[i]); + } + Arrays.sort(values, 0, streamLength); + for (int i = 0; i < ranks.length; i++) { + final double quantile = values[(int)((streamLength - 1) * ranks[i])]; + final double trueRank = computeTrueRank(values, streamLength, quantile); + rankErrors[i][trial] = Math.abs(trueRank - td.cdf(quantile)); + } + } + + static double computeTrueRank(final double[] values, final int streamLength, final double value) { + final int lower = lowerBound(values, 0, streamLength, value); + final int upper = upperBound(values, lower, streamLength, value); + return (lower + upper) / 2.0 / streamLength; + } + + static int lowerBound(final double[] values, int first, final int last, final double value) { + int current; + int step; + int count = last - first; + while (count > 0) { + current = first; + step = count / 2; + current += step; + if (values[current] < value) { + first = ++current; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + + static int upperBound(final double[] values, int first, final int last, final double value) { + int current; + int step; + int count = last - first; + while (count > 0) { + current = first; + step = count / 2; + current += step; + if (!(value < values[current])) { + first = ++current; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + +} diff --git a/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestErrorVsRankProfile.java b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestErrorVsRankProfile.java new file mode 100644 index 0000000..966b6a8 --- /dev/null +++ b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestErrorVsRankProfile.java @@ -0,0 +1,311 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.characterization.tdigest; + +import static java.lang.Math.round; +import static org.apache.datasketches.GaussianRanks.GAUSSIANS_3SD; +import static org.apache.datasketches.SpacedPoints.expSpaced; +import static org.apache.datasketches.common.Util.pwr2SeriesNext; +import static org.apache.datasketches.quantilescommon.QuantilesUtil.evenlySpacedDoubles; + +import org.apache.datasketches.Job; +import org.apache.datasketches.JobProfile; +import org.apache.datasketches.MonotonicPoints; +import org.apache.datasketches.Properties; +import org.apache.datasketches.characterization.Shuffle; +import org.apache.datasketches.characterization.req.StreamMaker; +import org.apache.datasketches.characterization.req.StreamMaker.Pattern; +import org.apache.datasketches.hll.HllSketch; +import org.apache.datasketches.quantiles.DoublesSketch; +import org.apache.datasketches.quantiles.DoublesSketchBuilder; +import org.apache.datasketches.quantiles.UpdateDoublesSketch; + +import com.tdunning.math.stats.TDigest; + +/** + * @author Lee Rhodes + */ +public class TDigestErrorVsRankProfile implements JobProfile { + private Job job; + private Properties prop; + + //FROM PROPERTIES + //Stream pattern config + StreamMaker streamMaker = new StreamMaker(); + private Pattern pattern; + private int offset; + + //For computing the different stream lengths + private int lgMin; + private int lgMax; + private int lgDelta; + private int ppo; //not currently used + + private int numTrials; //num of Trials per plotPoint + private int errQSkLgK; //size of the error quantiles sketches + private int errHllSkLgK; //size of the error HLL sketch + private boolean shuffle; //if true, shuffle for each trial + + //plotting & x-axis configuration + private int numPlotPoints; + private boolean evenlySpaced; + private double exponent; + private double rankRange; + private double metricsRankRange; + + //Target sketch configuration & error analysis + private int K; + private TDigest sk; + + //The array of Gaussian quantiles for +/- StdDev error analysis + private double[] gRanks; + private UpdateDoublesSketch[] errQSkArr; + private HllSketch[] errHllSkArr; + + //Specific to a streamLength + private TrueDoubleRanks trueRanks; + //The entire stream + private float[] stream; //a shuffled array of values from 1...N + private float[] sortedStream; + private int[] sortedAbsRanks; + //private int[] streamAbsRanks ?? do we need? + //The PP points + private float[] sortedPPValues; + private int[] sortedPPIndices; + private int[] sortedPPAbsRanks; + int sumAllocCounts = 0; + + private final String[] columnLabels = + {"nPP", "Value", "Rank", + "-3SD","-2SD", "-1SD", "Med", "+1SD", "+2SD", "+3SD", + "1LB", "1UB", "UErrCnt"}; + private final String sFmt = + "%3s\t%5s\t%4s\t" + + "%4s\t%4s\t%4s\t%5s\t%4s\t%4s\t%4s\t" + + "%3s\t%3s\t%7s\n"; + private final String fFmt = + "%f\t%f\t%f\t" // rPP, Value, Rank + + "%f\t%f\t%f\t%f\t%f\t%f\t%f\t" //-3sd to +3sd + + "\t%d\n"; // UErrCnt + + //JobProfile interface + @Override + public void start(final Job job) { + this.job = job; + prop = job.getProperties(); + extractProperties(); + configureCommon(); + doStreamLengths(); + } + + @Override + public void shutdown() {} + + @Override + public void cleanup() {} + //end JobProfile + + private void extractProperties() { + //Stream Pattern + pattern = Pattern.valueOf(prop.mustGet("Pattern")); + offset = Integer.parseInt(prop.mustGet("Offset")); + //Stream lengths + lgMin = Integer.parseInt(prop.mustGet("LgMin")); + lgMax = Integer.parseInt(prop.mustGet("LgMax")); + lgDelta = Integer.parseInt(prop.mustGet("LgDelta")); + ppo = Integer.parseInt(prop.mustGet("PPO")); + // Trials config (independent of sketch) + numTrials = 1 << Integer.parseInt(prop.mustGet("LgTrials")); + errQSkLgK = Integer.parseInt(prop.mustGet("ErrQSkLgK")); + errHllSkLgK = Integer.parseInt(prop.mustGet("ErrHllSkLgK")); + shuffle = Boolean.valueOf(prop.mustGet("Shuffle")); + //plotting + numPlotPoints = Integer.parseInt(prop.mustGet("NumPlotPoints")); + evenlySpaced = Boolean.valueOf(prop.mustGet("EvenlySpaced")); + exponent = Double.parseDouble(prop.mustGet("Exponent")); + rankRange = Double.parseDouble(prop.mustGet("RankRange")); + //Target sketch config + K = Integer.parseInt(prop.mustGet("K")); + + metricsRankRange = Double.parseDouble(prop.mustGet("MetricsRankRange")); + } + + void configureCommon() { + configureSketch(); + errQSkArr = new UpdateDoublesSketch[numPlotPoints]; + errHllSkArr = new HllSketch[numPlotPoints]; + //configure the error quantiles array & HLL sketch arr + final DoublesSketchBuilder builder = DoublesSketch.builder().setK(1 << errQSkLgK); + for (int i = 0; i < numPlotPoints; i++) { + errQSkArr[i] = builder.build(); + errHllSkArr[i] = new HllSketch(errHllSkLgK); + } + gRanks = new double[GAUSSIANS_3SD.length - 2]; //omit 0.0 and 1.0 + for (int i = 1; i < GAUSSIANS_3SD.length - 1; i++) { + gRanks[i - 1] = GAUSSIANS_3SD[i]; + } + } + + void configureSketch() { + } + + private void doStreamLengths() { + //compute the number of stream lengths for the whole job + final int numSteps; + final boolean useppo; + if (lgDelta < 1) { + numSteps = MonotonicPoints.countPoints(lgMin, lgMax, ppo); + useppo = true; + } else { + numSteps = (lgMax - lgMin) / lgDelta + 1; + useppo = false; + } + + int streamLength = 1 << lgMin; //initial streamLength + int lgCurSL = lgMin; + + // Step through the different stream lengths + for (int step = 0; step < numSteps; step++) { + + doStreamLength(streamLength); + + //go to next stream length + if (useppo) { + streamLength = (int)pwr2SeriesNext(ppo, streamLength); + } else { + lgCurSL += lgDelta; + streamLength = 1 << lgCurSL; + } + } + } + + void doStreamLength(final int streamLength) { + job.println(LS + "Stream Length: " + streamLength ); + job.println(LS + "param k: " + K ); + job.printfData(sFmt, (Object[])columnLabels); + //build the stream + stream = streamMaker.makeStream(streamLength, pattern, offset); + //compute true ranks + trueRanks = new TrueDoubleRanks(stream); + sortedStream = trueRanks.getSortedStream(); + sortedAbsRanks = trueRanks.getSortedAbsRanks(); + + //compute the true values used at the plot points + int startIdx = 0; + int endIdx = streamLength - 1; + final boolean hra = true; + if (rankRange < 1.0) { //A substream of points focuses on a sub-range at one end. + final int subStreamLen = (int)Math.round(rankRange * streamLength); + startIdx = hra ? streamLength - subStreamLen : 0; + endIdx = hra ? streamLength - 1 : subStreamLen - 1; + } + + //generates PP indices in [startIdx, endIdx] inclusive, inclusive + // PV 2020-01-07: using double so that there's enough precision even for large stream lengths + final double[] temp = evenlySpaced + ? evenlySpacedDoubles(startIdx, endIdx, numPlotPoints) + : expSpaced(startIdx, endIdx, numPlotPoints, exponent, hra); + + sortedPPIndices = new int[numPlotPoints]; + sortedPPAbsRanks = new int[numPlotPoints]; + sortedPPValues = new float[numPlotPoints]; + + for (int pp = 0; pp < numPlotPoints; pp++) { + final int idx = (int)Math.round(temp[pp]); + sortedPPIndices[pp] = idx; + sortedPPAbsRanks[pp] = sortedAbsRanks[idx]; + sortedPPValues[pp] = sortedStream[idx]; + } + + //Do numTrials for all plotpoints + for (int t = 0; t < numTrials; t++) { + doTrial(); + + //sumAllocCounts = sk. + } + + // for special metrics for capturing accuracy per byte + double sumRelStdDev = 0; + int numRelStdDev = 0; + double sumAddStdDev = 0; + int numAddStdDev = 0; + + //at this point each of the errQSkArr sketches has a distribution of error from numTrials + for (int pp = 0 ; pp < numPlotPoints; pp++) { + final double v = sortedPPValues[pp]; + final double tr = v / streamLength; //the true rank + + //for each of the numErrDistRanks distributions extract the sd Gaussian quantiles + final double[] errQ = errQSkArr[pp].getQuantiles(gRanks); + final int uErrCnt = (int)round(errHllSkArr[pp].getEstimate()); + + //Plot the row. + final double relPP = (double)(pp + 1) / numPlotPoints; + job.printfData(fFmt, relPP, v, tr, + errQ[0], errQ[1], errQ[2], errQ[3], errQ[4], errQ[5], errQ[6], + uErrCnt); + + if (relPP > 0 && relPP < 1 + && (hra && relPP < metricsRankRange || !hra && relPP >= 1 - metricsRankRange)) { + sumAddStdDev += errQ[4]; + numAddStdDev++; + } + if (relPP > 0 && relPP < 1 + && (!hra && relPP < metricsRankRange || hra && relPP >= 1 - metricsRankRange)) { + sumRelStdDev += errQ[4] / (hra ? 1 - relPP : relPP); + numRelStdDev++; + } + errQSkArr[pp].reset(); //reset the errQSkArr for next streamLength + errHllSkArr[pp].reset(); //reset the errHllSkArr for next streamLength + } + final int serBytes = sk.smallByteSize(); + + // special metrics for capturing accuracy per byte + final double avgRelStdDevTimesSize = serBytes * sumRelStdDev / numRelStdDev; + final double avgAddStdDevTimesSize = serBytes * sumAddStdDev / numAddStdDev; + job.println(LS + "Avg. relative std. dev. times size: " + avgRelStdDevTimesSize); + job.println( "Avg. additive std. dev. times size: " + avgAddStdDevTimesSize); + + job.println(LS + "Serialization Bytes: " + serBytes); +// job.println(sk.viewCompactorDetail("%5.0f", false)); + } + + /** + * A trial consists of updating a virgin sketch with a stream of values. + * Capture the estimated ranks for all plotPoints and then update the errQSkArr with those + * error values. + */ + void doTrial() { + sk = TDigest.createDigest(K); + if (shuffle) { Shuffle.shuffle(stream); } + final int sl = stream.length; + for (int i = 0; i < sl; i++) { sk.add(stream[i]); } + //get estimated ranks from sketch for all plot points + final double[] estRanks = new double[sortedPPValues.length]; + for (int i = 0; i < sortedPPValues.length; i++) estRanks[i] = sk.cdf(sortedPPValues[i]); + //compute errors and update HLL for each plotPoint + for (int pp = 0; pp < numPlotPoints; pp++) { + final double errorAtPlotPoint = estRanks[pp] - (double)sortedPPAbsRanks[pp] / sl; + errQSkArr[pp].update(errorAtPlotPoint); //update each of the errQArr sketches + errHllSkArr[pp].update(errorAtPlotPoint); //unique count of error values + } + } + +} diff --git a/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestMergeAccuracyProfile.java b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestMergeAccuracyProfile.java new file mode 100644 index 0000000..15fdd3d --- /dev/null +++ b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestMergeAccuracyProfile.java @@ -0,0 +1,173 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.characterization.tdigest; + +import static org.apache.datasketches.common.Util.pwr2SeriesNext; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Random; + +import org.apache.datasketches.Job; +import org.apache.datasketches.JobProfile; +import org.apache.datasketches.MonotonicPoints; +import org.apache.datasketches.Properties; + +import com.tdunning.math.stats.TDigest; + +public class TDigestMergeAccuracyProfile implements JobProfile { + private Job job; + private Properties prop; + + private Random rand; + private double[] values; + + @Override + public void start(final Job job) { + this.job = job; + prop = job.getProperties(); + final int lgMin = Integer.parseInt(prop.mustGet("LgMin")); + final int lgMax = Integer.parseInt(prop.mustGet("LgMax")); + final int lgDelta = Integer.parseInt(prop.mustGet("LgDelta")); + final int ppo = Integer.parseInt(prop.mustGet("PPO")); + + final int compression = Integer.parseInt(prop.mustGet("Compression")); + final double[] ranks = {0.01, 0.05, 0.5, 0.95, 0.99}; + final double errorPercentile = Double.parseDouble(prop.mustGet("ErrorPercentile")); + + rand = new Random(); + values = new double[1 << lgMax]; + + final int numSteps; + final boolean useppo; + if (lgDelta < 1) { + numSteps = MonotonicPoints.countPoints(lgMin, lgMax, ppo); + useppo = true; + } else { + numSteps = (lgMax - lgMin) / lgDelta + 1; + useppo = false; + } + + int streamLength = 1 << lgMin; + int lgCurSL = lgMin; + for (int step = 0; step < numSteps; step++) { + + doStreamLength(streamLength, compression, ranks, errorPercentile); + + if (useppo) { + streamLength = (int)pwr2SeriesNext(ppo, streamLength); + } else { + lgCurSL += lgDelta; + streamLength = 1 << lgCurSL; + } + } + } + + @Override + public void shutdown() {} + + @Override + public void cleanup() {} + + void doStreamLength(final int streamLength, final int compression, final double[] ranks, final double errorPercentile) { + final int numTrials = 1 << Integer.parseInt(prop.mustGet("LgTrials")); + + double[][] rankErrors = new double[ranks.length][]; + for (int i = 0; i < ranks.length; i++) rankErrors[i] = new double[numTrials]; + + for (int t = 0; t < numTrials; t++) { + runTrial(t, streamLength, compression, ranks, rankErrors); + } + + job.print(streamLength); + for (int i = 0; i < ranks.length; i++) { + Arrays.sort(rankErrors[i]); + final int errPctIndex = (int)(numTrials * errorPercentile / 100); + final double rankErrorAtPct = rankErrors[i][errPctIndex]; + job.print("\t"); + job.print(rankErrorAtPct * 100); + } + job.print("\n"); + } + + void runTrial(final int trial, final int streamLength, final int compression, final double[] ranks, final double[][] rankErrors) { + final int numSketches = Integer.parseInt(prop.mustGet("NumSketches")); + final ArrayList<TDigest> tds = new ArrayList<>(); + for (int s = 0; s < numSketches; s++) tds.add(TDigest.createDigest(compression)); + int s = 0; + for (int i = 0; i < streamLength; i++) { + values[i] = rand.nextDouble(); + tds.get(s).add(values[i]); + s++; + if (s == numSketches) s = 0; + } + TDigest tdMerge = TDigest.createDigest(compression); + tdMerge.add(tds); + Arrays.sort(values, 0, streamLength); + for (int i = 0; i < ranks.length; i++) { + final double quantile = values[(int)((streamLength - 1) * ranks[i])]; + final double trueRank = computeTrueRank(values, streamLength, quantile); + rankErrors[i][trial] = Math.abs(trueRank - tdMerge.cdf(quantile)); + } + } + + static double computeTrueRank(final double[] values, final int streamLength, final double value) { + final int lower = lowerBound(values, 0, streamLength, value); + final int upper = upperBound(values, lower, streamLength, value); + return (lower + upper) / 2.0 / streamLength; + } + + static int lowerBound(final double[] values, int first, final int last, final double value) { + int current; + int step; + int count = last - first; + while (count > 0) { + current = first; + step = count / 2; + current += step; + if (values[current] < value) { + first = ++current; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + + static int upperBound(final double[] values, int first, final int last, final double value) { + int current; + int step; + int count = last - first; + while (count > 0) { + current = first; + step = count / 2; + current += step; + if (!(value < values[current])) { + first = ++current; + count -= step + 1; + } else { + count = step; + } + } + return first; + } + +} diff --git a/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestSpeedProfile.java b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestSpeedProfile.java new file mode 100644 index 0000000..b4e2efc --- /dev/null +++ b/src/main/java/org/apache/datasketches/characterization/tdigest/TDigestSpeedProfile.java @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +package org.apache.datasketches.characterization.tdigest; + +import java.util.Random; + +import org.apache.datasketches.Properties; +import org.apache.datasketches.characterization.quantiles.BaseQuantilesSpeedProfile; + +import com.tdunning.math.stats.TDigest; + +public class TDigestSpeedProfile extends BaseQuantilesSpeedProfile { + + private static final Random rnd = new Random(); + private int k; + private double[] inputValues; + private int numQueryValues; + private double[] rankQueryValues; + private double[] quantileQueryValues; + + long buildTimeNs; + long updateTimeNs; + long getRankTimeNs; + long getQuantileTimeNs; + long serializeTimeNs; + long deserializeTimeNs; + long fullSerializedSizeBytes; + long smallSerializedSizeBytes; + int numCentroids; + + @Override + public void configure(final int k, final int numQueryValues, final Properties properties) { + this.k = k; + this.numQueryValues = numQueryValues; + } + + @Override + public void prepareTrial(final int streamLength) { + // prepare input data + inputValues = new double[streamLength]; + for (int i = 0; i < streamLength; i++) { + inputValues[i] = rnd.nextDouble(); + } + // prepare query data + quantileQueryValues = new double[numQueryValues]; + for (int i = 0; i < numQueryValues; i++) { + quantileQueryValues[i] = rnd.nextDouble(); + } + rankQueryValues = new double[numQueryValues]; + for (int i = 0; i < numQueryValues; i++) { + rankQueryValues[i] = rnd.nextDouble(); + } + resetStats(); + } + + @Override + public void doTrial() { + final long startBuild = System.nanoTime(); + final TDigest sketch = TDigest.createDigest(k); + final long stopBuild = System.nanoTime(); + buildTimeNs += stopBuild - startBuild; + + final long startUpdate = System.nanoTime(); + for (int i = 0; i < inputValues.length; i++) { + sketch.add(inputValues[i]); + } + final long stopUpdate = System.nanoTime(); + updateTimeNs += stopUpdate - startUpdate; + + final long startGetRank = System.nanoTime(); + for (final double value: rankQueryValues) { + sketch.cdf(value); + } + final long stopGetRank = System.nanoTime(); + getRankTimeNs += stopGetRank - startGetRank; + + final long startGetQuantile = System.nanoTime(); + for (final double value: quantileQueryValues) { + sketch.quantile(value); + } + final long stopGetQuantile = System.nanoTime(); + getQuantileTimeNs += stopGetQuantile - startGetQuantile; + +// final long startSerialize = System.nanoTime(); +// final byte[] bytes = sketch.asBytes(null); +// final long stopSerialize = System.nanoTime(); +// serializeTimeNs += stopSerialize - startSerialize; + +// final long startDeserialize = System.nanoTime(); +// final deserializedSketch = MergingDigest.fromBytes(bytes); +// final long stopDeserialize = System.nanoTime(); +// deserializeTimeNs += stopDeserialize - startDeserialize; + + numCentroids += sketch.centroidCount(); + fullSerializedSizeBytes += sketch.byteSize(); + smallSerializedSizeBytes += sketch.smallByteSize(); + } + + @Override + public String getHeader() { + return "Stream\tTrials\tBuild\tUpdate\tRank\tQuant\tSer\tDeser\tCentroids\tFullSize\tSmallSize"; + } + + @Override + public String getStats(final int streamLength, final int numTrials, final int numQueryValues) { + return String.format("%d\t%d\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%.1f\t%d\t%d\t%d", + streamLength, + numTrials, + (double) buildTimeNs / numTrials, + (double) updateTimeNs / numTrials / streamLength, + (double) getRankTimeNs / numTrials / numQueryValues, + (double) getQuantileTimeNs / numTrials / numQueryValues, + (double) serializeTimeNs / numTrials, + (double) deserializeTimeNs / numTrials, + numCentroids / numTrials, + fullSerializedSizeBytes / numTrials, + smallSerializedSizeBytes / numTrials + ); + } + + private void resetStats() { + buildTimeNs = 0; + updateTimeNs = 0; + getRankTimeNs = 0; + getQuantileTimeNs = 0; + serializeTimeNs = 0; + deserializeTimeNs = 0; + fullSerializedSizeBytes = 0; + smallSerializedSizeBytes = 0; + numCentroids = 0; + } + +} diff --git a/src/main/resources/tdigest/TDigestAccuracyJob.conf b/src/main/resources/tdigest/TDigestAccuracyJob.conf new file mode 100644 index 0000000..7769a8f --- /dev/null +++ b/src/main/resources/tdigest/TDigestAccuracyJob.conf @@ -0,0 +1,38 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +JobProfile=org.apache.datasketches.characterization.tdigest.TDigestAccuracyProfile + +## Stream lengths +LgMin=0 # The starting stream length +LgMax=23 # How high the stream length goes +LgDelta=0 # If > 0, this is the lg Increment +PPO=16 # The horizontal x-resolution of trials points + +# Trials config (indep of sketch) +LgTrials=10 # lgTrials at every stream length + +# Specific sketch config +Compression=100 # sketch size and accuracy parameter + +ErrorPercentile=99 # output this percentile from rank error distribution + +# Date-Time Profile +TimeZone=UTC +TimeZoneOffset=0 #-25200000 # offset in millisec: PST (UTC-8) = -28_800_000 PDT (UTC-7) = -25_200_000 +FileNameDateFormat=yyyyMMdd'_'HHmmssz +ReadableDateFormat=yyyy/MM/dd HH:mm:ss diff --git a/src/main/resources/tdigest/TDigestErrorVsRankJob.conf b/src/main/resources/tdigest/TDigestErrorVsRankJob.conf new file mode 100644 index 0000000..1c4ea20 --- /dev/null +++ b/src/main/resources/tdigest/TDigestErrorVsRankJob.conf @@ -0,0 +1,64 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +JobProfile=org.apache.datasketches.characterization.tdigest.TDigestErrorVsRankProfile + +# Stream Pattern +Pattern=RANDOM # Sorted, Reversed, Zoomin, Zoomout, Random, Sqrt, FlipFlop, ZoominSqrt +Offset=1 #0 for min value of 0; 1 for min value of 1 + +## Stream lengths +LgMin=24 # The starting stream length +LgMax=24 # How high the stream length goes +LgDelta=2 # If > 0, this is the lg Increment +PPO=1 # The horizontal x-resolution of trials points + +# Trials config (indep of sketch) +LgTrials=14 # lgTrials at every stream length +ErrQSkLgK=12 # the rank error distribution sketch LgK +ErrHllSkLgK=12 # the rank error HLL sketch Lgk +Shuffle=false # If true, shuffle before each trial + +# Plotting +NumPlotPoints=100 # number of plot points along the x-axis +EvenlySpaced=false # if true the x-axis points will be evenly spaced ranks in [0,1], otherwise exponential in [0,1] +Exponent=2.0 # the steepness of the exponential x-axis density gradient curve, must be >= 1.0 +StdDev=1 # std deviation used when plotting LB, UB +RankRange=1.0 # range of rank to plot. E.g., given 0.3: if LRA => 0 to 0.3; if HRA => 0.7 to 1.0 + +# Specific sketch config +K=100 # sketch size and accuracy parameter + +HRA=true # if true use high-rank accuracy, otherwise low-rank accuracy +#Compatible=false +# For LRA, LE,GT have the converged point at rank 1.0 +# For HRA, LT,GE have the converged point at rank 0.0 +# Criterion=LE # LT, LE, GT, GE. Must be all caps. +LtEq=true + +# ReqDebugLevel=2 # or 0, 1, 2. disable by commenting it out. Use only when LgTrials=0 +# ReqDebugFmt=%5.0f + +# Date-Time Profile +TimeZone=UTC +TimeZoneOffset=0 #-25200000 # offset in millisec: PST (UTC-8) = -28_800_000 PDT (UTC-7) = -25_200_000 +FileNameDateFormat=yyyyMMdd'_'HHmmssz +ReadableDateFormat=yyyy/MM/dd HH:mm:ss + +# FOR SPECIAL METRICS CAPTURING ACCURACY PER BYTE +MetricsRankRange = 0.3 + diff --git a/src/main/resources/tdigest/TDigestMergeAccuracyJob.conf b/src/main/resources/tdigest/TDigestMergeAccuracyJob.conf new file mode 100644 index 0000000..233d8f5 --- /dev/null +++ b/src/main/resources/tdigest/TDigestMergeAccuracyJob.conf @@ -0,0 +1,40 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +JobProfile=org.apache.datasketches.characterization.tdigest.TDigestMergeAccuracyProfile + +## Stream lengths +LgMin=0 # The starting stream length +LgMax=23 # How high the stream length goes +LgDelta=0 # If > 0, this is the lg Increment +PPO=16 # The horizontal x-resolution of trials points + +# Trials config (indep of sketch) +LgTrials=10 # lgTrials at every stream length + +# Specific sketch config +Compression=100 # sketch size and accuracy parameter + +NumSketches=8 # stream is sprayed across so many sketches and sketches are merged + +ErrorPercentile=99 # output this percentile from rank error distribution + +# Date-Time Profile +TimeZone=UTC +TimeZoneOffset=0 #-25200000 # offset in millisec: PST (UTC-8) = -28_800_000 PDT (UTC-7) = -25_200_000 +FileNameDateFormat=yyyyMMdd'_'HHmmssz +ReadableDateFormat=yyyy/MM/dd HH:mm:ss diff --git a/src/main/resources/tdigest/TDigestSpeedJob.conf b/src/main/resources/tdigest/TDigestSpeedJob.conf new file mode 100644 index 0000000..4d0b5db --- /dev/null +++ b/src/main/resources/tdigest/TDigestSpeedJob.conf @@ -0,0 +1,27 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +JobProfile=org.apache.datasketches.characterization.tdigest.TDigestSpeedProfile +K=100 # sketch size and accuracy parameter +numQueryValues=20 # number of values for getQuantile() and getRank() + +lgMin=0 # The starting stream length +lgMax=23 # How high the stream length goes +PPO=16 # The horizontal x-resolution of trials points + +lgMaxTrials=16 # Max trials at start (low counts) +lgMinTrials=6 # Min trials at tail (high counts) --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
