This is an automated email from the ASF dual-hosted git repository.
charlie pushed a commit to branch master
in repository
https://gitbox.apache.org/repos/asf/datasketches-characterization.git
The following commit(s) were added to refs/heads/master by this push:
new 87380c0 Added filter accuracy scripts
87380c0 is described below
commit 87380c0d7b509b927d0694a09351410fe6d26a41
Author: Charlie Dickens <[email protected]>
AuthorDate: Wed Jun 12 17:52:20 2024 +0100
Added filter accuracy scripts
---
.../filters/BaseFilterAccuracyProfile.java | 199 +++++++++++++++++++++
.../filters/BloomFilterAccuracyProfile.java | 65 +++++++
.../filters/QuotientFilterAccuracyProfile.java | 65 +++++++
.../resources/filters/BloomFilterAccuracyJob.conf | 47 +++++
.../filters/QuotientFilterAccuracyJob.conf | 47 +++++
5 files changed, 423 insertions(+)
diff --git
a/src/main/java/org/apache/datasketches/characterization/filters/BaseFilterAccuracyProfile.java
b/src/main/java/org/apache/datasketches/characterization/filters/BaseFilterAccuracyProfile.java
new file mode 100644
index 0000000..2304b73
--- /dev/null
+++
b/src/main/java/org/apache/datasketches/characterization/filters/BaseFilterAccuracyProfile.java
@@ -0,0 +1,199 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.Job;
+import org.apache.datasketches.JobProfile;
+import org.apache.datasketches.Properties;
+
+import java.util.ArrayList;
+
+import static java.lang.Math.log;
+import static java.lang.Math.pow;
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+public abstract class BaseFilterAccuracyProfile implements JobProfile{
+
+ Job job;
+ public Properties prop;
+ public long vIn = 1;
+ public int lgU;
+ public double capacity;
+ public long numItemsInserted;
+ public ArrayList<Long> inputItems = new ArrayList<>();
+ public ArrayList<Long> negativeItems = new ArrayList<>();
+ long numQueries;
+
+ int minNumHashes;
+ int maxNumHashes ;
+ int bitsPerEntry;
+
+ int lgMinT;
+ int lgMaxT;
+ int tPPO;
+ int numTrials ;
+ double fpr;
+ long filterNumBits;
+ int lgMinBpU;
+ int lgMaxBpU;
+ double slope;
+
+ //JobProfile
+ @Override
+ public void start(final Job job) {
+ this.job = job;
+ prop = job.getProperties();
+ lgU = Integer.parseInt(prop.mustGet("Universe_lgU"));
+ capacity = Double.parseDouble(prop.mustGet("Universe_capacity"));
+ minNumHashes = Integer.parseInt(prop.mustGet("minNumHashes"));
+ maxNumHashes = Integer.parseInt(prop.mustGet("maxNumHashes"));
+ lgMinT = Integer.parseInt(prop.mustGet("Trials_lgMinT"));
+ lgMaxT = Integer.parseInt(prop.mustGet("Trials_lgMaxT"));
+ lgMinBpU = Integer.parseInt(prop.mustGet("Trials_lgMinBpU"));
+ lgMaxBpU = Integer.parseInt(prop.mustGet("Trials_lgMaxBpU"));
+ slope = (double) (lgMaxT - lgMinT) / (lgMinBpU - lgMaxBpU);
+ tPPO = Integer.parseInt(prop.mustGet("Trials_TPPO"));
+ numQueries = 1<<minNumHashes+1; // starting value for the number of
query points.
+ numTrials = 1<<lgMinT;
+ numItemsInserted = (long) Math.round(capacity * (1L << lgU));
+
+ configure();
+ doTrials();
+ shutdown();
+ cleanup();
+ }
+
+ @Override
+ public void shutdown() {}
+
+ @Override
+ public void cleanup() {}
+ //end JobProfile
+
+ /**
+ * Configure the sketch
+ */
+ public abstract void configure();
+
+ /**
+ * Used to get the size of the filter in bits for the current trial.
+ */
+ public abstract long getFilterLengthBits();
+
+
+ /**
+ * @return the number of bits per entry for the filter.
+ */
+ public abstract int getBitsperEntry(final int numHashes);
+
+ /**
+ * This method is used to perform a trial with a specific number of hashes
and queries.
+ *
+ * @param numHashBits The number of hashes to be used in the trial.
+ * See the specific profile for how this is implemented as the number of
hashes
+ * might have different meanings depending on the implementation.
+ * @param numQueries The number of queries to be performed in the trial.
+ * @return The average update time per update for this trial.
+ */
+ public abstract double doTrial(final int numHashBits, final long
numQueries);
+
+ /**
+ * Conducts a series of trials for different numbers of hashes. For each
number of hashes,
+ * it performs a set number of trials, calculates the false positive rate
and filter size,
+ * and processes the results. The results of each trial are appended to a
StringBuilder
+ * in a tab-separated format and printed.
+ *
+ * The number of hashes ranges from the minimum to the maximum number of
hashes specified
+ * in the class. The number of trials for each number of hashes is
determined by the
+ * getNumTrials method. The false positive rate and filter size are
calculated by
+ * averaging the results of the trials for each number of hashes.
+ *
+ * After the results are processed, they are printed and the number of
query points is
+ * updated for the next number of hashes.
+ * We need to increase the power of 2 for each trial set because the
failure probability decays
+ * with 2^(-x) when x is related to the number of bits per entry.
+ */
+ private void doTrials() {
+ final StringBuilder dataStr = new StringBuilder();
+ job.println(getHeader());
+ for (int nh= minNumHashes; nh <= maxNumHashes; nh++) {
+ fpr = 0;
+ filterNumBits = 0;
+ final int numTrials = getNumTrials(nh);
+ for (int t = 0; t < numTrials; t++) {
+ fpr += doTrial(nh, numQueries);
+ filterNumBits += getFilterLengthBits();
+ }
+ fpr /= numTrials;
+ filterNumBits /= numTrials;
+ bitsPerEntry = getBitsperEntry(nh);
+ process(nh, bitsPerEntry, fpr, filterNumBits, numQueries,
numTrials, dataStr);
+ job.println(dataStr.toString());
+ numQueries = (int)pwr2SeriesNext(1, 1L<<(nh+1));
+ }
+ }
+
+ /**
+ * Computes the number of trials for a given current number of uniques for
a
+ * trial set. This is used to decrease the number of trials
+ * as the number of uniques increase.
+ *
+ * @param curU the given current number of uniques for a trial set.
+ * @return the number of trials for a given current number of uniques for a
+ * trial set.
+ */
+ private int getNumTrials(final int curU) {
+ final int minBpU = 1 << lgMinBpU;
+ final int maxBpU = 1 << lgMaxBpU;
+ final int maxT = 1 << lgMaxT;
+ final int minT = 1 << lgMinT;
+ if (lgMinT == lgMaxT || curU <= minBpU) {
+ return maxT;
+ }
+ if (curU >= maxBpU) {
+ return minT;
+ }
+ final double lgCurU = log(curU) / LN2;
+ final double lgTrials = slope * (lgCurU - lgMinBpU) + lgMaxT;
+ return (int) pow(2.0, lgTrials);
+ }
+
+ /**
+ * Processes the results of a trial and appends them to a StringBuilder in
a tab-separated format.
+ *
+ * @param numHashes The number of hashes used in the trial.
+ * @param bitsPerEntry The number of bits per entry in the trial.
+ * @param falsePositiveRate The false positive rate observed in the trial.
+ * @param filterSizeBits The size of the filter used in the trial, in bits.
+ * @param numQueryPoints The number of query points used in the trial.
+ * @param numTrials The number of trials conducted.
+ * @param sb The StringBuilder to which the results are appended.
+ */
+ private static void process(final int numHashes, final int bitsPerEntry,
final double falsePositiveRate,
+ final long filterSizeBits, final long
numQueryPoints,
+ final long numTrials, final StringBuilder sb)
{
+ // OUTPUT
+ sb.setLength(0);
+ sb.append(numHashes).append(TAB);
+ sb.append(bitsPerEntry).append(TAB);
+ sb.append(String.format("%.5e", falsePositiveRate)).append(TAB);
+ sb.append(filterSizeBits).append(TAB);
+ sb.append(numQueryPoints).append(TAB);
+ sb.append(numTrials);
+ }
+
+ /**
+ * Returns a column header row
+ * @return a column header row
+ */
+ private String getHeader() {
+ final StringBuilder sb = new StringBuilder();
+ sb.append("numHashes").append(TAB);
+ sb.append("bitsPerEntry").append(TAB);
+ sb.append("FPR").append(TAB);
+ sb.append("filterSizeBits").append(TAB);
+ sb.append("numQueryPoints").append(TAB);
+ sb.append("numTrials");
+ return sb.toString();
+ }
+}
+
+
diff --git
a/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterAccuracyProfile.java
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterAccuracyProfile.java
new file mode 100644
index 0000000..0a9fc93
--- /dev/null
+++
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterAccuracyProfile.java
@@ -0,0 +1,65 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.bloomfilter.BloomFilter;
+import org.apache.datasketches.filters.bloomfilter.BloomFilterBuilder;
+
+import java.util.ArrayList;
+
+public class BloomFilterAccuracyProfile extends BaseFilterAccuracyProfile{
+
+ protected BloomFilter sketch;
+
+ protected long filterLengthBits;
+// protected int minNumBitsPerEntry;
+// protected int maxNumBitsPerEntry ;
+
+ @Override
+ public void configure() {
+ filterLengthBits = Integer.parseInt(prop.mustGet("filterLengthBits"));
+ }
+
+ /*
+ * The experiment hasa fixed number of bits per entry and
+ */
+ @Override
+ public double doTrial(final int numHashes, final long numQueries) {
+
+ // Initialise and populate the sketch
+ filterLengthBits = (long) ((numHashes * numItemsInserted) / LN2);
+ sketch = BloomFilterBuilder.createBySize(filterLengthBits, numHashes);
+
+ // Build the test sets but clear them first so that they have the
correct cardinality and no surplus is added.
+ inputItems.clear();
+ negativeItems.clear();
+ long item;
+ for (long i = 0; i < numItemsInserted; i++){
+ item = ++vIn;
+ inputItems.add(item) ;
+ sketch.update(item);
+ }
+
+ for (long i = numItemsInserted; i < numItemsInserted+numQueries; i++ )
{
+ item = ++vIn;
+ negativeItems.add(item) ;
+ }
+
+ // Check the number of false positives
+ long numFalsePositive = 0 ;
+ for (int i = 0; i < negativeItems.size(); i++){
+ if (sketch.query(negativeItems.get(i))) ++numFalsePositive ;
+ }
+ return (double)numFalsePositive / numQueries;
+ }
+
+ @Override
+ public int getBitsperEntry(final int numHashes) {
+ return (int) (numHashes / LN2);
+ }
+
+ @Override
+ public long getFilterLengthBits() {
+ return sketch.getCapacity();
+ }
+
+
+}
diff --git
a/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterAccuracyProfile.java
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterAccuracyProfile.java
new file mode 100644
index 0000000..9bf5872
--- /dev/null
+++
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterAccuracyProfile.java
@@ -0,0 +1,65 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.quotientfilter.QuotientFilter;
+import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder;
+
+import java.util.ArrayList;
+
+
+public class QuotientFilterAccuracyProfile extends BaseFilterAccuracyProfile{
+
+ protected QuotientFilter sketch;
+
+
+ @Override
+ public void configure() {}
+
+ /*
+ We use the numHashBits as being equivalent to the number of bits per entry
in the Quotient filter.
+ This value is 3 metadata bits plus the remaining number of bits for the
fingerprint length.
+ */
+ @Override
+ public double doTrial(final int numHashBits, final long numQueries) {
+ // Initialise and populate the sketch
+ sketch = new QuotientFilter(lgU, numHashBits);
+
+ // Build the test sets but clear them first so that they have the
correct cardinality and no surplus is added.
+ inputItems.clear();
+ negativeItems.clear();
+ long item;
+ for (long i = 0; i < numItemsInserted; i++){
+ item = ++vIn;
+ inputItems.add(item) ;
+ sketch.insert(item);
+ }
+
+ for (long i = numItemsInserted; i < numItemsInserted+numQueries; i++ )
{
+ item = ++vIn;
+ negativeItems.add(item) ;
+ }
+
+ // Check the number of false positives
+ long numFalsePositive = 0 ;
+ for (int i = 0; i < negativeItems.size(); i++){
+ if (sketch.search(negativeItems.get(i))) ++numFalsePositive ;
+ }
+ return (double)numFalsePositive / numQueries;
+ }
+
+ /*
+ The total number of bits per entry is the three metadata plus the
fingerprint length bits.
+ This is exactly the number of hash bits that is passed as a parameter.
+ */
+ @Override
+ public int getBitsperEntry(final int numHashBits) {
+ return numHashBits;
+ }
+
+ @Override
+ public long getFilterLengthBits() {
+ return sketch.get_space_use();
+ }
+
+
+}
+
diff --git a/src/main/resources/filters/BloomFilterAccuracyJob.conf
b/src/main/resources/filters/BloomFilterAccuracyJob.conf
new file mode 100644
index 0000000..2f55b3d
--- /dev/null
+++ b/src/main/resources/filters/BloomFilterAccuracyJob.conf
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+# The Bloom Filter Update Speed profile is evaluated by fixing a size in bits
for the filter
+# and inserting items up to a maximum input cardinality N.
+# Note that `final long numItems = 1L<<20` should correspond directly with
`Trials_lgMaxU`
+# in this configuration file.
+
+# Uniques Profile
+Universe_lgU=20 # Maximum log2 of the input set.
+Universe_capacity = 0.9 # this is used to get number of uniques inserted:
numUniques = Trials_capacity *(2^Trials_lgU)
+
+# Trials Profile
+Trials_lgMinT=4 #Min trials at tail (high counts) 4
+Trials_lgMaxT=11 #Min trials at tail (high counts) 4
+Trials_TPPO=1 #how often intermediate results are printed
+Trials_lgMinBpU=1 #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5 #stop the downward slope of trials at this LgU
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.BloomFilterAccuracyProfile
+filterLengthBits = 16777216
+minNumHashes = 4
+maxNumHashes = 24
+lgNumQueries = 20
diff --git a/src/main/resources/filters/QuotientFilterAccuracyJob.conf
b/src/main/resources/filters/QuotientFilterAccuracyJob.conf
new file mode 100644
index 0000000..37d4ef1
--- /dev/null
+++ b/src/main/resources/filters/QuotientFilterAccuracyJob.conf
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+# The Bloom Filter Update Speed profile is evaluated by fixing a size in bits
for the filter
+# and inserting items up to a maximum input cardinality N.
+# Note that `final long numItems = 1L<<20` should correspond directly with
`Trials_lgMaxU`
+# in this configuration file.
+
+# Uniques Profile
+Universe_lgU=20 # Maximum log2 of the input set.
+Universe_capacity = 0.75 # this is used to get number of uniques inserted:
numUniques = Trials_capacity *(2^Trials_lgU)
+
+
+# Trials Profile
+Trials_lgMinT=4 #Min trials at tail (high counts) 4
+Trials_lgMaxT=11 #Min trials at tail (high counts) 4
+Trials_TPPO=1 #how often intermediate results are printed
+Trials_lgMinBpU=1 #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5 #stop the downward slope of trials at this LgU
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.QuotientFilterAccuracyProfile
+minNumHashes = 4
+maxNumHashes = 24
+lgNumQueries = 20
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]