(datasketches-characterization) branch master updated: Added filter accuracy scripts

charlie Wed, 12 Jun 2024 09:52:53 -0700

This is an automated email from the ASF dual-hosted git repository.

charlie pushed a commit to branch master
in repository 
https://gitbox.apache.org/repos/asf/datasketches-characterization.git



The following commit(s) were added to refs/heads/master by this push:
     new 87380c0  Added filter accuracy scripts
87380c0 is described below

commit 87380c0d7b509b927d0694a09351410fe6d26a41
Author: Charlie Dickens <[email protected]>
AuthorDate: Wed Jun 12 17:52:20 2024 +0100

    Added filter accuracy scripts
---
 .../filters/BaseFilterAccuracyProfile.java         | 199 +++++++++++++++++++++
 .../filters/BloomFilterAccuracyProfile.java        |  65 +++++++
 .../filters/QuotientFilterAccuracyProfile.java     |  65 +++++++
 .../resources/filters/BloomFilterAccuracyJob.conf  |  47 +++++
 .../filters/QuotientFilterAccuracyJob.conf         |  47 +++++
 5 files changed, 423 insertions(+)

diff --git 
a/src/main/java/org/apache/datasketches/characterization/filters/BaseFilterAccuracyProfile.java
 
b/src/main/java/org/apache/datasketches/characterization/filters/BaseFilterAccuracyProfile.java
new file mode 100644
index 0000000..2304b73
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/characterization/filters/BaseFilterAccuracyProfile.java
@@ -0,0 +1,199 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.Job;
+import org.apache.datasketches.JobProfile;
+import org.apache.datasketches.Properties;
+
+import java.util.ArrayList;
+
+import static java.lang.Math.log;
+import static java.lang.Math.pow;
+import static org.apache.datasketches.common.Util.pwr2SeriesNext;
+
+public abstract class BaseFilterAccuracyProfile implements JobProfile{
+
+    Job job;
+    public Properties prop;
+    public long vIn = 1;
+    public int lgU;
+    public double capacity;
+    public long numItemsInserted;
+    public ArrayList<Long> inputItems = new ArrayList<>();
+    public ArrayList<Long> negativeItems = new ArrayList<>();
+    long numQueries;
+
+    int minNumHashes;
+    int maxNumHashes ;
+    int bitsPerEntry;
+
+    int lgMinT;
+    int lgMaxT;
+    int tPPO;
+    int numTrials ;
+    double fpr;
+    long filterNumBits;
+    int lgMinBpU;
+    int lgMaxBpU;
+    double slope;
+
+    //JobProfile
+    @Override
+    public void start(final Job job) {
+        this.job = job;
+        prop = job.getProperties();
+        lgU = Integer.parseInt(prop.mustGet("Universe_lgU"));
+        capacity = Double.parseDouble(prop.mustGet("Universe_capacity"));
+        minNumHashes = Integer.parseInt(prop.mustGet("minNumHashes"));
+        maxNumHashes = Integer.parseInt(prop.mustGet("maxNumHashes"));
+        lgMinT = Integer.parseInt(prop.mustGet("Trials_lgMinT"));
+        lgMaxT = Integer.parseInt(prop.mustGet("Trials_lgMaxT"));
+        lgMinBpU = Integer.parseInt(prop.mustGet("Trials_lgMinBpU"));
+        lgMaxBpU = Integer.parseInt(prop.mustGet("Trials_lgMaxBpU"));
+        slope = (double) (lgMaxT - lgMinT) / (lgMinBpU - lgMaxBpU);
+        tPPO = Integer.parseInt(prop.mustGet("Trials_TPPO"));
+        numQueries = 1<<minNumHashes+1; // starting value for the number of 
query points.
+        numTrials = 1<<lgMinT;
+        numItemsInserted = (long) Math.round(capacity * (1L << lgU));
+
+        configure();
+        doTrials();
+        shutdown();
+        cleanup();
+    }
+
+    @Override
+    public void shutdown() {}
+
+    @Override
+    public void cleanup() {}
+    //end JobProfile
+
+    /**
+     * Configure the sketch
+     */
+    public abstract void configure();
+
+    /**
+     * Used to get the size of the filter in bits for the current trial.
+     */
+    public abstract long getFilterLengthBits();
+
+
+    /**
+     * @return the number of bits per entry for the filter.
+     */
+    public abstract int getBitsperEntry(final int numHashes);
+
+    /**
+     * This method is used to perform a trial with a specific number of hashes 
and queries.
+     *
+     * @param numHashBits The number of hashes to be used in the trial.
+     * See the specific profile for how this is implemented as the number of 
hashes
+     * might have different meanings depending on the implementation.
+     * @param numQueries The number of queries to be performed in the trial.
+     * @return The average update time per update for this trial.
+     */
+    public abstract double doTrial(final int numHashBits, final long 
numQueries);
+
+    /**
+     * Conducts a series of trials for different numbers of hashes. For each 
number of hashes,
+     * it performs a set number of trials, calculates the false positive rate 
and filter size,
+     * and processes the results. The results of each trial are appended to a 
StringBuilder
+     * in a tab-separated format and printed.
+     *
+     * The number of hashes ranges from the minimum to the maximum number of 
hashes specified
+     * in the class. The number of trials for each number of hashes is 
determined by the
+     * getNumTrials method. The false positive rate and filter size are 
calculated by
+     * averaging the results of the trials for each number of hashes.
+     *
+     * After the results are processed, they are printed and the number of 
query points is
+     * updated for the next number of hashes.
+     * We need to increase the power of 2 for each trial set because the 
failure probability decays
+     * with 2^(-x) when x is related to the number of bits per entry.
+     */
+    private void doTrials() {
+        final StringBuilder dataStr = new StringBuilder();
+        job.println(getHeader());
+        for (int nh= minNumHashes; nh <= maxNumHashes; nh++) {
+            fpr = 0;
+            filterNumBits = 0;
+            final int numTrials = getNumTrials(nh);
+            for (int t = 0; t < numTrials; t++) {
+                fpr += doTrial(nh, numQueries);
+                filterNumBits += getFilterLengthBits();
+            }
+            fpr /= numTrials;
+            filterNumBits /= numTrials;
+            bitsPerEntry = getBitsperEntry(nh);
+            process(nh, bitsPerEntry, fpr, filterNumBits, numQueries, 
numTrials, dataStr);
+            job.println(dataStr.toString());
+            numQueries = (int)pwr2SeriesNext(1, 1L<<(nh+1));
+        }
+    }
+
+    /**
+     * Computes the number of trials for a given current number of uniques for 
a
+     * trial set. This is used to decrease the number of trials
+     * as the number of uniques increase.
+     *
+     * @param curU the given current number of uniques for a trial set.
+     * @return the number of trials for a given current number of uniques for a
+     * trial set.
+     */
+    private int getNumTrials(final int curU) {
+        final int minBpU = 1 << lgMinBpU;
+        final int maxBpU = 1 << lgMaxBpU;
+        final int maxT = 1 << lgMaxT;
+        final int minT = 1 << lgMinT;
+        if (lgMinT == lgMaxT || curU <= minBpU) {
+            return maxT;
+        }
+        if (curU >= maxBpU) {
+            return minT;
+        }
+        final double lgCurU = log(curU) / LN2;
+        final double lgTrials = slope * (lgCurU - lgMinBpU) + lgMaxT;
+        return (int) pow(2.0, lgTrials);
+    }
+
+    /**
+     * Processes the results of a trial and appends them to a StringBuilder in 
a tab-separated format.
+     *
+     * @param numHashes The number of hashes used in the trial.
+     * @param bitsPerEntry The number of bits per entry in the trial.
+     * @param falsePositiveRate The false positive rate observed in the trial.
+     * @param filterSizeBits The size of the filter used in the trial, in bits.
+     * @param numQueryPoints The number of query points used in the trial.
+     * @param numTrials The number of trials conducted.
+     * @param sb The StringBuilder to which the results are appended.
+     */
+    private static void process(final int numHashes, final int bitsPerEntry, 
final double falsePositiveRate,
+                                final long filterSizeBits, final long 
numQueryPoints,
+                                final long numTrials,  final StringBuilder sb) 
{
+        // OUTPUT
+        sb.setLength(0);
+        sb.append(numHashes).append(TAB);
+        sb.append(bitsPerEntry).append(TAB);
+        sb.append(String.format("%.5e", falsePositiveRate)).append(TAB);
+        sb.append(filterSizeBits).append(TAB);
+        sb.append(numQueryPoints).append(TAB);
+        sb.append(numTrials);
+    }
+
+    /**
+     * Returns a column header row
+     * @return a column header row
+     */
+    private String getHeader() {
+        final StringBuilder sb = new StringBuilder();
+        sb.append("numHashes").append(TAB);
+        sb.append("bitsPerEntry").append(TAB);
+        sb.append("FPR").append(TAB);
+        sb.append("filterSizeBits").append(TAB);
+        sb.append("numQueryPoints").append(TAB);
+        sb.append("numTrials");
+        return sb.toString();
+    }
+}
+
+
diff --git 
a/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterAccuracyProfile.java
 
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterAccuracyProfile.java
new file mode 100644
index 0000000..0a9fc93
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/characterization/filters/BloomFilterAccuracyProfile.java
@@ -0,0 +1,65 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.bloomfilter.BloomFilter;
+import org.apache.datasketches.filters.bloomfilter.BloomFilterBuilder;
+
+import java.util.ArrayList;
+
+public class BloomFilterAccuracyProfile extends BaseFilterAccuracyProfile{
+
+    protected BloomFilter sketch;
+
+    protected long filterLengthBits;
+//    protected int minNumBitsPerEntry;
+//    protected int maxNumBitsPerEntry ;
+
+    @Override
+    public void configure() {
+        filterLengthBits = Integer.parseInt(prop.mustGet("filterLengthBits"));
+    }
+
+    /*
+     * The experiment hasa fixed number of bits per entry and
+     */
+    @Override
+    public double doTrial(final int  numHashes, final long numQueries) {
+
+        // Initialise and populate the sketch
+        filterLengthBits = (long) ((numHashes * numItemsInserted) / LN2);
+        sketch =  BloomFilterBuilder.createBySize(filterLengthBits, numHashes);
+
+        // Build the test sets but clear them first so that they have the 
correct cardinality and no surplus is added.
+        inputItems.clear();
+        negativeItems.clear();
+        long item;
+        for (long i = 0; i < numItemsInserted; i++){
+            item = ++vIn;
+            inputItems.add(item) ;
+            sketch.update(item);
+        }
+
+        for (long i = numItemsInserted; i < numItemsInserted+numQueries; i++ ) 
{
+            item = ++vIn;
+            negativeItems.add(item) ;
+        }
+
+        // Check the number of false positives
+        long numFalsePositive = 0 ;
+        for (int i = 0; i < negativeItems.size(); i++){
+            if (sketch.query(negativeItems.get(i))) ++numFalsePositive ;
+        }
+        return (double)numFalsePositive / numQueries;
+    }
+
+    @Override
+    public int getBitsperEntry(final int numHashes) {
+        return (int) (numHashes / LN2);
+    }
+
+    @Override
+    public long getFilterLengthBits() {
+        return sketch.getCapacity();
+    }
+
+
+}
diff --git 
a/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterAccuracyProfile.java
 
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterAccuracyProfile.java
new file mode 100644
index 0000000..9bf5872
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/characterization/filters/QuotientFilterAccuracyProfile.java
@@ -0,0 +1,65 @@
+package org.apache.datasketches.characterization.filters;
+
+import org.apache.datasketches.filters.quotientfilter.QuotientFilter;
+import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder;
+
+import java.util.ArrayList;
+
+
+public class QuotientFilterAccuracyProfile extends BaseFilterAccuracyProfile{
+
+    protected QuotientFilter sketch;
+
+
+    @Override
+    public void configure() {}
+
+    /*
+    We use the numHashBits as being equivalent to the number of bits per entry 
in the Quotient filter.
+    This value is 3 metadata bits plus the remaining number of bits for the 
fingerprint length.
+     */
+    @Override
+    public double doTrial(final int numHashBits, final long numQueries) {
+        // Initialise and populate the sketch
+        sketch = new QuotientFilter(lgU, numHashBits);
+
+        // Build the test sets but clear them first so that they have the 
correct cardinality and no surplus is added.
+        inputItems.clear();
+        negativeItems.clear();
+        long item;
+        for (long i = 0; i < numItemsInserted; i++){
+            item = ++vIn;
+            inputItems.add(item) ;
+            sketch.insert(item);
+        }
+
+        for (long i = numItemsInserted; i < numItemsInserted+numQueries; i++ ) 
{
+            item = ++vIn;
+            negativeItems.add(item) ;
+        }
+
+        // Check the number of false positives
+        long numFalsePositive = 0 ;
+        for (int i = 0; i < negativeItems.size(); i++){
+            if (sketch.search(negativeItems.get(i))) ++numFalsePositive ;
+        }
+        return (double)numFalsePositive / numQueries;
+    }
+
+    /*
+    The total number of bits per entry is the three metadata plus the 
fingerprint length bits.
+    This is exactly the number of hash bits that is passed as a parameter.
+     */
+    @Override
+    public int getBitsperEntry(final int numHashBits) {
+        return numHashBits;
+    }
+
+    @Override
+    public long getFilterLengthBits() {
+        return sketch.get_space_use();
+    }
+
+
+}
+
diff --git a/src/main/resources/filters/BloomFilterAccuracyJob.conf 
b/src/main/resources/filters/BloomFilterAccuracyJob.conf
new file mode 100644
index 0000000..2f55b3d
--- /dev/null
+++ b/src/main/resources/filters/BloomFilterAccuracyJob.conf
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+# The Bloom Filter Update Speed profile is evaluated by fixing a size in bits 
for the filter
+# and inserting items up to a maximum input cardinality N.
+# Note that `final long numItems = 1L<<20` should correspond directly with 
`Trials_lgMaxU`
+# in this configuration file.
+
+# Uniques Profile
+Universe_lgU=20 # Maximum log2 of the input set.
+Universe_capacity = 0.9 # this is used to get number of uniques inserted: 
numUniques = Trials_capacity *(2^Trials_lgU)
+
+# Trials Profile
+Trials_lgMinT=4  #Min trials at tail (high counts) 4
+Trials_lgMaxT=11  #Min trials at tail (high counts) 4
+Trials_TPPO=1     #how often intermediate results are printed
+Trials_lgMinBpU=1   #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5  #stop the downward slope of trials at this LgU
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.BloomFilterAccuracyProfile
+filterLengthBits = 16777216
+minNumHashes = 4
+maxNumHashes = 24
+lgNumQueries = 20
diff --git a/src/main/resources/filters/QuotientFilterAccuracyJob.conf 
b/src/main/resources/filters/QuotientFilterAccuracyJob.conf
new file mode 100644
index 0000000..37d4ef1
--- /dev/null
+++ b/src/main/resources/filters/QuotientFilterAccuracyJob.conf
@@ -0,0 +1,47 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Job
+
+# The Bloom Filter Update Speed profile is evaluated by fixing a size in bits 
for the filter
+# and inserting items up to a maximum input cardinality N.
+# Note that `final long numItems = 1L<<20` should correspond directly with 
`Trials_lgMaxU`
+# in this configuration file.
+
+# Uniques Profile
+Universe_lgU=20 # Maximum log2 of the input set.
+Universe_capacity = 0.75 # this is used to get number of uniques inserted: 
numUniques = Trials_capacity *(2^Trials_lgU)
+
+
+# Trials Profile
+Trials_lgMinT=4  #Min trials at tail (high counts) 4
+Trials_lgMaxT=11  #Min trials at tail (high counts) 4
+Trials_TPPO=1     #how often intermediate results are printed
+Trials_lgMinBpU=1   #start the downward slope of trials at this LgU
+Trials_lgMaxBpU=5  #stop the downward slope of trials at this LgU
+
+# Date-Time Profile
+TimeZone=PST
+TimeZoneOffset=-28800000 # offset in millisec
+FileNameDateFormat=yyyyMMdd'_'HHmmssz
+ReadableDateFormat=yyyy/MM/dd HH:mm:ss z
+
+#Job Profile
+JobProfile=org.apache.datasketches.characterization.filters.QuotientFilterAccuracyProfile
+minNumHashes = 4
+maxNumHashes = 24
+lgNumQueries = 20


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(datasketches-characterization) branch master updated: Added filter accuracy scripts

Reply via email to