This is an automated email from the ASF dual-hosted git repository.

charlie pushed a commit to branch quotient-filter
in repository https://gitbox.apache.org/repos/asf/datasketches-java.git

commit 7a8e8383c13252b24517c389d7bb0d307d32a82c
Author: Charlie Dickens <[email protected]>
AuthorDate: Thu May 2 11:14:26 2024 +0100

    Added initial QF builder functions
---
 .../quotientfilter/QuotientFilterBuilder.java      | 131 +++++++++++++++++++++
 .../quotientfilter/QuotientFilterBuilderTest.java  | 113 ++++++++++++++++++
 2 files changed, 244 insertions(+)

diff --git 
a/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java
 
b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java
new file mode 100644
index 00000000..0d1812d9
--- /dev/null
+++ 
b/src/main/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilder.java
@@ -0,0 +1,131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+import java.util.concurrent.ThreadLocalRandom;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+
+/**
+ * <p>This class provides methods to help estimate the correct parameters when
+ * creating a Quotient filter, and methods to create the filter using those 
values.</p>
+ *
+ * <p>The underlying math is described in the
+ *
+ * Wikipedia article on Quotient filters</a>.</p>
+ */
+public final class QuotientFilterBuilder {
+
+    /*
+    This function is used to suggest the number of bits per entry for a given 
number of entries.
+    The fingerprint length is related to the targetFalsePositiveProb roughly 
by 2^(-fingerprint_length).
+    Hence, the length of the fingerprint can be stored in at most 8 bits.
+    This, after rounding up, is the same as the more sophisticated expression 
which involves the capacity
+    from 
https://en.wikipedia.org/wiki/Quotient_filter#Probability_of_false_positives.
+    * @param targetFalsePositiveProb A desired false positive probability per 
item
+    * @return The suggested fingerprint length in bits
+     */
+    public static byte suggestFingerprintLength(double 
targetFalsePositiveProb) {
+        if (targetFalsePositiveProb <= 0. || targetFalsePositiveProb >= 1.) {
+
+            throw new SketchesArgumentException("targetFalsePositiveProb must 
be a valid probability and strictly greater than 0");
+        }
+        return (byte) Math.ceil(-Math.log(targetFalsePositiveProb) / 
Math.log(2));
+    }
+
+    /**
+     * This method suggests the number of slots in the filter for a given 
input size, assuming 90% capacity.
+     * There is no load factor checking internally within the filter, so this 
method is used to map between the
+     * number of items we insert into a sketch and the number of slots we need 
to allocate.
+     * A design feature of Niv's implementation is that 2^j +2*j slots are 
allocated. This asymptotically approaches
+     * 2^j slots as j grows, and the canonical number of slots is 2^j. 
Therefore, we will only check against
+     * 0.9*2^j slots.
+     * The load factor is 0.9 to get some space-utility advantages over the 
bloom filter.
+     */
+    public static byte suggestLgNumSlots(long maxDistinctItems) {
+        if (maxDistinctItems <= 0) {
+            throw new SketchesArgumentException("maxDistinctItems must be 
strictly positive");
+        }
+        byte result = (byte) Math.ceil(Math.log(maxDistinctItems / 0.9) / 
Math.log(2));
+        if (result < 31) {
+            return result;
+        } else {
+            // Largest address space for a Java array is 2^31 - 1
+            throw new SketchesArgumentException("Largest address space for a 
Java array is 2^31 - 1");
+        }
+    }
+
+    /*
+    Returns the largest number of unique items that can be inserted into the 
filter.
+    We use a predefined load factor of 0.9 compared to the number of slots as 
2^j.
+    @param lgNumSlots The log-base-2 of the number of slots in the filter
+    @return The maximum number of items that can be inserted into the filter
+     */
+    public static long suggestMaxNumItemsFromNumSlots(byte lgNumSlots) {
+        if (lgNumSlots <= 0) {
+            throw new SketchesArgumentException("lgNumSlots must be at least 
1.");
+        } else if (lgNumSlots >= 31) {
+            throw new SketchesArgumentException("lgNumSlots cannot exceed 2^31 
- 1.");
+        }
+        return (long) Math.floor(0.9 * Math.pow(2, lgNumSlots));
+    }
+
+
+    /**
+     * This method suggests the parameters for a Quotient filter based on the 
maximum number of distinct items and the target false positive probability.
+     * It first validates the inputs, then calculates the log-base-2 of the 
number of slots and the fingerprint length.
+     * The results are returned as a QFPair object.
+     *
+     * @param maxDistinctItems The maximum number of distinct items that can 
be inserted into the filter.
+     * @param targetFalsePositiveProb The desired false positive probability 
per item.
+     * @return A QFPair object containing the suggested number of slots 
(lgNumSlots) and the suggested fingerprint length.
+     * @throws SketchesArgumentException if the input parameters are not valid.
+     */
+    public static QFPair suggestParamsFromMaxDistinctsFPP(long 
maxDistinctItems, double targetFalsePositiveProb) {
+        validateAccuracyInputs(maxDistinctItems, targetFalsePositiveProb);
+        byte lgNumSlots = suggestLgNumSlots(maxDistinctItems);
+        byte fingerprintLength = 
suggestFingerprintLength(targetFalsePositiveProb);
+        return new QFPair(lgNumSlots, fingerprintLength);
+    }
+
+    private static void validateAccuracyInputs(final long maxDistinctItems, 
final double targetFalsePositiveProb) {
+        if (maxDistinctItems <= 0) {
+            throw new SketchesArgumentException("maxDistinctItems must be 
strictly positive");
+        }
+        if (targetFalsePositiveProb <= 0.0 || targetFalsePositiveProb > 1.0) {
+            throw new SketchesArgumentException("targetFalsePositiveProb must 
be a valid probability and strictly greater than 0");
+        }
+    }
+
+    /**
+     * Helper class to return a pair of parameters for a Quotient filter:
+     * the log-base-2 of the number of slots (lgNumSlots) and the fingerprint 
length.
+     * These parameters are used to configure the Quotient filter.
+     */
+    public static class QFPair {
+        public final byte lgNumSlots;
+        public final byte fingerprintLength;
+
+        public QFPair(byte lgNumSlots, byte fingerprintLength) {
+            this.lgNumSlots = lgNumSlots;
+            this.fingerprintLength = fingerprintLength;
+        }
+    }
+
+}
diff --git 
a/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java
 
b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java
new file mode 100644
index 00000000..4fc38b2b
--- /dev/null
+++ 
b/src/test/java/org/apache/datasketches/filters/quotientfilter/QuotientFilterBuilderTest.java
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.datasketches.filters.quotientfilter;
+
+import org.apache.datasketches.common.SketchesArgumentException;
+import org.apache.datasketches.filters.quotientfilter.QuotientFilterBuilder;
+import org.apache.datasketches.memory.WritableMemory;
+import org.testng.annotations.Test;
+
+import static org.testng.Assert.*;
+public class QuotientFilterBuilderTest {
+
+    @Test
+    public void testSuggestFingerprintLengthFromFPP(){
+        // invalid false positive rate
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestFingerprintLength(0.));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestFingerprintLength(1.));
+
+        // manually computed values based on formula using 
ceil(log2(1/targetFalsePositiveProb))
+        double[] fpps = {0.1, 0.01, 0.001, 0.0001, 1E-5, 1E-6, 1E-7, 1E-8};
+        byte[] results = {4, 7, 10, 14, 17, 20, 24, 27, 30};
+        for (int i = 0; i < fpps.length; i++) {
+            
assertEquals(QuotientFilterBuilder.suggestFingerprintLength(fpps[i]), 
results[i]);
+        }
+    }
+
+    @Test
+    public static void testSuggestLgNumSlots(){
+        QuotientFilterBuilder qfb = new QuotientFilterBuilder();
+
+        // invalid number of items
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestLgNumSlots(0));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestLgNumSlots(-1));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestLgNumSlots(5000000000L));
+
+        long[] numItems = {1, 100, 1000, 1000000L};
+        int[] results = {1, 7, 11, 21} ;
+
+        for (int i = 0; i < numItems.length; i++) {
+            long num = numItems[i];
+            byte result = qfb.suggestLgNumSlots(num);
+            assertEquals(result, results[i]);
+        }
+    }
+
+    @Test
+    public static void testSuggestMaxNumItems(){
+        QuotientFilterBuilder qfb = new QuotientFilterBuilder();
+
+        // invalid number of slots
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)-127));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)0));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestMaxNumItemsFromNumSlots((byte)32));
+
+
+        byte[] lgNumSlots = {1, 2, 3, 6, 10, 15, 25, 30,};
+        long[] results = {1, 3, 7, 57, 921, 29491, 30198988, 966367641} ;
+
+        for (int i = 0; i < lgNumSlots.length; i++) {
+            long result = qfb.suggestMaxNumItemsFromNumSlots(lgNumSlots[i]);
+            assertEquals(result, results[i]);
+        }
+    }
+
+    @Test
+    public static void testSuggestParamsFromMaxDistinctsFPP(){
+
+        // invalid number of slots
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, 0.0001));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 0.));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(100000000, 1.5));
+        assertThrows(SketchesArgumentException.class, () -> 
QuotientFilterBuilder.suggestParamsFromMaxDistinctsFPP(5000000000L, -1.));
+
+
+        QuotientFilterBuilder qfb = new QuotientFilterBuilder();
+        byte lgNumSlots ;
+        byte fingerprintLength ;
+        long[] numItems = {1L, 900L, 500_000_000L} ;
+        double[] fpp = {1E-10, 1E-2, 1e-7} ;
+
+        // expected outcomes
+        byte[] expected_lgNumSlots = {1, 10, 30} ;
+        byte[] expected_fingerprintLength = {34, 7, 24} ;
+
+        for (int i = 0; i < numItems.length; i++) {
+            QuotientFilterBuilder.QFPair pair = 
qfb.suggestParamsFromMaxDistinctsFPP(numItems[i], fpp[i]);
+            lgNumSlots = pair.lgNumSlots;
+            fingerprintLength = pair.fingerprintLength;
+            assertEquals(expected_lgNumSlots[i], lgNumSlots);
+            assertEquals(expected_fingerprintLength[i], fingerprintLength);
+        }
+    }
+
+
+
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to