This is an automated email from the ASF dual-hosted git repository.

garydgregory pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/commons-collections.git


The following commit(s) were added to refs/heads/master by this push:
     new 943dd8da8 Fix integer overflow in SetOperations.cosineSimilarity (#693)
943dd8da8 is described below

commit 943dd8da8c4269fa48446935bc64ab5d7929184a
Author: Vasiliy Mikhailov <[email protected]>
AuthorDate: Sat Jun 27 17:32:05 2026 +0400

    Fix integer overflow in SetOperations.cosineSimilarity (#693)
    
    cosineSimilarity computed the product of the two cardinalities as an int
    before passing it to Math.sqrt:
    
        numerator / Math.sqrt(cardinality(first) * cardinality(second))
    
    The comment above the line notes the product is meant to be a double so it
    will not overflow, but without a cast the multiplication happens in int.
    For large inputs (each cardinality > 46341) the product exceeds
    Integer.MAX_VALUE and wraps to a negative value, so Math.sqrt returns NaN
    and the similarity (and cosineDistance) become NaN.
    
    Cast the first operand to double so the product is computed in double, as
    the comment intended. Both the BitMapExtractor and BloomFilter overloads
    had the same issue.
    
    Signed-off-by: Vasiliy Mikhailov <[email protected]>
---
 .../collections4/bloomfilter/SetOperations.java    |  4 ++--
 .../bloomfilter/SetOperationsTest.java             | 26 ++++++++++++++++++++++
 2 files changed, 28 insertions(+), 2 deletions(-)

diff --git 
a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java 
b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java
index e7ee2f10a..e9046cd9b 100644
--- 
a/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java
+++ 
b/src/main/java/org/apache/commons/collections4/bloomfilter/SetOperations.java
@@ -100,7 +100,7 @@ public final class SetOperations {
         final int numerator = andCardinality(first, second);
         // Given that the cardinality is an int then the product as a double 
will not
         // overflow, we can use one sqrt:
-        return numerator == 0 ? 0 : numerator / Math.sqrt(cardinality(first) * 
cardinality(second));
+        return numerator == 0 ? 0 : numerator / Math.sqrt((double) 
cardinality(first) * cardinality(second));
     }
 
     /**
@@ -123,7 +123,7 @@ public final class SetOperations {
         final int numerator = andCardinality(first, second);
         // Given that the cardinality is an int then the product as a double 
will not
         // overflow, we can use one sqrt:
-        return numerator == 0 ? 0 : numerator / Math.sqrt(first.cardinality() 
* second.cardinality());
+        return numerator == 0 ? 0 : numerator / Math.sqrt((double) 
first.cardinality() * second.cardinality());
     }
 
     /**
diff --git 
a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java
 
b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java
index fc3782df1..60da89457 100644
--- 
a/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java
+++ 
b/src/test/java/org/apache/commons/collections4/bloomfilter/SetOperationsTest.java
@@ -324,4 +324,30 @@ class SetOperationsTest {
         filter2 = createFilter(shape2, IndexExtractor.fromIndexArray(5, 64, 
169));
         assertSymmetricOperation(3, SetOperations::xorCardinality, filter1, 
filter2);
     }
+
+    @Test
+    final void testCosineSimilarityLargeCardinalityNoOverflow() {
+        // Create two identical BitMapExtractors with cardinality > 46341 each.
+        // Each long has 64 bits. We want cardinality = 50000.
+        // 781 * 64 = 49984, plus 16 more bits = 50000.
+        final long[] bitMaps = new long[782];
+        for (int i = 0; i < 781; i++) {
+            bitMaps[i] = -1L; // all 64 bits set
+        }
+        // Last word: 16 bits set = 0xFFFF
+        bitMaps[781] = 0xFFFFL;
+
+        final BitMapExtractor extractor = 
BitMapExtractor.fromBitMapArray(bitMaps);
+        // Verify cardinality is 50000
+        assertEquals(50000, SetOperations.cardinality(extractor));
+
+        // Cosine similarity of two identical non-empty extractors should be 
1.0.
+        // With integer overflow: 50000 * 50000 = 2,500,000,000 > 
Integer.MAX_VALUE, overflows to negative,
+        // Math.sqrt(negative) = NaN, so the result would be NaN instead of 
1.0.
+        final double similarity = SetOperations.cosineSimilarity(extractor, 
extractor);
+        assertEquals(1.0, similarity, 1e-9, "cosineSimilarity of identical 
large-cardinality extractors should be 1.0, not NaN");
+
+        final double distance = SetOperations.cosineDistance(extractor, 
extractor);
+        assertEquals(0.0, distance, 1e-9, "cosineDistance of identical 
large-cardinality extractors should be 0.0, not NaN");
+    }
 }

Reply via email to