This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 4bbd48db8f GH-38366: [Java] Fix Murmur hash on buffers less than 4 
bytes (#38368)
4bbd48db8f is described below

commit 4bbd48db8f3937c3e2f54f4c1131fd6af5a6b85c
Author: Chris Larsen <[email protected]>
AuthorDate: Fri Oct 20 13:30:14 2023 -0700

    GH-38366: [Java] Fix Murmur hash on buffers less than 4 bytes (#38368)
    
    
    
    ### Rationale for this change
    
    Using the `MurmurHash` implementation would cause collisions on small input 
values.
    
    ### What changes are included in this PR?
    
    Fix the iteration for small and tail values that are not 4 bytes in length.
    
    ### Are these changes tested?
    
    Yes
    
    ### Are there any user-facing changes?
    Unlikely unless someone was using the `MurmurHash` functions to persist a 
hash value.
    
    * Closes: #38366
    
    Authored-by: Chris Larsen <[email protected]>
    Signed-off-by: David Li <[email protected]>
---
 .../arrow/memory/util/hash/MurmurHasher.java       |  2 +-
 .../arrow/memory/util/hash/TestArrowBufHasher.java | 22 ++++++++++++++++++++++
 2 files changed, 23 insertions(+), 1 deletion(-)

diff --git 
a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
 
b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
index ea565dfca6..75fc3f0c45 100644
--- 
a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
+++ 
b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
@@ -96,7 +96,7 @@ public class MurmurHasher implements ArrowBufHasher {
     if (index < length) {
       // process remaining data as a integer in little endian
       int intValue = 0;
-      for (int i = index - 1; i >= index; i--) {
+      for (long i = length - 1; i >= index; i--) {
         intValue <<= 8;
         intValue |= (MemoryUtil.UNSAFE.getByte(address + i) & 0x000000ff);
         index += 1;
diff --git 
a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
 
b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
index a8707e6ca9..3da0602bdf 100644
--- 
a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
+++ 
b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
@@ -18,8 +18,10 @@
 package org.apache.arrow.memory.util.hash;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
 import static org.junit.jupiter.api.Assertions.assertThrows;
 
+import java.nio.charset.StandardCharsets;
 import java.util.Arrays;
 import java.util.Collection;
 
@@ -110,6 +112,26 @@ public class TestArrowBufHasher {
     }
   }
 
+  @Test
+  public void testHasherLessThanInt() {
+    try (ArrowBuf buf1 = allocator.buffer(4);
+         ArrowBuf buf2 = allocator.buffer(4)) {
+      buf1.writeBytes("foo1".getBytes(StandardCharsets.UTF_8));
+      buf2.writeBytes("bar2".getBytes(StandardCharsets.UTF_8));
+
+      for (int i = 1; i <= 4; i ++) {
+        verifyHashCodeNotEqual(buf1, 0, i, buf2, 0, i);
+      }
+    }
+  }
+
+  private void verifyHashCodeNotEqual(ArrowBuf buf1, int offset1, int length1,
+                                      ArrowBuf buf2, int offset2, int length2) 
{
+    int hashCode1 = hasher.hashCode(buf1, 0, length1);
+    int hashCode2 = hasher.hashCode(buf2, 0, length2);
+    assertNotEquals(hashCode1, hashCode2);
+  }
+
   @Parameterized.Parameters(name = "hasher = {0}")
   public static Collection<Object[]> getHasher() {
     return Arrays.asList(

Reply via email to