This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 4bbd48db8f GH-38366: [Java] Fix Murmur hash on buffers less than 4
bytes (#38368)
4bbd48db8f is described below
commit 4bbd48db8f3937c3e2f54f4c1131fd6af5a6b85c
Author: Chris Larsen <[email protected]>
AuthorDate: Fri Oct 20 13:30:14 2023 -0700
GH-38366: [Java] Fix Murmur hash on buffers less than 4 bytes (#38368)
### Rationale for this change
Using the `MurmurHash` implementation would cause collisions on small input
values.
### What changes are included in this PR?
Fix the iteration for small and tail values that are not 4 bytes in length.
### Are these changes tested?
Yes
### Are there any user-facing changes?
Unlikely unless someone was using the `MurmurHash` functions to persist a
hash value.
* Closes: #38366
Authored-by: Chris Larsen <[email protected]>
Signed-off-by: David Li <[email protected]>
---
.../arrow/memory/util/hash/MurmurHasher.java | 2 +-
.../arrow/memory/util/hash/TestArrowBufHasher.java | 22 ++++++++++++++++++++++
2 files changed, 23 insertions(+), 1 deletion(-)
diff --git
a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
index ea565dfca6..75fc3f0c45 100644
---
a/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
+++
b/java/memory/memory-core/src/main/java/org/apache/arrow/memory/util/hash/MurmurHasher.java
@@ -96,7 +96,7 @@ public class MurmurHasher implements ArrowBufHasher {
if (index < length) {
// process remaining data as a integer in little endian
int intValue = 0;
- for (int i = index - 1; i >= index; i--) {
+ for (long i = length - 1; i >= index; i--) {
intValue <<= 8;
intValue |= (MemoryUtil.UNSAFE.getByte(address + i) & 0x000000ff);
index += 1;
diff --git
a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
index a8707e6ca9..3da0602bdf 100644
---
a/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
+++
b/java/memory/memory-core/src/test/java/org/apache/arrow/memory/util/hash/TestArrowBufHasher.java
@@ -18,8 +18,10 @@
package org.apache.arrow.memory.util.hash;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
+import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collection;
@@ -110,6 +112,26 @@ public class TestArrowBufHasher {
}
}
+ @Test
+ public void testHasherLessThanInt() {
+ try (ArrowBuf buf1 = allocator.buffer(4);
+ ArrowBuf buf2 = allocator.buffer(4)) {
+ buf1.writeBytes("foo1".getBytes(StandardCharsets.UTF_8));
+ buf2.writeBytes("bar2".getBytes(StandardCharsets.UTF_8));
+
+ for (int i = 1; i <= 4; i ++) {
+ verifyHashCodeNotEqual(buf1, 0, i, buf2, 0, i);
+ }
+ }
+ }
+
+ private void verifyHashCodeNotEqual(ArrowBuf buf1, int offset1, int length1,
+ ArrowBuf buf2, int offset2, int length2)
{
+ int hashCode1 = hasher.hashCode(buf1, 0, length1);
+ int hashCode2 = hasher.hashCode(buf2, 0, length2);
+ assertNotEquals(hashCode1, hashCode2);
+ }
+
@Parameterized.Parameters(name = "hasher = {0}")
public static Collection<Object[]> getHasher() {
return Arrays.asList(