spark git commit: [SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash on UTF8String

wenchen Thu, 06 Sep 2018 00:29:18 -0700

Repository: spark
Updated Branches:
  refs/heads/branch-2.4 d749d034a -> b632e775c



[SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash on UTF8String

## What changes were proposed in this pull request?

SPARK-10399 introduced a performance regression on the hash computation for 
UTF8String.

The regression can be evaluated with the code attached in the JIRA. That code 
runs in about 120 us per method on my laptop (MacBook Pro 2.5 GHz Intel Core 
i7, RAM 16 GB 1600 MHz DDR3) while the code from branch 2.3 takes on the same 
machine about 45 us for me. After the PR, the code takes about 45 us on the 
master branch too.

## How was this patch tested?

running the perf test from the JIRA

Closes #22338 from mgaido91/SPARK-25317.

Authored-by: Marco Gaido <marcogaid...@gmail.com>
Signed-off-by: Wenchen Fan <wenc...@databricks.com>
(cherry picked from commit 64c314e22fecca1ca3fe32378fc9374d8485deec)
Signed-off-by: Wenchen Fan <wenc...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b632e775
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b632e775
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b632e775

Branch: refs/heads/branch-2.4
Commit: b632e775cc057492ebba6b65647d90908aa00421
Parents: d749d03
Author: Marco Gaido <marcogaid...@gmail.com>
Authored: Thu Sep 6 15:27:59 2018 +0800
Committer: Wenchen Fan <wenc...@databricks.com>
Committed: Thu Sep 6 15:28:39 2018 +0800

----------------------------------------------------------------------
 .../spark/unsafe/hash/Murmur3_x86_32.java       | 23 ++++++++++++--------
 1 file changed, 14 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b632e775/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
----------------------------------------------------------------------
diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
index aff6e93..566f116 100644
--- 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
+++ 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java
@@ -19,6 +19,7 @@ package org.apache.spark.unsafe.hash;
 
 import com.google.common.primitives.Ints;
 
+import org.apache.spark.unsafe.Platform;
 import org.apache.spark.unsafe.memory.MemoryBlock;
 import org.apache.spark.unsafe.types.UTF8String;
 
@@ -59,7 +60,7 @@ public final class Murmur3_x86_32 {
     // This is based on Guava's `Murmur32_Hasher.processRemaining(ByteBuffer)` 
method.
     int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes % 8 == 0): "lengthInBytes must be a multiple of 8 
(word-aligned)";
-    int h1 = hashBytesByIntBlock(base, seed);
+    int h1 = hashBytesByIntBlock(base, lengthInBytes, seed);
     return fmix(h1, lengthInBytes);
   }
 
@@ -69,14 +70,19 @@ public final class Murmur3_x86_32 {
   }
 
   public static int hashUnsafeBytesBlock(MemoryBlock base, int seed) {
+    return hashUnsafeBytesBlock(base, Ints.checkedCast(base.size()), seed);
+  }
+
+  private static int hashUnsafeBytesBlock(MemoryBlock base, int lengthInBytes, 
int seed) {
     // This is not compatible with original and another implementations.
     // But remain it for backward compatibility for the components existing 
before 2.3.
-    int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes >= 0): "lengthInBytes cannot be negative";
     int lengthAligned = lengthInBytes - lengthInBytes % 4;
-    int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
+    int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
+    long offset = base.getBaseOffset();
+    Object o = base.getBaseObject();
     for (int i = lengthAligned; i < lengthInBytes; i++) {
-      int halfWord = base.getByte(i);
+      int halfWord = Platform.getByte(o, offset + i);
       int k1 = mixK1(halfWord);
       h1 = mixH1(h1, k1);
     }
@@ -84,7 +90,7 @@ public final class Murmur3_x86_32 {
   }
 
   public static int hashUTF8String(UTF8String str, int seed) {
-    return hashUnsafeBytesBlock(str.getMemoryBlock(), seed);
+    return hashUnsafeBytesBlock(str.getMemoryBlock(), str.numBytes(), seed);
   }
 
   public static int hashUnsafeBytes(Object base, long offset, int 
lengthInBytes, int seed) {
@@ -101,7 +107,7 @@ public final class Murmur3_x86_32 {
     int lengthInBytes = Ints.checkedCast(base.size());
     assert (lengthInBytes >= 0) : "lengthInBytes cannot be negative";
     int lengthAligned = lengthInBytes - lengthInBytes % 4;
-    int h1 = hashBytesByIntBlock(base.subBlock(0, lengthAligned), seed);
+    int h1 = hashBytesByIntBlock(base, lengthAligned, seed);
     int k1 = 0;
     for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) 
{
       k1 ^= (base.getByte(i) & 0xFF) << shift;
@@ -110,11 +116,10 @@ public final class Murmur3_x86_32 {
     return fmix(h1, lengthInBytes);
   }
 
-  private static int hashBytesByIntBlock(MemoryBlock base, int seed) {
-    long lengthInBytes = base.size();
+  private static int hashBytesByIntBlock(MemoryBlock base, int lengthInBytes, 
int seed) {
     assert (lengthInBytes % 4 == 0);
     int h1 = seed;
-    for (long i = 0; i < lengthInBytes; i += 4) {
+    for (int i = 0; i < lengthInBytes; i += 4) {
       int halfWord = base.getInt(i);
       int k1 = mixK1(halfWord);
       h1 = mixH1(h1, k1);


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-25317][CORE] Avoid perf regression in Murmur3 Hash on UTF8String

Reply via email to