spark git commit: [SPARK-23381][CORE] Murmur3 hash generates a different value from other implementations
Repository: spark Updated Branches: refs/heads/master 0a73aa31f -> d5ed2108d [SPARK-23381][CORE] Murmur3 hash generates a different value from other implementations ## What changes were proposed in this pull request? Murmur3 hash generates a different value from the original and other implementations (such as the Scala standard library and Guava) when the length of a byte array is not a multiple of 4. ## How was this patch tested? Added a unit test. **Note: When we merge this PR, please give all the credits to Shintaro Murakami.** Author: Shintaro Murakami Author: gatorsmile Author: Shintaro Murakami Closes #20630 from gatorsmile/pr-20568. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d5ed2108 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d5ed2108 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d5ed2108 Branch: refs/heads/master Commit: d5ed2108d32e1d95b26ee7fed39e8a733e935e2c Parents: 0a73aa3 Author: Shintaro Murakami Authored: Fri Feb 16 17:17:55 2018 -0800 Committer: gatorsmile Committed: Fri Feb 16 17:17:55 2018 -0800 -- .../spark/util/sketch/Murmur3_x86_32.java | 16 ++ .../spark/unsafe/hash/Murmur3_x86_32.java | 16 ++ .../spark/unsafe/hash/Murmur3_x86_32Suite.java | 19 +++ .../apache/spark/ml/feature/FeatureHasher.scala | 33 +++- .../apache/spark/mllib/feature/HashingTF.scala | 2 +- .../spark/ml/feature/FeatureHasherSuite.scala | 11 ++- python/pyspark/ml/feature.py | 4 +-- 7 files changed, 96 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d5ed2108/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java -- diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java index a61ce4f..e83b331 100644 --- a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java +++ 
b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java @@ -60,6 +60,8 @@ final class Murmur3_x86_32 { } public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { +// This is not compatible with original and another implementations. +// But remain it for backward compatibility for the components existing before 2.3. assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; int h1 = hashBytesByInt(base, offset, lengthAligned, seed); @@ -71,6 +73,20 @@ final class Murmur3_x86_32 { return fmix(h1, lengthInBytes); } + public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed) { +// This is compatible with original and another implementations. +// Use this method for new components after Spark 2.3. +assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; +int lengthAligned = lengthInBytes - lengthInBytes % 4; +int h1 = hashBytesByInt(base, offset, lengthAligned, seed); +int k1 = 0; +for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) { + k1 ^= (Platform.getByte(base, offset + i) & 0xFF) << shift; +} +h1 ^= mixK1(k1); +return fmix(h1, lengthInBytes); + } + private static int hashBytesByInt(Object base, long offset, int lengthInBytes, int seed) { assert (lengthInBytes % 4 == 0); int h1 = seed; http://git-wip-us.apache.org/repos/asf/spark/blob/d5ed2108/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index 5e7ee48..d239de6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -60,6 +60,8 @@ public final class Murmur3_x86_32 { } public static int hashUnsafeBytes(Object base, long 
offset, int lengthInBytes, int seed) { +// This is not compatible with original and another implementations. +// But remain it for backward compatibility for the components existing before 2.3. assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; int h1 = hashBytesByInt(base, offset, lengthAligned, seed); @@ -71,6 +73,20 @@ public final class
spark git commit: [SPARK-23381][CORE] Murmur3 hash generates a different value from other implementations
Repository: spark Updated Branches: refs/heads/branch-2.3 ccb0a59d7 -> 8360da071 [SPARK-23381][CORE] Murmur3 hash generates a different value from other implementations ## What changes were proposed in this pull request? Murmur3 hash generates a different value from the original and other implementations (such as the Scala standard library and Guava) when the length of a byte array is not a multiple of 4. ## How was this patch tested? Added a unit test. **Note: When we merge this PR, please give all the credits to Shintaro Murakami.** Author: Shintaro Murakami Author: gatorsmile Author: Shintaro Murakami Closes #20630 from gatorsmile/pr-20568. (cherry picked from commit d5ed2108d32e1d95b26ee7fed39e8a733e935e2c) Signed-off-by: gatorsmile Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8360da07 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8360da07 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8360da07 Branch: refs/heads/branch-2.3 Commit: 8360da07110d847a01b243e6d786922a5057ad9f Parents: ccb0a59 Author: Shintaro Murakami Authored: Fri Feb 16 17:17:55 2018 -0800 Committer: gatorsmile Committed: Fri Feb 16 17:18:15 2018 -0800 -- .../spark/util/sketch/Murmur3_x86_32.java | 16 ++ .../spark/unsafe/hash/Murmur3_x86_32.java | 16 ++ .../spark/unsafe/hash/Murmur3_x86_32Suite.java | 19 +++ .../apache/spark/ml/feature/FeatureHasher.scala | 33 +++- .../apache/spark/mllib/feature/HashingTF.scala | 2 +- .../spark/ml/feature/FeatureHasherSuite.scala | 11 ++- python/pyspark/ml/feature.py | 4 +-- 7 files changed, 96 insertions(+), 5 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/8360da07/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java -- diff --git a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java index a61ce4f..e83b331 100644 --- 
a/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java +++ b/common/sketch/src/main/java/org/apache/spark/util/sketch/Murmur3_x86_32.java @@ -60,6 +60,8 @@ final class Murmur3_x86_32 { } public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { +// This is not compatible with original and another implementations. +// But remain it for backward compatibility for the components existing before 2.3. assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned = lengthInBytes - lengthInBytes % 4; int h1 = hashBytesByInt(base, offset, lengthAligned, seed); @@ -71,6 +73,20 @@ final class Murmur3_x86_32 { return fmix(h1, lengthInBytes); } + public static int hashUnsafeBytes2(Object base, long offset, int lengthInBytes, int seed) { +// This is compatible with original and another implementations. +// Use this method for new components after Spark 2.3. +assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; +int lengthAligned = lengthInBytes - lengthInBytes % 4; +int h1 = hashBytesByInt(base, offset, lengthAligned, seed); +int k1 = 0; +for (int i = lengthAligned, shift = 0; i < lengthInBytes; i++, shift += 8) { + k1 ^= (Platform.getByte(base, offset + i) & 0xFF) << shift; +} +h1 ^= mixK1(k1); +return fmix(h1, lengthInBytes); + } + private static int hashBytesByInt(Object base, long offset, int lengthInBytes, int seed) { assert (lengthInBytes % 4 == 0); int h1 = seed; http://git-wip-us.apache.org/repos/asf/spark/blob/8360da07/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java -- diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java index 5e7ee48..d239de6 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/hash/Murmur3_x86_32.java @@ -60,6 +60,8 @@ public 
final class Murmur3_x86_32 { } public static int hashUnsafeBytes(Object base, long offset, int lengthInBytes, int seed) { +// This is not compatible with original and another implementations. +// But remain it for backward compatibility for the components existing before 2.3. assert (lengthInBytes >= 0): "lengthInBytes cannot be negative"; int lengthAligned =