[spark] branch master updated: [SPARK-23469][ML] HashingTF should use corrected MurmurHash3 implementation

srowen Fri, 02 Aug 2019 08:54:19 -0700

This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/master by this push:
     new 660423d  [SPARK-23469][ML] HashingTF should use corrected MurmurHash3 
implementation
660423d is described below

commit 660423d71769a42176f7cbe867eb23ec8afe7592
Author: Huaxin Gao <huax...@us.ibm.com>
AuthorDate: Fri Aug 2 10:53:36 2019 -0500

    [SPARK-23469][ML] HashingTF should use corrected MurmurHash3 implementation
    
    ## What changes were proposed in this pull request?
    
    Update HashingTF to use new implementation of MurmurHash3
    Make HashingTF use the old MurmurHash3 when a model from pre 3.0 is loaded
    
    ## How was this patch tested?
    
    Change existing unit tests. Also add one unit test to make sure HashingTF 
use the old MurmurHash3 when a model from pre 3.0 is loaded
    
    Closes #25303 from huaxingao/spark-23469.
    
    Authored-by: Huaxin Gao <huax...@us.ibm.com>
    Signed-off-by: Sean Owen <sean.o...@databricks.com>
---
 .../org/apache/spark/ml/feature/HashingTF.scala    |  28 +++++++++++++++++++--
 .../hashingTF-pre3.0/metadata/.part-00000.crc      | Bin 0 -> 12 bytes
 .../test-data/hashingTF-pre3.0/metadata/_SUCCESS   |   0
 .../test-data/hashingTF-pre3.0/metadata/part-00000 |   1 +
 .../apache/spark/ml/feature/HashingTFSuite.scala   |  25 +++++++++++++-----
 python/pyspark/ml/feature.py                       |   8 +++---
 python/pyspark/ml/tests/test_feature.py            |   2 +-
 7 files changed, 51 insertions(+), 13 deletions(-)

diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
index 27b8bdc..0e6c43a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/HashingTF.scala
@@ -26,11 +26,12 @@ import org.apache.spark.ml.linalg.Vectors
 import org.apache.spark.ml.param._
 import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol}
 import org.apache.spark.ml.util._
-import org.apache.spark.mllib.feature.HashingTF.murmur3Hash
+import org.apache.spark.mllib.feature.{HashingTF => OldHashingTF}
 import org.apache.spark.sql.{DataFrame, Dataset}
 import org.apache.spark.sql.functions.{col, udf}
 import org.apache.spark.sql.types.{ArrayType, StructType}
 import org.apache.spark.util.Utils
+import org.apache.spark.util.VersionUtils.majorMinorVersion
 
 /**
  * Maps a sequence of terms to their term frequencies using the hashing trick.
@@ -44,7 +45,7 @@ import org.apache.spark.util.Utils
 class HashingTF @Since("1.4.0") (@Since("1.4.0") override val uid: String)
   extends Transformer with HasInputCol with HasOutputCol with 
DefaultParamsWritable {
 
-  private[this] val hashFunc: Any => Int = murmur3Hash
+  private var hashFunc: Any => Int = FeatureHasher.murmur3Hash
 
   @Since("1.2.0")
   def this() = this(Identifiable.randomUID("hashingTF"))
@@ -142,6 +143,29 @@ class HashingTF @Since("1.4.0") (@Since("1.4.0") override 
val uid: String)
 @Since("1.6.0")
 object HashingTF extends DefaultParamsReadable[HashingTF] {
 
+  private class HashingTFReader extends MLReader[HashingTF] {
+
+    private val className = classOf[HashingTF].getName
+
+    override def load(path: String): HashingTF = {
+      val metadata = DefaultParamsReader.loadMetadata(path, sc, className)
+      val hashingTF = new HashingTF(metadata.uid)
+      metadata.getAndSetParams(hashingTF)
+
+      // We support loading old `HashingTF` saved by previous Spark versions.
+      // Previous `HashingTF` uses `mllib.feature.HashingTF.murmur3Hash`, but 
new `HashingTF` uses
+      // `ml.Feature.FeatureHasher.murmur3Hash`.
+      val (majorVersion, _) = majorMinorVersion(metadata.sparkVersion)
+      if (majorVersion < 3) {
+        hashingTF.hashFunc = OldHashingTF.murmur3Hash
+      }
+      hashingTF
+    }
+  }
+
+  @Since("3.0.0")
+  override def read: MLReader[HashingTF] = new HashingTFReader
+
   @Since("1.6.0")
   override def load(path: String): HashingTF = super.load(path)
 }
diff --git 
a/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/.part-00000.crc 
b/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/.part-00000.crc
new file mode 100644
index 0000000..1ac377a
Binary files /dev/null and 
b/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/.part-00000.crc 
differ
diff --git 
a/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/_SUCCESS 
b/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/_SUCCESS
new file mode 100644
index 0000000..e69de29
diff --git 
a/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/part-00000 
b/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/part-00000
new file mode 100644
index 0000000..492a07a
--- /dev/null
+++ b/mllib/src/test/resources/test-data/hashingTF-pre3.0/metadata/part-00000
@@ -0,0 +1 @@
+{"class":"org.apache.spark.ml.feature.HashingTF","timestamp":1564446310495,"sparkVersion":"2.3.0-SNAPSHOT","uid":"hashingTF_8ced2ab477c1","paramMap":{"binary":true,"numFeatures":100,"outputCol":"features","inputCol":"words"}}
diff --git 
a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala 
b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
index d768a40..d65646e 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/HashingTFSuite.scala
@@ -83,11 +83,24 @@ class HashingTFSuite extends MLTest with 
DefaultReadWriteTest {
       .setInputCol("words")
       .setOutputCol("features")
       .setNumFeatures(n)
-    val mLlibHashingTF = new MLlibHashingTF(n)
-    assert(hashingTF.indexOf("a") === mLlibHashingTF.indexOf("a"))
-    assert(hashingTF.indexOf("b") === mLlibHashingTF.indexOf("b"))
-    assert(hashingTF.indexOf("c") === mLlibHashingTF.indexOf("c"))
-    assert(hashingTF.indexOf("d") === mLlibHashingTF.indexOf("d"))
+    assert(hashingTF.indexOf("a") === 67)
+    assert(hashingTF.indexOf("b") === 65)
+    assert(hashingTF.indexOf("c") === 68)
+    assert(hashingTF.indexOf("d") === 90)
+  }
+
+  test("SPARK-23469: Load HashingTF prior to Spark 3.0") {
+    val hashingTFPath = testFile("test-data/hashingTF-pre3.0")
+    val loadedHashingTF = HashingTF.load(hashingTFPath)
+    val mLlibHashingTF = new MLlibHashingTF(100)
+    assert(loadedHashingTF.indexOf("a") === mLlibHashingTF.indexOf("a"))
+    assert(loadedHashingTF.indexOf("b") === mLlibHashingTF.indexOf("b"))
+    assert(loadedHashingTF.indexOf("c") === mLlibHashingTF.indexOf("c"))
+    assert(loadedHashingTF.indexOf("d") === mLlibHashingTF.indexOf("d"))
+
+    val metadata = spark.read.json(s"$hashingTFPath/metadata")
+    val sparkVersionStr = metadata.select("sparkVersion").first().getString(0)
+    assert(sparkVersionStr == "2.3.0-SNAPSHOT")
   }
 
   test("read/write") {
@@ -103,7 +116,7 @@ class HashingTFSuite extends MLTest with 
DefaultReadWriteTest {
 object HashingTFSuite {
 
   private[feature] def murmur3FeatureIdx(numFeatures: Int)(term: Any): Int = {
-    Utils.nonNegativeMod(MLlibHashingTF.murmur3Hash(term), numFeatures)
+    Utils.nonNegativeMod(FeatureHasher.murmur3Hash(term), numFeatures)
   }
 
 }
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 1196da5..fe8ac62 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -902,19 +902,19 @@ class HashingTF(JavaTransformer, HasInputCol, 
HasOutputCol, HasNumFeatures, Java
     >>> df = spark.createDataFrame([(["a", "b", "c"],)], ["words"])
     >>> hashingTF = HashingTF(numFeatures=10, inputCol="words", 
outputCol="features")
     >>> hashingTF.transform(df).head().features
-    SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})
+    SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})
     >>> hashingTF.setParams(outputCol="freqs").transform(df).head().freqs
-    SparseVector(10, {0: 1.0, 1: 1.0, 2: 1.0})
+    SparseVector(10, {5: 1.0, 7: 1.0, 8: 1.0})
     >>> params = {hashingTF.numFeatures: 5, hashingTF.outputCol: "vector"}
     >>> hashingTF.transform(df, params).head().vector
-    SparseVector(5, {0: 1.0, 1: 1.0, 2: 1.0})
+    SparseVector(5, {0: 1.0, 2: 1.0, 3: 1.0})
     >>> hashingTFPath = temp_path + "/hashing-tf"
     >>> hashingTF.save(hashingTFPath)
     >>> loadedHashingTF = HashingTF.load(hashingTFPath)
     >>> loadedHashingTF.getNumFeatures() == hashingTF.getNumFeatures()
     True
     >>> hashingTF.indexOf("b")
-    1
+    5
 
     .. versionadded:: 1.3.0
     """
diff --git a/python/pyspark/ml/tests/test_feature.py 
b/python/pyspark/ml/tests/test_feature.py
index e2fc4e5..6b0d1dc 100644
--- a/python/pyspark/ml/tests/test_feature.py
+++ b/python/pyspark/ml/tests/test_feature.py
@@ -296,7 +296,7 @@ class HashingTFTest(SparkSessionTestCase):
         
hashingTF.setInputCol("words").setOutputCol("features").setNumFeatures(n).setBinary(True)
         output = hashingTF.transform(df)
         features = output.select("features").first().features.toArray()
-        expected = Vectors.dense([1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0]).toArray()
+        expected = Vectors.dense([0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 1.0, 
0.0]).toArray()
         for i in range(0, n):
             self.assertAlmostEqual(features[i], expected[i], 14, "Error at " + 
str(i) +
                                    ": expected " + str(expected[i]) + ", got " 
+ str(features[i]))


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch master updated: [SPARK-23469][ML] HashingTF should use corrected MurmurHash3 implementation

Reply via email to