Repository: spark
Updated Branches:
  refs/heads/master e1de34113 -> 8aceb961c


[SPARK-24754][ML] Minhash integer overflow

## What changes were proposed in this pull request?

Use longs in calculating min hash to avoid bias due to int overflow.

## How was this patch tested?

Existing tests.

Author: Sean Owen <sro...@gmail.com>

Closes #21750 from srowen/SPARK-24754.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/8aceb961
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/8aceb961
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/8aceb961

Branch: refs/heads/master
Commit: 8aceb961c3b8e462c6002dbe03be61b4fe194f47
Parents: e1de341
Author: Sean Owen <sro...@gmail.com>
Authored: Sat Jul 14 15:59:17 2018 -0500
Committer: Sean Owen <sro...@gmail.com>
Committed: Sat Jul 14 15:59:17 2018 -0500

----------------------------------------------------------------------
 .../main/scala/org/apache/spark/ml/feature/MinHashLSH.scala    | 2 +-
 python/pyspark/ml/feature.py                                   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/8aceb961/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
index a67a3b0..a043033 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/MinHashLSH.scala
@@ -66,7 +66,7 @@ class MinHashLSHModel private[ml](
       val elemsList = elems.toSparse.indices.toList
       val hashValues = randCoefficients.map { case (a, b) =>
         elemsList.map { elem: Int =>
-          ((1 + elem) * a + b) % MinHashLSH.HASH_PRIME
+          ((1L + elem) * a + b) % MinHashLSH.HASH_PRIME
         }.min.toDouble
       }
       // TODO: Output vectors of dimension numHashFunctions in SPARK-18450

http://git-wip-us.apache.org/repos/asf/spark/blob/8aceb961/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 14800d4..ddba738 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -1294,14 +1294,14 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
     >>> mh = MinHashLSH(inputCol="features", outputCol="hashes", seed=12345)
     >>> model = mh.fit(df)
     >>> model.transform(df).head()
-    Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([-1638925...
+    Row(id=0, features=SparseVector(6, {0: 1.0, 1: 1.0, 2: 1.0}), hashes=[DenseVector([6179668...
     >>> data2 = [(3, Vectors.sparse(6, [1, 3, 5], [1.0, 1.0, 1.0]),),
     ...          (4, Vectors.sparse(6, [2, 3, 5], [1.0, 1.0, 1.0]),),
     ...          (5, Vectors.sparse(6, [1, 2, 4], [1.0, 1.0, 1.0]),)]
     >>> df2 = spark.createDataFrame(data2, ["id", "features"])
     >>> key = Vectors.sparse(6, [1, 2], [1.0, 1.0])
     >>> model.approxNearestNeighbors(df2, key, 1).collect()
-    [Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([-163892...
+    [Row(id=5, features=SparseVector(6, {1: 1.0, 2: 1.0, 4: 1.0}), hashes=[DenseVector([6179668...
     >>> model.approxSimilarityJoin(df, df2, 0.6, distCol="JaccardDistance").select(
     ...     col("datasetA.id").alias("idA"),
     ...     col("datasetB.id").alias("idB"),
@@ -1309,8 +1309,8 @@ class MinHashLSH(JavaEstimator, LSHParams, HasInputCol, HasOutputCol, HasSeed,
     +---+---+---------------+
     |idA|idB|JaccardDistance|
     +---+---+---------------+
-    |  1|  4|            0.5|
     |  0|  5|            0.5|
+    |  1|  4|            0.5|
     +---+---+---------------+
     ...
     >>> mhPath = temp_path + "/mh"


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to