Repository: spark
Updated Branches:
  refs/heads/master 91fbc880b -> 87706eb66


[SPARK-15793][ML] Add maxSentenceLength for ml.Word2Vec

## What changes were proposed in this pull request?

https://issues.apache.org/jira/browse/SPARK-15793

Word2Vec in the ML package should expose a maxSentenceLength parameter (with getter and setter) for feature parity with the MLlib implementation.
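
Below is a minimal, hypothetical usage sketch (not part of this patch) showing how the new parameter would be set on `ml.Word2Vec`; the DataFrame construction follows the standard Word2Vec example and assumes an active `SparkSession` named `spark`:

```scala
import org.apache.spark.ml.feature.Word2Vec

// Toy corpus: each row is one "sentence" as an array of words.
val documentDF = spark.createDataFrame(Seq(
  "Hi I heard about Spark".split(" "),
  "I wish Java could use case classes".split(" "),
  "Logistic regression models are neat".split(" ")
).map(Tuple1.apply)).toDF("text")

val word2Vec = new Word2Vec()
  .setInputCol("text")
  .setOutputCol("result")
  .setVectorSize(3)
  .setMinCount(0)
  .setMaxSentenceLength(1000) // new param; sentences longer than this are split into chunks

val model = word2Vec.fit(documentDF)
```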

## How was this patch tested?

Tested with Spark unit tests.
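
As a rough illustration only (the actual coverage is the read/write suite change below), the new param's default and setter could be exercised like this; note that `getMaxSentenceLength`/`setMaxSentenceLength` exist only once this patch is applied:

```scala
import org.apache.spark.ml.feature.Word2Vec

val w2v = new Word2Vec()
assert(w2v.getMaxSentenceLength == 1000)                          // default set in Word2VecBase
assert(w2v.setMaxSentenceLength(500).getMaxSentenceLength == 500) // setter round-trip
```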

Author: yinxusen <yinxu...@gmail.com>

Closes #13536 from yinxusen/SPARK-15793.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/87706eb6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/87706eb6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/87706eb6

Branch: refs/heads/master
Commit: 87706eb66cd1370862a1f8ea447484c80969e45f
Parents: 91fbc88
Author: yinxusen <yinxu...@gmail.com>
Authored: Wed Jun 8 09:18:04 2016 +0100
Committer: Sean Owen <so...@cloudera.com>
Committed: Wed Jun 8 09:18:04 2016 +0100

----------------------------------------------------------------------
 .../org/apache/spark/ml/feature/Word2Vec.scala   | 19 +++++++++++++++++++
 .../apache/spark/ml/feature/Word2VecSuite.scala  |  1 +
 2 files changed, 20 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/87706eb6/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 2d89eb0..33515b2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -87,6 +87,21 @@ private[feature] trait Word2VecBase extends Params
   /** @group getParam */
   def getMinCount: Int = $(minCount)
 
+  /**
+   * Sets the maximum length (in words) of each sentence in the input data.
+   * Any sentence longer than this threshold will be divided into chunks of
+   * up to `maxSentenceLength` size.
+   * Default: 1000
+   * @group param
+   */
+  final val maxSentenceLength = new IntParam(this, "maxSentenceLength", "Maximum length " +
+    "(in words) of each sentence in the input data. Any sentence longer than this threshold will " +
+    "be divided into chunks up to the size.")
+  setDefault(maxSentenceLength -> 1000)
+
+  /** @group getParam */
+  def getMaxSentenceLength: Int = $(maxSentenceLength)
+
   setDefault(stepSize -> 0.025)
   setDefault(maxIter -> 1)
 
@@ -137,6 +152,9 @@ final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel]
   /** @group setParam */
   def setMinCount(value: Int): this.type = set(minCount, value)
 
+  /** @group setParam */
+  def setMaxSentenceLength(value: Int): this.type = set(maxSentenceLength, value)
+
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): Word2VecModel = {
     transformSchema(dataset.schema, logging = true)
@@ -149,6 +167,7 @@ final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel]
       .setSeed($(seed))
       .setVectorSize($(vectorSize))
       .setWindowSize($(windowSize))
+      .setMaxSentenceLength($(maxSentenceLength))
       .fit(input)
     copyValues(new Word2VecModel(uid, wordVectors).setParent(this))
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/87706eb6/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 280a36f..16c74f6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -191,6 +191,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
       .setSeed(42L)
       .setStepSize(0.01)
       .setVectorSize(100)
+      .setMaxSentenceLength(500)
     testDefaultReadWrite(t)
   }
 

