Repository: spark
Updated Branches:
  refs/heads/master 91fbc880b -> 87706eb66
[SPARK-15793][ML] Add maxSentenceLength for ml.Word2Vec

## What changes were proposed in this pull request?

https://issues.apache.org/jira/browse/SPARK-15793

Word2Vec in the ML package should have a maxSentenceLength parameter for feature parity with the spark.mllib implementation.

## How was this patch tested?

Tested with Spark unit tests.

Author: yinxusen <yinxu...@gmail.com>

Closes #13536 from yinxusen/SPARK-15793.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/87706eb6
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/87706eb6
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/87706eb6

Branch: refs/heads/master
Commit: 87706eb66cd1370862a1f8ea447484c80969e45f
Parents: 91fbc88
Author: yinxusen <yinxu...@gmail.com>
Authored: Wed Jun 8 09:18:04 2016 +0100
Committer: Sean Owen <so...@cloudera.com>
Committed: Wed Jun 8 09:18:04 2016 +0100

----------------------------------------------------------------------
 .../org/apache/spark/ml/feature/Word2Vec.scala  | 19 +++++++++++++++++++
 .../apache/spark/ml/feature/Word2VecSuite.scala |  1 +
 2 files changed, 20 insertions(+)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/87706eb6/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
index 2d89eb0..33515b2 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/Word2Vec.scala
@@ -87,6 +87,21 @@ private[feature] trait Word2VecBase extends Params
   /** @group getParam */
   def getMinCount: Int = $(minCount)
 
+  /**
+   * Sets the maximum length (in words) of each sentence in the input data.
+   * Any sentence longer than this threshold will be divided into chunks of
+   * up to `maxSentenceLength` size.
+   * Default: 1000
+   * @group param
+   */
+  final val maxSentenceLength = new IntParam(this, "maxSentenceLength", "Maximum length " +
+    "(in words) of each sentence in the input data. Any sentence longer than this threshold will " +
+    "be divided into chunks up to the size.")
+  setDefault(maxSentenceLength -> 1000)
+
+  /** @group getParam */
+  def getMaxSentenceLength: Int = $(maxSentenceLength)
+
   setDefault(stepSize -> 0.025)
   setDefault(maxIter -> 1)
 
@@ -137,6 +152,9 @@ final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel]
   /** @group setParam */
   def setMinCount(value: Int): this.type = set(minCount, value)
 
+  /** @group setParam */
+  def setMaxSentenceLength(value: Int): this.type = set(maxSentenceLength, value)
+
   @Since("2.0.0")
   override def fit(dataset: Dataset[_]): Word2VecModel = {
     transformSchema(dataset.schema, logging = true)
@@ -149,6 +167,7 @@ final class Word2Vec(override val uid: String) extends Estimator[Word2VecModel]
       .setSeed($(seed))
      .setVectorSize($(vectorSize))
       .setWindowSize($(windowSize))
+      .setMaxSentenceLength($(maxSentenceLength))
       .fit(input)
     copyValues(new Word2VecModel(uid, wordVectors).setParent(this))
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/87706eb6/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
index 280a36f..16c74f6 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/feature/Word2VecSuite.scala
@@ -191,6 +191,7 @@ class Word2VecSuite extends SparkFunSuite with MLlibTestSparkContext with Defaul
       .setSeed(42L)
       .setStepSize(0.01)
       .setVectorSize(100)
+      .setMaxSentenceLength(500)
     testDefaultReadWrite(t)
   }
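----------------------------------------------------------------------

For reference, a minimal usage sketch of the new parameter (not part of the
commit), assuming a running SparkSession named `spark`; the column names and
input values below are hypothetical:

  import org.apache.spark.ml.feature.Word2Vec

  // Each row holds one "sentence" as a sequence of words.
  val docs = spark.createDataFrame(Seq(
    "Hi I heard about Spark".split(" "),
    "I wish Java could use case classes".split(" ")
  ).map(Tuple1.apply)).toDF("text")

  val word2Vec = new Word2Vec()
    .setInputCol("text")
    .setOutputCol("result")
    .setVectorSize(3)
    .setMinCount(0)
    // Setter added by this commit: sentences longer than 500 words
    // are divided into chunks of up to 500 words before training.
    .setMaxSentenceLength(500)

  val model = word2Vec.fit(docs)
  model.transform(docs).show(truncate = false)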