Repository: spark
Updated Branches:
  refs/heads/master d280d1da1 -> 3d66a2ce9
[SPARK-14564][ML][MLLIB][PYSPARK] Python Word2Vec missing setWindowSize method ## What changes were proposed in this pull request? Added windowSize getter/setter to ML/MLlib ## How was this patch tested? Added test cases in tests.py under both ML and MLlib Author: Jason Lee <cj...@us.ibm.com> Closes #12428 from jasoncl/SPARK-14564. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/3d66a2ce Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/3d66a2ce Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/3d66a2ce Branch: refs/heads/master Commit: 3d66a2ce9bfc19096e07181f9e970372d32bbc0b Parents: d280d1d Author: Jason Lee <cj...@us.ibm.com> Authored: Mon Apr 18 12:47:14 2016 -0700 Committer: Joseph K. Bradley <jos...@databricks.com> Committed: Mon Apr 18 12:47:14 2016 -0700 ---------------------------------------------------------------------- .../spark/mllib/api/python/PythonMLLibAPI.scala | 5 +++- python/pyspark/ml/feature.py | 28 ++++++++++++++++---- python/pyspark/ml/tests.py | 5 ++++ python/pyspark/mllib/feature.py | 11 +++++++- python/pyspark/mllib/tests.py | 4 ++- 5 files changed, 45 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/3d66a2ce/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala index 1a58779..32dc16d 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/api/python/PythonMLLibAPI.scala @@ -671,6 +671,7 @@ private[python] class PythonMLLibAPI extends Serializable { * @param numPartitions number of partitions * @param numIterations 
number of iterations * @param seed initial seed for random generator + * @param windowSize size of window * @return A handle to java Word2VecModelWrapper instance at python side */ def trainWord2VecModel( @@ -680,7 +681,8 @@ private[python] class PythonMLLibAPI extends Serializable { numPartitions: Int, numIterations: Int, seed: Long, - minCount: Int): Word2VecModelWrapper = { + minCount: Int, + windowSize: Int): Word2VecModelWrapper = { val word2vec = new Word2Vec() .setVectorSize(vectorSize) .setLearningRate(learningRate) @@ -688,6 +690,7 @@ private[python] class PythonMLLibAPI extends Serializable { .setNumIterations(numIterations) .setSeed(seed) .setMinCount(minCount) + .setWindowSize(windowSize) try { val model = word2vec.fit(dataJRDD.rdd.persist(StorageLevel.MEMORY_AND_DISK_SER)) new Word2VecModelWrapper(model) http://git-wip-us.apache.org/repos/asf/spark/blob/3d66a2ce/python/pyspark/ml/feature.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 776906e..49a78ed 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -2219,28 +2219,31 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has minCount = Param(Params._dummy(), "minCount", "the minimum number of times a token must appear to be included in the " + "word2vec model's vocabulary", typeConverter=TypeConverters.toInt) + windowSize = Param(Params._dummy(), "windowSize", + "the window size (context words from [-window, window]). 
Default value is 5", + typeConverter=TypeConverters.toInt) @keyword_only def __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None): + seed=None, inputCol=None, outputCol=None, windowSize=5): """ __init__(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, \ - seed=None, inputCol=None, outputCol=None) + seed=None, inputCol=None, outputCol=None, windowSize=5) """ super(Word2Vec, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.Word2Vec", self.uid) self._setDefault(vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None) + seed=None, windowSize=5) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.4.0") def setParams(self, vectorSize=100, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, - seed=None, inputCol=None, outputCol=None): + seed=None, inputCol=None, outputCol=None, windowSize=5): """ setParams(self, minCount=5, numPartitions=1, stepSize=0.025, maxIter=1, seed=None, \ - inputCol=None, outputCol=None) + inputCol=None, outputCol=None, windowSize=5) Sets params for this Word2Vec. """ kwargs = self.setParams._input_kwargs @@ -2291,6 +2294,21 @@ class Word2Vec(JavaEstimator, HasStepSize, HasMaxIter, HasSeed, HasInputCol, Has """ return self.getOrDefault(self.minCount) + @since("2.0.0") + def setWindowSize(self, value): + """ + Sets the value of :py:attr:`windowSize`. + """ + self._set(windowSize=value) + return self + + @since("2.0.0") + def getWindowSize(self): + """ + Gets the value of windowSize or its default value. 
+ """ + return self.getOrDefault(self.windowSize) + def _create_model(self, java_model): return Word2VecModel(java_model) http://git-wip-us.apache.org/repos/asf/spark/blob/3d66a2ce/python/pyspark/ml/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 9d6ff47..f1bca6e 100644 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -341,6 +341,11 @@ class ParamTests(PySparkTestCase): params = param_store.params # should not invoke the property 'test_property' self.assertEqual(len(params), 1) + def test_word2vec_param(self): + model = Word2Vec().setWindowSize(6) + # Check windowSize is set properly + self.assertEqual(model.getWindowSize(), 6) + class FeatureTests(PySparkTestCase): http://git-wip-us.apache.org/repos/asf/spark/blob/3d66a2ce/python/pyspark/mllib/feature.py ---------------------------------------------------------------------- diff --git a/python/pyspark/mllib/feature.py b/python/pyspark/mllib/feature.py index b3dd2f6..90559f6 100644 --- a/python/pyspark/mllib/feature.py +++ b/python/pyspark/mllib/feature.py @@ -617,6 +617,7 @@ class Word2Vec(object): self.numIterations = 1 self.seed = random.randint(0, sys.maxsize) self.minCount = 5 + self.windowSize = 5 @since('1.2.0') def setVectorSize(self, vectorSize): @@ -669,6 +670,14 @@ class Word2Vec(object): self.minCount = minCount return self + @since('2.0.0') + def setWindowSize(self, windowSize): + """ + Sets window size (default: 5). 
+ """ + self.windowSize = windowSize + return self + @since('1.2.0') def fit(self, data): """ @@ -682,7 +691,7 @@ class Word2Vec(object): jmodel = callMLlibFunc("trainWord2VecModel", data, int(self.vectorSize), float(self.learningRate), int(self.numPartitions), int(self.numIterations), int(self.seed), - int(self.minCount)) + int(self.minCount), int(self.windowSize)) return Word2VecModel(jmodel) http://git-wip-us.apache.org/repos/asf/spark/blob/3d66a2ce/python/pyspark/mllib/tests.py ---------------------------------------------------------------------- diff --git a/python/pyspark/mllib/tests.py b/python/pyspark/mllib/tests.py index ac55fbf..f272da5 100644 --- a/python/pyspark/mllib/tests.py +++ b/python/pyspark/mllib/tests.py @@ -1027,13 +1027,15 @@ class Word2VecTests(MLlibTestCase): .setNumPartitions(2) \ .setNumIterations(10) \ .setSeed(1024) \ - .setMinCount(3) + .setMinCount(3) \ + .setWindowSize(6) self.assertEqual(model.vectorSize, 2) self.assertTrue(model.learningRate < 0.02) self.assertEqual(model.numPartitions, 2) self.assertEqual(model.numIterations, 10) self.assertEqual(model.seed, 1024) self.assertEqual(model.minCount, 3) + self.assertEqual(model.windowSize, 6) def test_word2vec_get_vectors(self): data = [ --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org