Repository: spark
Updated Branches:
  refs/heads/master e1d3f8010 -> 2224861f2


[SPARK-24439][ML][PYTHON] Add distanceMeasure to BisectingKMeans in PySpark

## What changes were proposed in this pull request?

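Add the `distanceMeasure` param to `BisectingKMeans` in PySpark. The param is
defined in a new shared `HasDistanceMeasure` mixin, which `KMeans` now also
uses in place of its own param definition.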

## How was this patch tested?

Added a doctest and also tested the change manually.

Author: Huaxin Gao <huax...@us.ibm.com>

Closes #21557 from huaxingao/spark-24439.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2224861f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2224861f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2224861f

Branch: refs/heads/master
Commit: 2224861f2f93830d736b625c9a4cb72c918512b2
Parents: e1d3f80
Author: Huaxin Gao <huax...@us.ibm.com>
Authored: Thu Jun 28 14:07:28 2018 -0700
Committer: Bryan Cutler <cutl...@gmail.com>
Committed: Thu Jun 28 14:07:28 2018 -0700

----------------------------------------------------------------------
 python/pyspark/ml/clustering.py                 | 35 ++++++++++++++------
 .../pyspark/ml/param/_shared_params_code_gen.py |  4 ++-
 python/pyspark/ml/param/shared.py               | 24 ++++++++++++++
 3 files changed, 51 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2224861f/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index 4aa1cf8..6d77baf 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -349,8 +349,8 @@ class KMeansModel(JavaModel, JavaMLWritable, 
JavaMLReadable):
 
 
 @inherit_doc
-class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, 
HasTol, HasSeed,
-             JavaMLWritable, JavaMLReadable):
+class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, 
HasPredictionCol, HasMaxIter,
+             HasTol, HasSeed, JavaMLWritable, JavaMLReadable):
     """
     K-means clustering with a k-means++ like initialization mode
     (the k-means|| algorithm by Bahmani et al).
@@ -406,9 +406,6 @@ class KMeans(JavaEstimator, HasFeaturesCol, 
HasPredictionCol, HasMaxIter, HasTol
                      typeConverter=TypeConverters.toString)
     initSteps = Param(Params._dummy(), "initSteps", "The number of steps for 
k-means|| " +
                       "initialization mode. Must be > 0.", 
typeConverter=TypeConverters.toInt)
-    distanceMeasure = Param(Params._dummy(), "distanceMeasure", "The distance 
measure. " +
-                            "Supported options: 'euclidean' and 'cosine'.",
-                            typeConverter=TypeConverters.toString)
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
@@ -544,8 +541,8 @@ class BisectingKMeansModel(JavaModel, JavaMLWritable, 
JavaMLReadable):
 
 
 @inherit_doc
-class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, 
HasMaxIter, HasSeed,
-                      JavaMLWritable, JavaMLReadable):
+class BisectingKMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, 
HasPredictionCol,
+                      HasMaxIter, HasSeed, JavaMLWritable, JavaMLReadable):
     """
     A bisecting k-means algorithm based on the paper "A comparison of document 
clustering
     techniques" by Steinbach, Karypis, and Kumar, with modification to fit 
Spark.
@@ -585,6 +582,8 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, 
HasPredictionCol, HasMaxIte
     >>> bkm2 = BisectingKMeans.load(bkm_path)
     >>> bkm2.getK()
     2
+    >>> bkm2.getDistanceMeasure()
+    'euclidean'
     >>> model_path = temp_path + "/bkm_model"
     >>> model.save(model_path)
     >>> model2 = BisectingKMeansModel.load(model_path)
@@ -607,10 +606,10 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, 
HasPredictionCol, HasMaxIte
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", 
maxIter=20,
-                 seed=None, k=4, minDivisibleClusterSize=1.0):
+                 seed=None, k=4, minDivisibleClusterSize=1.0, 
distanceMeasure="euclidean"):
         """
         __init__(self, featuresCol="features", predictionCol="prediction", 
maxIter=20, \
-                 seed=None, k=4, minDivisibleClusterSize=1.0)
+                 seed=None, k=4, minDivisibleClusterSize=1.0, 
distanceMeasure="euclidean")
         """
         super(BisectingKMeans, self).__init__()
         self._java_obj = 
self._new_java_obj("org.apache.spark.ml.clustering.BisectingKMeans",
@@ -622,10 +621,10 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, 
HasPredictionCol, HasMaxIte
     @keyword_only
     @since("2.0.0")
     def setParams(self, featuresCol="features", predictionCol="prediction", 
maxIter=20,
-                  seed=None, k=4, minDivisibleClusterSize=1.0):
+                  seed=None, k=4, minDivisibleClusterSize=1.0, 
distanceMeasure="euclidean"):
         """
         setParams(self, featuresCol="features", predictionCol="prediction", 
maxIter=20, \
-                  seed=None, k=4, minDivisibleClusterSize=1.0)
+                  seed=None, k=4, minDivisibleClusterSize=1.0, 
distanceMeasure="euclidean")
         Sets params for BisectingKMeans.
         """
         kwargs = self._input_kwargs
@@ -659,6 +658,20 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, 
HasPredictionCol, HasMaxIte
         """
         return self.getOrDefault(self.minDivisibleClusterSize)
 
+    @since("2.4.0")
+    def setDistanceMeasure(self, value):
+        """
+        Sets the value of :py:attr:`distanceMeasure`.
+        """
+        return self._set(distanceMeasure=value)
+
+    @since("2.4.0")
+    def getDistanceMeasure(self):
+        """
+        Gets the value of `distanceMeasure` or its default value.
+        """
+        return self.getOrDefault(self.distanceMeasure)
+
     def _create_model(self, java_model):
         return BisectingKMeansModel(java_model)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/2224861f/python/pyspark/ml/param/_shared_params_code_gen.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py 
b/python/pyspark/ml/param/_shared_params_code_gen.py
index 6e9e0a3..e45ba84 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -162,7 +162,9 @@ if __name__ == "__main__":
          "fitting. If set to true, then all sub-models will be available. 
Warning: For large " +
          "models, collecting all sub-models can cause OOMs on the Spark 
driver.",
          "False", "TypeConverters.toBoolean"),
-        ("loss", "the loss function to be optimized.", None, 
"TypeConverters.toString")]
+        ("loss", "the loss function to be optimized.", None, 
"TypeConverters.toString"),
+        ("distanceMeasure", "the distance measure. Supported options: 
'euclidean' and 'cosine'.",
+         "'euclidean'", "TypeConverters.toString")]
 
     code = []
     for name, doc, defaultValueStr, typeConverter in shared:

http://git-wip-us.apache.org/repos/asf/spark/blob/2224861f/python/pyspark/ml/param/shared.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/shared.py 
b/python/pyspark/ml/param/shared.py
index 08408ee..618f5bf 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -790,3 +790,27 @@ class DecisionTreeParams(Params):
         """
         return self.getOrDefault(self.cacheNodeIds)
 
+
+class HasDistanceMeasure(Params):
+    """
+    Mixin for param distanceMeasure: the distance measure. Supported options: 
'euclidean' and 'cosine'.
+    """
+
+    distanceMeasure = Param(Params._dummy(), "distanceMeasure", "the distance 
measure. Supported options: 'euclidean' and 'cosine'.", 
typeConverter=TypeConverters.toString)
+
+    def __init__(self):
+        super(HasDistanceMeasure, self).__init__()
+        self._setDefault(distanceMeasure='euclidean')
+
+    def setDistanceMeasure(self, value):
+        """
+        Sets the value of :py:attr:`distanceMeasure`.
+        """
+        return self._set(distanceMeasure=value)
+
+    def getDistanceMeasure(self):
+        """
+        Gets the value of distanceMeasure or its default value.
+        """
+        return self.getOrDefault(self.distanceMeasure)
+


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to