spark git commit: [SPARK-25867][ML] Remove KMeans computeCost

srowen Thu, 22 Nov 2018 13:46:13 -0800

Repository: spark
Updated Branches:
  refs/heads/master aeda76e2b -> dd8c179c2



[SPARK-25867][ML] Remove KMeans computeCost

## What changes were proposed in this pull request?

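The PR removes the deprecated method `computeCost` of `KMeansModel` (both the Scala and the Python API). Use `ClusteringEvaluator` instead, or read the cost on the training dataset from the model summary (`summary.trainingCost`).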

## How was this patch tested?

NA

Closes #22875 from mgaido91/SPARK-25867.

Authored-by: Marco Gaido <[email protected]>
Signed-off-by: Sean Owen <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dd8c179c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dd8c179c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dd8c179c

Branch: refs/heads/master
Commit: dd8c179c28c5df20210b70a69d93d866ccaca4cc
Parents: aeda76e
Author: Marco Gaido <[email protected]>
Authored: Thu Nov 22 15:45:25 2018 -0600
Committer: Sean Owen <[email protected]>
Committed: Thu Nov 22 15:45:25 2018 -0600

----------------------------------------------------------------------
 .../org/apache/spark/ml/clustering/KMeans.scala     | 16 ----------------
 .../apache/spark/ml/clustering/KMeansSuite.scala    | 12 +++++-------
 project/MimaExcludes.scala                          |  3 +++
 python/pyspark/ml/clustering.py                     | 16 ----------------
 4 files changed, 8 insertions(+), 39 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 498310d..919496a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -144,22 +144,6 @@ class KMeansModel private[ml] (
   def clusterCenters: Array[Vector] = parentModel.clusterCenters.map(_.asML)
 
   /**
-   * Return the K-means cost (sum of squared distances of points to their 
nearest center) for this
-   * model on the given data.
-   *
-   * @deprecated This method is deprecated and will be removed in 3.0.0. Use 
ClusteringEvaluator
-   *             instead. You can also get the cost on the training dataset in 
the summary.
-   */
-  @deprecated("This method is deprecated and will be removed in 3.0.0. Use 
ClusteringEvaluator " +
-    "instead. You can also get the cost on the training dataset in the 
summary.", "2.4.0")
-  @Since("2.0.0")
-  def computeCost(dataset: Dataset[_]): Double = {
-    SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol)
-    val data = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)
-    parentModel.computeCost(data)
-  }
-
-  /**
    * Returns a [[org.apache.spark.ml.util.GeneralMLWriter]] instance for this 
ML instance.
    *
    * For [[KMeansModel]], this does NOT currently save the training 
[[summary]].

http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
----------------------------------------------------------------------
diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
index ccbceab..4f47d91 100644
--- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
+++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala
@@ -117,7 +117,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest 
with PMMLReadWriteTes
       assert(clusters === Set(0, 1, 2, 3, 4))
     }
 
-    assert(model.computeCost(dataset) < 0.1)
     assert(model.hasParent)
 
     // Check validity of model summary
@@ -132,7 +131,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest 
with PMMLReadWriteTes
     }
     assert(summary.cluster.columns === Array(predictionColName))
     assert(summary.trainingCost < 0.1)
-    assert(model.computeCost(dataset) == summary.trainingCost)
     val clusterSizes = summary.clusterSizes
     assert(clusterSizes.length === k)
     assert(clusterSizes.sum === numRows)
@@ -201,15 +199,15 @@ class KMeansSuite extends MLTest with 
DefaultReadWriteTest with PMMLReadWriteTes
   }
 
   test("KMean with Array input") {
-    def trainAndComputeCost(dataset: Dataset[_]): Double = {
+    def trainAndGetCost(dataset: Dataset[_]): Double = {
       val model = new KMeans().setK(k).setMaxIter(1).setSeed(1).fit(dataset)
-      model.computeCost(dataset)
+      model.summary.trainingCost
     }
 
     val (newDataset, newDatasetD, newDatasetF) = 
MLTestingUtils.generateArrayFeatureDataset(dataset)
-    val trueCost = trainAndComputeCost(newDataset)
-    val doubleArrayCost = trainAndComputeCost(newDatasetD)
-    val floatArrayCost = trainAndComputeCost(newDatasetF)
+    val trueCost = trainAndGetCost(newDataset)
+    val doubleArrayCost = trainAndGetCost(newDatasetD)
+    val floatArrayCost = trainAndGetCost(newDatasetF)
 
     // checking the cost is fine enough as a sanity check
     assert(trueCost ~== doubleArrayCost absTol 1e-6)

http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/project/MimaExcludes.scala
----------------------------------------------------------------------
diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala
index 9089c7d..333adb0 100644
--- a/project/MimaExcludes.scala
+++ b/project/MimaExcludes.scala
@@ -36,6 +36,9 @@ object MimaExcludes {
 
   // Exclude rules for 3.0.x
   lazy val v30excludes = v24excludes ++ Seq(
+    // [SPARK-25867] Remove KMeans computeCost
+    ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.KMeansModel.computeCost"),
+
     // [SPARK-26127] Remove deprecated setters from tree regression and 
classification models
     
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.DecisionTreeClassificationModel.setSeed"),
     
ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.DecisionTreeClassificationModel.setMinInfoGain"),

http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index aaeeeb8..d0b507e 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -335,20 +335,6 @@ class KMeansModel(JavaModel, JavaMLWritable, 
JavaMLReadable):
         """Get the cluster centers, represented as a list of NumPy arrays."""
         return [c.toArray() for c in self._call_java("clusterCenters")]
 
-    @since("2.0.0")
-    def computeCost(self, dataset):
-        """
-        Return the K-means cost (sum of squared distances of points to their 
nearest center)
-        for this model on the given data.
-
-        ..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use 
ClusteringEvaluator instead.
-           You can also get the cost on the training dataset in the summary.
-        """
-        warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use 
ClusteringEvaluator "
-                      "instead. You can also get the cost on the training 
dataset in the summary.",
-                      DeprecationWarning)
-        return self._call_java("computeCost", dataset)
-
     @property
     @since("2.1.0")
     def hasSummary(self):
@@ -387,8 +373,6 @@ class KMeans(JavaEstimator, HasDistanceMeasure, 
HasFeaturesCol, HasPredictionCol
     >>> centers = model.clusterCenters()
     >>> len(centers)
     2
-    >>> model.computeCost(df)
-    2.0
     >>> transformed = model.transform(df).select("features", "prediction")
     >>> rows = transformed.collect()
     >>> rows[0].prediction == rows[1].prediction


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-25867][ML] Remove KMeans computeCost

Reply via email to