Repository: spark Updated Branches: refs/heads/master aeda76e2b -> dd8c179c2
[SPARK-25867][ML] Remove KMeans computeCost ## What changes were proposed in this pull request? The PR removes the deprecated method `computeCost` of `KMeans`. ## How was this patch tested? NA Closes #22875 from mgaido91/SPARK-25867. Authored-by: Marco Gaido <[email protected]> Signed-off-by: Sean Owen <[email protected]> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/dd8c179c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/dd8c179c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/dd8c179c Branch: refs/heads/master Commit: dd8c179c28c5df20210b70a69d93d866ccaca4cc Parents: aeda76e Author: Marco Gaido <[email protected]> Authored: Thu Nov 22 15:45:25 2018 -0600 Committer: Sean Owen <[email protected]> Committed: Thu Nov 22 15:45:25 2018 -0600 ---------------------------------------------------------------------- .../org/apache/spark/ml/clustering/KMeans.scala | 16 ---------------- .../apache/spark/ml/clustering/KMeansSuite.scala | 12 +++++------- project/MimaExcludes.scala | 3 +++ python/pyspark/ml/clustering.py | 16 ---------------- 4 files changed, 8 insertions(+), 39 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index 498310d..919496a 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -144,22 +144,6 @@ class KMeansModel private[ml] ( def clusterCenters: Array[Vector] = parentModel.clusterCenters.map(_.asML) /** - * Return the K-means cost (sum of squared distances of points to their nearest center) for this - * model on the given data. - * - * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator - * instead. You can also get the cost on the training dataset in the summary. - */ - @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " + - "instead. You can also get the cost on the training dataset in the summary.", "2.4.0") - @Since("2.0.0") - def computeCost(dataset: Dataset[_]): Double = { - SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol) - val data = DatasetUtils.columnToOldVector(dataset, getFeaturesCol) - parentModel.computeCost(data) - } - - /** * Returns a [[org.apache.spark.ml.util.GeneralMLWriter]] instance for this ML instance. * * For [[KMeansModel]], this does NOT currently save the training [[summary]]. http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index ccbceab..4f47d91 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -117,7 +117,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(clusters === Set(0, 1, 2, 3, 4)) } - assert(model.computeCost(dataset) < 0.1) assert(model.hasParent) // Check validity of model summary @@ -132,7 +131,6 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes } assert(summary.cluster.columns === Array(predictionColName)) assert(summary.trainingCost < 0.1) - assert(model.computeCost(dataset) == summary.trainingCost) val clusterSizes = summary.clusterSizes assert(clusterSizes.length === k) assert(clusterSizes.sum === numRows) @@ -201,15 +199,15 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes } test("KMean with Array input") { - def trainAndComputeCost(dataset: Dataset[_]): Double = { + def trainAndGetCost(dataset: Dataset[_]): Double = { val model = new KMeans().setK(k).setMaxIter(1).setSeed(1).fit(dataset) - model.computeCost(dataset) + model.summary.trainingCost } val (newDataset, newDatasetD, newDatasetF) = MLTestingUtils.generateArrayFeatureDataset(dataset) - val trueCost = trainAndComputeCost(newDataset) - val doubleArrayCost = trainAndComputeCost(newDatasetD) - val floatArrayCost = trainAndComputeCost(newDatasetF) + val trueCost = trainAndGetCost(newDataset) + val doubleArrayCost = trainAndGetCost(newDatasetD) + val floatArrayCost = trainAndGetCost(newDatasetF) // checking the cost is fine enough as a sanity check assert(trueCost ~== doubleArrayCost absTol 1e-6) http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/project/MimaExcludes.scala ---------------------------------------------------------------------- diff --git a/project/MimaExcludes.scala b/project/MimaExcludes.scala index 9089c7d..333adb0 100644 --- a/project/MimaExcludes.scala +++ b/project/MimaExcludes.scala @@ -36,6 +36,9 @@ object MimaExcludes { // Exclude rules for 3.0.x lazy val v30excludes = v24excludes ++ Seq( + // [SPARK-25867] Remove KMeans computeCost + ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.clustering.KMeansModel.computeCost"), + // [SPARK-26127] Remove deprecated setters from tree regression and classification models ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.DecisionTreeClassificationModel.setSeed"), ProblemFilters.exclude[DirectMissingMethodProblem]("org.apache.spark.ml.classification.DecisionTreeClassificationModel.setMinInfoGain"), http://git-wip-us.apache.org/repos/asf/spark/blob/dd8c179c/python/pyspark/ml/clustering.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index aaeeeb8..d0b507e 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -335,20 +335,6 @@ class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): """Get the cluster centers, represented as a list of NumPy arrays.""" return [c.toArray() for c in self._call_java("clusterCenters")] - @since("2.0.0") - def computeCost(self, dataset): - """ - Return the K-means cost (sum of squared distances of points to their nearest center) - for this model on the given data. - - ..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead. - You can also get the cost on the training dataset in the summary. - """ - warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator " - "instead. You can also get the cost on the training dataset in the summary.", - DeprecationWarning) - return self._call_java("computeCost", dataset) - @property @since("2.1.0") def hasSummary(self): @@ -387,8 +373,6 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol >>> centers = model.clusterCenters() >>> len(centers) 2 - >>> model.computeCost(df) - 2.0 >>> transformed = model.transform(df).select("features", "prediction") >>> rows = transformed.collect() >>> rows[0].prediction == rows[1].prediction --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
