Repository: spark Updated Branches: refs/heads/master e0b638321 -> cc4d64bb1
[SPARK-23451][ML] Deprecate KMeans.computeCost ## What changes were proposed in this pull request? Deprecate `KMeans.computeCost`, which was introduced as a temporary fix and is no longer needed now that `ClusteringEvaluator` has been introduced. ## How was this patch tested? manual test (deprecation warning displayed) Scala ``` ... scala> model.computeCost(dataset) warning: there was one deprecation warning; re-run with -deprecation for details res1: Double = 0.0 ``` Python ``` >>> import warnings >>> warnings.simplefilter('always', DeprecationWarning) ... >>> model.computeCost(df) /Users/mgaido/apache/spark/python/pyspark/ml/clustering.py:330: DeprecationWarning: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead. " instead.", DeprecationWarning) ``` Author: Marco Gaido <marcogaid...@gmail.com> Closes #20629 from mgaido91/SPARK-23451. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc4d64bb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc4d64bb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc4d64bb Branch: refs/heads/master Commit: cc4d64bb16987eb5a41d4198bf4a5882e549a94f Parents: e0b6383 Author: Marco Gaido <marcogaid...@gmail.com> Authored: Fri Jul 20 09:18:57 2018 -0700 Committer: Holden Karau <hol...@pigscanfly.ca> Committed: Fri Jul 20 09:18:57 2018 -0700 ---------------------------------------------------------------------- .../org/apache/spark/ml/clustering/KMeans.scala | 19 ++++++++++++++++--- .../apache/spark/mllib/clustering/KMeans.scala | 2 +- .../spark/mllib/clustering/KMeansModel.scala | 11 +++++++---- .../apache/spark/ml/clustering/KMeansSuite.scala | 2 ++ python/pyspark/ml/clustering.py | 19 ++++++++++++++++++- 5 files changed, 44 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala index f40037a..6f4a30d 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala @@ -145,8 +145,12 @@ class KMeansModel private[ml] ( /** * Return the K-means cost (sum of squared distances of points to their nearest center) for this * model on the given data. + * + * @deprecated This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator + * instead. You can also get the cost on the training dataset in the summary. */ - // TODO: Replace the temp fix when we have proper evaluators defined for clustering. + @deprecated("This method is deprecated and will be removed in 3.0.0. Use ClusteringEvaluator " + + "instead. You can also get the cost on the training dataset in the summary.", "2.4.0") @Since("2.0.0") def computeCost(dataset: Dataset[_]): Double = { SchemaUtils.validateVectorCompatibleColumn(dataset.schema, getFeaturesCol) @@ -356,7 +360,12 @@ class KMeans @Since("1.5.0") ( val parentModel = algo.run(instances, Option(instr)) val model = copyValues(new KMeansModel(uid, parentModel).setParent(this)) val summary = new KMeansSummary( - model.transform(dataset), $(predictionCol), $(featuresCol), $(k), parentModel.numIter) + model.transform(dataset), + $(predictionCol), + $(featuresCol), + $(k), + parentModel.numIter, + parentModel.trainingCost) model.setSummary(Some(summary)) instr.logNamedValue("clusterSizes", summary.clusterSizes) @@ -389,6 +398,8 @@ object KMeans extends DefaultParamsReadable[KMeans] { * @param featuresCol Name for column of features in `predictions`. * @param k Number of clusters. 
* @param numIter Number of iterations. + * @param trainingCost K-means cost (sum of squared distances to the nearest centroid for all + * points in the training dataset). This is equivalent to sklearn's inertia. */ @Since("2.0.0") @Experimental @@ -397,4 +408,6 @@ class KMeansSummary private[clustering] ( predictionCol: String, featuresCol: String, k: Int, - numIter: Int) extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter) + numIter: Int, + @Since("2.4.0") val trainingCost: Double) + extends ClusteringSummary(predictions, predictionCol, featuresCol, k, numIter) http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala index 4f554f4..55df8a3 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeans.scala @@ -348,7 +348,7 @@ class KMeans private ( logInfo(s"The cost is $cost.") - new KMeansModel(centers.map(_.vector), distanceMeasure, iteration) + new KMeansModel(centers.map(_.vector), distanceMeasure, cost, iteration) } /** http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala ---------------------------------------------------------------------- diff --git a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala index e3a88b4..d5c8188 100644 --- a/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala +++ b/mllib/src/main/scala/org/apache/spark/mllib/clustering/KMeansModel.scala @@ -38,6 +38,7 @@ import org.apache.spark.sql.{Row, SparkSession} @Since("0.8.0") class KMeansModel 
(@Since("1.0.0") val clusterCenters: Array[Vector], @Since("2.4.0") val distanceMeasure: String, + @Since("2.4.0") val trainingCost: Double, private[spark] val numIter: Int) extends Saveable with Serializable with PMMLExportable { @@ -49,11 +50,11 @@ class KMeansModel (@Since("1.0.0") val clusterCenters: Array[Vector], @Since("2.4.0") private[spark] def this(clusterCenters: Array[Vector], distanceMeasure: String) = - this(clusterCenters: Array[Vector], distanceMeasure, -1) + this(clusterCenters: Array[Vector], distanceMeasure, 0.0, -1) @Since("1.1.0") def this(clusterCenters: Array[Vector]) = - this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN) + this(clusterCenters: Array[Vector], DistanceMeasure.EUCLIDEAN, 0.0, -1) /** * A Java-friendly constructor that takes an Iterable of Vectors. @@ -187,7 +188,8 @@ object KMeansModel extends Loader[KMeansModel] { val spark = SparkSession.builder().sparkContext(sc).getOrCreate() val metadata = compact(render( ("class" -> thisClassName) ~ ("version" -> thisFormatVersion) - ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure))) + ~ ("k" -> model.k) ~ ("distanceMeasure" -> model.distanceMeasure) + ~ ("trainingCost" -> model.trainingCost))) sc.parallelize(Seq(metadata), 1).saveAsTextFile(Loader.metadataPath(path)) val dataRDD = sc.parallelize(model.clusterCentersWithNorm.zipWithIndex).map { case (p, id) => Cluster(id, p.vector) @@ -207,7 +209,8 @@ object KMeansModel extends Loader[KMeansModel] { val localCentroids = centroids.rdd.map(Cluster.apply).collect() assert(k == localCentroids.length) val distanceMeasure = (metadata \ "distanceMeasure").extract[String] - new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure) + val trainingCost = (metadata \ "trainingCost").extract[Double] + new KMeansModel(localCentroids.sortBy(_.id).map(_.point), distanceMeasure, trainingCost, -1) } } } 
http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala ---------------------------------------------------------------------- diff --git a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala index 829c90f..9b0b526 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/clustering/KMeansSuite.scala @@ -131,6 +131,8 @@ class KMeansSuite extends MLTest with DefaultReadWriteTest with PMMLReadWriteTes assert(summary.predictions.columns.contains(c)) } assert(summary.cluster.columns === Array(predictionColName)) + assert(summary.trainingCost < 0.1) + assert(model.computeCost(dataset) == summary.trainingCost) val clusterSizes = summary.clusterSizes assert(clusterSizes.length === k) assert(clusterSizes.sum === numRows) http://git-wip-us.apache.org/repos/asf/spark/blob/cc4d64bb/python/pyspark/ml/clustering.py ---------------------------------------------------------------------- diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py index 2f06600..8a58d83 100644 --- a/python/pyspark/ml/clustering.py +++ b/python/pyspark/ml/clustering.py @@ -16,6 +16,7 @@ # import sys +import warnings from pyspark import since, keyword_only from pyspark.ml.util import * @@ -303,7 +304,15 @@ class KMeansSummary(ClusteringSummary): .. versionadded:: 2.1.0 """ - pass + + @property + @since("2.4.0") + def trainingCost(self): + """ + K-means cost (sum of squared distances to the nearest centroid for all points in the + training dataset). This is equivalent to sklearn's inertia. 
+ """ + return self._call_java("trainingCost") class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): @@ -323,7 +332,13 @@ class KMeansModel(JavaModel, JavaMLWritable, JavaMLReadable): """ Return the K-means cost (sum of squared distances of points to their nearest center) for this model on the given data. + + ..note:: Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator instead. + You can also get the cost on the training dataset in the summary. """ + warnings.warn("Deprecated in 2.4.0. It will be removed in 3.0.0. Use ClusteringEvaluator " + "instead. You can also get the cost on the training dataset in the summary.", + DeprecationWarning) return self._call_java("computeCost", dataset) @property @@ -379,6 +394,8 @@ class KMeans(JavaEstimator, HasDistanceMeasure, HasFeaturesCol, HasPredictionCol 2 >>> summary.clusterSizes [2, 2] + >>> summary.trainingCost + 2.000... >>> kmeans_path = temp_path + "/kmeans" >>> kmeans.save(kmeans_path) >>> kmeans2 = KMeans.load(kmeans_path) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org