Repository: spark Updated Branches: refs/heads/master 4289ac9d8 -> ec873a4fd
[SPARK-14516][FOLLOWUP] Adding ClusteringEvaluator to examples ## What changes were proposed in this pull request? In SPARK-14516 we introduced ClusteringEvaluator, but we did not add any reference to it in the documentation, and the examples were still relying on the sum of squared errors to show a way to evaluate the clustering model. This PR adds the ClusteringEvaluator to the examples. ## How was this patch tested? Manual runs of the examples. Author: Marco Gaido <mga...@hortonworks.com> Closes #19676 from mgaido91/SPARK-14516_examples. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/ec873a4f Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/ec873a4f Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/ec873a4f Branch: refs/heads/master Commit: ec873a4fd20a47cf0791456bfb301f25a34ae014 Parents: 4289ac9 Author: Marco Gaido <mga...@hortonworks.com> Authored: Mon Dec 11 06:35:31 2017 -0600 Committer: Sean Owen <so...@cloudera.com> Committed: Mon Dec 11 06:35:31 2017 -0600 ---------------------------------------------------------------------- .../org/apache/spark/examples/ml/JavaKMeansExample.java | 12 +++++++++--- examples/src/main/python/ml/kmeans_example.py | 12 +++++++++--- .../org/apache/spark/examples/ml/KMeansExample.scala | 12 +++++++++--- 3 files changed, 27 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/ec873a4f/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java ---------------------------------------------------------------------- diff --git a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java index d8f948a..dc4b0bc 100644 --- a/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java +++ 
b/examples/src/main/java/org/apache/spark/examples/ml/JavaKMeansExample.java @@ -20,6 +20,7 @@ package org.apache.spark.examples.ml; // $example on$ import org.apache.spark.ml.clustering.KMeansModel; import org.apache.spark.ml.clustering.KMeans; +import org.apache.spark.ml.evaluation.ClusteringEvaluator; import org.apache.spark.ml.linalg.Vector; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; @@ -51,9 +52,14 @@ public class JavaKMeansExample { KMeans kmeans = new KMeans().setK(2).setSeed(1L); KMeansModel model = kmeans.fit(dataset); - // Evaluate clustering by computing Within Set Sum of Squared Errors. - double WSSSE = model.computeCost(dataset); - System.out.println("Within Set Sum of Squared Errors = " + WSSSE); + // Make predictions + Dataset<Row> predictions = model.transform(dataset); + + // Evaluate clustering by computing Silhouette score + ClusteringEvaluator evaluator = new ClusteringEvaluator(); + + double silhouette = evaluator.evaluate(predictions); + System.out.println("Silhouette with squared euclidean distance = " + silhouette); // Shows the result. Vector[] centers = model.clusterCenters(); http://git-wip-us.apache.org/repos/asf/spark/blob/ec873a4f/examples/src/main/python/ml/kmeans_example.py ---------------------------------------------------------------------- diff --git a/examples/src/main/python/ml/kmeans_example.py b/examples/src/main/python/ml/kmeans_example.py index 6846ec4..5f77843 100644 --- a/examples/src/main/python/ml/kmeans_example.py +++ b/examples/src/main/python/ml/kmeans_example.py @@ -19,6 +19,7 @@ from __future__ import print_function # $example on$ from pyspark.ml.clustering import KMeans +from pyspark.ml.evaluation import ClusteringEvaluator # $example off$ from pyspark.sql import SparkSession @@ -45,9 +46,14 @@ if __name__ == "__main__": kmeans = KMeans().setK(2).setSeed(1) model = kmeans.fit(dataset) - # Evaluate clustering by computing Within Set Sum of Squared Errors. 
- wssse = model.computeCost(dataset) - print("Within Set Sum of Squared Errors = " + str(wssse)) + # Make predictions + predictions = model.transform(dataset) + + # Evaluate clustering by computing Silhouette score + evaluator = ClusteringEvaluator() + + silhouette = evaluator.evaluate(predictions) + print("Silhouette with squared euclidean distance = " + str(silhouette)) # Shows the result. centers = model.clusterCenters() http://git-wip-us.apache.org/repos/asf/spark/blob/ec873a4f/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala ---------------------------------------------------------------------- diff --git a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala index a1d19e1..2bc8184 100644 --- a/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala +++ b/examples/src/main/scala/org/apache/spark/examples/ml/KMeansExample.scala @@ -21,6 +21,7 @@ package org.apache.spark.examples.ml // $example on$ import org.apache.spark.ml.clustering.KMeans +import org.apache.spark.ml.evaluation.ClusteringEvaluator // $example off$ import org.apache.spark.sql.SparkSession @@ -47,9 +48,14 @@ object KMeansExample { val kmeans = new KMeans().setK(2).setSeed(1L) val model = kmeans.fit(dataset) - // Evaluate clustering by computing Within Set Sum of Squared Errors. - val WSSSE = model.computeCost(dataset) - println(s"Within Set Sum of Squared Errors = $WSSSE") + // Make predictions + val predictions = model.transform(dataset) + + // Evaluate clustering by computing Silhouette score + val evaluator = new ClusteringEvaluator() + + val silhouette = evaluator.evaluate(predictions) + println(s"Silhouette with squared euclidean distance = $silhouette") // Shows the result. 
println("Cluster Centers: ") --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org