Repository: spark
Updated Branches:
  refs/heads/master c26f67325 -> 075d678c8


[SPARK-24155][ML] Instrumentation improvements for clustering

## What changes were proposed in this pull request?

changed the instrument for all of the clustering methods

## How was this patch tested?

N/A

Please review http://spark.apache.org/contributing.html before opening a pull 
request.

Author: Lu WANG <lu.w...@databricks.com>

Closes #21218 from ludatabricks/SPARK-23686-1.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/075d678c
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/075d678c
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/075d678c

Branch: refs/heads/master
Commit: 075d678c8844614910b50abca07282bde31ef7e0
Parents: c26f673
Author: Lu WANG <lu.w...@databricks.com>
Authored: Mon May 14 13:35:54 2018 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon May 14 13:35:54 2018 -0700

----------------------------------------------------------------------
 .../org/apache/spark/ml/clustering/BisectingKMeans.scala      | 7 +++++--
 .../org/apache/spark/ml/clustering/GaussianMixture.scala      | 5 ++++-
 .../main/scala/org/apache/spark/ml/clustering/KMeans.scala    | 4 +++-
 3 files changed, 12 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
index 438e53b..1ad4e09 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/BisectingKMeans.scala
@@ -261,8 +261,9 @@ class BisectingKMeans @Since("2.0.0") (
     transformSchema(dataset.schema, logging = true)
     val rdd = DatasetUtils.columnToOldVector(dataset, getFeaturesCol)
 
-    val instr = Instrumentation.create(this, rdd)
-    instr.logParams(featuresCol, predictionCol, k, maxIter, seed, 
minDivisibleClusterSize)
+    val instr = Instrumentation.create(this, dataset)
+    instr.logParams(featuresCol, predictionCol, k, maxIter, seed,
+      minDivisibleClusterSize, distanceMeasure)
 
     val bkm = new MLlibBisectingKMeans()
       .setK($(k))
@@ -275,6 +276,8 @@ class BisectingKMeans @Since("2.0.0") (
     val summary = new BisectingKMeansSummary(
       model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
     model.setSummary(Some(summary))
+    // TODO: need to extend logNamedValue to support Array
+    instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
     instr.logSuccess(model)
     model
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
index 88d618c..3091bb5 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/GaussianMixture.scala
@@ -352,7 +352,7 @@ class GaussianMixture @Since("2.0.0") (
       s"than ${GaussianMixture.MAX_NUM_FEATURES} features because the size of 
the covariance" +
       s" matrix is quadratic in the number of features.")
 
-    val instr = Instrumentation.create(this, instances)
+    val instr = Instrumentation.create(this, dataset)
     instr.logParams(featuresCol, predictionCol, probabilityCol, k, maxIter, 
seed, tol)
     instr.logNumFeatures(numFeatures)
 
@@ -425,6 +425,9 @@ class GaussianMixture @Since("2.0.0") (
     val summary = new GaussianMixtureSummary(model.transform(dataset),
       $(predictionCol), $(probabilityCol), $(featuresCol), $(k), logLikelihood)
     model.setSummary(Some(summary))
+    instr.logNamedValue("logLikelihood", logLikelihood)
+    // TODO: need to extend logNamedValue to support Array
+    instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
     instr.logSuccess(model)
     model
   }

http://git-wip-us.apache.org/repos/asf/spark/blob/075d678c/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala 
b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
index 97f246f..e72d7f9 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/clustering/KMeans.scala
@@ -342,7 +342,7 @@ class KMeans @Since("1.5.0") (
       instances.persist(StorageLevel.MEMORY_AND_DISK)
     }
 
-    val instr = Instrumentation.create(this, instances)
+    val instr = Instrumentation.create(this, dataset)
     instr.logParams(featuresCol, predictionCol, k, initMode, initSteps, 
distanceMeasure,
       maxIter, seed, tol)
     val algo = new MLlibKMeans()
@@ -359,6 +359,8 @@ class KMeans @Since("1.5.0") (
       model.transform(dataset), $(predictionCol), $(featuresCol), $(k))
 
     model.setSummary(Some(summary))
+    // TODO: need to extend logNamedValue to support Array
+    instr.logNamedValue("clusterSizes", summary.clusterSizes.mkString("[", 
",", "]"))
     instr.logSuccess(model)
     if (handlePersistence) {
       instances.unpersist()


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to