Repository: spark
Updated Branches:
  refs/heads/branch-2.0 20a07e443 -> 7feb79085


[MINOR][DOC][ML] ml.clustering scala & python api doc sync

## What changes were proposed in this pull request?
Since we did the Scala API audit for ml.clustering in #13148, we should also fix
and update the corresponding Python API docs to keep them in sync.

## How was this patch tested?
Docs change, no tests.

Author: Yanbo Liang <yblia...@gmail.com>

Closes #13291 from yanboliang/spark-15361-followup.

(cherry picked from commit 594484cd8343f870c53fbc829ed4fb889016a8cf)
Signed-off-by: Nick Pentreath <ni...@za.ibm.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7feb7908
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7feb7908
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7feb7908

Branch: refs/heads/branch-2.0
Commit: 7feb7908563431090eabdee1369531dc1ba3e35b
Parents: 20a07e4
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Tue May 31 14:56:43 2016 -0700
Committer: Nick Pentreath <ni...@za.ibm.com>
Committed: Tue May 31 14:56:51 2016 -0700

----------------------------------------------------------------------
 python/pyspark/ml/clustering.py | 35 +++++++++++++++++++++++++----------
 1 file changed, 25 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/7feb7908/python/pyspark/ml/clustering.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/clustering.py b/python/pyspark/ml/clustering.py
index a457904..92df19e 100644
--- a/python/pyspark/ml/clustering.py
+++ b/python/pyspark/ml/clustering.py
@@ -64,6 +64,21 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     .. note:: Experimental
 
     GaussianMixture clustering.
+    This class performs expectation maximization for multivariate Gaussian
+    Mixture Models (GMMs).  A GMM represents a composite distribution of
+    independent Gaussian distributions with associated "mixing" weights
+    specifying each's contribution to the composite.
+
+    Given a set of sample points, this class will maximize the log-likelihood
+    for a mixture of k Gaussians, iterating until the log-likelihood changes by
+    less than convergenceTol, or until it has reached the max number of iterations.
+    While this process is generally guaranteed to converge, it is not guaranteed
+    to find a global optimum.
+
+    Note: For high-dimensional data (with many features), this algorithm may perform poorly.
+          This is due to high-dimensional data (a) making it difficult to cluster at all
+          (based on statistical/theoretical arguments) and (b) numerical issues with
+          Gaussian distributions.
 
     >>> from pyspark.ml.linalg import Vectors
 
@@ -118,8 +133,8 @@ class GaussianMixture(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     .. versionadded:: 2.0.0
     """
 
-    k = Param(Params._dummy(), "k", "number of clusters to create",
-              typeConverter=TypeConverters.toInt)
+    k = Param(Params._dummy(), "k", "Number of independent Gaussians in the mixture model. " +
+              "Must be > 1.", typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
@@ -227,15 +242,15 @@ class KMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIter, HasTol
     .. versionadded:: 1.5.0
     """
 
-    k = Param(Params._dummy(), "k", "number of clusters to create",
+    k = Param(Params._dummy(), "k", "The number of clusters to create. Must be > 1.",
               typeConverter=TypeConverters.toInt)
     initMode = Param(Params._dummy(), "initMode",
-                     "the initialization algorithm. This can be either \"random\" to " +
+                     "The initialization algorithm. This can be either \"random\" to " +
                      "choose random points as initial cluster centers, or \"k-means||\" " +
                      "to use a parallel variant of k-means++",
                      typeConverter=TypeConverters.toString)
-    initSteps = Param(Params._dummy(), "initSteps", "steps for k-means initialization mode",
-                      typeConverter=TypeConverters.toInt)
+    initSteps = Param(Params._dummy(), "initSteps", "The number of steps for k-means|| " +
+                      "initialization mode. Must be > 0.", typeConverter=TypeConverters.toInt)
 
     @keyword_only
     def __init__(self, featuresCol="features", predictionCol="prediction", k=2,
@@ -380,11 +395,11 @@ class BisectingKMeans(JavaEstimator, HasFeaturesCol, HasPredictionCol, HasMaxIte
     .. versionadded:: 2.0.0
     """
 
-    k = Param(Params._dummy(), "k", "number of clusters to create",
+    k = Param(Params._dummy(), "k", "The desired number of leaf clusters. Must be > 1.",
               typeConverter=TypeConverters.toInt)
     minDivisibleClusterSize = Param(Params._dummy(), "minDivisibleClusterSize",
-                                    "the minimum number of points (if >= 1.0) " +
-                                    "or the minimum proportion",
+                                    "The minimum number of points (if >= 1.0) or the minimum " +
+                                    "proportion of points (if < 1.0) of a divisible cluster.",
                                     typeConverter=TypeConverters.toFloat)
 
     @keyword_only
@@ -661,7 +676,7 @@ class LDA(JavaEstimator, HasFeaturesCol, HasMaxIter, HasSeed, HasCheckpointInter
     .. versionadded:: 2.0.0
     """
 
-    k = Param(Params._dummy(), "k", "number of topics (clusters) to infer",
+    k = Param(Params._dummy(), "k", "The number of topics (clusters) to infer. Must be > 1.",
               typeConverter=TypeConverters.toInt)
     optimizer = Param(Params._dummy(), "optimizer",
                       "Optimizer or inference algorithm used to estimate the 
LDA model.  "
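
----------------------------------------------------------------------

As context for the docstring text added above: GaussianMixture fits a
k-component GMM by expectation maximization. Below is a minimal sketch of its
use, assuming Spark 2.0 with an active SparkSession bound to the name `spark`;
the sketch is illustrative only and is not part of this commit.

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.clustering import GaussianMixture

    # Two well-separated groups of 2-D points.
    data = [(Vectors.dense([-0.1, -0.05]),),
            (Vectors.dense([-0.01, -0.1]),),
            (Vectors.dense([0.9, 0.8]),),
            (Vectors.dense([0.75, 0.935]),)]
    df = spark.createDataFrame(data, ["features"])

    # k must be > 1 per the updated param doc; tol plays the role of the
    # convergenceTol mentioned in the docstring, and maxIter caps the number
    # of EM iterations.
    gm = GaussianMixture(k=2, tol=0.0001, maxIter=10, seed=10)
    model = gm.fit(df)

    print(model.weights)      # mixing weights of the two Gaussians
    model.gaussiansDF.show()  # mean vector and covariance matrix per component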


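The "Must be > 1" / "Must be > 0" constraints added to the k and initSteps
param docs mirror the Scala-side documentation from #13148. A short sketch of
setting those parameters from Python, reusing the SparkSession `spark` and the
DataFrame `df` from the sketch above (again illustrative, not part of the
commit):

    from pyspark.ml.linalg import Vectors
    from pyspark.ml.clustering import KMeans, BisectingKMeans, LDA

    # initMode is "random" or "k-means||"; initSteps must be > 0.
    km = KMeans(k=2, initMode="k-means||", initSteps=2, seed=1)
    centers = km.fit(df).clusterCenters()

    # For BisectingKMeans, k is the desired number of leaf clusters;
    # minDivisibleClusterSize is a point count if >= 1.0, else a proportion.
    bkm = BisectingKMeans(k=2, minDivisibleClusterSize=1.0)
    bkm_model = bkm.fit(df)

    # LDA expects nonnegative term-count vectors, so build a separate frame;
    # k here is the number of topics to infer (also > 1).
    lda_df = spark.createDataFrame([(Vectors.dense([1.0, 2.0]),),
                                    (Vectors.dense([2.0, 1.0]),)], ["features"])
    lda_model = LDA(k=2, maxIter=5, optimizer="online").fit(lda_df)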