spark git commit: [DOC][MINOR] ml.feature Scala and Python API sync

mlnick Wed, 18 May 2016 19:49:07 -0700

Repository: spark
Updated Branches:
  refs/heads/master 4987f39ac -> b1bc5ebdd



[DOC][MINOR] ml.feature Scala and Python API sync

## What changes were proposed in this pull request?

I reviewed Scala and Python APIs for ml.feature and corrected discrepancies.

## How was this patch tested?

Built docs locally, ran style checks

Author: Bryan Cutler <cutl...@gmail.com>

Closes #13159 from BryanCutler/ml.feature-api-sync.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b1bc5ebd
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b1bc5ebd
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b1bc5ebd

Branch: refs/heads/master
Commit: b1bc5ebdd52ed12aea3fdc7b8f2fa2d00ea09c6b
Parents: 4987f39
Author: Bryan Cutler <cutl...@gmail.com>
Authored: Thu May 19 04:48:36 2016 +0200
Committer: Nick Pentreath <ni...@za.ibm.com>
Committed: Thu May 19 04:48:36 2016 +0200

----------------------------------------------------------------------
 .../scala/org/apache/spark/ml/feature/IDF.scala |  4 +-
 .../scala/org/apache/spark/ml/feature/PCA.scala |  5 ++-
 .../org/apache/spark/ml/feature/RFormula.scala  |  4 +-
 .../apache/spark/ml/feature/VectorIndexer.scala |  3 +-
 python/pyspark/ml/feature.py                    | 39 +++++++++++++-------
 5 files changed, 36 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/b1bc5ebd/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
index f85f4c6..08beda6 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/IDF.scala
@@ -38,12 +38,12 @@ import org.apache.spark.sql.types.StructType
 private[feature] trait IDFBase extends Params with HasInputCol with 
HasOutputCol {
 
   /**
-   * The minimum of documents in which a term should appear.
+   * The minimum number of documents in which a term should appear.
    * Default: 0
    * @group param
    */
   final val minDocFreq = new IntParam(
-    this, "minDocFreq", "minimum of documents in which a term should appear 
for filtering")
+    this, "minDocFreq", "minimum number of documents in which a term should 
appear for filtering")
 
   setDefault(minDocFreq -> 0)
 

http://git-wip-us.apache.org/repos/asf/spark/blob/b1bc5ebd/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
index 141d3b9..dbbaa5a 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/PCA.scala
@@ -53,7 +53,8 @@ private[feature] trait PCAParams extends Params with 
HasInputCol with HasOutputC
 
 /**
  * :: Experimental ::
- * PCA trains a model to project vectors to a low-dimensional space using PCA.
+ * PCA trains a model to project vectors to a lower dimensional space of the 
top [[PCA!.k]]
+ * principal components.
  */
 @Experimental
 class PCA (override val uid: String) extends Estimator[PCAModel] with PCAParams
@@ -106,7 +107,7 @@ object PCA extends DefaultParamsReadable[PCA] {
 
 /**
  * :: Experimental ::
- * Model fitted by [[PCA]].
+ * Model fitted by [[PCA]]. Transforms vectors to a lower dimensional space.
  *
  * @param pc A principal components Matrix. Each column is one principal 
component.
  * @param explainedVariance A vector of proportions of variance explained by

http://git-wip-us.apache.org/repos/asf/spark/blob/b1bc5ebd/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
----------------------------------------------------------------------
diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
index c0feaa0..2916b6d 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/RFormula.scala
@@ -194,7 +194,9 @@ object RFormula extends DefaultParamsReadable[RFormula] {
 
 /**
  * :: Experimental ::
- * A fitted RFormula. Fitting is required to determine the factor levels of 
formula terms.
+ * Model fitted by [[RFormula]]. Fitting is required to determine the factor 
levels of
+ * formula terms.
+ *
  * @param resolvedFormula the fitted R formula.
  * @param pipelineModel the fitted feature model, including factor to index 
mappings.
  */

http://git-wip-us.apache.org/repos/asf/spark/blob/b1bc5ebd/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
----------------------------------------------------------------------
diff --git 
a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala 
b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
index 2bc9d22..d814528 100644
--- a/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
+++ b/mllib/src/main/scala/org/apache/spark/ml/feature/VectorIndexer.scala
@@ -240,7 +240,8 @@ object VectorIndexer extends 
DefaultParamsReadable[VectorIndexer] {
 
 /**
  * :: Experimental ::
- * Transform categorical features to use 0-based indices instead of their 
original values.
+ * Model fitted by [[VectorIndexer]]. Transform categorical features to use 
0-based indices
+ * instead of their original values.
  *  - Categorical features are mapped to indices.
  *  - Continuous features (columns) are left unchanged.
  * This also appends metadata to the output column, marking features as 
Numeric (continuous),

http://git-wip-us.apache.org/repos/asf/spark/blob/b1bc5ebd/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 983b6a5..497f2ad 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -352,7 +352,7 @@ class CountVectorizerModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by CountVectorizer.
+    Model fitted by :py:class:`CountVectorizer`.
 
     .. versionadded:: 1.6.0
     """
@@ -609,7 +609,7 @@ class IDF(JavaEstimator, HasInputCol, HasOutputCol, 
JavaMLReadable, JavaMLWritab
     """
 
     minDocFreq = Param(Params._dummy(), "minDocFreq",
-                       "minimum of documents in which a term should appear for 
filtering",
+                       "minimum number of documents in which a term should 
appear for filtering",
                        typeConverter=TypeConverters.toInt)
 
     @keyword_only
@@ -655,7 +655,7 @@ class IDFModel(JavaModel, JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by IDF.
+    Model fitted by :py:class:`IDF`.
 
     .. versionadded:: 1.4.0
     """
@@ -1302,7 +1302,8 @@ class RegexTokenizer(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable,
 
     minTokenLength = Param(Params._dummy(), "minTokenLength", "minimum token 
length (>= 0)",
                            typeConverter=TypeConverters.toInt)
-    gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) 
or matches tokens")
+    gaps = Param(Params._dummy(), "gaps", "whether regex splits on gaps (True) 
or matches tokens " +
+                 "(False)")
     pattern = Param(Params._dummy(), "pattern", "regex pattern (Java dialect) 
used for tokenizing",
                     typeConverter=TypeConverters.toString)
     toLowercase = Param(Params._dummy(), "toLowercase", "whether to convert 
all characters to " +
@@ -1549,7 +1550,7 @@ class StandardScalerModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by StandardScaler.
+    Model fitted by :py:class:`StandardScaler`.
 
     .. versionadded:: 1.4.0
     """
@@ -1641,7 +1642,7 @@ class StringIndexerModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by StringIndexer.
+    Model fitted by :py:class:`StringIndexer`.
 
     .. versionadded:: 1.4.0
     """
@@ -1907,7 +1908,7 @@ class VectorIndexer(JavaEstimator, HasInputCol, 
HasOutputCol, JavaMLReadable, Ja
     """
     .. note:: Experimental
 
-    Class for indexing categorical feature columns in a dataset of [[Vector]].
+    Class for indexing categorical feature columns in a dataset of `Vector`.
 
     This has 2 usage modes:
       - Automatically identify categorical features (default behavior)
@@ -2023,7 +2024,17 @@ class VectorIndexerModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by VectorIndexer.
+    Model fitted by :py:class:`VectorIndexer`.
+
+    Transform categorical features to use 0-based indices instead of their 
original values.
+      - Categorical features are mapped to indices.
+      - Continuous features (columns) are left unchanged.
+
+    This also appends metadata to the output column, marking features as 
Numeric (continuous),
+    Nominal (categorical), or Binary (either continuous or categorical).
+    Non-ML metadata is not carried over from the input to the output column.
+
+    This maintains vector sparsity.
 
     .. versionadded:: 1.4.0
     """
@@ -2296,7 +2307,7 @@ class Word2VecModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by Word2Vec.
+    Model fitted by :py:class:`Word2Vec`.
 
     .. versionadded:: 1.4.0
     """
@@ -2327,7 +2338,8 @@ class PCA(JavaEstimator, HasInputCol, HasOutputCol, 
JavaMLReadable, JavaMLWritab
     """
     .. note:: Experimental
 
-    PCA trains a model to project vectors to a low-dimensional space using PCA.
+    PCA trains a model to project vectors to a lower dimensional space of the
+    top :py:attr:`k` principal components.
 
     >>> from pyspark.ml.linalg import Vectors
     >>> data = [(Vectors.sparse(5, [(1, 1.0), (3, 7.0)]),),
@@ -2401,7 +2413,7 @@ class PCAModel(JavaModel, JavaMLReadable, JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by PCA.
+    Model fitted by :py:class:`PCA`. Transforms vectors to a lower dimensional 
space.
 
     .. versionadded:: 1.5.0
     """
@@ -2532,7 +2544,8 @@ class RFormulaModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by :py:class:`RFormula`.
+    Model fitted by :py:class:`RFormula`. Fitting is required to determine the
+    factor levels of formula terms.
 
     .. versionadded:: 1.5.0
     """
@@ -2624,7 +2637,7 @@ class ChiSqSelectorModel(JavaModel, JavaMLReadable, 
JavaMLWritable):
     """
     .. note:: Experimental
 
-    Model fitted by ChiSqSelector.
+    Model fitted by :py:class:`ChiSqSelector`.
 
     .. versionadded:: 2.0.0
     """


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [DOC][MINOR] ml.feature Scala and Python API sync

Reply via email to