Repository: spark
Updated Branches:
  refs/heads/master 339a52714 -> a140dd77c


[SPARK-10027] [ML] [PySpark] Add Python API missing methods for ml.feature

Missing methods of ml.feature are listed here:
```StringIndexer``` lacks the parameter ```handleInvalid```.
```StringIndexerModel``` lacks the method ```labels```.
```VectorIndexerModel``` lacks the methods ```numFeatures``` and ```categoryMaps```.
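
For reference, a minimal usage sketch of the new Python API (hedged: the data and column names mirror the doctests in the patch below, and a SQLContext named ```sqlContext``` is assumed to be in scope):

```python
# Sketch of the new API surface; assumes a SQLContext named sqlContext,
# as in the doctests in feature.py below.
from pyspark.ml.feature import StringIndexer, VectorIndexer
from pyspark.mllib.linalg import Vectors

# handleInvalid is now settable from Python ("error" is the default).
indexer = StringIndexer(inputCol="label", outputCol="labelIndex",
                        handleInvalid="skip")

# numFeatures and categoryMaps are now exposed on VectorIndexerModel.
df = sqlContext.createDataFrame(
    [(Vectors.dense([-1.0, 0.0]),), (Vectors.dense([0.0, 1.0]),)], ["a"])
model = VectorIndexer(maxCategories=2, inputCol="a", outputCol="indexed").fit(df)
model.numFeatures   # 2
model.categoryMaps  # {0: {0.0: 0, -1.0: 1}}
```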

Author: Yanbo Liang <yblia...@gmail.com>

Closes #8313 from yanboliang/spark-10027.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a140dd77
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a140dd77
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a140dd77

Branch: refs/heads/master
Commit: a140dd77c62255d6f7f6817a2517d47feb8540d4
Parents: 339a527
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Thu Sep 10 20:43:38 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Thu Sep 10 20:43:38 2015 -0700

----------------------------------------------------------------------
 python/pyspark/ml/feature.py                    | 31 ++++++++++++++++----
 .../pyspark/ml/param/_shared_params_code_gen.py |  5 +++-
 python/pyspark/ml/param/shared.py               | 31 ++++++++++++++++++--
 3 files changed, 59 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/a140dd77/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 1c42348..71dc636 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -920,7 +920,7 @@ class StandardScalerModel(JavaModel):
 
 
 @inherit_doc
-class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol):
+class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol, HasHandleInvalid):
     """
     .. note:: Experimental
 
@@ -943,19 +943,20 @@ class StringIndexer(JavaEstimator, HasInputCol, HasOutputCol):
     """
 
     @keyword_only
-    def __init__(self, inputCol=None, outputCol=None):
+    def __init__(self, inputCol=None, outputCol=None, handleInvalid="error"):
         """
-        __init__(self, inputCol=None, outputCol=None)
+        __init__(self, inputCol=None, outputCol=None, handleInvalid="error")
         """
         super(StringIndexer, self).__init__()
         self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StringIndexer", self.uid)
+        self._setDefault(handleInvalid="error")
         kwargs = self.__init__._input_kwargs
         self.setParams(**kwargs)
 
     @keyword_only
-    def setParams(self, inputCol=None, outputCol=None):
+    def setParams(self, inputCol=None, outputCol=None, handleInvalid="error"):
         """
-        setParams(self, inputCol=None, outputCol=None)
+        setParams(self, inputCol=None, outputCol=None, handleInvalid="error")
         Sets params for this StringIndexer.
         """
         kwargs = self.setParams._input_kwargs
@@ -1235,6 +1236,10 @@ class VectorIndexer(JavaEstimator, HasInputCol, HasOutputCol):
     >>> model = indexer.fit(df)
     >>> model.transform(df).head().indexed
     DenseVector([1.0, 0.0])
+    >>> model.numFeatures
+    2
+    >>> model.categoryMaps
+    {0: {0.0: 0, -1.0: 1}}
     >>> indexer.setParams(outputCol="test").fit(df).transform(df).collect()[1].test
     DenseVector([0.0, 1.0])
     >>> params = {indexer.maxCategories: 3, indexer.outputCol: "vector"}
@@ -1297,6 +1302,22 @@ class VectorIndexerModel(JavaModel):
     Model fitted by VectorIndexer.
     """
 
+    @property
+    def numFeatures(self):
+        """
+        Number of features, i.e., length of Vectors which this transforms.
+        """
+        return self._call_java("numFeatures")
+
+    @property
+    def categoryMaps(self):
+        """
+        Feature value index. Keys are categorical feature indices (column indices).
+        Values are maps from original feature values to 0-based category indices.
+        If a feature is not in this map, it is treated as continuous.
+        """
+        return self._call_java("javaCategoryMaps")
+
 
 @inherit_doc
 class VectorSlicer(JavaTransformer, HasInputCol, HasOutputCol):
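
For context, a hedged sketch of what the new ```handleInvalid``` parameter controls at transform time (made-up data; ```sqlContext``` assumed in scope as above):

```python
# Sketch: "skip" filters rows whose label was unseen during fit;
# "error" (the default) raises on such rows instead.
from pyspark.ml.feature import StringIndexer

train = sqlContext.createDataFrame([("a",), ("b",)], ["label"])
test = sqlContext.createDataFrame([("a",), ("c",)], ["label"])  # "c" is unseen

model = StringIndexer(inputCol="label", outputCol="labelIndex",
                      handleInvalid="skip").fit(train)
model.transform(test).count()  # 1: the row with the unseen label "c" is dropped

# With handleInvalid="error", transforming `test` would raise on "c".
```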

http://git-wip-us.apache.org/repos/asf/spark/blob/a140dd77/python/pyspark/ml/param/_shared_params_code_gen.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py b/python/pyspark/ml/param/_shared_params_code_gen.py
index 69efc42..926375e 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -121,7 +121,10 @@ if __name__ == "__main__":
         ("checkpointInterval", "checkpoint interval (>= 1)", None),
         ("seed", "random seed", "hash(type(self).__name__)"),
         ("tol", "the convergence tolerance for iterative algorithms", None),
-        ("stepSize", "Step size to be used for each iteration of 
optimization.", None)]
+        ("stepSize", "Step size to be used for each iteration of 
optimization.", None),
+        ("handleInvalid", "how to handle invalid entries. Options are skip 
(which will filter " +
+         "out rows with bad values), or error (which will throw an errror). 
More options may be " +
+         "added later.", None)]
     code = []
     for name, doc, defaultValueStr in shared:
         param_code = _gen_param_header(name, doc, defaultValueStr)
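
For readers unfamiliar with this file: each ```(name, doc, defaultValueStr)``` tuple above is expanded into a ```Has<Name>``` mixin like the one added to shared.py below. A simplified sketch of that code-generation pattern (not the actual ```_gen_param_header```/```_gen_param_code``` templates):

```python
# Simplified illustration of the shared-params code-gen pattern.
# The real templates live in _shared_params_code_gen.py.
template = '''class Has{Name}(Params):
    """
    Mixin for param {name}: {doc}
    """

    {name} = Param(Params._dummy(), "{name}", "{doc}")

    def __init__(self):
        super(Has{Name}, self).__init__()
        self.{name} = Param(self, "{name}", "{doc}")

    def set{Name}(self, value):
        self._paramMap[self.{name}] = value
        return self

    def get{Name}(self):
        return self.getOrDefault(self.{name})
'''


def gen_mixin(name, doc):
    # Upper-case only the first letter, preserving camelCase.
    return template.format(Name=name[0].upper() + name[1:], name=name, doc=doc)


print(gen_mixin("handleInvalid", "how to handle invalid entries."))
```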

http://git-wip-us.apache.org/repos/asf/spark/blob/a140dd77/python/pyspark/ml/param/shared.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/shared.py b/python/pyspark/ml/param/shared.py
index 5951247..682170a 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -432,6 +432,33 @@ class HasStepSize(Params):
         return self.getOrDefault(self.stepSize)
 
 
+class HasHandleInvalid(Params):
+    """
+    Mixin for param handleInvalid: how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an error). More options may be added later.
+    """
+
+    # a placeholder to make it appear in the generated doc
+    handleInvalid = Param(Params._dummy(), "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an error). More options may be added later.")
+
+    def __init__(self):
+        super(HasHandleInvalid, self).__init__()
+        #: param for how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an error). More options may be added later.
+        self.handleInvalid = Param(self, "handleInvalid", "how to handle invalid entries. Options are skip (which will filter out rows with bad values), or error (which will throw an error). More options may be added later.")
+
+    def setHandleInvalid(self, value):
+        """
+        Sets the value of :py:attr:`handleInvalid`.
+        """
+        self._paramMap[self.handleInvalid] = value
+        return self
+
+    def getHandleInvalid(self):
+        """
+        Gets the value of handleInvalid or its default value.
+        """
+        return self.getOrDefault(self.handleInvalid)
+
+
 class DecisionTreeParams(Params):
     """
     Mixin for Decision Tree parameters.
@@ -444,7 +471,7 @@ class DecisionTreeParams(Params):
     minInfoGain = Param(Params._dummy(), "minInfoGain", "Minimum information gain for a split to be considered at a tree node.")
     maxMemoryInMB = Param(Params._dummy(), "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
     cacheNodeIds = Param(Params._dummy(), "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-
+    
 
     def __init__(self):
         super(DecisionTreeParams, self).__init__()
@@ -460,7 +487,7 @@ class DecisionTreeParams(Params):
         self.maxMemoryInMB = Param(self, "maxMemoryInMB", "Maximum memory in MB allocated to histogram aggregation.")
         #: param for If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.
         self.cacheNodeIds = Param(self, "cacheNodeIds", "If false, the algorithm will pass trees to executors to match instances with nodes. If true, the algorithm will cache node IDs for each instance. Caching can speed up training of deeper trees.")
-
+        
     def setMaxDepth(self, value):
         """
         Sets the value of :py:attr:`maxDepth`.
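
Since ```StringIndexer``` now mixes in ```HasHandleInvalid```, the generated setter and getter follow the usual param precedence rules; roughly (illustrative):

```python
# Illustrative: explicit values in _paramMap win over defaults.
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="label", outputCol="labelIndex")
indexer.getHandleInvalid()        # "error", the default set in __init__
indexer.setHandleInvalid("skip")  # stores the value in the instance's _paramMap
indexer.getHandleInvalid()        # "skip"
```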

