Repository: spark
Updated Branches:
  refs/heads/branch-2.3 6bb22961c -> 55695c712


[SPARK-23008][ML] OnehotEncoderEstimator python API

## What changes were proposed in this pull request?

OnehotEncoderEstimator python API.

## How was this patch tested?

doctest

Author: WeichenXu <weichen...@databricks.com>

Closes #20209 from WeichenXu123/ohe_py.

(cherry picked from commit b5042d75c2faa5f15bc1e160d75f06dfdd6eea37)
Signed-off-by: Joseph K. Bradley <jos...@databricks.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/55695c71
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/55695c71
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/55695c71

Branch: refs/heads/branch-2.3
Commit: 55695c7127cb2f357dfdf677cab4d21fc840aa3d
Parents: 6bb2296
Author: WeichenXu <weichen...@databricks.com>
Authored: Thu Jan 11 16:20:30 2018 -0800
Committer: Joseph K. Bradley <jos...@databricks.com>
Committed: Thu Jan 11 16:20:41 2018 -0800

----------------------------------------------------------------------
 python/pyspark/ml/feature.py                    | 113 +++++++++++++++++++
 .../pyspark/ml/param/_shared_params_code_gen.py |   1 +
 python/pyspark/ml/param/shared.py               |  23 ++++
 3 files changed, 137 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/55695c71/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 13bf95c..b963e45 100755
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -45,6 +45,7 @@ __all__ = ['Binarizer',
            'NGram',
            'Normalizer',
            'OneHotEncoder',
+           'OneHotEncoderEstimator', 'OneHotEncoderModel',
            'PCA', 'PCAModel',
            'PolynomialExpansion',
            'QuantileDiscretizer',
@@ -1642,6 +1643,118 @@ class OneHotEncoder(JavaTransformer, HasInputCol, 
HasOutputCol, JavaMLReadable,
 
 
 @inherit_doc
+class OneHotEncoderEstimator(JavaEstimator, HasInputCols, HasOutputCols, 
HasHandleInvalid,
+                             JavaMLReadable, JavaMLWritable):
+    """
+    A one-hot encoder that maps a column of category indices to a column of 
binary vectors, with
+    at most a single one-value per row that indicates the input category index.
+    For example with 5 categories, an input value of 2.0 would map to an 
output vector of
+    `[0.0, 0.0, 1.0, 0.0]`.
+    The last category is not included by default (configurable via `dropLast`),
+    because it makes the vector entries sum up to one, and hence linearly 
dependent.
+    So an input value of 4.0 maps to `[0.0, 0.0, 0.0, 0.0]`.
+
+    Note: This is different from scikit-learn's OneHotEncoder, which keeps all 
categories.
+    The output vectors are sparse.
+
+    When `handleInvalid` is configured to 'keep', an extra "category" 
indicating invalid values is
+    added as last category. So when `dropLast` is true, invalid values are 
encoded as all-zeros
+    vector.
+
+    Note: When encoding multi-column by using `inputCols` and `outputCols` 
params, input/output
+    cols come in pairs, specified by the order in the arrays, and each pair is 
treated
+    independently.
+
+    See `StringIndexer` for converting categorical values into category indices
+
+    >>> from pyspark.ml.linalg import Vectors
+    >>> df = spark.createDataFrame([(0.0,), (1.0,), (2.0,)], ["input"])
+    >>> ohe = OneHotEncoderEstimator(inputCols=["input"], 
outputCols=["output"])
+    >>> model = ohe.fit(df)
+    >>> model.transform(df).head().output
+    SparseVector(2, {0: 1.0})
+    >>> ohePath = temp_path + "/oheEstimator"
+    >>> ohe.save(ohePath)
+    >>> loadedOHE = OneHotEncoderEstimator.load(ohePath)
+    >>> loadedOHE.getInputCols() == ohe.getInputCols()
+    True
+    >>> modelPath = temp_path + "/ohe-model"
+    >>> model.save(modelPath)
+    >>> loadedModel = OneHotEncoderModel.load(modelPath)
+    >>> loadedModel.categorySizes == model.categorySizes
+    True
+
+    .. versionadded:: 2.3.0
+    """
+
+    handleInvalid = Param(Params._dummy(), "handleInvalid", "How to handle 
invalid data during " +
+                          "transform(). Options are 'keep' (invalid data 
presented as an extra " +
+                          "categorical feature) or error (throw an error). 
Note that this Param " +
+                          "is only used during transform; during fitting, 
invalid data will " +
+                          "result in an error.",
+                          typeConverter=TypeConverters.toString)
+
+    dropLast = Param(Params._dummy(), "dropLast", "whether to drop the last 
category",
+                     typeConverter=TypeConverters.toBoolean)
+
+    @keyword_only
+    def __init__(self, inputCols=None, outputCols=None, handleInvalid="error", 
dropLast=True):
+        """
+        __init__(self, inputCols=None, outputCols=None, handleInvalid="error", 
dropLast=True)
+        """
+        super(OneHotEncoderEstimator, self).__init__()
+        self._java_obj = self._new_java_obj(
+            "org.apache.spark.ml.feature.OneHotEncoderEstimator", self.uid)
+        self._setDefault(handleInvalid="error", dropLast=True)
+        kwargs = self._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    @since("2.3.0")
+    def setParams(self, inputCols=None, outputCols=None, 
handleInvalid="error", dropLast=True):
+        """
+        setParams(self, inputCols=None, outputCols=None, 
handleInvalid="error", dropLast=True)
+        Sets params for this OneHotEncoderEstimator.
+        """
+        kwargs = self._input_kwargs
+        return self._set(**kwargs)
+
+    @since("2.3.0")
+    def setDropLast(self, value):
+        """
+        Sets the value of :py:attr:`dropLast`.
+        """
+        return self._set(dropLast=value)
+
+    @since("2.3.0")
+    def getDropLast(self):
+        """
+        Gets the value of dropLast or its default value.
+        """
+        return self.getOrDefault(self.dropLast)
+
+    def _create_model(self, java_model):
+        return OneHotEncoderModel(java_model)
+
+
+class OneHotEncoderModel(JavaModel, JavaMLReadable, JavaMLWritable):
+    """
+    Model fitted by :py:class:`OneHotEncoderEstimator`.
+
+    .. versionadded:: 2.3.0
+    """
+
+    @property
+    @since("2.3.0")
+    def categorySizes(self):
+        """
+        Original number of categories for each feature being encoded.
+        The array contains one value for each input column, in order.
+        """
+        return self._call_java("categorySizes")
+
+
+@inherit_doc
 class PolynomialExpansion(JavaTransformer, HasInputCol, HasOutputCol, 
JavaMLReadable,
                           JavaMLWritable):
     """

http://git-wip-us.apache.org/repos/asf/spark/blob/55695c71/python/pyspark/ml/param/_shared_params_code_gen.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/_shared_params_code_gen.py 
b/python/pyspark/ml/param/_shared_params_code_gen.py
index 1d0f60a..db951d8 100644
--- a/python/pyspark/ml/param/_shared_params_code_gen.py
+++ b/python/pyspark/ml/param/_shared_params_code_gen.py
@@ -119,6 +119,7 @@ if __name__ == "__main__":
         ("inputCol", "input column name.", None, "TypeConverters.toString"),
         ("inputCols", "input column names.", None, 
"TypeConverters.toListString"),
         ("outputCol", "output column name.", "self.uid + '__output'", 
"TypeConverters.toString"),
+        ("outputCols", "output column names.", None, 
"TypeConverters.toListString"),
         ("numFeatures", "number of features.", None, "TypeConverters.toInt"),
         ("checkpointInterval", "set checkpoint interval (>= 1) or disable 
checkpoint (-1). " +
          "E.g. 10 means that the cache will get checkpointed every 10 
iterations. Note: " +

http://git-wip-us.apache.org/repos/asf/spark/blob/55695c71/python/pyspark/ml/param/shared.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/param/shared.py 
b/python/pyspark/ml/param/shared.py
index 813f7a5..474c387 100644
--- a/python/pyspark/ml/param/shared.py
+++ b/python/pyspark/ml/param/shared.py
@@ -256,6 +256,29 @@ class HasOutputCol(Params):
         return self.getOrDefault(self.outputCol)
 
 
+class HasOutputCols(Params):
+    """
+    Mixin for param outputCols: output column names.
+    """
+
+    outputCols = Param(Params._dummy(), "outputCols", "output column names.", 
typeConverter=TypeConverters.toListString)
+
+    def __init__(self):
+        super(HasOutputCols, self).__init__()
+
+    def setOutputCols(self, value):
+        """
+        Sets the value of :py:attr:`outputCols`.
+        """
+        return self._set(outputCols=value)
+
+    def getOutputCols(self):
+        """
+        Gets the value of outputCols or its default value.
+        """
+        return self.getOrDefault(self.outputCols)
+
+
 class HasNumFeatures(Params):
     """
     Mixin for param numFeatures: number of features.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to