Repository: spark
Updated Branches:
  refs/heads/master fe16fd0b8 -> 52ea399e6


[SPARK-10355] [ML] [PySpark] Add Python API for SQLTransformer

Add Python API for SQLTransformer

Author: Yanbo Liang <yblia...@gmail.com>

Closes #8527 from yanboliang/spark-10355.
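
For context, the new API wraps the existing Scala implementation (org.apache.spark.ml.feature.SQLTransformer): a SQL statement is supplied in which '__THIS__' stands in for the underlying table of the input DataFrame. A minimal usage sketch, assuming a SQLContext named sqlContext is already available (as in the doctest added by this patch):

    from pyspark.ml.feature import SQLTransformer

    # Input DataFrame with two numeric columns; the column names are illustrative.
    df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])

    # '__THIS__' is substituted with the table backing the input dataset.
    sqlTrans = SQLTransformer(
        statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")

    # transform() runs the statement and returns the resulting DataFrame,
    # here with the derived columns v3 and v4 appended.
    sqlTrans.transform(df).show()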


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/52ea399e
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/52ea399e
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/52ea399e

Branch: refs/heads/master
Commit: 52ea399e6ee37b7c44aae7709863e006fca88906
Parents: fe16fd0
Author: Yanbo Liang <yblia...@gmail.com>
Authored: Mon Aug 31 16:11:27 2015 -0700
Committer: Xiangrui Meng <m...@databricks.com>
Committed: Mon Aug 31 16:11:27 2015 -0700

----------------------------------------------------------------------
 python/pyspark/ml/feature.py | 57 ++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/52ea399e/python/pyspark/ml/feature.py
----------------------------------------------------------------------
diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py
index 59300a6..0626281 100644
--- a/python/pyspark/ml/feature.py
+++ b/python/pyspark/ml/feature.py
@@ -28,9 +28,9 @@ from pyspark.mllib.linalg import _convert_to_vector
 
 __all__ = ['Binarizer', 'Bucketizer', 'DCT', 'ElementwiseProduct', 'HashingTF', 'IDF', 'IDFModel',
            'NGram', 'Normalizer', 'OneHotEncoder', 'PolynomialExpansion', 'RegexTokenizer',
-           'StandardScaler', 'StandardScalerModel', 'StringIndexer', 'StringIndexerModel',
-           'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec', 'Word2VecModel',
-           'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
+           'SQLTransformer', 'StandardScaler', 'StandardScalerModel', 'StringIndexer',
+           'StringIndexerModel', 'Tokenizer', 'VectorAssembler', 'VectorIndexer', 'Word2Vec',
+           'Word2VecModel', 'PCA', 'PCAModel', 'RFormula', 'RFormulaModel']
 
 
 @inherit_doc
@@ -744,6 +744,57 @@ class RegexTokenizer(JavaTransformer, HasInputCol, HasOutputCol):
 
 
 @inherit_doc
+class SQLTransformer(JavaTransformer):
+    """
+    Implements the transformations that are defined by a SQL statement.
+    Currently we only support SQL syntax like 'SELECT ... FROM __THIS__',
+    where '__THIS__' represents the underlying table of the input dataset.
+
+    >>> df = sqlContext.createDataFrame([(0, 1.0, 3.0), (2, 2.0, 5.0)], ["id", "v1", "v2"])
+    >>> sqlTrans = SQLTransformer(
+    ...     statement="SELECT *, (v1 + v2) AS v3, (v1 * v2) AS v4 FROM __THIS__")
+    >>> sqlTrans.transform(df).head()
+    Row(id=0, v1=1.0, v2=3.0, v3=4.0, v4=3.0)
+    """
+
+    # a placeholder to make it appear in the generated doc
+    statement = Param(Params._dummy(), "statement", "SQL statement")
+
+    @keyword_only
+    def __init__(self, statement=None):
+        """
+        __init__(self, statement=None)
+        """
+        super(SQLTransformer, self).__init__()
+        self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.SQLTransformer", self.uid)
+        self.statement = Param(self, "statement", "SQL statement")
+        kwargs = self.__init__._input_kwargs
+        self.setParams(**kwargs)
+
+    @keyword_only
+    def setParams(self, statement=None):
+        """
+        setParams(self, statement=None)
+        Sets params for this SQLTransformer.
+        """
+        kwargs = self.setParams._input_kwargs
+        return self._set(**kwargs)
+
+    def setStatement(self, value):
+        """
+        Sets the value of :py:attr:`statement`.
+        """
+        self._paramMap[self.statement] = value
+        return self
+
+    def getStatement(self):
+        """
+        Gets the value of statement or its default value.
+        """
+        return self.getOrDefault(self.statement)
+
+
+@inherit_doc
 class StandardScaler(JavaEstimator, HasInputCol, HasOutputCol):
     """
     Standardizes features by removing the mean and scaling to unit variance using column summary

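Since statement is an ordinary Param on the new transformer, it can also be set or read after construction. A short illustration based only on the setStatement, setParams, and getStatement methods added in this patch:

    # The statement can be supplied later rather than in the constructor.
    sqlTrans = SQLTransformer()
    sqlTrans.setStatement("SELECT *, (v1 + v2) AS v3 FROM __THIS__")

    # setParams accepts the same keyword argument; getStatement reads the current value.
    sqlTrans.setParams(statement="SELECT *, (v1 - v2) AS diff FROM __THIS__")
    print(sqlTrans.getStatement())  # SELECT *, (v1 - v2) AS diff FROM __THIS__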
