This is an automated email from the ASF dual-hosted git repository.

ruifengz pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new d279dbf [SPARK-31243][ML][PYSPARK] Add ANOVATest and FValueTest to PySpark
d279dbf is described below

commit d279dbf09c18f37a2660dbc822763ebb54e4459a
Author: Huaxin Gao <huax...@us.ibm.com>
AuthorDate: Fri Mar 27 14:05:49 2020 +0800

    [SPARK-31243][ML][PYSPARK] Add ANOVATest and FValueTest to PySpark

    ### What changes were proposed in this pull request?
    Add ANOVATest and FValueTest to PySpark

    ### Why are the changes needed?
    Parity between Scala and Python.

    ### Does this PR introduce any user-facing change?
    Yes. Python ANOVATest and FValueTest

    ### How was this patch tested?
    doctest

    Closes #28012 from huaxingao/stats-python.

    Authored-by: Huaxin Gao <huax...@us.ibm.com>
    Signed-off-by: zhengruifeng <ruife...@foxmail.com>
---
 python/pyspark/ml/stat.py | 100 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 100 insertions(+)

diff --git a/python/pyspark/ml/stat.py b/python/pyspark/ml/stat.py
index 53a57af..0581469 100644
--- a/python/pyspark/ml/stat.py
+++ b/python/pyspark/ml/stat.py
@@ -411,6 +411,106 @@ class MultivariateGaussian(object):
self.cov = cov +class ANOVATest(object): + """ + Conduct ANOVA Classification Test for continuous features against categorical labels. + + .. versionadded:: 3.1.0 + """ + @staticmethod + @since("3.1.0") + def test(dataset, featuresCol, labelCol): + """ + Perform an ANOVA test using dataset. + + :param dataset: + DataFrame of categorical labels and continuous features. + :param featuresCol: + Name of features column in dataset, of type `Vector` (`VectorUDT`). + :param labelCol: + Name of label column in dataset, of any numerical type. + :return: + DataFrame containing the test result for every feature against the label. + This DataFrame will contain a single Row with the following fields: + - `pValues: Vector` + - `degreesOfFreedom: Array[Long]` + - `fValues: Vector` + Each of these fields has one value per feature. 
+ + >>> from pyspark.ml.linalg import Vectors + >>> from pyspark.ml.stat import ANOVATest + >>> dataset = [[2.0, Vectors.dense([0.43486404, 0.57153633, 0.43175686, + ... 0.51418671, 0.61632374, 0.96565515])], + ... [1.0, Vectors.dense([0.49162732, 0.6785187, 0.85460572, + ... 0.59784822, 0.12394819, 0.53783355])], + ... [2.0, Vectors.dense([0.30879653, 0.54904515, 0.17103889, + ... 0.40492506, 0.18957493, 0.5440016])], + ... [3.0, Vectors.dense([0.68114391, 0.60549825, 0.69094651, + ... 0.62102109, 0.05471483, 0.96449167])]] + >>> dataset = spark.createDataFrame(dataset, ["label", "features"]) + >>> anovaResult = ANOVATest.test(dataset, 'features', 'label') + >>> row = anovaResult.select("fValues", "pValues").collect() + >>> row[0].fValues + DenseVector([4.0264, 18.4713, 3.4659, 1.9042, 0.5532, 0.512]) + >>> row[0].pValues + DenseVector([0.3324, 0.1623, 0.3551, 0.456, 0.689, 0.7029]) + """ + sc = SparkContext._active_spark_context + javaTestObj = _jvm().org.apache.spark.ml.stat.ANOVATest + args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)] + return _java2py(sc, javaTestObj.test(*args)) + + +class FValueTest(object): + """ + Conduct F Regression test for continuous features against continuous labels. + + .. versionadded:: 3.1.0 + """ + @staticmethod + @since("3.1.0") + def test(dataset, featuresCol, labelCol): + """ + Perform a F Regression test using dataset. + + :param dataset: + DataFrame of continuous labels and continuous features. + :param featuresCol: + Name of features column in dataset, of type `Vector` (`VectorUDT`). + :param labelCol: + Name of label column in dataset, of any numerical type. + :return: + DataFrame containing the test result for every feature against the label. + This DataFrame will contain a single Row with the following fields: + - `pValues: Vector` + - `degreesOfFreedom: Array[Long]` + - `fValues: Vector` + Each of these fields has one value per feature. 
+ + >>> from pyspark.ml.linalg import Vectors + >>> from pyspark.ml.stat import FValueTest + >>> dataset = [[0.57495218, Vectors.dense([0.43486404, 0.57153633, 0.43175686, + ... 0.51418671, 0.61632374, 0.96565515])], + ... [0.84619853, Vectors.dense([0.49162732, 0.6785187, 0.85460572, + ... 0.59784822, 0.12394819, 0.53783355])], + ... [0.39777647, Vectors.dense([0.30879653, 0.54904515, 0.17103889, + ... 0.40492506, 0.18957493, 0.5440016])], + ... [0.79201573, Vectors.dense([0.68114391, 0.60549825, 0.69094651, + ... 0.62102109, 0.05471483, 0.96449167])]] + >>> dataset = spark.createDataFrame(dataset, ["label", "features"]) + >>> fValueResult = FValueTest.test(dataset, 'features', 'label') + >>> row = fValueResult.select("fValues", "pValues").collect() + >>> row[0].fValues + DenseVector([3.741, 7.5807, 142.0684, 34.9849, 0.4112, 0.0539]) + >>> row[0].pValues + DenseVector([0.1928, 0.1105, 0.007, 0.0274, 0.5871, 0.838]) + """ + sc = SparkContext._active_spark_context + javaTestObj = _jvm().org.apache.spark.ml.stat.FValueTest + args = [_py2java(sc, arg) for arg in (dataset, featuresCol, labelCol)] + return _java2py(sc, javaTestObj.test(*args)) + + if __name__ == "__main__": import doctest import numpy
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org