Repository: spark Updated Branches: refs/heads/master a9676cc71 -> 1d04dc95c
[SPARK-11467][SQL] add Python API for stddev/variance Add Python API for stddev/stddev_pop/stddev_samp/variance/var_pop/var_samp/skewness/kurtosis Author: Davies Liu <dav...@databricks.com> Closes #9424 from davies/py_var. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d04dc95 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d04dc95 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d04dc95 Branch: refs/heads/master Commit: 1d04dc95c0d3caa485936e65b0493bcc9719f27e Parents: a9676cc Author: Davies Liu <dav...@databricks.com> Authored: Tue Nov 3 13:33:46 2015 -0800 Committer: Reynold Xin <r...@databricks.com> Committed: Tue Nov 3 13:33:46 2015 -0800 ---------------------------------------------------------------------- python/pyspark/sql/functions.py | 17 ++++ python/pyspark/sql/group.py | 88 ++++++++++++++++++++ .../scala/org/apache/spark/sql/functions.scala | 67 --------------- 3 files changed, 105 insertions(+), 67 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/1d04dc95/python/pyspark/sql/functions.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py index fa04f4c..2f7c2f4 100644 --- a/python/pyspark/sql/functions.py +++ b/python/pyspark/sql/functions.py @@ -122,6 +122,21 @@ _functions_1_4 = { 'bitwiseNOT': 'Computes bitwise not.', } +_functions_1_6 = { + # aggregate functions (statistical moments) + "stddev": "Aggregate function: returns the unbiased sample standard deviation of" + + " the expression in a group.", + "stddev_samp": "Aggregate function: returns the unbiased sample standard deviation of" + + " the expression in a group.", + "stddev_pop": "Aggregate function: returns population standard deviation of" + + " the expression in a group.", + "variance": "Aggregate function: returns the population variance of 
the values in a group.", + "var_samp": "Aggregate function: returns the unbiased variance of the values in a group.", + "var_pop": "Aggregate function: returns the population variance of the values in a group.", + "skewness": "Aggregate function: returns the skewness of the values in a group.", + "kurtosis": "Aggregate function: returns the kurtosis of the values in a group." +} + # math functions that take two arguments as input _binary_mathfunctions = { 'atan2': 'Returns the angle theta from the conversion of rectangular coordinates (x, y) to' + @@ -172,6 +187,8 @@ for _name, _doc in _binary_mathfunctions.items(): globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc)) for _name, _doc in _window_functions.items(): globals()[_name] = since(1.4)(_create_window_function(_name, _doc)) +for _name, _doc in _functions_1_6.items(): + globals()[_name] = since(1.6)(_create_function(_name, _doc)) del _name, _doc http://git-wip-us.apache.org/repos/asf/spark/blob/1d04dc95/python/pyspark/sql/group.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 71c0bcc..946b53e 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -167,6 +167,94 @@ class GroupedData(object): [Row(sum(age)=7, sum(height)=165)] """ + @df_varargs_api + @since(1.6) + def stddev(self, *cols): + """Compute the sample standard deviation for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().stddev('age', 'height').collect() + [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)] + """ + + @df_varargs_api + @since(1.6) + def stddev_samp(self, *cols): + """Compute the sample standard deviation for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. 
+ + >>> df3.groupBy().stddev_samp('age', 'height').collect() + [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)] + """ + + @df_varargs_api + @since(1.6) + def stddev_pop(self, *cols): + """Compute the population standard deviation for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().stddev_pop('age', 'height').collect() + [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)] + """ + + @df_varargs_api + @since(1.6) + def variance(self, *cols): + """Compute the population variance for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().variance('age', 'height').collect() + [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)] + """ + + @df_varargs_api + @since(1.6) + def var_pop(self, *cols): + """Compute the population variance for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().var_pop('age', 'height').collect() + [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)] + """ + + @df_varargs_api + @since(1.6) + def var_samp(self, *cols): + """Compute the sample variance for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().var_samp('age', 'height').collect() + [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)] + """ + + @df_varargs_api + @since(1.6) + def skewness(self, *cols): + """Compute the skewness for each numeric columns for each group. + + :param cols: list of column names (string). Non-numeric columns are ignored. + + >>> df3.groupBy().skewness('age', 'height').collect() + [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)] + """ + + @df_varargs_api + @since(1.6) + def kurtosis(self, *cols): + """Compute the kurtosis for each numeric columns for each group. + + :param cols: list of column names (string). 
Non-numeric columns are ignored. + + >>> df3.groupBy().kurtosis('age', 'height').collect() + [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)] + """ + def _test(): import doctest http://git-wip-us.apache.org/repos/asf/spark/blob/1d04dc95/sql/core/src/main/scala/org/apache/spark/sql/functions.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala index 5a5c695..c8c5283 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala @@ -255,14 +255,6 @@ object functions { def kurtosis(e: Column): Column = Kurtosis(e.expr) /** - * Aggregate function: returns the kurtosis of the values in a group. - * - * @group agg_funcs - * @since 1.6.0 - */ - def kurtosis(columnName: String): Column = kurtosis(Column(columnName)) - - /** * Aggregate function: returns the last value in a group. * * @group agg_funcs @@ -337,14 +329,6 @@ object functions { def skewness(e: Column): Column = Skewness(e.expr) /** - * Aggregate function: returns the skewness of the values in a group. - * - * @group agg_funcs - * @since 1.6.0 - */ - def skewness(columnName: String): Column = skewness(Column(columnName)) - - /** * Aggregate function: returns the unbiased sample standard deviation of * the expression in a group. * @@ -360,27 +344,9 @@ object functions { * @group agg_funcs * @since 1.6.0 */ - def stddev(columnName: String): Column = stddev(Column(columnName)) - - /** - * Aggregate function: returns the unbiased sample standard deviation of - * the expression in a group. - * - * @group agg_funcs - * @since 1.6.0 - */ def stddev_samp(e: Column): Column = StddevSamp(e.expr) /** - * Aggregate function: returns the unbiased sample standard deviation of - * the expression in a group. 
- * - * @group agg_funcs - * @since 1.6.0 - */ - def stddev_samp(columnName: String): Column = stddev_samp(Column(columnName)) - - /** * Aggregate function: returns the population standard deviation of * the expression in a group. * @@ -390,15 +356,6 @@ object functions { def stddev_pop(e: Column): Column = StddevPop(e.expr) /** - * Aggregate function: returns the population standard deviation of - * the expression in a group. - * - * @group agg_funcs - * @since 1.6.0 - */ - def stddev_pop(columnName: String): Column = stddev_pop(Column(columnName)) - - /** * Aggregate function: returns the sum of all values in the expression. * * @group agg_funcs @@ -439,14 +396,6 @@ object functions { def variance(e: Column): Column = Variance(e.expr) /** - * Aggregate function: returns the population variance of the values in a group. - * - * @group agg_funcs - * @since 1.6.0 - */ - def variance(columnName: String): Column = variance(Column(columnName)) - - /** * Aggregate function: returns the unbiased variance of the values in a group. * * @group agg_funcs @@ -455,14 +404,6 @@ object functions { def var_samp(e: Column): Column = VarianceSamp(e.expr) /** - * Aggregate function: returns the unbiased variance of the values in a group. - * - * @group agg_funcs - * @since 1.6.0 - */ - def var_samp(columnName: String): Column = var_samp(Column(columnName)) - - /** * Aggregate function: returns the population variance of the values in a group. * * @group agg_funcs @@ -470,14 +411,6 @@ object functions { */ def var_pop(e: Column): Column = VariancePop(e.expr) - /** - * Aggregate function: returns the population variance of the values in a group. 
- * - * @group agg_funcs - * @since 1.6.0 - */ - def var_pop(columnName: String): Column = var_pop(Column(columnName)) - ////////////////////////////////////////////////////////////////////////////////////////////// // Window functions ////////////////////////////////////////////////////////////////////////////////////////////// --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org