spark git commit: [SPARK-11467][SQL] add Python API for stddev/variance

rxin Tue, 03 Nov 2015 13:35:09 -0800

Repository: spark
Updated Branches:
  refs/heads/master a9676cc71 -> 1d04dc95c



[SPARK-11467][SQL] add Python API for stddev/variance

Add Python API for 
stddev/stddev_pop/stddev_samp/variance/var_pop/var_samp/skewness/kurtosis

Author: Davies Liu <dav...@databricks.com>

Closes #9424 from davies/py_var.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/1d04dc95
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/1d04dc95
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/1d04dc95

Branch: refs/heads/master
Commit: 1d04dc95c0d3caa485936e65b0493bcc9719f27e
Parents: a9676cc
Author: Davies Liu <dav...@databricks.com>
Authored: Tue Nov 3 13:33:46 2015 -0800
Committer: Reynold Xin <r...@databricks.com>
Committed: Tue Nov 3 13:33:46 2015 -0800

----------------------------------------------------------------------
 python/pyspark/sql/functions.py                 | 17 ++++
 python/pyspark/sql/group.py                     | 88 ++++++++++++++++++++
 .../scala/org/apache/spark/sql/functions.scala  | 67 ---------------
 3 files changed, 105 insertions(+), 67 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/1d04dc95/python/pyspark/sql/functions.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/functions.py b/python/pyspark/sql/functions.py
index fa04f4c..2f7c2f4 100644
--- a/python/pyspark/sql/functions.py
+++ b/python/pyspark/sql/functions.py
@@ -122,6 +122,21 @@ _functions_1_4 = {
     'bitwiseNOT': 'Computes bitwise not.',
 }
 
+_functions_1_6 = {
+    # aggregate functions
+    "stddev": "Aggregate function: returns the unbiased sample standard 
deviation of" +
+              " the expression in a group.",
+    "stddev_samp": "Aggregate function: returns the unbiased sample standard 
deviation of" +
+              " the expression in a group.",
+    "stddev_pop": "Aggregate function: returns population standard deviation 
of" +
+              " the expression in a group.",
+    "variance": "Aggregate function: returns the population variance of the 
values in a group.",
+    "var_samp": "Aggregate function: returns the unbiased variance of the 
values in a group.",
+    "var_pop":  "Aggregate function: returns the population variance of the 
values in a group.",
+    "skewness": "Aggregate function: returns the skewness of the values in a 
group.",
+    "kurtosis": "Aggregate function: returns the kurtosis of the values in a 
group."
+}
+
 # math functions that take two arguments as input
 _binary_mathfunctions = {
     'atan2': 'Returns the angle theta from the conversion of rectangular 
coordinates (x, y) to' +
@@ -172,6 +187,8 @@ for _name, _doc in _binary_mathfunctions.items():
     globals()[_name] = since(1.4)(_create_binary_mathfunction(_name, _doc))
 for _name, _doc in _window_functions.items():
     globals()[_name] = since(1.4)(_create_window_function(_name, _doc))
+for _name, _doc in _functions_1_6.items():
+    globals()[_name] = since(1.6)(_create_function(_name, _doc))
 del _name, _doc
 
 

http://git-wip-us.apache.org/repos/asf/spark/blob/1d04dc95/python/pyspark/sql/group.py
----------------------------------------------------------------------
diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py
index 71c0bcc..946b53e 100644
--- a/python/pyspark/sql/group.py
+++ b/python/pyspark/sql/group.py
@@ -167,6 +167,94 @@ class GroupedData(object):
         [Row(sum(age)=7, sum(height)=165)]
         """
 
+    @df_varargs_api
+    @since(1.6)
+    def stddev(self, *cols):
+        """Compute the sample standard deviation for each numeric column for 
each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().stddev('age', 'height').collect()
+        [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def stddev_samp(self, *cols):
+        """Compute the sample standard deviation for each numeric column for 
each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().stddev_samp('age', 'height').collect()
+        [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def stddev_pop(self, *cols):
+        """Compute the population standard deviation for each numeric column 
for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().stddev_pop('age', 'height').collect()
+        [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def variance(self, *cols):
+        """Compute the population variance for each numeric column for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().variance('age', 'height').collect()
+        [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def var_pop(self, *cols):
+        """Compute the population variance for each numeric column for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().var_pop('age', 'height').collect()
+        [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def var_samp(self, *cols):
+        """Compute the sample variance for each numeric column for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().var_samp('age', 'height').collect()
+        [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def skewness(self, *cols):
+        """Compute the skewness for each numeric column for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().skewness('age', 'height').collect()
+        [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)]
+        """
+
+    @df_varargs_api
+    @since(1.6)
+    def kurtosis(self, *cols):
+        """Compute the kurtosis for each numeric column for each group.
+
+        :param cols: list of column names (string). Non-numeric columns are 
ignored.
+
+        >>> df3.groupBy().kurtosis('age', 'height').collect()
+        [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)]
+        """
+
 
 def _test():
     import doctest

http://git-wip-us.apache.org/repos/asf/spark/blob/1d04dc95/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala 
b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
index 5a5c695..c8c5283 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/functions.scala
@@ -255,14 +255,6 @@ object functions {
   def kurtosis(e: Column): Column = Kurtosis(e.expr)
 
   /**
-   * Aggregate function: returns the kurtosis of the values in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def kurtosis(columnName: String): Column = kurtosis(Column(columnName))
-
-  /**
    * Aggregate function: returns the last value in a group.
    *
    * @group agg_funcs
@@ -337,14 +329,6 @@ object functions {
   def skewness(e: Column): Column = Skewness(e.expr)
 
   /**
-   * Aggregate function: returns the skewness of the values in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def skewness(columnName: String): Column = skewness(Column(columnName))
-
-  /**
    * Aggregate function: returns the unbiased sample standard deviation of
    * the expression in a group.
    *
@@ -360,27 +344,9 @@ object functions {
    * @group agg_funcs
    * @since 1.6.0
    */
-  def stddev(columnName: String): Column = stddev(Column(columnName))
-
-  /**
-   * Aggregate function: returns the unbiased sample standard deviation of
-   * the expression in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
   def stddev_samp(e: Column): Column = StddevSamp(e.expr)
 
   /**
-   * Aggregate function: returns the unbiased sample standard deviation of
-   * the expression in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def stddev_samp(columnName: String): Column = stddev_samp(Column(columnName))
-
-  /**
    * Aggregate function: returns the population standard deviation of
    * the expression in a group.
    *
@@ -390,15 +356,6 @@ object functions {
   def stddev_pop(e: Column): Column = StddevPop(e.expr)
 
   /**
-   * Aggregate function: returns the population standard deviation of
-   * the expression in a group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def stddev_pop(columnName: String): Column = stddev_pop(Column(columnName))
-
-  /**
    * Aggregate function: returns the sum of all values in the expression.
    *
    * @group agg_funcs
@@ -439,14 +396,6 @@ object functions {
   def variance(e: Column): Column = Variance(e.expr)
 
   /**
-   * Aggregate function: returns the population variance of the values in a 
group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def variance(columnName: String): Column = variance(Column(columnName))
-
-  /**
    * Aggregate function: returns the unbiased variance of the values in a 
group.
    *
    * @group agg_funcs
@@ -455,14 +404,6 @@ object functions {
   def var_samp(e: Column): Column = VarianceSamp(e.expr)
 
   /**
-   * Aggregate function: returns the unbiased variance of the values in a 
group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def var_samp(columnName: String): Column = var_samp(Column(columnName))
-
-  /**
    * Aggregate function: returns the population variance of the values in a 
group.
    *
    * @group agg_funcs
@@ -470,14 +411,6 @@ object functions {
    */
   def var_pop(e: Column): Column = VariancePop(e.expr)
 
-  /**
-   * Aggregate function: returns the population variance of the values in a 
group.
-   *
-   * @group agg_funcs
-   * @since 1.6.0
-   */
-  def var_pop(columnName: String): Column = var_pop(Column(columnName))
-
   
//////////////////////////////////////////////////////////////////////////////////////////////
   // Window functions
   
//////////////////////////////////////////////////////////////////////////////////////////////


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

spark git commit: [SPARK-11467][SQL] add Python API for stddev/variance

Reply via email to