Repository: spark Updated Branches: refs/heads/master 53e9cee3e -> 5051262d4
[SPARK-11489][SQL] Only include common first order statistics in GroupedData We added a bunch of higher order statistics such as skewness and kurtosis to GroupedData. I don't think they are common enough to justify being listed, since users can always use the normal statistics aggregate functions. That is to say, after this change, we won't support ```scala df.groupBy("key").kurtosis("colA", "colB") ``` However, we will still support ```scala df.groupBy("key").agg(kurtosis(col("colA")), kurtosis(col("colB"))) ``` Author: Reynold Xin <r...@databricks.com> Closes #9446 from rxin/SPARK-11489. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5051262d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5051262d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5051262d Branch: refs/heads/master Commit: 5051262d4ca6a2c529c9b1ba86d54cce60a7af17 Parents: 53e9cee Author: Reynold Xin <r...@databricks.com> Authored: Tue Nov 3 16:27:56 2015 -0800 Committer: Reynold Xin <r...@databricks.com> Committed: Tue Nov 3 16:27:56 2015 -0800 ---------------------------------------------------------------------- python/pyspark/sql/group.py | 88 ----------- .../org/apache/spark/sql/GroupedData.scala | 146 ++++--------------- .../apache/spark/sql/JavaDataFrameSuite.java | 1 - 3 files changed, 28 insertions(+), 207 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/5051262d/python/pyspark/sql/group.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 946b53e..71c0bcc 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -167,94 +167,6 @@ class GroupedData(object): [Row(sum(age)=7, sum(height)=165)] """ - @df_varargs_api - @since(1.6) - def stddev(self, *cols): - """Compute the sample standard deviation for each 
numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev('age', 'height').collect() - [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)] - """ - - @df_varargs_api - @since(1.6) - def stddev_samp(self, *cols): - """Compute the sample standard deviation for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev_samp('age', 'height').collect() - [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)] - """ - - @df_varargs_api - @since(1.6) - def stddev_pop(self, *cols): - """Compute the population standard deviation for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev_pop('age', 'height').collect() - [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)] - """ - - @df_varargs_api - @since(1.6) - def variance(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().variance('age', 'height').collect() - [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)] - """ - - @df_varargs_api - @since(1.6) - def var_pop(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().var_pop('age', 'height').collect() - [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)] - """ - - @df_varargs_api - @since(1.6) - def var_samp(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. 
- - >>> df3.groupBy().var_samp('age', 'height').collect() - [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)] - """ - - @df_varargs_api - @since(1.6) - def skewness(self, *cols): - """Compute the skewness for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().skewness('age', 'height').collect() - [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)] - """ - - @df_varargs_api - @since(1.6) - def kurtosis(self, *cols): - """Compute the kurtosis for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().kurtosis('age', 'height').collect() - [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)] - """ - def _test(): import doctest http://git-wip-us.apache.org/repos/asf/spark/blob/5051262d/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index dc96384..c2b2a40 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -26,42 +26,14 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate} import org.apache.spark.sql.types.NumericType -/** - * Companion object for GroupedData - */ -private[sql] object GroupedData { - def apply( - df: DataFrame, - groupingExprs: Seq[Expression], - groupType: GroupType): GroupedData = { - new GroupedData(df, groupingExprs, groupType: GroupType) - } - - /** - * The Grouping Type - */ - private[sql] trait GroupType - - /** - * To indicate it's the GroupBy - */ - private[sql] object GroupByType extends GroupType - - /** - * To indicate it's the CUBE - */ - private[sql] object CubeType extends GroupType - - 
/** - * To indicate it's the ROLLUP - */ - private[sql] object RollupType extends GroupType -} /** * :: Experimental :: * A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]]. * + * The main method is the agg function, which has multiple variants. This class also contains + * convenience functions for some first order statistics such as mean and sum. + * * @since 1.3.0 */ @Experimental @@ -124,7 +96,7 @@ class GroupedData protected[sql]( case "avg" | "average" | "mean" => Average case "max" => Max case "min" => Min - case "stddev" => Stddev + case "stddev" | "std" => Stddev case "stddev_pop" => StddevPop case "stddev_samp" => StddevSamp case "variance" => Variance @@ -256,30 +228,6 @@ class GroupedData protected[sql]( } /** - * Compute the skewness for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the skewness values for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def skewness(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Skewness) - } - - /** - * Compute the kurtosis for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the kurtosis values for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def kurtosis(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Kurtosis) - } - - /** * Compute the max value for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. * When specified columns are given, only compute the max values for them. @@ -316,86 +264,48 @@ class GroupedData protected[sql]( } /** - * Compute the sample standard deviation for each numeric columns for each group. + * Compute the sum for each numeric columns for each group. 
* The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. + * When specified columns are given, only compute the sum for them. * - * @since 1.6.0 + * @since 1.3.0 */ @scala.annotation.varargs - def stddev(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Stddev) + def sum(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames : _*)(Sum) } +} - /** - * Compute the population standard deviation for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def stddev_pop(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(StddevPop) - } - /** - * Compute the sample standard deviation for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def stddev_samp(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(StddevSamp) +/** + * Companion object for GroupedData. + */ +private[sql] object GroupedData { + + def apply( + df: DataFrame, + groupingExprs: Seq[Expression], + groupType: GroupType): GroupedData = { + new GroupedData(df, groupingExprs, groupType: GroupType) } /** - * Compute the sum for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the sum for them. - * - * @since 1.3.0 + * The Grouping Type */ - @scala.annotation.varargs - def sum(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Sum) - } + private[sql] trait GroupType /** - * Compute the sample variance for each numeric columns for each group. 
- * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. - * - * @since 1.6.0 + * To indicate it's the GroupBy */ - @scala.annotation.varargs - def variance(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Variance) - } + private[sql] object GroupByType extends GroupType /** - * Compute the population variance for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. - * - * @since 1.6.0 + * To indicate it's the CUBE */ - @scala.annotation.varargs - def var_pop(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(VariancePop) - } + private[sql] object CubeType extends GroupType /** - * Compute the sample variance for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. 
- * - * @since 1.6.0 + * To indicate it's the ROLLUP */ - @scala.annotation.varargs - def var_samp(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(VarianceSamp) - } + private[sql] object RollupType extends GroupType } http://git-wip-us.apache.org/repos/asf/spark/blob/5051262d/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java ---------------------------------------------------------------------- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index a1a3fdb..49f516e 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -91,7 +91,6 @@ public class JavaDataFrameSuite { df.groupBy().mean("key"); df.groupBy().max("key"); df.groupBy().min("key"); - df.groupBy().stddev("key"); df.groupBy().sum("key"); // Varargs in column expressions --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org