Repository: spark Updated Branches: refs/heads/master 53e9cee3e -> 5051262d4
[SPARK-11489][SQL] Only include common first order statistics in GroupedData We added a bunch of higher order statistics such as skewness and kurtosis to GroupedData. I don't think they are common enough to justify being listed, since users can always use the normal statistics aggregate functions. That is to say, after this change, we won't support ```scala df.groupBy("key").kurtosis("colA", "colB") ``` However, we will still support ```scala df.groupBy("key").agg(kurtosis(col("colA")), kurtosis(col("colB"))) ``` Author: Reynold Xin <r...@databricks.com> Closes #9446 from rxin/SPARK-11489. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/5051262d Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/5051262d Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/5051262d Branch: refs/heads/master Commit: 5051262d4ca6a2c529c9b1ba86d54cce60a7af17 Parents: 53e9cee Author: Reynold Xin <r...@databricks.com> Authored: Tue Nov 3 16:27:56 2015 -0800 Committer: Reynold Xin <r...@databricks.com> Committed: Tue Nov 3 16:27:56 2015 -0800 ---------------------------------------------------------------------- python/pyspark/sql/group.py | 88 ----------- .../org/apache/spark/sql/GroupedData.scala | 146 ++++--------------- .../apache/spark/sql/JavaDataFrameSuite.java | 1 - 3 files changed, 28 insertions(+), 207 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/5051262d/python/pyspark/sql/group.py ---------------------------------------------------------------------- diff --git a/python/pyspark/sql/group.py b/python/pyspark/sql/group.py index 946b53e..71c0bcc 100644 --- a/python/pyspark/sql/group.py +++ b/python/pyspark/sql/group.py @@ -167,94 +167,6 @@ class GroupedData(object): [Row(sum(age)=7, sum(height)=165)] """ - @df_varargs_api - @since(1.6) - def stddev(self, *cols): - """Compute the sample standard deviation for each 
numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev('age', 'height').collect() - [Row(STDDEV(age)=2.12..., STDDEV(height)=3.53...)] - """ - - @df_varargs_api - @since(1.6) - def stddev_samp(self, *cols): - """Compute the sample standard deviation for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev_samp('age', 'height').collect() - [Row(STDDEV_SAMP(age)=2.12..., STDDEV_SAMP(height)=3.53...)] - """ - - @df_varargs_api - @since(1.6) - def stddev_pop(self, *cols): - """Compute the population standard deviation for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().stddev_pop('age', 'height').collect() - [Row(STDDEV_POP(age)=1.5, STDDEV_POP(height)=2.5)] - """ - - @df_varargs_api - @since(1.6) - def variance(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().variance('age', 'height').collect() - [Row(VARIANCE(age)=2.25, VARIANCE(height)=6.25)] - """ - - @df_varargs_api - @since(1.6) - def var_pop(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().var_pop('age', 'height').collect() - [Row(VAR_POP(age)=2.25, VAR_POP(height)=6.25)] - """ - - @df_varargs_api - @since(1.6) - def var_samp(self, *cols): - """Compute the sample variance for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. 
- - >>> df3.groupBy().var_samp('age', 'height').collect() - [Row(VAR_SAMP(age)=4.5, VAR_SAMP(height)=12.5)] - """ - - @df_varargs_api - @since(1.6) - def skewness(self, *cols): - """Compute the skewness for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().skewness('age', 'height').collect() - [Row(SKEWNESS(age)=0.0, SKEWNESS(height)=0.0)] - """ - - @df_varargs_api - @since(1.6) - def kurtosis(self, *cols): - """Compute the kurtosis for each numeric columns for each group. - - :param cols: list of column names (string). Non-numeric columns are ignored. - - >>> df3.groupBy().kurtosis('age', 'height').collect() - [Row(KURTOSIS(age)=-2.0, KURTOSIS(height)=-2.0)] - """ - def _test(): import doctest http://git-wip-us.apache.org/repos/asf/spark/blob/5051262d/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala index dc96384..c2b2a40 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedData.scala @@ -26,42 +26,14 @@ import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical.{Rollup, Cube, Aggregate} import org.apache.spark.sql.types.NumericType -/** - * Companion object for GroupedData - */ -private[sql] object GroupedData { - def apply( - df: DataFrame, - groupingExprs: Seq[Expression], - groupType: GroupType): GroupedData = { - new GroupedData(df, groupingExprs, groupType: GroupType) - } - - /** - * The Grouping Type - */ - private[sql] trait GroupType - - /** - * To indicate it's the GroupBy - */ - private[sql] object GroupByType extends GroupType - - /** - * To indicate it's the CUBE - */ - private[sql] object CubeType extends GroupType - - 
/** - * To indicate it's the ROLLUP - */ - private[sql] object RollupType extends GroupType -} /** * :: Experimental :: * A set of methods for aggregations on a [[DataFrame]], created by [[DataFrame.groupBy]]. * + * The main method is the agg function, which has multiple variants. This class also contains + * convenience functions for some first order statistics such as mean and sum. + * * @since 1.3.0 */ @Experimental @@ -124,7 +96,7 @@ class GroupedData protected[sql]( case "avg" | "average" | "mean" => Average case "max" => Max case "min" => Min - case "stddev" => Stddev + case "stddev" | "std" => Stddev case "stddev_pop" => StddevPop case "stddev_samp" => StddevSamp case "variance" => Variance @@ -256,30 +228,6 @@ class GroupedData protected[sql]( } /** - * Compute the skewness for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the skewness values for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def skewness(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Skewness) - } - - /** - * Compute the kurtosis for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the kurtosis values for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def kurtosis(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Kurtosis) - } - - /** * Compute the max value for each numeric columns for each group. * The resulting [[DataFrame]] will also contain the grouping columns. * When specified columns are given, only compute the max values for them. @@ -316,86 +264,48 @@ class GroupedData protected[sql]( } /** - * Compute the sample standard deviation for each numeric columns for each group. + * Compute the sum for each numeric columns for each group. 
* The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. + * When specified columns are given, only compute the sum for them. * - * @since 1.6.0 + * @since 1.3.0 */ @scala.annotation.varargs - def stddev(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Stddev) + def sum(colNames: String*): DataFrame = { + aggregateNumericColumns(colNames : _*)(Sum) } +} - /** - * Compute the population standard deviation for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def stddev_pop(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(StddevPop) - } - /** - * Compute the sample standard deviation for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the stddev for them. - * - * @since 1.6.0 - */ - @scala.annotation.varargs - def stddev_samp(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(StddevSamp) +/** + * Companion object for GroupedData. + */ +private[sql] object GroupedData { + + def apply( + df: DataFrame, + groupingExprs: Seq[Expression], + groupType: GroupType): GroupedData = { + new GroupedData(df, groupingExprs, groupType: GroupType) } /** - * Compute the sum for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the sum for them. - * - * @since 1.3.0 + * The Grouping Type */ - @scala.annotation.varargs - def sum(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Sum) - } + private[sql] trait GroupType /** - * Compute the sample variance for each numeric columns for each group. 
- * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. - * - * @since 1.6.0 + * To indicate it's the GroupBy */ - @scala.annotation.varargs - def variance(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(Variance) - } + private[sql] object GroupByType extends GroupType /** - * Compute the population variance for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. - * - * @since 1.6.0 + * To indicate it's the CUBE */ - @scala.annotation.varargs - def var_pop(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(VariancePop) - } + private[sql] object CubeType extends GroupType /** - * Compute the sample variance for each numeric columns for each group. - * The resulting [[DataFrame]] will also contain the grouping columns. - * When specified columns are given, only compute the variance for them. 
- * - * @since 1.6.0 + * To indicate it's the ROLLUP */ - @scala.annotation.varargs - def var_samp(colNames: String*): DataFrame = { - aggregateNumericColumns(colNames : _*)(VarianceSamp) - } + private[sql] object RollupType extends GroupType } http://git-wip-us.apache.org/repos/asf/spark/blob/5051262d/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java ---------------------------------------------------------------------- diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java index a1a3fdb..49f516e 100644 --- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java +++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDataFrameSuite.java @@ -91,7 +91,6 @@ public class JavaDataFrameSuite { df.groupBy().mean("key"); df.groupBy().max("key"); df.groupBy().min("key"); - df.groupBy().stddev("key"); df.groupBy().sum("key"); // Varargs in column expressions --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org