[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user asfgit closed the pull request at: https://github.com/apache/spark/pull/18307 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/18307#discussion_r125874238 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestData.scala --- @@ -230,6 +230,16 @@ private[sql] trait SQLTestData { self => df } + protected lazy val person2: DataFrame = { --- End diff -- if it's only used in `DataFrameSuite`, can we put this in `DataFrameSuite`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/18307#discussion_r125873472 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala --- @@ -712,6 +707,76 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { checkAnswer(emptyDescription, emptyDescribeResult) } + test("summary") { +val describeTestData = person2 --- End diff -- `summaryTestData`? Actually can we just use `person2`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/18307#discussion_r125836743 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala --- @@ -2205,47 +2205,80 @@ class Dataset[T] private[sql]( * // max 92.0 192.0 * }}} * + * Use [[summary]] for expanded statistics and control over which statistics to compute. + * + * @param cols Columns to compute statistics on. + * * @group action * @since 1.6.0 */ @scala.annotation.varargs - def describe(cols: String*): DataFrame = withPlan { - -// The list of summary statistics to compute, in the form of expressions. -val statistics = List[(String, Expression => Expression)]( - "count" -> ((child: Expression) => Count(child).toAggregateExpression()), - "mean" -> ((child: Expression) => Average(child).toAggregateExpression()), - "stddev" -> ((child: Expression) => StddevSamp(child).toAggregateExpression()), - "min" -> ((child: Expression) => Min(child).toAggregateExpression()), - "max" -> ((child: Expression) => Max(child).toAggregateExpression())) - -val outputCols = - (if (cols.isEmpty) aggregatableColumns.map(usePrettyExpression(_).sql) else cols).toList - -val ret: Seq[Row] = if (outputCols.nonEmpty) { - val aggExprs = statistics.flatMap { case (_, colToAgg) => -outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c)) - } - - val row = groupBy().agg(aggExprs.head, aggExprs.tail: _*).head().toSeq - - // Pivot the data so each summary is one row - row.grouped(outputCols.size).toSeq.zip(statistics).map { case (aggregation, (statistic, _)) => -Row(statistic :: aggregation.toList: _*) - } -} else { - // If there are no output columns, just output a single column that contains the stats. 
- statistics.map { case (name, _) => Row(name) } -} - -// All columns are string type -val schema = StructType( - StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes -// `toArray` forces materialization to make the seq serializable -LocalRelation.fromExternalRows(schema, ret.toArray.toSeq) + def describe(cols: String*): DataFrame = { +val selected = if (cols.isEmpty) this else select(cols.head, cols.tail: _*) +selected.summary("count", "mean", "stddev", "min", "max") } /** + * Computes specified statistics for numeric and string columns. Available statistics are: + * + * - count + * - mean + * - stddev + * - min + * - max + * - arbitrary approximate percentiles specified as a percentage (eg, 75%) + * + * If no statistics are given, this function computes count, mean, stddev, min, + * approximate quartiles, and max. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting Dataset. If you want to + * programmatically compute summary statistics, use the `agg` function instead. + * + * {{{ + * ds.summary().show() + * + * // output: + * // summary age height + * // count 10.0 10.0 + * // mean53.3 178.05 + * // stddev 11.6 15.7 + * // min 18.0 163.0 + * // 25% 24.0 176.0 + * // 50% 24.0 176.0 + * // 75% 32.0 180.0 + * // max 92.0 192.0 + * }}} + * + * {{{ + * ds.summary("count", "min", "25%", "75%", "max").show() + * + * // output: + * // summary age height + * // count 10.0 10.0 + * // min 18.0 163.0 + * // 25% 24.0 176.0 + * // 75% 32.0 180.0 + * // max 92.0 192.0 + * }}} + * + * To do a summary for specific columns first select them: --- End diff -- This is a better usage than the previous `describe` 👍 --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. 
If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/18307#discussion_r125837050 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/execution/stat/StatFunctions.scala --- @@ -220,4 +221,97 @@ object StatFunctions extends Logging { Dataset.ofRows(df.sparkSession, LocalRelation(schema.toAttributes, table)).na.fill(0.0) } + + /** Calculate selected summary statistics for a dataset */ + def summary[T](ds: Dataset[T], statistics: Seq[String]): DataFrame = { --- End diff -- how about `def summary(ds: Dataset[_], statistics: Seq[String]): DataFrame`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/18307#discussion_r125873943 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala --- @@ -712,6 +707,76 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { checkAnswer(emptyDescription, emptyDescribeResult) } + test("summary") { +val describeTestData = person2 + +val describeResult = Seq( + Row("count", "4", "4", "4"), + Row("mean", null, "33.0", "178.0"), + Row("stddev", null, "19.148542155126762", "11.547005383792516"), + Row("min", "Alice", "16", "164"), + Row("25%", null, "24.0", "176.0"), + Row("50%", null, "24.0", "176.0"), + Row("75%", null, "32.0", "180.0"), + Row("max", "David", "60", "192")) + +val emptyDescribeResult = Seq( + Row("count", "0", "0", "0"), + Row("mean", null, null, null), + Row("stddev", null, null, null), + Row("min", null, null, null), + Row("25%", null, null, null), + Row("50%", null, null, null), + Row("75%", null, null, null), + Row("max", null, null, null)) + +def getSchemaAsSeq(df: DataFrame): Seq[String] = df.schema.map(_.name) + +val describeTwoCols = describeTestData.summary() +assert(getSchemaAsSeq(describeTwoCols) === Seq("summary", "name", "age", "height")) +checkAnswer(describeTwoCols, describeResult) +// All aggregate value should have been cast to string +describeTwoCols.collect().foreach { row => + assert(row.get(2).isInstanceOf[String], "expected string but found " + row.get(2).getClass) + assert(row.get(3).isInstanceOf[String], "expected string but found " + row.get(3).getClass) +} + +val describeAllCols = describeTestData.summary() +assert(getSchemaAsSeq(describeAllCols) === Seq("summary", "name", "age", "height")) +checkAnswer(describeAllCols, describeResult) + +val describeOneCol = describeTestData.select("age").summary() +assert(getSchemaAsSeq(describeOneCol) === Seq("summary", "age")) +checkAnswer(describeOneCol, describeResult.map { case Row(s, _, d, _) => Row(s, d)} ) + +val 
describeNoCol = describeTestData.select("name").summary() --- End diff -- `describeNoCol`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/18307#discussion_r125873512 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/DataFrameSuite.scala --- @@ -712,6 +707,76 @@ class DataFrameSuite extends QueryTest with SharedSQLContext { checkAnswer(emptyDescription, emptyDescribeResult) } + test("summary") { +val describeTestData = person2 + +val describeResult = Seq( --- End diff -- `summaryResult`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
[GitHub] spark pull request #18307: [SPARK-21100][SQL] Add summary method as alternat...
Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/18307#discussion_r125836508 --- Diff: sql/core/src/main/scala/org/apache/spark/sql/Dataset.scala --- @@ -2205,47 +2205,80 @@ class Dataset[T] private[sql]( * // max 92.0 192.0 * }}} * + * Use [[summary]] for expanded statistics and control over which statistics to compute. + * + * @param cols Columns to compute statistics on. + * * @group action * @since 1.6.0 */ @scala.annotation.varargs - def describe(cols: String*): DataFrame = withPlan { - -// The list of summary statistics to compute, in the form of expressions. -val statistics = List[(String, Expression => Expression)]( - "count" -> ((child: Expression) => Count(child).toAggregateExpression()), - "mean" -> ((child: Expression) => Average(child).toAggregateExpression()), - "stddev" -> ((child: Expression) => StddevSamp(child).toAggregateExpression()), - "min" -> ((child: Expression) => Min(child).toAggregateExpression()), - "max" -> ((child: Expression) => Max(child).toAggregateExpression())) - -val outputCols = - (if (cols.isEmpty) aggregatableColumns.map(usePrettyExpression(_).sql) else cols).toList - -val ret: Seq[Row] = if (outputCols.nonEmpty) { - val aggExprs = statistics.flatMap { case (_, colToAgg) => -outputCols.map(c => Column(Cast(colToAgg(Column(c).expr), StringType)).as(c)) - } - - val row = groupBy().agg(aggExprs.head, aggExprs.tail: _*).head().toSeq - - // Pivot the data so each summary is one row - row.grouped(outputCols.size).toSeq.zip(statistics).map { case (aggregation, (statistic, _)) => -Row(statistic :: aggregation.toList: _*) - } -} else { - // If there are no output columns, just output a single column that contains the stats. 
- statistics.map { case (name, _) => Row(name) } -} - -// All columns are string type -val schema = StructType( - StructField("summary", StringType) :: outputCols.map(StructField(_, StringType))).toAttributes -// `toArray` forces materialization to make the seq serializable -LocalRelation.fromExternalRows(schema, ret.toArray.toSeq) + def describe(cols: String*): DataFrame = { +val selected = if (cols.isEmpty) this else select(cols.head, cols.tail: _*) +selected.summary("count", "mean", "stddev", "min", "max") } /** + * Computes specified statistics for numeric and string columns. Available statistics are: + * + * - count + * - mean + * - stddev + * - min + * - max + * - arbitrary approximate percentiles specified as a percentage (eg, 75%) + * + * If no statistics are given, this function computes count, mean, stddev, min, + * approximate quartiles, and max. --- End diff -- `approximate quartiles at 25%, 50% and 75%`? --- If your project is set up for it, you can reply to this email and have your reply appear on GitHub as well. If your project does not have this feature enabled and wishes so, or if the feature is enabled but not working, please contact infrastructure at infrastruct...@apache.org or file a JIRA ticket with INFRA. --- - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org