This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
commit 1ba4011a7f91616e88df1d141a7057500309e14c Author: HyukjinKwon <gurwls...@apache.org> AuthorDate: Fri May 24 05:36:08 2019 +0900 Revert "Revert "[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…"" This reverts commit 516b0fb537b4bae3b4d9550ee2989eb66939d5da. --- .../plans/logical/statsEstimation/AggregateEstimation.scala | 12 ++++++++++-- .../catalyst/statsEstimation/AggregateEstimationSuite.scala | 12 +++++++++++- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala index 0606d0d..1198d3f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala @@ -39,8 +39,16 @@ object AggregateEstimation { // Multiply distinct counts of group-by columns. This is an upper bound, which assumes // the data contains all combinations of distinct values of group-by columns. var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))( - (res, expr) => res * - childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get) + (res, expr) => { + val columnStat = childStats.attributeStats(expr.asInstanceOf[Attribute]) + val distinctCount = columnStat.distinctCount.get + val distinctValue: BigInt = if (distinctCount == 0 && columnStat.nullCount.get > 0) { + 1 + } else { + distinctCount + } + res * distinctValue + }) outputRows = if (agg.groupingExpressions.isEmpty) { // If there's no group-by columns, the output is a single row containing values of aggregate diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala index dfa6e46..c247050 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala @@ -38,7 +38,9 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest { attr("key22") -> ColumnStat(distinctCount = Some(2), min = Some(10), max = Some(20), nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = None, - nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)) + nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)), + attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = None, + nullCount = Some(4), avgLen = Some(4), maxLen = Some(4)) )) private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => kv._1.name -> kv._1) @@ -116,6 +118,14 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest { expectedOutputRowCount = 0) } + test("group-by column with only null value") { + checkAggStats( + tableColumns = Seq("key22", "key32"), + tableRowCount = 6, + groupByColumns = Seq("key22", "key32"), + expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get) + } + test("non-cbo estimation") { val attributes = Seq("key12").map(nameToAttr) val child = StatsTestPlan( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org