[spark] 02/02: Revert "[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…"

gurwls223 Thu, 23 May 2019 11:20:58 -0700

This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch branch-2.4
in repository https://gitbox.apache.org/repos/asf/spark.git


commit e69ad46c72ed26c8293da95dc19b6f31445c0df5
Author: HyukjinKwon <gurwls...@apache.org>
AuthorDate: Fri May 24 03:19:48 2019 +0900

    Revert "[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…"
    
    This reverts commit 40668c53ed799881db1f316ceaf2f978b294d8ed.
---
 .../plans/logical/statsEstimation/AggregateEstimation.scala  | 12 ++----------
 .../catalyst/statsEstimation/AggregateEstimationSuite.scala  | 12 +-----------
 2 files changed, 3 insertions(+), 21 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
index 7ef22fa..111c594 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/plans/logical/statsEstimation/AggregateEstimation.scala
@@ -39,16 +39,8 @@ object AggregateEstimation {
       // Multiply distinct counts of group-by columns. This is an upper bound, 
which assumes
       // the data contains all combinations of distinct values of group-by 
columns.
       var outputRows: BigInt = agg.groupingExpressions.foldLeft(BigInt(1))(
-        (res, expr) => {
-          val columnStat = 
childStats.attributeStats(expr.asInstanceOf[Attribute])
-          val distinctCount = columnStat.distinctCount.get
-          val distinctValue: BigInt = if (distinctCount == 0 && 
columnStat.nullCount.get > 0) {
-            1
-          } else {
-            distinctCount
-          }
-          res * distinctValue
-        })
+        (res, expr) => res *
+          
childStats.attributeStats(expr.asInstanceOf[Attribute]).distinctCount.get)
 
       outputRows = if (agg.groupingExpressions.isEmpty) {
         // If there's no group-by columns, the output is a single row 
containing values of aggregate
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
index 6bdf8cd..8213d56 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/statsEstimation/AggregateEstimationSuite.scala
@@ -38,9 +38,7 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
     attr("key22") -> ColumnStat(distinctCount = Some(2), min = Some(10), max = 
Some(20),
       nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
     attr("key31") -> ColumnStat(distinctCount = Some(0), min = None, max = 
None,
-      nullCount = Some(0), avgLen = Some(4), maxLen = Some(4)),
-    attr("key32") -> ColumnStat(distinctCount = Some(0), min = None, max = 
None,
-      nullCount = Some(4), avgLen = Some(4), maxLen = Some(4))
+      nullCount = Some(0), avgLen = Some(4), maxLen = Some(4))
   ))
 
   private val nameToAttr: Map[String, Attribute] = columnInfo.map(kv => 
kv._1.name -> kv._1)
@@ -94,14 +92,6 @@ class AggregateEstimationSuite extends StatsEstimationTestBase with PlanTest {
       expectedOutputRowCount = 0)
   }
 
-  test("group-by column with only null value") {
-    checkAggStats(
-      tableColumns = Seq("key22", "key32"),
-      tableRowCount = 6,
-      groupByColumns = Seq("key22", "key32"),
-      expectedOutputRowCount = nameToColInfo("key22")._2.distinctCount.get)
-  }
-
   test("non-cbo estimation") {
     val attributes = Seq("key12").map(nameToAttr)
     val child = StatsTestPlan(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] 02/02: Revert "[SPARK-27351][SQL] Wrong outputRows estimation after AggregateEstimation wit…"

Reply via email to