[GitHub] [spark] leanken commented on a change in pull request #29983: [SPARK-13860][SQL] Change statistical aggregate function to return null instead of Double.NaN when divideByZero

GitBox Sun, 11 Oct 2020 23:41:51 -0700


leanken commented on a change in pull request #29983:
URL: https://github.com/apache/spark/pull/29983#discussion_r503069104




##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala
##########
@@ -59,56 +60,115 @@ class WindowQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleto
   }
 
   test("windowing.q -- 15. testExpressions") {
-    // Moved because:
-    // - Spark uses a different default stddev (sample instead of pop)
-    // - Tiny numerical differences in stddev results.
-    // - Different StdDev behavior when n=1 (NaN instead of 0)
-    checkAnswer(sql(s"""
-      |select  p_mfgr,p_name, p_size,
-      |rank() over(distribute by p_mfgr sort by p_name) as r,
-      |dense_rank() over(distribute by p_mfgr sort by p_name) as dr,
-      |cume_dist() over(distribute by p_mfgr sort by p_name) as cud,
-      |percent_rank() over(distribute by p_mfgr sort by p_name) as pr,
-      |ntile(3) over(distribute by p_mfgr sort by p_name) as nt,
-      |count(p_size) over(distribute by p_mfgr sort by p_name) as ca,
-      |avg(p_size) over(distribute by p_mfgr sort by p_name) as avg,
-      |stddev(p_size) over(distribute by p_mfgr sort by p_name) as st,
-      |first_value(p_size % 5) over(distribute by p_mfgr sort by p_name) as fv,
-      |last_value(p_size) over(distribute by p_mfgr sort by p_name) as lv,
-      |first_value(p_size) over w1  as fvW1
-      |from part
-      |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name
-      |             rows between 2 preceding and 2 following)
+    withSQLConf(SQLConf.LEGACY_CENTRAL_MOMENT_AGG.key -> "true") {

Review comment:
       done.

##########
File path: sql/hive/src/test/scala/org/apache/spark/sql/hive/execution/WindowQuerySuite.scala
##########
@@ -59,56 +60,115 @@ class WindowQuerySuite extends QueryTest with SQLTestUtils with TestHiveSingleto
   }
 
   test("windowing.q -- 15. testExpressions") {
-    // Moved because:
-    // - Spark uses a different default stddev (sample instead of pop)
-    // - Tiny numerical differences in stddev results.
-    // - Different StdDev behavior when n=1 (NaN instead of 0)
-    checkAnswer(sql(s"""
-      |select  p_mfgr,p_name, p_size,
-      |rank() over(distribute by p_mfgr sort by p_name) as r,
-      |dense_rank() over(distribute by p_mfgr sort by p_name) as dr,
-      |cume_dist() over(distribute by p_mfgr sort by p_name) as cud,
-      |percent_rank() over(distribute by p_mfgr sort by p_name) as pr,
-      |ntile(3) over(distribute by p_mfgr sort by p_name) as nt,
-      |count(p_size) over(distribute by p_mfgr sort by p_name) as ca,
-      |avg(p_size) over(distribute by p_mfgr sort by p_name) as avg,
-      |stddev(p_size) over(distribute by p_mfgr sort by p_name) as st,
-      |first_value(p_size % 5) over(distribute by p_mfgr sort by p_name) as fv,
-      |last_value(p_size) over(distribute by p_mfgr sort by p_name) as lv,
-      |first_value(p_size) over w1  as fvW1
-      |from part
-      |window w1 as (distribute by p_mfgr sort by p_mfgr, p_name
-      |             rows between 2 preceding and 2 following)
+    withSQLConf(SQLConf.LEGACY_CENTRAL_MOMENT_AGG.key -> "true") {
+      // Moved because:
+      // - Spark uses a different default stddev (sample instead of pop)
+      // - Tiny numerical differences in stddev results.
+      // - Different StdDev behavior when n=1 (NaN instead of 0)

Review comment:
       done

##########
File path: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/CentralMomentAgg.scala
##########
@@ -174,7 +175,9 @@ case class StddevSamp(child: Expression) extends CentralMomentAgg(child) {
 
   override val evaluateExpression: Expression = {
     If(n === 0.0, Literal.create(null, DoubleType),
-      If(n === 1.0, Double.NaN, sqrt(m2 / (n - 1.0))))
+      If(n === 1.0,
+        if (SQLConf.get.legacyCentralMomentAgg) Double.NaN else Literal.create(null, DoubleType),

Review comment:
       done




----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] leanken commented on a change in pull request #29983: [SPARK-13860][SQL] Change statistical aggregate function to return null instead of Double.NaN when divideByZero

Reply via email to