[spark] branch branch-3.0 updated: [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression

dongjoon Wed, 11 Nov 2020 15:18:24 -0800

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 00be83a  [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` 
expression
00be83a is described below

commit 00be83a15807cb5b26b0cd7fe491060e27bf4c81
Author: Utkarsh <utkarsh.agar...@databricks.com>
AuthorDate: Wed Nov 11 14:45:49 2020 -0800

    [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression
    
    **Backport #30303 to 3.0**
    
    ### What changes were proposed in this pull request?
    The following query produces incorrect results:
    ```
    SELECT date_trunc('minute', '1769-10-17 17:10:02')
    ```
    Spark currently incorrectly returns
    ```
    1769-10-17 17:10:02
    ```
    against the expected return value of
    ```
    1769-10-17 17:10:00
    ```
    **Steps to repro**
    Run the following commands in spark-shell:
    ```
    spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles")
    spark.sql("SELECT date_trunc('minute', '1769-10-17 17:10:02')").show()
    ```
    This happens as `truncTimestamp` in package 
`org.apache.spark.sql.catalyst.util.DateTimeUtils` incorrectly assumes that 
time zone offsets can never have the granularity of a second and thus does not 
account for time zone adjustment when truncating the given timestamp to 
`minute`.
    This assumption is currently used when truncating the timestamps to 
`microsecond, millisecond, second, or minute`.
    
    This PR fixes this issue and always uses time zone knowledge when 
truncating timestamps regardless of the truncation unit.
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Added new tests to `DateTimeUtilsSuite` which previously failed and pass 
now.
    
    Closes #30339 from utkarsh39/date_trunc_fix_3.0.
    
    Authored-by: Utkarsh <utkarsh.agar...@databricks.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../spark/sql/catalyst/util/DateTimeUtils.scala    |  6 ++--
 .../sql/catalyst/util/DateTimeUtilsSuite.scala     | 34 +++++++++++++++-------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
index 3f8417f..344a6ae 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala
@@ -771,8 +771,12 @@ object DateTimeUtils {
    * Trunc level should be generated using `parseTruncLevel()`, should be 
between 0 and 9.
    */
   def truncTimestamp(t: SQLTimestamp, level: Int, zoneId: ZoneId): 
SQLTimestamp = {
+    // Time zone offsets have a maximum precision of seconds (see 
`java.time.ZoneOffset`). Hence
+    // truncation to microsecond, millisecond, and second can be done
+    // without using time zone information. This results in a performance 
improvement.
     level match {
       case TRUNC_TO_MICROSECOND => t
+      case TRUNC_TO_MINUTE => truncToUnit(t, zoneId, ChronoUnit.MINUTES)
       case TRUNC_TO_HOUR => truncToUnit(t, zoneId, ChronoUnit.HOURS)
       case TRUNC_TO_DAY => truncToUnit(t, zoneId, ChronoUnit.DAYS)
       case _ =>
@@ -781,8 +785,6 @@ object DateTimeUtils {
           case TRUNC_TO_MILLISECOND => millis
           case TRUNC_TO_SECOND =>
             millis - Math.floorMod(millis, MILLIS_PER_SECOND)
-          case TRUNC_TO_MINUTE =>
-            millis - Math.floorMod(millis, MILLIS_PER_MINUTE)
           case _ => // Try to truncate date levels
             val dDays = millisToDays(millis, zoneId)
             daysToMillis(truncDate(dDays, level), zoneId)
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
index d526ae1..f92d65c 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala
@@ -516,18 +516,32 @@ class DateTimeUtilsSuite extends SparkFunSuite with 
Matchers with SQLHelper {
     assert(time == None)
   }
 
-  test("truncTimestamp") {
-    def testTrunc(
-        level: Int,
-        expected: String,
-        inputTS: SQLTimestamp,
-        zoneId: ZoneId = defaultZoneId): Unit = {
-      val truncated =
-        DateTimeUtils.truncTimestamp(inputTS, level, zoneId)
-      val expectedTS = toTimestamp(expected, defaultZoneId)
-      assert(truncated === expectedTS.get)
+  def testTrunc(
+      level: Int,
+      expected: String,
+      inputTS: Long,
+      zoneId: ZoneId = defaultZoneId): Unit = {
+    val truncated = DateTimeUtils.truncTimestamp(inputTS, level, zoneId)
+    val expectedTS = toTimestamp(expected, defaultZoneId)
+    assert(truncated === expectedTS.get)
+  }
+
+  test("SPARK-33404: test truncTimestamp when time zone offset from UTC has a 
" +
+    "granularity of seconds") {
+    for (zid <- ALL_TIMEZONES) {
+      withDefaultTimeZone(zid) {
+        val inputTS = DateTimeUtils.stringToTimestamp(
+          UTF8String.fromString("1769-10-17T17:10:02.123456"), defaultZoneId)
+        testTrunc(DateTimeUtils.TRUNC_TO_MINUTE, "1769-10-17T17:10:00", 
inputTS.get, zid)
+        testTrunc(DateTimeUtils.TRUNC_TO_SECOND, "1769-10-17T17:10:02", 
inputTS.get, zid)
+        testTrunc(DateTimeUtils.TRUNC_TO_MILLISECOND, 
"1769-10-17T17:10:02.123", inputTS.get, zid)
+        testTrunc(DateTimeUtils.TRUNC_TO_MICROSECOND, 
"1769-10-17T17:10:02.123456",
+          inputTS.get, zid)
+      }
     }
+  }
 
+  test("truncTimestamp") {
     val defaultInputTS = DateTimeUtils.stringToTimestamp(
       UTF8String.fromString("2015-03-05T09:32:05.359123"), defaultZoneId)
     val defaultInputTS1 = DateTimeUtils.stringToTimestamp(


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch branch-3.0 updated: [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression

Reply via email to