This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 00be83a [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression 00be83a is described below commit 00be83a15807cb5b26b0cd7fe491060e27bf4c81 Author: Utkarsh <utkarsh.agar...@databricks.com> AuthorDate: Wed Nov 11 14:45:49 2020 -0800 [SPARK-33404][SQL][3.0] Fix incorrect results in `date_trunc` expression **Backport #30303 to 3.0** ### What changes were proposed in this pull request? The following query produces incorrect results: ``` SELECT date_trunc('minute', '1769-10-17 17:10:02') ``` Spark currently incorrectly returns ``` 1769-10-17 17:10:02 ``` against the expected return value of ``` 1769-10-17 17:10:00 ``` **Steps to repro** Run the following commands in spark-shell: ``` spark.conf.set("spark.sql.session.timeZone", "America/Los_Angeles") spark.sql("SELECT date_trunc('minute', '1769-10-17 17:10:02')").show() ``` This happens as `truncTimestamp` in package `org.apache.spark.sql.catalyst.util.DateTimeUtils` incorrectly assumes that time zone offsets can never have the granularity of a second and thus does not account for time zone adjustment when truncating the given timestamp to `minute`. This assumption is currently used when truncating the timestamps to `microsecond, millisecond, second, or minute`. This PR fixes this issue and always uses time zone knowledge when truncating timestamps regardless of the truncation unit. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? Added new tests to `DateTimeUtilsSuite` which previously failed and pass now. Closes #30339 from utkarsh39/date_trunc_fix_3.0. Authored-by: Utkarsh <utkarsh.agar...@databricks.com> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../spark/sql/catalyst/util/DateTimeUtils.scala | 6 ++-- .../sql/catalyst/util/DateTimeUtilsSuite.scala | 34 +++++++++++++++------- 2 files changed, 28 insertions(+), 12 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala index 3f8417f..344a6ae 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/DateTimeUtils.scala @@ -771,8 +771,12 @@ object DateTimeUtils { * Trunc level should be generated using `parseTruncLevel()`, should be between 0 and 9. */ def truncTimestamp(t: SQLTimestamp, level: Int, zoneId: ZoneId): SQLTimestamp = { + // Time zone offsets have a maximum precision of seconds (see `java.time.ZoneOffset`). Hence + // truncation to microsecond, millisecond, and second can be done + // without using time zone information. This results in a performance improvement. level match { case TRUNC_TO_MICROSECOND => t + case TRUNC_TO_MINUTE => truncToUnit(t, zoneId, ChronoUnit.MINUTES) case TRUNC_TO_HOUR => truncToUnit(t, zoneId, ChronoUnit.HOURS) case TRUNC_TO_DAY => truncToUnit(t, zoneId, ChronoUnit.DAYS) case _ => @@ -781,8 +785,6 @@ object DateTimeUtils { case TRUNC_TO_MILLISECOND => millis case TRUNC_TO_SECOND => millis - Math.floorMod(millis, MILLIS_PER_SECOND) - case TRUNC_TO_MINUTE => - millis - Math.floorMod(millis, MILLIS_PER_MINUTE) case _ => // Try to truncate date levels val dDays = millisToDays(millis, zoneId) daysToMillis(truncDate(dDays, level), zoneId) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala index d526ae1..f92d65c 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/DateTimeUtilsSuite.scala @@ -516,18 +516,32 @@ class DateTimeUtilsSuite extends SparkFunSuite with Matchers with SQLHelper { assert(time == None) } - test("truncTimestamp") { - def testTrunc( - level: Int, - expected: String, - inputTS: SQLTimestamp, - zoneId: ZoneId = defaultZoneId): Unit = { - val truncated = - DateTimeUtils.truncTimestamp(inputTS, level, zoneId) - val expectedTS = toTimestamp(expected, defaultZoneId) - assert(truncated === expectedTS.get) + def testTrunc( + level: Int, + expected: String, + inputTS: Long, + zoneId: ZoneId = defaultZoneId): Unit = { + val truncated = DateTimeUtils.truncTimestamp(inputTS, level, zoneId) + val expectedTS = toTimestamp(expected, defaultZoneId) + assert(truncated === expectedTS.get) + } + + test("SPARK-33404: test truncTimestamp when time zone offset from UTC has a " + + "granularity of seconds") { + for (zid <- ALL_TIMEZONES) { + withDefaultTimeZone(zid) { + val inputTS = DateTimeUtils.stringToTimestamp( + UTF8String.fromString("1769-10-17T17:10:02.123456"), defaultZoneId) + testTrunc(DateTimeUtils.TRUNC_TO_MINUTE, "1769-10-17T17:10:00", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_SECOND, "1769-10-17T17:10:02", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_MILLISECOND, "1769-10-17T17:10:02.123", inputTS.get, zid) + testTrunc(DateTimeUtils.TRUNC_TO_MICROSECOND, "1769-10-17T17:10:02.123456", + inputTS.get, zid) + } } + } + test("truncTimestamp") { val defaultInputTS = DateTimeUtils.stringToTimestamp( UTF8String.fromString("2015-03-05T09:32:05.359123"), defaultZoneId) val defaultInputTS1 = DateTimeUtils.stringToTimestamp( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org