This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new d9ad111 [SPARK-31402][SQL] Fix rebasing of BCE dates/timestamps d9ad111 is described below commit d9ad11110428b2d89fcb2f0c0cbdf84a9970910a Author: Max Gekk <max.g...@gmail.com> AuthorDate: Mon Apr 13 06:07:31 2020 +0000 [SPARK-31402][SQL] Fix rebasing of BCE dates/timestamps In the PR, I propose to fallback to rebasing via local dates/timestamps for days/micros of before common era (BCE). It fixes the bug of rebasing dates/timestamps of BCE. Yes - By existing tests in `RebaseDateTimeSuite` and `DateTimeUtilsSuite` - Added tests for negative years to `RebaseDateTimeSuite` Closes #28172 from MaxGekk/fix-era-in-date-micros-rebasing. Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit cf63ad61f517a44a2b60c0e4f06cb387569039b3) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../spark/sql/catalyst/util/RebaseDateTime.scala | 96 ++++++++++++++++++++-- .../sql/catalyst/util/RebaseDateTimeSuite.scala | 88 ++++++++++---------- 2 files changed, 133 insertions(+), 51 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala index 7440787..f4a98cf 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala @@ -17,7 +17,7 @@ package org.apache.spark.sql.catalyst.util -import java.time.{LocalDateTime, ZoneId} +import java.time.{LocalDate, LocalDateTime, ZoneId} import java.time.temporal.ChronoField import java.util.{Calendar, TimeZone} @@ -66,15 +66,53 @@ object RebaseDateTime { // The starting point is the `0001-01-01` (-719164 days since the epoch in // Julian calendar). All dates before the staring point have the same difference // of 2 days in Julian and Proleptic Gregorian calendars. + // Rebasing switch days and diffs `julianGregDiffSwitchDay` and `julianGregDiffs` + // was generated by the `localRebaseJulianToGregorianDays` function. private val julianGregDiffSwitchDay = Array( -719164, -682945, -646420, -609895, -536845, -500320, -463795, -390745, -354220, -317695, -244645, -208120, -171595, -141427) + // The first days of Common Era (CE) which is mapped to the '0001-01-01' date in Julian calendar. + private final val julianCommonEraStartDay = julianGregDiffSwitchDay(0) + /** * Converts the given number of days since the epoch day 1970-01-01 to a local date in Julian * calendar, interprets the result as a local date in Proleptic Gregorian calendar, and takes the * number of days since the epoch from the Gregorian local date. * + * @param days The number of days since the epoch in Julian calendar. It can be negative. + * @return The rebased number of days in Gregorian calendar. + */ + private[sql] def localRebaseJulianToGregorianDays(days: Int): Int = { + val utcCal = new Calendar.Builder() + // `gregory` is a hybrid calendar that supports both + // the Julian and Gregorian calendar systems + .setCalendarType("gregory") + .setTimeZone(TimeZoneUTC) + .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY)) + .build() + val localDate = LocalDate.of( + utcCal.get(Calendar.YEAR), + utcCal.get(Calendar.MONTH) + 1, + // The number of days will be added later to handle non-existing + // Julian dates in Proleptic Gregorian calendar. + // For example, 1000-02-29 exists in Julian calendar because 1000 + // is a leap year but it is not a leap year in Gregorian calendar. + 1) + .`with`(ChronoField.ERA, utcCal.get(Calendar.ERA)) + .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1) + Math.toIntExact(localDate.toEpochDay) + } + + /** + * For dates of Before Common Era, the function converts the given number of days since the epoch + * day 1970-01-01 to a local date in Julian calendar, interprets the result as a local date in + * Proleptic Gregorian calendar, and takes the number of days since the epoch from the Gregorian + * local date. + * + * For dates of Common Era, the function performs rebasing via the pre-calculated array of diffs + * between Julian and Proleptic Gregorian calendars. + * * This is used to guarantee backward compatibility, as Spark 2.4 and earlier versions use * Julian calendar for dates before 1582-10-15, while Spark 3.0 and later use Proleptic Gregorian * calendar. See SPARK-26651. @@ -88,7 +126,11 @@ object RebaseDateTime { * @return The rebased number of days in Gregorian calendar. */ def rebaseJulianToGregorianDays(days: Int): Int = { - rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days) + if (days < julianCommonEraStartDay) { + localRebaseJulianToGregorianDays(days) + } else { + rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days) + } } // The differences in days between Proleptic Gregorian and Julian dates. @@ -100,15 +142,46 @@ object RebaseDateTime { // The starting point is the `0001-01-01` (-719162 days since the epoch in // Proleptic Gregorian calendar). All dates before the staring point have the same // difference of -2 days in Proleptic Gregorian and Julian calendars. + // Rebasing switch days and diffs `gregJulianDiffSwitchDay` and `gregJulianDiffs` + // was generated by the `localRebaseGregorianToJulianDays` function. private val gregJulianDiffSwitchDay = Array( -719162, -682944, -646420, -609896, -536847, -500323, -463799, -390750, -354226, -317702, -244653, -208129, -171605, -141427) + // The first days of Common Era (CE) which is mapped to the '0001-01-01' date + // in Proleptic Gregorian calendar. + private final val gregorianCommonEraStartDay = gregJulianDiffSwitchDay(0) + /** * Converts the given number of days since the epoch day 1970-01-01 to a local date in Proleptic * Gregorian calendar, interprets the result as a local date in Julian calendar, and takes the * number of days since the epoch from the Julian local date. * + * @param days The number of days since the epoch in Proleptic Gregorian calendar. + * It can be negative. + * @return The rebased number of days in Julian calendar. + */ + private[sql] def localRebaseGregorianToJulianDays(days: Int): Int = { + val localDate = LocalDate.ofEpochDay(days) + val utcCal = new Calendar.Builder() + // `gregory` is a hybrid calendar that supports both + // the Julian and Gregorian calendar systems + .setCalendarType("gregory") + .setTimeZone(TimeZoneUTC) + .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) + .build() + Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY)) + } + + /** + * For dates of Before Common Era, Converts the given number of days since the epoch day + * 1970-01-01 to a local date in Proleptic Gregorian calendar, interprets the result as a local + * date in Julian calendar, and takes the number of days since the epoch from the Julian + * local date. + * + * For dates of Common Era, the function performs rebasing via the pre-calculated array of diffs + * between Proleptic Gregorian and Julian calendars. + * * This is used to guarantee backward compatibility, as Spark 2.4 and earlier versions use * Julian calendar for dates before 1582-10-15, while Spark 3.0 and later use Proleptic Gregorian * calendar. See SPARK-26651. @@ -122,7 +195,11 @@ object RebaseDateTime { * @return The rebased number of days since the epoch in Julian calendar. */ def rebaseGregorianToJulianDays(days: Int): Int = { - rebaseDays(gregJulianDiffSwitchDay, gregJulianDiffs, days) + if (days < gregorianCommonEraStartDay) { + localRebaseGregorianToJulianDays(days) + } else { + rebaseDays(gregJulianDiffSwitchDay, gregJulianDiffs, days) + } } @@ -248,8 +325,8 @@ object RebaseDateTime { /** * An optimized version of [[rebaseGregorianToJulianMicros(ZoneId, Long)]]. This method leverages * the pre-calculated rebasing maps to save calculation. If the rebasing map doesn't contain - * information about the current JVM system time zone, the function falls back to the regular - * unoptimized version. + * information about the current JVM system time zone or `micros` is related to Before Common Era, + * the function falls back to the regular unoptimized version. * * Note: The function assumes that the input micros was derived from a local timestamp * at the default system JVM time zone in Proleptic Gregorian calendar. @@ -262,7 +339,7 @@ object RebaseDateTime { val timeZone = TimeZone.getDefault val tzId = timeZone.getID val rebaseRecord = gregJulianRebaseMap.getOrNull(tzId) - if (rebaseRecord == null) { + if (rebaseRecord == null || micros < rebaseRecord.switches(0)) { rebaseGregorianToJulianMicros(timeZone.toZoneId, micros) } else { rebaseMicros(rebaseRecord, micros) @@ -308,6 +385,7 @@ object RebaseDateTime { cal.get(Calendar.MINUTE), cal.get(Calendar.SECOND), (Math.floorMod(micros, MICROS_PER_SECOND) * NANOS_PER_MICROS).toInt) + .`with`(ChronoField.ERA, cal.get(Calendar.ERA)) .plusDays(cal.get(Calendar.DAY_OF_MONTH) - 1) val zonedDateTime = localDateTime.atZone(zoneId) // Assuming the daylight saving switchover time is 2:00, the local clock will go back to @@ -335,8 +413,8 @@ object RebaseDateTime { /** * An optimized version of [[rebaseJulianToGregorianMicros(ZoneId, Long)]]. This method leverages * the pre-calculated rebasing maps to save calculation. If the rebasing map doesn't contain - * information about the current JVM system time zone, the function falls back to the regular - * unoptimized version. + * information about the current JVM system time zone or `micros` is related to Before Common Era, + * the function falls back to the regular unoptimized version. * * Note: The function assumes that the input micros was derived from a local timestamp * at the default system JVM time zone in the Julian calendar. @@ -349,7 +427,7 @@ object RebaseDateTime { val timeZone = TimeZone.getDefault val tzId = timeZone.getID val rebaseRecord = julianGregRebaseMap.getOrNull(tzId) - if (rebaseRecord == null) { + if (rebaseRecord == null || micros < rebaseRecord.switches(0)) { rebaseJulianToGregorianMicros(timeZone.toZoneId, micros) } else { rebaseMicros(rebaseRecord, micros) diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala index dcc80b1..1e70e60 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util import java.sql.{Date, Timestamp} import java.time.{Instant, LocalDate, LocalDateTime, ZoneId} -import java.util.{Calendar, TimeZone} +import java.util.TimeZone import org.scalatest.Matchers @@ -32,15 +32,18 @@ import org.apache.spark.sql.catalyst.util.RebaseDateTime._ class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper { - private def parseToJulianMicros(s: String): Long = { - val ts = Timestamp.valueOf(s) + private def toJulianMicros(ts: Timestamp): Long = { val julianMicros = fromMillis(ts.getTime) + ((ts.getNanos / NANOS_PER_MICROS) % MICROS_PER_MILLIS) julianMicros } + private def parseToJulianMicros(s: String): Long = toJulianMicros(Timestamp.valueOf(s)) + private def toGregorianMicros(ldt: LocalDateTime, zoneId: ZoneId): Long = { + instantToMicros(ldt.atZone(zoneId).toInstant) + } private def parseToGregMicros(s: String, zoneId: ZoneId): Long = { - instantToMicros(LocalDateTime.parse(s).atZone(zoneId).toInstant) + toGregorianMicros(LocalDateTime.parse(s), zoneId) } test("rebase julian to/from gregorian micros") { @@ -141,53 +144,17 @@ class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper { } test("optimization of days rebasing - Gregorian to Julian") { - // Rebasing switch days and diffs `gregJulianDiffSwitchDay` and `gregJulianDiffs` - // in `RebaseDateTime` was generated by using this function. - def refRebaseGregorianToJulianDays(days: Int): Int = { - val localDate = LocalDate.ofEpochDay(days) - val utcCal = new Calendar.Builder() - // `gregory` is a hybrid calendar that supports both - // the Julian and Gregorian calendar systems - .setCalendarType("gregory") - .setTimeZone(TimeZoneUTC) - .setDate(localDate.getYear, localDate.getMonthValue - 1, localDate.getDayOfMonth) - .build() - Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY)) - } - val start = localDateToDays(LocalDate.of(1, 1, 1)) val end = localDateToDays(LocalDate.of(2030, 1, 1)) var days = start while (days < end) { - assert(rebaseGregorianToJulianDays(days) === refRebaseGregorianToJulianDays(days)) + assert(rebaseGregorianToJulianDays(days) === localRebaseGregorianToJulianDays(days)) days += 1 } } test("optimization of days rebasing - Julian to Gregorian") { - // Rebasing switch days and diffs `julianGregDiffSwitchDay` and `julianGregDiffs` - // in `RebaseDateTime` was generated by using this function. - def refRebaseJulianToGregorianDays(days: Int): Int = { - val utcCal = new Calendar.Builder() - // `gregory` is a hybrid calendar that supports both - // the Julian and Gregorian calendar systems - .setCalendarType("gregory") - .setTimeZone(TimeZoneUTC) - .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY)) - .build() - val localDate = LocalDate.of( - utcCal.get(Calendar.YEAR), - utcCal.get(Calendar.MONTH) + 1, - // The number of days will be added later to handle non-existing - // Julian dates in Proleptic Gregorian calendar. - // For example, 1000-02-29 exists in Julian calendar because 1000 - // is a leap year but it is not a leap year in Gregorian calendar. - 1) - .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1) - Math.toIntExact(localDate.toEpochDay) - } - val start = rebaseGregorianToJulianDays( localDateToDays(LocalDate.of(1, 1, 1))) val end = rebaseGregorianToJulianDays( @@ -195,7 +162,7 @@ class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper { var days = start while (days < end) { - assert(rebaseJulianToGregorianDays(days) === refRebaseJulianToGregorianDays(days)) + assert(rebaseJulianToGregorianDays(days) === localRebaseJulianToGregorianDays(days)) days += 1 } } @@ -360,4 +327,41 @@ class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper { dir = "/Users/maximgekk/tmp", fileName = "julian-gregorian-rebase-micros.json") } + + test("rebase gregorian to/from julian days - BCE era") { + outstandingZoneIds.foreach { zid => + withDefaultTimeZone(zid) { + Seq( + (-1100, 1, 1), + (-1044, 3, 5), + (-44, 3, 5)).foreach { case (year, month, day) => + val julianDays = fromJavaDateLegacy(new Date(year - 1900, month - 1, day)) + val gregorianDays = localDateToDays(LocalDate.of(year, month, day)) + + assert(rebaseGregorianToJulianDays(gregorianDays) === julianDays) + assert(rebaseJulianToGregorianDays(julianDays) === gregorianDays) + } + } + } + } + + test("rebase gregorian to/from julian micros - BCE era") { + outstandingZoneIds.foreach { zid => + withDefaultTimeZone(zid) { + Seq( + (-1100, 1, 1, 1, 2, 3, 0), + (-1044, 3, 5, 0, 0, 0, 123456000), + (-44, 3, 5, 23, 59, 59, 99999000) + ).foreach { case (year, month, day, hour, minute, second, nanos) => + val julianMicros = toJulianMicros(new Timestamp( + year - 1900, month - 1, day, hour, minute, second, nanos)) + val gregorianMicros = toGregorianMicros(LocalDateTime.of( + year, month, day, hour, minute, second, nanos), zid) + + assert(rebaseGregorianToJulianMicros(gregorianMicros) === julianMicros) + assert(rebaseJulianToGregorianMicros(julianMicros) === gregorianMicros) + } + } + } + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org