This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new d9ad111  [SPARK-31402][SQL] Fix rebasing of BCE dates/timestamps
d9ad111 is described below

commit d9ad11110428b2d89fcb2f0c0cbdf84a9970910a
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Mon Apr 13 06:07:31 2020 +0000

    [SPARK-31402][SQL] Fix rebasing of BCE dates/timestamps
    
    In the PR, I propose to fallback to rebasing via local dates/timestamps for 
days/micros of before common era (BCE).
    
    It fixes the bug of rebasing dates/timestamps of BCE.
    
    Yes
    
    - By existing tests in `RebaseDateTimeSuite` and `DateTimeUtilsSuite`
    - Added tests for negative years to `RebaseDateTimeSuite`
    
    Closes #28172 from MaxGekk/fix-era-in-date-micros-rebasing.
    
    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit cf63ad61f517a44a2b60c0e4f06cb387569039b3)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../spark/sql/catalyst/util/RebaseDateTime.scala   | 96 ++++++++++++++++++++--
 .../sql/catalyst/util/RebaseDateTimeSuite.scala    | 88 ++++++++++----------
 2 files changed, 133 insertions(+), 51 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
index 7440787..f4a98cf 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/RebaseDateTime.scala
@@ -17,7 +17,7 @@
 
 package org.apache.spark.sql.catalyst.util
 
-import java.time.{LocalDateTime, ZoneId}
+import java.time.{LocalDate, LocalDateTime, ZoneId}
 import java.time.temporal.ChronoField
 import java.util.{Calendar, TimeZone}
 
@@ -66,15 +66,53 @@ object RebaseDateTime {
   // The starting point is the `0001-01-01` (-719164 days since the epoch in
   // Julian calendar). All dates before the staring point have the same 
difference
   // of 2 days in Julian and Proleptic Gregorian calendars.
+  // Rebasing switch days and diffs `julianGregDiffSwitchDay` and 
`julianGregDiffs`
+  // was generated by the `localRebaseJulianToGregorianDays` function.
   private val julianGregDiffSwitchDay = Array(
     -719164, -682945, -646420, -609895, -536845, -500320, -463795,
     -390745, -354220, -317695, -244645, -208120, -171595, -141427)
 
+  // The first days of Common Era (CE) which is mapped to the '0001-01-01' 
date in Julian calendar.
+  private final val julianCommonEraStartDay = julianGregDiffSwitchDay(0)
+
   /**
    * Converts the given number of days since the epoch day 1970-01-01 to a 
local date in Julian
    * calendar, interprets the result as a local date in Proleptic Gregorian 
calendar, and takes the
    * number of days since the epoch from the Gregorian local date.
    *
+   * @param days The number of days since the epoch in Julian calendar. It can 
be negative.
+   * @return The rebased number of days in Gregorian calendar.
+   */
+  private[sql] def localRebaseJulianToGregorianDays(days: Int): Int = {
+    val utcCal = new Calendar.Builder()
+      // `gregory` is a hybrid calendar that supports both
+      // the Julian and Gregorian calendar systems
+      .setCalendarType("gregory")
+      .setTimeZone(TimeZoneUTC)
+      .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY))
+      .build()
+    val localDate = LocalDate.of(
+      utcCal.get(Calendar.YEAR),
+      utcCal.get(Calendar.MONTH) + 1,
+      // The number of days will be added later to handle non-existing
+      // Julian dates in Proleptic Gregorian calendar.
+      // For example, 1000-02-29 exists in Julian calendar because 1000
+      // is a leap year but it is not a leap year in Gregorian calendar.
+      1)
+      .`with`(ChronoField.ERA, utcCal.get(Calendar.ERA))
+      .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1)
+    Math.toIntExact(localDate.toEpochDay)
+  }
+
+  /**
+   * For dates of Before Common Era, the function converts the given number of 
days since the epoch
+   * day 1970-01-01 to a local date in Julian calendar, interprets the result 
as a local date in
+   * Proleptic Gregorian calendar, and takes the number of days since the 
epoch from the Gregorian
+   * local date.
+   *
+   * For dates of Common Era, the function performs rebasing via the 
pre-calculated array of diffs
+   * between Julian and Proleptic Gregorian calendars.
+   *
    * This is used to guarantee backward compatibility, as Spark 2.4 and 
earlier versions use
    * Julian calendar for dates before 1582-10-15, while Spark 3.0 and later 
use Proleptic Gregorian
    * calendar. See SPARK-26651.
@@ -88,7 +126,11 @@ object RebaseDateTime {
    * @return The rebased number of days in Gregorian calendar.
    */
   def rebaseJulianToGregorianDays(days: Int): Int = {
-    rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days)
+    if (days < julianCommonEraStartDay) {
+      localRebaseJulianToGregorianDays(days)
+    } else {
+      rebaseDays(julianGregDiffSwitchDay, julianGregDiffs, days)
+    }
   }
 
   // The differences in days between Proleptic Gregorian and Julian dates.
@@ -100,15 +142,46 @@ object RebaseDateTime {
   // The starting point is the `0001-01-01` (-719162 days since the epoch in
   // Proleptic Gregorian calendar). All dates before the staring point have 
the same
   // difference of -2 days in Proleptic Gregorian and Julian calendars.
+  // Rebasing switch days and diffs `gregJulianDiffSwitchDay` and 
`gregJulianDiffs`
+  // was generated by the `localRebaseGregorianToJulianDays` function.
   private val gregJulianDiffSwitchDay = Array(
     -719162, -682944, -646420, -609896, -536847, -500323, -463799,
     -390750, -354226, -317702, -244653, -208129, -171605, -141427)
 
+  // The first days of Common Era (CE) which is mapped to the '0001-01-01' date
+  // in Proleptic Gregorian calendar.
+  private final val gregorianCommonEraStartDay = gregJulianDiffSwitchDay(0)
+
   /**
    * Converts the given number of days since the epoch day 1970-01-01 to a 
local date in Proleptic
    * Gregorian calendar, interprets the result as a local date in Julian 
calendar, and takes the
    * number of days since the epoch from the Julian local date.
    *
+   * @param days The number of days since the epoch in Proleptic Gregorian 
calendar.
+   *             It can be negative.
+   * @return The rebased number of days in Julian calendar.
+   */
+  private[sql] def localRebaseGregorianToJulianDays(days: Int): Int = {
+    val localDate = LocalDate.ofEpochDay(days)
+    val utcCal = new Calendar.Builder()
+      // `gregory` is a hybrid calendar that supports both
+      // the Julian and Gregorian calendar systems
+      .setCalendarType("gregory")
+      .setTimeZone(TimeZoneUTC)
+      .setDate(localDate.getYear, localDate.getMonthValue - 1, 
localDate.getDayOfMonth)
+      .build()
+    Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY))
+  }
+
+  /**
+   * For dates of Before Common Era, Converts the given number of days since 
the epoch day
+   * 1970-01-01 to a local date in Proleptic Gregorian calendar, interprets 
the result as a local
+   * date in Julian calendar, and takes the number of days since the epoch 
from the Julian
+   * local date.
+   *
+   * For dates of Common Era, the function performs rebasing via the 
pre-calculated array of diffs
+   * between Proleptic Gregorian and Julian calendars.
+   *
    * This is used to guarantee backward compatibility, as Spark 2.4 and 
earlier versions use
    * Julian calendar for dates before 1582-10-15, while Spark 3.0 and later 
use Proleptic Gregorian
    * calendar. See SPARK-26651.
@@ -122,7 +195,11 @@ object RebaseDateTime {
    * @return The rebased number of days since the epoch in Julian calendar.
    */
   def rebaseGregorianToJulianDays(days: Int): Int = {
-    rebaseDays(gregJulianDiffSwitchDay, gregJulianDiffs, days)
+    if (days < gregorianCommonEraStartDay) {
+      localRebaseGregorianToJulianDays(days)
+    } else {
+      rebaseDays(gregJulianDiffSwitchDay, gregJulianDiffs, days)
+    }
   }
 
 
@@ -248,8 +325,8 @@ object RebaseDateTime {
   /**
    * An optimized version of [[rebaseGregorianToJulianMicros(ZoneId, Long)]]. 
This method leverages
    * the pre-calculated rebasing maps to save calculation. If the rebasing map 
doesn't contain
-   * information about the current JVM system time zone, the function falls 
back to the regular
-   * unoptimized version.
+   * information about the current JVM system time zone or `micros` is related 
to Before Common Era,
+   * the function falls back to the regular unoptimized version.
    *
    * Note: The function assumes that the input micros was derived from a local 
timestamp
    *       at the default system JVM time zone in Proleptic Gregorian calendar.
@@ -262,7 +339,7 @@ object RebaseDateTime {
     val timeZone = TimeZone.getDefault
     val tzId = timeZone.getID
     val rebaseRecord = gregJulianRebaseMap.getOrNull(tzId)
-    if (rebaseRecord == null) {
+    if (rebaseRecord == null || micros < rebaseRecord.switches(0)) {
       rebaseGregorianToJulianMicros(timeZone.toZoneId, micros)
     } else {
       rebaseMicros(rebaseRecord, micros)
@@ -308,6 +385,7 @@ object RebaseDateTime {
       cal.get(Calendar.MINUTE),
       cal.get(Calendar.SECOND),
       (Math.floorMod(micros, MICROS_PER_SECOND) * NANOS_PER_MICROS).toInt)
+      .`with`(ChronoField.ERA, cal.get(Calendar.ERA))
       .plusDays(cal.get(Calendar.DAY_OF_MONTH) - 1)
     val zonedDateTime = localDateTime.atZone(zoneId)
     // Assuming the daylight saving switchover time is 2:00, the local clock 
will go back to
@@ -335,8 +413,8 @@ object RebaseDateTime {
   /**
    * An optimized version of [[rebaseJulianToGregorianMicros(ZoneId, Long)]]. 
This method leverages
    * the pre-calculated rebasing maps to save calculation. If the rebasing map 
doesn't contain
-   * information about the current JVM system time zone, the function falls 
back to the regular
-   * unoptimized version.
+   * information about the current JVM system time zone or `micros` is related 
to Before Common Era,
+   * the function falls back to the regular unoptimized version.
    *
    * Note: The function assumes that the input micros was derived from a local 
timestamp
    *       at the default system JVM time zone in the Julian calendar.
@@ -349,7 +427,7 @@ object RebaseDateTime {
     val timeZone = TimeZone.getDefault
     val tzId = timeZone.getID
     val rebaseRecord = julianGregRebaseMap.getOrNull(tzId)
-    if (rebaseRecord == null) {
+    if (rebaseRecord == null || micros < rebaseRecord.switches(0)) {
       rebaseJulianToGregorianMicros(timeZone.toZoneId, micros)
     } else {
       rebaseMicros(rebaseRecord, micros)
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
index dcc80b1..1e70e60 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/util/RebaseDateTimeSuite.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.catalyst.util
 
 import java.sql.{Date, Timestamp}
 import java.time.{Instant, LocalDate, LocalDateTime, ZoneId}
-import java.util.{Calendar, TimeZone}
+import java.util.TimeZone
 
 import org.scalatest.Matchers
 
@@ -32,15 +32,18 @@ import org.apache.spark.sql.catalyst.util.RebaseDateTime._
 
 class RebaseDateTimeSuite extends SparkFunSuite with Matchers with SQLHelper {
 
-  private def parseToJulianMicros(s: String): Long = {
-    val ts = Timestamp.valueOf(s)
+  private def toJulianMicros(ts: Timestamp): Long = {
     val julianMicros = fromMillis(ts.getTime) +
       ((ts.getNanos / NANOS_PER_MICROS) % MICROS_PER_MILLIS)
     julianMicros
   }
+  private def parseToJulianMicros(s: String): Long = 
toJulianMicros(Timestamp.valueOf(s))
 
+  private def toGregorianMicros(ldt: LocalDateTime, zoneId: ZoneId): Long = {
+    instantToMicros(ldt.atZone(zoneId).toInstant)
+  }
   private def parseToGregMicros(s: String, zoneId: ZoneId): Long = {
-    instantToMicros(LocalDateTime.parse(s).atZone(zoneId).toInstant)
+    toGregorianMicros(LocalDateTime.parse(s), zoneId)
   }
 
   test("rebase julian to/from gregorian micros") {
@@ -141,53 +144,17 @@ class RebaseDateTimeSuite extends SparkFunSuite with 
Matchers with SQLHelper {
   }
 
   test("optimization of days rebasing - Gregorian to Julian") {
-    // Rebasing switch days and diffs `gregJulianDiffSwitchDay` and 
`gregJulianDiffs`
-    // in `RebaseDateTime` was generated by using this function.
-    def refRebaseGregorianToJulianDays(days: Int): Int = {
-      val localDate = LocalDate.ofEpochDay(days)
-      val utcCal = new Calendar.Builder()
-        // `gregory` is a hybrid calendar that supports both
-        // the Julian and Gregorian calendar systems
-        .setCalendarType("gregory")
-        .setTimeZone(TimeZoneUTC)
-        .setDate(localDate.getYear, localDate.getMonthValue - 1, 
localDate.getDayOfMonth)
-        .build()
-      Math.toIntExact(Math.floorDiv(utcCal.getTimeInMillis, MILLIS_PER_DAY))
-    }
-
     val start = localDateToDays(LocalDate.of(1, 1, 1))
     val end = localDateToDays(LocalDate.of(2030, 1, 1))
 
     var days = start
     while (days < end) {
-      assert(rebaseGregorianToJulianDays(days) === 
refRebaseGregorianToJulianDays(days))
+      assert(rebaseGregorianToJulianDays(days) === 
localRebaseGregorianToJulianDays(days))
       days += 1
     }
   }
 
   test("optimization of days rebasing - Julian to Gregorian") {
-    // Rebasing switch days and diffs `julianGregDiffSwitchDay` and 
`julianGregDiffs`
-    // in `RebaseDateTime` was generated by using this function.
-    def refRebaseJulianToGregorianDays(days: Int): Int = {
-      val utcCal = new Calendar.Builder()
-        // `gregory` is a hybrid calendar that supports both
-        // the Julian and Gregorian calendar systems
-        .setCalendarType("gregory")
-        .setTimeZone(TimeZoneUTC)
-        .setInstant(Math.multiplyExact(days, MILLIS_PER_DAY))
-        .build()
-      val localDate = LocalDate.of(
-        utcCal.get(Calendar.YEAR),
-        utcCal.get(Calendar.MONTH) + 1,
-        // The number of days will be added later to handle non-existing
-        // Julian dates in Proleptic Gregorian calendar.
-        // For example, 1000-02-29 exists in Julian calendar because 1000
-        // is a leap year but it is not a leap year in Gregorian calendar.
-        1)
-        .plusDays(utcCal.get(Calendar.DAY_OF_MONTH) - 1)
-      Math.toIntExact(localDate.toEpochDay)
-    }
-
     val start = rebaseGregorianToJulianDays(
       localDateToDays(LocalDate.of(1, 1, 1)))
     val end = rebaseGregorianToJulianDays(
@@ -195,7 +162,7 @@ class RebaseDateTimeSuite extends SparkFunSuite with 
Matchers with SQLHelper {
 
     var days = start
     while (days < end) {
-      assert(rebaseJulianToGregorianDays(days) === 
refRebaseJulianToGregorianDays(days))
+      assert(rebaseJulianToGregorianDays(days) === 
localRebaseJulianToGregorianDays(days))
       days += 1
     }
   }
@@ -360,4 +327,41 @@ class RebaseDateTimeSuite extends SparkFunSuite with 
Matchers with SQLHelper {
       dir = "/Users/maximgekk/tmp",
       fileName = "julian-gregorian-rebase-micros.json")
   }
+
+  test("rebase gregorian to/from julian days - BCE era") {
+    outstandingZoneIds.foreach { zid =>
+      withDefaultTimeZone(zid) {
+        Seq(
+          (-1100, 1, 1),
+          (-1044, 3, 5),
+          (-44, 3, 5)).foreach { case (year, month, day) =>
+          val julianDays = fromJavaDateLegacy(new Date(year - 1900, month - 1, 
day))
+          val gregorianDays = localDateToDays(LocalDate.of(year, month, day))
+
+          assert(rebaseGregorianToJulianDays(gregorianDays) === julianDays)
+          assert(rebaseJulianToGregorianDays(julianDays) === gregorianDays)
+        }
+      }
+    }
+  }
+
+  test("rebase gregorian to/from julian micros - BCE era") {
+    outstandingZoneIds.foreach { zid =>
+      withDefaultTimeZone(zid) {
+        Seq(
+          (-1100, 1, 1, 1, 2, 3, 0),
+          (-1044, 3, 5, 0, 0, 0, 123456000),
+          (-44, 3, 5, 23, 59, 59, 99999000)
+        ).foreach { case (year, month, day, hour, minute, second, nanos) =>
+          val julianMicros = toJulianMicros(new Timestamp(
+            year - 1900, month - 1, day, hour, minute, second, nanos))
+          val gregorianMicros = toGregorianMicros(LocalDateTime.of(
+            year, month, day, hour, minute, second, nanos), zid)
+
+          assert(rebaseGregorianToJulianMicros(gregorianMicros) === 
julianMicros)
+          assert(rebaseJulianToGregorianMicros(julianMicros) === 
gregorianMicros)
+        }
+      }
+    }
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to