This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new 080c51e  [SPARK-31641][SQL] Fix days conversions by JSON legacy parser
080c51e is described below

commit 080c51e6b6268002948dd14171233bd35d954529
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Tue May 5 14:15:31 2020 +0000

    [SPARK-31641][SQL] Fix days conversions by JSON legacy parser
    
    ### What changes were proposed in this pull request?
    Perform day rebasing while converting days from a JSON string field. In
    Spark 2.4 and earlier versions, the days are interpreted as days since
    the epoch in the hybrid calendar (Julian + Gregorian since 1582-10-15).
    Since Spark 3.0, the base calendar has been switched to the Proleptic
    Gregorian calendar, so the days should be rebased to represent the same
    local date.
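    
    For illustration, here is a minimal sketch (not Spark's implementation) of
    why the same day count denotes different local dates in the two calendars.
    It uses `java.util.GregorianCalendar` with its default 1582-10-15 cutover
    to model the legacy hybrid calendar:
    ```scala
    import java.time.LocalDate
    import java.util.{Calendar, GregorianCalendar, TimeZone}

    val days = -141704 // day count relative to the epoch 1970-01-01

    // Proleptic Gregorian reading (Spark 3.0 without rebasing):
    LocalDate.ofEpochDay(days) // 1582-01-11

    // Hybrid Julian + Gregorian reading (as in Spark 2.4):
    val cal = new GregorianCalendar(TimeZone.getTimeZone("UTC"))
    cal.clear()
    cal.set(1970, Calendar.JANUARY, 1)
    cal.add(Calendar.DAY_OF_MONTH, days)
    // YEAR = 1582, MONTH = 0 (January), DAY_OF_MONTH = 1, i.e. 1582-01-01:
    // the same day count maps to a different local date.
    (cal.get(Calendar.YEAR), cal.get(Calendar.MONTH), cal.get(Calendar.DAY_OF_MONTH))
    ```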
    
    ### Why are the changes needed?
    The changes fix a bug and restore compatibility with Spark 2.4, in which:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-01|
    +----------+
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    Yes.
    
    Before:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-11|
    +----------+
    ```
    
    After:
    ```scala
    scala> spark.read.schema("d date").json(Seq("{'d': '-141704'}").toDS).show
    +----------+
    |         d|
    +----------+
    |1582-01-01|
    +----------+
    ```
    
    ### How was this patch tested?
    Add a test to `JsonSuite`.
    
    Closes #28453 from MaxGekk/json-rebase-legacy-days.
    
    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
    (cherry picked from commit bd264299317bba91f2dc1dc27fd51e6bc0609d66)
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../org/apache/spark/sql/catalyst/json/JacksonParser.scala   |  2 +-
 .../spark/sql/execution/datasources/json/JsonSuite.scala     | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
index 8965a81..a52c345 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JacksonParser.scala
@@ -259,7 +259,7 @@ class JacksonParser(
                 // In Spark 1.5.0, we store the data as number of days since epoch in string.
                 // So, we just convert it to Int.
                 try {
-                  parser.getText.toInt
+                  RebaseDateTime.rebaseJulianToGregorianDays(parser.getText.toInt)
                 } catch {
                   case _: NumberFormatException => throw e
                 }
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
index 999eadb..4982991 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonSuite.scala
@@ -2653,13 +2653,17 @@ abstract class JsonSuite extends QueryTest with SharedSparkSession with TestJson
     }
   }
 
-  test("SPARK-30960: parse date/timestamp string with legacy format") {
-    val ds = Seq("{'t': '2020-1-12 3:23:34.12', 'd': '2020-1-12 T', 'd2': 
'12345'}").toDS()
-    val json = spark.read.schema("t timestamp, d date, d2 date").json(ds)
+  test("SPARK-30960, SPARK-31641: parse date/timestamp string with legacy 
format") {
+    val julianDay = -141704 // 1582-01-01 in Julian calendar
+    val ds = Seq(
+      s"{'t': '2020-1-12 3:23:34.12', 'd': '2020-1-12 T', 'd2': '12345', 'd3': 
'$julianDay'}"
+    ).toDS()
+    val json = spark.read.schema("t timestamp, d date, d2 date, d3 
date").json(ds)
     checkAnswer(json, Row(
       Timestamp.valueOf("2020-1-12 3:23:34.12"),
       Date.valueOf("2020-1-12"),
-      Date.valueOf(LocalDate.ofEpochDay(12345))))
+      Date.valueOf(LocalDate.ofEpochDay(12345)),
+      Date.valueOf("1582-01-01")))
   }
 
   test("exception mode for parsing date/timestamp string") {

