This is an automated email from the ASF dual-hosted git repository.

yao pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 733e53a4ff03 [SPARK-47968][SQL] MsSQLServer: Map datetimeoffset to TimestampType
733e53a4ff03 is described below

commit 733e53a4ff035b71a4865e1a88271af067d4765d
Author: Kent Yao <y...@apache.org>
AuthorDate: Fri Apr 26 23:42:20 2024 +0800

    [SPARK-47968][SQL] MsSQLServer: Map datetimeoffset to TimestampType
    
    ### What changes were proposed in this pull request?
    
    This PR changes the `datetimeoffset -> StringType` mapping to
    `datetimeoffset -> TimestampType`, as we use `mssql-jdbc` for Microsoft
    SQL Server. `spark.sql.legacy.mssqlserver.datetimeoffsetMapping.enabled` is
    provided for users to restore the old behavior.
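
    For illustration, here is a minimal sketch of the new default and the legacy
    opt-out. It assumes a `spark-shell`-style session where `spark` is available
    and `jdbcUrl`/`dates` point at a reachable SQL Server table (mirroring the
    integration test below); it is not part of the change itself.

    ```scala
    import java.util.Properties

    // New default: DATETIMEOFFSET columns come back as TimestampType.
    val df = spark.read.jdbc(jdbcUrl, "dates", new Properties)

    // Opt back into the Spark 3.5 behavior (DATETIMEOFFSET -> StringType).
    spark.conf.set("spark.sql.legacy.mssqlserver.datetimeoffsetMapping.enabled", "true")
    val legacyDf = spark.read.jdbc(jdbcUrl, "dates", new Properties)
    ```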
    
    ### Why are the changes needed?
    
    With the official SQL Server client, it is more reasonable to read
    DATETIMEOFFSET as TimestampType, which is also more consistent with the
    other JDBC data sources.
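
    As an illustrative sketch (values taken from the updated integration test
    below, with a UTC session time zone), the same DATETIMEOFFSET value surfaces
    as follows under the two mappings:

    ```scala
    // Legacy mapping (StringType): the server-rendered text, offset included.
    val legacyValue: String = "1901-05-09 23:59:59.0000000 +14:00"
    // New mapping (TimestampType): the same instant, rendered in the UTC session time zone.
    val newValue = java.sql.Timestamp.valueOf("1901-05-09 09:59:59")
    ```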
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, please refer to the first section.

    ### How was this patch tested?
    
    New tests.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    no
    
    Closes #46239 from yaooqinn/SPARK-47968.
    
    Authored-by: Kent Yao <y...@apache.org>
    Signed-off-by: Kent Yao <y...@apache.org>
---
 .../sql/jdbc/MsSqlServerIntegrationSuite.scala     | 59 +++++++++++++---------
 docs/sql-data-sources-jdbc.md                      |  2 +-
 docs/sql-migration-guide.md                        |  1 +
 .../org/apache/spark/sql/internal/SQLConf.scala    | 12 +++++
 .../apache/spark/sql/jdbc/MsSqlServerDialect.scala |  7 ++-
 5 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala
index a39dcb60406e..623f404339e9 100644
--- a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala
+++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala
@@ -223,29 +223,42 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite {
 
   test("Date types") {
     withDefaultTimeZone(UTC) {
-      {
-        val df = spark.read
-          .option("preferTimestampNTZ", "false")
-          .jdbc(jdbcUrl, "dates", new Properties)
-        checkAnswer(df, Row(
-          Date.valueOf("1991-11-09"),
-          Timestamp.valueOf("1999-01-01 13:23:35"),
-          Timestamp.valueOf("9999-12-31 23:59:59"),
-          "1901-05-09 23:59:59.0000000 +14:00",
-          Timestamp.valueOf("1996-01-01 23:24:00"),
-          Timestamp.valueOf("1970-01-01 13:31:24")))
-      }
-      {
-        val df = spark.read
-          .option("preferTimestampNTZ", "true")
-          .jdbc(jdbcUrl, "dates", new Properties)
-        checkAnswer(df, Row(
-          Date.valueOf("1991-11-09"),
-          LocalDateTime.of(1999, 1, 1, 13, 23, 35),
-          LocalDateTime.of(9999, 12, 31, 23, 59, 59),
-          "1901-05-09 23:59:59.0000000 +14:00",
-          LocalDateTime.of(1996, 1, 1, 23, 24, 0),
-          LocalDateTime.of(1970, 1, 1, 13, 31, 24)))
+      Seq(true, false).foreach { ntz =>
+        Seq(true, false).foreach { legacy =>
+          withSQLConf(
+            SQLConf.LEGACY_MSSQLSERVER_DATETIMEOFFSET_MAPPING_ENABLED.key -> legacy.toString) {
+            val df = spark.read
+              .option("preferTimestampNTZ", ntz)
+              .jdbc(jdbcUrl, "dates", new Properties)
+            checkAnswer(df, Row(
+              Date.valueOf("1991-11-09"),
+              if (ntz) {
+                LocalDateTime.of(1999, 1, 1, 13, 23, 35)
+              } else {
+                Timestamp.valueOf("1999-01-01 13:23:35")
+              },
+              if (ntz) {
+                LocalDateTime.of(9999, 12, 31, 23, 59, 59)
+              } else {
+                Timestamp.valueOf("9999-12-31 23:59:59")
+              },
+              if (legacy) {
+                "1901-05-09 23:59:59.0000000 +14:00"
+              } else {
+                Timestamp.valueOf("1901-05-09 09:59:59")
+              },
+              if (ntz) {
+                LocalDateTime.of(1996, 1, 1, 23, 24, 0)
+              } else {
+                Timestamp.valueOf("1996-01-01 23:24:00")
+              },
+              if (ntz) {
+                LocalDateTime.of(1970, 1, 1, 13, 31, 24)
+              } else {
+                Timestamp.valueOf("1970-01-01 13:31:24")
+              }))
+          }
+        }
       }
     }
   }
diff --git a/docs/sql-data-sources-jdbc.md b/docs/sql-data-sources-jdbc.md
index 734ed43f912a..e217d87801e5 100644
--- a/docs/sql-data-sources-jdbc.md
+++ b/docs/sql-data-sources-jdbc.md
@@ -1540,7 +1540,7 @@ as the activated JDBC Driver.
     </tr>
     <tr>
       <td>datetimeoffset [ (fractional seconds precision) ]</td>
-      <td>StringType</td>
+      <td>TimestampType</td>
       <td></td>
     </tr>
     <tr>
diff --git a/docs/sql-migration-guide.md b/docs/sql-migration-guide.md
index 024423fb145a..1e0fdadde1e3 100644
--- a/docs/sql-migration-guide.md
+++ b/docs/sql-migration-guide.md
@@ -48,6 +48,7 @@ license: |
 - Since Spark 4.0, MySQL JDBC datasource will write ShortType as SMALLINT, while in Spark 3.5 and previous, write as INTEGER. To restore the previous behavior, you can replace the column with IntegerType whenever before writing.
 - Since Spark 4.0, Oracle JDBC datasource will write TimestampType as TIMESTAMP WITH LOCAL TIME ZONE, while in Spark 3.5 and previous, write as TIMESTAMP. To restore the previous behavior, set `spark.sql.legacy.oracle.timestampMapping.enabled` to `true`.
 - Since Spark 4.0, MsSQL Server JDBC datasource will read TINYINT as ShortType, while in Spark 3.5 and previous, read as IntegerType. To restore the previous behavior, set `spark.sql.legacy.mssqlserver.numericMapping.enabled` to `true`.
+- Since Spark 4.0, MsSQL Server JDBC datasource will read DATETIMEOFFSET as TimestampType, while in Spark 3.5 and previous, read as StringType. To restore the previous behavior, set `spark.sql.legacy.mssqlserver.datetimeoffsetMapping.enabled` to `true`.
 - Since Spark 4.0, The default value for `spark.sql.legacy.ctePrecedencePolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an error, inner CTE definitions take precedence over outer definitions.
 - Since Spark 4.0, The default value for `spark.sql.legacy.timeParserPolicy` has been changed from `EXCEPTION` to `CORRECTED`. Instead of raising an `INCONSISTENT_BEHAVIOR_CROSS_VERSION` error, `CANNOT_PARSE_TIMESTAMP` will be raised if ANSI mode is enable. `NULL` will be returned if ANSI mode is disabled. See [Datetime Patterns for Formatting and Parsing](sql-ref-datetime-pattern.html).
 - Since Spark 4.0, A bug falsely allowing `!` instead of `NOT` when `!` is not a prefix operator has been fixed. Clauses such as `expr ! IN (...)`, `expr ! BETWEEN ...`, or `col ! NULL` now raise syntax errors. To restore the previous behavior, set `spark.sql.legacy.bangEqualsNot` to `true`.
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index b8ee54538f46..ac4a4ef90d0d 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -4154,6 +4154,15 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val LEGACY_MSSQLSERVER_DATETIMEOFFSET_MAPPING_ENABLED =
+    buildConf("spark.sql.legacy.mssqlserver.datetimeoffsetMapping.enabled")
+      .internal()
+      .doc("When true, DATETIMEOFFSET is mapped to StringType; otherwise, it 
is mapped to " +
+        "TimestampType.")
+      .version("4.0.0")
+      .booleanConf
+      .createWithDefault(false)
+
   val LEGACY_MYSQL_BIT_ARRAY_MAPPING_ENABLED =
     buildConf("spark.sql.legacy.mysql.bitArrayMapping.enabled")
       .internal()
@@ -5279,6 +5288,9 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
   def legacyMsSqlServerNumericMappingEnabled: Boolean =
     getConf(LEGACY_MSSQLSERVER_NUMERIC_MAPPING_ENABLED)
 
+  def legacyMsSqlServerDatetimeOffsetMappingEnabled: Boolean =
+    getConf(LEGACY_MSSQLSERVER_DATETIMEOFFSET_MAPPING_ENABLED)
+
   def legacyMySqlBitArrayMappingEnabled: Boolean =
     getConf(LEGACY_MYSQL_BIT_ARRAY_MAPPING_ENABLED)
 
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala
index 3d908d69615b..a1492d81bf53 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/jdbc/MsSqlServerDialect.scala
@@ -111,8 +111,11 @@ private case class MsSqlServerDialect() extends JdbcDialect {
      sqlType: Int, typeName: String, size: Int, md: MetadataBuilder): Option[DataType] = {
     sqlType match {
       case _ if typeName.contains("datetimeoffset") =>
-        // String is recommend by Microsoft SQL Server for datetimeoffset types in non-MS clients
-        Option(StringType)
+        if (SQLConf.get.legacyMsSqlServerDatetimeOffsetMappingEnabled) {
+          Some(StringType)
+        } else {
+          Some(TimestampType)
+        }
       case java.sql.Types.SMALLINT | java.sql.Types.TINYINT
           if !SQLConf.get.legacyMsSqlServerNumericMappingEnabled =>
        // Data range of TINYINT is 0-255 so it needs to be stored in ShortType.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
