This is an automated email from the ASF dual-hosted git repository.

gurwls223 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 43a73e3  [SPARK-27528][SQL] Use Parquet logical type TIMESTAMP_MICROS 
by default
43a73e3 is described below

commit 43a73e387cb843486adcf5b8bbd8b99010ce6e02
Author: Maxim Gekk <max.g...@gmail.com>
AuthorDate: Tue Apr 23 11:06:39 2019 +0900

    [SPARK-27528][SQL] Use Parquet logical type TIMESTAMP_MICROS by default
    
    ## What changes were proposed in this pull request?
    
    In the PR, I propose to use the `TIMESTAMP_MICROS` logical type for 
timestamps written to parquet files. The type semantically matches 
Catalyst's `TimestampType`, and stores microseconds since the epoch in the UTC time 
zone. This will allow us to avoid conversions of microseconds to nanoseconds and 
to the Julian calendar. This will also reduce the sizes of written parquet files.
    
    ## How was this patch tested?
    
    By existing test suites.
    
    Closes #24425 from MaxGekk/parquet-timestamp_micros.
    
    Authored-by: Maxim Gekk <max.g...@gmail.com>
    Signed-off-by: HyukjinKwon <gurwls...@apache.org>
---
 docs/sql-migration-guide-upgrade.md                               | 2 ++
 .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala    | 2 +-
 .../datasources/parquet/ParquetInteroperabilitySuite.scala        | 8 ++++++--
 .../test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala   | 2 +-
 4 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/docs/sql-migration-guide-upgrade.md 
b/docs/sql-migration-guide-upgrade.md
index 90a7d8d..54512ae 100644
--- a/docs/sql-migration-guide-upgrade.md
+++ b/docs/sql-migration-guide-upgrade.md
@@ -124,6 +124,8 @@ license: |
 
   - In Spark version 2.4, when a spark session is created via 
`cloneSession()`, the newly created spark session inherits its configuration 
from its parent `SparkContext` even though the same configuration may exist 
with a different value in its parent spark session. Since Spark 3.0, the 
configurations of a parent `SparkSession` have a higher precedence over the 
parent `SparkContext`.
 
+  - Since Spark 3.0, parquet logical type `TIMESTAMP_MICROS` is used by 
default while saving `TIMESTAMP` columns. In Spark version 2.4 and earlier, 
`TIMESTAMP` columns are saved as `INT96` in parquet files. Setting 
`spark.sql.parquet.outputTimestampType` to `INT96` restores the previous behavior.
+
 ## Upgrading from Spark SQL 2.4 to 2.4.1
 
   - The value of `spark.executor.heartbeatInterval`, when specified without 
units like "30" rather than "30s", was
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index b223a48..9ebd2c0 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -405,7 +405,7 @@ object SQLConf {
     .stringConf
     .transform(_.toUpperCase(Locale.ROOT))
     .checkValues(ParquetOutputTimestampType.values.map(_.toString))
-    .createWithDefault(ParquetOutputTimestampType.INT96.toString)
+    .createWithDefault(ParquetOutputTimestampType.TIMESTAMP_MICROS.toString)
 
   val PARQUET_INT64_AS_TIMESTAMP_MILLIS = 
buildConf("spark.sql.parquet.int64AsTimestampMillis")
     .doc(s"(Deprecated since Spark 2.3, please set 
${PARQUET_OUTPUT_TIMESTAMP_TYPE.key}.) " +
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala
index f06e186..09793bd 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala
@@ -120,8 +120,12 @@ class ParquetInteroperabilitySuite extends 
ParquetCompatibilityTest with SharedS
       ).map { s => java.sql.Timestamp.valueOf(s) }
       import testImplicits._
       // match the column names of the file from impala
-      val df = 
spark.createDataset(ts).toDF().repartition(1).withColumnRenamed("value", "ts")
-      df.write.parquet(tableDir.getAbsolutePath)
+      withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key ->
+        SQLConf.ParquetOutputTimestampType.INT96.toString) {
+        val df = spark.createDataset(ts).toDF().repartition(1)
+          .withColumnRenamed("value", "ts")
+        df.write.parquet(tableDir.getAbsolutePath)
+      }
       FileUtils.copyFile(new File(impalaPath), new File(tableDir, 
"part-00001.parq"))
 
       Seq(false, true).foreach { int96TimestampConversion =>
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index 5ecb79b..829dea4 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -257,7 +257,7 @@ class SQLConfSuite extends QueryTest with SharedSQLContext {
 
     // check default value
     assert(spark.sessionState.conf.parquetOutputTimestampType ==
-      SQLConf.ParquetOutputTimestampType.INT96)
+      SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS)
 
     // PARQUET_INT64_AS_TIMESTAMP_MILLIS should be respected.
     spark.sessionState.conf.setConf(SQLConf.PARQUET_INT64_AS_TIMESTAMP_MILLIS, 
true)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to