This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 43a73e3 [SPARK-27528][SQL] Use Parquet logical type TIMESTAMP_MICROS by default 43a73e3 is described below commit 43a73e387cb843486adcf5b8bbd8b99010ce6e02 Author: Maxim Gekk <max.g...@gmail.com> AuthorDate: Tue Apr 23 11:06:39 2019 +0900 [SPARK-27528][SQL] Use Parquet logical type TIMESTAMP_MICROS by default ## What changes were proposed in this pull request? In the PR, I propose to use the `TIMESTAMP_MICROS` logical type for timestamps written to parquet files. The type matches semantically to Catalyst's `TimestampType`, and stores microseconds since epoch in UTC time zone. This will allow to avoid conversions of microseconds to nanoseconds and to Julian calendar. Also this will reduce sizes of written parquet files. ## How was this patch tested? By existing test suites. Closes #24425 from MaxGekk/parquet-timestamp_micros. Authored-by: Maxim Gekk <max.g...@gmail.com> Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- docs/sql-migration-guide-upgrade.md | 2 ++ .../src/main/scala/org/apache/spark/sql/internal/SQLConf.scala | 2 +- .../datasources/parquet/ParquetInteroperabilitySuite.scala | 8 ++++++-- .../test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala | 2 +- 4 files changed, 10 insertions(+), 4 deletions(-) diff --git a/docs/sql-migration-guide-upgrade.md b/docs/sql-migration-guide-upgrade.md index 90a7d8d..54512ae 100644 --- a/docs/sql-migration-guide-upgrade.md +++ b/docs/sql-migration-guide-upgrade.md @@ -124,6 +124,8 @@ license: | - In Spark version 2.4, when a spark session is created via `cloneSession()`, the newly created spark session inherits its configuration from its parent `SparkContext` even though the same configuration may exist with a different value in its parent spark session. Since Spark 3.0, the configurations of a parent `SparkSession` have a higher precedence over the parent `SparkContext`. 
+ - Since Spark 3.0, parquet logical type `TIMESTAMP_MICROS` is used by default while saving `TIMESTAMP` columns. In Spark version 2.4 and earlier, `TIMESTAMP` columns are saved as `INT96` in parquet files. Setting `spark.sql.parquet.outputTimestampType` to `INT96` restores the previous behavior. + ## Upgrading from Spark SQL 2.4 to 2.4.1 - The value of `spark.executor.heartbeatInterval`, when specified without units like "30" rather than "30s", was diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index b223a48..9ebd2c0 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -405,7 +405,7 @@ object SQLConf { .stringConf .transform(_.toUpperCase(Locale.ROOT)) .checkValues(ParquetOutputTimestampType.values.map(_.toString)) - .createWithDefault(ParquetOutputTimestampType.INT96.toString) + .createWithDefault(ParquetOutputTimestampType.TIMESTAMP_MICROS.toString) val PARQUET_INT64_AS_TIMESTAMP_MILLIS = buildConf("spark.sql.parquet.int64AsTimestampMillis") .doc(s"(Deprecated since Spark 2.3, please set ${PARQUET_OUTPUT_TIMESTAMP_TYPE.key}.) 
" + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala index f06e186..09793bd 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetInteroperabilitySuite.scala @@ -120,8 +120,12 @@ class ParquetInteroperabilitySuite extends ParquetCompatibilityTest with SharedS ).map { s => java.sql.Timestamp.valueOf(s) } import testImplicits._ // match the column names of the file from impala - val df = spark.createDataset(ts).toDF().repartition(1).withColumnRenamed("value", "ts") - df.write.parquet(tableDir.getAbsolutePath) + withSQLConf(SQLConf.PARQUET_OUTPUT_TIMESTAMP_TYPE.key -> + SQLConf.ParquetOutputTimestampType.INT96.toString) { + val df = spark.createDataset(ts).toDF().repartition(1) + .withColumnRenamed("value", "ts") + df.write.parquet(tableDir.getAbsolutePath) + } FileUtils.copyFile(new File(impalaPath), new File(tableDir, "part-00001.parq")) Seq(false, true).foreach { int96TimestampConversion => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala index 5ecb79b..829dea4 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala @@ -257,7 +257,7 @@ class SQLConfSuite extends QueryTest with SharedSQLContext { // check default value assert(spark.sessionState.conf.parquetOutputTimestampType == - SQLConf.ParquetOutputTimestampType.INT96) + SQLConf.ParquetOutputTimestampType.TIMESTAMP_MICROS) // PARQUET_INT64_AS_TIMESTAMP_MILLIS should be respected. 
spark.sessionState.conf.setConf(SQLConf.PARQUET_INT64_AS_TIMESTAMP_MILLIS, true) --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org