This is an automated email from the ASF dual-hosted git repository.

yumwang pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new a7683af  [SPARK-26346][BUILD][SQL] Upgrade Parquet to 1.11.1
a7683af is described below

commit a7683afdf498c3ee09466dfa9635edacb5cc8f0c
Author: Yuming Wang <yumw...@ebay.com>
AuthorDate: Fri Jan 29 08:07:49 2021 +0800

    [SPARK-26346][BUILD][SQL] Upgrade Parquet to 1.11.1

    ### What changes were proposed in this pull request?

    This PR upgrades Parquet to 1.11.1.

    New features in Parquet 1.11.1:
    - [PARQUET-1201](https://issues.apache.org/jira/browse/PARQUET-1201) - Column indexes
    - [PARQUET-1253](https://issues.apache.org/jira/browse/PARQUET-1253) - Support for new logical type representation
    - [PARQUET-1388](https://issues.apache.org/jira/browse/PARQUET-1388) - Nanosecond precision time and timestamp - parquet-mr

    More details: https://github.com/apache/parquet-mr/blob/apache-parquet-1.11.1/CHANGES.md

    ### Why are the changes needed?

    Support column indexes to improve query performance.

    ### Does this PR introduce any user-facing change?

    No.

    ### How was this patch tested?

    Existing tests.

    Closes #26804 from wangyum/SPARK-26346.

    Authored-by: Yuming Wang <yumw...@ebay.com>
    Signed-off-by: Yuming Wang <yumw...@ebay.com>
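Of the features listed, column indexes (PARQUET-1201) are the headline: parquet-mr consults them transparently during filtered reads, so Spark gains the benefit without any new API. Below is a minimal sketch of where they should pay off, assuming a Spark build with this upgrade applied; the object name, output path, and row counts are illustrative, not part of this commit:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical demo object, not code from this commit.
object ColumnIndexSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("column-index-sketch")
      .getOrCreate()
    import spark.implicits._

    val path = "/tmp/column-index-sketch" // illustrative output path

    // Writing the data sorted on the filter column keeps per-page min/max
    // ranges narrow, which is what lets the column index prune whole pages.
    spark.range(0, 10000000).toDF("id")
      .sort("id")
      .write.mode("overwrite").parquet(path)

    // With parquet-mr 1.11.x and filter pushdown enabled (Spark's default),
    // a selective predicate can skip pages whose [min, max] range cannot
    // match, instead of decompressing and decoding every page.
    val hits = spark.read.parquet(path).where($"id" === 4999999L).count()
    println(s"matched $hits row(s)")

    spark.stop()
  }
}
```

Against Parquet 1.10.1 the same point query has to decode every page of the matching row group, so selective predicates over data sorted on the filter column are where this upgrade should be most visible.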
---
 dev/deps/spark-deps-hadoop-2.7-hive-2.3            | 12 ++++++------
 dev/deps/spark-deps-hadoop-3.2-hive-2.3            | 12 ++++++------
 pom.xml                                            |  6 +++++-
 .../datasources/parquet/ParquetSchemaSuite.scala   | 22 +++++++++++-----------
 .../apache/spark/sql/streaming/StreamSuite.scala   |  4 +++-
 .../apache/spark/sql/hive/StatisticsSuite.scala    |  2 +-
 6 files changed, 32 insertions(+), 26 deletions(-)

diff --git a/dev/deps/spark-deps-hadoop-2.7-hive-2.3 b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
index 2c468b8..179ab36 100644
--- a/dev/deps/spark-deps-hadoop-2.7-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-2.7-hive-2.3
@@ -202,12 +202,12 @@ orc-shims/1.6.7//orc-shims-1.6.7.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar
-parquet-column/1.10.1//parquet-column-1.10.1.jar
-parquet-common/1.10.1//parquet-common-1.10.1.jar
-parquet-encoding/1.10.1//parquet-encoding-1.10.1.jar
-parquet-format/2.4.0//parquet-format-2.4.0.jar
-parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar
-parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar
+parquet-column/1.11.1//parquet-column-1.11.1.jar
+parquet-common/1.11.1//parquet-common-1.11.1.jar
+parquet-encoding/1.11.1//parquet-encoding-1.11.1.jar
+parquet-format-structures/1.11.1//parquet-format-structures-1.11.1.jar
+parquet-hadoop/1.11.1//parquet-hadoop-1.11.1.jar
+parquet-jackson/1.11.1//parquet-jackson-1.11.1.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
 py4j/0.10.9.1//py4j-0.10.9.1.jar
 pyrolite/4.30//pyrolite-4.30.jar
diff --git a/dev/deps/spark-deps-hadoop-3.2-hive-2.3 b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
index 894fd6a..83c32c4 100644
--- a/dev/deps/spark-deps-hadoop-3.2-hive-2.3
+++ b/dev/deps/spark-deps-hadoop-3.2-hive-2.3
@@ -172,12 +172,12 @@ orc-shims/1.6.7//orc-shims-1.6.7.jar
 oro/2.0.8//oro-2.0.8.jar
 osgi-resource-locator/1.0.3//osgi-resource-locator-1.0.3.jar
 paranamer/2.8//paranamer-2.8.jar
-parquet-column/1.10.1//parquet-column-1.10.1.jar
-parquet-common/1.10.1//parquet-common-1.10.1.jar
-parquet-encoding/1.10.1//parquet-encoding-1.10.1.jar
-parquet-format/2.4.0//parquet-format-2.4.0.jar
-parquet-hadoop/1.10.1//parquet-hadoop-1.10.1.jar
-parquet-jackson/1.10.1//parquet-jackson-1.10.1.jar
+parquet-column/1.11.1//parquet-column-1.11.1.jar
+parquet-common/1.11.1//parquet-common-1.11.1.jar
+parquet-encoding/1.11.1//parquet-encoding-1.11.1.jar
+parquet-format-structures/1.11.1//parquet-format-structures-1.11.1.jar
+parquet-hadoop/1.11.1//parquet-hadoop-1.11.1.jar
+parquet-jackson/1.11.1//parquet-jackson-1.11.1.jar
 protobuf-java/2.5.0//protobuf-java-2.5.0.jar
 py4j/0.10.9.1//py4j-0.10.9.1.jar
 pyrolite/4.30//pyrolite-4.30.jar
diff --git a/pom.xml b/pom.xml
index 84ec92e..05a2e04 100644
--- a/pom.xml
+++ b/pom.xml
@@ -136,7 +136,7 @@
     <kafka.version>2.6.0</kafka.version>
     <!-- After 10.15.1.3, the minimum required version is JDK9 -->
     <derby.version>10.14.2.0</derby.version>
-    <parquet.version>1.10.1</parquet.version>
+    <parquet.version>1.11.1</parquet.version>
     <orc.version>1.6.7</orc.version>
     <jetty.version>9.4.34.v20201102</jetty.version>
     <jakartaservlet.version>4.0.3</jakartaservlet.version>
@@ -2290,6 +2290,10 @@
           <groupId>commons-pool</groupId>
           <artifactId>commons-pool</artifactId>
         </exclusion>
+        <exclusion>
+          <groupId>javax.annotation</groupId>
+          <artifactId>javax.annotation-api</artifactId>
+        </exclusion>
       </exclusions>
     </dependency>
     <dependency>
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
index e97c6cd..fcc08ee 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetSchemaSuite.scala
@@ -251,7 +251,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
      |  optional group _1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      optional binary value (UTF8);
       |    }
@@ -267,7 +267,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required group key {
       |        optional binary _1 (UTF8);
       |        optional binary _2 (UTF8);
@@ -300,7 +300,7 @@ class ParquetSchemaInferenceSuite extends ParquetSchemaTest {
     """
       |message root {
       |  optional group _1 (MAP_KEY_VALUE) {
-      |    repeated group map {
+      |    repeated group key_value {
       |      required int32 key;
       |      optional group value {
       |        optional binary _1 (UTF8);
@@ -740,7 +740,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP_KEY_VALUE) {
-      |    repeated group map {
+      |    repeated group key_value {
       |      required int32 num;
       |      required binary str (UTF8);
       |    }
@@ -759,7 +759,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      required binary value (UTF8);
       |    }
@@ -797,7 +797,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP_KEY_VALUE) {
-      |    repeated group map {
+      |    repeated group key_value {
       |      required int32 num;
       |      optional binary str (UTF8);
       |    }
@@ -816,7 +816,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      optional binary value (UTF8);
       |    }
@@ -857,7 +857,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      required binary value (UTF8);
       |    }
@@ -893,7 +893,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
         nullable = true))),
     """message root {
       |  optional group f1 (MAP) {
-      |    repeated group map (MAP_KEY_VALUE) {
+      |    repeated group key_value (MAP_KEY_VALUE) {
       |      required int32 key;
       |      optional binary value (UTF8);
       |    }
@@ -1447,7 +1447,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     parquetSchema =
       """message root {
         |  required group f0 (MAP) {
-        |    repeated group map (MAP_KEY_VALUE) {
+        |    repeated group key_value (MAP_KEY_VALUE) {
         |      required int32 key;
         |      required group value {
         |        required int32 value_f0;
@@ -1472,7 +1472,7 @@ class ParquetSchemaSuite extends ParquetSchemaTest {
     expectedSchema =
      """message root {
        |  required group f0 (MAP) {
-        |    repeated group map (MAP_KEY_VALUE) {
+        |    repeated group key_value (MAP_KEY_VALUE) {
         |      required int32 key;
         |      required group value {
         |        required int64 value_f1;
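All of the ParquetSchemaSuite churn above is a single rename: parquet-mr 1.11 names the repeated inner group of a map written in the legacy (non-standard) layout `key_value`, where 1.10 emitted `map`, so the expected schema strings follow suit. A hedged sketch of how to see the new shape for yourself; the object name and path are illustrative, and `writeLegacyFormat` mirrors the layout these tests exercise:

```scala
import java.io.File

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.parquet.hadoop.ParquetFileReader
import org.apache.parquet.hadoop.util.HadoopInputFile
import org.apache.spark.sql.SparkSession

// Hypothetical demo object, not code from this commit.
object MapSchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("map-schema-sketch")
      .getOrCreate()

    // Mirror the legacy map layout the changed test cases cover.
    spark.conf.set("spark.sql.parquet.writeLegacyFormat", "true")

    val out = "/tmp/map-schema-sketch" // illustrative output path
    spark.range(1)
      .selectExpr("map(id, cast(id as string)) as m")
      .write.mode("overwrite").parquet(out)

    // Inspect one part file's footer with parquet-hadoop's own reader.
    val part = new File(out).listFiles().filter(_.getName.endsWith(".parquet")).head
    val reader = ParquetFileReader.open(
      HadoopInputFile.fromPath(new Path(part.getAbsolutePath), new Configuration()))
    try {
      // Against parquet-mr 1.11.x this prints "repeated group key_value";
      // per the old expected strings above, 1.10.x named the group "map".
      println(reader.getFooter.getFileMetaData.getSchema)
    } finally {
      reader.close()
    }
    spark.stop()
  }
}
```

The printed `MessageType` should match the new expected strings above, with `key_value` as the repeated group name.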
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
index 440fe99..c4e43d2 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/streaming/StreamSuite.scala
@@ -214,7 +214,9 @@ class StreamSuite extends StreamTest {
       .start(outputDir.getAbsolutePath)
     try {
       query.processAllAvailable()
-      val outputDf = spark.read.parquet(outputDir.getAbsolutePath).as[Long]
+      // Parquet now writes page-level CRC checksums, which change the file size and
+      // affect the data order when reading these files back. See PARQUET-1746 for details.
+      val outputDf = spark.read.parquet(outputDir.getAbsolutePath).sort('a).as[Long]
       checkDataset[Long](outputDf, (0L to 10L).toArray: _*)
     } finally {
       query.stop()
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
index 5357f4b..c91ee92 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/StatisticsSuite.scala
@@ -1528,7 +1528,7 @@ class StatisticsSuite extends StatisticsCollectionTestBase with TestHiveSingleto
     Seq(tbl, ext_tbl).foreach { tblName =>
       sql(s"INSERT INTO $tblName VALUES (1, 'a', '2019-12-13')")

-      val expectedSize = 601
+      val expectedSize = 651
       // analyze table
       sql(s"ANALYZE TABLE $tblName COMPUTE STATISTICS NOSCAN")
       var tableStats = getTableStats(tblName)
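The last two test changes are side effects of 1.11 writing different bytes rather than different data: StreamSuite now sorts before comparing because page-level CRC checksums change file sizes, which can change the order in which Spark reads multi-file output (PARQUET-1746), and StatisticsSuite's expected NOSCAN size grows from 601 to 651, consistent with the slightly larger files. A sketch of that moving part, assuming parquet-hadoop's `parquet.page.write-checksum.enabled` property (its knob for page CRCs) and an illustrative path:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical demo object, not code from this commit.
object PageChecksumSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[*]")
      .appName("page-checksum-sketch")
      .getOrCreate()
    import spark.implicits._

    // parquet-hadoop's switch for page-level CRC checksums; flipping it
    // changes the bytes on disk, not the logical data.
    spark.sparkContext.hadoopConfiguration
      .setBoolean("parquet.page.write-checksum.enabled", true)

    val path = "/tmp/page-checksum-sketch" // illustrative output path
    spark.range(0, 11).toDF("a").repartition(3)
      .write.mode("overwrite").parquet(path)

    // Multi-file output carries no cross-file ordering guarantee, and the
    // extra checksum bytes can change which file is read first; sorting
    // before comparing keeps the assertion stable, as the test now does.
    val rows = spark.read.parquet(path).sort("a").as[Long].collect()
    assert(rows.toSeq == (0L to 10L))

    spark.stop()
  }
}
```

Sorting on the output column, exactly as the updated StreamSuite test does, makes the assertion independent of whichever part file happens to be read first.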