Repository: spark
Updated Branches:
  refs/heads/branch-2.3 2f60df09d -> a8ee5706a
[SPARK-23852][SQL] Upgrade to Parquet 1.8.3

## What changes were proposed in this pull request?

Upgrade Parquet dependency to 1.8.3 to avoid PARQUET-1217

## How was this patch tested?

Ran the included new test case.

Author: Henry Robinson <he...@apache.org>

Closes #21302 from henryr/branch-2.3.

Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/a8ee5706
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/a8ee5706
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/a8ee5706

Branch: refs/heads/branch-2.3
Commit: a8ee5706ad96be3d6501471d05f7c3d61d3ca38e
Parents: 2f60df0
Author: Henry Robinson <he...@apache.org>
Authored: Mon May 14 14:05:32 2018 -0700
Committer: Marcelo Vanzin <van...@cloudera.com>
Committed: Mon May 14 14:05:32 2018 -0700

----------------------------------------------------------------------
 dev/deps/spark-deps-hadoop-2.6                    | 10 +++++-----
 dev/deps/spark-deps-hadoop-2.7                    | 10 +++++-----
 pom.xml                                           | 2 +-
 .../test/resources/test-data/parquet-1217.parquet | Bin 0 -> 321 bytes
 .../datasources/parquet/ParquetFilterSuite.scala  | 10 ++++++++++
 5 files changed, 21 insertions(+), 11 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/spark/blob/a8ee5706/dev/deps/spark-deps-hadoop-2.6
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.6 b/dev/deps/spark-deps-hadoop-2.6
index 577bf43..f4559a8 100644
--- a/dev/deps/spark-deps-hadoop-2.6
+++ b/dev/deps/spark-deps-hadoop-2.6
@@ -161,13 +161,13 @@ orc-mapreduce-1.4.3-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.8.2.jar
-parquet-common-1.8.2.jar
-parquet-encoding-1.8.2.jar
+parquet-column-1.8.3.jar
+parquet-common-1.8.3.jar
+parquet-encoding-1.8.3.jar
 parquet-format-2.3.1.jar
-parquet-hadoop-1.8.2.jar
+parquet-hadoop-1.8.3.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.2.jar
+parquet-jackson-1.8.3.jar
 protobuf-java-2.5.0.jar
 py4j-0.10.7.jar
 pyrolite-4.13.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/a8ee5706/dev/deps/spark-deps-hadoop-2.7
----------------------------------------------------------------------
diff --git a/dev/deps/spark-deps-hadoop-2.7 b/dev/deps/spark-deps-hadoop-2.7
index 304982e..c2df998 100644
--- a/dev/deps/spark-deps-hadoop-2.7
+++ b/dev/deps/spark-deps-hadoop-2.7
@@ -162,13 +162,13 @@ orc-mapreduce-1.4.3-nohive.jar
 oro-2.0.8.jar
 osgi-resource-locator-1.0.1.jar
 paranamer-2.8.jar
-parquet-column-1.8.2.jar
-parquet-common-1.8.2.jar
-parquet-encoding-1.8.2.jar
+parquet-column-1.8.3.jar
+parquet-common-1.8.3.jar
+parquet-encoding-1.8.3.jar
 parquet-format-2.3.1.jar
-parquet-hadoop-1.8.2.jar
+parquet-hadoop-1.8.3.jar
 parquet-hadoop-bundle-1.6.0.jar
-parquet-jackson-1.8.2.jar
+parquet-jackson-1.8.3.jar
 protobuf-java-2.5.0.jar
 py4j-0.10.7.jar
 pyrolite-4.13.jar

http://git-wip-us.apache.org/repos/asf/spark/blob/a8ee5706/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 9c2d931..533c6b4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -129,7 +129,7 @@
     <!-- Version used for internal directory structure -->
     <hive.version.short>1.2.1</hive.version.short>
     <derby.version>10.12.1.1</derby.version>
-    <parquet.version>1.8.2</parquet.version>
+    <parquet.version>1.8.3</parquet.version>
     <orc.version>1.4.3</orc.version>
     <orc.classifier>nohive</orc.classifier>
     <hive.parquet.version>1.6.0</hive.parquet.version>

http://git-wip-us.apache.org/repos/asf/spark/blob/a8ee5706/sql/core/src/test/resources/test-data/parquet-1217.parquet
----------------------------------------------------------------------
diff --git a/sql/core/src/test/resources/test-data/parquet-1217.parquet b/sql/core/src/test/resources/test-data/parquet-1217.parquet
new file mode 100644
index 0000000..eb2dc4f
Binary files /dev/null and b/sql/core/src/test/resources/test-data/parquet-1217.parquet differ

http://git-wip-us.apache.org/repos/asf/spark/blob/a8ee5706/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
index 3380195..79891af 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFilterSuite.scala
@@ -602,6 +602,16 @@ class ParquetFilterSuite extends QueryTest with ParquetTest with SharedSQLContex
       }
     }
   }
+
+  test("SPARK-23852: Broken Parquet push-down for partially-written stats") {
+    // parquet-1217.parquet contains a single column with values -1, 0, 1, 2 and null.
+    // The row-group statistics include null counts, but not min and max values, which
+    // triggers PARQUET-1217.
+    val df = readResourceParquetFile("test-data/parquet-1217.parquet")
+
+    // Will return 0 rows if PARQUET-1217 is not fixed.
+    assert(df.where("col > 0").count() === 2)
+  }
 }
 
 class NumRowGroupsAcc extends AccumulatorV2[Integer, Integer] {

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org