spark git commit: [SPARK-14962][SQL] Do not push down isnotnull/isnull on unsupported types in ORC
Repository: spark Updated Branches: refs/heads/branch-2.0 3f6a13c8a -> d7c755561 [SPARK-14962][SQL] Do not push down isnotnull/isnull on unsupported types in ORC ## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-14962 ORC filters were being pushed down for all types for both `IsNull` and `IsNotNull`. This is apparently OK because both `IsNull` and `IsNotNull` do not take a type as an argument (Hive 1.2.x) during building filters (`SearchArgument`) on the Spark side, but they do not filter correctly because stored statistics always produce `null` for unsupported types (e.g. `ArrayType`) on the ORC side. So, it is always `true` for `IsNull`, which ends up always `false` for `IsNotNull`. (Please see [RecordReaderImpl.java#L296-L318](https://github.com/apache/hive/blob/branch-1.2/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java#L296-L318) and [RecordReaderImpl.java#L359-L365](https://github.com/apache/hive/blob/branch-1.2/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java#L359-L365) in Hive 1.2) This appears to be prevented in Hive 1.3.x and later by forcing a type ([`PredicateLeaf.Type`](https://github.com/apache/hive/blob/e085b7e9bd059d91aaf013df0db4d71dca90ec6f/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java#L50-L56)) to be given when building a filter ([`SearchArgument`](https://github.com/apache/hive/blob/26b5c7b56a4f28ce3eabc0207566cce46b29b558/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java#L260)), but Hive 1.2.x does not seem to do this. This PR prevents ORC filter creation for `IsNull` and `IsNotNull` on unsupported types. `OrcFilters` resembles `ParquetFilters`. ## How was this patch tested? Unit tests in `OrcQuerySuite` and `OrcFilterSuite`, and `sbt scalastyle`. Author: Hyukjin Kwon Closes #12777 from HyukjinKwon/SPARK-14962. 
(cherry picked from commit fa928ff9a3c1de5d5aff9d14e6bc1bd03fcca087) Signed-off-by: Cheng Lian Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/d7c75556 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/d7c75556 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/d7c75556 Branch: refs/heads/branch-2.0 Commit: d7c755561270ee8ec1c44df2e10a8bcb4985c3de Parents: 3f6a13c Author: hyukjinkwon Authored: Sat May 7 01:46:45 2016 +0800 Committer: Cheng Lian Committed: Sat May 7 01:53:08 2016 +0800 -- .../apache/spark/sql/test/SQLTestUtils.scala| 2 +- .../apache/spark/sql/hive/orc/OrcFilters.scala | 63 .../apache/spark/sql/hive/orc/OrcRelation.scala | 19 ++--- .../spark/sql/hive/orc/OrcFilterSuite.scala | 75 .../spark/sql/hive/orc/OrcQuerySuite.scala | 14 .../spark/sql/hive/orc/OrcSourceSuite.scala | 9 ++- 6 files changed, 126 insertions(+), 56 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/d7c75556/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index ffb206a..6d2b95e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -213,7 +213,7 @@ private[sql] trait SQLTestUtils */ protected def stripSparkFilter(df: DataFrame): DataFrame = { val schema = df.schema -val withoutFilters = df.queryExecution.sparkPlan transform { +val withoutFilters = df.queryExecution.sparkPlan.transform { case FilterExec(_, child) => child } http://git-wip-us.apache.org/repos/asf/spark/blob/d7c75556/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala 
index c025c12..c463bc8 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.hive.orc -import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveVarchar} import org.apache.hadoop.hive.ql.io.sarg.{SearchArgument, SearchArgumentFactory} import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder -import org.apache.hadoop.hive.serde2.io.DateWritable import org.apache.spark.internal.Logging import
spark git commit: [SPARK-14962][SQL] Do not push down isnotnull/isnull on unsupported types in ORC
Repository: spark Updated Branches: refs/heads/master a03c5e68a -> fa928ff9a [SPARK-14962][SQL] Do not push down isnotnull/isnull on unsupported types in ORC ## What changes were proposed in this pull request? https://issues.apache.org/jira/browse/SPARK-14962 ORC filters were being pushed down for all types for both `IsNull` and `IsNotNull`. This is apparently OK because both `IsNull` and `IsNotNull` do not take a type as an argument (Hive 1.2.x) during building filters (`SearchArgument`) on the Spark side, but they do not filter correctly because stored statistics always produce `null` for unsupported types (e.g. `ArrayType`) on the ORC side. So, it is always `true` for `IsNull`, which ends up always `false` for `IsNotNull`. (Please see [RecordReaderImpl.java#L296-L318](https://github.com/apache/hive/blob/branch-1.2/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java#L296-L318) and [RecordReaderImpl.java#L359-L365](https://github.com/apache/hive/blob/branch-1.2/ql/src/java/org/apache/hadoop/hive/ql/io/orc/RecordReaderImpl.java#L359-L365) in Hive 1.2) This appears to be prevented in Hive 1.3.x and later by forcing a type ([`PredicateLeaf.Type`](https://github.com/apache/hive/blob/e085b7e9bd059d91aaf013df0db4d71dca90ec6f/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/PredicateLeaf.java#L50-L56)) to be given when building a filter ([`SearchArgument`](https://github.com/apache/hive/blob/26b5c7b56a4f28ce3eabc0207566cce46b29b558/storage-api/src/java/org/apache/hadoop/hive/ql/io/sarg/SearchArgument.java#L260)), but Hive 1.2.x does not seem to do this. This PR prevents ORC filter creation for `IsNull` and `IsNotNull` on unsupported types. `OrcFilters` resembles `ParquetFilters`. ## How was this patch tested? Unit tests in `OrcQuerySuite` and `OrcFilterSuite`, and `sbt scalastyle`. Author: Hyukjin Kwon Closes #12777 from HyukjinKwon/SPARK-14962. 
Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/fa928ff9 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/fa928ff9 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/fa928ff9 Branch: refs/heads/master Commit: fa928ff9a3c1de5d5aff9d14e6bc1bd03fcca087 Parents: a03c5e6 Author: hyukjinkwon Authored: Sat May 7 01:46:45 2016 +0800 Committer: Cheng Lian Committed: Sat May 7 01:46:45 2016 +0800 -- .../apache/spark/sql/test/SQLTestUtils.scala| 2 +- .../apache/spark/sql/hive/orc/OrcFilters.scala | 63 .../apache/spark/sql/hive/orc/OrcRelation.scala | 19 ++--- .../spark/sql/hive/orc/OrcFilterSuite.scala | 75 .../spark/sql/hive/orc/OrcQuerySuite.scala | 14 .../spark/sql/hive/orc/OrcSourceSuite.scala | 9 ++- 6 files changed, 126 insertions(+), 56 deletions(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/fa928ff9/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala -- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala index ffb206a..6d2b95e 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/test/SQLTestUtils.scala @@ -213,7 +213,7 @@ private[sql] trait SQLTestUtils */ protected def stripSparkFilter(df: DataFrame): DataFrame = { val schema = df.schema -val withoutFilters = df.queryExecution.sparkPlan transform { +val withoutFilters = df.queryExecution.sparkPlan.transform { case FilterExec(_, child) => child } http://git-wip-us.apache.org/repos/asf/spark/blob/fa928ff9/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala index c025c12..c463bc8 100644 --- 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/orc/OrcFilters.scala @@ -17,13 +17,12 @@ package org.apache.spark.sql.hive.orc -import org.apache.hadoop.hive.common.`type`.{HiveChar, HiveDecimal, HiveVarchar} import org.apache.hadoop.hive.ql.io.sarg.{SearchArgument, SearchArgumentFactory} import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder -import org.apache.hadoop.hive.serde2.io.DateWritable import org.apache.spark.internal.Logging import org.apache.spark.sql.sources._ +import org.apache.spark.sql.types._ /** * Helper object for building ORC `SearchArgument`s,