Repository: spark
Updated Branches:
  refs/heads/branch-2.3 db1f3cc76 -> bd6bfacb2


[SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown

## What changes were proposed in this pull request?
We get a NPE when we have a filter on a partition column of the form `col in 
(x, null)`. This is due to the filter converter in HiveShim not handling 
`null`s correctly. This patch fixes this bug while still pushing down as much 
of the partition pruning predicates as possible, by filtering out `null`s from 
any `in` predicate. Since Hive only supports very simple partition pruning 
filters, this change should preserve correctness.

## How was this patch tested?
Unit tests, manual tests

Author: William Sheu <william.s...@databricks.com>

Closes #21832 from PenguinToast/partition-pruning-npe.

(cherry picked from commit bbd6f0c25fe19dc6c946e63cac7b98d0f78b3463)
Signed-off-by: Xiao Li <gatorsm...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bd6bfacb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bd6bfacb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bd6bfacb

Branch: refs/heads/branch-2.3
Commit: bd6bfacb2ef473b858cc91e0692591deaea26118
Parents: db1f3cc
Author: William Sheu <william.s...@databricks.com>
Authored: Fri Jul 20 19:59:28 2018 -0700
Committer: Xiao Li <gatorsm...@gmail.com>
Committed: Fri Jul 20 20:00:17 2018 -0700

----------------------------------------------------------------------
 .../apache/spark/sql/hive/client/HiveShim.scala  | 19 ++++++++++++++++++-
 .../spark/sql/hive/client/FiltersSuite.scala     | 14 ++++++++++++++
 2 files changed, 32 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/bd6bfacb/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 60fe31f..68acea7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -599,6 +599,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
     object ExtractableLiteral {
       def unapply(expr: Expression): Option[String] = expr match {
+        case Literal(null, _) => None // `null`s can be cast as other types; 
we want to avoid NPEs.
         case Literal(value, _: IntegralType) => Some(value.toString)
         case Literal(value, _: StringType) => 
Some(quoteStringLiteral(value.toString))
         case _ => None
@@ -607,7 +608,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
     object ExtractableLiterals {
       def unapply(exprs: Seq[Expression]): Option[Seq[String]] = {
-        val extractables = exprs.map(ExtractableLiteral.unapply)
+        // SPARK-24879: The Hive metastore filter parser does not support 
"null", but we still want
+        // to push down as many predicates as we can while still maintaining 
correctness.
+        // In SQL, the `IN` expression evaluates as follows:
+        //  > `1 in (2, NULL)` -> NULL
+        //  > `1 in (1, NULL)` -> true
+        //  > `1 in (2)` -> false
+        // Since Hive metastore filters are NULL-intolerant binary operations 
joined only by
+        // `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 
in (2, NULL)` as
+        // `1 in (2)`.
+        // If the Hive metastore begins supporting NULL-tolerant predicates 
and Spark starts
+        // pushing down these predicates, then this optimization will become 
incorrect and need
+        // to be changed.
+        val extractables = exprs
+            .filter {
+              case Literal(null, _) => false
+              case _ => true
+            }.map(ExtractableLiteral.unapply)
         if (extractables.nonEmpty && extractables.forall(_.isDefined)) {
           Some(extractables.map(_.get))
         } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/bd6bfacb/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
----------------------------------------------------------------------
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
index 1976569..2a4efd0 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
@@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with 
PlanTest {
       (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil,
     """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""")
 
+  filterTest("SPARK-24879 null literals should be ignored for IN constructs",
+    (a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil,
+    "(intcol = 1)")
+
+  // Applying the predicate `x IN (NULL)` should return an empty set, but 
since this optimization
+  // will be applied by Catalyst, this filter converter does not need to 
account for this.
+  filterTest("SPARK-24879 IN predicates with only NULLs will not cause a NPE",
+    (a("intcol", IntegerType) in Literal(null)) :: Nil,
+    "")
+
+  filterTest("typecast null literals should not be pushed down in simple 
predicates",
+    (a("intcol", IntegerType) === Literal(null, IntegerType)) :: Nil,
+    "")
+
   private def filterTest(name: String, filters: Seq[Expression], result: 
String) = {
     test(name) {
       withSQLConf(SQLConf.ADVANCED_PARTITION_PREDICATE_PUSHDOWN.key -> "true") 
{


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to