spark git commit: [SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown
Repository: spark Updated Branches: refs/heads/branch-2.3 db1f3cc76 -> bd6bfacb2 [SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown ## What changes were proposed in this pull request? We get an NPE when we have a filter on a partition column of the form `col in (x, null)`. This is due to the filter converter in HiveShim not handling `null`s correctly. This patch fixes this bug while still pushing down as much of the partition pruning predicates as possible, by filtering out `null`s from any `in` predicate. Since Hive only supports very simple partition pruning filters, this change should preserve correctness. ## How was this patch tested? Unit tests, manual tests Author: William Sheu Closes #21832 from PenguinToast/partition-pruning-npe. (cherry picked from commit bbd6f0c25fe19dc6c946e63cac7b98d0f78b3463) Signed-off-by: Xiao Li Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bd6bfacb Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bd6bfacb Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bd6bfacb Branch: refs/heads/branch-2.3 Commit: bd6bfacb2ef473b858cc91e0692591deaea26118 Parents: db1f3cc Author: William Sheu Authored: Fri Jul 20 19:59:28 2018 -0700 Committer: Xiao Li Committed: Fri Jul 20 20:00:17 2018 -0700 -- .../apache/spark/sql/hive/client/HiveShim.scala | 19 ++- .../spark/sql/hive/client/FiltersSuite.scala | 14 ++ 2 files changed, 32 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bd6bfacb/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 60fe31f..68acea7 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -599,6 +599,7 @@ 
private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiteral { def unapply(expr: Expression): Option[String] = expr match { +case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs. case Literal(value, _: IntegralType) => Some(value.toString) case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString)) case _ => None @@ -607,7 +608,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiterals { def unapply(exprs: Seq[Expression]): Option[Seq[String]] = { -val extractables = exprs.map(ExtractableLiteral.unapply) +// SPARK-24879: The Hive metastore filter parser does not support "null", but we still want +// to push down as many predicates as we can while still maintaining correctness. +// In SQL, the `IN` expression evaluates as follows: +// > `1 in (2, NULL)` -> NULL +// > `1 in (1, NULL)` -> true +// > `1 in (2)` -> false +// Since Hive metastore filters are NULL-intolerant binary operations joined only by +// `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as +// `1 in (2)`. +// If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts +// pushing down these predicates, then this optimization will become incorrect and need +// to be changed. 
+val extractables = exprs +.filter { + case Literal(null, _) => false + case _ => true +}.map(ExtractableLiteral.unapply) if (extractables.nonEmpty && extractables.forall(_.isDefined)) { Some(extractables.map(_.get)) } else { http://git-wip-us.apache.org/repos/asf/spark/blob/bd6bfacb/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 1976569..2a4efd0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil, """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""") + filterTest("SPARK-24879 null literals should be ignored for IN constructs", +(a("intcol", IntegerType) in
spark git commit: [SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown
Repository: spark Updated Branches: refs/heads/master 96f312076 -> bbd6f0c25 [SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown ## What changes were proposed in this pull request? We get an NPE when we have a filter on a partition column of the form `col in (x, null)`. This is due to the filter converter in HiveShim not handling `null`s correctly. This patch fixes this bug while still pushing down as much of the partition pruning predicates as possible, by filtering out `null`s from any `in` predicate. Since Hive only supports very simple partition pruning filters, this change should preserve correctness. ## How was this patch tested? Unit tests, manual tests Author: William Sheu Closes #21832 from PenguinToast/partition-pruning-npe. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bbd6f0c2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bbd6f0c2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bbd6f0c2 Branch: refs/heads/master Commit: bbd6f0c25fe19dc6c946e63cac7b98d0f78b3463 Parents: 96f3120 Author: William Sheu Authored: Fri Jul 20 19:59:28 2018 -0700 Committer: Xiao Li Committed: Fri Jul 20 19:59:28 2018 -0700 -- .../apache/spark/sql/hive/client/HiveShim.scala | 19 ++- .../spark/sql/hive/client/FiltersSuite.scala | 14 ++ 2 files changed, 32 insertions(+), 1 deletion(-) -- http://git-wip-us.apache.org/repos/asf/spark/blob/bbd6f0c2/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala -- diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala index 933384e..bc9d4cd 100644 --- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala +++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala @@ -598,6 +598,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiteral { def unapply(expr: 
Expression): Option[String] = expr match { +case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs. case Literal(value, _: IntegralType) => Some(value.toString) case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString)) case _ => None @@ -606,7 +607,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 { object ExtractableLiterals { def unapply(exprs: Seq[Expression]): Option[Seq[String]] = { -val extractables = exprs.map(ExtractableLiteral.unapply) +// SPARK-24879: The Hive metastore filter parser does not support "null", but we still want +// to push down as many predicates as we can while still maintaining correctness. +// In SQL, the `IN` expression evaluates as follows: +// > `1 in (2, NULL)` -> NULL +// > `1 in (1, NULL)` -> true +// > `1 in (2)` -> false +// Since Hive metastore filters are NULL-intolerant binary operations joined only by +// `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as +// `1 in (2)`. +// If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts +// pushing down these predicates, then this optimization will become incorrect and need +// to be changed. 
+val extractables = exprs +.filter { + case Literal(null, _) => false + case _ => true +}.map(ExtractableLiteral.unapply) if (extractables.nonEmpty && extractables.forall(_.isDefined)) { Some(extractables.map(_.get)) } else { http://git-wip-us.apache.org/repos/asf/spark/blob/bbd6f0c2/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala -- diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala index 1976569..2a4efd0 100644 --- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala +++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala @@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest { (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil, """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""") + filterTest("SPARK-24879 null literals should be ignored for IN constructs", +(a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil, +"(intcol = 1)") + + // Applying the predicate `x IN (NULL)`