spark git commit: [SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown

2018-07-20 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 db1f3cc76 -> bd6bfacb2


[SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown

## What changes were proposed in this pull request?
We get a NPE when we have a filter on a partition column of the form `col in 
(x, null)`. This is due to the filter converter in HiveShim not handling 
`null`s correctly. This patch fixes this bug while still pushing down as much 
of the partition pruning predicates as possible, by filtering out `null`s from 
any `in` predicate. Since Hive only supports very simple partition pruning 
filters, this change should preserve correctness.

## How was this patch tested?
Unit tests, manual tests

Author: William Sheu 

Closes #21832 from PenguinToast/partition-pruning-npe.

(cherry picked from commit bbd6f0c25fe19dc6c946e63cac7b98d0f78b3463)
Signed-off-by: Xiao Li 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bd6bfacb
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bd6bfacb
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bd6bfacb

Branch: refs/heads/branch-2.3
Commit: bd6bfacb2ef473b858cc91e0692591deaea26118
Parents: db1f3cc
Author: William Sheu 
Authored: Fri Jul 20 19:59:28 2018 -0700
Committer: Xiao Li 
Committed: Fri Jul 20 20:00:17 2018 -0700

--
 .../apache/spark/sql/hive/client/HiveShim.scala  | 19 ++-
 .../spark/sql/hive/client/FiltersSuite.scala | 14 ++
 2 files changed, 32 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bd6bfacb/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 60fe31f..68acea7 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -599,6 +599,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
 object ExtractableLiteral {
   def unapply(expr: Expression): Option[String] = expr match {
+case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs.
 case Literal(value, _: IntegralType) => Some(value.toString)
 case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString))
 case _ => None
@@ -607,7 +608,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
 object ExtractableLiterals {
   def unapply(exprs: Seq[Expression]): Option[Seq[String]] = {
-val extractables = exprs.map(ExtractableLiteral.unapply)
+// SPARK-24879: The Hive metastore filter parser does not support "null", but we still want
+// to push down as many predicates as we can while still maintaining correctness.
+// In SQL, the `IN` expression evaluates as follows:
+//  > `1 in (2, NULL)` -> NULL
+//  > `1 in (1, NULL)` -> true
+//  > `1 in (2)` -> false
+// Since Hive metastore filters are NULL-intolerant binary operations joined only by
+// `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as
+// `1 in (2)`.
+// If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts
+// pushing down these predicates, then this optimization will become incorrect and need
+// to be changed.
+val extractables = exprs
+.filter {
+  case Literal(null, _) => false
+  case _ => true
+}.map(ExtractableLiteral.unapply)
 if (extractables.nonEmpty && extractables.forall(_.isDefined)) {
   Some(extractables.map(_.get))
 } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/bd6bfacb/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
--
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
index 1976569..2a4efd0 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
@@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest {
   (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil,
 """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""")
 
+  filterTest("SPARK-24879 null literals should be ignored for IN constructs",
+(a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil,
+"(intcol = 1)")

spark git commit: [SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown

2018-07-20 Thread lixiao
Repository: spark
Updated Branches:
  refs/heads/master 96f312076 -> bbd6f0c25


[SPARK-24879][SQL] Fix NPE in Hive partition pruning filter pushdown

## What changes were proposed in this pull request?
We get a NPE when we have a filter on a partition column of the form `col in 
(x, null)`. This is due to the filter converter in HiveShim not handling 
`null`s correctly. This patch fixes this bug while still pushing down as much 
of the partition pruning predicates as possible, by filtering out `null`s from 
any `in` predicate. Since Hive only supports very simple partition pruning 
filters, this change should preserve correctness.

## How was this patch tested?
Unit tests, manual tests

Author: William Sheu 

Closes #21832 from PenguinToast/partition-pruning-npe.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/bbd6f0c2
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/bbd6f0c2
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/bbd6f0c2

Branch: refs/heads/master
Commit: bbd6f0c25fe19dc6c946e63cac7b98d0f78b3463
Parents: 96f3120
Author: William Sheu 
Authored: Fri Jul 20 19:59:28 2018 -0700
Committer: Xiao Li 
Committed: Fri Jul 20 19:59:28 2018 -0700

--
 .../apache/spark/sql/hive/client/HiveShim.scala  | 19 ++-
 .../spark/sql/hive/client/FiltersSuite.scala | 14 ++
 2 files changed, 32 insertions(+), 1 deletion(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/bbd6f0c2/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
--
diff --git 
a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala 
b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
index 933384e..bc9d4cd 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/client/HiveShim.scala
@@ -598,6 +598,7 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
 object ExtractableLiteral {
   def unapply(expr: Expression): Option[String] = expr match {
+case Literal(null, _) => None // `null`s can be cast as other types; we want to avoid NPEs.
 case Literal(value, _: IntegralType) => Some(value.toString)
 case Literal(value, _: StringType) => Some(quoteStringLiteral(value.toString))
 case _ => None
@@ -606,7 +607,23 @@ private[client] class Shim_v0_13 extends Shim_v0_12 {
 
 object ExtractableLiterals {
   def unapply(exprs: Seq[Expression]): Option[Seq[String]] = {
-val extractables = exprs.map(ExtractableLiteral.unapply)
+// SPARK-24879: The Hive metastore filter parser does not support "null", but we still want
+// to push down as many predicates as we can while still maintaining correctness.
+// In SQL, the `IN` expression evaluates as follows:
+//  > `1 in (2, NULL)` -> NULL
+//  > `1 in (1, NULL)` -> true
+//  > `1 in (2)` -> false
+// Since Hive metastore filters are NULL-intolerant binary operations joined only by
+// `AND` and `OR`, we can treat `NULL` as `false` and thus rewrite `1 in (2, NULL)` as
+// `1 in (2)`.
+// If the Hive metastore begins supporting NULL-tolerant predicates and Spark starts
+// pushing down these predicates, then this optimization will become incorrect and need
+// to be changed.
+val extractables = exprs
+.filter {
+  case Literal(null, _) => false
+  case _ => true
+}.map(ExtractableLiteral.unapply)
 if (extractables.nonEmpty && extractables.forall(_.isDefined)) {
   Some(extractables.map(_.get))
 } else {

http://git-wip-us.apache.org/repos/asf/spark/blob/bbd6f0c2/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
--
diff --git 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
index 1976569..2a4efd0 100644
--- 
a/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
+++ 
b/sql/hive/src/test/scala/org/apache/spark/sql/hive/client/FiltersSuite.scala
@@ -72,6 +72,20 @@ class FiltersSuite extends SparkFunSuite with Logging with PlanTest {
   (Literal("p2\" and q=\"q2") === a("stringcol", StringType)) :: Nil,
 """stringcol = 'p1" and q="q1' and 'p2" and q="q2' = stringcol""")
 
+  filterTest("SPARK-24879 null literals should be ignored for IN constructs",
+(a("intcol", IntegerType) in (Literal(1), Literal(null))) :: Nil,
+"(intcol = 1)")
+
+  // Applying the predicate `x IN (NULL)`