spark git commit: [SPARK-26211][SQL] Fix InSet for binary, and struct and array with null.

2018-11-29 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 96a5a127e -> e96ba8430


[SPARK-26211][SQL] Fix InSet for binary, and struct and array with null.

Currently `InSet` doesn't work properly for binary type, or for struct and array
types when the set contains a null value.
For binary type, the `HashSet` doesn't compare `Array[Byte]` values by content,
and for struct and array types with a null value in the set, the `ordering`
throws an `NPE`.
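
For context on the binary case, here is a minimal standalone Scala sketch (not part of
the patch) of why a plain `HashSet` cannot match `Array[Byte]` values by content:

```scala
// Standalone illustration (not part of the patch): on the JVM, Array[Byte]
// uses reference equality and identity hashCode, so a HashSet can only find
// the exact same array instance, never a structurally equal copy.
val a = Array[Byte](1, 2, 3)
val b = Array[Byte](1, 2, 3)
val hset = Set[Any](a)
println(hset.contains(a))                    // true  (same instance)
println(hset.contains(b))                    // false (equal bytes, different instance)
// Wrapping the bytes in a Seq restores structural equality:
println(Set[Any](a.toSeq).contains(b.toSeq)) // true
```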

Added a few tests.

Closes #23176 from ueshin/issues/SPARK-26211/inset.

Authored-by: Takuya UESHIN 
Signed-off-by: Wenchen Fan 
(cherry picked from commit b9b68a6dc7d0f735163e980392ea957f2d589923)
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/e96ba843
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/e96ba843
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/e96ba843

Branch: refs/heads/branch-2.3
Commit: e96ba8430c693c3bcc9f6797a4779c8e9fadaaba
Parents: 96a5a127
Author: Takuya UESHIN 
Authored: Thu Nov 29 22:37:02 2018 +0800
Committer: Wenchen Fan 
Committed: Thu Nov 29 22:41:51 2018 +0800

--
 .../sql/catalyst/expressions/predicates.scala   | 31 +---
 .../catalyst/expressions/PredicateSuite.scala   | 50 +++-
 2 files changed, 62 insertions(+), 19 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/e96ba843/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index a29b136..8c42eee 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -337,31 +337,26 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
   }
 
   @transient lazy val set: Set[Any] = child.dataType match {
-case _: AtomicType => hset
+case t: AtomicType if !t.isInstanceOf[BinaryType] => hset
 case _: NullType => hset
 case _ =>
  // for structs use interpreted ordering to be able to compare UnsafeRows with non-UnsafeRows
-  TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ hset
+  TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ (hset - null)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-val setTerm = ctx.addReferenceObj("set", set)
-val childGen = child.genCode(ctx)
-val setIsNull = if (hasNull) {
-  s"${ev.isNull} = !${ev.value};"
-} else {
-  ""
-}
-ev.copy(code =
+nullSafeCodeGen(ctx, ev, c => {
+  val setTerm = ctx.addReferenceObj("set", set)
+  val setIsNull = if (hasNull) {
+s"${ev.isNull} = !${ev.value};"
+  } else {
+""
+  }
   s"""
- |${childGen.code}
- |${ctx.JAVA_BOOLEAN} ${ev.isNull} = ${childGen.isNull};
- |${ctx.JAVA_BOOLEAN} ${ev.value} = false;
- |if (!${ev.isNull}) {
- |  ${ev.value} = $setTerm.contains(${childGen.value});
- |  $setIsNull
- |}
-   """.stripMargin)
+ |${ev.value} = $setTerm.contains($c);
+ |$setIsNull
+   """.stripMargin
+})
   }
 
   override def sql: String = {

http://git-wip-us.apache.org/repos/asf/spark/blob/e96ba843/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
--
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index 1bfd180..861fddc 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -268,7 +268,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 checkEvaluation(InSet(nl, nS), null)
 
 val primitiveTypes = Seq(IntegerType, FloatType, DoubleType, StringType, ByteType, ShortType,
-  LongType, BinaryType, BooleanType, DecimalType.USER_DEFAULT, TimestampType)
+  LongType, BooleanType, DecimalType.USER_DEFAULT, TimestampType)
 primitiveTypes.foreach { t =>
   val dataGen = RandomDataGenerator.forType(t, nullable = true).get
   val inputData = Seq.fill(10) {
@@ -293,6 +293,54 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 }
   }
 
+  test("INSET: binary") {
+  

spark git commit: [SPARK-26211][SQL] Fix InSet for binary, and struct and array with null.

2018-11-29 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/branch-2.4 99a9107c9 -> 7200915fa


[SPARK-26211][SQL] Fix InSet for binary, and struct and array with null.

## What changes were proposed in this pull request?

Currently `InSet` doesn't work properly for binary type, or for struct and array
types when the set contains a null value.
For binary type, the `HashSet` doesn't compare `Array[Byte]` values by content,
and for struct and array types with a null value in the set, the `ordering`
throws an `NPE`.
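
As context for the struct/array case, a minimal Scala sketch (not from the patch; the
ordering below is a simplified stand-in for Catalyst's interpreted ordering) of why a
null element breaks an ordered set, and how excluding it avoids the `NPE`:

```scala
import scala.collection.immutable.TreeSet

// Simplified stand-in for Catalyst's interpreted ordering; compares
// sequence values by their first element.
val ord: Ordering[Any] = Ordering.by((x: Any) => x.asInstanceOf[Seq[Int]].head)

val hset: Set[Any] = Set(Seq(1), Seq(2), null)
// TreeSet.empty(ord) ++ hset                 // would throw NullPointerException
val set = TreeSet.empty(ord) ++ (hset - null) // exclude null, as the fix does
println(set.contains(Seq(2)))                 // true; the null case is tracked separately
```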

## How was this patch tested?

Added a few tests.

Closes #23176 from ueshin/issues/SPARK-26211/inset.

Authored-by: Takuya UESHIN 
Signed-off-by: Wenchen Fan 
(cherry picked from commit b9b68a6dc7d0f735163e980392ea957f2d589923)
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7200915f
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7200915f
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7200915f

Branch: refs/heads/branch-2.4
Commit: 7200915fa9cd66fc2259369ce0fe115c2edff1f3
Parents: 99a9107
Author: Takuya UESHIN 
Authored: Thu Nov 29 22:37:02 2018 +0800
Committer: Wenchen Fan 
Committed: Thu Nov 29 22:38:18 2018 +0800

--
 .../sql/catalyst/expressions/predicates.scala   | 33 ++---
 .../catalyst/expressions/PredicateSuite.scala   | 50 +++-
 2 files changed, 63 insertions(+), 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/7200915f/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 7f21a62..eedfbc2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -369,31 +369,26 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
   }
 
   @transient lazy val set: Set[Any] = child.dataType match {
-case _: AtomicType => hset
+case t: AtomicType if !t.isInstanceOf[BinaryType] => hset
 case _: NullType => hset
 case _ =>
  // for structs use interpreted ordering to be able to compare UnsafeRows with non-UnsafeRows
-  TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ hset
+  TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ (hset - null)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-val setTerm = ctx.addReferenceObj("set", set)
-val childGen = child.genCode(ctx)
-val setIsNull = if (hasNull) {
-  s"${ev.isNull} = !${ev.value};"
-} else {
-  ""
-}
-ev.copy(code =
-  code"""
- |${childGen.code}
- |${CodeGenerator.JAVA_BOOLEAN} ${ev.isNull} = ${childGen.isNull};
- |${CodeGenerator.JAVA_BOOLEAN} ${ev.value} = false;
- |if (!${ev.isNull}) {
- |  ${ev.value} = $setTerm.contains(${childGen.value});
- |  $setIsNull
- |}
-   """.stripMargin)
+nullSafeCodeGen(ctx, ev, c => {
+  val setTerm = ctx.addReferenceObj("set", set)
+  val setIsNull = if (hasNull) {
+s"${ev.isNull} = !${ev.value};"
+  } else {
+""
+  }
+  s"""
+ |${ev.value} = $setTerm.contains($c);
+ |$setIsNull
+   """.stripMargin
+})
   }
 
   override def sql: String = {

http://git-wip-us.apache.org/repos/asf/spark/blob/7200915f/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
--
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index ac76b17..3b60d1d8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -268,7 +268,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 checkEvaluation(InSet(nl, nS), null)
 
 val primitiveTypes = Seq(IntegerType, FloatType, DoubleType, StringType, ByteType, ShortType,
-  LongType, BinaryType, BooleanType, DecimalType.USER_DEFAULT, TimestampType)
+  LongType, BooleanType, DecimalType.USER_DEFAULT, TimestampType)
 primitiveTypes.foreach { t =>
   val dataGen = RandomDataGenerator.forType(t, nullable = true).get
   val inputData = Seq.fill(10) {
@@ -293,6 

spark git commit: [SPARK-26211][SQL] Fix InSet for binary, and struct and array with null.

2018-11-29 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/master 7a83d7140 -> b9b68a6dc


[SPARK-26211][SQL] Fix InSet for binary, and struct and array with null.

## What changes were proposed in this pull request?

Currently `InSet` doesn't work properly for binary type, or for struct and array
types when the set contains a null value.
For binary type, the `HashSet` doesn't compare `Array[Byte]` values by content,
and for struct and array types with a null value in the set, the `ordering`
throws an `NPE`.
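
The generated `setIsNull` branch in the patch keeps null out of the set and reports it
through the result's nullability instead. A rough interpreted-mode sketch of that
three-valued IN semantics (hypothetical helper, not Spark API) is:

```scala
// Hypothetical helper (not Spark code): a rough interpreted-mode version of
// the semantics the generated setIsNull branch implements. None stands for SQL NULL.
def inSet(value: Any, set: Set[Any], setHasNull: Boolean): Option[Boolean] =
  if (value == null) None                  // NULL IN (...)            -> NULL
  else if (set.contains(value)) Some(true) // value found              -> TRUE
  else if (setHasNull) None                // not found, set has NULL  -> NULL
  else Some(false)                         // not found, no NULL       -> FALSE

println(inSet(1, Set[Any](1, 2), setHasNull = false)) // Some(true)
println(inSet(3, Set[Any](1, 2), setHasNull = true))  // None
println(inSet(3, Set[Any](1, 2), setHasNull = false)) // Some(false)
```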

## How was this patch tested?

Added a few tests.

Closes #23176 from ueshin/issues/SPARK-26211/inset.

Authored-by: Takuya UESHIN 
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b9b68a6d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b9b68a6d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b9b68a6d

Branch: refs/heads/master
Commit: b9b68a6dc7d0f735163e980392ea957f2d589923
Parents: 7a83d71
Author: Takuya UESHIN 
Authored: Thu Nov 29 22:37:02 2018 +0800
Committer: Wenchen Fan 
Committed: Thu Nov 29 22:37:02 2018 +0800

--
 .../sql/catalyst/expressions/predicates.scala   | 33 ++---
 .../catalyst/expressions/PredicateSuite.scala   | 50 +++-
 2 files changed, 63 insertions(+), 20 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/b9b68a6d/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
--
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index 16e0bc3..01ecb99 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -367,31 +367,26 @@ case class InSet(child: Expression, hset: Set[Any]) extends UnaryExpression with
   }
 
   @transient lazy val set: Set[Any] = child.dataType match {
-case _: AtomicType => hset
+case t: AtomicType if !t.isInstanceOf[BinaryType] => hset
 case _: NullType => hset
 case _ =>
  // for structs use interpreted ordering to be able to compare UnsafeRows with non-UnsafeRows
-  TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ hset
+  TreeSet.empty(TypeUtils.getInterpretedOrdering(child.dataType)) ++ (hset - null)
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
-val setTerm = ctx.addReferenceObj("set", set)
-val childGen = child.genCode(ctx)
-val setIsNull = if (hasNull) {
-  s"${ev.isNull} = !${ev.value};"
-} else {
-  ""
-}
-ev.copy(code =
-  code"""
- |${childGen.code}
- |${CodeGenerator.JAVA_BOOLEAN} ${ev.isNull} = ${childGen.isNull};
- |${CodeGenerator.JAVA_BOOLEAN} ${ev.value} = false;
- |if (!${ev.isNull}) {
- |  ${ev.value} = $setTerm.contains(${childGen.value});
- |  $setIsNull
- |}
-   """.stripMargin)
+nullSafeCodeGen(ctx, ev, c => {
+  val setTerm = ctx.addReferenceObj("set", set)
+  val setIsNull = if (hasNull) {
+s"${ev.isNull} = !${ev.value};"
+  } else {
+""
+  }
+  s"""
+ |${ev.value} = $setTerm.contains($c);
+ |$setIsNull
+   """.stripMargin
+})
   }
 
   override def sql: String = {

http://git-wip-us.apache.org/repos/asf/spark/blob/b9b68a6d/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
--
diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
index ac76b17..3b60d1d8 100644
--- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
+++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/PredicateSuite.scala
@@ -268,7 +268,7 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 checkEvaluation(InSet(nl, nS), null)
 
 val primitiveTypes = Seq(IntegerType, FloatType, DoubleType, StringType, ByteType, ShortType,
-  LongType, BinaryType, BooleanType, DecimalType.USER_DEFAULT, TimestampType)
+  LongType, BooleanType, DecimalType.USER_DEFAULT, TimestampType)
 primitiveTypes.foreach { t =>
   val dataGen = RandomDataGenerator.forType(t, nullable = true).get
   val inputData = Seq.fill(10) {
@@ -293,6 +293,54 @@ class PredicateSuite extends SparkFunSuite with ExpressionEvalHelper {
 }
   }
 
+