[spark] branch branch-3.0 updated: [SPARK-31563][SQL] Fix failure of InSet.sql for collections of Catalyst's internal types

dongjoon Sat, 25 Apr 2020 09:31:43 -0700

This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/spark.git



The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new dbcf855  [SPARK-31563][SQL] Fix failure of InSet.sql for collections 
of Catalyst's internal types
dbcf855 is described below

commit dbcf855d8448b7bf8ba9d8c4a08f8bef14be2805
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Sat Apr 25 09:29:51 2020 -0700

    [SPARK-31563][SQL] Fix failure of InSet.sql for collections of Catalyst's 
internal types
    
    ### What changes were proposed in this pull request?
    In this PR, I propose to fix the `InSet.sql` method for cases where the input 
collection contains values of Catalyst's internal types, for instance 
`UTF8String`. Elements of the input set `hset` are converted to Scala types 
and wrapped in `Literal` to properly form the SQL view of the input collection.
    
    ### Why are the changes needed?
    The changes fix a bug in `InSet.sql`, which makes a wrong assumption about 
the types of collection elements. See SPARK-31563 for more details.
    
    ### Does this PR introduce any user-facing change?
    Most likely not.
    
    ### How was this patch tested?
    Added a test to `ColumnExpressionSuite`
    
    Closes #28343 from MaxGekk/fix-InSet-sql.
    
    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
    (cherry picked from commit 7d8216a6642f40af0d1b623129b1d5f4c86bec68)
    Signed-off-by: Dongjoon Hyun <dongj...@apache.org>
---
 .../org/apache/spark/sql/catalyst/expressions/predicates.scala    | 5 ++++-
 .../test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala   | 8 +++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
index bd190c3..ac492cf 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/predicates.scala
@@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions
 
 import scala.collection.immutable.TreeSet
 
+import org.apache.spark.sql.catalyst.CatalystTypeConverters.convertToScala
 import org.apache.spark.sql.catalyst.InternalRow
 import org.apache.spark.sql.catalyst.analysis.TypeCheckResult
 import org.apache.spark.sql.catalyst.expressions.BindReferences.bindReference
@@ -519,7 +520,9 @@ case class InSet(child: Expression, hset: Set[Any]) extends 
UnaryExpression with
 
   override def sql: String = {
     val valueSQL = child.sql
-    val listSQL = hset.toSeq.map(Literal(_).sql).mkString(", ")
+    val listSQL = hset.toSeq
+      .map(elem => Literal(convertToScala(elem, child.dataType)).sql)
+      .mkString(", ")
     s"($valueSQL IN ($listSQL))"
   }
 }
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala 
b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
index a9ee25b..b72d92b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/ColumnExpressionSuite.scala
@@ -26,12 +26,13 @@ import org.apache.hadoop.io.{LongWritable, Text}
 import org.apache.hadoop.mapreduce.lib.input.{TextInputFormat => 
NewTextInputFormat}
 import org.scalatest.Matchers._
 
-import org.apache.spark.sql.catalyst.expressions.{In, InSet, NamedExpression}
+import org.apache.spark.sql.catalyst.expressions.{In, InSet, Literal, 
NamedExpression}
 import org.apache.spark.sql.execution.ProjectExec
 import org.apache.spark.sql.functions._
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.test.SharedSparkSession
 import org.apache.spark.sql.types._
+import org.apache.spark.unsafe.types.UTF8String
 
 class ColumnExpressionSuite extends QueryTest with SharedSparkSession {
   import testImplicits._
@@ -869,4 +870,9 @@ class ColumnExpressionSuite extends QueryTest with 
SharedSparkSession {
       df.select(typedLit(("a", 2, 1.0))),
       Row(Row("a", 2, 1.0)) :: Nil)
   }
+
+  test("SPARK-31563: sql of InSet for UTF8String collection") {
+    val inSet = InSet(Literal("a"), Set("a", "b").map(UTF8String.fromString))
+    assert(inSet.sql === "('a' IN ('a', 'b'))")
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

[spark] branch branch-3.0 updated: [SPARK-31563][SQL] Fix failure of InSet.sql for collections of Catalyst's internal types

Reply via email to