Re: [PR] [SPARK-49043][SQL] Fix interpreted codepath group by on map containing collated strings [spark]

via GitHub Mon, 05 Aug 2024 02:15:51 -0700


stefankandic commented on code in PR #47521:
URL: https://github.com/apache/spark/pull/47521#discussion_r1703731222



##########
sql/core/src/test/resources/sql-tests/results/mode.sql.out:
##########
@@ -182,15 +182,9 @@ struct<mode(col):map<int,string>>
 -- !query
 SELECT mode(col, true) FROM VALUES (map(1, 'a')) AS tab(col)
 -- !query schema
-struct<>
+struct<mode() WITHIN GROUP (ORDER BY col DESC):map<int,string>>

Review Comment:
   i get the output part, but why is the query schema different?



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -989,6 +989,93 @@ class CollationSuite extends DatasourceV2SQLBase with 
AdaptiveSparkPlanHelper {
     }
   }
 
+  for (collation <- Seq("UTF8_LCASE", "UNICODE_CI", "UTF8_BINARY", "")) {
+    for (codeGen <- Seq("NO_CODEGEN", "CODEGEN_ONLY")) {
+      val collationSetup = if (collation.isEmpty) "" else "collate " + 
collation
+
+      test(s"Group by on map containing $collationSetup strings ($codeGen)") {
+        val table = "t"
+
+        withTable(table) {
+          withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) {
+
+            sql(s"create table $table" +
+              s" (m map<string $collationSetup, string $collationSetup>)")
+            sql(s"insert into $table values (map('aaa', 'AAA'))")
+            sql(s"insert into $table values (map('AAA', 'aaa'))")
+            sql(s"insert into $table values (map('aaa', 'AAA'))")
+            sql(s"insert into $table values (map('bbb', 'BBB'))")
+            sql(s"insert into $table values (map('aAA', 'AaA'))")
+            sql(s"insert into $table values (map('BBb', 'bBB'))")
+            sql(s"insert into $table values (map('aaaa', 'AAA'))")
+
+            val df = sql(s"select count(*) from $table group by m")
+            if (collation.isEmpty ||
+              
CollationFactory.fetchCollation(collation).supportsBinaryEquality) {
+              checkAnswer(df, Seq(Row(2), Row(1), Row(1), Row(1), Row(1), 
Row(1)))
+            } else {
+              checkAnswer(df, Seq(Row(4), Row(2), Row(1)))
+            }
+          }
+        }
+      }
+
+      test(s"Group by on map containing structs with $collationSetup strings 
($codeGen)") {
+        val table = "t"
+
+        withTable(table) {
+          withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) {
+            sql(s"create table $table" +
+              s" (m map<struct<fld1: string $collationSetup, fld2: string 
$collationSetup>, " +
+              s"struct<fld1: string $collationSetup, fld2: string 
$collationSetup>>)")
+            sql(s"insert into $table values " +
+              s"(map(struct('aaa', 'bbb'), struct('ccc', 'ddd')))")
+            sql(s"insert into $table values " +
+              s"(map(struct('Aaa', 'BBB'), struct('cCC', 'dDd')))")
+            sql(s"insert into $table values " +
+              s"(map(struct('AAA', 'BBb'), struct('cCc', 'DDD')))")
+            sql(s"insert into $table values " +
+              s"(map(struct('aaa', 'bbB'), struct('CCC', 'DDD')))")
+
+            val df = sql(s"select count(*) from $table group by m")
+            if (collation.isEmpty ||
+              
CollationFactory.fetchCollation(collation).supportsBinaryEquality) {
+              checkAnswer(df, Seq(Row(1), Row(1), Row(1), Row(1)))
+            } else {
+              checkAnswer(df, Seq(Row(4)))
+            }
+          }
+        }
+      }
+
+      test(s"Group by on map containing arrays with $collationSetup strings 
($codeGen)") {
+        val table = "t"
+
+        withTable(table) {
+          withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) {
+            sql(s"create table $table " +
+              s"(m map<array<string $collationSetup>, array<string 
$collationSetup>>)")
+            sql(s"insert into $table values (map(array('aaa', 'bbb'), 
array('ccc', 'ddd')))")
+            sql(s"insert into $table values (map(array('AAA', 'BbB'), 
array('Ccc', 'ddD')))")
+            sql(s"insert into $table values (map(array('AAA', 'BbB', 'Ccc'), 
array('ddD')))")
+            sql(s"insert into $table values (map(array('aAa', 'Bbb'), 
array('CCC', 'DDD')))")
+            sql(s"insert into $table values (map(array('AAa', 'BBb'), 
array('cCC', 'DDd')))")
+            sql(s"insert into $table values (map(array('AAA', 'BBB', 'CCC'), 
array('DDD')))")
+
+            val df = sql(s"select count(*) from $table group by m")
+            if (collation.isEmpty ||
+              
CollationFactory.fetchCollation(collation).supportsBinaryEquality) {
+              checkAnswer(df, Seq(Row(1), Row(1), Row(1), Row(1), Row(1), 
Row(1)))
+            } else {
+              checkAnswer(df, Seq(Row(4), Row(2)))
+            }
+          }
+        }
+      }
+    }
+  }

Review Comment:
   please add a test that order by map still fails



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -989,6 +989,93 @@ class CollationSuite extends DatasourceV2SQLBase with 
AdaptiveSparkPlanHelper {
     }
   }
 
+  for (collation <- Seq("UTF8_LCASE", "UNICODE_CI", "UTF8_BINARY", "")) {
+    for (codeGen <- Seq("NO_CODEGEN", "CODEGEN_ONLY")) {
+      val collationSetup = if (collation.isEmpty) "" else "collate " + 
collation
+
+      test(s"Group by on map containing $collationSetup strings ($codeGen)") {
+        val table = "t"

Review Comment:
   also for the tests below



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -989,6 +989,93 @@ class CollationSuite extends DatasourceV2SQLBase with 
AdaptiveSparkPlanHelper {
     }
   }
 
+  for (collation <- Seq("UTF8_LCASE", "UNICODE_CI", "UTF8_BINARY", "")) {
+    for (codeGen <- Seq("NO_CODEGEN", "CODEGEN_ONLY")) {
+      val collationSetup = if (collation.isEmpty) "" else "collate " + 
collation
+
+      test(s"Group by on map containing $collationSetup strings ($codeGen)") {
+        val table = "t"
+
+        withTable(table) {
+          withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codeGen) {
+
+            sql(s"create table $table" +
+              s" (m map<string $collationSetup, string $collationSetup>)")
+            sql(s"insert into $table values (map('aaa', 'AAA'))")
+            sql(s"insert into $table values (map('AAA', 'aaa'))")
+            sql(s"insert into $table values (map('aaa', 'AAA'))")
+            sql(s"insert into $table values (map('bbb', 'BBB'))")
+            sql(s"insert into $table values (map('aAA', 'AaA'))")
+            sql(s"insert into $table values (map('BBb', 'bBB'))")
+            sql(s"insert into $table values (map('aaaa', 'AAA'))")
+
+            val df = sql(s"select count(*) from $table group by m")
+            if (collation.isEmpty ||

Review Comment:
   can we extract this predicate into a variable at the top?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/PhysicalDataType.scala:
##########
@@ -234,10 +234,69 @@ case object PhysicalLongType extends PhysicalLongType
 
 case class PhysicalMapType(keyType: DataType, valueType: DataType, 
valueContainsNull: Boolean)
     extends PhysicalDataType {
-  override private[sql] def ordering =
-    throw 
QueryExecutionErrors.orderedOperationUnsupportedByDataTypeError("PhysicalMapType")
-  override private[sql] type InternalType = Any
+  override private[sql] def ordering = interpretedOrdering
+  override private[sql] type InternalType = MapData
   @transient private[sql] lazy val tag = typeTag[InternalType]
+
+  @transient
+  private[sql] lazy val interpretedOrdering: Ordering[MapData] = new 
Ordering[MapData] {
+    private[this] val keyOrdering =
+      PhysicalDataType(keyType).ordering.asInstanceOf[Ordering[Any]]
+    private[this] val valuesOrdering =
+      PhysicalDataType(valueType).ordering.asInstanceOf[Ordering[Any]]
+
+    override def compare(x: MapData, y: MapData): Int = {
+      val lengthX = x.numElements()
+      val lengthY = y.numElements()
+      val keyArrayX = x.keyArray()
+      val valueArrayX = x.valueArray()
+      val keyArrayY = y.keyArray()
+      val valueArrayY = y.valueArray()
+      val minLength = math.min(lengthX, lengthY)
+      var i = 0
+      while (i < minLength) {
+        var comp = compareElements(keyArrayX, keyArrayY, keyType, i, 
keyOrdering)
+        if (comp != 0) {
+          return comp
+        }
+        comp = compareElements(valueArrayX, valueArrayY, valueType, i, 
valuesOrdering)
+        if (comp != 0) {
+          return comp
+        }
+
+        i += 1
+      }
+
+      if (lengthX < lengthY) {
+        -1
+      }
+      else if (lengthX > lengthY) {
+        1
+      }
+      else {
+        0
+      }
+    }
+
+    private def compareElements(arrayX: ArrayData, arrayY: ArrayData, 
dataType: DataType,

Review Comment:
   please follow dbx scala style guide on indentation for method declarations: 
https://github.com/databricks/scala-style-guide?tab=readme-ov-file#indent



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala:
##########
@@ -989,6 +989,93 @@ class CollationSuite extends DatasourceV2SQLBase with 
AdaptiveSparkPlanHelper {
     }
   }
 
+  for (collation <- Seq("UTF8_LCASE", "UNICODE_CI", "UTF8_BINARY", "")) {
+    for (codeGen <- Seq("NO_CODEGEN", "CODEGEN_ONLY")) {
+      val collationSetup = if (collation.isEmpty) "" else "collate " + 
collation
+
+      test(s"Group by on map containing $collationSetup strings ($codeGen)") {
+        val table = "t"

Review Comment:
   ```suggestion
           val tableName = "t"
   ```



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/PhysicalDataType.scala:
##########
@@ -234,10 +234,69 @@ case object PhysicalLongType extends PhysicalLongType
 
 case class PhysicalMapType(keyType: DataType, valueType: DataType, 
valueContainsNull: Boolean)
     extends PhysicalDataType {
-  override private[sql] def ordering =
-    throw 
QueryExecutionErrors.orderedOperationUnsupportedByDataTypeError("PhysicalMapType")
-  override private[sql] type InternalType = Any
+  override private[sql] def ordering = interpretedOrdering
+  override private[sql] type InternalType = MapData
   @transient private[sql] lazy val tag = typeTag[InternalType]
+
+  @transient
+  private[sql] lazy val interpretedOrdering: Ordering[MapData] = new 
Ordering[MapData] {

Review Comment:
   can we add a comment here explaining that maps are not in fact orderable but 
that we need this just to support the group by scenario?



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/types/PhysicalDataType.scala:
##########
@@ -234,10 +234,69 @@ case object PhysicalLongType extends PhysicalLongType
 
 case class PhysicalMapType(keyType: DataType, valueType: DataType, 
valueContainsNull: Boolean)
     extends PhysicalDataType {
-  override private[sql] def ordering =
-    throw 
QueryExecutionErrors.orderedOperationUnsupportedByDataTypeError("PhysicalMapType")
-  override private[sql] type InternalType = Any
+  override private[sql] def ordering = interpretedOrdering
+  override private[sql] type InternalType = MapData
   @transient private[sql] lazy val tag = typeTag[InternalType]
+
+  @transient
+  private[sql] lazy val interpretedOrdering: Ordering[MapData] = new 
Ordering[MapData] {
+    private[this] val keyOrdering =
+      PhysicalDataType(keyType).ordering.asInstanceOf[Ordering[Any]]
+    private[this] val valuesOrdering =
+      PhysicalDataType(valueType).ordering.asInstanceOf[Ordering[Any]]
+
+    override def compare(x: MapData, y: MapData): Int = {
+      val lengthX = x.numElements()

Review Comment:
   can we follow the `PhysicalArrayType` convention of naming these variables 
`left` and `right` instead of `x` and `y`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-49043][SQL] Fix interpreted codepath group by on map containing collated strings [spark]

Reply via email to