[GitHub] [spark] maropu commented on a change in pull request #31967: [SPARK-34819][SQL] MapType supports orderable semantics

GitBox Tue, 30 Mar 2021 00:01:36 -0700


maropu commented on a change in pull request #31967:
URL: https://github.com/apache/spark/pull/31967#discussion_r603820580




##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
##########
@@ -687,6 +688,118 @@ class CodegenContext extends Logging {
           }
         """
       s"${addNewFunction(compareFunc, funcCode)}($c1, $c2)"
+    case _ @ MapType(keyType, valueType, valueContainsNull) =>

Review comment:
       What's the difference from @hvanhovell's implementation? That one 
looks simpler, though.
   
https://github.com/apache/spark/pull/15970/files#diff-1501206e78d34b65183af1092c8ec392ce18574bb538f905ca93a22983c63ae6R558-R598

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMapType.scala
##########
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.math.Ordering
+
+import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, 
ExpectsInputTypes, Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
+import 
org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.{getValue, 
javaType}
+import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, MapData, 
TypeUtils}
+import org.apache.spark.sql.types.{AbstractDataType, DataType, MapType}
+
+/**
+ * When comparing two maps, we have to make sure two maps have the same key 
value pairs but
+ * with different key ordering are equal.
+ * For example, Map('a' -> 1, 'b' -> 2) equals to Map('b' -> 2, 'a' -> 1).
+ *
+ * We have to specially handle this in grouping/join/window because Spark SQL 
turns
+ * grouping/join/window partition keys into binary `UnsafeRow` and compare the
+ * binary data directly instead of using MapType's ordering. So in these 
cases, we have
+ * to insert an expression to sort map entries by key.
+ *
+ * Note that, this rule must be executed at the end of optimizer, because the 
optimizer may create
+ * new joins(the subquery rewrite) and new join conditions(the join reorder).
+ */
+object NormalizeMapType extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case w: Window if w.partitionSpec.exists(p => needNormalize(p)) =>
+      w.copy(partitionSpec = w.partitionSpec.map(normalize))
+
+    case j @ ExtractEquiJoinKeys(_, leftKeys, rightKeys, condition, _, _, _)
+      // The analyzer guarantees left and right joins keys are of the same 
data type.
+      if leftKeys.exists(k => needNormalize(k)) =>
+      val newLeftJoinKeys = leftKeys.map(normalize)
+      val newRightJoinKeys = rightKeys.map(normalize)
+      val newConditions = newLeftJoinKeys.zip(newRightJoinKeys).map {
+        case (l, r) => EqualTo(l, r)
+      } ++ condition
+      j.copy(condition = Some(newConditions.reduce(And)))
+  }
+
+  private def needNormalize(expr: Expression): Boolean = expr match {
+    case SortMapKey(_) => false
+    case e if e.dataType.isInstanceOf[MapType] => true
+    case _ => false
+  }
+
+  private[sql] def normalize(expr: Expression): Expression = expr match {
+    case _ if !needNormalize(expr) => expr
+    case e if e.dataType.isInstanceOf[MapType] =>
+      SortMapKey(e)
+  }
+}
+
+case class SortMapKey(child: Expression) extends UnaryExpression with 
ExpectsInputTypes {
+  private lazy val MapType(keyType, valueType, valueContainsNull) = 
dataType.asInstanceOf[MapType]
+  private lazy val keyOrdering: Ordering[Any] = 
TypeUtils.getInterpretedOrdering(keyType)
+  private lazy val mapBuilder = new ArrayBasedMapBuilder(keyType, valueType)
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(MapType)
+
+  override def dataType: DataType = child.dataType
+
+  override def nullSafeEval(input: Any): Any = {
+    val childMap = input.asInstanceOf[MapData]
+    val keys = childMap.keyArray()

Review comment:
       Don't we need to sort the data recursively for nested cases like 
`map<map<int,int>,string>` and `map<struct<a: map<int,int>>,string>`?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMapType.scala
##########
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.math.Ordering
+
+import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, 
ExpectsInputTypes, Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
+import 
org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.{getValue, 
javaType}
+import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, MapData, 
TypeUtils}
+import org.apache.spark.sql.types.{AbstractDataType, DataType, MapType}
+
+/**
+ * When comparing two maps, we have to make sure two maps have the same key 
value pairs but
+ * with different key ordering are equal.
+ * For example, Map('a' -> 1, 'b' -> 2) equals to Map('b' -> 2, 'a' -> 1).
+ *
+ * We have to specially handle this in grouping/join/window because Spark SQL 
turns
+ * grouping/join/window partition keys into binary `UnsafeRow` and compare the
+ * binary data directly instead of using MapType's ordering. So in these 
cases, we have
+ * to insert an expression to sort map entries by key.
+ *
+ * Note that, this rule must be executed at the end of optimizer, because the 
optimizer may create
+ * new joins(the subquery rewrite) and new join conditions(the join reorder).
+ */
+object NormalizeMapType extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case w: Window if w.partitionSpec.exists(p => needNormalize(p)) =>
+      w.copy(partitionSpec = w.partitionSpec.map(normalize))
+
+    case j @ ExtractEquiJoinKeys(_, leftKeys, rightKeys, condition, _, _, _)
+      // The analyzer guarantees left and right joins keys are of the same 
data type.
+      if leftKeys.exists(k => needNormalize(k)) =>
+      val newLeftJoinKeys = leftKeys.map(normalize)
+      val newRightJoinKeys = rightKeys.map(normalize)
+      val newConditions = newLeftJoinKeys.zip(newRightJoinKeys).map {
+        case (l, r) => EqualTo(l, r)
+      } ++ condition
+      j.copy(condition = Some(newConditions.reduce(And)))
+  }
+
+  private def needNormalize(expr: Expression): Boolean = expr match {
+    case SortMapKey(_) => false
+    case e if e.dataType.isInstanceOf[MapType] => true
+    case _ => false
+  }
+
+  private[sql] def normalize(expr: Expression): Expression = expr match {
+    case _ if !needNormalize(expr) => expr
+    case e if e.dataType.isInstanceOf[MapType] =>
+      SortMapKey(e)
+  }
+}
+
+case class SortMapKey(child: Expression) extends UnaryExpression with 
ExpectsInputTypes {

Review comment:
       `SortMapKey` -> `SortMapKeys`?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMapType.scala
##########
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.math.Ordering
+
+import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, 
ExpectsInputTypes, Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
+import 
org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.{getValue, 
javaType}
+import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, MapData, 
TypeUtils}
+import org.apache.spark.sql.types.{AbstractDataType, DataType, MapType}
+
+/**
+ * When comparing two maps, we have to make sure two maps have the same key 
value pairs but
+ * with different key ordering are equal.
+ * For example, Map('a' -> 1, 'b' -> 2) equals to Map('b' -> 2, 'a' -> 1).
+ *
+ * We have to specially handle this in grouping/join/window because Spark SQL 
turns
+ * grouping/join/window partition keys into binary `UnsafeRow` and compare the
+ * binary data directly instead of using MapType's ordering. So in these 
cases, we have
+ * to insert an expression to sort map entries by key.
+ *
+ * Note that, this rule must be executed at the end of optimizer, because the 
optimizer may create
+ * new joins(the subquery rewrite) and new join conditions(the join reorder).

Review comment:
       Could you leave some comments about why this rule does not handle the 
Aggregate cases? 
https://github.com/apache/spark/pull/31967/files#diff-21f071d73070b8257ad76e6e16ec5ed38a13d1278fe94bd42546c258a69f4410R344

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMapType.scala
##########
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.math.Ordering
+
+import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, 
ExpectsInputTypes, Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
+import 
org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.{getValue, 
javaType}
+import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, MapData, 
TypeUtils}
+import org.apache.spark.sql.types.{AbstractDataType, DataType, MapType}
+
+/**
+ * When comparing two maps, we have to make sure two maps have the same key 
value pairs but
+ * with different key ordering are equal.
+ * For example, Map('a' -> 1, 'b' -> 2) equals to Map('b' -> 2, 'a' -> 1).
+ *
+ * We have to specially handle this in grouping/join/window because Spark SQL 
turns
+ * grouping/join/window partition keys into binary `UnsafeRow` and compare the
+ * binary data directly instead of using MapType's ordering. So in these 
cases, we have
+ * to insert an expression to sort map entries by key.
+ *
+ * Note that, this rule must be executed at the end of optimizer, because the 
optimizer may create
+ * new joins(the subquery rewrite) and new join conditions(the join reorder).
+ */
+object NormalizeMapType extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case w: Window if w.partitionSpec.exists(p => needNormalize(p)) =>

Review comment:
       Doesn't this rule support the `BinaryComparison` cases?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala
##########
@@ -141,6 +139,27 @@ object NormalizeFloatingNumbers extends Rule[LogicalPlan] {
       val function = normalize(lv)
       KnownFloatingPointNormalized(ArrayTransform(expr, 
LambdaFunction(function, Seq(lv))))
 
+    case _ if expr.dataType.isInstanceOf[MapType] =>
+      val MapType(kt, vt, containsNull) = expr.dataType
+      var normalized = if (needNormalize(kt)) {

Review comment:
       Could you avoid using `var` here?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMapType.scala
##########
@@ -0,0 +1,155 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import scala.math.Ordering
+
+import org.apache.spark.sql.catalyst.expressions.{And, EqualTo, 
ExpectsInputTypes, Expression, UnaryExpression}
+import org.apache.spark.sql.catalyst.expressions.codegen.CodegenContext
+import 
org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.{getValue, 
javaType}
+import org.apache.spark.sql.catalyst.expressions.codegen.ExprCode
+import org.apache.spark.sql.catalyst.planning.ExtractEquiJoinKeys
+import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Window}
+import org.apache.spark.sql.catalyst.rules.Rule
+import org.apache.spark.sql.catalyst.util.{ArrayBasedMapBuilder, MapData, 
TypeUtils}
+import org.apache.spark.sql.types.{AbstractDataType, DataType, MapType}
+
+/**
+ * When comparing two maps, we have to make sure two maps have the same key 
value pairs but
+ * with different key ordering are equal.
+ * For example, Map('a' -> 1, 'b' -> 2) equals to Map('b' -> 2, 'a' -> 1).
+ *
+ * We have to specially handle this in grouping/join/window because Spark SQL 
turns
+ * grouping/join/window partition keys into binary `UnsafeRow` and compare the
+ * binary data directly instead of using MapType's ordering. So in these 
cases, we have
+ * to insert an expression to sort map entries by key.
+ *
+ * Note that, this rule must be executed at the end of optimizer, because the 
optimizer may create
+ * new joins(the subquery rewrite) and new join conditions(the join reorder).
+ */
+object NormalizeMapType extends Rule[LogicalPlan] {
+  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
+    case w: Window if w.partitionSpec.exists(p => needNormalize(p)) =>
+      w.copy(partitionSpec = w.partitionSpec.map(normalize))
+
+    case j @ ExtractEquiJoinKeys(_, leftKeys, rightKeys, condition, _, _, _)
+      // The analyzer guarantees left and right joins keys are of the same 
data type.
+      if leftKeys.exists(k => needNormalize(k)) =>
+      val newLeftJoinKeys = leftKeys.map(normalize)
+      val newRightJoinKeys = rightKeys.map(normalize)
+      val newConditions = newLeftJoinKeys.zip(newRightJoinKeys).map {
+        case (l, r) => EqualTo(l, r)
+      } ++ condition
+      j.copy(condition = Some(newConditions.reduce(And)))
+  }
+
+  private def needNormalize(expr: Expression): Boolean = expr match {
+    case SortMapKey(_) => false
+    case e if e.dataType.isInstanceOf[MapType] => true
+    case _ => false
+  }
+
+  private[sql] def normalize(expr: Expression): Expression = expr match {
+    case _ if !needNormalize(expr) => expr
+    case e if e.dataType.isInstanceOf[MapType] =>
+      SortMapKey(e)
+  }
+}
+
+case class SortMapKey(child: Expression) extends UnaryExpression with 
ExpectsInputTypes {
+  private lazy val MapType(keyType, valueType, valueContainsNull) = 
dataType.asInstanceOf[MapType]
+  private lazy val keyOrdering: Ordering[Any] = 
TypeUtils.getInterpretedOrdering(keyType)
+  private lazy val mapBuilder = new ArrayBasedMapBuilder(keyType, valueType)
+
+  override def inputTypes: Seq[AbstractDataType] = Seq(MapType)
+
+  override def dataType: DataType = child.dataType
+
+  override def nullSafeEval(input: Any): Any = {
+    val childMap = input.asInstanceOf[MapData]
+    val keys = childMap.keyArray()
+    val values = childMap.valueArray()
+    val sortedKeyIndex = (0 until childMap.numElements()).toArray.sorted(new 
Ordering[Int] {
+      override def compare(a: Int, b: Int): Int = {
+        keyOrdering.compare(keys.get(a, keyType), keys.get(b, keyType))
+      }
+    })
+
+    var i = 0
+    while (i < childMap.numElements()) {
+      val index = sortedKeyIndex(i)
+      mapBuilder.put(
+        keys.get(index, keyType),
+        if (values.isNullAt(index)) null else values.get(index, valueType))
+
+      i += 1
+    }
+
+    mapBuilder.build()
+  }
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {

Review comment:
       To make this PR simpler, how about leaving the codegen support to 
follow-up PRs, just like the original PR did?  
https://github.com/apache/spark/pull/15970/files#diff-da163d97a5f0fc534aad719c4a39eca97116f25bfc05b7d8941b342a3ed96036R423-R429

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeFloatingNumbers.scala
##########
@@ -141,6 +139,27 @@ object NormalizeFloatingNumbers extends Rule[LogicalPlan] {
       val function = normalize(lv)
       KnownFloatingPointNormalized(ArrayTransform(expr, 
LambdaFunction(function, Seq(lv))))
 
+    case _ if expr.dataType.isInstanceOf[MapType] =>
+      val MapType(kt, vt, containsNull) = expr.dataType
+      var normalized = if (needNormalize(kt)) {

Review comment:
       Could you add tests for this new code path in 
`NormalizeFloatingPointNumbersSuite`?

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/optimizer/Optimizer.scala
##########
@@ -230,10 +230,12 @@ abstract class Optimizer(catalogManager: CatalogManager)
       ColumnPruning,
       CollapseProject,
       RemoveNoopOperators) :+
-    // This batch must be executed after the `RewriteSubquery` batch, which 
creates joins.
+    // Following batches must be executed after the `RewriteSubquery` batch, 
which creates joins.
     Batch("NormalizeFloatingNumbers", Once, NormalizeFloatingNumbers) :+
+    Batch("NormalizeMapType", Once, NormalizeMapType) :+
     Batch("ReplaceUpdateFieldsExpression", Once, ReplaceUpdateFieldsExpression)
 
+

Review comment:
       nit: unnecessary change.

##########
File path: sql/core/src/test/scala/org/apache/spark/sql/SQLQuerySuite.scala
##########
@@ -4140,6 +4142,136 @@ class SQLQuerySuite extends QueryTest with 
SharedSparkSession with AdaptiveSpark
       }
     }
   }
+
+  test("SPARK-34819: MapType supports orderable semantics") {
+    Seq(CodegenObjectFactoryMode.CODEGEN_ONLY.toString,
+      CodegenObjectFactoryMode.NO_CODEGEN.toString).foreach {
+      case codegenMode =>
+        withSQLConf(SQLConf.CODEGEN_FACTORY_MODE.key -> codegenMode) {
+          withTable("t", "t2") {
+            val df = Seq(
+              Map("a" -> 1, "b" -> 2, "c" -> 3),
+              Map("c" -> 3, "b" -> 2, "a" -> 1),
+              Map("d" -> 4),
+              Map("a" -> 1, "e" -> 2),
+              Map("d" -> 4),
+              Map("d" -> 5)
+            ).toDF("m")
+            val df2 = Seq(
+              Map("b" -> 2, "a" -> 1, "c" -> 3)
+            ).toDF("m2")
+            df.createOrReplaceTempView("t")
+            df2.createOrReplaceTempView("t2")
+
+            checkAnswer(
+              sql("select m, count(1) from t group by m"),
+              Row(Map("d" -> 4), 2) ::
+                Row(Map("d" -> 5), 1) ::
+                Row(Map("a" -> 1, "e" -> 2), 1) ::
+                Row(Map("a" -> 1, "b" -> 2, "c" -> 3), 2) :: Nil
+            )
+
+            checkAnswer(
+              sql("select distinct m from t"),
+              Row(Map("d" -> 4)) ::
+                Row(Map("d" -> 5)) ::
+                Row(Map("a" -> 1, "e" -> 2)) ::
+                Row(Map("a" -> 1, "b" -> 2, "c" -> 3)) :: Nil
+            )
+
+            checkAnswer(
+              sql("select m from t order by m"),
+              Row(Map("d" -> 4)) ::
+                Row(Map("d" -> 4)) ::
+                Row(Map("d" -> 5)) ::
+                Row(Map("a" -> 1, "e" -> 2)) ::
+                Row(Map("a" -> 1, "b" -> 2, "c" -> 3)) ::
+                Row(Map("c" -> 3, "b" -> 2, "a" -> 1)) :: Nil
+            )
+
+            checkAnswer(
+              sql("select m, count(1) over (partition by m) from t"),
+              Row(Map("d" -> 4), 2) ::
+                Row(Map("d" -> 4), 2) ::
+                Row(Map("d" -> 5), 1) ::
+                Row(Map("a" -> 1, "e" -> 2), 1) ::
+                Row(Map("a" -> 1, "b" -> 2, "c" -> 3), 2) ::
+                Row(Map("c" -> 3, "b" -> 2, "a" -> 1), 2) :: Nil
+            )
+
+            checkAnswer(
+              sql(
+                """select m2, count(1), percentile(id, 0.5) from (
+                  |   select
+                  |     case when size(m) == 3 then m else map('b', 2, 'a', 1, 
'c', 3)
+                  |     end as m2,
+                  |     1 as id
+                  |   from t
+                  |)
+                  |group by m2
+                  |""".stripMargin),
+                Row(Map("a" -> 1, "b" -> 2, "c" -> 3), 6, 1.0) :: Nil
+            )
+
+            checkAnswer(
+              sql("select m, m2 from t join t2 on t.m = t2.m2"),
+              Row(Map("a" -> 1, "b" -> 2, "c" -> 3), Map("b" -> 2, "a" -> 1, 
"c" -> 3)) ::
+                Row(Map("c" -> 3, "b" -> 2, "a" -> 1), Map("b" -> 2, "a" -> 1, 
"c" -> 3)) :: Nil
+            )
+
+            checkAnswer(
+              sql("select distinct m, m2 from t join t2 on t.m = t2.m2"),
+              Row(Map("a" -> 1, "b" -> 2, "c" -> 3), Map("a" -> 1, "b" -> 2, 
"c" -> 3)) :: Nil
+            )
+
+            checkAnswer(
+              sql("select m from t where m = map('b', 2, 'a', 1, 'c', 3)"),
+              Row(Map("a" -> 1, "b" -> 2, "c" -> 3)) ::
+                Row(Map("c" -> 3, "b" -> 2, "a" -> 1)) :: Nil
+            )
+          }
+        }
+    }
+  }
+
+  test("SPARK-34819: MapType has nesting complex type supports orderable 
semantics") {
+    Seq(CodegenObjectFactoryMode.CODEGEN_ONLY.toString,
+      CodegenObjectFactoryMode.NO_CODEGEN.toString).foreach {

Review comment:
       Could you move the two tests into `SQLQueryTestSuite`? You can use the 
`CONFIG_DIM` directive there: 
   
https://github.com/apache/spark/blob/master/sql/core/src/test/resources/sql-tests/inputs/postgreSQL/join.sql#L18-L20

##########
File path: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala
##########
@@ -687,6 +688,118 @@ class CodegenContext extends Logging {
           }
         """
       s"${addNewFunction(compareFunc, funcCode)}($c1, $c2)"
+    case _ @ MapType(keyType, valueType, valueContainsNull) =>

Review comment:
       Btw, can't we reuse the Array case? 
https://github.com/apache/spark/pull/31967/files#diff-1501206e78d34b65183af1092c8ec392ce18574bb538f905ca93a22983c63ae6R643

##########
File path: 
sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/optimizer/NormalizeMapTypeSuite.scala
##########
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.catalyst.optimizer
+
+import org.apache.spark.sql.catalyst.dsl.expressions._
+import org.apache.spark.sql.catalyst.dsl.plans._
+import org.apache.spark.sql.catalyst.plans.PlanTest
+import org.apache.spark.sql.catalyst.plans.logical.{LocalRelation, LogicalPlan}
+import org.apache.spark.sql.catalyst.rules.RuleExecutor
+import org.apache.spark.sql.types.{MapType, StringType}
+
+class NormalizeMapTypeSuite extends PlanTest {
+
+  object Optimize extends RuleExecutor[LogicalPlan] {
+    val batches = Batch("NormalizeMapType", Once, NormalizeMapType) :: Nil
+  }
+
+  val testRelation1 = LocalRelation('a.int, 'm.map(MapType(StringType, 
StringType, false)))
+  val a1 = testRelation1.output(0)
+  val m1 = testRelation1.output(1)
+
+  val testRelation2 = LocalRelation('a.int, 'm.map(MapType(StringType, 
StringType, false)))
+  val a2 = testRelation2.output(0)
+  val m2 = testRelation2.output(1)
+
+  test("normalize map types in window function expressions") {
+    val query = testRelation1.window(Seq(sum(a1).as("sum")), Seq(m1), 
Seq(m1.asc))
+    val optimized = Optimize.execute(query)
+    val correctAnswer = testRelation1.window(Seq(sum(a1).as("sum")),
+      Seq(SortMapKey(m1)), Seq(m1.asc))
+
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("normalize map types in window function expressions - idempotence") {
+    val query = testRelation1.window(Seq(sum(a1).as("sum")), Seq(m1), 
Seq(m1.asc))
+    val optimized = Optimize.execute(query)
+    val doubleOptimized = Optimize.execute(optimized)
+    val correctAnswer = testRelation1.window(Seq(sum(a1).as("sum")),
+      Seq(SortMapKey(m1)), Seq(m1.asc))
+
+    comparePlans(doubleOptimized, correctAnswer)
+  }
+
+  test("normalize map types in join keys") {
+    val query = testRelation1.join(testRelation2, condition = Some(m1 === m2))
+
+    val optimized = Optimize.execute(query)
+    val joinCond = Some(SortMapKey(m1) === SortMapKey(m2))
+    val correctAnswer = testRelation1.join(testRelation2, condition = joinCond)
+
+    comparePlans(optimized, correctAnswer)
+  }
+
+  test("normalize map types in join keys - idempotence") {
+    val query = testRelation1.join(testRelation2, condition = Some(m1 === m2))
+
+    val optimized = Optimize.execute(query)
+    val doubleOptimized = Optimize.execute(optimized)
+    val joinCond = Some(SortMapKey(m1) === SortMapKey(m2))
+    val correctAnswer = testRelation1.join(testRelation2, condition = joinCond)
+
+    comparePlans(doubleOptimized, correctAnswer)
+  }
+}
+
+

Review comment:
       nit: remove unnecessary blank lines.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org



---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

[GitHub] [spark] maropu commented on a change in pull request #31967: [SPARK-34819][SQL] MapType supports orderable semantics

Reply via email to