Repository: spark Updated Branches: refs/heads/branch-2.0 664426e00 -> 22fe336c8
[SPARK-16135][SQL] Remove hashCode and euqals in ArrayBasedMapData ## What changes were proposed in this pull request? This pr is to remove `hashCode` and `equals` in `ArrayBasedMapData` because the type cannot be used as join keys, grouping keys, or in equality tests. ## How was this patch tested? Add a new test suite `MapDataSuite` for comparison tests. Author: Takeshi YAMAMURO <linguin....@gmail.com> Closes #13847 from maropu/UnsafeMapTest. (cherry picked from commit 3e4e868c850e6b6da2c0d005167316e1abdc7460) Signed-off-by: Wenchen Fan <wenc...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/22fe336c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/22fe336c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/22fe336c Branch: refs/heads/branch-2.0 Commit: 22fe336c8d379ecb723c58264f8c4b98c244008a Parents: 664426e Author: Takeshi YAMAMURO <linguin....@gmail.com> Authored: Mon Jun 27 21:45:22 2016 +0800 Committer: Wenchen Fan <wenc...@databricks.com> Committed: Mon Jun 27 21:46:38 2016 +0800 ---------------------------------------------------------------------- .../catalyst/expressions/UnsafeArrayData.java | 4 ++ .../sql/catalyst/util/ArrayBasedMapData.scala | 17 ------ .../spark/sql/catalyst/util/MapData.scala | 5 ++ .../expressions/CodeGenerationSuite.scala | 8 +-- .../expressions/ExpressionEvalHelper.scala | 8 ++- .../sql/catalyst/expressions/MapDataSuite.scala | 57 ++++++++++++++++++++ 6 files changed, 76 insertions(+), 23 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/22fe336c/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java index 02a863b..6302660 100644 --- a/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java +++ b/sql/catalyst/src/main/java/org/apache/spark/sql/catalyst/expressions/UnsafeArrayData.java @@ -298,6 +298,10 @@ public final class UnsafeArrayData extends ArrayData { return map; } + // This `hashCode` computation could consume much processor time for large data. + // If the computation becomes a bottleneck, we can use a light-weight logic; the first fixed bytes + // are used to compute `hashCode` (See `Vector.hashCode`). + // The same issue exists in `UnsafeRow.hashCode`. @Override public int hashCode() { return Murmur3_x86_32.hashUnsafeBytes(baseObject, baseOffset, sizeInBytes, 42); http://git-wip-us.apache.org/repos/asf/spark/blob/22fe336c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala index d46f03a..4449da1 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/ArrayBasedMapData.scala @@ -24,23 +24,6 @@ class ArrayBasedMapData(val keyArray: ArrayData, val valueArray: ArrayData) exte override def copy(): MapData = new ArrayBasedMapData(keyArray.copy(), valueArray.copy()) - override def equals(o: Any): Boolean = { - if (!o.isInstanceOf[ArrayBasedMapData]) { - return false - } - - val other = o.asInstanceOf[ArrayBasedMapData] - if (other eq null) { - return false - } - - this.keyArray == other.keyArray && this.valueArray == other.valueArray - } - - override def hashCode: Int = { - keyArray.hashCode() * 37 + valueArray.hashCode() - } - override def toString: String = { s"keys: $keyArray, values: $valueArray" } http://git-wip-us.apache.org/repos/asf/spark/blob/22fe336c/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala index 40db606..94e8824 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/MapData.scala @@ -19,6 +19,11 @@ package org.apache.spark.sql.catalyst.util import org.apache.spark.sql.types.DataType +/** + * This is an internal data representation for map type in Spark SQL. This should not implement + * `equals` and `hashCode` because the type cannot be used as join keys, grouping keys, or + * in equality tests. See SPARK-9415 and PR#13847 for the discussions. + */ abstract class MapData extends Serializable { def numElements(): Int http://git-wip-us.apache.org/repos/asf/spark/blob/22fe336c/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 62429a2..60dd03f 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -110,10 +110,10 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { case (expr, i) => Seq(Literal(i), expr) })) val plan = GenerateMutableProjection.generate(expressions) - val actual = plan(new GenericMutableRow(length)).toSeq(expressions.map(_.dataType)) - val expected = Seq(new ArrayBasedMapData( - new GenericArrayData(0 until length), - new GenericArrayData(Seq.fill(length)(true)))) + val actual = plan(new GenericMutableRow(length)).toSeq(expressions.map(_.dataType)).map { + case m: ArrayBasedMapData => ArrayBasedMapData.toScalaMap(m) + } + val expected = (0 until length).map((_, true)).toMap :: Nil if (!checkResult(actual, expected)) { fail(s"Incorrect Evaluation: expressions: $expressions, actual: $actual, expected: $expected") http://git-wip-us.apache.org/repos/asf/spark/blob/22fe336c/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala index 8a9617c..e58a0df 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/ExpressionEvalHelper.scala @@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.codegen._ import org.apache.spark.sql.catalyst.optimizer.SimpleTestOptimizer import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} +import org.apache.spark.sql.catalyst.util.MapData import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils @@ -52,7 +53,7 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { /** * Check the equality between result of expression and expected value, it will handle - * Array[Byte] and Spread[Double]. + * Array[Byte], Spread[Double], and MapData. */ protected def checkResult(result: Any, expected: Any): Boolean = { (result, expected) match { @@ -60,7 +61,10 @@ trait ExpressionEvalHelper extends GeneratorDrivenPropertyChecks { java.util.Arrays.equals(result, expected) case (result: Double, expected: Spread[Double @unchecked]) => expected.asInstanceOf[Spread[Double]].isWithin(result) - case _ => result == expected + case (result: MapData, expected: MapData) => + result.keyArray() == expected.keyArray() && result.valueArray() == expected.valueArray() + case _ => + result == expected } } http://git-wip-us.apache.org/repos/asf/spark/blob/22fe336c/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala new file mode 100644 index 0000000..0f1264c --- /dev/null +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/MapDataSuite.scala @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.catalyst.expressions + +import scala.collection._ + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.catalyst.util.ArrayBasedMapData +import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType} +import org.apache.spark.unsafe.types.UTF8String + +class MapDataSuite extends SparkFunSuite { + + test("inequality tests") { + def u(str: String): UTF8String = UTF8String.fromString(str) + + // test data + val testMap1 = Map(u("key1") -> 1) + val testMap2 = Map(u("key1") -> 1, u("key2") -> 2) + val testMap3 = Map(u("key1") -> 1) + val testMap4 = Map(u("key1") -> 1, u("key2") -> 2) + + // ArrayBasedMapData + val testArrayMap1 = ArrayBasedMapData(testMap1.toMap) + val testArrayMap2 = ArrayBasedMapData(testMap2.toMap) + val testArrayMap3 = ArrayBasedMapData(testMap3.toMap) + val testArrayMap4 = ArrayBasedMapData(testMap4.toMap) + assert(testArrayMap1 !== testArrayMap3) + assert(testArrayMap2 !== testArrayMap4) + + // UnsafeMapData + val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType))) + val row = new GenericMutableRow(1) + def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = { + row.update(0, map) + val unsafeRow = unsafeConverter.apply(row) + unsafeRow.getMap(0).copy + } + assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3)) + assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4)) + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org