This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 0d435411ec5 [SPARK-41029][SQL] Optimize constructor use of `GenericArrayData` for Scala 2.13 0d435411ec5 is described below commit 0d435411ec5c69e6fd94636986f9749abbcf09a1 Author: yangjie01 <yangji...@baidu.com> AuthorDate: Tue Nov 8 08:42:35 2022 -0600 [SPARK-41029][SQL] Optimize constructor use of `GenericArrayData` for Scala 2.13 ### What changes were proposed in this pull request? This PR changes to use a more appropriate constructor when the input is an `ArrayBuffer` or an empty collection, to improve the construction performance of `GenericArrayData` with Scala 2.13. ### Why are the changes needed? Minor performance improvement. `GenericArrayData` has the following constructor https://github.com/apache/spark/blob/57d492556768eb341f525ce7eb5c934089fa9e7e/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/GenericArrayData.scala#L30 When the input type is `ArrayBuffer`, code similar to the following is common in Spark ``` new GenericArrayData(arrayBuffer.toSeq) ``` For Scala 2.12, there is no performance gap between `new GenericArrayData(arrayBuffer.toSeq)` and `new GenericArrayData(arrayBuffer)`. However, when Scala 2.13 is used, there is a performance gap, because `toSeq` causes a redundant memory copy.
For the following test case: ```scala val valuesPerIteration: Long = 1000 * 1000 * 10 val buffer = if (bufferSize == 0) { ArrayBuffer.empty[Any] } else { ArrayBuffer.fill[Any](bufferSize)(() => 1) } val benchmark = new Benchmark(s"constructor with buffer size = $bufferSize", valuesPerIteration, output = output) benchmark.addCase("toSeq and construct") { _ => var n = 0 while (n < valuesPerIteration) { new GenericArrayData(buffer.toSeq) n += 1 } } benchmark.addCase("construct directly") { _ => var n = 0 while (n < valuesPerIteration) { new GenericArrayData(buffer) n += 1 } } ``` When `bufferSize = 10`, there is a performance gap of more than 5 times between `new GenericArrayData(buffer.toSeq)` and `new GenericArrayData(buffer)`, and the gap increases almost linearly as `bufferSize` increases. ``` OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1022-azure Intel(R) Xeon(R) Platinum 8370C CPU 2.80GHz constructor with buffer size = 10: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ toSeq and construct 2617 2622 7 3.8 261.7 1.0X construct directly 399 406 11 25.1 39.9 6.6X OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1022-azure Intel(R) Xeon(R) Platinum 8370C CPU 2.80GHz constructor with buffer size = 100: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ toSeq and construct 12512 12554 60 0.8 1251.2 1.0X construct directly 779 781 2 12.8 77.9 16.1X OpenJDK 64-Bit Server VM 1.8.0_345-b01 on Linux 5.15.0-1022-azure Intel(R) Xeon(R) Platinum 8370C CPU 2.80GHz constructor with buffer size = 1000: Best Time(ms) Avg 
Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ toSeq and construct 108882 109400 732 0.1 10888.2 1.0X construct directly 5717 5731 20 1.7 571.7 19.0X ``` We can safely change `new GenericArrayData(buffer.toSeq)` to `new GenericArrayData(buffer)` because `ArrayBuffer` is still a `scala.collection.Seq` in Scala 2.13. On the other hand, when the input is an empty collection, using `Array.empty` is 10% faster than using `Seq.empty`. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? - Pass GitHub Actions - Manual tests `catalyst` and `sql` module with Scala 2.13 passed Closes #38533 from LuciferYang/create-GenericArrayData. Authored-by: yangjie01 <yangji...@baidu.com> Signed-off-by: Sean Owen <sro...@gmail.com> --- .../sql/catalyst/expressions/aggregate/Mode.scala | 4 ++-- .../catalyst/expressions/collectionOperations.scala | 18 +++++++++--------- .../catalyst/expressions/higherOrderFunctions.scala | 2 +- .../sql/catalyst/expressions/stringExpressions.scala | 4 ++-- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala index cd6e1a5a18e..cad7d1f07dc 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/aggregate/Mode.scala @@ -142,7 +142,7 @@ case class PandasMode( override def eval(buffer: OpenHashMap[AnyRef, Long]): Any = { if (buffer.isEmpty) { - return new GenericArrayData(Seq.empty) + return new GenericArrayData(Array.empty) } val modes = collection.mutable.ArrayBuffer.empty[AnyRef] @@ -158,7 +158,7 @@ case class PandasMode( modes.append(key) } } - new GenericArrayData(modes.toSeq) + new 
GenericArrayData(modes) } override def withNewMutableAggBufferOffset(newMutableAggBufferOffset: Int): PandasMode = diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 5ba15109dc7..f3d846615fa 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -3713,7 +3713,7 @@ case class ArrayDistinct(child: Expression) withNullCheckFunc(array, i) i += 1 } - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } else { (data: ArrayData) => { val array = data.toArray[AnyRef](elementType) @@ -3739,7 +3739,7 @@ case class ArrayDistinct(child: Expression) } } } - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } } @@ -3895,7 +3895,7 @@ case class ArrayUnion(left: Expression, right: Expression) extends ArrayBinaryLi i += 1 } } - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } else { (array1, array2) => val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any] @@ -3926,7 +3926,7 @@ case class ArrayUnion(left: Expression, right: Expression) extends ArrayBinaryLi arrayBuffer += elem } })) - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } } @@ -4060,7 +4060,7 @@ object ArrayUnion { arrayBuffer += elem } })) - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } } @@ -4131,7 +4131,7 @@ case class ArrayIntersect(left: Expression, right: Expression) extends ArrayBina withArray1NullCheckFunc(array1, i) i += 1 } - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } else { new GenericArrayData(Array.emptyObjectArray) } @@ -4179,7 +4179,7 @@ case class ArrayIntersect(left: Expression, right: Expression) 
extends ArrayBina } i += 1 } - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } else { new GenericArrayData(Array.emptyObjectArray) } @@ -4354,7 +4354,7 @@ case class ArrayExcept(left: Expression, right: Expression) extends ArrayBinaryL withArray1NullCheckFunc(array1, i) i += 1 } - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } else { (array1, array2) => val arrayBuffer = new scala.collection.mutable.ArrayBuffer[Any] @@ -4399,7 +4399,7 @@ case class ArrayExcept(left: Expression, right: Expression) extends ArrayBinaryL } i += 1 } - new GenericArrayData(arrayBuffer.toSeq) + new GenericArrayData(arrayBuffer) } } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala index b59860ff181..2a8334e942d 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/higherOrderFunctions.scala @@ -602,7 +602,7 @@ case class ArrayFilter( } i += 1 } - new GenericArrayData(buffer.toSeq) + new GenericArrayData(buffer) } override def prettyName: String = "filter" diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala index cc47d739d71..45bed3e2387 100755 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -2890,9 +2890,9 @@ case class Sentences( widx = wi.current if (Character.isLetterOrDigit(word.charAt(0))) words += UTF8String.fromString(word) } - result += new GenericArrayData(words.toSeq) + result += new GenericArrayData(words) } - new 
GenericArrayData(result.toSeq) + new GenericArrayData(result) } override protected def withNewChildrenInternal( --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org