This is an automated email from the ASF dual-hosted git repository. srowen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 8c6d312 [SPARK-37796][SQL] ByteArrayMethods arrayEquals should fast skip the check of aligning with unaligned platform 8c6d312 is described below commit 8c6d3123086cf4def7e8be61214dfc9286578169 Author: ulysses-you <ulyssesyo...@gmail.com> AuthorDate: Wed Jan 5 09:30:05 2022 -0600 [SPARK-37796][SQL] ByteArrayMethods arrayEquals should fast skip the check of aligning with unaligned platform ### What changes were proposed in this pull request? The method `arrayEquals` in `ByteArrayMethods` is a critical function that is used by `UTF8String` methods such as `equals`, `indexOf`, `find`, etc. SPARK-16962 added alignment-checking logic to it. It would be better to skip the alignment check quickly when the platform supports unaligned access. ### Why are the changes needed? Improve the performance. ### Does this PR introduce _any_ user-facing change? no ### How was this patch tested? Pass CI. Run the benchmark using [unaligned-benchmark](https://github.com/ulysses-you/spark/commit/d14d4bfcfeddcf90ccfe7cc3f6cda426d6d6b7e5), and here is the benchmark result: [JDK8](https://github.com/ulysses-you/spark/actions/runs/1639852573) ``` ================================================================================================ byte array equals ================================================================================================ OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure Intel(R) Xeon(R) Platinum 8272CL CPU 2.60GHz Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ Byte Array equals fast 1322 2222 NaN 121.0 8.3 1.0X Byte Array equals 3378 3381 3 47.4 21.1 0.4X ``` [JDK11](https://github.com/ulysses-you/spark/actions/runs/1639853330) ``` ================================================================================================ byte array 
equals ================================================================================================ OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure Intel(R) Xeon(R) Platinum 8272CL CPU 2.60GHz Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ Byte Array equals fast 1860 1891 15 86.0 11.6 1.0X Byte Array equals 2913 2921 8 54.9 18.2 0.6X ``` [JDK17](https://github.com/ulysses-you/spark/actions/runs/1639853938) ``` ================================================================================================ byte array equals ================================================================================================ OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure Intel(R) Xeon(R) Platinum 8171M CPU 2.60GHz Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ Byte Array equals fast 1543 1602 39 103.7 9.6 1.0X Byte Array equals 3027 3029 1 52.9 18.9 0.5X ``` Closes #35078 from ulysses-you/SPARK-37796. 
Authored-by: ulysses-you <ulyssesyo...@gmail.com> Signed-off-by: Sean Owen <sro...@gmail.com> --- .../spark/unsafe/array/ByteArrayMethods.java | 2 +- .../ByteArrayBenchmark-jdk11-results.txt | 10 ++++ .../ByteArrayBenchmark-jdk17-results.txt | 10 ++++ sql/core/benchmarks/ByteArrayBenchmark-results.txt | 10 ++++ .../execution/benchmark/ByteArrayBenchmark.scala | 66 +++++++++++++++++----- 5 files changed, 83 insertions(+), 15 deletions(-) diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java index f3a59e3..5a7e32b 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java @@ -61,7 +61,7 @@ public class ByteArrayMethods { int i = 0; // check if stars align and we can get both offsets to be aligned - if ((leftOffset % 8) == (rightOffset % 8)) { + if (!unaligned && ((leftOffset % 8) == (rightOffset % 8))) { while ((leftOffset + i) % 8 != 0 && i < length) { if (Platform.getByte(leftBase, leftOffset + i) != Platform.getByte(rightBase, rightOffset + i)) { diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt b/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt index 0bdab8d..aafe6e6 100644 --- a/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt @@ -14,3 +14,13 @@ Byte Array compareTo: Best Time(ms) Avg Time(ms) Stdev(m 2-7 byte 548 564 9 119.5 8.4 0.9X +================================================================================================ +byte array equals +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) 
Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Byte Array equals 1860 1891 15 86.0 11.6 1.0X + diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt b/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt index b5e0428..33af4c2 100644 --- a/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt +++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt @@ -14,3 +14,13 @@ Byte Array compareTo: Best Time(ms) Avg Time(ms) Stdev(m 2-7 byte 454 454 0 144.3 6.9 0.9X +================================================================================================ +byte array equals +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Byte Array equals 1543 1602 39 103.7 9.6 1.0X + diff --git a/sql/core/benchmarks/ByteArrayBenchmark-results.txt b/sql/core/benchmarks/ByteArrayBenchmark-results.txt index cf2a6d4..ae1054b 100644 --- a/sql/core/benchmarks/ByteArrayBenchmark-results.txt +++ b/sql/core/benchmarks/ByteArrayBenchmark-results.txt @@ -14,3 +14,13 @@ Byte Array compareTo: Best Time(ms) Avg Time(ms) Stdev(m 2-7 byte 402 403 0 162.8 6.1 1.0X +================================================================================================ +byte array equals +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +Byte Array equals: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
+------------------------------------------------------------------------------------------------------------------------ +Byte Array equals 1322 2222 NaN 121.0 8.3 1.0X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala index f8b1e27..9901684 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala @@ -20,10 +20,11 @@ package org.apache.spark.sql.execution.benchmark import scala.util.Random import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} -import org.apache.spark.unsafe.types.ByteArray +import org.apache.spark.unsafe.array.ByteArrayMethods +import org.apache.spark.unsafe.types.{ByteArray, UTF8String} /** - * Benchmark to measure performance for byte array comparisons. + * Benchmark to measure performance for byte array operators. * {{{ * To run this benchmark: * 1. 
without sbt: @@ -34,21 +35,21 @@ import org.apache.spark.unsafe.types.ByteArray * }}} */ object ByteArrayBenchmark extends BenchmarkBase { + private val chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" + private val randomChar = new Random(0) - def byteArrayComparisons(iters: Long): Unit = { - val chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ" - val random = new Random(0) - def randomBytes(min: Int, max: Int): Array[Byte] = { - val len = random.nextInt(max - min) + min - val bytes = new Array[Byte](len) - var i = 0 - while (i < len) { - bytes(i) = chars.charAt(random.nextInt(chars.length())).toByte - i += 1 - } - bytes + def randomBytes(min: Int, max: Int): Array[Byte] = { + val len = randomChar.nextInt(max - min) + min + val bytes = new Array[Byte](len) + var i = 0 + while (i < len) { + bytes(i) = chars.charAt(randomChar.nextInt(chars.length())).toByte + i += 1 } + bytes + } + def byteArrayComparisons(iters: Long): Unit = { val count = 16 * 1000 val dataTiny = Seq.fill(count)(randomBytes(2, 7)).toArray val dataSmall = Seq.fill(count)(randomBytes(8, 16)).toArray @@ -78,9 +79,46 @@ object ByteArrayBenchmark extends BenchmarkBase { benchmark.run() } + def byteArrayEquals(iters: Long): Unit = { + def binaryEquals(inputs: Array[BinaryEqualInfo]) = { _: Int => + var res = false + for (_ <- 0L until iters) { + inputs.foreach { input => + res = ByteArrayMethods.arrayEquals( + input.s1.getBaseObject, input.s1.getBaseOffset, + input.s2.getBaseObject, input.s2.getBaseOffset + input.deltaOffset, + input.len) + } + } + } + val count = 16 * 1000 + val rand = new Random(0) + val inputs = (0 until count).map { _ => + val s1 = UTF8String.fromBytes(randomBytes(1, 16)) + val s2 = UTF8String.fromBytes(randomBytes(1, 16)) + val len = s1.numBytes().min(s2.numBytes()) + val deltaOffset = rand.nextInt(len) + BinaryEqualInfo(s1, s2, deltaOffset, len) + }.toArray + + val benchmark = new Benchmark("Byte Array equals", count * iters, 25, output = output) + benchmark.addCase("Byte Array 
equals")(binaryEquals(inputs)) + benchmark.run() + } + + case class BinaryEqualInfo( + s1: UTF8String, + s2: UTF8String, + deltaOffset: Int, + len: Int) + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { runBenchmark("byte array comparisons") { byteArrayComparisons(1024 * 4) } + + runBenchmark("byte array equals") { + byteArrayEquals(1000 * 10) + } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org