This is an automated email from the ASF dual-hosted git repository.

srowen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 8c6d312  [SPARK-37796][SQL] ByteArrayMethods arrayEquals should fast 
skip the check of aligning with unaligned platform
8c6d312 is described below

commit 8c6d3123086cf4def7e8be61214dfc9286578169
Author: ulysses-you <ulyssesyo...@gmail.com>
AuthorDate: Wed Jan 5 09:30:05 2022 -0600

    [SPARK-37796][SQL] ByteArrayMethods arrayEquals should fast skip the check 
of aligning with unaligned platform
    
    ### What changes were proposed in this pull request?
    
    The method `arrayEquals` in `ByteArrayMethods` is a critical function which 
is used by `UTF8String` methods such as `equals`, `indexOf`, `find`, etc.
    
    SPARK-16962 added the complexity of alignment handling to it. It would be 
better to skip the alignment check early if the platform supports unaligned access.
    
    ### Why are the changes needed?
    
    Improve the performance.
    
    ### Does this PR introduce _any_ user-facing change?
    
    no
    
    ### How was this patch tested?
    
    Pass CI. Run the benchmark using 
[unaligned-benchmark](https://github.com/ulysses-you/spark/commit/d14d4bfcfeddcf90ccfe7cc3f6cda426d6d6b7e5),
 and here is the benchmark result:
    
    [JDK8](https://github.com/ulysses-you/spark/actions/runs/1639852573)
    ```
    
================================================================================================
    byte array equals
    
================================================================================================
    
    OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
    Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
    Byte Array equals:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    
------------------------------------------------------------------------------------------------------------------------
    Byte Array equals fast                             1322           2222      
   NaN        121.0           8.3       1.0X
    Byte Array equals                                  3378           3381      
     3         47.4          21.1       0.4X
    ```
    
    [JDK11](https://github.com/ulysses-you/spark/actions/runs/1639853330)
    ```
    
================================================================================================
    byte array equals
    
================================================================================================
    
    OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
    Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
    Byte Array equals:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    
------------------------------------------------------------------------------------------------------------------------
    Byte Array equals fast                             1860           1891      
    15         86.0          11.6       1.0X
    Byte Array equals                                  2913           2921      
     8         54.9          18.2       0.6X
    ```
    
    [JDK17](https://github.com/ulysses-you/spark/actions/runs/1639853938)
    ```
    
================================================================================================
    byte array equals
    
================================================================================================
    
    OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
    Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
    Byte Array equals:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
    
------------------------------------------------------------------------------------------------------------------------
    Byte Array equals fast                             1543           1602      
    39        103.7           9.6       1.0X
    Byte Array equals                                  3027           3029      
     1         52.9          18.9       0.5X
    ```
    
    Closes #35078 from ulysses-you/SPARK-37796.
    
    Authored-by: ulysses-you <ulyssesyo...@gmail.com>
    Signed-off-by: Sean Owen <sro...@gmail.com>
---
 .../spark/unsafe/array/ByteArrayMethods.java       |  2 +-
 .../ByteArrayBenchmark-jdk11-results.txt           | 10 ++++
 .../ByteArrayBenchmark-jdk17-results.txt           | 10 ++++
 sql/core/benchmarks/ByteArrayBenchmark-results.txt | 10 ++++
 .../execution/benchmark/ByteArrayBenchmark.scala   | 66 +++++++++++++++++-----
 5 files changed, 83 insertions(+), 15 deletions(-)

diff --git 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
index f3a59e3..5a7e32b 100644
--- 
a/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
+++ 
b/common/unsafe/src/main/java/org/apache/spark/unsafe/array/ByteArrayMethods.java
@@ -61,7 +61,7 @@ public class ByteArrayMethods {
     int i = 0;
 
     // check if stars align and we can get both offsets to be aligned
-    if ((leftOffset % 8) == (rightOffset % 8)) {
+    if (!unaligned && ((leftOffset % 8) == (rightOffset % 8))) {
       while ((leftOffset + i) % 8 != 0 && i < length) {
         if (Platform.getByte(leftBase, leftOffset + i) !=
             Platform.getByte(rightBase, rightOffset + i)) {
diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt 
b/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt
index 0bdab8d..aafe6e6 100644
--- a/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt
+++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk11-results.txt
@@ -14,3 +14,13 @@ Byte Array compareTo:                     Best Time(ms)   
Avg Time(ms)   Stdev(m
 2-7 byte                                            548            564         
  9        119.5           8.4       0.9X
 
 
+================================================================================================
+byte array equals
+================================================================================================
+
+OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Byte Array equals:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Byte Array equals                                  1860           1891         
 15         86.0          11.6       1.0X
+
diff --git a/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt 
b/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt
index b5e0428..33af4c2 100644
--- a/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt
+++ b/sql/core/benchmarks/ByteArrayBenchmark-jdk17-results.txt
@@ -14,3 +14,13 @@ Byte Array compareTo:                     Best Time(ms)   
Avg Time(ms)   Stdev(m
 2-7 byte                                            454            454         
  0        144.3           6.9       0.9X
 
 
+================================================================================================
+byte array equals
+================================================================================================
+
+OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz
+Byte Array equals:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Byte Array equals                                  1543           1602         
 39        103.7           9.6       1.0X
+
diff --git a/sql/core/benchmarks/ByteArrayBenchmark-results.txt 
b/sql/core/benchmarks/ByteArrayBenchmark-results.txt
index cf2a6d4..ae1054b 100644
--- a/sql/core/benchmarks/ByteArrayBenchmark-results.txt
+++ b/sql/core/benchmarks/ByteArrayBenchmark-results.txt
@@ -14,3 +14,13 @@ Byte Array compareTo:                     Best Time(ms)   
Avg Time(ms)   Stdev(m
 2-7 byte                                            402            403         
  0        162.8           6.1       1.0X
 
 
+================================================================================================
+byte array equals
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure
+Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz
+Byte Array equals:                        Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+Byte Array equals                                  1322           2222         
NaN        121.0           8.3       1.0X
+
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
index f8b1e27..9901684 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/ByteArrayBenchmark.scala
@@ -20,10 +20,11 @@ package org.apache.spark.sql.execution.benchmark
 import scala.util.Random
 
 import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
-import org.apache.spark.unsafe.types.ByteArray
+import org.apache.spark.unsafe.array.ByteArrayMethods
+import org.apache.spark.unsafe.types.{ByteArray, UTF8String}
 
 /**
- * Benchmark to measure performance for byte array comparisons.
+ * Benchmark to measure performance for byte array operators.
  * {{{
  *   To run this benchmark:
  *   1. without sbt:
@@ -34,21 +35,21 @@ import org.apache.spark.unsafe.types.ByteArray
  * }}}
  */
 object ByteArrayBenchmark extends BenchmarkBase {
+  private val chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+  private val randomChar = new Random(0)
 
-  def byteArrayComparisons(iters: Long): Unit = {
-    val chars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-    val random = new Random(0)
-    def randomBytes(min: Int, max: Int): Array[Byte] = {
-      val len = random.nextInt(max - min) + min
-      val bytes = new Array[Byte](len)
-      var i = 0
-      while (i < len) {
-        bytes(i) = chars.charAt(random.nextInt(chars.length())).toByte
-        i += 1
-      }
-      bytes
+  def randomBytes(min: Int, max: Int): Array[Byte] = {
+    val len = randomChar.nextInt(max - min) + min
+    val bytes = new Array[Byte](len)
+    var i = 0
+    while (i < len) {
+      bytes(i) = chars.charAt(randomChar.nextInt(chars.length())).toByte
+      i += 1
     }
+    bytes
+  }
 
+  def byteArrayComparisons(iters: Long): Unit = {
     val count = 16 * 1000
     val dataTiny = Seq.fill(count)(randomBytes(2, 7)).toArray
     val dataSmall = Seq.fill(count)(randomBytes(8, 16)).toArray
@@ -78,9 +79,46 @@ object ByteArrayBenchmark extends BenchmarkBase {
     benchmark.run()
   }
 
+  def byteArrayEquals(iters: Long): Unit = {
+    def binaryEquals(inputs: Array[BinaryEqualInfo]) = { _: Int =>
+      var res = false
+      for (_ <- 0L until iters) {
+        inputs.foreach { input =>
+          res = ByteArrayMethods.arrayEquals(
+            input.s1.getBaseObject, input.s1.getBaseOffset,
+            input.s2.getBaseObject, input.s2.getBaseOffset + input.deltaOffset,
+            input.len)
+        }
+      }
+    }
+    val count = 16 * 1000
+    val rand = new Random(0)
+    val inputs = (0 until count).map { _ =>
+      val s1 = UTF8String.fromBytes(randomBytes(1, 16))
+      val s2 = UTF8String.fromBytes(randomBytes(1, 16))
+      val len = s1.numBytes().min(s2.numBytes())
+      val deltaOffset = rand.nextInt(len)
+      BinaryEqualInfo(s1, s2, deltaOffset, len)
+    }.toArray
+
+    val benchmark = new Benchmark("Byte Array equals", count * iters, 25, 
output = output)
+    benchmark.addCase("Byte Array equals")(binaryEquals(inputs))
+    benchmark.run()
+  }
+
+  case class BinaryEqualInfo(
+      s1: UTF8String,
+      s2: UTF8String,
+      deltaOffset: Int,
+      len: Int)
+
   override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
     runBenchmark("byte array comparisons") {
       byteArrayComparisons(1024 * 4)
     }
+
+    runBenchmark("byte array equals") {
+      byteArrayEquals(1000 * 10)
+    }
   }
 }

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to