This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new e9f204ae9306 [SPARK-46840][SQL][TESTS] Add `CollationBenchmark`
e9f204ae9306 is described below

commit e9f204ae93061a862e4da52c128eaf3512a66c7b
Author: GideonPotok <g.pot...@gmail.com>
AuthorDate: Mon Apr 1 22:29:28 2024 +0800

    [SPARK-46840][SQL][TESTS] Add `CollationBenchmark`
    
    ### What changes were proposed in this pull request?
    
    https://issues.apache.org/jira/browse/SPARK-46840
    
    [Collation Support in Spark.docx](https://github.com/apache/spark/files/14551958/Collation.Support.in.Spark.docx)
    
    ### Why are the changes needed?
    
    Work is underway to introduce the concept of collation into Spark. A
    benchmarking suite is needed so that engineers can measure and address the
    performance impact of collation-aware string operations.
    
    ### Does this PR introduce _any_ user-facing change?
    
    No.
    
    ### How was this patch tested?
    
    GHA 'Run Benchmarks' ran on this, for both JDK 17 and JDK 21
    
    In addition, both the author and dbatomic tested locally on their personal computers:
    `build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"`
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #45453 from GideonPotok/spark_46840.
    
    Authored-by: GideonPotok <g.pot...@gmail.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../CollationBenchmark-jdk21-results.txt           |  27 +++++
 sql/core/benchmarks/CollationBenchmark-results.txt |  27 +++++
 .../execution/benchmark/CollationBenchmark.scala   | 129 +++++++++++++++++++++
 3 files changed, 183 insertions(+)

diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt 
b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
new file mode 100644
index 000000000000..e1d7a42aac61
--- /dev/null
+++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                   29904          29937       
   47          0.0      299036.1       1.0X
+UNICODE                                              3886           3893       
   10          0.0       38863.0       7.7X
+UTF8_BINARY                                          3945           3945       
    0          0.0       39449.6       7.6X
+UNICODE_CI                                          45321          45330       
   12          0.0      453210.3       0.7X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                    29807          29818      
    17          0.0      298065.0       1.0X
+UNICODE                                              45704          45723      
    27          0.0      457036.2       0.7X
+UTF8_BINARY                                           6460           6464      
     7          0.0       64597.9       4.6X
+UNICODE_CI                                           45498          45508      
    14          0.0      454977.6       0.7X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 23553          23595         
 59          0.0      235531.8       1.0X
+UNICODE                                          197303         197309         
  8          0.0     1973034.1       0.1X
+UTF8_BINARY                                       14389          14391         
  2          0.0      143891.2       1.6X
+UNICODE_CI                                       166880         166885         
  7          0.0     1668799.5       0.1X
+
diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt 
b/sql/core/benchmarks/CollationBenchmark-results.txt
new file mode 100644
index 000000000000..d8ebdfa695ff
--- /dev/null
+++ b/sql/core/benchmarks/CollationBenchmark-results.txt
@@ -0,0 +1,27 @@
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - equalsFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+--------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                   34122          34152       
   42          0.0      341224.2       1.0X
+UNICODE                                              4520           4522       
    2          0.0       45201.8       7.5X
+UTF8_BINARY                                          4524           4526       
    2          0.0       45243.0       7.5X
+UNICODE_CI                                          52706          52711       
    7          0.0      527056.1       0.6X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - compareFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+---------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                    33467          33474      
    10          0.0      334671.7       1.0X
+UNICODE                                              51168          51168      
     1          0.0      511677.4       0.7X
+UTF8_BINARY                                           5561           5593      
    45          0.0       55610.9       6.0X
+UNICODE_CI                                           51929          51955      
    36          0.0      519291.8       0.6X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - hashFunction:  Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 22079          22083         
  5          0.0      220786.7       1.0X
+UNICODE                                          177636         177709         
103          0.0     1776363.9       0.1X
+UTF8_BINARY                                       11954          11956         
  3          0.0      119536.7       1.8X
+UNICODE_CI                                       158014         158038         
 35          0.0     1580135.7       0.1X
+
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
new file mode 100644
index 000000000000..24e61052f561
--- /dev/null
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.execution.benchmark
+
+import scala.concurrent.duration._
+
+import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
+import org.apache.spark.sql.catalyst.util.CollationFactory
+import org.apache.spark.unsafe.types.UTF8String
+
+/**
+ * Benchmark to measure performance for comparisons between collated strings.
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt:
+ *      bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark"
+ *   3. generate result:
+ *      SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>"
+ *      Results will be written to "benchmarks/CollationBenchmark-results.txt".
+ * }}}
+ *
+ * Every benchmark case walks the full cross product of the input with itself and
+ * evaluates each pair `repetitionsPerPair` times, so the number of values reported
+ * to [[Benchmark]] is `size * size * repetitionsPerPair`.
+ */
+object CollationBenchmark extends BenchmarkBase {
+  private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI")
+
+  // How many times each pair of strings is evaluated inside one benchmark iteration.
+  // The loops were originally written as `(0 to 10)`, an inclusive range of 11 steps;
+  // the value is kept at 11 so the timed workload is unchanged.
+  private val repetitionsPerPair = 11
+
+  /**
+   * Builds `n` strings by cycling through a fixed pool of 45 short ASCII strings,
+   * many of which differ from each other only in letter case.
+   *
+   * @param n number of strings to produce
+   * @return a sequence of length `n` whose i-th element is the pool entry at `i % 45`
+   */
+  def generateSeqInput(n: Long): Seq[UTF8String] = {
+    // A Vector gives constant-time indexed access; the pool is indexed once per output element.
+    val pool = Vector("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def",
+      "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx",
+      "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi",
+      "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ")
+      .map(UTF8String.fromString)
+    // Reduce modulo the pool size while still a Long, then narrow to Int.
+    (0L until n).map(i => pool((i % pool.size).toInt))
+  }
+
+  /**
+   * Shared scaffolding for the three benchmarks: creates a [[Benchmark]] called `name`,
+   * registers one case per entry of `collationTypes` (each case invokes `body` with the
+   * collation fetched for that name), and runs it.
+   *
+   * The hot loop itself stays in the caller-supplied `body`, so this helper adds only
+   * one function call per benchmark iteration.
+   *
+   * @param name           benchmark title
+   * @param collationTypes collation names passed to `CollationFactory.fetchCollation`
+   * @param utf8Strings    input strings; used here only to compute the value count
+   * @param body           work to time for a single collation
+   */
+  private def runCollationBenchmark(
+      name: String,
+      collationTypes: Seq[String],
+      utf8Strings: Seq[UTF8String])(
+      body: CollationFactory.Collation => Unit): Unit = {
+    // Actual number of collation calls made by one invocation of `body`; computed as a
+    // Long so the product cannot overflow Int for larger inputs.
+    val callsPerIteration = utf8Strings.size.toLong * utf8Strings.size * repetitionsPerPair
+    val benchmark = new Benchmark(
+      name,
+      callsPerIteration,
+      warmupTime = 4.seconds,
+      output = output)
+    collationTypes.foreach { collationType =>
+      val collation = CollationFactory.fetchCollation(collationType)
+      benchmark.addCase(collationType) { _ => body(collation) }
+    }
+    benchmark.run()
+  }
+
+  /**
+   * Times `equalsFunction` of every collation in `collationTypes` over all ordered pairs
+   * of `utf8Strings`.
+   */
+  def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
+    runCollationBenchmark(
+      "collation unit benchmarks - equalsFunction", collationTypes, utf8Strings) { collation =>
+      utf8Strings.foreach { s1 =>
+        utf8Strings.foreach { s =>
+          (0 until repetitionsPerPair).foreach { _ =>
+            collation.equalsFunction(s, s1).booleanValue()
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Times `comparator.compare` of every collation in `collationTypes` over all ordered
+   * pairs of `utf8Strings`.
+   */
+  def benchmarkUTFStringCompare(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = {
+    runCollationBenchmark(
+      "collation unit benchmarks - compareFunction", collationTypes, utf8Strings) { collation =>
+      utf8Strings.foreach { s1 =>
+        utf8Strings.foreach { s =>
+          (0 until repetitionsPerPair).foreach { _ =>
+            collation.comparator.compare(s, s1)
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * Times `hashFunction` of every collation in `collationTypes`. The outer pass over
+   * `utf8Strings` ignores its element and only repeats the inner pass, so the number of
+   * calls matches the pairwise benchmarks.
+   */
+  def benchmarkUTFStringHashFunction(
+      collationTypes: Seq[String],
+      utf8Strings: Seq[UTF8String]): Unit = {
+    runCollationBenchmark(
+      "collation unit benchmarks - hashFunction", collationTypes, utf8Strings) { collation =>
+      utf8Strings.foreach { _ =>
+        utf8Strings.foreach { s =>
+          (0 until repetitionsPerPair).foreach { _ =>
+            collation.hashFunction.applyAsLong(s)
+          }
+        }
+      }
+    }
+  }
+
+  /** Runs the equals, compare and hash benchmarks over the same 10000 generated strings. */
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    // The generated input is deterministic, so build it once and share it.
+    val utf8Strings = generateSeqInput(10000L)
+    benchmarkUTFStringEquals(collationTypes, utf8Strings)
+    benchmarkUTFStringCompare(collationTypes, utf8Strings)
+    benchmarkUTFStringHashFunction(collationTypes, utf8Strings)
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to