This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 1b65b2aebf4e [SPARK-47416][SQL] Add new functions to CollationBenchmark
1b65b2aebf4e is described below

commit 1b65b2aebf4eacb005629f26a019cef66c454710
Author: Vladimir Golubev <vladimir.golu...@databricks.com>
AuthorDate: Wed Apr 17 23:25:34 2024 +0800

    [SPARK-47416][SQL] Add new functions to CollationBenchmark
    
    ### What changes were proposed in this pull request?
    Added new benchmarks for contains, startsWith, and endsWith prior to improving the implementation for the UTF8_BINARY_LCASE collation.
    
    ### Why are the changes needed?
    To see the exact improvements after the implementation of https://issues.apache.org/jira/browse/SPARK-47418
    
    ### Does this PR introduce _any_ user-facing change?
    No
    
    ### How was this patch tested?
    Ran the GHA 'Run Benchmarks' workflow on this change for both JDK 17 and JDK 21
    
    ### Was this patch authored or co-authored using generative AI tooling?
    No
    
    Closes #46078 from vladimirg-db/vladimirg-db/add-new-string-functions-to-collation-bencmark.
    
    Authored-by: Vladimir Golubev <vladimir.golu...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../CollationBenchmark-jdk21-results.txt           |  27 ++++
 sql/core/benchmarks/CollationBenchmark-results.txt |  27 ++++
 .../CollationNonASCIIBenchmark-jdk21-results.txt   |  27 ++++
 .../CollationNonASCIIBenchmark-results.txt         |  27 ++++
 .../execution/benchmark/CollationBenchmark.scala   | 141 +++++++++++++++++----
 5 files changed, 224 insertions(+), 25 deletions(-)

diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
index 32cbbc74e911..24605e051dbb 100644
--- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt
@@ -25,3 +25,30 @@ UNICODE                                          180133      
   180137
 UTF8_BINARY                                       10476          10477         
  1          0.0      104757.4       1.1X
 UNICODE_CI                                       148171         148190         
 28          0.0     1481705.6       0.1X
 
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - contains:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 49257          49280         
 32          0.0      492574.0       1.0X
+UNICODE                                           18253          18293         
 57          0.0      182530.8       2.7X
+UTF8_BINARY                                       20199          20247         
 68          0.0      201987.8       2.4X
+UNICODE_CI                                       882302         882576         
387          0.0     8823023.9       0.1X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - startsWith:   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 45015          45024         
 13          0.0      450153.7       1.0X
+UNICODE                                           17425          17455         
 43          0.0      174247.1       2.6X
+UTF8_BINARY                                       19237          19268         
 44          0.0      192371.4       2.3X
+UNICODE_CI                                       954993         955680         
971          0.0     9549930.3       0.0X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - endsWith:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 45919          45966         
 67          0.0      459187.0       1.0X
+UNICODE                                           17697          17713         
 23          0.0      176970.4       2.6X
+UTF8_BINARY                                       19448          19449         
  2          0.0      194479.6       2.4X
+UNICODE_CI                                       962916         963010         
133          0.0     9629158.5       0.0X
+
diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt
index 4028b0f005a3..a92aadc52ee2 100644
--- a/sql/core/benchmarks/CollationBenchmark-results.txt
+++ b/sql/core/benchmarks/CollationBenchmark-results.txt
@@ -25,3 +25,30 @@ UNICODE                                          171375      
   171435
 UTF8_BINARY                                       14012          14030         
 26          0.0      140116.7       1.3X
 UNICODE_CI                                       153847         153901         
 76          0.0     1538471.1       0.1X
 
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - contains:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 48528          48534         
  8          0.0      485281.3       1.0X
+UNICODE                                           17612          17628         
 23          0.0      176119.4       2.8X
+UTF8_BINARY                                       19664          19671         
 11          0.0      196636.4       2.5X
+UNICODE_CI                                       860919         862936        
2853          0.0     8609190.8       0.1X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - startsWith:   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 49520          49524         
  7          0.0      495195.4       1.0X
+UNICODE                                           18346          18346         
  0          0.0      183457.7       2.7X
+UTF8_BINARY                                       20483          20488         
  7          0.0      204828.7       2.4X
+UNICODE_CI                                       928756         930065        
1851          0.0     9287564.4       0.1X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - endsWith:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 49501          49504         
  5          0.0      495006.9       1.0X
+UNICODE                                           18052          18095         
 61          0.0      180523.7       2.7X
+UTF8_BINARY                                       20187          20197         
 15          0.0      201867.1       2.5X
+UNICODE_CI                                       934011         938842        
6833          0.0     9340109.8       0.1X
+
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
index dc68b747203f..0a50baab36ea 100644
--- a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt
@@ -25,3 +25,30 @@ UNICODE                                           38937      
    38952
 UTF8_BINARY                                        1376           1376         
  0          0.0       34397.5       6.7X
 UNICODE_CI                                        32881          32882         
  1          0.0      822027.4       0.3X
 
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - contains:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 22429          22438         
 13          0.0      560735.1       1.0X
+UNICODE                                            2900           2901         
  2          0.0       72503.2       7.7X
+UTF8_BINARY                                        3190           3198         
 11          0.0       79740.5       7.0X
+UNICODE_CI                                       166847         167278         
609          0.0     4171180.3       0.1X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - startsWith:   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 22865          22875         
 13          0.0      571636.3       1.0X
+UNICODE                                            3137           3137         
  0          0.0       78422.3       7.3X
+UTF8_BINARY                                        3448           3450         
  3          0.0       86188.5       6.6X
+UNICODE_CI                                       190473         190894         
595          0.0     4761831.2       0.1X
+
+OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - endsWith:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 23693          23695         
  3          0.0      592333.2       1.0X
+UNICODE                                            3170           3172         
  3          0.0       79243.5       7.5X
+UTF8_BINARY                                        3472           3473         
  2          0.0       86788.8       6.8X
+UNICODE_CI                                        63331          63603         
384          0.0     1583274.3       0.4X
+
diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
index bb58968764c7..bef5f9d7211f 100644
--- a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
+++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt
@@ -25,3 +25,30 @@ UNICODE                                           40410      
    40422
 UTF8_BINARY                                        2035           2035         
  1          0.0       50877.8       5.2X
 UNICODE_CI                                        31470          31493         
 32          0.0      786752.4       0.3X
 
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - contains:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 22342          22352         
 13          0.0      558560.4       1.0X
+UNICODE                                            3073           3074         
  0          0.0       76829.5       7.3X
+UTF8_BINARY                                        3486           3487         
  2          0.0       87147.6       6.4X
+UNICODE_CI                                       162838         164378        
2177          0.0     4070960.3       0.1X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - startsWith:   Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 21882          21890         
 11          0.0      547051.8       1.0X
+UNICODE                                            2672           2676         
  6          0.0       66799.0       8.2X
+UTF8_BINARY                                        3069           3071         
  2          0.0       76732.2       7.1X
+UNICODE_CI                                       187853         188724        
1232          0.0     4696336.1       0.1X
+
+OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure
+AMD EPYC 7763 64-Core Processor
+collation unit benchmarks - endsWith:     Best Time(ms)   Avg Time(ms)   
Stdev(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------------------------------
+UTF8_BINARY_LCASE                                 21818          21866         
 68          0.0      545439.9       1.0X
+UNICODE                                            2637           2643         
  9          0.0       65913.3       8.3X
+UTF8_BINARY                                        3037           3039         
  2          0.0       75934.6       7.2X
+UNICODE_CI                                        61372          61510         
195          0.0     1534307.9       0.4X
+
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
index 7a93c7c495e2..70ad8b9989c1 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.benchmark
 import scala.concurrent.duration._
 
 import org.apache.spark.benchmark.{Benchmark, BenchmarkBase}
-import org.apache.spark.sql.catalyst.util.CollationFactory
+import org.apache.spark.sql.catalyst.util.{CollationFactory, CollationSupport}
 import org.apache.spark.unsafe.types.UTF8String
 
 abstract class CollationBenchmarkBase extends BenchmarkBase {
@@ -36,18 +36,19 @@ abstract class CollationBenchmarkBase extends BenchmarkBase {
       utf8Strings.size * 10,
       warmupTime = 10.seconds,
       output = output)
-    collationTypes.foreach(collationType => {
+    collationTypes.foreach { collationType => {
       val collation = CollationFactory.fetchCollation(collationType)
       benchmark.addCase(s"$collationType") { _ =>
-        sublistStrings.foreach(s1 =>
-          utf8Strings.foreach(s =>
-            (0 to 10).foreach(_ =>
-              collation.equalsFunction(s, s1).booleanValue())
-          )
-        )
+        sublistStrings.foreach { s1 =>
+          utf8Strings.foreach { s =>
+            (0 to 10).foreach { _ =>
+              collation.equalsFunction(s, s1).booleanValue()
+            }
+          }
+        }
       }
     }
-    )
+    }
     benchmark.run()
   }
 
@@ -59,19 +60,19 @@ abstract class CollationBenchmarkBase extends BenchmarkBase {
       utf8Strings.size * 10,
       warmupTime = 10.seconds,
       output = output)
-    collationTypes.foreach(collationType => {
+    collationTypes.foreach { collationType => {
       val collation = CollationFactory.fetchCollation(collationType)
       benchmark.addCase(s"$collationType") { _ =>
-        sublistStrings.foreach(s1 =>
-          utf8Strings.foreach(s =>
-            (0 to 10).foreach(_ =>
+        sublistStrings.foreach { s1 =>
+          utf8Strings.foreach { s =>
+            (0 to 10).foreach { _ =>
               collation.comparator.compare(s, s1)
-            )
-          )
-        )
+            }
+          }
+        }
       }
     }
-    )
+    }
     benchmark.run()
   }
 
@@ -85,19 +86,103 @@ abstract class CollationBenchmarkBase extends BenchmarkBase {
       utf8Strings.size * 10,
       warmupTime = 10.seconds,
       output = output)
-    collationTypes.foreach(collationType => {
+    collationTypes.foreach { collationType => {
       val collation = CollationFactory.fetchCollation(collationType)
       benchmark.addCase(s"$collationType") { _ =>
-        sublistStrings.foreach(_ =>
-          utf8Strings.foreach(s =>
-            (0 to 10).foreach(_ =>
+        sublistStrings.foreach { _ =>
+          utf8Strings.foreach { s =>
+            (0 to 10).foreach { _ =>
               collation.hashFunction.applyAsLong(s)
-            )
-          )
-        )
+            }
+          }
+        }
+      }
+    }
+    }
+    benchmark.run()
+  }
+
+  def benchmarkContains(
+      collationTypes: Seq[String],
+      utf8Strings: Seq[UTF8String]): Unit = {
+    val sublistStrings = utf8Strings
+
+    val benchmark = new Benchmark(
+      "collation unit benchmarks - contains",
+      utf8Strings.size * 10,
+      warmupTime = 10.seconds,
+      output = output)
+    collationTypes.foreach { collationType => {
+      val collation = CollationFactory.fetchCollation(collationType)
+      benchmark.addCase(s"$collationType") { _ =>
+        sublistStrings.foreach { s1 =>
+          utf8Strings.foreach { s =>
+            (0 to 10).foreach { _ =>
+              CollationSupport.Contains.exec(
+                s, s1, CollationFactory.collationNameToId(collation.collationName)
+              )
+            }
+          }
+        }
       }
     }
-    )
+    }
+    benchmark.run()
+  }
+
+  def benchmarkStartsWith(
+      collationTypes: Seq[String],
+      utf8Strings: Seq[UTF8String]): Unit = {
+    val sublistStrings = utf8Strings
+
+    val benchmark = new Benchmark(
+      "collation unit benchmarks - startsWith",
+      utf8Strings.size * 10,
+      warmupTime = 10.seconds,
+      output = output)
+    collationTypes.foreach { collationType => {
+      val collation = CollationFactory.fetchCollation(collationType)
+      benchmark.addCase(s"$collationType") { _ =>
+        sublistStrings.foreach { s1 =>
+          utf8Strings.foreach { s =>
+            (0 to 10).foreach { _ =>
+              CollationSupport.StartsWith.exec(
+                s, s1, CollationFactory.collationNameToId(collation.collationName)
+              )
+            }
+          }
+        }
+      }
+    }
+    }
+    benchmark.run()
+  }
+
+  def benchmarkEndsWith(
+      collationTypes: Seq[String],
+      utf8Strings: Seq[UTF8String]): Unit = {
+    val sublistStrings = utf8Strings
+
+    val benchmark = new Benchmark(
+      "collation unit benchmarks - endsWith",
+      utf8Strings.size * 10,
+      warmupTime = 10.seconds,
+      output = output)
+    collationTypes.foreach { collationType => {
+      val collation = CollationFactory.fetchCollation(collationType)
+      benchmark.addCase(s"$collationType") { _ =>
+        sublistStrings.foreach { s1 =>
+          utf8Strings.foreach { s =>
+            (0 to 10).foreach { _ =>
+              CollationSupport.EndsWith.exec(
+                s, s1, CollationFactory.collationNameToId(collation.collationName)
+              )
+            }
+          }
+        }
+      }
+    }
+    }
     benchmark.run()
   }
 }
@@ -130,6 +215,9 @@ object CollationBenchmark extends CollationBenchmarkBase {
     benchmarkUTFStringEquals(collationTypes, generateSeqInput(10000L))
     benchmarkUTFStringCompare(collationTypes, generateSeqInput(10000L))
     benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(10000L))
+    benchmarkContains(collationTypes, generateSeqInput(10000L))
+    benchmarkStartsWith(collationTypes, generateSeqInput(10000L))
+    benchmarkEndsWith(collationTypes, generateSeqInput(10000L))
   }
 }
 
@@ -155,5 +243,8 @@ object CollationNonASCIIBenchmark extends CollationBenchmarkBase {
     benchmarkUTFStringEquals(collationTypes, generateSeqInput(4000L))
     benchmarkUTFStringCompare(collationTypes, generateSeqInput(4000L))
     benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(4000L))
+    benchmarkContains(collationTypes, generateSeqInput(4000L))
+    benchmarkStartsWith(collationTypes, generateSeqInput(4000L))
+    benchmarkEndsWith(collationTypes, generateSeqInput(4000L))
   }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to