This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 1b65b2aebf4e [SPARK-47416][SQL] Add new functions to CollationBenchmark 1b65b2aebf4e is described below commit 1b65b2aebf4eacb005629f26a019cef66c454710 Author: Vladimir Golubev <vladimir.golu...@databricks.com> AuthorDate: Wed Apr 17 23:25:34 2024 +0800 [SPARK-47416][SQL] Add new functions to CollationBenchmark ### What changes were proposed in this pull request? Added new benchmarks for contains, startsWith, endsWith prior to improving the implementation for the UTF8_BINARY_LCASE collation. ### Why are the changes needed? To see exact improvements after the implementation of https://issues.apache.org/jira/browse/SPARK-47418 ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? GHA 'Run Benchmarks' ran on this, for both JDK 17 and JDK 21 ### Was this patch authored or co-authored using generative AI tooling? No Closes #46078 from vladimirg-db/vladimirg-db/add-new-string-functions-to-collation-bencmark. Authored-by: Vladimir Golubev <vladimir.golu...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../CollationBenchmark-jdk21-results.txt | 27 ++++ sql/core/benchmarks/CollationBenchmark-results.txt | 27 ++++ .../CollationNonASCIIBenchmark-jdk21-results.txt | 27 ++++ .../CollationNonASCIIBenchmark-results.txt | 27 ++++ .../execution/benchmark/CollationBenchmark.scala | 141 +++++++++++++++++---- 5 files changed, 224 insertions(+), 25 deletions(-) diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt index 32cbbc74e911..24605e051dbb 100644 --- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -25,3 +25,30 @@ UNICODE 180133 180137 UTF8_BINARY 10476 10477 1 0.0 104757.4 1.1X UNICODE_CI 148171 148190 28 0.0 1481705.6 0.1X +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 49257 49280 32 0.0 492574.0 1.0X +UNICODE 18253 18293 57 0.0 182530.8 2.7X +UTF8_BINARY 20199 20247 68 0.0 201987.8 2.4X +UNICODE_CI 882302 882576 387 0.0 8823023.9 0.1X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 45015 45024 13 0.0 450153.7 1.0X +UNICODE 17425 17455 43 0.0 174247.1 2.6X +UTF8_BINARY 19237 19268 44 0.0 192371.4 2.3X +UNICODE_CI 954993 955680 971 0.0 9549930.3 0.0X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 45919 45966 67 0.0 459187.0 1.0X +UNICODE 17697 17713 23 0.0 176970.4 2.6X +UTF8_BINARY 19448 19449 2 0.0 194479.6 2.4X +UNICODE_CI 962916 963010 133 0.0 9629158.5 0.0X + diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt index 4028b0f005a3..a92aadc52ee2 100644 --- a/sql/core/benchmarks/CollationBenchmark-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-results.txt @@ -25,3 +25,30 @@ UNICODE 171375 171435 UTF8_BINARY 14012 14030 26 0.0 140116.7 1.3X UNICODE_CI 153847 153901 76 0.0 1538471.1 0.1X +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 48528 48534 8 0.0 485281.3 1.0X +UNICODE 17612 17628 23 0.0 176119.4 2.8X +UTF8_BINARY 19664 19671 11 0.0 196636.4 2.5X +UNICODE_CI 860919 862936 2853 0.0 8609190.8 0.1X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 49520 49524 7 0.0 495195.4 1.0X +UNICODE 18346 18346 0 0.0 183457.7 2.7X +UTF8_BINARY 20483 20488 7 0.0 204828.7 2.4X +UNICODE_CI 928756 930065 1851 0.0 9287564.4 0.1X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 49501 49504 5 0.0 495006.9 1.0X +UNICODE 18052 18095 61 0.0 180523.7 2.7X +UTF8_BINARY 20187 20197 15 0.0 201867.1 2.5X +UNICODE_CI 934011 938842 6833 0.0 9340109.8 0.1X + diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt index dc68b747203f..0a50baab36ea 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt @@ -25,3 +25,30 @@ UNICODE 38937 38952 UTF8_BINARY 1376 1376 0 0.0 34397.5 6.7X UNICODE_CI 32881 32882 1 0.0 822027.4 0.3X +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 22429 22438 13 0.0 560735.1 1.0X +UNICODE 2900 2901 2 0.0 72503.2 7.7X +UTF8_BINARY 3190 3198 11 0.0 79740.5 7.0X +UNICODE_CI 166847 167278 609 0.0 4171180.3 0.1X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 22865 22875 13 0.0 571636.3 1.0X +UNICODE 3137 3137 0 0.0 78422.3 7.3X +UTF8_BINARY 3448 3450 3 0.0 86188.5 6.6X +UNICODE_CI 190473 190894 595 0.0 4761831.2 0.1X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 23693 23695 3 0.0 592333.2 1.0X +UNICODE 3170 3172 3 0.0 79243.5 7.5X +UTF8_BINARY 3472 3473 2 0.0 86788.8 6.8X +UNICODE_CI 63331 63603 384 0.0 1583274.3 0.4X + diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt index bb58968764c7..bef5f9d7211f 100644 --- a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt @@ -25,3 +25,30 @@ UNICODE 40410 40422 UTF8_BINARY 2035 2035 1 0.0 50877.8 5.2X UNICODE_CI 31470 31493 32 0.0 786752.4 0.3X +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - contains: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 22342 22352 13 0.0 558560.4 1.0X +UNICODE 3073 3074 0 0.0 76829.5 7.3X +UTF8_BINARY 3486 3487 2 0.0 87147.6 6.4X +UNICODE_CI 162838 164378 2177 0.0 4070960.3 0.1X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - startsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 21882 21890 11 0.0 547051.8 1.0X +UNICODE 2672 2676 6 0.0 66799.0 8.2X +UTF8_BINARY 3069 3071 2 0.0 76732.2 7.1X +UNICODE_CI 187853 188724 1232 0.0 4696336.1 0.1X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - endsWith: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 21818 21866 68 0.0 545439.9 1.0X +UNICODE 2637 2643 9 0.0 65913.3 8.3X +UTF8_BINARY 3037 3039 2 0.0 75934.6 7.2X +UNICODE_CI 61372 61510 195 0.0 1534307.9 0.4X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala index 7a93c7c495e2..70ad8b9989c1 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala @@ -19,7 +19,7 @@ package org.apache.spark.sql.execution.benchmark import scala.concurrent.duration._ import org.apache.spark.benchmark.{Benchmark, BenchmarkBase} -import org.apache.spark.sql.catalyst.util.CollationFactory +import org.apache.spark.sql.catalyst.util.{CollationFactory, CollationSupport} import org.apache.spark.unsafe.types.UTF8String abstract class CollationBenchmarkBase extends BenchmarkBase { @@ -36,18 +36,19 @@ abstract class CollationBenchmarkBase extends BenchmarkBase { utf8Strings.size * 10, warmupTime = 10.seconds, output = output) - collationTypes.foreach(collationType => { + collationTypes.foreach { collationType => { val collation = CollationFactory.fetchCollation(collationType) benchmark.addCase(s"$collationType") { _ => - sublistStrings.foreach(s1 => - utf8Strings.foreach(s => - (0 to 10).foreach(_ => - collation.equalsFunction(s, s1).booleanValue()) - ) - ) + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 10).foreach { _ => + collation.equalsFunction(s, s1).booleanValue() + } + } + } } } - ) + } benchmark.run() } @@ -59,19 +60,19 @@ abstract class CollationBenchmarkBase extends BenchmarkBase { utf8Strings.size * 10, warmupTime = 10.seconds, output = output) - collationTypes.foreach(collationType => { + collationTypes.foreach { collationType => { val collation = CollationFactory.fetchCollation(collationType) benchmark.addCase(s"$collationType") { _ => - sublistStrings.foreach(s1 => - utf8Strings.foreach(s => - (0 to 10).foreach(_ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 10).foreach { _ => collation.comparator.compare(s, s1) - ) - ) - ) + } + } + } } } - ) + } benchmark.run() } @@ -85,19 +86,103 @@ abstract class CollationBenchmarkBase extends BenchmarkBase { utf8Strings.size * 10, warmupTime = 10.seconds, output = output) - collationTypes.foreach(collationType => { + collationTypes.foreach { collationType => { val collation = CollationFactory.fetchCollation(collationType) benchmark.addCase(s"$collationType") { _ => - sublistStrings.foreach(_ => - utf8Strings.foreach(s => - (0 to 10).foreach(_ => + sublistStrings.foreach { _ => + utf8Strings.foreach { s => + (0 to 10).foreach { _ => collation.hashFunction.applyAsLong(s) - ) - ) - ) + } + } + } + } + } + } + benchmark.run() + } + + def benchmarkContains( + collationTypes: Seq[String], + utf8Strings: Seq[UTF8String]): Unit = { + val sublistStrings = utf8Strings + + val benchmark = new Benchmark( + "collation unit benchmarks - contains", + utf8Strings.size * 10, + warmupTime = 10.seconds, + output = output) + collationTypes.foreach { collationType => { + val collation = CollationFactory.fetchCollation(collationType) + benchmark.addCase(s"$collationType") { _ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 10).foreach { _ => + CollationSupport.Contains.exec( + s, s1, CollationFactory.collationNameToId(collation.collationName) + ) + } + } + } } } - ) + } + benchmark.run() + } + + def benchmarkStartsWith( + collationTypes: Seq[String], + utf8Strings: Seq[UTF8String]): Unit = { + val sublistStrings = utf8Strings + + val benchmark = new Benchmark( + "collation unit benchmarks - startsWith", + utf8Strings.size * 10, + warmupTime = 10.seconds, + output = output) + collationTypes.foreach { collationType => { + val collation = CollationFactory.fetchCollation(collationType) + benchmark.addCase(s"$collationType") { _ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 10).foreach { _ => + CollationSupport.StartsWith.exec( + s, s1, CollationFactory.collationNameToId(collation.collationName) + ) + } + } + } + } + } + } + benchmark.run() + } + + def benchmarkEndsWith( + collationTypes: Seq[String], + utf8Strings: Seq[UTF8String]): Unit = { + val sublistStrings = utf8Strings + + val benchmark = new Benchmark( + "collation unit benchmarks - endsWith", + utf8Strings.size * 10, + warmupTime = 10.seconds, + output = output) + collationTypes.foreach { collationType => { + val collation = CollationFactory.fetchCollation(collationType) + benchmark.addCase(s"$collationType") { _ => + sublistStrings.foreach { s1 => + utf8Strings.foreach { s => + (0 to 10).foreach { _ => + CollationSupport.EndsWith.exec( + s, s1, CollationFactory.collationNameToId(collation.collationName) + ) + } + } + } + } + } + } benchmark.run() } } @@ -130,6 +215,9 @@ object CollationBenchmark extends CollationBenchmarkBase { benchmarkUTFStringEquals(collationTypes, generateSeqInput(10000L)) benchmarkUTFStringCompare(collationTypes, generateSeqInput(10000L)) benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(10000L)) + benchmarkContains(collationTypes, generateSeqInput(10000L)) + benchmarkStartsWith(collationTypes, generateSeqInput(10000L)) + benchmarkEndsWith(collationTypes, generateSeqInput(10000L)) } } @@ -155,5 +243,8 @@ object CollationNonASCIIBenchmark extends CollationBenchmarkBase { benchmarkUTFStringEquals(collationTypes, generateSeqInput(4000L)) benchmarkUTFStringCompare(collationTypes, generateSeqInput(4000L)) benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(4000L)) + benchmarkContains(collationTypes, generateSeqInput(4000L)) + benchmarkStartsWith(collationTypes, generateSeqInput(4000L)) + benchmarkEndsWith(collationTypes, generateSeqInput(4000L)) } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org