This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 2a9280c [SPARK-31932][SQL][TESTS] Add date/timestamp benchmarks for `HiveResult.hiveResultString()` 2a9280c is described below commit 2a9280ca4a6610bec0453ced7ed12174f8f43e5e Author: Max Gekk <max.g...@gmail.com> AuthorDate: Tue Jun 9 04:59:41 2020 +0000 [SPARK-31932][SQL][TESTS] Add date/timestamp benchmarks for `HiveResult.hiveResultString()` ### What changes were proposed in this pull request? Add benchmarks for `HiveResult.hiveResultString()/toHiveString()` to measure throughput of `toHiveString` for the date/timestamp types: - java.sql.Date/Timestamp - java.time.Instant - java.time.LocalDate Benchmark results were generated in the environment: | Item | Description | | ---- | ---- | | Region | us-west-2 (Oregon) | | Instance | r3.xlarge | | AMI | ubuntu/images/hvm-ssd/ubuntu-bionic-18.04-amd64-server-20190722.1 (ami-06f2f779464715dc5) | | Java | OpenJDK 64-Bit Server VM 1.8.0_242 and OpenJDK 64-Bit Server VM 11.0.6+10 | ### Why are the changes needed? To detect perf regressions of `toHiveString` in the future. ### Does this PR introduce _any_ user-facing change? No ### How was this patch tested? By running `DateTimeBenchmark` and checking dataset content. Closes #28757 from MaxGekk/benchmark-toHiveString. 
Authored-by: Max Gekk <max.g...@gmail.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit ddd8d5f5a0b6db17babc201ba4b73f7df91df1a3) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../benchmarks/DateTimeBenchmark-jdk11-results.txt | 4 ++ sql/core/benchmarks/DateTimeBenchmark-results.txt | 4 ++ .../execution/benchmark/DateTimeBenchmark.scala | 46 ++++++++++++++++++---- 3 files changed, 46 insertions(+), 8 deletions(-) diff --git a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt index f4ed8ce..70d8882 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-jdk11-results.txt @@ -453,5 +453,9 @@ From java.time.Instant 325 328 Collect longs 1300 1321 25 3.8 260.0 0.3X Collect java.sql.Timestamp 1450 1557 102 3.4 290.0 0.3X Collect java.time.Instant 1499 1599 87 3.3 299.9 0.3X +java.sql.Date to Hive string 17536 18367 1059 0.3 3507.2 0.0X +java.time.LocalDate to Hive string 12089 12897 725 0.4 2417.8 0.0X +java.sql.Timestamp to Hive string 48014 48625 752 0.1 9602.9 0.0X +java.time.Instant to Hive string 37346 37445 93 0.1 7469.1 0.0X diff --git a/sql/core/benchmarks/DateTimeBenchmark-results.txt b/sql/core/benchmarks/DateTimeBenchmark-results.txt index 7a9aa4b..0795f11 100644 --- a/sql/core/benchmarks/DateTimeBenchmark-results.txt +++ b/sql/core/benchmarks/DateTimeBenchmark-results.txt @@ -453,5 +453,9 @@ From java.time.Instant 236 243 Collect longs 1280 1337 79 3.9 256.1 0.3X Collect java.sql.Timestamp 1485 1501 15 3.4 297.0 0.3X Collect java.time.Instant 1441 1465 37 3.5 288.1 0.3X +java.sql.Date to Hive string 18745 20895 1364 0.3 3749.0 0.0X +java.time.LocalDate to Hive string 15296 15450 143 0.3 3059.2 0.0X +java.sql.Timestamp to Hive string 46421 47210 946 0.1 9284.2 0.0X +java.time.Instant to Hive string 34747 35187 382 0.1 6949.4 0.0X diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala index f56efa3..c7b8737 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DateTimeBenchmark.scala @@ -21,8 +21,10 @@ import java.sql.{Date, Timestamp} import java.time.{Instant, LocalDate} import org.apache.spark.benchmark.Benchmark +import org.apache.spark.sql.Dataset import org.apache.spark.sql.catalyst.util.DateTimeConstants.MILLIS_PER_DAY import org.apache.spark.sql.catalyst.util.DateTimeTestUtils.{withDefaultTimeZone, LA} +import org.apache.spark.sql.execution.HiveResult import org.apache.spark.sql.internal.SQLConf /** @@ -182,14 +184,19 @@ object DateTimeBenchmark extends SqlBasedBenchmark { benchmark.addCase("From java.time.LocalDate", numIters) { _ => spark.range(rowsNum).map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)).noop() } + def dates = { + spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis)) + } benchmark.addCase("Collect java.sql.Date", numIters) { _ => - spark.range(0, rowsNum, 1, 1).map(millis => new Date(millis)).collect() + dates.collect() + } + def localDates = { + spark.range(0, rowsNum, 1, 1) + .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)) } benchmark.addCase("Collect java.time.LocalDate", numIters) { _ => withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { - spark.range(0, rowsNum, 1, 1) - .map(millis => LocalDate.ofEpochDay(millis / MILLIS_PER_DAY)) - .collect() + localDates.collect() } } benchmark.addCase("From java.sql.Timestamp", numIters) { _ => @@ -202,14 +209,37 @@ object DateTimeBenchmark extends SqlBasedBenchmark { spark.range(0, rowsNum, 1, 1) .collect() } + def timestamps = { + spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis)) + } benchmark.addCase("Collect 
java.sql.Timestamp", numIters) { _ => - spark.range(0, rowsNum, 1, 1).map(millis => new Timestamp(millis)).collect() + timestamps.collect() + } + def instants = { + spark.range(0, rowsNum, 1, 1).map(millis => Instant.ofEpochMilli(millis)) } benchmark.addCase("Collect java.time.Instant", numIters) { _ => withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { - spark.range(0, rowsNum, 1, 1) - .map(millis => Instant.ofEpochMilli(millis)) - .collect() + instants.collect() + } + } + def toHiveString(df: Dataset[_]): Unit = { + HiveResult.hiveResultString(df.queryExecution.executedPlan) + } + benchmark.addCase("java.sql.Date to Hive string", numIters) { _ => + toHiveString(dates) + } + benchmark.addCase("java.time.LocalDate to Hive string", numIters) { _ => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { + toHiveString(localDates) + } + } + benchmark.addCase("java.sql.Timestamp to Hive string", numIters) { _ => + toHiveString(timestamps) + } + benchmark.addCase("java.time.Instant to Hive string", numIters) { _ => + withSQLConf(SQLConf.DATETIME_JAVA8API_ENABLED.key -> "true") { + toHiveString(instants) } } benchmark.run() --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org