This is an automated email from the ASF dual-hosted git repository. gurwls223 pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 93a264d [SPARK-27535][SQL][TEST] Date and timestamp JSON benchmarks 93a264d is described below commit 93a264d05a55c2617d34e977dbaf182987187a27 Author: Maxim Gekk <max.g...@gmail.com> AuthorDate: Tue Apr 23 11:09:14 2019 +0900 [SPARK-27535][SQL][TEST] Date and timestamp JSON benchmarks ## What changes were proposed in this pull request? Added new JSON benchmarks related to date and timestamp operations: - Write dates/timestamps to JSON files - `to_json()` and `from_json()` for dates and timestamps - Read dates/timestamps from JSON files, and infer schemas - Parse and infer schemas from `Dataset[String]` Also, existing JSON benchmarks are ported to the `NoOp` datasource. Closes #24430 from MaxGekk/json-datetime-benchmark. Authored-by: Maxim Gekk <max.g...@gmail.com> Signed-off-by: HyukjinKwon <gurwls...@apache.org> --- sql/core/benchmarks/JSONBenchmark-results.txt | 79 +++++++++---- .../execution/datasources/json/JsonBenchmark.scala | 126 ++++++++++++++++++++- 2 files changed, 179 insertions(+), 26 deletions(-) diff --git a/sql/core/benchmarks/JSONBenchmark-results.txt b/sql/core/benchmarks/JSONBenchmark-results.txt index 2b784c3..7846983 100644 --- a/sql/core/benchmarks/JSONBenchmark-results.txt +++ b/sql/core/benchmarks/JSONBenchmark-results.txt @@ -7,77 +7,106 @@ Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 51280 51722 420 2.0 512.8 1.0X -UTF-8 is set 75009 77276 1963 1.3 750.1 0.7X +No encoding 50949 51086 150 2.0 509.5 1.0X +UTF-8 is set 72012 72147 120 1.4 720.1 0.7X Preparing data for benchmarking ... 
Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 39675 39738 83 2.5 396.7 1.0X -UTF-8 is set 62755 64399 1436 1.6 627.5 0.6X +No encoding 36799 36891 80 2.7 368.0 1.0X +UTF-8 is set 59796 59880 74 1.7 598.0 0.6X Preparing data for benchmarking ... Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 56429 56468 65 0.2 5642.9 1.0X -UTF-8 is set 81078 81454 374 0.1 8107.8 0.7X +No encoding 55803 55967 152 0.2 5580.3 1.0X +UTF-8 is set 80623 80825 178 0.1 8062.3 0.7X Preparing data for benchmarking ... Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 95329 95557 265 0.0 190658.2 1.0X -UTF-8 is set 102827 102967 166 0.0 205654.2 0.9X +No encoding 84263 85750 1476 0.0 168526.2 1.0X +UTF-8 is set 98848 100183 1592 0.0 197696.0 0.9X Preparing data for benchmarking ... 
Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 14102 14136 52 0.7 1410.2 1.0X -Select 1 column 17487 17537 51 0.6 1748.7 0.8X +Select 10 columns 13930 13996 60 0.7 1393.0 1.0X +Select 1 column 17092 17394 360 0.6 1709.2 0.8X Preparing data for benchmarking ... Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 6013 6066 70 1.7 601.3 1.0X -Short column with UTF-8 8031 8079 45 1.2 803.1 0.7X -Wide column without encoding 107093 108539 NaN 0.1 10709.3 0.1X -Wide column with UTF-8 130983 132518 1346 0.1 13098.3 0.0X +Short column without encoding 5596 5711 101 1.8 559.6 1.0X +Short column with UTF-8 7983 8158 160 1.3 798.3 0.7X +Wide column without encoding 110189 118451 NaN 0.1 11018.9 0.1X +Wide column with UTF-8 137827 142813 NaN 0.1 13782.7 0.0X Preparing data for benchmarking ... 
Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 939 950 11 10.6 93.9 1.0X -from_json 12924 12944 26 0.8 1292.4 0.1X -json_tuple 15312 15771 432 0.7 1531.2 0.1X -get_json_object 13049 13475 714 0.8 1304.9 0.1X +Text read 951 953 2 10.5 95.1 1.0X +from_json 13015 13045 27 0.8 1301.5 0.1X +json_tuple 16257 16306 43 0.6 1625.7 0.1X +get_json_object 13195 13225 39 0.8 1319.5 0.1X Preparing data for benchmarking ... Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 4556 4630 108 11.0 91.1 1.0X -schema inferring 23624 24338 626 2.1 472.5 0.2X -parsing 22342 22420 81 2.2 446.8 0.2X +Text read 4632 4687 49 10.8 92.6 1.0X +schema inferring 29176 29297 146 1.7 583.5 0.2X +parsing 24268 24457 175 2.1 485.4 0.2X Preparing data for benchmarking ... 
Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 7537 7556 26 6.6 150.7 1.0X -Schema inferring 27875 28306 499 1.8 557.5 0.3X -Parsing without charset 26030 26083 67 1.9 520.6 0.3X -Parsing with UTF-8 37115 37480 392 1.3 742.3 0.2X +Text read 8264 8272 7 6.1 165.3 1.0X +Schema inferring 31910 32375 543 1.6 638.2 0.3X +Parsing without charset 29290 29397 124 1.7 585.8 0.3X +Parsing with UTF-8 41301 41390 81 1.2 826.0 0.2X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Create a dataset of timestamps 1149 1160 11 8.7 114.9 1.0X +to_json(timestamp) 11585 11688 140 0.9 1158.5 0.1X +write timestamps to files 10212 10260 49 1.0 1021.2 0.1X +Create a dataset of dates 1322 1328 10 7.6 132.2 0.9X +to_json(date) 7226 7241 14 1.4 722.6 0.2X +write dates to files 5634 5648 20 1.8 563.4 0.2X + +Java HotSpot(TM) 64-Bit Server VM 1.8.0_202-b08 on Mac OS X 10.14.4 +Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +read timestamp text from files 2097 2137 41 4.8 209.7 1.0X +read timestamps from files 20438 20451 11 0.5 2043.8 0.1X +infer timestamps from files 41694 41770 66 0.2 4169.4 0.1X +read date text from files 1832 1847 16 5.5 183.2 1.1X +read date from files 13796 13837 49 0.7 1379.6 0.2X +timestamp 
strings 3213 3233 26 3.1 321.3 0.7X +parse timestamps from Dataset[String] 22686 22743 53 0.4 2268.6 0.1X +infer timestamps from Dataset[String] 45301 45368 58 0.2 4530.1 0.0X +date strings 3431 3439 7 2.9 343.1 0.6X +parse dates from Dataset[String] 17688 17734 41 0.6 1768.8 0.1X +from_json(timestamp) 33439 33456 24 0.3 3343.9 0.1X +from_json(date) 24055 24164 107 0.4 2405.5 0.1X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala index f9e867b..f486e60 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala @@ -16,10 +16,13 @@ */ package org.apache.spark.sql.execution.datasources.json +import java.io.File +import java.time.{Instant, LocalDate} + import org.apache.spark.benchmark.Benchmark import org.apache.spark.sql.{Dataset, Row} import org.apache.spark.sql.execution.benchmark.SqlBasedBenchmark -import org.apache.spark.sql.functions.{from_json, get_json_object, json_tuple, lit} +import org.apache.spark.sql.functions._ import org.apache.spark.sql.types._ /** @@ -374,6 +377,126 @@ object JSONBenchmark extends SqlBasedBenchmark { } } + private def datetimeBenchmark(rowsNum: Int, numIters: Int): Unit = { + def timestamps = { + spark.range(0, rowsNum, 1, 1).mapPartitions { iter => + iter.map(Instant.ofEpochSecond(_)) + }.select($"value".as("timestamp")) + } + + def dates = { + spark.range(0, rowsNum, 1, 1).mapPartitions { iter => + iter.map(d => LocalDate.ofEpochDay(d % (100 * 365))) + }.select($"value".as("date")) + } + + withTempPath { path => + + val timestampDir = new File(path, "timestamp").getAbsolutePath + val dateDir = new File(path, "date").getAbsolutePath + + val writeBench = new Benchmark("Write dates and timestamps", rowsNum, output = output) + 
writeBench.addCase(s"Create a dataset of timestamps", numIters) { _ => + run(timestamps) + } + + writeBench.addCase("to_json(timestamp)", numIters) { _ => + run(timestamps.select(to_json(struct($"timestamp")))) + } + + writeBench.addCase("write timestamps to files", numIters) { _ => + timestamps.write.option("header", true).mode("overwrite").json(timestampDir) + } + + writeBench.addCase("Create a dataset of dates", numIters) { _ => + run(dates) + } + + writeBench.addCase("to_json(date)", numIters) { _ => + run(dates.select(to_json(struct($"date")))) + } + + writeBench.addCase("write dates to files", numIters) { _ => + dates.write.option("header", true).mode("overwrite").json(dateDir) + } + + writeBench.run() + + val readBench = new Benchmark("Read dates and timestamps", rowsNum, output = output) + val tsSchema = new StructType().add("timestamp", TimestampType) + + readBench.addCase("read timestamp text from files", numIters) { _ => + run(spark.read.text(timestampDir)) + } + + readBench.addCase("read timestamps from files", numIters) { _ => + run(spark.read.schema(tsSchema).json(timestampDir)) + } + + readBench.addCase("infer timestamps from files", numIters) { _ => + run(spark.read.json(timestampDir)) + } + + val dateSchema = new StructType().add("date", DateType) + + readBench.addCase("read date text from files", numIters) { _ => + run(spark.read.text(dateDir)) + } + + readBench.addCase("read date from files", numIters) { _ => + run(spark.read.schema(dateSchema).json(dateDir)) + } + + def timestampStr: Dataset[String] = { + spark.range(0, rowsNum, 1, 1).mapPartitions { iter => + iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${100 + i % 100}Z"}""") + }.select($"value".as("timestamp")).as[String] + } + + readBench.addCase("timestamp strings", numIters) { _ => + run(timestampStr) + } + + readBench.addCase("parse timestamps from Dataset[String]", numIters) { _ => + run(spark.read.schema(tsSchema).json(timestampStr)) + } + + readBench.addCase("infer timestamps 
from Dataset[String]", numIters) { _ => + run(spark.read.json(timestampStr)) + } + + def dateStr: Dataset[String] = { + spark.range(0, rowsNum, 1, 1).mapPartitions { iter => + iter.map(i => s"""{"date":"${LocalDate.ofEpochDay(i % 1000 * 365).toString}"}""") + }.select($"value".as("date")).as[String] + } + + readBench.addCase("date strings", numIters) { _ => + run(dateStr) + } + + readBench.addCase("parse dates from Dataset[String]", numIters) { _ => + val ds = spark.read + .option("header", false) + .schema(dateSchema) + .json(dateStr) + run(ds) + } + + readBench.addCase("from_json(timestamp)", numIters) { _ => + val ds = timestampStr.select(from_json($"timestamp", tsSchema, Map.empty[String, String])) + run(ds) + } + + readBench.addCase("from_json(date)", numIters) { _ => + val ds = dateStr.select(from_json($"date", dateSchema, Map.empty[String, String])) + run(ds) + } + + readBench.run() + } + } + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { val numIters = 3 runBenchmark("Benchmark for performance of JSON parsing") { @@ -386,6 +509,7 @@ object JSONBenchmark extends SqlBasedBenchmark { jsonFunctions(10 * 1000 * 1000, numIters) jsonInDS(50 * 1000 * 1000, numIters) jsonInFile(50 * 1000 * 1000, numIters) + datetimeBenchmark(rowsNum = 10 * 1000 * 1000, numIters) } } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org