This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch branch-3.0 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.0 by this push: new 6fe52ad [SPARK-31414][SQL] Fix performance regression with new TimestampFormatter for json and csv time parsing 6fe52ad is described below commit 6fe52ad0a7eafe4293338a075ee25917127d4497 Author: Kent Yao <yaooq...@hotmail.com> AuthorDate: Mon Apr 13 03:11:28 2020 +0000 [SPARK-31414][SQL] Fix performance regression with new TimestampFormatter for json and csv time parsing With the original benchmark, where the timestamp values are valid for the new parser, the result is ```scala [info] Running benchmark: Read dates and timestamps [info] Running case: timestamp strings [info] Stopped after 3 iterations, 5781 ms [info] Running case: parse timestamps from Dataset[String] [info] Stopped after 3 iterations, 44764 ms [info] Running case: infer timestamps from Dataset[String] [info] Stopped after 3 iterations, 93764 ms [info] Running case: from_json(timestamp) [info] Stopped after 3 iterations, 59021 ms ``` When we modify the benchmark to ```scala def timestampStr: Dataset[String] = { spark.range(0, rowsNum, 1, 1).mapPartitions { iter => iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${i % 100}"}""") }.select($"value".as("timestamp")).as[String] } readBench.addCase("timestamp strings", numIters) { _ => timestampStr.noop() } readBench.addCase("parse timestamps from Dataset[String]", numIters) { _ => spark.read.schema(tsSchema).json(timestampStr).noop() } readBench.addCase("infer timestamps from Dataset[String]", numIters) { _ => spark.read.json(timestampStr).noop() } ``` where the timestamp values are invalid for the new parser, which causes a fallback to the legacy parser (2.4),
the result is ```scala [info] Running benchmark: Read dates and timestamps [info] Running case: timestamp strings [info] Stopped after 3 iterations, 5623 ms [info] Running case: parse timestamps from Dataset[String] [info] Stopped after 3 iterations, 506637 ms [info] Running case: infer timestamps from Dataset[String] [info] Stopped after 3 iterations, 509076 ms ``` That is about a 10x performance regression. BUT if we modify the timestamp pattern to `....HH:mm:ss[.SSS][XXX]`, which makes all timestamp values valid for the new parser and so prevents the fallback, the result is ```scala [info] Running benchmark: Read dates and timestamps [info] Running case: timestamp strings [info] Stopped after 3 iterations, 5623 ms [info] Running case: parse timestamps from Dataset[String] [info] Stopped after 3 iterations, 506637 ms [info] Running case: infer timestamps from Dataset[String] [info] Stopped after 3 iterations, 509076 ms ``` Fix performance regression. NO new tests added. Closes #28181 from yaooqinn/SPARK-31414. Authored-by: Kent Yao <yaooq...@hotmail.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> (cherry picked from commit d65f534c5ad4385b7c5198f15cb014e1d24e47c9) Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../apache/spark/sql/catalyst/csv/CSVOptions.scala | 9 +- .../spark/sql/catalyst/json/JSONOptions.scala | 9 +- sql/core/benchmarks/CSVBenchmark-jdk11-results.txt | 88 +++++++------- sql/core/benchmarks/CSVBenchmark-results.txt | 88 +++++++------- .../benchmarks/JsonBenchmark-jdk11-results.txt | 130 ++++++++++----------- sql/core/benchmarks/JsonBenchmark-results.txt | 130 ++++++++++----------- .../org/apache/spark/sql/CsvFunctionsSuite.scala | 12 ++ .../org/apache/spark/sql/JsonFunctionsSuite.scala | 12 ++ .../execution/datasources/csv/CSVBenchmark.scala | 4 +- .../execution/datasources/json/JsonBenchmark.scala | 4 +- 10 files changed, 262 insertions(+), 224 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala index 8892037..9d09cab 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/csv/CSVOptions.scala @@ -26,6 +26,7 @@ import com.univocity.parsers.csv.{CsvParserSettings, CsvWriterSettings, Unescape import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy class CSVOptions( @transient val parameters: CaseInsensitiveMap[String], @@ -148,8 +149,12 @@ class CSVOptions( val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern) - val timestampFormat: String = - parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX") + val timestampFormat: String = parameters.getOrElse("timestampFormat", + if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { + s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX" + } else { + s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]" + }) val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala index 45c4edf..f9222f5 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/json/JSONOptions.scala @@ -27,6 +27,7 @@ import com.fasterxml.jackson.core.json.JsonReadFeature import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.internal.SQLConf.LegacyBehaviorPolicy /** * Options for parsing JSON data into Spark SQL rows. 
@@ -90,8 +91,12 @@ private[sql] class JSONOptions( val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern) - val timestampFormat: String = - parameters.getOrElse("timestampFormat", s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX") + val timestampFormat: String = parameters.getOrElse("timestampFormat", + if (SQLConf.get.legacyTimeParserPolicy == LegacyBehaviorPolicy.LEGACY) { + s"${DateFormatter.defaultPattern}'T'HH:mm:ss.SSSXXX" + } else { + s"${DateFormatter.defaultPattern}'T'HH:mm:ss[.SSS][XXX]" + }) val multiLine = parameters.get("multiLine").map(_.toBoolean).getOrElse(false) diff --git a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt index d8071e7..147a77f 100644 --- a/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-jdk11-results.txt @@ -2,66 +2,66 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 44297 44515 373 0.0 885948.7 1.0X +One quoted string 24907 29374 NaN 0.0 498130.5 1.0X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 196720 197783 1560 0.0 196719.8 1.0X -Select 100 columns 46691 46861 219 0.0 46691.4 4.2X -Select one column 36811 36922 111 0.0 36811.3 5.3X -count() 8520 8610 106 0.1 8520.5 23.1X -Select 100 columns, one bad input field 67914 67994 136 0.0 67914.0 2.9X -Select 100 columns, corrupt record field 77272 77445 214 0.0 77272.0 2.5X +Select 1000 columns 62811 63690 1416 0.0 62811.4 1.0X +Select 100 columns 23839 24064 230 0.0 23839.5 2.6X +Select one column 19936 20641 827 0.1 19936.4 3.2X +count() 4174 4380 206 0.2 4174.4 15.0X +Select 100 columns, one bad input field 41015 42380 1688 0.0 41015.4 1.5X +Select 100 columns, corrupt record field 46281 46338 93 0.0 46280.5 1.4X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns + count() 25965 26054 103 0.4 2596.5 1.0X -Select 1 column + count() 18591 18666 91 0.5 1859.1 1.4X -count() 6102 6119 18 1.6 610.2 4.3X +Select 10 columns + count() 10810 10997 163 0.9 1081.0 1.0X +Select 1 column + count() 7608 7641 47 1.3 760.8 1.4X +count() 2415 2462 77 4.1 241.5 4.5X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 2142 2161 17 4.7 214.2 1.0X -to_csv(timestamp) 14744 14950 182 0.7 1474.4 0.1X -write timestamps to files 12078 12202 175 0.8 1207.8 0.2X -Create a dataset of dates 2275 2291 18 4.4 227.5 0.9X -to_csv(date) 11407 11464 51 0.9 1140.7 0.2X -write dates to files 7638 7702 90 1.3 763.8 0.3X +Create a dataset of timestamps 874 914 37 11.4 87.4 1.0X +to_csv(timestamp) 7051 7223 250 1.4 705.1 0.1X +write timestamps to files 6712 6741 31 1.5 671.2 0.1X +Create a dataset of dates 909 945 35 11.0 90.9 1.0X +to_csv(date) 4222 4231 8 2.4 422.2 0.2X +write dates to files 3799 3813 14 2.6 379.9 0.2X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -read timestamp text from files 2578 2590 10 3.9 257.8 1.0X -read timestamps from files 60103 60694 512 0.2 6010.3 0.0X -infer timestamps from files 107871 108268 351 0.1 10787.1 0.0X -read date text from files 2306 2310 4 4.3 230.6 1.1X -read date from files 47415 47657 367 0.2 4741.5 0.1X -infer date from files 35261 35447 164 0.3 3526.1 0.1X -timestamp strings 3045 3056 11 3.3 304.5 0.8X -parse timestamps from Dataset[String] 62221 63173 849 0.2 6222.1 0.0X -infer timestamps from Dataset[String] 118838 119629 697 0.1 11883.8 0.0X -date strings 3459 3481 19 2.9 345.9 0.7X -parse dates from Dataset[String] 51026 51447 503 0.2 5102.6 0.1X -from_csv(timestamp) 60738 61818 936 0.2 6073.8 0.0X -from_csv(date) 46012 46278 370 0.2 4601.2 0.1X +read timestamp text from files 1342 1364 35 
7.5 134.2 1.0X +read timestamps from files 20300 20473 247 0.5 2030.0 0.1X +infer timestamps from files 40705 40744 54 0.2 4070.5 0.0X +read date text from files 1146 1151 6 8.7 114.6 1.2X +read date from files 12278 12408 117 0.8 1227.8 0.1X +infer date from files 12734 12872 220 0.8 1273.4 0.1X +timestamp strings 1467 1482 15 6.8 146.7 0.9X +parse timestamps from Dataset[String] 21708 22234 477 0.5 2170.8 0.1X +infer timestamps from Dataset[String] 42357 43253 922 0.2 4235.7 0.0X +date strings 1512 1532 18 6.6 151.2 0.9X +parse dates from Dataset[String] 13436 13470 33 0.7 1343.6 0.1X +from_csv(timestamp) 20390 20486 95 0.5 2039.0 0.1X +from_csv(date) 12592 12693 139 0.8 1259.2 0.1X -OpenJDK 64-Bit Server VM 11.0.5+10 on Mac OS X 10.15.2 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 11889 11945 52 0.0 118893.1 1.0X -pushdown disabled 11790 11860 115 0.0 117902.3 1.0X -w/ filters 1240 1278 33 0.1 12400.8 9.6X +w/o filters 12535 12606 67 0.0 125348.8 1.0X +pushdown disabled 12611 12672 91 0.0 126112.9 1.0X +w/ filters 1093 1099 11 0.1 10928.3 11.5X diff --git a/sql/core/benchmarks/CSVBenchmark-results.txt b/sql/core/benchmarks/CSVBenchmark-results.txt index b3ba69c..498ca4c 100644 --- a/sql/core/benchmarks/CSVBenchmark-results.txt +++ b/sql/core/benchmarks/CSVBenchmark-results.txt @@ -2,66 +2,66 @@ Benchmark to measure CSV read/write performance ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 
+Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Parsing quoted values: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -One quoted string 51602 51659 59 0.0 1032039.4 1.0X +One quoted string 24073 24109 33 0.0 481463.5 1.0X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Wide rows with 1000 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 1000 columns 191926 192879 1615 0.0 191925.6 1.0X -Select 100 columns 46766 46846 69 0.0 46766.1 4.1X -Select one column 35877 35930 83 0.0 35876.8 5.3X -count() 11186 11262 65 0.1 11186.0 17.2X -Select 100 columns, one bad input field 59943 60107 232 0.0 59943.0 3.2X -Select 100 columns, corrupt record field 73062 73406 479 0.0 73062.2 2.6X +Select 1000 columns 58415 59611 2071 0.0 58414.8 1.0X +Select 100 columns 22568 23020 594 0.0 22568.0 2.6X +Select one column 18995 19058 99 0.1 18995.0 3.1X +count() 5301 5332 30 0.2 5300.9 11.0X +Select 100 columns, one bad input field 39736 40153 361 0.0 39736.1 1.5X +Select 100 columns, corrupt record field 47195 47826 590 0.0 47195.2 1.2X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Count a dataset with 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ 
-Select 10 columns + count() 22389 22447 87 0.4 2238.9 1.0X -Select 1 column + count() 14844 14890 43 0.7 1484.4 1.5X -count() 5519 5538 18 1.8 551.9 4.1X +Select 10 columns + count() 9884 9904 25 1.0 988.4 1.0X +Select 1 column + count() 6794 6835 46 1.5 679.4 1.5X +count() 2060 2065 5 4.9 206.0 4.8X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1949 1977 25 5.1 194.9 1.0X -to_csv(timestamp) 14944 15702 714 0.7 1494.4 0.1X -write timestamps to files 12983 12998 14 0.8 1298.3 0.2X -Create a dataset of dates 2156 2164 7 4.6 215.6 0.9X -to_csv(date) 9675 9709 41 1.0 967.5 0.2X -write dates to files 7880 7897 15 1.3 788.0 0.2X +Create a dataset of timestamps 717 732 18 14.0 71.7 1.0X +to_csv(timestamp) 6994 7100 121 1.4 699.4 0.1X +write timestamps to files 6417 6435 27 1.6 641.7 0.1X +Create a dataset of dates 827 855 24 12.1 82.7 0.9X +to_csv(date) 4408 4438 32 2.3 440.8 0.2X +write dates to files 3738 3758 28 2.7 373.8 0.2X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -read timestamp text from files 2235 2245 10 4.5 223.5 1.0X -read timestamps from files 54490 54690 283 0.2 5449.0 0.0X -infer timestamps from files 
104501 104737 236 0.1 10450.1 0.0X -read date text from files 2035 2040 6 4.9 203.5 1.1X -read date from files 39650 39707 52 0.3 3965.0 0.1X -infer date from files 29235 29363 164 0.3 2923.5 0.1X -timestamp strings 3412 3426 18 2.9 341.2 0.7X -parse timestamps from Dataset[String] 66864 67804 981 0.1 6686.4 0.0X -infer timestamps from Dataset[String] 118780 119284 837 0.1 11878.0 0.0X -date strings 3730 3734 4 2.7 373.0 0.6X -parse dates from Dataset[String] 48728 49071 309 0.2 4872.8 0.0X -from_csv(timestamp) 62294 62493 260 0.2 6229.4 0.0X -from_csv(date) 44581 44665 117 0.2 4458.1 0.1X +read timestamp text from files 1121 1176 52 8.9 112.1 1.0X +read timestamps from files 21298 21366 105 0.5 2129.8 0.1X +infer timestamps from files 41008 41051 39 0.2 4100.8 0.0X +read date text from files 962 967 5 10.4 96.2 1.2X +read date from files 11749 11772 22 0.9 1174.9 0.1X +infer date from files 12426 12459 29 0.8 1242.6 0.1X +timestamp strings 1508 1519 9 6.6 150.8 0.7X +parse timestamps from Dataset[String] 21674 21997 455 0.5 2167.4 0.1X +infer timestamps from Dataset[String] 42141 42230 105 0.2 4214.1 0.0X +date strings 1694 1701 8 5.9 169.4 0.7X +parse dates from Dataset[String] 12929 12951 25 0.8 1292.9 0.1X +from_csv(timestamp) 20603 20786 166 0.5 2060.3 0.1X +from_csv(date) 12325 12338 12 0.8 1232.5 0.1X -Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.2 -Intel(R) Core(TM) i7-4850HQ CPU @ 2.30GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Filters pushdown: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -w/o filters 12557 12634 78 0.0 125572.9 1.0X -pushdown disabled 12449 12509 65 0.0 124486.4 1.0X -w/ filters 1372 1393 18 0.1 13724.8 9.1X +w/o filters 12455 12474 22 0.0 124553.8 1.0X +pushdown disabled 12462 12486 29 0.0 124624.9 1.0X 
+w/ filters 1073 1092 18 0.1 10727.6 11.6X diff --git a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt index 920e0a7..03bc334 100644 --- a/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt +++ b/sql/core/benchmarks/JsonBenchmark-jdk11-results.txt @@ -3,110 +3,110 @@ Benchmark for performance of JSON parsing ================================================================================================ Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 84774 84927 264 1.2 847.7 1.0X -UTF-8 is set 119081 120155 1773 0.8 1190.8 0.7X +No encoding 46010 46118 113 2.2 460.1 1.0X +UTF-8 is set 54407 55427 1718 1.8 544.1 0.8X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 49293 49356 70 2.0 492.9 1.0X -UTF-8 is set 80183 80211 25 1.2 801.8 0.6X +No encoding 26614 28220 1461 3.8 266.1 1.0X +UTF-8 is set 42765 43400 550 2.3 427.6 0.6X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 61070 61476 536 0.2 6107.0 1.0X -UTF-8 is set 109765 109881 102 0.1 10976.5 0.6X +No encoding 35696 35821 113 0.3 3569.6 1.0X +UTF-8 is set 55441 56176 1037 0.2 5544.1 0.6X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 176999 178163 1008 0.0 353997.9 1.0X -UTF-8 is set 201209 201641 614 0.0 402419.0 0.9X +No encoding 61514 62968 NaN 0.0 123027.2 1.0X +UTF-8 is set 72096 72933 1162 0.0 144192.7 0.9X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 18768 20587 496 0.5 1876.8 1.0X -Select 1 column 22642 22644 3 0.4 2264.2 0.8X +Select 10 columns 9859 9913 79 1.0 985.9 1.0X +Select 1 column 10981 11003 36 0.9 1098.1 0.9X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 7697 7738 55 1.3 769.7 1.0X -Short column with UTF-8 14051 14189 176 0.7 1405.1 0.5X -Wide column without encoding 108999 110075 1085 0.1 10899.9 0.1X -Wide column with UTF-8 157433 157779 308 0.1 15743.3 0.0X +Short column without encoding 3555 3579 27 2.8 355.5 1.0X +Short column with UTF-8 5204 5227 35 1.9 520.4 0.7X +Wide column without encoding 60458 60637 164 0.2 6045.8 0.1X +Wide column with UTF-8 77544 78111 551 0.1 7754.4 0.0X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 644 647 4 15.5 64.4 1.0X -from_json 25859 25872 12 0.4 2585.9 0.0X -json_tuple 31679 31761 71 0.3 3167.9 0.0X -get_json_object 24772 25220 389 0.4 2477.2 0.0X +Text read 342 346 3 29.2 34.2 1.0X +from_json 7123 7318 179 1.4 712.3 0.0X +json_tuple 9843 9957 132 1.0 984.3 0.0X +get_json_object 7827 8046 194 1.3 782.7 0.0X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 3135 3165 52 15.9 62.7 1.0X -schema inferring 29383 29389 10 1.7 587.7 0.1X -parsing 32623 35183 NaN 1.5 652.5 0.1X +Text read 1856 1884 32 26.9 37.1 1.0X +schema inferring 16734 16900 153 3.0 334.7 0.1X +parsing 14884 15203 470 3.4 297.7 0.1X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 11874 11948 82 4.2 237.5 1.0X -Schema inferring 42382 42398 23 1.2 847.6 0.3X -Parsing without charset 36410 36442 54 1.4 728.2 0.3X -Parsing with UTF-8 62412 62463 48 0.8 1248.2 0.2X +Text read 5932 6148 228 8.4 118.6 1.0X +Schema inferring 20836 21938 1086 2.4 416.7 0.3X +Parsing without charset 18134 18661 457 2.8 362.7 0.3X +Parsing with UTF-8 27734 28069 378 1.8 554.7 0.2X -OpenJDK 64-Bit Server VM 11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 2191 2209 20 4.6 219.1 1.0X -to_json(timestamp) 18670 19042 565 0.5 1867.0 0.1X -write timestamps to files 11836 13156 NaN 0.8 1183.6 0.2X -Create a dataset of dates 2321 2351 33 4.3 232.1 0.9X -to_json(date) 12703 12726 24 0.8 1270.3 0.2X -write dates to files 8230 8303 76 1.2 823.0 0.3X +Create a dataset of timestamps 889 914 28 11.2 88.9 1.0X +to_json(timestamp) 7920 8172 353 1.3 792.0 0.1X +write timestamps to files 6726 6822 129 1.5 672.6 0.1X +Create a dataset of dates 953 963 12 10.5 95.3 0.9X +to_json(date) 5370 5705 320 1.9 537.0 0.2X +write dates to files 4109 4166 52 2.4 410.9 0.2X -OpenJDK 64-Bit Server VM 
11.0.5+10-post-Ubuntu-0ubuntu1.118.04 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 11.0.5+10-LTS on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -read timestamp text from files 2780 2795 13 3.6 278.0 1.0X -read timestamps from files 37158 37305 137 0.3 3715.8 0.1X -infer timestamps from files 73666 73838 149 0.1 7366.6 0.0X -read date text from files 2597 2609 10 3.9 259.7 1.1X -read date from files 24439 24501 56 0.4 2443.9 0.1X -timestamp strings 3052 3064 12 3.3 305.2 0.9X -parse timestamps from Dataset[String] 43611 43665 52 0.2 4361.1 0.1X -infer timestamps from Dataset[String] 83745 84153 376 0.1 8374.5 0.0X -date strings 4068 4076 10 2.5 406.8 0.7X -parse dates from Dataset[String] 34700 34807 118 0.3 3470.0 0.1X -from_json(timestamp) 64074 64124 53 0.2 6407.4 0.0X -from_json(date) 52520 52617 101 0.2 5252.0 0.1X +read timestamp text from files 1614 1675 55 6.2 161.4 1.0X +read timestamps from files 16640 16858 209 0.6 1664.0 0.1X +infer timestamps from files 33239 33388 227 0.3 3323.9 0.0X +read date text from files 1310 1340 44 7.6 131.0 1.2X +read date from files 9470 9513 41 1.1 947.0 0.2X +timestamp strings 1303 1342 47 7.7 130.3 1.2X +parse timestamps from Dataset[String] 17650 18073 380 0.6 1765.0 0.1X +infer timestamps from Dataset[String] 32623 34065 1330 0.3 3262.3 0.0X +date strings 1864 1871 7 5.4 186.4 0.9X +parse dates from Dataset[String] 10914 11316 482 0.9 1091.4 0.1X +from_json(timestamp) 21102 21990 929 0.5 2110.2 0.1X +from_json(date) 15275 15961 598 0.7 1527.5 0.1X diff --git a/sql/core/benchmarks/JsonBenchmark-results.txt b/sql/core/benchmarks/JsonBenchmark-results.txt index e435f57..0f188c4 100644 --- a/sql/core/benchmarks/JsonBenchmark-results.txt +++ 
b/sql/core/benchmarks/JsonBenchmark-results.txt @@ -3,110 +3,110 @@ Benchmark for performance of JSON parsing ================================================================================================ Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz JSON schema inferring: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 61888 61918 27 1.6 618.9 1.0X -UTF-8 is set 109057 113663 NaN 0.9 1090.6 0.6X +No encoding 38998 41002 NaN 2.6 390.0 1.0X +UTF-8 is set 61231 63282 1854 1.6 612.3 0.6X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz count a short column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 44517 44535 29 2.2 445.2 1.0X -UTF-8 is set 75722 75840 111 1.3 757.2 0.6X +No encoding 28272 28338 70 3.5 282.7 1.0X +UTF-8 is set 58681 62243 1517 1.7 586.8 0.5X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz count a wide column: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 63677 64090 633 0.2 6367.7 1.0X -UTF-8 is set 99424 99615 185 0.1 9942.4 0.6X +No encoding 44026 51829 1329 0.2 4402.6 1.0X +UTF-8 is set 65839 68596 500 0.2 6583.9 0.7X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz select wide row: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -No encoding 174052 174251 174 0.0 348104.1 1.0X -UTF-8 is set 189000 189098 113 0.0 378000.9 0.9X +No encoding 72144 74820 NaN 0.0 144287.6 1.0X +UTF-8 is set 69571 77888 NaN 0.0 139142.3 1.0X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Select a subset of 10 columns: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Select 10 columns 18387 18473 142 0.5 1838.7 1.0X -Select 1 column 25560 25571 13 0.4 2556.0 0.7X +Select 10 columns 9502 9604 106 1.1 950.2 1.0X +Select 1 column 11861 11948 109 0.8 1186.1 0.8X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz creation of JSON parser per line: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Short column without encoding 9323 9384 58 1.1 932.3 1.0X -Short column with UTF-8 14016 14058 55 0.7 1401.6 0.7X -Wide column without encoding 133258 133532 382 0.1 13325.8 0.1X -Wide column with UTF-8 181212 181283 61 0.1 18121.2 0.1X +Short column without encoding 3830 3846 15 2.6 383.0 1.0X +Short column with UTF-8 5538 5543 7 1.8 553.8 0.7X +Wide column without encoding 66899 69158 NaN 0.1 6689.9 0.1X +Wide column with UTF-8 90052 93235 NaN 0.1 9005.2 0.0X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz JSON functions: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 1168 1174 5 8.6 116.8 1.0X -from_json 22604 23571 883 0.4 2260.4 0.1X -json_tuple 29979 30053 91 0.3 2997.9 0.0X -get_json_object 21987 22263 241 0.5 2198.7 0.1X +Text read 659 674 13 15.2 65.9 1.0X +from_json 7676 7943 405 1.3 767.6 0.1X +json_tuple 9881 10172 273 1.0 988.1 0.1X +get_json_object 7949 8055 119 1.3 794.9 0.1X Preparing data for benchmarking ... -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Dataset of json strings: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 5831 5842 14 8.6 116.6 1.0X -schema inferring 31372 31456 73 1.6 627.4 0.2X -parsing 35911 36191 254 1.4 718.2 0.2X +Text read 3314 3326 17 15.1 66.3 1.0X +schema inferring 16549 17037 484 3.0 331.0 0.2X +parsing 15138 15283 172 3.3 302.8 0.2X Preparing data for benchmarking ... 
-OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Json files in the per-line mode: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Text read 10249 10314 77 4.9 205.0 1.0X -Schema inferring 35403 35436 40 1.4 708.1 0.3X -Parsing without charset 32875 32879 4 1.5 657.5 0.3X -Parsing with UTF-8 53444 53519 100 0.9 1068.9 0.2X +Text read 5136 5446 268 9.7 102.7 1.0X +Schema inferring 19864 20568 1191 2.5 397.3 0.3X +Parsing without charset 17535 17888 329 2.9 350.7 0.3X +Parsing with UTF-8 25609 25758 218 2.0 512.2 0.2X -OpenJDK 64-Bit Server VM 1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Write dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -Create a dataset of timestamps 1909 1924 17 5.2 190.9 1.0X -to_json(timestamp) 18956 19122 208 0.5 1895.6 0.1X -write timestamps to files 13446 13472 43 0.7 1344.6 0.1X -Create a dataset of dates 2180 2200 28 4.6 218.0 0.9X -to_json(date) 12780 12899 109 0.8 1278.0 0.1X -write dates to files 7835 7865 29 1.3 783.5 0.2X +Create a dataset of timestamps 784 790 7 12.8 78.4 1.0X +to_json(timestamp) 8005 8055 50 1.2 800.5 0.1X +write timestamps to files 6515 6559 45 1.5 651.5 0.1X +Create a dataset of dates 854 881 24 11.7 85.4 0.9X +to_json(date) 5187 5194 7 1.9 518.7 0.2X +write dates to files 3663 3684 22 2.7 366.3 0.2X -OpenJDK 64-Bit Server VM 
1.8.0_232-8u232-b09-0ubuntu1~18.04.1-b09 on Linux 4.15.0-1044-aws -Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz +Java HotSpot(TM) 64-Bit Server VM 1.8.0_231-b11 on Mac OS X 10.15.4 +Intel(R) Core(TM) i9-9980HK CPU @ 2.40GHz Read dates and timestamps: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -read timestamp text from files 2467 2477 9 4.1 246.7 1.0X -read timestamps from files 40186 40342 135 0.2 4018.6 0.1X -infer timestamps from files 82005 82079 71 0.1 8200.5 0.0X -read date text from files 2243 2264 22 4.5 224.3 1.1X -read date from files 24852 24863 19 0.4 2485.2 0.1X -timestamp strings 3836 3854 16 2.6 383.6 0.6X -parse timestamps from Dataset[String] 51521 51697 242 0.2 5152.1 0.0X -infer timestamps from Dataset[String] 97300 97398 133 0.1 9730.0 0.0X -date strings 4488 4491 5 2.2 448.8 0.5X -parse dates from Dataset[String] 37918 37976 68 0.3 3791.8 0.1X -from_json(timestamp) 69611 69632 36 0.1 6961.1 0.0X -from_json(date) 56598 56974 347 0.2 5659.8 0.0X +read timestamp text from files 1297 1316 26 7.7 129.7 1.0X +read timestamps from files 16915 17723 963 0.6 1691.5 0.1X +infer timestamps from files 33967 34304 360 0.3 3396.7 0.0X +read date text from files 1095 1100 7 9.1 109.5 1.2X +read date from files 8376 8513 209 1.2 837.6 0.2X +timestamp strings 1807 1816 8 5.5 180.7 0.7X +parse timestamps from Dataset[String] 18189 18242 74 0.5 1818.9 0.1X +infer timestamps from Dataset[String] 37906 38547 571 0.3 3790.6 0.0X +date strings 2191 2194 4 4.6 219.1 0.6X +parse dates from Dataset[String] 11593 11625 33 0.9 1159.3 0.1X +from_json(timestamp) 22589 22650 101 0.4 2258.9 0.1X +from_json(date) 16479 16619 159 0.6 1647.9 0.1X diff --git a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala index 89fb4d5..b9e0d50 100644 --- 
a/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/CsvFunctionsSuite.scala @@ -212,4 +212,16 @@ class CsvFunctionsSuite extends QueryTest with SharedSparkSession { assert(readback(0).getAs[Row](0).getAs[Date](0).getTime >= 0) } } + + test("optional datetime parser does not affect csv time formatting") { + val s = "2015-08-26 12:34:46" + def toDF(p: String): DataFrame = sql( + s""" + |SELECT + | to_csv( + | named_struct('time', timestamp'$s'), map('timestampFormat', "$p") + | ) + | """.stripMargin) + checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala index 8cc5c22..b989b5d 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/JsonFunctionsSuite.scala @@ -688,4 +688,16 @@ class JsonFunctionsSuite extends QueryTest with SharedSparkSession { options.asJava)), Seq(Row("string"))) } + + test("optional datetime parser does not affect json time formatting") { + val s = "2015-08-26 12:34:46" + def toDF(p: String): DataFrame = sql( + s""" + |SELECT + | to_json( + | named_struct('time', timestamp'$s'), map('timestampFormat', "$p") + | ) + | """.stripMargin) + checkAnswer(toDF("yyyy-MM-dd'T'HH:mm:ss.SSSXXX"), toDF("yyyy-MM-dd'T'HH:mm:ss[.SSS][XXX]")) + } } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala index e2abb39..53d287b 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/csv/CSVBenchmark.scala @@ -238,7 +238,9 @@ object CSVBenchmark extends SqlBasedBenchmark 
{ def timestampStr: Dataset[String] = { spark.range(0, rowsNum, 1, 1).mapPartitions { iter => - iter.map(i => s"1970-01-01T01:02:03.${100 + i % 100}Z") + iter.map { + i => s"1970-01-01T01:02:03.${i % 200}Z".stripSuffix(".0Z") + } }.select($"value".as("timestamp")).as[String] } diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala index bcecacc..5693088 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala @@ -445,7 +445,9 @@ object JsonBenchmark extends SqlBasedBenchmark { def timestampStr: Dataset[String] = { spark.range(0, rowsNum, 1, 1).mapPartitions { iter => - iter.map(i => s"""{"timestamp":"1970-01-01T01:02:03.${100 + i % 100}Z"}""") + iter.map { i => + s"""{"timestamp":"1970-01-01T01:02:03.${i % 200}Z"}""".stripSuffix(".0Z") + } }.select($"value".as("timestamp")).as[String] } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org