Repository: spark
Updated Branches:
  refs/heads/master 34c3bc9f1 -> f8484e49e


[SPARK-25663][SPARK-25661][SQL][TEST] Refactor BuiltInDataSourceWriteBenchmark, DataSourceWriteBenchmark and AvroWriteBenchmark to use main method

## What changes were proposed in this pull request?

Refactor BuiltInDataSourceWriteBenchmark, DataSourceWriteBenchmark and 
AvroWriteBenchmark to use main method.

```
SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain 
org.apache.spark.sql.execution.benchmark.BuiltInDataSourceWriteBenchmark"

SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain 
org.apache.spark.sql.execution.benchmark.AvroWriteBenchmark"
```
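
For context, the refactor replaces each benchmark's hand-written `main` with the shared benchmark harness: the object now only overrides a `runBenchmarkSuite` hook, and the harness decides whether results go to the console or to a `benchmarks/*-results.txt` file. A minimal sketch of the resulting shape, mirroring the diff below (the `DataSourceWriteBenchmark`/`SqlBasedBenchmark` plumbing is assumed to come from Spark's sql test sources and is not defined here):

```
// Sketch only: mirrors the pattern introduced by this diff.
// DataSourceWriteBenchmark (and the SqlBasedBenchmark it now extends) are
// assumed to provide main(), runBenchmark(name) { ... } and the SparkSession.
object AvroWriteBenchmark extends DataSourceWriteBenchmark {
  // Called by the harness's main(); mainArgs are the command-line arguments.
  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    runDataSourceBenchmark("Avro")
  }
}
```
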
## How was this patch tested?

Manual tests.

Closes #22861 from yucai/BuiltInDataSourceWriteBenchmark.

Lead-authored-by: yucai <y...@ebay.com>
Co-authored-by: Yucai Yu <yucai...@foxmail.com>
Co-authored-by: Dongjoon Hyun <dongj...@apache.org>
Signed-off-by: Dongjoon Hyun <dongj...@apache.org>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/f8484e49
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/f8484e49
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/f8484e49

Branch: refs/heads/master
Commit: f8484e49ef83445dd57f8f5ba4b39d2f47bd3c80
Parents: 34c3bc9
Author: yucai <y...@ebay.com>
Authored: Wed Oct 31 03:03:42 2018 -0700
Committer: Dongjoon Hyun <dongj...@apache.org>
Committed: Wed Oct 31 03:03:42 2018 -0700

----------------------------------------------------------------------
 .../benchmarks/AvroWriteBenchmark-results.txt   | 10 +++
 .../benchmark/AvroWriteBenchmark.scala          | 27 ++++----
 .../BuiltInDataSourceWriteBenchmark-results.txt | 60 +++++++++++++++++
 .../BuiltInDataSourceWriteBenchmark.scala       | 68 +++++++-------------
 .../benchmark/DataSourceWriteBenchmark.scala    | 15 +----
 5 files changed, 108 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/f8484e49/external/avro/benchmarks/AvroWriteBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/external/avro/benchmarks/AvroWriteBenchmark-results.txt b/external/avro/benchmarks/AvroWriteBenchmark-results.txt
new file mode 100644
index 0000000..fb2a773
--- /dev/null
+++ b/external/avro/benchmarks/AvroWriteBenchmark-results.txt
@@ -0,0 +1,10 @@
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Avro writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Output Single Int Column                      3213 / 3373          4.9         204.3       1.0X
+Output Single Double Column                   3313 / 3345          4.7         210.7       1.0X
+Output Int and String Column                  7303 / 7316          2.2         464.3       0.4X
+Output Partitions                             5309 / 5691          3.0         337.5       0.6X
+Output Buckets                                7031 / 7557          2.2         447.0       0.5X
+

http://git-wip-us.apache.org/repos/asf/spark/blob/f8484e49/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala
----------------------------------------------------------------------
diff --git a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala
index df13b4a..0b11434 100644
--- a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala
+++ b/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala
@@ -19,22 +19,19 @@ package org.apache.spark.sql.execution.benchmark
 
 /**
  * Benchmark to measure Avro data sources write performance.
- * Usage:
- * 1. with spark-submit: bin/spark-submit --class <this class> <spark sql test jar>
- * 2. with sbt: build/sbt "avro/test:runMain <this class>"
+ * To run this benchmark:
+ * {{{
+ *   1. without sbt: bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar>,
+  *              <spark sql test jar>,<spark avro jar>
+ *        <spark avro test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain <this class>"
+ *      Results will be written to "benchmarks/AvroWriteBenchmark-results.txt".
+ *  }}}
  */
 object AvroWriteBenchmark extends DataSourceWriteBenchmark {
-  def main(args: Array[String]): Unit = {
-    /*
-    Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
-    Avro writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      2481 / 2499          6.3         157.8       1.0X
-    Output Single Double Column                   2705 / 2710          5.8         172.0       0.9X
-    Output Int and String Column                  5539 / 5639          2.8         352.2       0.4X
-    Output Partitions                             4613 / 5004          3.4         293.3       0.5X
-    Output Buckets                                5554 / 5561          2.8         353.1       0.4X
-    */
-    runBenchmark("Avro")
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    runDataSourceBenchmark("Avro")
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/f8484e49/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt
----------------------------------------------------------------------
diff --git a/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt
new file mode 100644
index 0000000..9d656fc
--- /dev/null
+++ b/sql/core/benchmarks/BuiltInDataSourceWriteBenchmark-results.txt
@@ -0,0 +1,60 @@
+================================================================================================
+Parquet writer benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+Parquet writer benchmark:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Output Single Int Column                      2354 / 2438          6.7         149.7       1.0X
+Output Single Double Column                   2462 / 2485          6.4         156.5       1.0X
+Output Int and String Column                  8083 / 8100          1.9         513.9       0.3X
+Output Partitions                             5015 / 5027          3.1         318.8       0.5X
+Output Buckets                                6883 / 6887          2.3         437.6       0.3X
+
+
+================================================================================================
+ORC writer benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+ORC writer benchmark:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Output Single Int Column                      1769 / 1789          8.9         112.4       1.0X
+Output Single Double Column                   1989 / 2009          7.9         126.5       0.9X
+Output Int and String Column                  7323 / 7400          2.1         465.6       0.2X
+Output Partitions                             4374 / 4381          3.6         278.1       0.4X
+Output Buckets                                6086 / 6104          2.6         386.9       0.3X
+
+
+================================================================================================
+JSON writer benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+JSON writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Output Single Int Column                      2954 / 4085          5.3         187.8       1.0X
+Output Single Double Column                   3832 / 3837          4.1         243.6       0.8X
+Output Int and String Column                 9591 / 10336          1.6         609.8       0.3X
+Output Partitions                             4956 / 4994          3.2         315.1       0.6X
+Output Buckets                                6608 / 6676          2.4         420.1       0.4X
+
+
+================================================================================================
+CSV writer benchmark
+================================================================================================
+
+OpenJDK 64-Bit Server VM 1.8.0_191-b12 on Linux 3.10.0-862.3.2.el7.x86_64
+Intel(R) Xeon(R) CPU E5-2670 v2 @ 2.50GHz
+CSV writer benchmark:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
+------------------------------------------------------------------------------------------------
+Output Single Int Column                      4118 / 4125          3.8         261.8       1.0X
+Output Single Double Column                   4888 / 4891          3.2         310.8       0.8X
+Output Int and String Column                  9788 / 9872          1.6         622.3       0.4X
+Output Partitions                             6578 / 6640          2.4         418.2       0.6X
+Output Buckets                                9125 / 9171          1.7         580.2       0.5X
+
+

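A quick note on reading these generated tables, assuming the usual Spark `Benchmark` column semantics: `Rate(M/s)` and `Per Row(ns)` are derived from the best time and the fixed row count used by `DataSourceWriteBenchmark` (`numRows = 1024 * 1024 * 15`, see the trait further down), and `Relative` compares each row's best time against the first row's. Illustrative arithmetic for the Parquet "Output Single Int Column" row above (REPL-style Scala; numbers copied from the table):

```
// Illustrative only; numbers come from the Parquet table above and numRows
// from DataSourceWriteBenchmark later in this diff.
val numRows = 1024 * 1024 * 15               // 15,728,640 rows per run
val bestMs  = 2354.0                         // "Best Time(ms)" column
val ratePerSec = numRows / (bestMs / 1000.0) // ~6.68e6 rows/s -> "6.7" M/s
val nsPerRow   = bestMs * 1e6 / numRows      // ~149.7         -> "Per Row(ns)"
val relative   = bestMs / 8083.0             // ~0.29 -> the "0.3X" shown for
                                             // "Output Int and String Column"
```
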
http://git-wip-us.apache.org/repos/asf/spark/blob/f8484e49/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala
index 2de516c..cd97324 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BuiltInDataSourceWriteBenchmark.scala
@@ -18,62 +18,40 @@ package org.apache.spark.sql.execution.benchmark
 
 /**
  * Benchmark to measure built-in data sources write performance.
- * By default it measures 4 data source format: Parquet, ORC, JSON, CSV. Run it with spark-submit:
- *   spark-submit --class <this class> <spark sql test jar>
- * Or with sbt:
- *   build/sbt "sql/test:runMain <this class>"
+ * To run this benchmark:
+ * {{{
+ *   By default it measures 4 data source format: Parquet, ORC, JSON, CSV.
+ *   1. without sbt: bin/spark-submit --class <this class>
+ *        --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar>
+ *   2. build/sbt "sql/test:runMain <this class>"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>"
+ *      Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
+ *
+ *   To measure specified formats, run it with arguments.
+ *   1. without sbt:
+ *        bin/spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
+ *   2. build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
+ *   3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt
+ *        "sql/test:runMain <this class> format1 [format2] [...]"
+ *      Results will be written to "benchmarks/BuiltInDataSourceWriteBenchmark-results.txt".
+ * }}}
  *
- * To measure specified formats, run it with arguments:
- *   spark-submit --class <this class> <spark sql test jar> format1 [format2] [...]
- * Or with sbt:
- *   build/sbt "sql/test:runMain <this class> format1 [format2] [...]"
  */
 object BuiltInDataSourceWriteBenchmark extends DataSourceWriteBenchmark {
-  def main(args: Array[String]): Unit = {
-    val formats: Seq[String] = if (args.isEmpty) {
+  override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
+    val formats: Seq[String] = if (mainArgs.isEmpty) {
       Seq("Parquet", "ORC", "JSON", "CSV")
     } else {
-      args
+      mainArgs
     }
 
     spark.conf.set("spark.sql.parquet.compression.codec", "snappy")
     spark.conf.set("spark.sql.orc.compression.codec", "snappy")
-    /*
-    Intel(R) Core(TM) i7-6920HQ CPU @ 2.90GHz
-    Parquet writer benchmark:                Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1815 / 1932          8.7         115.4       1.0X
-    Output Single Double Column                   1877 / 1878          8.4         119.3       1.0X
-    Output Int and String Column                  6265 / 6543          2.5         398.3       0.3X
-    Output Partitions                             4067 / 4457          3.9         258.6       0.4X
-    Output Buckets                                5608 / 5820          2.8         356.6       0.3X
-
-    ORC writer benchmark:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1201 / 1239         13.1          76.3       1.0X
-    Output Single Double Column                   1542 / 1600         10.2          98.0       0.8X
-    Output Int and String Column                  6495 / 6580          2.4         412.9       0.2X
-    Output Partitions                             3648 / 3842          4.3         231.9       0.3X
-    Output Buckets                                5022 / 5145          3.1         319.3       0.2X
-
-    JSON writer benchmark:                   Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      1988 / 2093          7.9         126.4       1.0X
-    Output Single Double Column                   2854 / 2911          5.5         181.4       0.7X
-    Output Int and String Column                  6467 / 6653          2.4         411.1       0.3X
-    Output Partitions                             4548 / 5055          3.5         289.1       0.4X
-    Output Buckets                                5664 / 5765          2.8         360.1       0.4X
 
-    CSV writer benchmark:                    Best/Avg Time(ms)    Rate(M/s)   Per Row(ns)   Relative
-    ------------------------------------------------------------------------------------------------
-    Output Single Int Column                      3025 / 3190          5.2         192.3       1.0X
-    Output Single Double Column                   3575 / 3634          4.4         227.3       0.8X
-    Output Int and String Column                  7313 / 7399          2.2         464.9       0.4X
-    Output Partitions                             5105 / 5190          3.1         324.6       0.6X
-    Output Buckets                                6986 / 6992          2.3         444.1       0.4X
-    */
     formats.foreach { format =>
-      runBenchmark(format)
+      runBenchmark(s"$format writer benchmark") {
+        runDataSourceBenchmark(format)
+      }
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/f8484e49/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala
index 994d6b5..405d607 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/DataSourceWriteBenchmark.scala
@@ -16,18 +16,9 @@
  */
 package org.apache.spark.sql.execution.benchmark
 
-import org.apache.spark.SparkConf
 import org.apache.spark.benchmark.Benchmark
-import org.apache.spark.sql.SparkSession
-import org.apache.spark.sql.internal.SQLConf
 
-trait DataSourceWriteBenchmark {
-  val conf = new SparkConf()
-    .setAppName("DataSourceWriteBenchmark")
-    .setIfMissing("spark.master", "local[1]")
-    .set(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key, "true")
-
-  val spark = SparkSession.builder.config(conf).getOrCreate()
+trait DataSourceWriteBenchmark extends SqlBasedBenchmark {
 
   val tempTable = "temp"
   val numRows = 1024 * 1024 * 15
@@ -75,7 +66,7 @@ trait DataSourceWriteBenchmark {
     }
   }
 
-  def runBenchmark(format: String): Unit = {
+  def runDataSourceBenchmark(format: String): Unit = {
     val tableInt = "tableInt"
     val tableDouble = "tableDouble"
     val tableIntString = "tableIntString"
@@ -84,7 +75,7 @@ trait DataSourceWriteBenchmark {
     withTempTable(tempTable) {
       spark.range(numRows).createOrReplaceTempView(tempTable)
       withTable(tableInt, tableDouble, tableIntString, tablePartition, tableBucket) {
-        val benchmark = new Benchmark(s"$format writer benchmark", numRows)
+        val benchmark = new Benchmark(s"$format writer benchmark", numRows, output = output)
         writeNumeric(tableInt, format, benchmark, "Int")
         writeNumeric(tableDouble, format, benchmark, "Double")
         writeIntString(tableIntString, format, benchmark)

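For context on `output = output` and `runBenchmarkSuite` above: the `SqlBasedBenchmark`/`BenchmarkBase` harness is not part of this diff, but it is expected to open `benchmarks/<ClassName>-results.txt` when `SPARK_GENERATE_BENCHMARK_FILES=1`, hand that stream to each `Benchmark` via its `output` parameter, and emit the banner sections seen in the new results files. A hedged sketch of that wiring with illustrative names; the real Spark trait may differ in detail:

```
// Hypothetical harness sketch; Spark's actual BenchmarkBase/SqlBasedBenchmark
// are not shown in this diff, so the shape below is an assumption.
import java.io.{File, FileOutputStream, OutputStream}

abstract class BenchmarkHarnessSketch {
  // Shared sink: None -> console only, Some(stream) -> *-results.txt file.
  var output: Option[OutputStream] = None

  // Hook implemented by the refactored benchmarks (e.g. AvroWriteBenchmark).
  def runBenchmarkSuite(mainArgs: Array[String]): Unit

  // Writes the "====" banner sections that appear in the results files.
  final def runBenchmark(benchmarkName: String)(body: => Any): Unit = {
    val separator = "=" * 96
    output.foreach(_.write(s"$separator\n$benchmarkName\n$separator\n\n".getBytes))
    body
    output.foreach(_.write('\n'))
  }

  final def main(args: Array[String]): Unit = {
    if (System.getenv("SPARK_GENERATE_BENCHMARK_FILES") == "1") {
      val name = getClass.getSimpleName.stripSuffix("$")
      val file = new File(s"benchmarks/$name-results.txt")
      Option(file.getParentFile).foreach(_.mkdirs())
      output = Some(new FileOutputStream(file))
    }
    try runBenchmarkSuite(args) finally output.foreach(_.close())
  }
}
```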
