Repository: spark Updated Branches: refs/heads/master 720c94fe7 -> b9b54b1c8
[SPARK-21368][SQL] TPCDSQueryBenchmark can't refer to query files. ## What changes were proposed in this pull request? TPCDSQueryBenchmark packaged into a jar doesn't work with spark-submit. This is because references to query files in the jar file fail to resolve. ## How was this patch tested? Ran the benchmark. Author: sarutak <saru...@oss.nttdata.co.jp> Author: Kousuke Saruta <saru...@oss.nttdata.co.jp> Closes #18592 from sarutak/fix-tpcds-benchmark. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/b9b54b1c Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/b9b54b1c Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/b9b54b1c Branch: refs/heads/master Commit: b9b54b1c88fe499c3bafdaf85ca1c3e4490bc900 Parents: 720c94f Author: sarutak <saru...@oss.nttdata.co.jp> Authored: Tue Sep 12 10:49:46 2017 -0700 Committer: gatorsmile <gatorsm...@gmail.com> Committed: Tue Sep 12 10:49:46 2017 -0700 ---------------------------------------------------------------------- .../benchmark/TPCDSQueryBenchmark.scala | 19 ++---- .../TPCDSQueryBenchmarkArguments.scala | 69 ++++++++++++++++++++ 2 files changed, 74 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/b9b54b1c/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala index d2d0136..63d118c 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmark.scala @@ -17,8 +17,6 @@ package org.apache.spark.sql.execution.benchmark -import 
java.io.File - import org.apache.spark.SparkConf import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.TableIdentifier @@ -31,7 +29,7 @@ import org.apache.spark.util.Benchmark /** * Benchmark to measure TPCDS query performance. * To run this: - * spark-submit --class <this class> --jars <spark sql test jar> + * spark-submit --class <this class> <spark sql test jar> <TPCDS data location> */ object TPCDSQueryBenchmark { val conf = @@ -61,12 +59,10 @@ object TPCDSQueryBenchmark { } def tpcdsAll(dataLocation: String, queries: Seq[String]): Unit = { - require(dataLocation.nonEmpty, - "please modify the value of dataLocation to point to your local TPCDS data") val tableSizes = setupTables(dataLocation) queries.foreach { name => - val queryString = fileToString(new File(Thread.currentThread().getContextClassLoader - .getResource(s"tpcds/$name.sql").getFile)) + val queryString = resourceToString(s"tpcds/$name.sql", + classLoader = Thread.currentThread().getContextClassLoader) // This is an indirect hack to estimate the size of each query's input by traversing the // logical plan and adding up the sizes of all tables that appear in the plan. Note that this @@ -99,6 +95,7 @@ object TPCDSQueryBenchmark { } def main(args: Array[String]): Unit = { + val benchmarkArgs = new TPCDSQueryBenchmarkArguments(args) // List of all TPC-DS queries val tpcdsQueries = Seq( @@ -113,12 +110,6 @@ object TPCDSQueryBenchmark { "q81", "q82", "q83", "q84", "q85", "q86", "q87", "q88", "q89", "q90", "q91", "q92", "q93", "q94", "q95", "q96", "q97", "q98", "q99") - // In order to run this benchmark, please follow the instructions at - // https://github.com/databricks/spark-sql-perf/blob/master/README.md to generate the TPCDS data - // locally (preferably with a scale factor of 5 for benchmarking). Thereafter, the value of - // dataLocation below needs to be set to the location where the generated data is stored. 
- val dataLocation = "" - - tpcdsAll(dataLocation, queries = tpcdsQueries) + tpcdsAll(benchmarkArgs.dataLocation, queries = tpcdsQueries) } } http://git-wip-us.apache.org/repos/asf/spark/blob/b9b54b1c/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala ---------------------------------------------------------------------- diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala new file mode 100644 index 0000000..8edc77b --- /dev/null +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/TPCDSQueryBenchmarkArguments.scala @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.execution.benchmark + +class TPCDSQueryBenchmarkArguments(val args: Array[String]) { + var dataLocation: String = null + + parseArgs(args.toList) + validateArguments() + + private def parseArgs(inputArgs: List[String]): Unit = { + var args = inputArgs + + while(args.nonEmpty) { + args match { + case ("--data-location") :: value :: tail => + dataLocation = value + args = tail + + case _ => + // scalastyle:off println + System.err.println("Unknown/unsupported param " + args) + // scalastyle:on println + printUsageAndExit(1) + } + } + } + + private def printUsageAndExit(exitCode: Int): Unit = { + // scalastyle:off + System.err.println(""" + |Usage: spark-submit --class <this class> <spark sql test jar> [Options] + |Options: + | --data-location Path to TPCDS data + | + |------------------------------------------------------------------------------------------------------------------ + |In order to run this benchmark, please follow the instructions at + |https://github.com/databricks/spark-sql-perf/blob/master/README.md + |to generate the TPCDS data locally (preferably with a scale factor of 5 for benchmarking). + |Thereafter, the value of <TPCDS data location> needs to be set to the location where the generated data is stored. + """.stripMargin) + // scalastyle:on + System.exit(exitCode) + } + + private def validateArguments(): Unit = { + if (dataLocation == null) { + // scalastyle:off println + System.err.println("Must specify a data location") + // scalastyle:on println + printUsageAndExit(-1) + } + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org