Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/1713#discussion_r178245698 --- Diff: examples/spark2/src/main/scala/org/apache/carbondata/benchmark/ConcurrentQueryBenchmark.scala --- @@ -0,0 +1,631 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.benchmark + +import java.io.File +import java.text.SimpleDateFormat +import java.util +import java.util.Date +import java.util.concurrent.{Callable, Executors, Future, TimeUnit} + +import scala.util.Random + +import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession} +import org.apache.spark.sql.types._ + +import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonVersionConstants} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} + +// scalastyle:off println +/** + * Test concurrent query performance of CarbonData + * + * This benchmark will print out some information: + * 1.Environment information + * 2.Parameters information + * 3.concurrent query performance result using parquet format + * 4.concurrent query performance result using CarbonData format + * + * This benchmark runs in local mode by default; + * users can change 'runInLocal' to false if they want to run in a cluster, + * and can change variables like: + * + * spark-submit \ --class org.apache.carbondata.benchmark.ConcurrentQueryBenchmark \ --master yarn \ --deploy-mode client \ --driver-memory 16g \ --executor-cores 4 \ --executor-memory 24g \ --num-executors 3 \ concurrencyTest.jar \ totalNum threadNum taskNum resultIsEmpty runInLocal generateFile deleteFile * see details in the initParameters method of this benchmark */ +object ConcurrentQueryBenchmark { + + // number of rows of data to generate + var totalNum = 1 * 1000 * 1000 + // the size of the thread pool + var threadNum = 16 + // number of Spark SQL query tasks + var taskNum = 100 + // whether the result is empty; if true, the result is empty + var resultIsEmpty = true + // the store path of task details + var path: String = "/tmp/carbondata" + // whether to run in local or cluster mode + var runInLocal = true + // whether to generate new files + var generateFile = true + // whether to delete files + var deleteFile = true + + val cardinalityId = 100 * 1000 * 1000 + val cardinalityCity =
6 + + def parquetTableName: String = "Num" + totalNum + "_" + "comparetest_parquet" + + def orcTableName: String = "Num" + totalNum + "_" + "comparetest_orc" + + def carbonTableName(version: String): String = + "Num" + totalNum + "_" + s"comparetest_carbonV$version" + + // Table schema: + // +-------------+-----------+-------------+-------------+------------+ + // | Column name | Data type | Cardinality | Column type | Dictionary | + // +-------------+-----------+-------------+-------------+------------+ + // | id | string | 100,000,000 | dimension | no | + // +-------------+-----------+-------------+-------------+------------+ + // | city | string | 6 | dimension | yes | + // +-------------+-----------+-------------+-------------+------------+ + // | country | string | 6 | dimension | yes | + // +-------------+-----------+-------------+-------------+------------+ + // | planet | string | 10,007 | dimension | yes | + // +-------------+-----------+-------------+-------------+------------+ + // | m1 | short | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m2 | int | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m3 | big int | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m4 | double | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m5 | decimal | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + /** + * generate a DataFrame with the above table schema + * + * @param spark SparkSession + * @return DataFrame of test data + */ + private def generateDataFrame(spark: SparkSession): DataFrame = { --- End diff -- It is better to move this to a separate class in the benchmark package; it can be reused in the two benchmark classes
---