Github user jackylk commented on a diff in the pull request: https://github.com/apache/carbondata/pull/1713#discussion_r178245698 --- Diff: examples/spark2/src/main/scala/org/apache/carbondata/benchmark/ConcurrentQueryBenchmark.scala --- @@ -0,0 +1,631 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.carbondata.benchmark + +import java.io.File +import java.text.SimpleDateFormat +import java.util +import java.util.Date +import java.util.concurrent.{Callable, Executors, Future, TimeUnit} + +import scala.util.Random + +import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession} +import org.apache.spark.sql.types._ + +import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonVersionConstants} +import org.apache.carbondata.core.util.{CarbonProperties, CarbonUtil} + +// scalastyle:off println +/** + * Test concurrent query performance of CarbonData + * + * This benchmark will print out some information: + * 1.Environment information + * 2.Parameters information + * 3.concurrent query performance result using parquet format + * 4.concurrent query performance result using CarbonData format + * + * This benchmark runs in local mode by default; + * users can change 'runInLocal' to false if they want to run in a cluster, + * and can change variables like: + * + * spark-submit \ --class org.apache.carbondata.benchmark.ConcurrentQueryBenchmark \ --master yarn \ --deploy-mode client \ --driver-memory 16g \ --executor-cores 4 \ --executor-memory 24g \ --num-executors 3 \ concurrencyTest.jar \ totalNum threadNum taskNum resultIsEmpty runInLocal generateFile deleteFile * see details in the initParameters method of this benchmark */ +object ConcurrentQueryBenchmark { + + // number of rows of data to generate + var totalNum = 1 * 1000 * 1000 + // the size of the thread pool + var threadNum = 16 + // number of Spark SQL query tasks + var taskNum = 100 + // whether the result is empty; if true, the result is empty + var resultIsEmpty = true + // the store path of task details + var path: String = "/tmp/carbondata" + // whether to run in local or cluster mode + var runInLocal = true + // whether to generate new files + var generateFile = true + // whether to delete files + var deleteFile = true + + val cardinalityId = 100 * 1000 * 1000 + val cardinalityCity =
6 + + def parquetTableName: String = "Num" + totalNum + "_" + "comparetest_parquet" + + def orcTableName: String = "Num" + totalNum + "_" + "comparetest_orc" + + def carbonTableName(version: String): String = + "Num" + totalNum + "_" + s"comparetest_carbonV$version" + + // Table schema: + // +-------------+-----------+-------------+-------------+------------+ + // | Column name | Data type | Cardinality | Column type | Dictionary | + // +-------------+-----------+-------------+-------------+------------+ + // | id | string | 100,000,000 | dimension | no | + // +-------------+-----------+-------------+-------------+------------+ + // | city | string | 6 | dimension | yes | + // +-------------+-----------+-------------+-------------+------------+ + // | country | string | 6 | dimension | yes | + // +-------------+-----------+-------------+-------------+------------+ + // | planet | string | 10,007 | dimension | yes | + // +-------------+-----------+-------------+-------------+------------+ + // | m1 | short | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m2 | int | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m3 | big int | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m4 | double | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + // | m5 | decimal | NA | measure | no | + // +-------------+-----------+-------------+-------------+------------+ + /** + * generate a DataFrame with the above table schema + * + * @param spark SparkSession + * @return DataFrame of test data + */ + private def generateDataFrame(spark: SparkSession): DataFrame = { --- End diff -- It is better to move this to a separate class in the benchmark package; it can be reused in the two benchmark classes
---