Github user dongjoon-hyun commented on a diff in the pull request:

    https://github.com/apache/spark/pull/22920#discussion_r230546330
  
    --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/datasources/json/JsonBenchmark.scala ---
    @@ -158,26 +166,78 @@ object JSONBenchmark extends SqlBasedBenchmark {
     
           val ds = spark.read.schema(schema).json(path.getAbsolutePath)
     
    -      benchmark.addCase(s"Select $colsNum columns + count()", 3) { _ =>
    +      benchmark.addCase(s"Select $colsNum columns + count()", numIters) { 
_ =>
             ds.select("*").filter((_: Row) => true).count()
           }
    -      benchmark.addCase(s"Select 1 column + count()", 3) { _ =>
    +      benchmark.addCase(s"Select 1 column + count()", numIters) { _ =>
             ds.select($"col1").filter((_: Row) => true).count()
           }
    -      benchmark.addCase(s"count()", 3) { _ =>
    +      benchmark.addCase(s"count()", numIters) { _ =>
             ds.count()
           }
     
           benchmark.run()
         }
       }
     
    +  def jsonParserCreation(rowsNum: Int, numIters: Int): Unit = {
    +    val benchmark = new Benchmark("creation of JSON parser per line", rowsNum, output = output)
    +
    +    withTempPath { path =>
    +      prepareDataInfo(benchmark)
    +
    +      val shortColumnPath = path.getAbsolutePath + "/short"
    +      val shortSchema = writeShortColumn(shortColumnPath, rowsNum)
    +
    +      val wideColumnPath = path.getAbsolutePath + "/wide"
    +      val wideSchema = writeWideColumn(wideColumnPath, rowsNum)
    +
    +      benchmark.addCase("Short column without encoding", numIters) { _ =>
    +        spark.read
    +          .schema(shortSchema)
    +          .json(shortColumnPath)
    +          .filter((_: Row) => true)
    +          .count()
    +      }
    +
    +      benchmark.addCase("Short column with UTF-8", numIters) { _ =>
    +        spark.read
    +          .option("encoding", "UTF-8")
    +          .schema(shortSchema)
    +          .json(shortColumnPath)
    +          .filter((_: Row) => true)
    +          .count()
    +      }
    +
    +      benchmark.addCase("Wide column without encoding", numIters) { _ =>
    +        spark.read
    +          .schema(wideSchema)
    +          .json(wideColumnPath)
    +          .filter((_: Row) => true)
    +          .count()
    +      }
    +
    +      benchmark.addCase("Wide column with UTF-8", numIters) { _ =>
    +        spark.read
    +          .option("encoding", "UTF-8")
    +          .schema(wideSchema)
    +          .json(wideColumnPath)
    +          .filter((_: Row) => true)
    +          .count()
    +      }
    +
    +      benchmark.run()
    +    }
    +  }
    +
       override def runBenchmarkSuite(mainArgs: Array[String]): Unit = {
    +    val numIters = 2
    --- End diff ---
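
    As a hedged sketch only: one plausible shape for the writeShortColumn and
    writeWideColumn helpers referenced in the diff. Their real definitions live
    elsewhere in JsonBenchmark.scala, so the bodies and the "col1" column name
    below are assumptions, not the PR's code (spark is the SparkSession
    provided by SqlBasedBenchmark):

        import org.apache.spark.sql.types.{StringType, StructType}

        // Assumed helper: writes one tiny JSON object per line, so that
        // per-line parser creation dominates the cost of parsing itself.
        def writeShortColumn(path: String, rowsNum: Int): StructType = {
          import spark.implicits._
          spark.range(rowsNum)
            .map(i => s"""{"col1": "$i"}""")
            .write.text(path)
          new StructType().add("col1", StringType)
        }

        // Assumed helper: writes one long string value per line, so that
        // each created parser has substantially more bytes to consume.
        def writeWideColumn(path: String, rowsNum: Int): StructType = {
          import spark.implicits._
          spark.range(rowsNum)
            .map(i => s"""{"col1": "${"x" * 1000}"}""")
            .write.text(path)
          new StructType().add("col1", StringType)
        }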
    
    Thank you for updating, @MaxGekk.
    Do we have a reason to decrease this value from 3 to 2 in this PR?
    If this is to reduce the running time, let's keep the original value.
    This benchmark is not executed frequently.
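
    For context, numIters is passed straight to Benchmark.addCase (see the
    diff above), so it sets how many times each case body is timed. A minimal
    sketch of that flow, assuming the internal
    org.apache.spark.benchmark.Benchmark API as used in the diff; the
    "no-op baseline" case is a hypothetical placeholder, not PR code:

        import org.apache.spark.benchmark.Benchmark

        val rowsNum = 1000 * 1000
        val numIters = 3  // the original value this comment suggests keeping

        // The second argument is the number of values processed per
        // iteration; the report uses it to derive the rate column.
        val benchmark = new Benchmark("creation of JSON parser per line", rowsNum)

        benchmark.addCase("no-op baseline", numIters) { _ =>
          () // this body runs, and is timed, numIters times
        }

        benchmark.run()  // prints best/avg timings per case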

