viirya commented on a change in pull request #32473: URL: https://github.com/apache/spark/pull/32473#discussion_r628861781
########## File path: sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/BloomFilterBenchmark.scala ########## @@ -81,8 +80,51 @@ object BloomFilterBenchmark extends SqlBasedBenchmark { } } + private def writeParquetBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + runBenchmark(s"Parquet Write") { + val benchmark = new Benchmark(s"Write ${scaleFactor}M rows", N, output = output) + benchmark.addCase("Without bloom filter") { _ => + df.write.mode("overwrite").parquet(path + "/withoutBF") + } + benchmark.addCase("With bloom filter") { _ => + df.write.mode("overwrite") + .option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .parquet(path + "/withBF") + } + benchmark.run() + } + } + } + + private def readParquetBenchmark(): Unit = { + withTempPath { dir => + val path = dir.getCanonicalPath + + df.write.parquet(path + "/withoutBF") + df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) + .parquet(path + "/withBF") Review comment: You need to set the row group size, e.g. ```scala df.write.option(ParquetOutputFormat.BLOOM_FILTER_ENABLED + "#value", true) .option("parquet.block.size", 1024 * 1024) .parquet(path + "/withBF") ``` Then you will see the benchmark difference. ``` [info] Running benchmark: Read a row from 100M rows [info] Running case: Without bloom filter [info] Stopped after 3 iterations, 2674 ms [info] Running case: With bloom filter [info] Stopped after 5 iterations, 2383 ms [info] OpenJDK 64-Bit Server VM 1.8.0_265-b01 on Mac OS X 10.16 [info] Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz [info] Read a row from 100M rows: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative [info] ------------------------------------------------------------------------------------------------------------------------ [info] Without bloom filter 872 892 19 114.7 8.7 1.0X [info] With bloom filter 473 477 3 211.4 4.7 1.8X ``` -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org