Github user cloud-fan commented on a diff in the pull request: https://github.com/apache/spark/pull/22661#discussion_r224767594 --- Diff: sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/JoinBenchmark.scala --- @@ -19,229 +19,163 @@ package org.apache.spark.sql.execution.benchmark import org.apache.spark.sql.execution.joins._ import org.apache.spark.sql.functions._ +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.IntegerType /** - * Benchmark to measure performance for aggregate primitives. - * To run this: - * build/sbt "sql/test-only *benchmark.JoinBenchmark" - * - * Benchmarks in this file are skipped in normal builds. + * Benchmark to measure performance for joins. + * To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> --jars <spark core test jar> <spark sql test jar> + * 2. build/sbt "sql/test:runMain <this class>" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/test:runMain <this class>" + * Results will be written to "benchmarks/JoinBenchmark-results.txt". 
+ * }}} */ -class JoinBenchmark extends BenchmarkWithCodegen { +object JoinBenchmark extends SqlBasedBenchmark { - ignore("broadcast hash join, long key") { + def broadcastHashJoinLongKey(): Unit = { val N = 20 << 20 val M = 1 << 16 - val dim = broadcast(sparkSession.range(M).selectExpr("id as k", "cast(id as string) as v")) - runBenchmark("Join w long", N) { - val df = sparkSession.range(N).join(dim, (col("id") % M) === col("k")) + val dim = broadcast(spark.range(M).selectExpr("id as k", "cast(id as string) as v")) + codegenBenchmark("Join w long", N) { + val df = spark.range(N).join(dim, (col("id") % M) === col("k")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) df.count() } - - /* - Java HotSpot(TM) 64-Bit Server VM 1.7.0_60-b19 on Mac OS X 10.9.5 - Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - Join w long: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - ------------------------------------------------------------------------------------------- - Join w long codegen=false 3002 / 3262 7.0 143.2 1.0X - Join w long codegen=true 321 / 371 65.3 15.3 9.3X - */ } - ignore("broadcast hash join, long key with duplicates") { + def broadcastHashJoinLongKeyWithDuplicates(): Unit = { val N = 20 << 20 val M = 1 << 16 - - val dim = broadcast(sparkSession.range(M).selectExpr("id as k", "cast(id as string) as v")) - runBenchmark("Join w long duplicated", N) { - val dim = broadcast(sparkSession.range(M).selectExpr("cast(id/10 as long) as k")) - val df = sparkSession.range(N).join(dim, (col("id") % M) === col("k")) + val dim = broadcast(spark.range(M).selectExpr("cast(id/10 as long) as k")) + codegenBenchmark("Join w long duplicated", N) { + val df = spark.range(N).join(dim, (col("id") % M) === col("k")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) df.count() } - - /* - *Java HotSpot(TM) 64-Bit Server VM 1.7.0_60-b19 on Mac OS X 10.9.5 - *Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - *Join w long 
duplicated: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - *------------------------------------------------------------------------------------------- - *Join w long duplicated codegen=false 3446 / 3478 6.1 164.3 1.0X - *Join w long duplicated codegen=true 322 / 351 65.2 15.3 10.7X - */ } - ignore("broadcast hash join, two int key") { + def broadcastHashJoinTwoIntKey(): Unit = { val N = 20 << 20 val M = 1 << 16 - val dim2 = broadcast(sparkSession.range(M) + val dim2 = broadcast(spark.range(M) .selectExpr("cast(id as int) as k1", "cast(id as int) as k2", "cast(id as string) as v")) - runBenchmark("Join w 2 ints", N) { - val df = sparkSession.range(N).join(dim2, + codegenBenchmark("Join w 2 ints", N) { + val df = spark.range(N).join(dim2, (col("id") % M).cast(IntegerType) === col("k1") && (col("id") % M).cast(IntegerType) === col("k2")) assert(df.queryExecution.sparkPlan.find(_.isInstanceOf[BroadcastHashJoinExec]).isDefined) df.count() } - - /* - *Java HotSpot(TM) 64-Bit Server VM 1.7.0_60-b19 on Mac OS X 10.9.5 - *Intel(R) Core(TM) i7-4558U CPU @ 2.80GHz - *Join w 2 ints: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative - *------------------------------------------------------------------------------------------- - *Join w 2 ints codegen=false 4426 / 4501 4.7 211.1 1.0X - *Join w 2 ints codegen=true 791 / 818 26.5 37.7 5.6X - */ --- End diff -- This seems to be caused by the bug fix: https://github.com/apache/spark/pull/15390 So the performance is reasonable.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org