Github user yucai commented on a diff in the pull request: https://github.com/apache/spark/pull/22847#discussion_r229919857 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala --- @@ -812,6 +812,17 @@ object SQLConf { .intConf .createWithDefault(65535) + val CODEGEN_METHOD_SPLIT_THRESHOLD = buildConf("spark.sql.codegen.methodSplitThreshold") + .internal() + .doc("The threshold of source code length without comment of a single Java function by " + + "codegen to be split. When the generated Java function source code exceeds this threshold" + + ", it will be split into multiple small functions. We can't know how many bytecode will " + + "be generated, so use the code length as metric. A function's bytecode should not go " + + "beyond 8KB, otherwise it will not be JITted; it also should not be too small, otherwise " + + "there will be many function calls.") + .intConf --- End diff -- Seems like long alias names have no influence. ``` [info] Java HotSpot(TM) 64-Bit Server VM 1.8.0_162-b12 on Mac OS X 10.13.6 [info] Intel(R) Core(TM) i7-7820HQ CPU @ 2.90GHz [info] projection on wide table: Best/Avg Time(ms) Rate(M/s) Per Row(ns) Relative [info] ------------------------------------------------------------------------------------------------ [info] split threshold 10 6512 / 6736 0.2 6210.4 1.0X [info] split threshold 100 5730 / 6329 0.2 5464.9 1.1X [info] split threshold 1024 3119 / 3184 0.3 2974.6 2.1X [info] split threshold 2048 2981 / 3100 0.4 2842.9 2.2X [info] split threshold 4096 3289 / 3379 0.3 3136.6 2.0X [info] split threshold 8196 4307 / 4338 0.2 4108.0 1.5X [info] split threshold 65536 29147 / 30212 0.0 27797.0 0.2X ``` No `averylongprefixrepeatedmultipletimes` in the **expression code gen**: ``` /* 047 */ private void createExternalRow_0_8(InternalRow i, Object[] values_0) { /* 048 */ /* 049 */ // input[80, bigint, false] /* 050 */ long value_81 = i.getLong(80); /* 051 */ if (false) { /* 052 */ values_0[80] = null; /* 053 */ } else 
{ /* 054 */ values_0[80] = value_81; /* 055 */ } /* 056 */ /* 057 */ // input[81, bigint, false] /* 058 */ long value_82 = i.getLong(81); /* 059 */ if (false) { /* 060 */ values_0[81] = null; /* 061 */ } else { /* 062 */ values_0[81] = value_82; /* 063 */ } /* 064 */ /* 065 */ // input[82, bigint, false] /* 066 */ long value_83 = i.getLong(82); /* 067 */ if (false) { /* 068 */ values_0[82] = null; /* 069 */ } else { /* 070 */ values_0[82] = value_83; /* 071 */ } /* 072 */ ... ``` My benchmark: ``` object WideTableBenchmark extends SqlBasedBenchmark { override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { runBenchmark("projection on wide table") { val N = 1 << 20 val df = spark.range(N) val columns = (0 until 400).map{ i => s"id as averylongprefixrepeatedmultipletimes_id$i"} val benchmark = new Benchmark("projection on wide table", N, output = output) Seq("10", "100", "1024", "2048", "4096", "8196", "65536").foreach { n => benchmark.addCase(s"split threshold $n", numIters = 5) { iter => withSQLConf("spark.testing.codegen.splitThreshold" -> n) { df.selectExpr(columns: _*).foreach(identity(_)) } } } benchmark.run() } } } ``` Will keep benchmarking with more complex expressions.
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org