Repository: spark Updated Branches: refs/heads/branch-2.0 ef0253ff6 -> c4cebd572
[SPARK-16238] Metrics for generated method and class bytecode size ## What changes were proposed in this pull request? This extends SPARK-15860 to include metrics for the actual bytecode size of janino-generated methods. They can be accessed in the same way as any other codahale metric, e.g. ``` scala> org.apache.spark.metrics.source.CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.getSnapshot().getValues() res7: Array[Long] = Array(532, 532, 532, 542, 1479, 2670, 3585, 3585) scala> org.apache.spark.metrics.source.CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.getSnapshot().getValues() res8: Array[Long] = Array(5, 5, 5, 5, 10, 10, 10, 10, 15, 15, 15, 38, 63, 79, 88, 94, 94, 94, 132, 132, 165, 165, 220, 220) ``` ## How was this patch tested? Small unit test, also verified manually that the performance impact is minimal (<10%). hvanhovell Author: Eric Liang <e...@databricks.com> Closes #13934 from ericl/spark-16238. (cherry picked from commit 23c58653f900bfb71ef2b3186a95ad2562c33969) Signed-off-by: Reynold Xin <r...@databricks.com> Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c4cebd57 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c4cebd57 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c4cebd57 Branch: refs/heads/branch-2.0 Commit: c4cebd5725e6d8ade8c0a02652e251d04903da72 Parents: ef0253f Author: Eric Liang <e...@databricks.com> Authored: Wed Jun 29 15:07:32 2016 -0700 Committer: Reynold Xin <r...@databricks.com> Committed: Wed Jun 29 15:07:38 2016 -0700 ---------------------------------------------------------------------- .../spark/metrics/source/StaticSources.scala | 12 ++++++ .../expressions/codegen/CodeGenerator.scala | 40 +++++++++++++++++++- .../expressions/CodeGenerationSuite.scala | 4 ++ 3 files changed, 55 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- 
http://git-wip-us.apache.org/repos/asf/spark/blob/c4cebd57/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala ---------------------------------------------------------------------- diff --git a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala index 6819222..6bba259 100644 --- a/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala +++ b/core/src/main/scala/org/apache/spark/metrics/source/StaticSources.scala @@ -47,4 +47,16 @@ object CodegenMetrics extends Source { * Histogram of the time it took to compile source code text (in milliseconds). */ val METRIC_COMPILATION_TIME = metricRegistry.histogram(MetricRegistry.name("compilationTime")) + + /** + * Histogram of the bytecode size of each class generated by CodeGenerator. + */ + val METRIC_GENERATED_CLASS_BYTECODE_SIZE = + metricRegistry.histogram(MetricRegistry.name("generatedClassSize")) + + /** + * Histogram of the bytecode size of each method in classes generated by CodeGenerator. 
+ */ + val METRIC_GENERATED_METHOD_BYTECODE_SIZE = + metricRegistry.histogram(MetricRegistry.name("generatedMethodSize")) } http://git-wip-us.apache.org/repos/asf/spark/blob/c4cebd57/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala index 6392ff4..16fb1f6 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/codegen/CodeGenerator.scala @@ -17,11 +17,16 @@ package org.apache.spark.sql.catalyst.expressions.codegen +import java.io.ByteArrayInputStream +import java.util.{Map => JavaMap} + +import scala.collection.JavaConverters._ import scala.collection.mutable import scala.collection.mutable.ArrayBuffer import com.google.common.cache.{CacheBuilder, CacheLoader} -import org.codehaus.janino.ClassBodyEvaluator +import org.codehaus.janino.{ByteArrayClassLoader, ClassBodyEvaluator, SimpleCompiler} +import org.codehaus.janino.util.ClassFile import scala.language.existentials import org.apache.spark.SparkEnv @@ -876,6 +881,7 @@ object CodeGenerator extends Logging { try { evaluator.cook("generated.java", code.body) + recordCompilationStats(evaluator) } catch { case e: Exception => val msg = s"failed to compile: $e\n$formatted" @@ -886,6 +892,38 @@ object CodeGenerator extends Logging { } /** + * Records the generated class and method bytecode sizes by inspecting janino private fields. + */ + private def recordCompilationStats(evaluator: ClassBodyEvaluator): Unit = { + // First retrieve the generated classes. 
+ val classes = { + val resultField = classOf[SimpleCompiler].getDeclaredField("result") + resultField.setAccessible(true) + val loader = resultField.get(evaluator).asInstanceOf[ByteArrayClassLoader] + val classesField = loader.getClass.getDeclaredField("classes") + classesField.setAccessible(true) + classesField.get(loader).asInstanceOf[JavaMap[String, Array[Byte]]].asScala + } + + // Then walk the classes to get at the method bytecode. + val codeAttr = Utils.classForName("org.codehaus.janino.util.ClassFile$CodeAttribute") + val codeAttrField = codeAttr.getDeclaredField("code") + codeAttrField.setAccessible(true) + classes.foreach { case (_, classBytes) => + CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.update(classBytes.length) + val cf = new ClassFile(new ByteArrayInputStream(classBytes)) + cf.methodInfos.asScala.foreach { method => + method.getAttributes().foreach { a => + if (a.getClass.getName == codeAttr.getName) { + CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.update( + codeAttrField.get(a).asInstanceOf[Array[Byte]].length) + } + } + } + } + } + + /** * A cache of generated classes. * * From the Guava Docs: A Cache is similar to ConcurrentMap, but not quite the same. 
The most http://git-wip-us.apache.org/repos/asf/spark/blob/c4cebd57/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala index 60dd03f..8ea8f61 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CodeGenerationSuite.scala @@ -53,9 +53,13 @@ class CodeGenerationSuite extends SparkFunSuite with ExpressionEvalHelper { test("metrics are recorded on compile") { val startCount1 = CodegenMetrics.METRIC_COMPILATION_TIME.getCount() val startCount2 = CodegenMetrics.METRIC_SOURCE_CODE_SIZE.getCount() + val startCount3 = CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.getCount() + val startCount4 = CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.getCount() GenerateOrdering.generate(Add(Literal(123), Literal(1)).asc :: Nil) assert(CodegenMetrics.METRIC_COMPILATION_TIME.getCount() == startCount1 + 1) assert(CodegenMetrics.METRIC_SOURCE_CODE_SIZE.getCount() == startCount2 + 1) + assert(CodegenMetrics.METRIC_GENERATED_CLASS_BYTECODE_SIZE.getCount() > startCount3) + assert(CodegenMetrics.METRIC_GENERATED_METHOD_BYTECODE_SIZE.getCount() > startCount4) } test("SPARK-8443: split wide projections into blocks due to JVM code size limit") { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org