This is an automated email from the ASF dual-hosted git repository.

joshrosen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new ec032ce  [SPARK-28112][TEST] Fix Kryo exception perf. bottleneck in tests due to absence of ML/MLlib classes
ec032ce is described below

commit ec032cea4f91a5ee6ce51e2216de23104486a053
Author: Josh Rosen <rosenvi...@gmail.com>
AuthorDate: Wed Jun 19 19:06:22 2019 -0700

    [SPARK-28112][TEST] Fix Kryo exception perf. bottleneck in tests due to absence of ML/MLlib classes
    
    ## What changes were proposed in this pull request?
    
    In a nutshell, it looks like the absence of ML / MLlib classes on the classpath causes
    KryoSerializer to throw and catch ClassNotFoundExceptions whenever a new serializer is
    instantiated in newInstance(). This isn't a performance problem in production (since MLlib
    is on the classpath there), but it is a huge issue in tests and appears to account for an
    enormous amount of test time.
    
    We can address this by performing the class existence checks once, caching the filtered
    list of loadable classes in the KryoSerializer companion object, and re-using that list in
    newInstance() calls instead of repeating the checks (and the resulting
    ClassNotFoundExceptions) every time.
    
    ## How was this patch tested?
    
    The existing tests.
    
    Authored-by: Josh Rosen <joshrosendatabricks.com>
    
    Closes #24916 from gatorsmile/kryoException.
    
    Lead-authored-by: Josh Rosen <rosenvi...@gmail.com>
    Co-authored-by: gatorsmile <gatorsm...@gmail.com>
    Signed-off-by: Josh Rosen <rosenvi...@gmail.com>
---
 .../apache/spark/serializer/KryoSerializer.scala  | 78 +++++++++++++---------
 1 file changed, 45 insertions(+), 33 deletions(-)

diff --git a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
index 3969106..20774c8 100644
--- a/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
+++ b/core/src/main/scala/org/apache/spark/serializer/KryoSerializer.scala
@@ -212,40 +212,8 @@ class KryoSerializer(conf: SparkConf)
     // We can't load those class directly in order to avoid unnecessary jar dependencies.
     // We load them safely, ignore it if the class not found.
-    Seq(
-      "org.apache.spark.sql.catalyst.expressions.UnsafeRow",
-      "org.apache.spark.sql.catalyst.expressions.UnsafeArrayData",
-      "org.apache.spark.sql.catalyst.expressions.UnsafeMapData",
-
-      "org.apache.spark.ml.attribute.Attribute",
-      "org.apache.spark.ml.attribute.AttributeGroup",
-      "org.apache.spark.ml.attribute.BinaryAttribute",
-      "org.apache.spark.ml.attribute.NominalAttribute",
-      "org.apache.spark.ml.attribute.NumericAttribute",
-
-      "org.apache.spark.ml.feature.Instance",
-      "org.apache.spark.ml.feature.LabeledPoint",
-      "org.apache.spark.ml.feature.OffsetInstance",
-      "org.apache.spark.ml.linalg.DenseMatrix",
-      "org.apache.spark.ml.linalg.DenseVector",
-      "org.apache.spark.ml.linalg.Matrix",
-      "org.apache.spark.ml.linalg.SparseMatrix",
-      "org.apache.spark.ml.linalg.SparseVector",
-      "org.apache.spark.ml.linalg.Vector",
-      "org.apache.spark.ml.stat.distribution.MultivariateGaussian",
-      "org.apache.spark.ml.tree.impl.TreePoint",
-      "org.apache.spark.mllib.clustering.VectorWithNorm",
-      "org.apache.spark.mllib.linalg.DenseMatrix",
-      "org.apache.spark.mllib.linalg.DenseVector",
-      "org.apache.spark.mllib.linalg.Matrix",
-      "org.apache.spark.mllib.linalg.SparseMatrix",
-      "org.apache.spark.mllib.linalg.SparseVector",
-      "org.apache.spark.mllib.linalg.Vector",
-      "org.apache.spark.mllib.regression.LabeledPoint",
-      "org.apache.spark.mllib.stat.distribution.MultivariateGaussian"
-    ).foreach { name =>
+    KryoSerializer.loadableSparkClasses.foreach { clazz =>
       try {
-        val clazz = Utils.classForName(name)
         kryo.register(clazz)
       } catch {
         case NonFatal(_) => // do nothing
@@ -516,6 +484,50 @@ private[serializer] object KryoSerializer {
       }
     }
   )
+
+  // classForName() is expensive in case the class is not found, so we filter the list of
+  // SQL / ML / MLlib classes once and then re-use that filtered list in newInstance() calls.
+  private lazy val loadableSparkClasses: Seq[Class[_]] = {
+    Seq(
+      "org.apache.spark.sql.catalyst.expressions.UnsafeRow",
+      "org.apache.spark.sql.catalyst.expressions.UnsafeArrayData",
+      "org.apache.spark.sql.catalyst.expressions.UnsafeMapData",
+
+      "org.apache.spark.ml.attribute.Attribute",
+      "org.apache.spark.ml.attribute.AttributeGroup",
+      "org.apache.spark.ml.attribute.BinaryAttribute",
+      "org.apache.spark.ml.attribute.NominalAttribute",
+      "org.apache.spark.ml.attribute.NumericAttribute",
+
+      "org.apache.spark.ml.feature.Instance",
+      "org.apache.spark.ml.feature.LabeledPoint",
+      "org.apache.spark.ml.feature.OffsetInstance",
+      "org.apache.spark.ml.linalg.DenseMatrix",
+      "org.apache.spark.ml.linalg.DenseVector",
+      "org.apache.spark.ml.linalg.Matrix",
+      "org.apache.spark.ml.linalg.SparseMatrix",
+      "org.apache.spark.ml.linalg.SparseVector",
+      "org.apache.spark.ml.linalg.Vector",
+      "org.apache.spark.ml.stat.distribution.MultivariateGaussian",
+      "org.apache.spark.ml.tree.impl.TreePoint",
+      "org.apache.spark.mllib.clustering.VectorWithNorm",
+      "org.apache.spark.mllib.linalg.DenseMatrix",
+      "org.apache.spark.mllib.linalg.DenseVector",
+      "org.apache.spark.mllib.linalg.Matrix",
+      "org.apache.spark.mllib.linalg.SparseMatrix",
+      "org.apache.spark.mllib.linalg.SparseVector",
+      "org.apache.spark.mllib.linalg.Vector",
+      "org.apache.spark.mllib.regression.LabeledPoint",
+      "org.apache.spark.mllib.stat.distribution.MultivariateGaussian"
+    ).flatMap { name =>
+      try {
+        Some[Class[_]](Utils.classForName(name))
+      } catch {
+        case NonFatal(_) => None // do nothing
+        case _: NoClassDefFoundError if Utils.isTesting => None // See SPARK-23422.
+      }
+    }
+  }
 }
 
 /**

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org
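The caching pattern this patch introduces (resolve optional classes once in a lazy val, keep only the ones that loaded, and re-use that list on every later call) can be sketched outside of Spark. Below is a minimal, self-contained Scala illustration; the candidate class names and the Registry stand-in are hypothetical and only mirror the shape of kryo.register(clazz) in the patch, they are not part of Spark's API.

import scala.util.control.NonFatal

object OptionalClassRegistry {

  // Names of classes that may or may not be on the classpath.
  // Only the java.* entry is guaranteed to exist; the other is a made-up example.
  private val candidateClassNames = Seq(
    "java.util.ArrayList",
    "org.example.optional.MissingHelper"
  )

  // Resolved once, on first access; classes that fail to load are filtered out
  // here and never probed (and never throw) again.
  lazy val loadableClasses: Seq[Class[_]] = candidateClassNames.flatMap { name =>
    try {
      Some(Class.forName(name))
    } catch {
      case NonFatal(_) => None           // e.g. ClassNotFoundException: skip it
      case _: NoClassDefFoundError => None // linkage errors are not covered by NonFatal
    }
  }
}

// Stand-in for the per-call registration work (kryo.register(clazz) in the patch).
class Registry {
  def register(clazz: Class[_]): Unit = println(s"registered ${clazz.getName}")
}

object Demo extends App {
  // Each "new instance" re-uses the cached list instead of re-resolving class names.
  (1 to 3).foreach { _ =>
    val registry = new Registry
    OptionalClassRegistry.loadableClasses.foreach(registry.register)
  }
}

Because loadableClasses lives on a singleton object and is a lazy val, the slow exception-throwing path is taken at most once per class name for the lifetime of the JVM, which is why the repeated newInstance()-style calls stop paying the ClassNotFoundException cost.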