spark git commit: [SPARK-24216][SQL] Spark TypedAggregateExpression uses getSimpleName that is not safe in scala

2018-06-12 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/master f0ef1b311 -> cc88d7fad


[SPARK-24216][SQL] Spark TypedAggregateExpression uses getSimpleName that is 
not safe in scala

## What changes were proposed in this pull request?

When user create a aggregator object in scala and pass the aggregator to Spark 
Dataset's agg() method, Spark's will initialize TypedAggregateExpression with 
the nodeName field as aggregator.getClass.getSimpleName. However, getSimpleName 
is not safe in scala environment, depending on how user creates the aggregator 
object. For example, if the aggregator class full qualified name is 
"com.my.company.MyUtils$myAgg$2$", the getSimpleName will throw 
java.lang.InternalError "Malformed class name". This has been reported in 
scalatest https://github.com/scalatest/scalatest/pull/1044 and discussed in 
many scala upstream jiras such as SI-8110, SI-5425.

To fix this issue, we follow the solution in 
https://github.com/scalatest/scalatest/pull/1044 to add safer version of 
getSimpleName as a util method, and TypedAggregateExpression will invoke this 
util method rather than getClass.getSimpleName.

## How was this patch tested?
added unit test

Author: Fangshi Li 

Closes #21276 from fangshil/SPARK-24216.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/cc88d7fa
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/cc88d7fa
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/cc88d7fa

Branch: refs/heads/master
Commit: cc88d7fad16e8b5cbf7b6b9bfe412908782b4a45
Parents: f0ef1b3
Author: Fangshi Li 
Authored: Tue Jun 12 12:10:08 2018 -0700
Committer: Wenchen Fan 
Committed: Tue Jun 12 12:10:08 2018 -0700

--
 .../org/apache/spark/util/AccumulatorV2.scala   |  6 +-
 .../scala/org/apache/spark/util/Utils.scala | 59 +++-
 .../org/apache/spark/util/UtilsSuite.scala  | 16 ++
 .../apache/spark/ml/util/Instrumentation.scala  |  5 +-
 .../aggregate/TypedAggregateExpression.scala|  5 +-
 .../v2/DataSourceV2StringFormat.scala   |  4 +-
 6 files changed, 89 insertions(+), 6 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/cc88d7fa/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
--
diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala 
b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
index 3b469a6..bf618b4 100644
--- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
+++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
@@ -200,10 +200,12 @@ abstract class AccumulatorV2[IN, OUT] extends 
Serializable {
   }
 
   override def toString: String = {
+// getClass.getSimpleName can cause Malformed class name error,
+// call safer `Utils.getSimpleName` instead
 if (metadata == null) {
-  "Un-registered Accumulator: " + getClass.getSimpleName
+  "Un-registered Accumulator: " + Utils.getSimpleName(getClass)
 } else {
-  getClass.getSimpleName + s"(id: $id, name: $name, value: $value)"
+  Utils.getSimpleName(getClass) + s"(id: $id, name: $name, value: $value)"
 }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/cc88d7fa/core/src/main/scala/org/apache/spark/util/Utils.scala
--
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala 
b/core/src/main/scala/org/apache/spark/util/Utils.scala
index f9191a5..7428db2 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -19,6 +19,7 @@ package org.apache.spark.util
 
 import java.io._
 import java.lang.{Byte => JByte}
+import java.lang.InternalError
 import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo, 
ThreadInfo}
 import java.lang.reflect.InvocationTargetException
 import java.math.{MathContext, RoundingMode}
@@ -1820,7 +1821,7 @@ private[spark] object Utils extends Logging {
 
   /** Return the class name of the given object, removing all dollar signs */
   def getFormattedClassName(obj: AnyRef): String = {
-obj.getClass.getSimpleName.replace("$", "")
+getSimpleName(obj.getClass).replace("$", "")
   }
 
   /**
@@ -2715,6 +2716,62 @@ private[spark] object Utils extends Logging {
 HashCodes.fromBytes(secretBytes).toString()
   }
 
+  /**
+   * Safer than Class obj's getSimpleName which may throw Malformed class name 
error in scala.
+   * This method mimicks scalatest's getSimpleNameOfAnObjectsClass.
+   */
+  def getSimpleName(cls: Class[_]): String = {
+try {
+  return cls.getSimpleName
+} catch {
+  case err: InternalError => return 
stripDollars(stripPackages(cls.getName))
+}
+  }
+
+  /**
+   * Remove the pa

spark git commit: [SPARK-24216][SQL] Spark TypedAggregateExpression uses getSimpleName that is not safe in scala

2018-06-15 Thread wenchen
Repository: spark
Updated Branches:
  refs/heads/branch-2.3 d42610440 -> 9d63e540e


[SPARK-24216][SQL] Spark TypedAggregateExpression uses getSimpleName that is 
not safe in scala

When user create a aggregator object in scala and pass the aggregator to Spark 
Dataset's agg() method, Spark's will initialize TypedAggregateExpression with 
the nodeName field as aggregator.getClass.getSimpleName. However, getSimpleName 
is not safe in scala environment, depending on how user creates the aggregator 
object. For example, if the aggregator class full qualified name is 
"com.my.company.MyUtils$myAgg$2$", the getSimpleName will throw 
java.lang.InternalError "Malformed class name". This has been reported in 
scalatest https://github.com/scalatest/scalatest/pull/1044 and discussed in 
many scala upstream jiras such as SI-8110, SI-5425.

To fix this issue, we follow the solution in 
https://github.com/scalatest/scalatest/pull/1044 to add safer version of 
getSimpleName as a util method, and TypedAggregateExpression will invoke this 
util method rather than getClass.getSimpleName.

added unit test

Author: Fangshi Li 

Closes #21276 from fangshil/SPARK-24216.

(cherry picked from commit cc88d7fad16e8b5cbf7b6b9bfe412908782b4a45)
Signed-off-by: Wenchen Fan 


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/9d63e540
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/9d63e540
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/9d63e540

Branch: refs/heads/branch-2.3
Commit: 9d63e540e00bc655faf6d8fe1d0035bc0b9a9192
Parents: d426104
Author: Fangshi Li 
Authored: Tue Jun 12 12:10:08 2018 -0700
Committer: Wenchen Fan 
Committed: Fri Jun 15 20:23:05 2018 -0700

--
 .../org/apache/spark/util/AccumulatorV2.scala   |  6 +-
 .../scala/org/apache/spark/util/Utils.scala | 59 +++-
 .../org/apache/spark/util/UtilsSuite.scala  | 16 ++
 .../apache/spark/ml/util/Instrumentation.scala  |  5 +-
 .../aggregate/TypedAggregateExpression.scala|  5 +-
 5 files changed, 86 insertions(+), 5 deletions(-)
--


http://git-wip-us.apache.org/repos/asf/spark/blob/9d63e540/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
--
diff --git a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala 
b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
index 3b469a6..bf618b4 100644
--- a/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
+++ b/core/src/main/scala/org/apache/spark/util/AccumulatorV2.scala
@@ -200,10 +200,12 @@ abstract class AccumulatorV2[IN, OUT] extends 
Serializable {
   }
 
   override def toString: String = {
+// getClass.getSimpleName can cause Malformed class name error,
+// call safer `Utils.getSimpleName` instead
 if (metadata == null) {
-  "Un-registered Accumulator: " + getClass.getSimpleName
+  "Un-registered Accumulator: " + Utils.getSimpleName(getClass)
 } else {
-  getClass.getSimpleName + s"(id: $id, name: $name, value: $value)"
+  Utils.getSimpleName(getClass) + s"(id: $id, name: $name, value: $value)"
 }
   }
 }

http://git-wip-us.apache.org/repos/asf/spark/blob/9d63e540/core/src/main/scala/org/apache/spark/util/Utils.scala
--
diff --git a/core/src/main/scala/org/apache/spark/util/Utils.scala 
b/core/src/main/scala/org/apache/spark/util/Utils.scala
index 12d0934..d4b72e8 100644
--- a/core/src/main/scala/org/apache/spark/util/Utils.scala
+++ b/core/src/main/scala/org/apache/spark/util/Utils.scala
@@ -19,6 +19,7 @@ package org.apache.spark.util
 
 import java.io._
 import java.lang.{Byte => JByte}
+import java.lang.InternalError
 import java.lang.management.{LockInfo, ManagementFactory, MonitorInfo, 
ThreadInfo}
 import java.lang.reflect.InvocationTargetException
 import java.math.{MathContext, RoundingMode}
@@ -1875,7 +1876,7 @@ private[spark] object Utils extends Logging {
 
   /** Return the class name of the given object, removing all dollar signs */
   def getFormattedClassName(obj: AnyRef): String = {
-obj.getClass.getSimpleName.replace("$", "")
+getSimpleName(obj.getClass).replace("$", "")
   }
 
   /** Return an option that translates JNothing to None */
@@ -2817,6 +2818,62 @@ private[spark] object Utils extends Logging {
 HashCodes.fromBytes(secretBytes).toString()
   }
 
+  /**
+   * Safer than Class obj's getSimpleName which may throw Malformed class name 
error in scala.
+   * This method mimicks scalatest's getSimpleNameOfAnObjectsClass.
+   */
+  def getSimpleName(cls: Class[_]): String = {
+try {
+  return cls.getSimpleName
+} catch {
+  case err: InternalError => return 
stripDollars(stripPackages(cls.getName))
+}
+  }
+
+  /**