This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 6cd0bef [SPARK-31416][SQL] Check more strictly that a field name can be used as a valid Java identifier for codegen 6cd0bef is described below commit 6cd0bef7fe872b5d9472a2140db188fb9f1eede8 Author: Kousuke Saruta <saru...@oss.nttdata.com> AuthorDate: Sun Apr 12 13:14:41 2020 -0700 [SPARK-31416][SQL] Check more strictly that a field name can be used as a valid Java identifier for codegen ### What changes were proposed in this pull request? Check more strictly that a field name can be used as a valid Java identifier in `ScalaReflection.serializerFor`. To check that, `SourceVersion` is used so that we need not add reserved keywords to be checked manually for future Java versions (e.g., underscore, var, yield). ### Why are the changes needed? In the current implementation, `enum` is not checked even though it's a reserved keyword. Also, there are lots of characters and sequences of characters including numeric literals but they are not checked. So we can't get a better error message with the following code. ``` case class Data(`0`: Int) Seq(Data(1)).toDF.show 20/04/11 03:24:24 ERROR CodeGenerator: failed to compile: org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 43, Column 1: Expression "value_0 = value_3" is not a type org.codehaus.commons.compiler.CompileException: File 'generated.java', Line 43, Column 1: Expression "value_0 = value_3" is not a type ... ``` ### Does this PR introduce any user-facing change? Yes. With this change and the code example above, we can get the following error message. ``` java.lang.UnsupportedOperationException: `0` is not a valid identifier of Java and cannot be used as field name - root class: "Data" ... ``` ### How was this patch tested? Add another assertion to the existing test case. Closes #28184 from sarutak/improve-identifier-check. 
Authored-by: Kousuke Saruta <saru...@oss.nttdata.com> Signed-off-by: Dongjoon Hyun <dongj...@apache.org> --- .../apache/spark/sql/catalyst/ScalaReflection.scala | 20 ++++++++++---------- .../sql/catalyst/expressions/objects/objects.scala | 3 +-- .../scala/org/apache/spark/sql/DatasetSuite.scala | 10 ---------- .../spark/sql/ScalaReflectionRelationSuite.scala | 21 +++++++++++++++++++++ 4 files changed, 32 insertions(+), 22 deletions(-) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala index 3694832..9c8da32 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/ScalaReflection.scala @@ -17,6 +17,8 @@ package org.apache.spark.sql.catalyst +import javax.lang.model.SourceVersion + import org.apache.commons.lang3.reflect.ConstructorUtils import org.apache.spark.internal.Logging @@ -539,9 +541,10 @@ object ScalaReflection extends ScalaReflection { val params = getConstructorParameters(t) val fields = params.map { case (fieldName, fieldType) => - if (javaKeywords.contains(fieldName)) { - throw new UnsupportedOperationException(s"`$fieldName` is a reserved keyword and " + - "cannot be used as field name\n" + walkedTypePath) + if (SourceVersion.isKeyword(fieldName) || + !SourceVersion.isIdentifier(encodeFieldNameToIdentifier(fieldName))) { + throw new UnsupportedOperationException(s"`$fieldName` is not a valid identifier of " + + "Java and cannot be used as field name\n" + walkedTypePath) } // SPARK-26730 inputObject won't be null with If's guard below. 
And KnownNotNul @@ -784,13 +787,6 @@ object ScalaReflection extends ScalaReflection { } } - private val javaKeywords = Set("abstract", "assert", "boolean", "break", "byte", "case", "catch", - "char", "class", "const", "continue", "default", "do", "double", "else", "extends", "false", - "final", "finally", "float", "for", "goto", "if", "implements", "import", "instanceof", "int", - "interface", "long", "native", "new", "null", "package", "private", "protected", "public", - "return", "short", "static", "strictfp", "super", "switch", "synchronized", "this", "throw", - "throws", "transient", "true", "try", "void", "volatile", "while") - val typeJavaMapping = Map[DataType, Class[_]]( BooleanType -> classOf[Boolean], ByteType -> classOf[Byte], @@ -849,6 +845,10 @@ object ScalaReflection extends ScalaReflection { Seq.empty } } + + def encodeFieldNameToIdentifier(fieldName: String): String = { + TermName(fieldName).encodedName.toString + } } /** diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala index 54abd09..d5de95c 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/objects/objects.scala @@ -28,7 +28,6 @@ import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.serializer._ import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, ScalaReflection} -import org.apache.spark.sql.catalyst.ScalaReflection.universe.TermName import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.expressions.codegen._ @@ -311,7 +310,7 @@ case class Invoke( override def nullable: Boolean = targetObject.nullable || needNullCheck || returnNullable override def children: 
Seq[Expression] = targetObject +: arguments - private lazy val encodedFunctionName = TermName(functionName).encodedName.toString + private lazy val encodedFunctionName = ScalaReflection.encodeFieldNameToIdentifier(functionName) @transient lazy val method = targetObject.dataType match { case ObjectType(cls) => diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala index b4ed4ec..af65957 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -1222,14 +1222,6 @@ class DatasetSuite extends QueryTest assert(result == Set(ClassData("a", 1) -> null, ClassData("b", 2) -> ClassData("x", 2))) } - test("better error message when use java reserved keyword as field name") { - val e = intercept[UnsupportedOperationException] { - Seq(InvalidInJava(1)).toDS() - } - assert(e.getMessage.contains( - "`abstract` is a reserved keyword and cannot be used as field name")) - } - test("Dataset should support flat input object to be null") { checkDataset(Seq("a", null).toDS(), "a", null) } @@ -1964,8 +1956,6 @@ case class ClassNullableData(a: String, b: Integer) case class NestedStruct(f: ClassData) case class DeepNestedStruct(f: NestedStruct) -case class InvalidInJava(`abstract`: Int) - /** * A class used to test serialization using encoders. This class throws exceptions when using * Java serialization -- so the only way it can be "serialized" is through our encoders. 
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala index 7e305e0..d51a461 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/ScalaReflectionRelationSuite.scala @@ -74,9 +74,14 @@ case class ComplexReflectData( mapFieldContainsNull: Map[Int, Option[Long]], dataField: Data) +case class InvalidInJava(`abstract`: Int) + class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSparkSession { import testImplicits._ + // To avoid syntax error thrown by genjavadoc, make this case class non-top level and private. + private case class InvalidInJava2(`0`: Int) + test("query case class RDD") { val data = ReflectData("a", 1, 1L, 1.toFloat, 1.toDouble, 1.toShort, 1.toByte, true, new java.math.BigDecimal(1), Date.valueOf("1970-01-01"), new Timestamp(12345), Seq(1, 2, 3), @@ -142,4 +147,20 @@ class ScalaReflectionRelationSuite extends SparkFunSuite with SharedSparkSession Map(10 -> 100L, 20 -> 200L, 30 -> null), Row(null, "abc")))) } + + test("better error message when use java reserved keyword as field name") { + val e = intercept[UnsupportedOperationException] { + Seq(InvalidInJava(1)).toDS() + } + assert(e.getMessage.contains( + "`abstract` is not a valid identifier of Java and cannot be used as field name")) + } + + test("better error message when use invalid java identifier as field name") { + val e1 = intercept[UnsupportedOperationException] { + Seq(InvalidInJava2(1)).toDS() + } + assert(e1.getMessage.contains( + "`0` is not a valid identifier of Java and cannot be used as field name")) + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org