This is an automated email from the ASF dual-hosted git repository. dongjoon pushed a commit to branch branch-3.5 in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-3.5 by this push: new d7c3794a0c56 [SPARK-47904][SQL][3.5] Preserve case in Avro schema when using enableStableIdentifiersForUnionType d7c3794a0c56 is described below commit d7c3794a0c567b12e8c8e18132aa362f11acdf5f Author: Ivan Sadikov <ivan.sadi...@databricks.com> AuthorDate: Mon Apr 22 15:36:13 2024 -0700 [SPARK-47904][SQL][3.5] Preserve case in Avro schema when using enableStableIdentifiersForUnionType ### What changes were proposed in this pull request? Backport of https://github.com/apache/spark/pull/46126 to branch-3.5. When `enableStableIdentifiersForUnionType` is enabled, all of the type names are lowercased, which creates a problem when type names are case-sensitive. For example, a union type with the fields: ``` Schema.createEnum("myENUM", "", null, List[String]("E1", "e2").asJava), Schema.createRecord("myRecord2", "", null, false, List[Schema.Field](new Schema.Field("F", Schema.create(Type.FLOAT))).asJava) ``` would become ``` struct<member_myenum: string, member_myrecord2: struct<f: float>> ``` but instead should be ``` struct<member_myENUM: string, member_myRecord2: struct<F: float>> ``` ### Why are the changes needed? Fixes a bug where the type-name portion of the generated field name was lowercased. ### Does this PR introduce _any_ user-facing change? Yes, if a user enables `enableStableIdentifiersForUnionType` and has Union types, all fields will preserve the case. Previously, the field names would be all in lowercase. ### How was this patch tested? I added a test case to verify the new field names. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #46169 from sadikovi/SPARK-47904-3.5. 
Authored-by: Ivan Sadikov <ivan.sadi...@databricks.com> Signed-off-by: Dongjoon Hyun <dh...@apple.com> --- .../apache/spark/sql/avro/SchemaConverters.scala | 10 +++---- .../org/apache/spark/sql/avro/AvroSuite.scala | 31 ++++++++++++++++++++-- 2 files changed, 34 insertions(+), 7 deletions(-) diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala index 06abe977e3b0..af358a8d1c96 100644 --- a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala @@ -183,14 +183,14 @@ object SchemaConverters { // Avro's field name may be case sensitive, so field names for two named type // could be "a" and "A" and we need to distinguish them. In this case, we throw // an exception. - val temp_name = s"member_${s.getName.toLowerCase(Locale.ROOT)}" - if (fieldNameSet.contains(temp_name)) { + // Stable id prefix can be empty so the name of the field can be just the type. 
+ val tempFieldName = s"member_${s.getName}" + if (!fieldNameSet.add(tempFieldName.toLowerCase(Locale.ROOT))) { throw new IncompatibleSchemaException( - "Cannot generate stable indentifier for Avro union type due to name " + + "Cannot generate stable identifier for Avro union type due to name " + s"conflict of type name ${s.getName}") } - fieldNameSet.add(temp_name) - temp_name + tempFieldName } else { s"member$i" } diff --git a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index 1df99210a55a..01c9dfb57a19 100644 --- a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -370,7 +370,7 @@ abstract class AvroSuite "", Seq()) } - assert(e.getMessage.contains("Cannot generate stable indentifier")) + assert(e.getMessage.contains("Cannot generate stable identifier")) } { val e = intercept[Exception] { @@ -381,7 +381,7 @@ abstract class AvroSuite "", Seq()) } - assert(e.getMessage.contains("Cannot generate stable indentifier")) + assert(e.getMessage.contains("Cannot generate stable identifier")) } // Two array types or two map types are not allowed in union. 
{ @@ -434,6 +434,33 @@ abstract class AvroSuite } } + test("SPARK-47904: Test that field name case is preserved") { + checkUnionStableId( + List( + Schema.createEnum("myENUM", "", null, List[String]("E1", "e2").asJava), + Schema.createRecord("myRecord", "", null, false, + List[Schema.Field](new Schema.Field("f", Schema.createFixed("myField", "", null, 6))) + .asJava), + Schema.createRecord("myRecord2", "", null, false, + List[Schema.Field](new Schema.Field("F", Schema.create(Type.FLOAT))) + .asJava)), + "struct<member_myENUM: string, member_myRecord: struct<f: binary>, " + + "member_myRecord2: struct<F: float>>", + Seq()) + + { + val e = intercept[Exception] { + checkUnionStableId( + List( + Schema.createRecord("myRecord", "", null, false, List[Schema.Field]().asJava), + Schema.createRecord("myrecord", "", null, false, List[Schema.Field]().asJava)), + "", + Seq()) + } + assert(e.getMessage.contains("Cannot generate stable identifier")) + } + } + test("SPARK-27858 Union type: More than one non-null type") { Seq(true, false).foreach { isStableUnionMember => withTempDir { dir => --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org