This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch branch-3.5
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-3.5 by this push:
     new d7c3794a0c56 [SPARK-47904][SQL][3.5] Preserve case in Avro schema when 
using enableStableIdentifiersForUnionType
d7c3794a0c56 is described below

commit d7c3794a0c567b12e8c8e18132aa362f11acdf5f
Author: Ivan Sadikov <ivan.sadi...@databricks.com>
AuthorDate: Mon Apr 22 15:36:13 2024 -0700

    [SPARK-47904][SQL][3.5] Preserve case in Avro schema when using 
enableStableIdentifiersForUnionType
    
    ### What changes were proposed in this pull request?
    
    Backport of https://github.com/apache/spark/pull/46126 to branch-3.5.
    
    When `enableStableIdentifiersForUnionType` is enabled, all of the type names are 
lowercased, which creates a problem when the type names are case-sensitive:
    
    Union type with fields:
    ```
    Schema.createEnum("myENUM", "", null, List[String]("E1", "e2").asJava),
    Schema.createRecord("myRecord2", "", null, false, List[Schema.Field](new 
Schema.Field("F", Schema.create(Type.FLOAT))).asJava)
    ```
    
    would become
    
    ```
    struct<member_myenum: string, member_myrecord2: struct<f: float>>
    ```
    
    but instead should be
    ```
    struct<member_myENUM: string, member_myRecord2: struct<F: float>>
    ```
    
    ### Why are the changes needed?
    
    Fixes a bug where the type-name portion of the generated field name was lowercased.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes. If a user enables `enableStableIdentifiersForUnionType` and has union 
types, all field names will preserve their case. Previously, the field names 
were all lowercased.
    
    ### How was this patch tested?
    
    I added a test case to verify the new field names.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #46169 from sadikovi/SPARK-47904-3.5.
    
    Authored-by: Ivan Sadikov <ivan.sadi...@databricks.com>
    Signed-off-by: Dongjoon Hyun <dh...@apple.com>
---
 .../apache/spark/sql/avro/SchemaConverters.scala   | 10 +++----
 .../org/apache/spark/sql/avro/AvroSuite.scala      | 31 ++++++++++++++++++++--
 2 files changed, 34 insertions(+), 7 deletions(-)

diff --git 
a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala
 
b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala
index 06abe977e3b0..af358a8d1c96 100644
--- 
a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala
+++ 
b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala
@@ -183,14 +183,14 @@ object SchemaConverters {
                   // Avro's field name may be case sensitive, so field names 
for two named type
                   // could be "a" and "A" and we need to distinguish them. In 
this case, we throw
                   // an exception.
-                  val temp_name = 
s"member_${s.getName.toLowerCase(Locale.ROOT)}"
-                  if (fieldNameSet.contains(temp_name)) {
+                  // Stable id prefix can be empty so the name of the field 
can be just the type.
+                  val tempFieldName = s"member_${s.getName}"
+                  if 
(!fieldNameSet.add(tempFieldName.toLowerCase(Locale.ROOT))) {
                     throw new IncompatibleSchemaException(
-                      "Cannot generate stable indentifier for Avro union type 
due to name " +
+                      "Cannot generate stable identifier for Avro union type 
due to name " +
                       s"conflict of type name ${s.getName}")
                   }
-                  fieldNameSet.add(temp_name)
-                  temp_name
+                  tempFieldName
                 } else {
                   s"member$i"
                 }
diff --git 
a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala 
b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
index 1df99210a55a..01c9dfb57a19 100644
--- a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
+++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala
@@ -370,7 +370,7 @@ abstract class AvroSuite
           "",
           Seq())
       }
-      assert(e.getMessage.contains("Cannot generate stable indentifier"))
+      assert(e.getMessage.contains("Cannot generate stable identifier"))
     }
     {
       val e = intercept[Exception] {
@@ -381,7 +381,7 @@ abstract class AvroSuite
           "",
           Seq())
       }
-      assert(e.getMessage.contains("Cannot generate stable indentifier"))
+      assert(e.getMessage.contains("Cannot generate stable identifier"))
     }
     // Two array types or two map types are not allowed in union.
     {
@@ -434,6 +434,33 @@ abstract class AvroSuite
     }
   }
 
+  test("SPARK-47904: Test that field name case is preserved") {
+    checkUnionStableId(
+      List(
+        Schema.createEnum("myENUM", "", null, List[String]("E1", "e2").asJava),
+        Schema.createRecord("myRecord", "", null, false,
+          List[Schema.Field](new Schema.Field("f", 
Schema.createFixed("myField", "", null, 6)))
+            .asJava),
+        Schema.createRecord("myRecord2", "", null, false,
+          List[Schema.Field](new Schema.Field("F", Schema.create(Type.FLOAT)))
+            .asJava)),
+      "struct<member_myENUM: string, member_myRecord: struct<f: binary>, " +
+                    "member_myRecord2: struct<F: float>>",
+      Seq())
+
+    {
+      val e = intercept[Exception] {
+        checkUnionStableId(
+          List(
+            Schema.createRecord("myRecord", "", null, false, 
List[Schema.Field]().asJava),
+            Schema.createRecord("myrecord", "", null, false, 
List[Schema.Field]().asJava)),
+          "",
+          Seq())
+      }
+      assert(e.getMessage.contains("Cannot generate stable identifier"))
+    }
+  }
+
   test("SPARK-27858 Union type: More than one non-null type") {
     Seq(true, false).foreach { isStableUnionMember =>
       withTempDir { dir =>


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to