This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 6385cf1bf155 [SPARK-47144][CONNECT][SQL][PYTHON] Fix Spark Connect 
collation error by adding collateId protobuf field
6385cf1bf155 is described below

commit 6385cf1bf15584909fdc3d578980f0879607634d
Author: Nikola Mandic <nikola.man...@databricks.com>
AuthorDate: Wed Feb 28 13:07:24 2024 +0800

    [SPARK-47144][CONNECT][SQL][PYTHON] Fix Spark Connect collation error by 
adding collateId protobuf field
    
    ### What changes were proposed in this pull request?
    
    Collated expression `SELECT 'abc' COLLATE 'UCS_BASIC_LCASE'` is failing 
when connecting to the server using Spark Connect (in this case with pyspark):
    ```
    pyspark.errors.exceptions.connect.SparkConnectGrpcException: 
(org.apache.spark.sql.connect.common.InvalidPlanInput) Does not support convert 
string(UCS_BASIC_LCASE) to connect proto types.
    ```
    When using default collation `UCS_BASIC`, the error is not occurring.
    
    Fix the error by making `StringType` protobuf compliant with Spark datatype 
which now includes collation id.
    
    ### Why are the changes needed?
    
    To fix the error which collations introduce in Spark Connect.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, when previously running
    ```
    $ ./bin/pyspark --remote 'sc://localhost'
    ...
    >>> spark.sql("SELECT 'abc' COLLATE 'UCS_BASIC_LCASE'")
    ...
    ```
    the described error would appear. These changes remove the error.
    
    ### How was this patch tested?
    
    Changes include a test which simulates the described behavior by utilizing 
`SparkConnectServerTest` to establish a client-server connection. The test is 
invoked by `build/sbt 'connect/testOnly *SparkConnectServiceE2ESuite'`.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No.
    
    Closes #45233 from nikolamand-db/nikolamand-db/SPARK-47144.
    
    Authored-by: Nikola Mandic <nikola.man...@databricks.com>
    Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../apache/spark/sql/PlanGenerationTestSuite.scala |   5 ++
 .../src/main/protobuf/spark/connect/types.proto    |   1 +
 .../connect/common/DataTypeProtoConverter.scala    |  11 ++-
 .../explain-results/select_collated_string.explain |   2 +
 .../query-tests/queries/csv_from_dataset.json      |   1 +
 .../query-tests/queries/csv_from_dataset.proto.bin | Bin 156 -> 158 bytes
 .../query-tests/queries/function_lit_array.json    |   2 +
 .../queries/function_lit_array.proto.bin           | Bin 885 -> 889 bytes
 .../query-tests/queries/function_typedLit.json     |  16 +++++
 .../queries/function_typedLit.proto.bin            | Bin 1167 -> 1199 bytes
 .../query-tests/queries/json_from_dataset.json     |   1 +
 .../queries/json_from_dataset.proto.bin            | Bin 167 -> 169 bytes
 .../queries/select_collated_string.json            |  20 ++++++
 .../queries/select_collated_string.proto.bin       | Bin 0 -> 65 bytes
 .../connect/planner/SparkConnectProtoSuite.scala   |   8 +++
 python/pyspark/sql/connect/proto/types_pb2.py      |  78 ++++++++++-----------
 python/pyspark/sql/connect/proto/types_pb2.pyi     |   8 ++-
 .../org/apache/spark/sql/types/DataType.scala      |   2 +-
 .../org/apache/spark/sql/types/StringType.scala    |   2 +-
 19 files changed, 113 insertions(+), 44 deletions(-)

diff --git 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
index b52f75a2914d..ee98a1aceea3 100644
--- 
a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
+++ 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala
@@ -698,6 +698,11 @@ class PlanGenerationTestSuite
     simple.distinct()
   }
 
+  test("select collated string") {
+    val schema = StructType(StructField("s", StringType(1)) :: Nil)
+    createLocalRelation(schema.catalogString).select("s")
+  }
+
   /* Column API */
   private def columnTest(name: String)(f: => Column): Unit = {
     test("column " + name) {
diff --git 
a/connector/connect/common/src/main/protobuf/spark/connect/types.proto 
b/connector/connect/common/src/main/protobuf/spark/connect/types.proto
index 365db6059447..48f7385330c8 100644
--- a/connector/connect/common/src/main/protobuf/spark/connect/types.proto
+++ b/connector/connect/common/src/main/protobuf/spark/connect/types.proto
@@ -101,6 +101,7 @@ message DataType {
 
   message String {
     uint32 type_variation_reference = 1;
+    uint32 collation_id = 2;
   }
 
   message Binary {
diff --git 
a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala
 
b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala
index f80bc207f6a5..1f580a0ffc0a 100644
--- 
a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala
+++ 
b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala
@@ -45,7 +45,7 @@ object DataTypeProtoConverter {
       case proto.DataType.KindCase.DOUBLE => DoubleType
       case proto.DataType.KindCase.DECIMAL => 
toCatalystDecimalType(t.getDecimal)
 
-      case proto.DataType.KindCase.STRING => StringType
+      case proto.DataType.KindCase.STRING => toCatalystStringType(t.getString)
       case proto.DataType.KindCase.CHAR => CharType(t.getChar.getLength)
       case proto.DataType.KindCase.VAR_CHAR => 
VarcharType(t.getVarChar.getLength)
 
@@ -79,6 +79,9 @@ object DataTypeProtoConverter {
     }
   }
 
+  private def toCatalystStringType(t: proto.DataType.String): StringType =
+    StringType(t.getCollationId)
+
   private def toCatalystYearMonthIntervalType(t: 
proto.DataType.YearMonthInterval) = {
     (t.hasStartField, t.hasEndField) match {
       case (true, true) => YearMonthIntervalType(t.getStartField.toByte, 
t.getEndField.toByte)
@@ -171,7 +174,11 @@ object DataTypeProtoConverter {
             
proto.DataType.Decimal.newBuilder().setPrecision(precision).setScale(scale).build())
           .build()
 
-      case StringType => ProtoDataTypes.StringType
+      case s: StringType =>
+        proto.DataType
+          .newBuilder()
+          
.setString(proto.DataType.String.newBuilder().setCollationId(s.collationId).build())
+          .build()
 
       case CharType(length) =>
         proto.DataType
diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/select_collated_string.explain
 
b/connector/connect/common/src/test/resources/query-tests/explain-results/select_collated_string.explain
new file mode 100644
index 000000000000..44a6eca89a0c
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/select_collated_string.explain
@@ -0,0 +1,2 @@
+Project [s#0]
++- LocalRelation <empty>, [s#0]
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json
index d34fcb6f758e..33f6007ec68a 100644
--- 
a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json
@@ -18,6 +18,7 @@
           "name": "c1",
           "dataType": {
             "string": {
+              "collationId": 0
             }
           },
           "nullable": true
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin
index 5f8bd50685ca..da4ad9bf9a4e 100644
Binary files 
a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin
 and 
b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin
 differ
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json
index 77fa899bb9b7..adf8cabd97b1 100644
--- 
a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json
@@ -305,6 +305,7 @@
         "array": {
           "elementType": {
             "string": {
+              "collationId": 0
             }
           },
           "elements": [{
@@ -323,6 +324,7 @@
         "array": {
           "elementType": {
             "string": {
+              "collationId": 0
             }
           },
           "elements": [{
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin
index 9763bed6b502..d8b4407f6cfa 100644
Binary files 
a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin
 and 
b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin
 differ
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json
index 7d2717d9f4c8..1e651f0455c7 100644
--- 
a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json
@@ -200,6 +200,7 @@
         "map": {
           "keyType": {
             "string": {
+              "collationId": 0
             }
           },
           "valueType": {
@@ -227,6 +228,7 @@
                 "name": "_1",
                 "dataType": {
                   "string": {
+                    "collationId": 0
                   }
                 },
                 "nullable": true
@@ -402,6 +404,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -414,6 +417,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -435,6 +439,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -456,6 +461,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -487,6 +493,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -504,6 +511,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -525,6 +533,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -567,6 +576,7 @@
                   "map": {
                     "keyType": {
                       "string": {
+                        "collationId": 0
                       }
                     },
                     "valueType": {
@@ -584,6 +594,7 @@
                       "name": "_1",
                       "dataType": {
                         "string": {
+                          "collationId": 0
                         }
                       },
                       "nullable": true
@@ -597,6 +608,7 @@
                           },
                           "valueType": {
                             "string": {
+                              "collationId": 0
                             }
                           },
                           "valueContainsNull": true
@@ -628,6 +640,7 @@
             "map": {
               "keyType": {
                 "string": {
+                  "collationId": 0
                 }
               },
               "valueType": {
@@ -653,6 +666,7 @@
                     "name": "_1",
                     "dataType": {
                       "string": {
+                        "collationId": 0
                       }
                     },
                     "nullable": true
@@ -666,6 +680,7 @@
                         },
                         "valueType": {
                           "string": {
+                            "collationId": 0
                           }
                         },
                         "valueContainsNull": true
@@ -685,6 +700,7 @@
                   },
                   "valueType": {
                     "string": {
+                      "collationId": 0
                     }
                   },
                   "keys": [{
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin
index 30a447ecfdee..b3f61830bee0 100644
Binary files 
a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin
 and 
b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin
 differ
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json
index d6f992d09a5c..537c218952a4 100644
--- 
a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json
@@ -18,6 +18,7 @@
           "name": "c1",
           "dataType": {
             "string": {
+              "collationId": 0
             }
           },
           "nullable": true
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin
index 0fce9d9ff8c7..297ab2bf0262 100644
Binary files 
a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin
 and 
b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin
 differ
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
new file mode 100644
index 000000000000..db065b36e345
--- /dev/null
+++ 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json
@@ -0,0 +1,20 @@
+{
+  "common": {
+    "planId": "1"
+  },
+  "project": {
+    "input": {
+      "common": {
+        "planId": "0"
+      },
+      "localRelation": {
+        "schema": "struct\u003cs:string COLLATE 
\u0027UCS_BASIC_LCASE\u0027\u003e"
+      }
+    },
+    "expressions": [{
+      "unresolvedAttribute": {
+        "unparsedIdentifier": "s"
+      }
+    }]
+  }
+}
\ No newline at end of file
diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
new file mode 100644
index 000000000000..3a5661e54ce0
Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin
 differ
diff --git 
a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
 
b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
index 0b27ccdbef89..bd52a16d5b22 100644
--- 
a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
+++ 
b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala
@@ -1044,6 +1044,14 @@ class SparkConnectProtoSuite extends PlanTest with 
SparkConnectPlanTest {
     }
   }
 
+  test("SPARK-47144: Collated string") {
+    Seq("UCS_BASIC", "UCS_BASIC_LCASE", "UNICODE", 
"UNICODE_CI").map(collationName =>
+      Seq(
+        s"select 'abc' collate '$collationName'",
+        s"select collation('abc' collate '$collationName')").map(query =>
+        comparePlans(connect.sql(query), spark.sql(query))))
+  }
+
   private def createLocalRelationProtoByAttributeReferences(
       attrs: Seq[AttributeReference]): proto.Relation = {
     val localRelationBuilder = proto.LocalRelation.newBuilder()
diff --git a/python/pyspark/sql/connect/proto/types_pb2.py 
b/python/pyspark/sql/connect/proto/types_pb2.py
index e768441d2d2a..ebe5b5877515 100644
--- a/python/pyspark/sql/connect/proto/types_pb2.py
+++ b/python/pyspark/sql/connect/proto/types_pb2.py
@@ -29,7 +29,7 @@ _sym_db = _symbol_database.Default()
 
 
 DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
-    
b"\n\x19spark/connect/types.proto\x12\rspark.connect\"\xc9!\n\x08\x44\x61taType\x12\x32\n\x04null\x18\x01
 
\x01(\x0b\x32\x1c.spark.connect.DataType.NULLH\x00R\x04null\x12\x38\n\x06\x62inary\x18\x02
 
\x01(\x0b\x32\x1e.spark.connect.DataType.BinaryH\x00R\x06\x62inary\x12;\n\x07\x62oolean\x18\x03
 
\x01(\x0b\x32\x1f.spark.connect.DataType.BooleanH\x00R\x07\x62oolean\x12\x32\n\x04\x62yte\x18\x04
 
\x01(\x0b\x32\x1c.spark.connect.DataType.ByteH\x00R\x04\x62yte\x12\x35\n\x05short\x18\x05
 \x01(\x [...]
+    
b"\n\x19spark/connect/types.proto\x12\rspark.connect\"\xec!\n\x08\x44\x61taType\x12\x32\n\x04null\x18\x01
 
\x01(\x0b\x32\x1c.spark.connect.DataType.NULLH\x00R\x04null\x12\x38\n\x06\x62inary\x18\x02
 
\x01(\x0b\x32\x1e.spark.connect.DataType.BinaryH\x00R\x06\x62inary\x12;\n\x07\x62oolean\x18\x03
 
\x01(\x0b\x32\x1f.spark.connect.DataType.BooleanH\x00R\x07\x62oolean\x12\x32\n\x04\x62yte\x18\x04
 
\x01(\x0b\x32\x1c.spark.connect.DataType.ByteH\x00R\x04\x62yte\x12\x35\n\x05short\x18\x05
 \x01(\x [...]
 )
 
 _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals())
@@ -40,7 +40,7 @@ if _descriptor._USE_C_DESCRIPTORS == False:
         b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated"
     )
     _DATATYPE._serialized_start = 45
-    _DATATYPE._serialized_end = 4342
+    _DATATYPE._serialized_end = 4377
     _DATATYPE_BOOLEAN._serialized_start = 1595
     _DATATYPE_BOOLEAN._serialized_end = 1662
     _DATATYPE_BYTE._serialized_start = 1664
@@ -56,41 +56,41 @@ if _descriptor._USE_C_DESCRIPTORS == False:
     _DATATYPE_DOUBLE._serialized_start = 1999
     _DATATYPE_DOUBLE._serialized_end = 2065
     _DATATYPE_STRING._serialized_start = 2067
-    _DATATYPE_STRING._serialized_end = 2133
-    _DATATYPE_BINARY._serialized_start = 2135
-    _DATATYPE_BINARY._serialized_end = 2201
-    _DATATYPE_NULL._serialized_start = 2203
-    _DATATYPE_NULL._serialized_end = 2267
-    _DATATYPE_TIMESTAMP._serialized_start = 2269
-    _DATATYPE_TIMESTAMP._serialized_end = 2338
-    _DATATYPE_DATE._serialized_start = 2340
-    _DATATYPE_DATE._serialized_end = 2404
-    _DATATYPE_TIMESTAMPNTZ._serialized_start = 2406
-    _DATATYPE_TIMESTAMPNTZ._serialized_end = 2478
-    _DATATYPE_CALENDARINTERVAL._serialized_start = 2480
-    _DATATYPE_CALENDARINTERVAL._serialized_end = 2556
-    _DATATYPE_YEARMONTHINTERVAL._serialized_start = 2559
-    _DATATYPE_YEARMONTHINTERVAL._serialized_end = 2738
-    _DATATYPE_DAYTIMEINTERVAL._serialized_start = 2741
-    _DATATYPE_DAYTIMEINTERVAL._serialized_end = 2918
-    _DATATYPE_CHAR._serialized_start = 2920
-    _DATATYPE_CHAR._serialized_end = 3008
-    _DATATYPE_VARCHAR._serialized_start = 3010
-    _DATATYPE_VARCHAR._serialized_end = 3101
-    _DATATYPE_DECIMAL._serialized_start = 3104
-    _DATATYPE_DECIMAL._serialized_end = 3257
-    _DATATYPE_STRUCTFIELD._serialized_start = 3260
-    _DATATYPE_STRUCTFIELD._serialized_end = 3421
-    _DATATYPE_STRUCT._serialized_start = 3423
-    _DATATYPE_STRUCT._serialized_end = 3550
-    _DATATYPE_ARRAY._serialized_start = 3553
-    _DATATYPE_ARRAY._serialized_end = 3715
-    _DATATYPE_MAP._serialized_start = 3718
-    _DATATYPE_MAP._serialized_end = 3937
-    _DATATYPE_VARIANT._serialized_start = 3939
-    _DATATYPE_VARIANT._serialized_end = 4006
-    _DATATYPE_UDT._serialized_start = 4009
-    _DATATYPE_UDT._serialized_end = 4280
-    _DATATYPE_UNPARSED._serialized_start = 4282
-    _DATATYPE_UNPARSED._serialized_end = 4334
+    _DATATYPE_STRING._serialized_end = 2168
+    _DATATYPE_BINARY._serialized_start = 2170
+    _DATATYPE_BINARY._serialized_end = 2236
+    _DATATYPE_NULL._serialized_start = 2238
+    _DATATYPE_NULL._serialized_end = 2302
+    _DATATYPE_TIMESTAMP._serialized_start = 2304
+    _DATATYPE_TIMESTAMP._serialized_end = 2373
+    _DATATYPE_DATE._serialized_start = 2375
+    _DATATYPE_DATE._serialized_end = 2439
+    _DATATYPE_TIMESTAMPNTZ._serialized_start = 2441
+    _DATATYPE_TIMESTAMPNTZ._serialized_end = 2513
+    _DATATYPE_CALENDARINTERVAL._serialized_start = 2515
+    _DATATYPE_CALENDARINTERVAL._serialized_end = 2591
+    _DATATYPE_YEARMONTHINTERVAL._serialized_start = 2594
+    _DATATYPE_YEARMONTHINTERVAL._serialized_end = 2773
+    _DATATYPE_DAYTIMEINTERVAL._serialized_start = 2776
+    _DATATYPE_DAYTIMEINTERVAL._serialized_end = 2953
+    _DATATYPE_CHAR._serialized_start = 2955
+    _DATATYPE_CHAR._serialized_end = 3043
+    _DATATYPE_VARCHAR._serialized_start = 3045
+    _DATATYPE_VARCHAR._serialized_end = 3136
+    _DATATYPE_DECIMAL._serialized_start = 3139
+    _DATATYPE_DECIMAL._serialized_end = 3292
+    _DATATYPE_STRUCTFIELD._serialized_start = 3295
+    _DATATYPE_STRUCTFIELD._serialized_end = 3456
+    _DATATYPE_STRUCT._serialized_start = 3458
+    _DATATYPE_STRUCT._serialized_end = 3585
+    _DATATYPE_ARRAY._serialized_start = 3588
+    _DATATYPE_ARRAY._serialized_end = 3750
+    _DATATYPE_MAP._serialized_start = 3753
+    _DATATYPE_MAP._serialized_end = 3972
+    _DATATYPE_VARIANT._serialized_start = 3974
+    _DATATYPE_VARIANT._serialized_end = 4041
+    _DATATYPE_UDT._serialized_start = 4044
+    _DATATYPE_UDT._serialized_end = 4315
+    _DATATYPE_UNPARSED._serialized_start = 4317
+    _DATATYPE_UNPARSED._serialized_end = 4369
 # @@protoc_insertion_point(module_scope)
diff --git a/python/pyspark/sql/connect/proto/types_pb2.pyi 
b/python/pyspark/sql/connect/proto/types_pb2.pyi
index 49883e6337c4..e6b34d3485c2 100644
--- a/python/pyspark/sql/connect/proto/types_pb2.pyi
+++ b/python/pyspark/sql/connect/proto/types_pb2.pyi
@@ -178,16 +178,22 @@ class DataType(google.protobuf.message.Message):
         DESCRIPTOR: google.protobuf.descriptor.Descriptor
 
         TYPE_VARIATION_REFERENCE_FIELD_NUMBER: builtins.int
+        COLLATION_ID_FIELD_NUMBER: builtins.int
         type_variation_reference: builtins.int
+        collation_id: builtins.int
         def __init__(
             self,
             *,
             type_variation_reference: builtins.int = ...,
+            collation_id: builtins.int = ...,
         ) -> None: ...
         def ClearField(
             self,
             field_name: typing_extensions.Literal[
-                "type_variation_reference", b"type_variation_reference"
+                "collation_id",
+                b"collation_id",
+                "type_variation_reference",
+                b"type_variation_reference",
             ],
         ) -> None: ...
 
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
index fb0a8e586e0c..efaf6e6bfd6a 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala
@@ -117,7 +117,7 @@ object DataType {
   private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r
   private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r
   private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r
-  private val COLLATED_STRING_TYPE = """string\s+COLLATE\s+([\w_]+)""".r
+  private val COLLATED_STRING_TYPE = """string\s+COLLATE\s+'([\w_]+)'""".r
 
   def fromDDL(ddl: String): DataType = {
     parseTypeWithFallback(
diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
index 3fe0e1c9ce3f..6cfa9ce5558b 100644
--- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
+++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala
@@ -39,7 +39,7 @@ class StringType private(val collationId: Int) extends 
AtomicType with Serializa
    */
   override def typeName: String =
     if (isDefaultCollation) "string"
-    else s"string COLLATE 
${CollationFactory.fetchCollation(collationId).collationName}"
+    else s"string COLLATE 
'${CollationFactory.fetchCollation(collationId).collationName}'"
 
   override def equals(obj: Any): Boolean =
     obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId 
== collationId


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org

Reply via email to