This is an automated email from the ASF dual-hosted git repository. wenchen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new 6385cf1bf155 [SPARK-47144][CONNECT][SQL][PYTHON] Fix Spark Connect collation error by adding collateId protobuf field 6385cf1bf155 is described below commit 6385cf1bf15584909fdc3d578980f0879607634d Author: Nikola Mandic <nikola.man...@databricks.com> AuthorDate: Wed Feb 28 13:07:24 2024 +0800 [SPARK-47144][CONNECT][SQL][PYTHON] Fix Spark Connect collation error by adding collateId protobuf field ### What changes were proposed in this pull request? Collated expression `SELECT 'abc' COLLATE 'UCS_BASIC_LCASE'` is failing when connecting to server using Spark Connect (in this case with pyspark): ``` pyspark.errors.exceptions.connect.SparkConnectGrpcException: (org.apache.spark.sql.connect.common.InvalidPlanInput) Does not support convert string(UCS_BASIC_LCASE) to connect proto types. ``` When using default collation `UCS_BASIC`, the error is not occurring. Fix the error by making `StringType` protobuf compliant with Spark datatype which now includes collation id. ### Why are the changes needed? To fix the error which collations introduce in Spark connect. ### Does this PR introduce _any_ user-facing change? Yes, when previously running ``` $ ./bin/pyspark --remote 'sc://localhost' ... >>> spark.sql("SELECT 'abc' COLLATE 'UCS_BASIC_LCASE'") ... ``` the described error would appear. These changes remove the error. ### How was this patch tested? Changes include test which simulates described behavior by utilizing `SparkConnectServerTest` to establish client-server connection. Test is invoked by `build/sbt 'connect/testOnly *SparkConnectServiceE2ESuite'`. ### Was this patch authored or co-authored using generative AI tooling? No. Closes #45233 from nikolamand-db/nikolamand-db/SPARK-47144. 
Authored-by: Nikola Mandic <nikola.man...@databricks.com> Signed-off-by: Wenchen Fan <wenc...@databricks.com> --- .../apache/spark/sql/PlanGenerationTestSuite.scala | 5 ++ .../src/main/protobuf/spark/connect/types.proto | 1 + .../connect/common/DataTypeProtoConverter.scala | 11 ++- .../explain-results/select_collated_string.explain | 2 + .../query-tests/queries/csv_from_dataset.json | 1 + .../query-tests/queries/csv_from_dataset.proto.bin | Bin 156 -> 158 bytes .../query-tests/queries/function_lit_array.json | 2 + .../queries/function_lit_array.proto.bin | Bin 885 -> 889 bytes .../query-tests/queries/function_typedLit.json | 16 +++++ .../queries/function_typedLit.proto.bin | Bin 1167 -> 1199 bytes .../query-tests/queries/json_from_dataset.json | 1 + .../queries/json_from_dataset.proto.bin | Bin 167 -> 169 bytes .../queries/select_collated_string.json | 20 ++++++ .../queries/select_collated_string.proto.bin | Bin 0 -> 65 bytes .../connect/planner/SparkConnectProtoSuite.scala | 8 +++ python/pyspark/sql/connect/proto/types_pb2.py | 78 ++++++++++----------- python/pyspark/sql/connect/proto/types_pb2.pyi | 8 ++- .../org/apache/spark/sql/types/DataType.scala | 2 +- .../org/apache/spark/sql/types/StringType.scala | 2 +- 19 files changed, 113 insertions(+), 44 deletions(-) diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala index b52f75a2914d..ee98a1aceea3 100644 --- a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -698,6 +698,11 @@ class PlanGenerationTestSuite simple.distinct() } + test("select collated string") { + val schema = StructType(StructField("s", StringType(1)) :: Nil) + createLocalRelation(schema.catalogString).select("s") + } + /* Column API */ private def 
columnTest(name: String)(f: => Column): Unit = { test("column " + name) { diff --git a/connector/connect/common/src/main/protobuf/spark/connect/types.proto b/connector/connect/common/src/main/protobuf/spark/connect/types.proto index 365db6059447..48f7385330c8 100644 --- a/connector/connect/common/src/main/protobuf/spark/connect/types.proto +++ b/connector/connect/common/src/main/protobuf/spark/connect/types.proto @@ -101,6 +101,7 @@ message DataType { message String { uint32 type_variation_reference = 1; + uint32 collation_id = 2; } message Binary { diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala index f80bc207f6a5..1f580a0ffc0a 100644 --- a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala @@ -45,7 +45,7 @@ object DataTypeProtoConverter { case proto.DataType.KindCase.DOUBLE => DoubleType case proto.DataType.KindCase.DECIMAL => toCatalystDecimalType(t.getDecimal) - case proto.DataType.KindCase.STRING => StringType + case proto.DataType.KindCase.STRING => toCatalystStringType(t.getString) case proto.DataType.KindCase.CHAR => CharType(t.getChar.getLength) case proto.DataType.KindCase.VAR_CHAR => VarcharType(t.getVarChar.getLength) @@ -79,6 +79,9 @@ object DataTypeProtoConverter { } } + private def toCatalystStringType(t: proto.DataType.String): StringType = + StringType(t.getCollationId) + private def toCatalystYearMonthIntervalType(t: proto.DataType.YearMonthInterval) = { (t.hasStartField, t.hasEndField) match { case (true, true) => YearMonthIntervalType(t.getStartField.toByte, t.getEndField.toByte) @@ -171,7 +174,11 @@ object DataTypeProtoConverter { 
proto.DataType.Decimal.newBuilder().setPrecision(precision).setScale(scale).build()) .build() - case StringType => ProtoDataTypes.StringType + case s: StringType => + proto.DataType + .newBuilder() + .setString(proto.DataType.String.newBuilder().setCollationId(s.collationId).build()) + .build() case CharType(length) => proto.DataType diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/select_collated_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/select_collated_string.explain new file mode 100644 index 000000000000..44a6eca89a0c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/select_collated_string.explain @@ -0,0 +1,2 @@ +Project [s#0] ++- LocalRelation <empty>, [s#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json index d34fcb6f758e..33f6007ec68a 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json @@ -18,6 +18,7 @@ "name": "c1", "dataType": { "string": { + "collationId": 0 } }, "nullable": true diff --git a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin index 5f8bd50685ca..da4ad9bf9a4e 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json index 77fa899bb9b7..adf8cabd97b1 100644 --- 
a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json @@ -305,6 +305,7 @@ "array": { "elementType": { "string": { + "collationId": 0 } }, "elements": [{ @@ -323,6 +324,7 @@ "array": { "elementType": { "string": { + "collationId": 0 } }, "elements": [{ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin index 9763bed6b502..d8b4407f6cfa 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json index 7d2717d9f4c8..1e651f0455c7 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.json @@ -200,6 +200,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -227,6 +228,7 @@ "name": "_1", "dataType": { "string": { + "collationId": 0 } }, "nullable": true @@ -402,6 +404,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -414,6 +417,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -435,6 +439,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -456,6 +461,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -487,6 +493,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -504,6 +511,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, 
"valueType": { @@ -525,6 +533,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -567,6 +576,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -584,6 +594,7 @@ "name": "_1", "dataType": { "string": { + "collationId": 0 } }, "nullable": true @@ -597,6 +608,7 @@ }, "valueType": { "string": { + "collationId": 0 } }, "valueContainsNull": true @@ -628,6 +640,7 @@ "map": { "keyType": { "string": { + "collationId": 0 } }, "valueType": { @@ -653,6 +666,7 @@ "name": "_1", "dataType": { "string": { + "collationId": 0 } }, "nullable": true @@ -666,6 +680,7 @@ }, "valueType": { "string": { + "collationId": 0 } }, "valueContainsNull": true @@ -685,6 +700,7 @@ }, "valueType": { "string": { + "collationId": 0 } }, "keys": [{ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin index 30a447ecfdee..b3f61830bee0 100644 Binary files a/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/function_typedLit.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json index d6f992d09a5c..537c218952a4 100644 --- a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json +++ b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json @@ -18,6 +18,7 @@ "name": "c1", "dataType": { "string": { + "collationId": 0 } }, "nullable": true diff --git a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin index 0fce9d9ff8c7..297ab2bf0262 100644 Binary files 
a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin and b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json new file mode 100644 index 000000000000..db065b36e345 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cs:string COLLATE \u0027UCS_BASIC_LCASE\u0027\u003e" + } + }, + "expressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin new file mode 100644 index 000000000000..3a5661e54ce0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/select_collated_string.proto.bin differ diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala index 0b27ccdbef89..bd52a16d5b22 100644 --- a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala @@ -1044,6 +1044,14 @@ class SparkConnectProtoSuite extends PlanTest with SparkConnectPlanTest { } } + test("SPARK-47144: Collated string") { + Seq("UCS_BASIC", "UCS_BASIC_LCASE", "UNICODE", 
"UNICODE_CI").map(collationName => + Seq( + s"select 'abc' collate '$collationName'", + s"select collation('abc' collate '$collationName')").map(query => + comparePlans(connect.sql(query), spark.sql(query)))) + } + private def createLocalRelationProtoByAttributeReferences( attrs: Seq[AttributeReference]): proto.Relation = { val localRelationBuilder = proto.LocalRelation.newBuilder() diff --git a/python/pyspark/sql/connect/proto/types_pb2.py b/python/pyspark/sql/connect/proto/types_pb2.py index e768441d2d2a..ebe5b5877515 100644 --- a/python/pyspark/sql/connect/proto/types_pb2.py +++ b/python/pyspark/sql/connect/proto/types_pb2.py @@ -29,7 +29,7 @@ _sym_db = _symbol_database.Default() DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile( - b"\n\x19spark/connect/types.proto\x12\rspark.connect\"\xc9!\n\x08\x44\x61taType\x12\x32\n\x04null\x18\x01 \x01(\x0b\x32\x1c.spark.connect.DataType.NULLH\x00R\x04null\x12\x38\n\x06\x62inary\x18\x02 \x01(\x0b\x32\x1e.spark.connect.DataType.BinaryH\x00R\x06\x62inary\x12;\n\x07\x62oolean\x18\x03 \x01(\x0b\x32\x1f.spark.connect.DataType.BooleanH\x00R\x07\x62oolean\x12\x32\n\x04\x62yte\x18\x04 \x01(\x0b\x32\x1c.spark.connect.DataType.ByteH\x00R\x04\x62yte\x12\x35\n\x05short\x18\x05 \x01(\x [...] + b"\n\x19spark/connect/types.proto\x12\rspark.connect\"\xec!\n\x08\x44\x61taType\x12\x32\n\x04null\x18\x01 \x01(\x0b\x32\x1c.spark.connect.DataType.NULLH\x00R\x04null\x12\x38\n\x06\x62inary\x18\x02 \x01(\x0b\x32\x1e.spark.connect.DataType.BinaryH\x00R\x06\x62inary\x12;\n\x07\x62oolean\x18\x03 \x01(\x0b\x32\x1f.spark.connect.DataType.BooleanH\x00R\x07\x62oolean\x12\x32\n\x04\x62yte\x18\x04 \x01(\x0b\x32\x1c.spark.connect.DataType.ByteH\x00R\x04\x62yte\x12\x35\n\x05short\x18\x05 \x01(\x [...] 
) _builder.BuildMessageAndEnumDescriptors(DESCRIPTOR, globals()) @@ -40,7 +40,7 @@ if _descriptor._USE_C_DESCRIPTORS == False: b"\n\036org.apache.spark.connect.protoP\001Z\022internal/generated" ) _DATATYPE._serialized_start = 45 - _DATATYPE._serialized_end = 4342 + _DATATYPE._serialized_end = 4377 _DATATYPE_BOOLEAN._serialized_start = 1595 _DATATYPE_BOOLEAN._serialized_end = 1662 _DATATYPE_BYTE._serialized_start = 1664 @@ -56,41 +56,41 @@ if _descriptor._USE_C_DESCRIPTORS == False: _DATATYPE_DOUBLE._serialized_start = 1999 _DATATYPE_DOUBLE._serialized_end = 2065 _DATATYPE_STRING._serialized_start = 2067 - _DATATYPE_STRING._serialized_end = 2133 - _DATATYPE_BINARY._serialized_start = 2135 - _DATATYPE_BINARY._serialized_end = 2201 - _DATATYPE_NULL._serialized_start = 2203 - _DATATYPE_NULL._serialized_end = 2267 - _DATATYPE_TIMESTAMP._serialized_start = 2269 - _DATATYPE_TIMESTAMP._serialized_end = 2338 - _DATATYPE_DATE._serialized_start = 2340 - _DATATYPE_DATE._serialized_end = 2404 - _DATATYPE_TIMESTAMPNTZ._serialized_start = 2406 - _DATATYPE_TIMESTAMPNTZ._serialized_end = 2478 - _DATATYPE_CALENDARINTERVAL._serialized_start = 2480 - _DATATYPE_CALENDARINTERVAL._serialized_end = 2556 - _DATATYPE_YEARMONTHINTERVAL._serialized_start = 2559 - _DATATYPE_YEARMONTHINTERVAL._serialized_end = 2738 - _DATATYPE_DAYTIMEINTERVAL._serialized_start = 2741 - _DATATYPE_DAYTIMEINTERVAL._serialized_end = 2918 - _DATATYPE_CHAR._serialized_start = 2920 - _DATATYPE_CHAR._serialized_end = 3008 - _DATATYPE_VARCHAR._serialized_start = 3010 - _DATATYPE_VARCHAR._serialized_end = 3101 - _DATATYPE_DECIMAL._serialized_start = 3104 - _DATATYPE_DECIMAL._serialized_end = 3257 - _DATATYPE_STRUCTFIELD._serialized_start = 3260 - _DATATYPE_STRUCTFIELD._serialized_end = 3421 - _DATATYPE_STRUCT._serialized_start = 3423 - _DATATYPE_STRUCT._serialized_end = 3550 - _DATATYPE_ARRAY._serialized_start = 3553 - _DATATYPE_ARRAY._serialized_end = 3715 - _DATATYPE_MAP._serialized_start = 3718 - 
_DATATYPE_MAP._serialized_end = 3937 - _DATATYPE_VARIANT._serialized_start = 3939 - _DATATYPE_VARIANT._serialized_end = 4006 - _DATATYPE_UDT._serialized_start = 4009 - _DATATYPE_UDT._serialized_end = 4280 - _DATATYPE_UNPARSED._serialized_start = 4282 - _DATATYPE_UNPARSED._serialized_end = 4334 + _DATATYPE_STRING._serialized_end = 2168 + _DATATYPE_BINARY._serialized_start = 2170 + _DATATYPE_BINARY._serialized_end = 2236 + _DATATYPE_NULL._serialized_start = 2238 + _DATATYPE_NULL._serialized_end = 2302 + _DATATYPE_TIMESTAMP._serialized_start = 2304 + _DATATYPE_TIMESTAMP._serialized_end = 2373 + _DATATYPE_DATE._serialized_start = 2375 + _DATATYPE_DATE._serialized_end = 2439 + _DATATYPE_TIMESTAMPNTZ._serialized_start = 2441 + _DATATYPE_TIMESTAMPNTZ._serialized_end = 2513 + _DATATYPE_CALENDARINTERVAL._serialized_start = 2515 + _DATATYPE_CALENDARINTERVAL._serialized_end = 2591 + _DATATYPE_YEARMONTHINTERVAL._serialized_start = 2594 + _DATATYPE_YEARMONTHINTERVAL._serialized_end = 2773 + _DATATYPE_DAYTIMEINTERVAL._serialized_start = 2776 + _DATATYPE_DAYTIMEINTERVAL._serialized_end = 2953 + _DATATYPE_CHAR._serialized_start = 2955 + _DATATYPE_CHAR._serialized_end = 3043 + _DATATYPE_VARCHAR._serialized_start = 3045 + _DATATYPE_VARCHAR._serialized_end = 3136 + _DATATYPE_DECIMAL._serialized_start = 3139 + _DATATYPE_DECIMAL._serialized_end = 3292 + _DATATYPE_STRUCTFIELD._serialized_start = 3295 + _DATATYPE_STRUCTFIELD._serialized_end = 3456 + _DATATYPE_STRUCT._serialized_start = 3458 + _DATATYPE_STRUCT._serialized_end = 3585 + _DATATYPE_ARRAY._serialized_start = 3588 + _DATATYPE_ARRAY._serialized_end = 3750 + _DATATYPE_MAP._serialized_start = 3753 + _DATATYPE_MAP._serialized_end = 3972 + _DATATYPE_VARIANT._serialized_start = 3974 + _DATATYPE_VARIANT._serialized_end = 4041 + _DATATYPE_UDT._serialized_start = 4044 + _DATATYPE_UDT._serialized_end = 4315 + _DATATYPE_UNPARSED._serialized_start = 4317 + _DATATYPE_UNPARSED._serialized_end = 4369 # @@protoc_insertion_point(module_scope) 
diff --git a/python/pyspark/sql/connect/proto/types_pb2.pyi b/python/pyspark/sql/connect/proto/types_pb2.pyi index 49883e6337c4..e6b34d3485c2 100644 --- a/python/pyspark/sql/connect/proto/types_pb2.pyi +++ b/python/pyspark/sql/connect/proto/types_pb2.pyi @@ -178,16 +178,22 @@ class DataType(google.protobuf.message.Message): DESCRIPTOR: google.protobuf.descriptor.Descriptor TYPE_VARIATION_REFERENCE_FIELD_NUMBER: builtins.int + COLLATION_ID_FIELD_NUMBER: builtins.int type_variation_reference: builtins.int + collation_id: builtins.int def __init__( self, *, type_variation_reference: builtins.int = ..., + collation_id: builtins.int = ..., ) -> None: ... def ClearField( self, field_name: typing_extensions.Literal[ - "type_variation_reference", b"type_variation_reference" + "collation_id", + b"collation_id", + "type_variation_reference", + b"type_variation_reference", ], ) -> None: ... diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala index fb0a8e586e0c..efaf6e6bfd6a 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/DataType.scala @@ -117,7 +117,7 @@ object DataType { private val FIXED_DECIMAL = """decimal\(\s*(\d+)\s*,\s*(\-?\d+)\s*\)""".r private val CHAR_TYPE = """char\(\s*(\d+)\s*\)""".r private val VARCHAR_TYPE = """varchar\(\s*(\d+)\s*\)""".r - private val COLLATED_STRING_TYPE = """string\s+COLLATE\s+([\w_]+)""".r + private val COLLATED_STRING_TYPE = """string\s+COLLATE\s+'([\w_]+)'""".r def fromDDL(ddl: String): DataType = { parseTypeWithFallback( diff --git a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala index 3fe0e1c9ce3f..6cfa9ce5558b 100644 --- a/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala +++ b/sql/api/src/main/scala/org/apache/spark/sql/types/StringType.scala @@ -39,7 
+39,7 @@ class StringType private(val collationId: Int) extends AtomicType with Serializa */ override def typeName: String = if (isDefaultCollation) "string" - else s"string COLLATE ${CollationFactory.fetchCollation(collationId).collationName}" + else s"string COLLATE '${CollationFactory.fetchCollation(collationId).collationName}'" override def equals(obj: Any): Boolean = obj.isInstanceOf[StringType] && obj.asInstanceOf[StringType].collationId == collationId --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org