stefankandic commented on code in PR #46280: URL: https://github.com/apache/spark/pull/46280#discussion_r1601793860
########## python/pyspark/sql/tests/test_types.py: ########## @@ -549,6 +549,129 @@ def test_convert_list_to_str(self): self.assertEqual(df.count(), 1) self.assertEqual(df.head(), Row(name="[123]", income=120)) + def test_schema_with_collations_json_ser_de(self): + from pyspark.sql.types import _parse_datatype_json_string + + unicode_collation = "UNICODE" + + simple_struct = StructType([StructField("c1", StringType(unicode_collation))]) + + nested_struct = StructType([StructField("nested", simple_struct)]) + + array_in_schema = StructType( + [StructField("array", ArrayType(StringType(unicode_collation)))] + ) + + map_in_schema = StructType( + [ + StructField( + "map", MapType(StringType(unicode_collation), StringType(unicode_collation)) + ) + ] + ) + + array_in_map = StructType( + [ + StructField( + "arrInMap", + MapType( + StringType(unicode_collation), ArrayType(StringType(unicode_collation)) + ), + ) + ] + ) + + nested_array_in_map = StructType( + [ + StructField( + "nestedArrayInMap", + ArrayType( + MapType( + StringType(unicode_collation), + ArrayType(ArrayType(StringType(unicode_collation))), + ) + ), + ) + ] + ) + + schema_with_multiple_fields = StructType( + simple_struct.fields + + nested_struct.fields + + array_in_schema.fields + + map_in_schema.fields + + array_in_map.fields + + nested_array_in_map.fields + ) + + schemas = [ + simple_struct, + nested_struct, + array_in_schema, + map_in_schema, + nested_array_in_map, + array_in_map, + schema_with_multiple_fields, + ] + + for schema in schemas: + scala_datatype = self.spark._jsparkSession.parseDataType(schema.json()) + python_datatype = _parse_datatype_json_string(scala_datatype.json()) + assert schema == python_datatype + assert schema == _parse_datatype_json_string(schema.json()) + + def test_schema_with_collations_on_non_string_types(self): + from pyspark.sql.types import _parse_datatype_json_string, _COLLATIONS_METADATA_KEY + + collations_on_int_col_json = f""" + {{ + "type": "struct", + "fields": [ + 
{{ + "name": "c1", + "type": "integer", + "nullable": true, + "metadata": {{ + "{_COLLATIONS_METADATA_KEY}": {{ + "c1": "icu.UNICODE" + }} + }} + }} + ] + }} + """ + + collations_in_map_json = f""" + {{ + "type": "struct", + "fields": [ + {{ + "name": "mapField", + "type": {{ + "type": "map", + "keyType": "string", + "valueType": "integer", + "valueContainsNull": true + }}, + "nullable": true, + "metadata": {{ + "{_COLLATIONS_METADATA_KEY}": {{ + "mapField.value": "icu.UNICODE" Review Comment: We talked about this one a bit offline, but I would rather tackle this as a separate issue than treat it as just a collation protocol error. Currently, neither the Python nor the Scala code fails when encountering duplicate keys; Python will just pick one to put in the dictionary, and Scala will have both in the `JObject`. What do you think, @cloud-fan? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org