uros-db commented on code in PR #46511: URL: https://github.com/apache/spark/pull/46511#discussion_r1596329450
########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -102,20 +102,28 @@ public void testContains() throws SparkException { assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); // Case-variable character length - assertContains("abİo12", "i̇o", "UNICODE_CI", true); - assertContains("abi̇o12", "İo", "UNICODE_CI", true); - assertContains("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertContains("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertContains("The İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", false); - // Characters with the same binary lowercase representation - assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); + assertContains("i̇", "i", "UNICODE_CI", false); + assertContains("i̇", "İ", "UNICODE_CI", true); + assertContains("İ", "i", "UNICODE_CI", false); + assertContains("adi̇os", "io", "UNICODE_CI", false); + assertContains("adi̇os", "Io", "UNICODE_CI", false); + assertContains("adi̇os", "i̇o", "UNICODE_CI", true); + assertContains("adi̇os", "İo", "UNICODE_CI", true); + assertContains("adİos", "io", "UNICODE_CI", false); + assertContains("adİos", "Io", "UNICODE_CI", false); + assertContains("adİos", "i̇o", "UNICODE_CI", true); + assertContains("adİos", "İo", "UNICODE_CI", true); + assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI + assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("İ", "i", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true); + assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("adİos", "io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI Review Comment: let's justify this, considering haystack="adİos" and needle="i̇o" there exists `len>0` such that `start=3` and `len=2` satisfy the condition `lower(substring(haystack, start, len)) == lower(needle)` i.e. `lower(substring("adİos", 3, 2)) == lower("İo") == "i̇o" == lower("i̇o")` hence, `contains("adİos", "i̇o")` should actually return `true` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org