mkaravel commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628813249
########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -26,6 +27,156 @@ // checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * A list containing some of the supported collations in Spark. Use this list to iterate over + * all the important collation groups (binary, lowercase, icu) for complete unit test coverage. + * Note: this list may come in handy when the Spark function result is the same regardless of + * the specified collations (as often seen in some pass-through Spark expressions). + */ + private final String[] testSupportedCollations = + {"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"}; + + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { + UTF8String l = UTF8String.fromString(s1); + UTF8String r = UTF8String.fromString(s2); + int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); + assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { + for (String collationName: testSupportedCollations) { + // Edge cases + assertStringCompare("", "", collationName, 0); + assertStringCompare("a", "", collationName, 1); + assertStringCompare("", "a", collationName, -1); + // Basic tests + assertStringCompare("a", "a", collationName, 0); + assertStringCompare("a", "b", collationName, -1); + assertStringCompare("b", "a", collationName, 1); + assertStringCompare("A", "A", collationName, 0); + assertStringCompare("A", "B", collationName, -1); + assertStringCompare("B", "A", collationName, 1); + assertStringCompare("aa", "a", collationName, 1); + assertStringCompare("b", "bb", collationName, -1); + assertStringCompare("abc", "a", collationName, 1); + assertStringCompare("abc", "b", collationName, -1); + assertStringCompare("abc", "ab", collationName, 1); + assertStringCompare("abc", "abc", collationName, 0); + // ASCII strings + assertStringCompare("aaaa", "aaa", collationName, 1); + assertStringCompare("hello", "world", collationName, -1); + assertStringCompare("Spark", "Spark", collationName, 0); + // Non-ASCII strings + assertStringCompare("ü", "ü", collationName, 0); + assertStringCompare("ü", "", collationName, 1); + assertStringCompare("", "ü", collationName, -1); + assertStringCompare("äü", "äü", collationName, 0); + assertStringCompare("äxx", "äx", collationName, 1); + assertStringCompare("a", "ä", collationName, -1); + } + // Non-ASCII strings + assertStringCompare("äü", "bü", "UTF8_BINARY", 1); + assertStringCompare("bxx", "bü", "UTF8_BINARY", -1); + assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1); + assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1); + assertStringCompare("äü", "bü", "UNICODE", -1); + assertStringCompare("bxx", "bü", "UNICODE", 1); + assertStringCompare("äü", "bü", "UNICODE_CI", -1); + assertStringCompare("bxx", "bü", "UNICODE_CI", 1); + // Case variation + assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); + assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); + assertStringCompare("AbcD", "aBCd", "UNICODE", 1); + assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); + // Accent variation + assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); + assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); + assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); + // Case-variable character length + assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); + assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307", "İ", "UNICODE", -1); + assertStringCompare("İ", "i\u0307", "UNICODE", 1); + assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); + assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); + // Conditional case mapping + assertStringCompare("ς", "σ", "UTF8_BINARY", -1); + assertStringCompare("ς", "Σ", "UTF8_BINARY", 1); + assertStringCompare("σ", "Σ", "UTF8_BINARY", 1); + assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "σ", "UNICODE", 1); + assertStringCompare("ς", "Σ", "UNICODE", 1); + assertStringCompare("σ", "Σ", "UNICODE", -1); + assertStringCompare("ς", "σ", "UNICODE_CI", 0); + assertStringCompare("ς", "Σ", "UNICODE_CI", 0); + assertStringCompare("σ", "Σ", "UNICODE_CI", 0); + } + + private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, + Boolean useCodePoints) { + if (useCodePoints) { + assertEquals(expected.toString(), + CollationAwareUTF8String.lowerCaseCodePoints(target.toString())); + } else { + assertEquals(expected, target.toLowerCase()); + } + } + + @Test + public void testLowerCaseCodePoints() { + // Edge cases + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), false); + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), true); + // Basic tests + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), true); + assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), UTF8String.fromString("abcd"), true); + // Accent variation + assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), UTF8String.fromString("abćd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), UTF8String.fromString("abcδ"), true); + // Case-variable character length + assertLowerCaseCodePoints( + UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true); + assertLowerCaseCodePoints( + UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), true); + // Conditional case mapping + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινος"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινοσ"), true); + // Surrogate pairs are treated as invalid UTF8 sequences + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\ufffd\ufffd"), false); + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\ufffd\ufffd"), true); Review Comment: Here we are "suffering" from the fact that in order to compare two UTF8String values, we first convert them to Java strings and then compare them codepoint-by-codepoint. The conversion to Java strings replaces this surrogate pair with two U+FFFD characters. However, this is UTF16 semantics. In UTF8 we have illegal byte sequences here and the decomposition of the byte array into valid UTF8 _subsequences_ gives six different subsequences (one byte each) and since each subsequence is illegal, ICU4C would replace each with U+FFFD. ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -296,6 +344,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase). + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { + if (codePoint == 0x0130) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + sb.appendCodePoint(0x0069); + sb.appendCodePoint(0x0307); + } + else if (codePoint == 0x03C2) { + // Greek final and non-final capital letter sigma should be mapped the same. + sb.appendCodePoint(0x03C3); + } + else { + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); + } + } + + /** + * Converts an entire string to lowercase using ICU rules, code point by code point, with + * special handling for one-to-many case mappings (i.e. characters that map to multiple + * characters in lowercase). This method omits information about context-sensitive case mappings. Review Comment: > This method omits information about context-sensitive case mappings. Should we explain what this means and how we do it? ########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -26,6 +27,156 @@ // checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * A list containing some of the supported collations in Spark. Use this list to iterate over + * all the important collation groups (binary, lowercase, icu) for complete unit test coverage. + * Note: this list may come in handy when the Spark function result is the same regardless of + * the specified collations (as often seen in some pass-through Spark expressions). + */ + private final String[] testSupportedCollations = + {"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"}; + + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { + UTF8String l = UTF8String.fromString(s1); + UTF8String r = UTF8String.fromString(s2); + int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); + assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { + for (String collationName: testSupportedCollations) { + // Edge cases + assertStringCompare("", "", collationName, 0); + assertStringCompare("a", "", collationName, 1); + assertStringCompare("", "a", collationName, -1); + // Basic tests + assertStringCompare("a", "a", collationName, 0); + assertStringCompare("a", "b", collationName, -1); + assertStringCompare("b", "a", collationName, 1); + assertStringCompare("A", "A", collationName, 0); + assertStringCompare("A", "B", collationName, -1); + assertStringCompare("B", "A", collationName, 1); + assertStringCompare("aa", "a", collationName, 1); + assertStringCompare("b", "bb", collationName, -1); + assertStringCompare("abc", "a", collationName, 1); + assertStringCompare("abc", "b", collationName, -1); + assertStringCompare("abc", "ab", collationName, 1); + assertStringCompare("abc", "abc", collationName, 0); + // ASCII strings + assertStringCompare("aaaa", "aaa", collationName, 1); + assertStringCompare("hello", "world", collationName, -1); + assertStringCompare("Spark", "Spark", collationName, 0); + // Non-ASCII strings + assertStringCompare("ü", "ü", collationName, 0); + assertStringCompare("ü", "", collationName, 1); + assertStringCompare("", "ü", collationName, -1); + assertStringCompare("äü", "äü", collationName, 0); + assertStringCompare("äxx", "äx", collationName, 1); + assertStringCompare("a", "ä", collationName, -1); + } + // Non-ASCII strings + assertStringCompare("äü", "bü", "UTF8_BINARY", 1); + assertStringCompare("bxx", "bü", "UTF8_BINARY", -1); + assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1); + assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1); + assertStringCompare("äü", "bü", "UNICODE", -1); + assertStringCompare("bxx", "bü", "UNICODE", 1); + assertStringCompare("äü", "bü", "UNICODE_CI", -1); + assertStringCompare("bxx", "bü", "UNICODE_CI", 1); + // Case variation + assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); + assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); + assertStringCompare("AbcD", "aBCd", "UNICODE", 1); + assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); + // Accent variation + assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); + assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); + assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); + // Case-variable character length + assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); + assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307", "İ", "UNICODE", -1); + assertStringCompare("İ", "i\u0307", "UNICODE", 1); + assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); + assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); + // Conditional case mapping + assertStringCompare("ς", "σ", "UTF8_BINARY", -1); + assertStringCompare("ς", "Σ", "UTF8_BINARY", 1); + assertStringCompare("σ", "Σ", "UTF8_BINARY", 1); + assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "σ", "UNICODE", 1); + assertStringCompare("ς", "Σ", "UNICODE", 1); + assertStringCompare("σ", "Σ", "UNICODE", -1); + assertStringCompare("ς", "σ", "UNICODE_CI", 0); + assertStringCompare("ς", "Σ", "UNICODE_CI", 0); + assertStringCompare("σ", "Σ", "UNICODE_CI", 0); + } + + private void assertLowerCaseCodePoints(UTF8String target, UTF8String expected, + Boolean useCodePoints) { + if (useCodePoints) { + assertEquals(expected.toString(), + CollationAwareUTF8String.lowerCaseCodePoints(target.toString())); + } else { + assertEquals(expected, target.toLowerCase()); + } + } + + @Test + public void testLowerCaseCodePoints() { + // Edge cases + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), false); + assertLowerCaseCodePoints(UTF8String.fromString(""), UTF8String.fromString(""), true); + // Basic tests + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("AbCd"), UTF8String.fromString("abcd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("abcd"), UTF8String.fromString("abcd"), true); + assertLowerCaseCodePoints(UTF8String.fromString("aBcD"), UTF8String.fromString("abcd"), true); + // Accent variation + assertLowerCaseCodePoints(UTF8String.fromString("AbĆd"), UTF8String.fromString("abćd"), false); + assertLowerCaseCodePoints(UTF8String.fromString("aBcΔ"), UTF8String.fromString("abcδ"), true); + // Case-variable character length + assertLowerCaseCodePoints( + UTF8String.fromString("İoDiNe"), UTF8String.fromString("i̇odine"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("Abi̇o12"), UTF8String.fromString("abi̇o12"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("İodInE"), UTF8String.fromString("i̇odine"), true); + assertLowerCaseCodePoints( + UTF8String.fromString("aBi̇o12"), UTF8String.fromString("abi̇o12"), true); + // Conditional case mapping + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινος"), false); + assertLowerCaseCodePoints( + UTF8String.fromString("ΘΑΛΑΣΣΙΝΟΣ"), UTF8String.fromString("θαλασσινοσ"), true); + // Surrogate pairs are treated as invalid UTF8 sequences + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\ufffd\ufffd"), false); + assertLowerCaseCodePoints(UTF8String.fromBytes(new byte[] + {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80}), + UTF8String.fromString("\ufffd\ufffd"), true); Review Comment: This is actually a very interesting example. The C/C++ version of ICU replaces this byte array ```scala {(byte) 0xED, (byte) 0xA0, (byte) 0x80, (byte) 0xED, (byte) 0xB0, (byte) 0x80} ``` with 6 U+FFFD characters. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org