uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1613391451
########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -17,15 +17,136 @@ package org.apache.spark.unsafe.types; import org.apache.spark.SparkException; +import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String; import org.apache.spark.sql.catalyst.util.CollationFactory; import org.apache.spark.sql.catalyst.util.CollationSupport; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { + UTF8String l = UTF8String.fromString(s1); + UTF8String r = UTF8String.fromString(s2); + int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); + assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { + // Edge cases + assertStringCompare("", "", "UTF8_BINARY", 0); + assertStringCompare("a", "", "UTF8_BINARY", 1); + assertStringCompare("", "a", "UTF8_BINARY", -1); + assertStringCompare("", "", "UTF8_BINARY_LCASE", 0); + assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1); + assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1); + assertStringCompare("", "", "UNICODE", 0); + assertStringCompare("a", "", "UNICODE", 1); + assertStringCompare("", "a", "UNICODE", -1); + assertStringCompare("", "", "UNICODE_CI", 0); + assertStringCompare("a", "", "UNICODE_CI", 1); + assertStringCompare("", "a", "UNICODE_CI", -1); + // Basic tests + assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); + assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); + assertStringCompare("AbcD", "aBCd", "UNICODE", 1); + assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); + // Accent variation + assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); + assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); + assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); + // Case-variable character length + assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); + assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307", "İ", "UNICODE", -1); + assertStringCompare("İ", "i\u0307", "UNICODE", 1); + assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); + assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); + assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); + assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); + // Conditional case mapping + assertStringCompare("ς", "σ", "UTF8_BINARY", -1); + assertStringCompare("ς", "Σ", "UTF8_BINARY", 1); + assertStringCompare("σ", "Σ", "UTF8_BINARY", 1); + assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); + assertStringCompare("ς", "σ", "UNICODE", 1); + assertStringCompare("ς", "Σ", "UNICODE", 1); + assertStringCompare("σ", "Σ", "UNICODE", -1); + assertStringCompare("ς", "σ", "UNICODE_CI", 0); + assertStringCompare("ς", "Σ", "UNICODE_CI", 0); + assertStringCompare("σ", "Σ", "UNICODE_CI", 0); + } + + private void assertLcaseCompare(String target, String expected, String collationName) + throws SparkException { + if (collationName.equals("UTF8_BINARY")) { + UTF8String targetUTF8 = UTF8String.fromString(target); + UTF8String expectedUTF8 = UTF8String.fromString(expected); + assertEquals(expectedUTF8, targetUTF8.toLowerCase()); + } else if (collationName.equals("UTF8_BINARY_LCASE")) { + assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target)); + } else { + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId)); + } + } + + @Test + public void testLcaseCompare() throws SparkException { + // Edge cases + assertLcaseCompare("", "", "UTF8_BINARY"); + assertLcaseCompare("", "", "UTF8_BINARY_LCASE"); + assertLcaseCompare("", "", "UNICODE"); + assertLcaseCompare("", "", "UNICODE_CI"); + // Basic tests + assertLcaseCompare("abcd", "abcd", "UTF8_BINARY"); + assertLcaseCompare("AbCd", "abcd", "UTF8_BINARY"); + assertLcaseCompare("abcd", "abcd", "UTF8_BINARY_LCASE"); + assertLcaseCompare("aBcD", "abcd", "UTF8_BINARY_LCASE"); + assertLcaseCompare("abcd", "abcd", "UNICODE"); + assertLcaseCompare("aBCd", "abcd", "UNICODE"); + assertLcaseCompare("abcd", "abcd", "UNICODE_CI"); + assertLcaseCompare("AbcD", "abcd", "UNICODE_CI"); + // Accent variation + assertLcaseCompare("AbĆd", "abćd", "UTF8_BINARY"); + assertLcaseCompare("aBcΔ", "abcδ", "UTF8_BINARY_LCASE"); + assertLcaseCompare("ÄbcD", "äbcd", "UNICODE"); + assertLcaseCompare("aB́Cd", "ab́cd", "UNICODE_CI"); + // Case-variable character length + assertLcaseCompare("İoDiNe", "i̇odine", "UTF8_BINARY"); + assertLcaseCompare("Abi̇o12", "abi̇o12", "UTF8_BINARY"); + assertLcaseCompare("İodInE", "i̇odine", "UTF8_BINARY_LCASE"); + assertLcaseCompare("aBi̇o12", "abi̇o12", "UTF8_BINARY_LCASE"); + assertLcaseCompare("İoDinE", "i̇odine", "UNICODE"); + assertLcaseCompare("abi̇O12", "abi̇o12", "UNICODE"); + assertLcaseCompare("İodINe", "i̇odine", "UNICODE_CI"); + assertLcaseCompare("ABi̇o12", "abi̇o12", "UNICODE_CI"); + // Conditional case mapping + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UTF8_BINARY"); + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινοσ", "UTF8_BINARY_LCASE"); // != UNICODE_CI + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE"); + assertLcaseCompare("ΘΑΛΑΣΣΙΝΟΣ", "θαλασσινος", "UNICODE_CI"); Review Comment: `testLower` tests the output of the appropriate `toLowerCase` method for each collation, that is not the intent here instead, I want `testLcaseCompare` to do the other thing you mentioned, which is: compare `CollationAwareUTF8String.lowerCaseCodePoints` against `CollationAwareUTF8String.toLowerCase` for the UTF8_BINARY_LCASE -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org