Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
cloud-fan closed pull request #46700: [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE URL: https://github.com/apache/spark/pull/46700 -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
cloud-fan commented on PR #46700: URL: https://github.com/apache/spark/pull/46700#issuecomment-2153224186 thanks, merging to master! -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on PR #46700: URL: https://github.com/apache/spark/pull/46700#issuecomment-2153120424 @cloud-fan please review -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on PR #46700: URL: https://github.com/apache/spark/pull/46700#issuecomment-2151524400 in conclusion: we'll need to modify the logic for illegal UTF8 byte sequence replacement, to stay consistent with ICU4C however, we'll do this in a follow-up PR -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628861090 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -26,6 +27,156 @@ // checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * A list containing some of the supported collations in Spark. Use this list to iterate over + * all the important collation groups (binary, lowercase, icu) for complete unit test coverage. + * Note: this list may come in handy when the Spark function result is the same regardless of + * the specified collations (as often seen in some pass-through Spark expressions). + */ + private final String[] testSupportedCollations = +{"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"}; + + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { +UTF8String l = UTF8String.fromString(s1); +UTF8String r = UTF8String.fromString(s2); +int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); +assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { +for (String collationName: testSupportedCollations) { + // Edge cases + assertStringCompare("", "", collationName, 0); + assertStringCompare("a", "", collationName, 1); + assertStringCompare("", "a", collationName, -1); + // Basic tests + assertStringCompare("a", "a", collationName, 0); + assertStringCompare("a", "b", collationName, -1); + assertStringCompare("b", "a", collationName, 1); + assertStringCompare("A", "A", collationName, 0); + assertStringCompare("A", "B", collationName, -1); + assertStringCompare("B", "A", collationName, 1); + assertStringCompare("aa", "a", collationName, 1); + assertStringCompare("b", "bb", collationName, -1); + assertStringCompare("abc", "a", collationName, 1); + assertStringCompare("abc", "b", collationName, -1); + assertStringCompare("abc", "ab", collationName, 1); + assertStringCompare("abc", "abc", collationName, 0); + // ASCII strings + assertStringCompare("", "aaa", collationName, 1); + assertStringCompare("hello", "world", collationName, -1); + assertStringCompare("Spark", "Spark", collationName, 0); + // Non-ASCII strings + assertStringCompare("ü", "ü", collationName, 0); + assertStringCompare("ü", "", collationName, 1); + assertStringCompare("", "ü", collationName, -1); + assertStringCompare("äü", "äü", collationName, 0); + assertStringCompare("äxx", "äx", collationName, 1); + assertStringCompare("a", "ä", collationName, -1); +} +// Non-ASCII strings +assertStringCompare("äü", "bü", "UTF8_BINARY", 1); +assertStringCompare("bxx", "bü", "UTF8_BINARY", -1); +assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1); +assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1); +assertStringCompare("äü", "bü", "UNICODE", -1); +assertStringCompare("bxx", "bü", "UNICODE", 1); +assertStringCompare("äü", "bü", "UNICODE_CI", -1); +assertStringCompare("bxx", "bü", "UNICODE_CI", 1); +// Case variation +assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); +assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); +assertStringCompare("AbcD", "aBCd", "UNICODE", 1); +assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); +// Accent variation +assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); +assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); +assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); +// Case-variable character length +assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); +assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307", "İ", "UNICODE", -1); +assertStringCompare("İ", "i\u0307", "UNICODE", 1); +assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); +assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("İi\u0307",
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628823909 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ## @@ -296,6 +344,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase). + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { +if (codePoint == 0x0130) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + sb.appendCodePoint(0x0069); + sb.appendCodePoint(0x0307); +} +else if (codePoint == 0x03C2) { + // Greek final and non-final capital letter sigma should be mapped the same. + sb.appendCodePoint(0x03C3); +} +else { + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); +} + } + + /** + * Converts an entire string to lowercase using ICU rules, code point by code point, with + * special handling for one-to-many case mappings (i.e. characters that map to multiple + * characters in lowercase). This method omits information about context-sensitive case mappings. Review Comment: adding some more info here -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
mkaravel commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628822473 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ## @@ -296,6 +344,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase). + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { +if (codePoint == 0x0130) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + sb.appendCodePoint(0x0069); + sb.appendCodePoint(0x0307); +} +else if (codePoint == 0x03C2) { + // Greek final and non-final capital letter sigma should be mapped the same. + sb.appendCodePoint(0x03C3); +} +else { + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); Review Comment: Sounds good. Will then approve this PR. LGTM for the scope it tries to cover. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
mkaravel commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628821647 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java: ## @@ -412,9 +412,9 @@ protected Collation buildCollation() { "UTF8_BINARY_LCASE", PROVIDER_SPARK, null, -UTF8String::compareLowerCase, Review Comment: I like the idea by @dbatomic. I think the `UTF8String` class should just have functionality for the UTF8_BINARY collation. Whatever is relevant to UTF8_BINARY_LCASE looks like it should be in a separate class. Mixing functionality in confusing for the reader of either class. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628820660 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ## @@ -296,6 +344,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase). + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { +if (codePoint == 0x0130) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + sb.appendCodePoint(0x0069); + sb.appendCodePoint(0x0307); +} +else if (codePoint == 0x03C2) { + // Greek final and non-final capital letter sigma should be mapped the same. + sb.appendCodePoint(0x03C3); +} +else { + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); Review Comment: that's correct, since the logic of illegal UTF8 byte sequences is a bit involved and requires thorough testing, I plan to do that in a follow-up PR -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
mkaravel commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628813249 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -26,6 +27,156 @@ // checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * A list containing some of the supported collations in Spark. Use this list to iterate over + * all the important collation groups (binary, lowercase, icu) for complete unit test coverage. + * Note: this list may come in handy when the Spark function result is the same regardless of + * the specified collations (as often seen in some pass-through Spark expressions). + */ + private final String[] testSupportedCollations = +{"UTF8_BINARY", "UTF8_BINARY_LCASE", "UNICODE", "UNICODE_CI"}; + + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { +UTF8String l = UTF8String.fromString(s1); +UTF8String r = UTF8String.fromString(s2); +int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); +assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { +for (String collationName: testSupportedCollations) { + // Edge cases + assertStringCompare("", "", collationName, 0); + assertStringCompare("a", "", collationName, 1); + assertStringCompare("", "a", collationName, -1); + // Basic tests + assertStringCompare("a", "a", collationName, 0); + assertStringCompare("a", "b", collationName, -1); + assertStringCompare("b", "a", collationName, 1); + assertStringCompare("A", "A", collationName, 0); + assertStringCompare("A", "B", collationName, -1); + assertStringCompare("B", "A", collationName, 1); + assertStringCompare("aa", "a", collationName, 1); + assertStringCompare("b", "bb", collationName, -1); + assertStringCompare("abc", "a", collationName, 1); + assertStringCompare("abc", "b", collationName, -1); + assertStringCompare("abc", "ab", collationName, 1); + assertStringCompare("abc", "abc", collationName, 0); + // ASCII strings + assertStringCompare("", "aaa", collationName, 1); + assertStringCompare("hello", "world", collationName, -1); + assertStringCompare("Spark", "Spark", collationName, 0); + // Non-ASCII strings + assertStringCompare("ü", "ü", collationName, 0); + assertStringCompare("ü", "", collationName, 1); + assertStringCompare("", "ü", collationName, -1); + assertStringCompare("äü", "äü", collationName, 0); + assertStringCompare("äxx", "äx", collationName, 1); + assertStringCompare("a", "ä", collationName, -1); +} +// Non-ASCII strings +assertStringCompare("äü", "bü", "UTF8_BINARY", 1); +assertStringCompare("bxx", "bü", "UTF8_BINARY", -1); +assertStringCompare("äü", "bü", "UTF8_BINARY_LCASE", 1); +assertStringCompare("bxx", "bü", "UTF8_BINARY_LCASE", -1); +assertStringCompare("äü", "bü", "UNICODE", -1); +assertStringCompare("bxx", "bü", "UNICODE", 1); +assertStringCompare("äü", "bü", "UNICODE_CI", -1); +assertStringCompare("bxx", "bü", "UNICODE_CI", 1); +// Case variation +assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); +assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); +assertStringCompare("AbcD", "aBCd", "UNICODE", 1); +assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); +// Accent variation +assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); +assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); +assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); +// Case-variable character length +assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); +assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307", "İ", "UNICODE", -1); +assertStringCompare("İ", "i\u0307", "UNICODE", 1); +assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); +assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("İi\u0307",
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
mkaravel commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1628799978 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ## @@ -296,6 +344,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * one-to-many case mappings (i.e. characters that map to multiple characters in lowercase). + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { +if (codePoint == 0x0130) { + // Latin capital letter I with dot above is mapped to 2 lowercase characters. + sb.appendCodePoint(0x0069); + sb.appendCodePoint(0x0307); +} +else if (codePoint == 0x03C2) { + // Greek final and non-final capital letter sigma should be mapped the same. + sb.appendCodePoint(0x03C3); +} +else { + // All other characters should follow context-unaware ICU single-code point case mapping. + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); Review Comment: I believe this does not map illegal UTF8 byte sequences to U+FFFD. Do we plan to make this change as part of this PR, or in a follow up one? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1627182142 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java: ## @@ -412,9 +412,9 @@ protected Collation buildCollation() { "UTF8_BINARY_LCASE", PROVIDER_SPARK, null, -UTF8String::compareLowerCase, Review Comment: Agreed, fixed in recent commit also added "fast" comparison for ASCII strings -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1615675019 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -17,15 +17,136 @@ package org.apache.spark.unsafe.types; import org.apache.spark.SparkException; +import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String; import org.apache.spark.sql.catalyst.util.CollationFactory; import org.apache.spark.sql.catalyst.util.CollationSupport; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { +UTF8String l = UTF8String.fromString(s1); +UTF8String r = UTF8String.fromString(s2); +int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); +assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { +// Edge cases +assertStringCompare("", "", "UTF8_BINARY", 0); +assertStringCompare("a", "", "UTF8_BINARY", 1); +assertStringCompare("", "a", "UTF8_BINARY", -1); +assertStringCompare("", "", "UTF8_BINARY_LCASE", 0); +assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1); +assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1); +assertStringCompare("", "", "UNICODE", 0); +assertStringCompare("a", "", "UNICODE", 1); +assertStringCompare("", "a", "UNICODE", -1); +assertStringCompare("", "", "UNICODE_CI", 0); +assertStringCompare("a", "", "UNICODE_CI", 1); +assertStringCompare("", "a", "UNICODE_CI", -1); +// Basic tests +assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); +assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); +assertStringCompare("AbcD", "aBCd", "UNICODE", 1); +assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); +// Accent variation +assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); +assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); +assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); +// Case-variable character length +assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); +assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307", "İ", "UNICODE", -1); +assertStringCompare("İ", "i\u0307", "UNICODE", 1); +assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); +assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); +// Conditional case mapping +assertStringCompare("ς", "σ", "UTF8_BINARY", -1); +assertStringCompare("ς", "Σ", "UTF8_BINARY", 1); +assertStringCompare("σ", "Σ", "UTF8_BINARY", 1); +assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("ς", "σ", "UNICODE", 1); +assertStringCompare("ς", "Σ", "UNICODE", 1); +assertStringCompare("σ", "Σ", "UNICODE", -1); +assertStringCompare("ς", "σ", "UNICODE_CI", 0); +assertStringCompare("ς", "Σ", "UNICODE_CI", 0); +assertStringCompare("σ", "Σ", "UNICODE_CI", 0); + } + + private void assertLcaseCompare(String target, String expected, String collationName) + throws SparkException { +if (collationName.equals("UTF8_BINARY")) { + UTF8String targetUTF8 = UTF8String.fromString(target); + UTF8String expectedUTF8 = UTF8String.fromString(expected); + assertEquals(expectedUTF8, targetUTF8.toLowerCase()); +} else if (collationName.equals("UTF8_BINARY_LCASE")) { + assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target)); +} else { + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId)); +} + } + + @Test + public void testLcaseCompare() throws SparkException { +// Edge cases +
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1613391451 ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -17,15 +17,136 @@ package org.apache.spark.unsafe.types; import org.apache.spark.SparkException; +import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String; import org.apache.spark.sql.catalyst.util.CollationFactory; import org.apache.spark.sql.catalyst.util.CollationSupport; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { +UTF8String l = UTF8String.fromString(s1); +UTF8String r = UTF8String.fromString(s2); +int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); +assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { +// Edge cases +assertStringCompare("", "", "UTF8_BINARY", 0); +assertStringCompare("a", "", "UTF8_BINARY", 1); +assertStringCompare("", "a", "UTF8_BINARY", -1); +assertStringCompare("", "", "UTF8_BINARY_LCASE", 0); +assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1); +assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1); +assertStringCompare("", "", "UNICODE", 0); +assertStringCompare("a", "", "UNICODE", 1); +assertStringCompare("", "a", "UNICODE", -1); +assertStringCompare("", "", "UNICODE_CI", 0); +assertStringCompare("a", "", "UNICODE_CI", 1); +assertStringCompare("", "a", "UNICODE_CI", -1); +// Basic tests +assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); +assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); +assertStringCompare("AbcD", "aBCd", "UNICODE", 1); +assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); +// Accent variation +assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); +assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); +assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); +// Case-variable character length +assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); +assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307", "İ", "UNICODE", -1); +assertStringCompare("İ", "i\u0307", "UNICODE", 1); +assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); +assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UNICODE_CI", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UNICODE_CI", 0); +assertStringCompare("İi\u0307", "İi\u0307", "UNICODE_CI", 0); +// Conditional case mapping +assertStringCompare("ς", "σ", "UTF8_BINARY", -1); +assertStringCompare("ς", "Σ", "UTF8_BINARY", 1); +assertStringCompare("σ", "Σ", "UTF8_BINARY", 1); +assertStringCompare("ς", "σ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("ς", "Σ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("σ", "Σ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("ς", "σ", "UNICODE", 1); +assertStringCompare("ς", "Σ", "UNICODE", 1); +assertStringCompare("σ", "Σ", "UNICODE", -1); +assertStringCompare("ς", "σ", "UNICODE_CI", 0); +assertStringCompare("ς", "Σ", "UNICODE_CI", 0); +assertStringCompare("σ", "Σ", "UNICODE_CI", 0); + } + + private void assertLcaseCompare(String target, String expected, String collationName) + throws SparkException { +if (collationName.equals("UTF8_BINARY")) { + UTF8String targetUTF8 = UTF8String.fromString(target); + UTF8String expectedUTF8 = UTF8String.fromString(expected); + assertEquals(expectedUTF8, targetUTF8.toLowerCase()); +} else if (collationName.equals("UTF8_BINARY_LCASE")) { + assertEquals(expected, CollationAwareUTF8String.lowerCaseCodePoints(target)); +} else { + int collationId = CollationFactory.collationNameToId(collationName); + assertEquals(expected, CollationAwareUTF8String.toLowerCase(target, collationId)); +} + } + + @Test + public void testLcaseCompare() throws SparkException { +// Edge cases +
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1613207707 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ## @@ -147,6 +162,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * conditional case mappings (i.e. characters that map to multiple characters in lowercase). Review Comment: ah, so "conditional case mapping" == "context-sensitivity" instead, what I wanted to say is just "one-to-many case mapping" -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
uros-db commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1613207707 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ## @@ -147,6 +162,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * conditional case mappings (i.e. characters that map to multiple characters in lowercase). Review Comment: ah, so "conditional case mapping" == "context-sensitivity" instead, what I wanted to say is just "case mapping" -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org - To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org
Re: [PR] [SPARK-48283][SQL] Modify string comparison for UTF8_BINARY_LCASE [spark]
mkaravel commented on code in PR #46700: URL: https://github.com/apache/spark/pull/46700#discussion_r1612550274 ## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ## @@ -147,6 +162,45 @@ public static String toLowerCase(final String target, final int collationId) { return UCharacter.toLowerCase(locale, target); } + /** + * Converts a single code point to lowercase using ICU rules, with special handling for + * conditional case mappings (i.e. characters that map to multiple characters in lowercase). + * + * @param codePoint The code point to convert to lowercase. + * @param sb The StringBuilder to append the lowercase character to. + */ + private static void lowercaseCodePoint(final int codePoint, final StringBuilder sb) { +// Latin capital letter I with dot above is mapped to 2 lowercase characters. +if (codePoint == 0x0130) { + sb.append("i̇"); +} +// Greek final and non-final capital letter sigma should be mapped the same. +else if (codePoint == 0x03C2) { + sb.append("σ"); +} +// All other characters should follow context-unaware ICU single-code point case mapping. +else { + sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); +} Review Comment: I think the following is easier to read: ```suggestion // Latin capital letter I with dot above is mapped to 2 lowercase characters. if (codePoint == 0x0130) { // Latin capital letter I with dot above is mapped to 2 lowercase characters. sb.append("i̇"); } else if (codePoint == 0x03C2) { // Greek final and non-final capital letter sigma should be mapped the same. sb.append("σ"); } else { // All other characters should follow context-unaware ICU single-code point case mapping. sb.appendCodePoint(UCharacter.toLowerCase(codePoint)); } ``` ## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ## @@ -17,15 +17,136 @@ package org.apache.spark.unsafe.types; import org.apache.spark.SparkException; +import org.apache.spark.sql.catalyst.util.CollationAwareUTF8String; import org.apache.spark.sql.catalyst.util.CollationFactory; import org.apache.spark.sql.catalyst.util.CollationSupport; import org.junit.jupiter.api.Test; import static org.junit.jupiter.api.Assertions.*; - +// checkstyle.off: AvoidEscapedUnicodeCharacters public class CollationSupportSuite { + /** + * Collation-aware UTF8String comparison. + */ + + private void assertStringCompare(String s1, String s2, String collationName, int expected) + throws SparkException { +UTF8String l = UTF8String.fromString(s1); +UTF8String r = UTF8String.fromString(s2); +int compare = CollationFactory.fetchCollation(collationName).comparator.compare(l, r); +assertEquals(Integer.signum(expected), Integer.signum(compare)); + } + + @Test + public void testCompare() throws SparkException { +// Edge cases +assertStringCompare("", "", "UTF8_BINARY", 0); +assertStringCompare("a", "", "UTF8_BINARY", 1); +assertStringCompare("", "a", "UTF8_BINARY", -1); +assertStringCompare("", "", "UTF8_BINARY_LCASE", 0); +assertStringCompare("a", "", "UTF8_BINARY_LCASE", 1); +assertStringCompare("", "a", "UTF8_BINARY_LCASE", -1); +assertStringCompare("", "", "UNICODE", 0); +assertStringCompare("a", "", "UNICODE", 1); +assertStringCompare("", "a", "UNICODE", -1); +assertStringCompare("", "", "UNICODE_CI", 0); +assertStringCompare("a", "", "UNICODE_CI", 1); +assertStringCompare("", "a", "UNICODE_CI", -1); +// Basic tests +assertStringCompare("AbCd", "aBcD", "UTF8_BINARY", -1); +assertStringCompare("ABCD", "abcd", "UTF8_BINARY_LCASE", 0); +assertStringCompare("AbcD", "aBCd", "UNICODE", 1); +assertStringCompare("abcd", "ABCD", "UNICODE_CI", 0); +// Accent variation +assertStringCompare("aBćD", "ABĆD", "UTF8_BINARY", 1); +assertStringCompare("AbCδ", "ABCΔ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("äBCd", "ÄBCD", "UNICODE", -1); +assertStringCompare("Ab́cD", "AB́CD", "UNICODE_CI", 0); +// Case-variable character length +assertStringCompare("i\u0307", "İ", "UTF8_BINARY", -1); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY", 1); +assertStringCompare("i\u0307", "İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İ", "i\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307", "İ", "UNICODE", -1); +assertStringCompare("İ", "i\u0307", "UNICODE", 1); +assertStringCompare("i\u0307", "İ", "UNICODE_CI", 0); +assertStringCompare("İ", "i\u0307", "UNICODE_CI", 0); +assertStringCompare("i\u0307İ", "i\u0307İ", "UTF8_BINARY_LCASE", 0); +assertStringCompare("i\u0307İ", "İi\u0307", "UTF8_BINARY_LCASE", 0); +assertStringCompare("İi\u0307", "i\u0307İ", "UTF8_BINARY_LCASE",