uros-db commented on code in PR #46511: URL: https://github.com/apache/spark/pull/46511#discussion_r1596738413
########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -34,6 +34,15 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + public static boolean lowercaseMatchAt(final UTF8String l, final UTF8String r, + int pos, int len) { + if (len + pos > l.numChars() || pos < 0) { + return false; + } + return l.substring(pos, pos + len).toLowerCase().equals(r); Review Comment: > Shouldn't this be? > > return l.substring(pos, pos + len).toLowerCase().equals(r.toLowerCase()); not here, because we know needle is not gonna change (we won't take any substrings of needle), then we can lowercase the needle only once (before the function call) in order to avoid multiple lowercasing (this is only important for `contains` which turns out to be quadratic - just like ICU StringSearch algorithm, while `startsWith` and `endsWith` are linear in nature because one bound is known in advance) ########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -102,20 +102,28 @@ public void testContains() throws SparkException { assertContains("äbćδe", "ÄbćδE", "UNICODE_CI", true); assertContains("äbćδe", "ÄBcΔÉ", "UNICODE_CI", false); // Case-variable character length - assertContains("abİo12", "i̇o", "UNICODE_CI", true); - assertContains("abi̇o12", "İo", "UNICODE_CI", true); - assertContains("the İodine", "the i̇odine", "UTF8_BINARY_LCASE", true); - assertContains("the i̇odine", "the İodine", "UTF8_BINARY_LCASE", true); - assertContains("The İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", "i̇oDine", "UTF8_BINARY_LCASE", true); - assertContains("İodiNe", " i̇oDin", "UTF8_BINARY_LCASE", false); - // Characters with the same binary lowercase representation - assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The Kelvin.", "Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The KKelvin.", "KKelvin", "UTF8_BINARY_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("2 Kelvin.", "2 Kelvin", "UTF8_BINARY_LCASE", true); - assertContains("The KKelvin.", "KKelvin,", "UTF8_BINARY_LCASE", false); + assertContains("i̇", "i", "UNICODE_CI", false); + assertContains("i̇", "İ", "UNICODE_CI", true); + assertContains("İ", "i", "UNICODE_CI", false); + assertContains("adi̇os", "io", "UNICODE_CI", false); + assertContains("adi̇os", "Io", "UNICODE_CI", false); + assertContains("adi̇os", "i̇o", "UNICODE_CI", true); + assertContains("adi̇os", "İo", "UNICODE_CI", true); + assertContains("adİos", "io", "UNICODE_CI", false); + assertContains("adİos", "Io", "UNICODE_CI", false); + assertContains("adİos", "i̇o", "UNICODE_CI", true); + assertContains("adİos", "İo", "UNICODE_CI", true); + assertContains("i̇", "i", "UTF8_BINARY_LCASE", true); // note: different from UNICODE_CI + assertContains("i̇", "İ", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("İ", "i", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adi̇os", "i̇o", "UTF8_BINARY_LCASE", true); + assertContains("adi̇os", "İo", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI + assertContains("adİos", "io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "Io", "UTF8_BINARY_LCASE", false); + assertContains("adİos", "i̇o", "UTF8_BINARY_LCASE", false); // note: different from UNICODE_CI Review Comment: update: resolved in latest commit -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org