mkaravel commented on code in PR #46589: URL: https://github.com/apache/spark/pull/46589#discussion_r1603752983
########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) Review Comment: ```suggestion // Search left to right (note: the start code point is inclusive). ``` Please use regular sentences as comments. ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string Review Comment: ```suggestion else return string; // Cannot find enough delimiters in the string. ``` ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - byte[] bytes = new byte[idx]; - copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); - return UTF8String.fromBytes(bytes); - + if (matchLength == 0) return UTF8String.EMPTY_UTF8; + return string.substring(0, matchLength); } else { - int idx = string.numBytes() - delimiter.numBytes() + 1; + // search right to left (note: the end code point is exclusive) + int matchLength = string.numChars() + 1; count = -count; while (count > 0) { - idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } + matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - if (idx + delimiter.numBytes() == string.numBytes()) { - return UTF8String.EMPTY_UTF8; - } - int size = string.numBytes() - delimiter.numBytes() - idx; - byte[] bytes = new byte[size]; - copyMemory(string.getBaseObject(), string.getBaseOffset() + idx + delimiter.numBytes(), - bytes, BYTE_ARRAY_OFFSET, size); - return UTF8String.fromBytes(bytes); + if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8; + return string.substring(matchLength, string.numChars()); Review Comment: ```suggestion return string.substring(matchLength, string.numChars()); ``` Same here. ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - byte[] bytes = new byte[idx]; - copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); - return UTF8String.fromBytes(bytes); - + if (matchLength == 0) return UTF8String.EMPTY_UTF8; + return string.substring(0, matchLength); } else { - int idx = string.numBytes() - delimiter.numBytes() + 1; + // search right to left (note: the end code point is exclusive) + int matchLength = string.numChars() + 1; count = -count; while (count > 0) { - idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } + matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength - 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter Review Comment: ```suggestion if (matchLength != MATCH_NOT_FOUND) --count; // Found a delimiter. ``` ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - byte[] bytes = new byte[idx]; - copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); - return UTF8String.fromBytes(bytes); - + if (matchLength == 0) return UTF8String.EMPTY_UTF8; + return string.substring(0, matchLength); } else { - int idx = string.numBytes() - delimiter.numBytes() + 1; + // search right to left (note: the end code point is exclusive) Review Comment: ```suggestion // Search right to left (note: the end code point is exclusive). ``` ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -183,6 +320,22 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + /** + * Returns the position of the first occurrence of the pattern string + * in the target string from the specified position (0-based index), + * with respect to the UTF8_BINARY_LCASE collation. + * + * @param target the string to be searched in + * @param pattern the string to be searched for + * @param start the start position for searching (in the target string) + * @return the position of the first occurrence of pattern in target, if not found, -1 returned. + */ + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return 0; + return lowercaseFind(target, pattern.toLowerCase(), start); + } + Review Comment: I am not reviewing changes from this point and above. I have reviewed them in https://github.com/apache/spark/pull/46511 ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter Review Comment: ```suggestion if (matchLength != MATCH_NOT_FOUND) --count; // Found a delimiter. ``` ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final UTF8String string, return UTF8String.EMPTY_UTF8; } - UTF8String lowercaseString = string.toLowerCase(); UTF8String lowercaseDelimiter = delimiter.toLowerCase(); if (count > 0) { - int idx = -1; + // search left to right (note: the start code point is inclusive) + int matchLength = -1; while (count > 0) { - idx = lowercaseString.find(lowercaseDelimiter, idx + 1); - if (idx >= 0) { - count--; - } else { - // can not find enough delim - return string; - } - } - if (idx == 0) { - return UTF8String.EMPTY_UTF8; + matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength + 1); + if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter + else return string; // cannot find enough delimiters in the string } - byte[] bytes = new byte[idx]; - copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes, BYTE_ARRAY_OFFSET, idx); - return UTF8String.fromBytes(bytes); - + if (matchLength == 0) return UTF8String.EMPTY_UTF8; + return string.substring(0, matchLength); Review Comment: ```suggestion return string.substring(0, matchLength); ``` The `UTF8String.substring` method does what you want for you. ########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -798,6 +816,30 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UTF8_BINARY_LCASE", "ai̇bİoi̇o12"); Review Comment: Same for these. ########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -567,8 +567,26 @@ public void testStringInstr() throws SparkException { assertStringInstr("aaads", "dS", "UNICODE_CI", 4); assertStringInstr("test大千世界X大千世界", "界y", "UNICODE_CI", 0); assertStringInstr("test大千世界X大千世界", "界x", "UNICODE_CI", 8); - assertStringInstr("abİo12", "i̇o", "UNICODE_CI", 3); - assertStringInstr("abi̇o12", "İo", "UNICODE_CI", 3); + assertStringInstr("i̇", "i", "UNICODE_CI", 0); + assertStringInstr("i̇", "\u0307", "UNICODE_CI", 0); + assertStringInstr("i̇", "İ", "UNICODE_CI", 1); + assertStringInstr("İ", "i", "UNICODE_CI", 0); + assertStringInstr("İoi̇o12", "i̇o", "UNICODE_CI", 1); + assertStringInstr("i̇oİo12", "İo", "UNICODE_CI", 1); + assertStringInstr("abİoi̇o", "i̇o", "UNICODE_CI", 3); + assertStringInstr("abi̇oİo", "İo", "UNICODE_CI", 3); + assertStringInstr("ai̇oxXİo", "Xx", "UNICODE_CI", 5); + assertStringInstr("aİoi̇oxx", "XX", "UNICODE_CI", 7); + assertStringInstr("i̇", "i", "UTF8_BINARY_LCASE", 1); // != UNICODE_CI + assertStringInstr("i̇", "\u0307", "UTF8_BINARY_LCASE", 2); // != UNICODE_CI + assertStringInstr("i̇", "İ", "UTF8_BINARY_LCASE", 1); + assertStringInstr("İ", "i", "UTF8_BINARY_LCASE", 0); + assertStringInstr("İoi̇o12", "i̇o", "UTF8_BINARY_LCASE", 1); + assertStringInstr("i̇oİo12", "İo", "UTF8_BINARY_LCASE", 1); + assertStringInstr("abİoi̇o", "i̇o", "UTF8_BINARY_LCASE", 3); + assertStringInstr("abi̇oİo", "İo", "UTF8_BINARY_LCASE", 3); Review Comment: Can we also add? ``` assertStringInstr("abI\u0307oi̇o", "İo", "UTF8_BINARY_LCASE", 3); assertStringInstr("abİoi̇o", "\u0307o", "UTF8_BINARY_LCASE", 6); ``` ########## common/unsafe/src/test/java/org/apache/spark/unsafe/types/CollationSupportSuite.java: ########## @@ -798,6 +816,30 @@ public void testSubstringIndex() throws SparkException { assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("abi̇12", "i", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "\u0307", 1, "UNICODE_CI", "abi̇12"); + assertSubstringIndex("abi̇12", "İ", 1, "UNICODE_CI", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UNICODE_CI", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UNICODE_CI", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UNICODE_CI", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", 3, "UNICODE_CI", "ai̇bi̇oİo12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", 3, "UNICODE_CI", "ai̇bİoi̇o12"); + assertSubstringIndex("abi̇12", "i", 1, "UTF8_BINARY_LCASE", "ab"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "\u0307", 1, "UTF8_BINARY_LCASE", "abi"); // != UNICODE_CI + assertSubstringIndex("abi̇12", "İ", 1, "UTF8_BINARY_LCASE", "ab"); + assertSubstringIndex("abİ12", "i", 1, "UTF8_BINARY_LCASE", "abİ12"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "İo", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bi̇oİo12İoi̇o", "i̇o", -4, "UTF8_BINARY_LCASE", "İo12İoi̇o"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "İo", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); + assertSubstringIndex("ai̇bİoi̇o12i̇oİo", "i̇o", -4, "UTF8_BINARY_LCASE", "i̇o12i̇oİo"); Review Comment: Can we also add interesting test cases where there is no match ? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org