mkaravel commented on code in PR #46511: URL: https://github.com/apache/spark/pull/46511#discussion_r1599455730
########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -34,6 +34,27 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) { + if (pos < 0) return false; + for (int len = 0; len <= l.numChars() - pos; len++) { Review Comment: ```suggestion for (int len = 0; len <= l.numChars() - pos; ++len) { ``` ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -34,6 +34,27 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) { + if (pos < 0) return false; Review Comment: Why do we return false in this case? Also why not make this an assert and make sure we never call this method with a negative `pos`? ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -34,6 +34,27 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) { + if (pos < 0) return false; + for (int len = 0; len <= l.numChars() - pos; len++) { + if (l.substring(pos, pos + len).toLowerCase().equals(r)) { + return true; + } + } + return false; + } + + public static boolean lowercaseMatchUntil(final UTF8String l, final UTF8String r, int pos) { + if (pos > l.numChars()) return false; Review Comment: Same comment regarding the bounds for `pos`. ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -183,6 +204,19 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return 0; Review Comment: I would expect `start` to be returned here, not `0`. ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -34,6 +34,27 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) { + if (pos < 0) return false; + for (int len = 0; len <= l.numChars() - pos; len++) { + if (l.substring(pos, pos + len).toLowerCase().equals(r)) { Review Comment: ```suggestion if (l.substring(pos, pos + len).toLowerCase().equals(r.toLowerCase())) { ``` Somehow we need to state that `r` is already in lowercase (comment and/or change the name of the variable). ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -183,6 +204,19 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, + final int start) { + if (pattern.numChars() == 0) return 0; + int lenHaystack = target.numChars(), lenNeedle = pattern.numChars(); + final UTF8String needle = pattern.toLowerCase(); + for (int i = start; i <= (lenHaystack - lenNeedle); i++) { Review Comment: ```suggestion for (int i = start; i <= (lenHaystack - lenNeedle); ++i) { ``` ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ########## @@ -118,7 +118,8 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.contains(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.containsInLowerCase(r); + if (r.numBytes() == 0) return true; Review Comment: Do you need this? There is special case inside `lowercaseIndexOf` that handles it. ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ########## @@ -156,7 +157,8 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.startsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.startsWithInLowerCase(r); + if (r.numBytes() == 0) return true; Review Comment: Same here. ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -34,6 +34,27 @@ * Utility class for collation-aware UTF8String operations. */ public class CollationAwareUTF8String { + + public static boolean lowercaseMatchFrom(final UTF8String l, final UTF8String r, int pos) { + if (pos < 0) return false; + for (int len = 0; len <= l.numChars() - pos; len++) { + if (l.substring(pos, pos + len).toLowerCase().equals(r)) { + return true; + } + } + return false; + } + + public static boolean lowercaseMatchUntil(final UTF8String l, final UTF8String r, int pos) { + if (pos > l.numChars()) return false; + for (int len = 1; len <= pos; len++) { Review Comment: ```suggestion for (int len = 1; len <= pos; ++len) { ``` ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java: ########## @@ -183,6 +204,19 @@ public static int findInSet(final UTF8String match, final UTF8String set, int co return 0; } + public static int lowercaseIndexOf(final UTF8String target, final UTF8String pattern, Review Comment: Please add documentation comments to all the functions, specifically regarding the indices (1-based) and the return values (here for example we want to say that in case we don't have a match we return `-1`). ########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ########## @@ -193,7 +195,8 @@ public static boolean execBinary(final UTF8String l, final UTF8String r) { return l.endsWith(r); } public static boolean execLowercase(final UTF8String l, final UTF8String r) { - return l.endsWithInLowerCase(r); + if (r.numBytes() == 0) return true; Review Comment: And here. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org