mkaravel commented on code in PR #46589:
URL: https://github.com/apache/spark/pull/46589#discussion_r1603752983
##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##
@@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
return UTF8String.EMPTY_UTF8;
}
-UTF8String lowercaseString = string.toLowerCase();
UTF8String lowercaseDelimiter = delimiter.toLowerCase();
if (count > 0) {
- int idx = -1;
+ // search left to right (note: the start code point is inclusive)
Review Comment:
```suggestion
// Search left to right (note: the start code point is inclusive).
```
Please use regular sentences as comments.
##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##
@@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
return UTF8String.EMPTY_UTF8;
}
-UTF8String lowercaseString = string.toLowerCase();
UTF8String lowercaseDelimiter = delimiter.toLowerCase();
if (count > 0) {
- int idx = -1;
+ // search left to right (note: the start code point is inclusive)
+ int matchLength = -1;
while (count > 0) {
-idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
-if (idx >= 0) {
- count--;
-} else {
- // can not find enough delim
- return string;
-}
- }
- if (idx == 0) {
-return UTF8String.EMPTY_UTF8;
+matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength +
1);
+if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
+else return string; // cannot find enough delimiters in the string
Review Comment:
```suggestion
else return string; // Cannot find enough delimiters in the string.
```
##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##
@@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
return UTF8String.EMPTY_UTF8;
}
-UTF8String lowercaseString = string.toLowerCase();
UTF8String lowercaseDelimiter = delimiter.toLowerCase();
if (count > 0) {
- int idx = -1;
+ // search left to right (note: the start code point is inclusive)
+ int matchLength = -1;
while (count > 0) {
-idx = lowercaseString.find(lowercaseDelimiter, idx + 1);
-if (idx >= 0) {
- count--;
-} else {
- // can not find enough delim
- return string;
-}
- }
- if (idx == 0) {
-return UTF8String.EMPTY_UTF8;
+matchLength = lowercaseFind(string, lowercaseDelimiter, matchLength +
1);
+if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
+else return string; // cannot find enough delimiters in the string
}
- byte[] bytes = new byte[idx];
- copyMemory(string.getBaseObject(), string.getBaseOffset(), bytes,
BYTE_ARRAY_OFFSET, idx);
- return UTF8String.fromBytes(bytes);
-
+ if (matchLength == 0) return UTF8String.EMPTY_UTF8;
+ return string.substring(0, matchLength);
} else {
- int idx = string.numBytes() - delimiter.numBytes() + 1;
+ // search right to left (note: the end code point is exclusive)
+ int matchLength = string.numChars() + 1;
count = -count;
while (count > 0) {
-idx = lowercaseString.rfind(lowercaseDelimiter, idx - 1);
-if (idx >= 0) {
- count--;
-} else {
- // can not find enough delim
- return string;
-}
+matchLength = lowercaseRFind(string, lowercaseDelimiter, matchLength -
1);
+if (matchLength > MATCH_NOT_FOUND) count--; // found a delimiter
+else return string; // cannot find enough delimiters in the string
}
- if (idx + delimiter.numBytes() == string.numBytes()) {
-return UTF8String.EMPTY_UTF8;
- }
- int size = string.numBytes() - delimiter.numBytes() - idx;
- byte[] bytes = new byte[size];
- copyMemory(string.getBaseObject(), string.getBaseOffset() + idx +
delimiter.numBytes(),
-bytes, BYTE_ARRAY_OFFSET, size);
- return UTF8String.fromBytes(bytes);
+ if (matchLength == string.numChars()) return UTF8String.EMPTY_UTF8;
+ return string.substring(matchLength, string.numChars());
Review Comment:
```suggestion
return string.substring(matchLength, string.numChars());
```
Same here.
##
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java:
##
@@ -278,47 +431,29 @@ public static UTF8String lowercaseSubStringIndex(final
UTF8String string,
return UTF8String.EMPTY_UTF8;
}
-UTF8String lowercaseString = string.toLowerCase();
UTF8