uros-db commented on code in PR #46206: URL: https://github.com/apache/spark/pull/46206#discussion_r1580483101
########## common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java: ########## @@ -403,6 +657,289 @@ private static int indexOf(final UTF8String target, final UTF8String pattern, return stringSearch.next(); } + private static UTF8String lowercaseTrim( + final UTF8String srcString, + final UTF8String trimString) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + UTF8String leftTrimmed = lowercaseTrimLeft(srcString, trimString); + return lowercaseTrimRight(leftTrimmed, trimString); + } + + private static UTF8String lowercaseTrimLeft( + final UTF8String srcString, + final UTF8String trimString) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + // The searching byte position in the srcString. + int searchIdx = 0; + // The byte position of a first non-matching character in the srcString. + int trimByteIdx = 0; + // Number of bytes in srcString. + int numBytes = srcString.numBytes(); + // Convert trimString to lowercase so it can be searched properly. + UTF8String lowercaseTrimString = trimString.toLowerCase(); + + while (searchIdx < numBytes) { + UTF8String searchChar = srcString.copyUTF8String( + searchIdx, + searchIdx + UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1); + int searchCharBytes = searchChar.numBytes(); + + // Try to find the matching for the searchChar in the trimString. + if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { + trimByteIdx += searchCharBytes; + searchIdx += searchCharBytes; + } else { + // No matching, exit the search. + break; + } + } + + if (searchIdx == 0) { + // Nothing trimmed - return original string (not converted to lowercase). + return srcString; + } + if (trimByteIdx >= numBytes) { + // Everything trimmed. 
+ return UTF8String.EMPTY_UTF8; + } + return srcString.copyUTF8String(trimByteIdx, numBytes - 1); + } + + private static UTF8String lowercaseTrimRight( + final UTF8String srcString, + final UTF8String trimString) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + // Number of bytes iterated from the srcString. + int byteIdx = 0; + // Number of characters iterated from the srcString. + int numChars = 0; + // Number of bytes in srcString. + int numBytes = srcString.numBytes(); + // Array of character length for the srcString. + int[] stringCharLen = new int[numBytes]; + // Array of the first byte position for each character in the srcString + int[] stringCharPos = new int[numBytes]; + // Non-final value for trim string to use. + UTF8String lowercaseTrimString = trimString.toLowerCase(); + + // Build the position and length array. + while (byteIdx < numBytes) { + stringCharPos[numChars] = byteIdx; + stringCharLen[numChars] = UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx)); + byteIdx += stringCharLen[numChars]; + numChars++; + } + + // Index trimEnd points to the first no matching byte position from the right side of + // the source string. + int trimByteIdx = numBytes - 1; + + while (numChars > 0) { + UTF8String searchChar = srcString.copyUTF8String( + stringCharPos[numChars - 1], + stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1); + + if(lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) { + trimByteIdx -= stringCharLen[numChars - 1]; + numChars--; + } else { + break; + } + } + + if (trimByteIdx == numBytes - 1) { + // Nothing trimmed. + return srcString; + } + if (trimByteIdx < 0) { + // Everything trimmed. 
+ return UTF8String.EMPTY_UTF8; + } + return srcString.copyUTF8String(0, trimByteIdx); + } + + private static UTF8String trim( + final UTF8String srcString, + int collationId) { + UTF8String leftTrimmed = trimLeft(srcString, collationId); + return trimRight(leftTrimmed, collationId); + } + + private static UTF8String trim( + final UTF8String srcString, + final UTF8String trimString, + int collationId) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + UTF8String leftTrimmed = trimLeft(srcString, trimString, collationId); + return trimRight(leftTrimmed, trimString, collationId); + } + + private static UTF8String trimLeft( + final UTF8String srcString, + int collationId) { + return trimLeft(srcString, UTF8String.fromString(" "), collationId); + } + + private static UTF8String trimLeft( + final UTF8String srcString, + final UTF8String trimString, + int collationId) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + // The searching byte position in the srcString. + int searchIdx = 0; + // The byte position of a first non-matching character in the srcString. + int trimByteIdx = 0; + // Number of bytes in srcString. + int numBytes = srcString.numBytes(); + + // Create ICU StringSearch object. + StringSearch stringSearch = CollationFactory.getStringSearch( + trimString, UTF8String.EMPTY_UTF8, collationId); + // Create hash set to save seen chars + Set<UTF8String> seenChars = new HashSet<>(); + + while (searchIdx < numBytes) { + UTF8String searchChar = srcString.copyUTF8String( + searchIdx, + searchIdx + UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1); + int searchCharBytes = searchChar.numBytes(); + + // First check if we have already seen this char in srcString. 
+ if (seenChars.contains(searchChar)) { + trimByteIdx += searchCharBytes; + searchIdx += searchCharBytes; + continue; + } + + // Otherwise, try to find the matching for the searchChar in the trimString. + stringSearch.reset(); + stringSearch.setPattern(searchChar.toString()); + int searchCharIdx = stringSearch.next(); + + if (searchCharIdx != StringSearch.DONE + && stringSearch.getMatchLength() == stringSearch.getPattern().length()) { + trimByteIdx += searchCharBytes; + searchIdx += searchCharBytes; + seenChars.add(searchChar); + } else { + // No matching, exit the search. + break; + } + } + + if (searchIdx == 0) { + // Nothing trimmed - return original string (not converted to lowercase). + return srcString; + } + if (trimByteIdx >= numBytes) { + // Everything trimmed. + return UTF8String.EMPTY_UTF8; + } + return srcString.copyUTF8String(trimByteIdx, numBytes - 1); + } + + private static UTF8String trimRight( + final UTF8String srcString, + int collationId) { + return trimRight(srcString, UTF8String.fromString(" "), collationId); + } + + private static UTF8String trimRight( + final UTF8String srcString, + final UTF8String trimString, + int collationId) { + // Matching UTF8String behavior for null `trimString`. + if (trimString == null) { + return null; + } + + // Number of bytes iterated from the srcString. + int byteIdx = 0; + // Number of characters iterated from the srcString. + int numChars = 0; + // Number of bytes in srcString. + int numBytes = srcString.numBytes(); + // Array of character length for the srcString. + int[] stringCharLen = new int[numBytes]; + // Array of the first byte position for each character in the srcString. + int[] stringCharPos = new int[numBytes]; + + // Build the position and length array. 
+ while (byteIdx < numBytes) { + stringCharPos[numChars] = byteIdx; + stringCharLen[numChars] = UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx)); + byteIdx += stringCharLen[numChars]; + numChars++; + } + + // Create ICU StringSearch object. + StringSearch stringSearch = CollationFactory.getStringSearch( + trimString, UTF8String.EMPTY_UTF8, collationId); + // Create hash set to save seen chars + Set<UTF8String> seenChars = new HashSet<>(); + + // Index trimEnd points to the first no matching byte position from the right side of + // the source string. + int trimByteIdx = numBytes - 1; + + while (numChars > 0) { + UTF8String searchChar = srcString.copyUTF8String( + stringCharPos[numChars - 1], + stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1); + + // First check if we have already seen this char in srcString. + if (seenChars.contains(searchChar)) { + trimByteIdx -= stringCharLen[numChars - 1]; + numChars--; + continue; + } + + // Otherwise, try to find the matching for the searchChar in the trimString. 
+ stringSearch.reset(); + stringSearch.setPattern(searchChar.toString()); + int searchCharIdx = stringSearch.next(); + + if (searchCharIdx != StringSearch.DONE + && stringSearch.getMatchLength() == stringSearch.getPattern().length()) { Review Comment: Instead, we should assume that `stringSearch.getPattern().length()` will differ from `stringSearch.getMatchLength()` in some cases (this is fine; ICU takes care of it, and that is why the StringSearch interface is the way it is). Consider the example of trim("i̇abc", "İ"): after we are done searching for "İ" in the target string, `getMatchLength()` will be 2 because it is derived from the matched target substring (which is "i̇"), but the pattern is still "İ" (which is 1 character), hence `stringSearch.getMatchLength() != stringSearch.getPattern().length()` (which is absolutely **fine**). Now consider the example of trim("İabc", "i̇"): after we are done searching for "i̇" in the target string, `getMatchLength()` will be 1 because it is derived from the matched target substring (which is "İ"), but the pattern is still "i̇" (which is 2 characters), hence `stringSearch.getMatchLength() != stringSearch.getPattern().length()` (which is absolutely **fine**). Bottom line: always use `getMatchLength()` as the only valid source of truth, and don't assume that you know in advance how long the match is, especially not from pattern.length(). -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org