Re: [PR] [SPARK-47409][SQL] Add support for collation for StringTrim type of functions/expressions [spark]

via GitHub Thu, 25 Apr 2024 21:52:47 -0700


uros-db commented on code in PR #46206:
URL: https://github.com/apache/spark/pull/46206#discussion_r1580477853



##########
common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java:
##########
@@ -403,6 +657,289 @@ private static int indexOf(final UTF8String target, final 
UTF8String pattern,
       return stringSearch.next();
     }
 
+    private static UTF8String lowercaseTrim(
+        final UTF8String srcString,
+        final UTF8String trimString) {
+      // Matching UTF8String behavior for null `trimString`.
+      if (trimString == null) {
+        return null;
+      }
+
+      UTF8String leftTrimmed = lowercaseTrimLeft(srcString, trimString);
+      return lowercaseTrimRight(leftTrimmed, trimString);
+    }
+
+    private static UTF8String lowercaseTrimLeft(
+        final UTF8String srcString,
+        final UTF8String trimString) {
+      // Matching UTF8String behavior for null `trimString`.
+      if (trimString == null) {
+        return null;
+      }
+
+      // The searching byte position in the srcString.
+      int searchIdx = 0;
+      // The byte position of a first non-matching character in the srcString.
+      int trimByteIdx = 0;
+      // Number of bytes in srcString.
+      int numBytes = srcString.numBytes();
+      // Convert trimString to lowercase so it can be searched properly.
+      UTF8String lowercaseTrimString = trimString.toLowerCase();
+
+      while (searchIdx < numBytes) {
+        UTF8String searchChar = srcString.copyUTF8String(
+          searchIdx,
+          searchIdx + 
UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1);
+        int searchCharBytes = searchChar.numBytes();
+
+        // Try to find the matching for the searchChar in the trimString.
+        if (lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) {
+          trimByteIdx += searchCharBytes;
+          searchIdx += searchCharBytes;
+        } else {
+          // No matching, exit the search.
+          break;
+        }
+      }
+
+      if (searchIdx == 0) {
+        // Nothing trimmed - return original string (not converted to 
lowercase).
+        return srcString;
+      }
+      if (trimByteIdx >= numBytes) {
+        // Everything trimmed.
+        return UTF8String.EMPTY_UTF8;
+      }
+      return srcString.copyUTF8String(trimByteIdx, numBytes - 1);
+    }
+
+    private static UTF8String lowercaseTrimRight(
+        final UTF8String srcString,
+        final UTF8String trimString) {
+      // Matching UTF8String behavior for null `trimString`.
+      if (trimString == null) {
+        return null;
+      }
+
+      // Number of bytes iterated from the srcString.
+      int byteIdx = 0;
+      // Number of characters iterated from the srcString.
+      int numChars = 0;
+      // Number of bytes in srcString.
+      int numBytes = srcString.numBytes();
+      // Array of character length for the srcString.
+      int[] stringCharLen = new int[numBytes];
+      // Array of the first byte position for each character in the srcString
+      int[] stringCharPos = new int[numBytes];
+      // Non-final value for trim string to use.
+      UTF8String lowercaseTrimString = trimString.toLowerCase();
+
+      // Build the position and length array.
+      while (byteIdx < numBytes) {
+        stringCharPos[numChars] = byteIdx;
+        stringCharLen[numChars] = 
UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx));
+        byteIdx += stringCharLen[numChars];
+        numChars++;
+      }
+
+      // Index trimEnd points to the first no matching byte position from the 
right side of
+      //  the source string.
+      int trimByteIdx = numBytes - 1;
+
+      while (numChars > 0) {
+        UTF8String searchChar = srcString.copyUTF8String(
+          stringCharPos[numChars - 1],
+          stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1);
+
+        if(lowercaseTrimString.find(searchChar.toLowerCase(), 0) >= 0) {
+          trimByteIdx -= stringCharLen[numChars - 1];
+          numChars--;
+        } else {
+          break;
+        }
+      }
+
+      if (trimByteIdx == numBytes - 1) {
+        // Nothing trimmed.
+        return srcString;
+      }
+      if (trimByteIdx < 0) {
+        // Everything trimmed.
+        return UTF8String.EMPTY_UTF8;
+      }
+      return srcString.copyUTF8String(0, trimByteIdx);
+    }
+
+    private static UTF8String trim(
+        final UTF8String srcString,
+        int collationId) {
+      UTF8String leftTrimmed = trimLeft(srcString, collationId);
+      return trimRight(leftTrimmed, collationId);
+    }
+
+    private static UTF8String trim(
+        final UTF8String srcString,
+        final UTF8String trimString,
+        int collationId) {
+      // Matching UTF8String behavior for null `trimString`.
+      if (trimString == null) {
+        return null;
+      }
+
+      UTF8String leftTrimmed = trimLeft(srcString, trimString, collationId);
+      return trimRight(leftTrimmed, trimString, collationId);
+    }
+
+    private static UTF8String trimLeft(
+        final UTF8String srcString,
+        int collationId) {
+      return trimLeft(srcString, UTF8String.fromString(" "), collationId);
+    }
+
+    private static UTF8String trimLeft(
+        final UTF8String srcString,
+        final UTF8String trimString,
+        int collationId) {
+      // Matching UTF8String behavior for null `trimString`.
+      if (trimString == null) {
+        return null;
+      }
+
+      // The searching byte position in the srcString.
+      int searchIdx = 0;
+      // The byte position of a first non-matching character in the srcString.
+      int trimByteIdx = 0;
+      // Number of bytes in srcString.
+      int numBytes = srcString.numBytes();
+
+      // Create ICU StringSearch object.
+      StringSearch stringSearch = CollationFactory.getStringSearch(
+        trimString, UTF8String.EMPTY_UTF8, collationId);
+      // Create hash set to save seen chars
+      Set<UTF8String> seenChars = new HashSet<>();
+
+      while (searchIdx < numBytes) {
+        UTF8String searchChar = srcString.copyUTF8String(
+          searchIdx,
+          searchIdx + 
UTF8String.numBytesForFirstByte(srcString.getByte(searchIdx)) - 1);
+        int searchCharBytes = searchChar.numBytes();
+
+        // First check if we have already seen this char in srcString.
+        if (seenChars.contains(searchChar)) {
+          trimByteIdx += searchCharBytes;
+          searchIdx += searchCharBytes;
+          continue;
+        }
+
+        // Otherwise, try to find the matching for the searchChar in the 
trimString.
+        stringSearch.reset();
+        stringSearch.setPattern(searchChar.toString());
+        int searchCharIdx = stringSearch.next();
+
+        if (searchCharIdx != StringSearch.DONE
+            && stringSearch.getMatchLength() == 
stringSearch.getPattern().length()) {
+          trimByteIdx += searchCharBytes;
+          searchIdx += searchCharBytes;
+          seenChars.add(searchChar);
+        } else {
+          // No matching, exit the search.
+          break;
+        }
+      }
+
+      if (searchIdx == 0) {
+        // Nothing trimmed - return original string (not converted to 
lowercase).
+        return srcString;
+      }
+      if (trimByteIdx >= numBytes) {
+        // Everything trimmed.
+        return UTF8String.EMPTY_UTF8;
+      }
+      return srcString.copyUTF8String(trimByteIdx, numBytes - 1);
+    }
+
+    private static UTF8String trimRight(
+        final UTF8String srcString,
+        int collationId) {
+      return trimRight(srcString, UTF8String.fromString(" "), collationId);
+    }
+
+    private static UTF8String trimRight(
+        final UTF8String srcString,
+        final UTF8String trimString,
+        int collationId) {
+      // Matching UTF8String behavior for null `trimString`.
+      if (trimString == null) {
+        return null;
+      }
+
+      // Number of bytes iterated from the srcString.
+      int byteIdx = 0;
+      // Number of characters iterated from the srcString.
+      int numChars = 0;
+      // Number of bytes in srcString.
+      int numBytes = srcString.numBytes();
+      // Array of character length for the srcString.
+      int[] stringCharLen = new int[numBytes];
+      // Array of the first byte position for each character in the srcString.
+      int[] stringCharPos = new int[numBytes];
+
+      // Build the position and length array.
+      while (byteIdx < numBytes) {
+        stringCharPos[numChars] = byteIdx;
+        stringCharLen[numChars] = 
UTF8String.numBytesForFirstByte(srcString.getByte(byteIdx));
+        byteIdx += stringCharLen[numChars];
+        numChars++;
+      }
+
+      // Create ICU StringSearch object.
+      StringSearch stringSearch = CollationFactory.getStringSearch(
+        trimString, UTF8String.EMPTY_UTF8, collationId);
+      // Create hash set to save seen chars
+      Set<UTF8String> seenChars = new HashSet<>();
+
+      // Index trimEnd points to the first no matching byte position from the 
right side of
+      //  the source string.
+      int trimByteIdx = numBytes - 1;
+
+      while (numChars > 0) {
+        UTF8String searchChar = srcString.copyUTF8String(
+          stringCharPos[numChars - 1],
+          stringCharPos[numChars - 1] + stringCharLen[numChars - 1] - 1);
+
+        // First check if we have already seen this char in srcString.
+        if (seenChars.contains(searchChar)) {
+          trimByteIdx -= stringCharLen[numChars - 1];
+          numChars--;
+          continue;
+        }
+
+        // Otherwise, try to find the matching for the searchChar in the 
trimString.
+        stringSearch.reset();
+        stringSearch.setPattern(searchChar.toString());
+        int searchCharIdx = stringSearch.next();
+
+        if (searchCharIdx != StringSearch.DONE
+            && stringSearch.getMatchLength() == 
stringSearch.getPattern().length()) {

Review Comment:
   Here's the problem: we shouldn't check `stringSearch.getMatchLength() ==
   stringSearch.getPattern().length()`, because it relies on a false assumption.
   It assumes that the match (which comes from the target) and the pattern always
   contain the same number of characters, and this is **not** true for "i̇" and "İ":
   the former is two code points (`i` followed by U+0307, combining dot above),
   while the latter is a single code point (U+0130).



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Re: [PR] [SPARK-47409][SQL] Add support for collation for StringTrim type of functions/expressions [spark]

Reply via email to