mkaravel commented on code in PR #46899: URL: https://github.com/apache/spark/pull/46899#discussion_r1634210360
##########
common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java:
##########
@@ -270,6 +279,129 @@ public byte[] getBytes() {
     }
   }
 
+  /**
+   * Utility methods and constants for UTF-8 string validation.
+   */
+
+  private static boolean isValidContinuationByte(byte b) {
+    return b >= (byte) 0x80 && b <= (byte) 0xBF;
+  }
+
+  private static boolean isValidSecondByte(byte b, byte firstByte) {
+    return switch (firstByte) {
+      case (byte) 0xE0 -> b >= (byte) 0xA0 && b <= (byte) 0xBF;
+      case (byte) 0xED -> b >= (byte) 0x80 && b <= (byte) 0x9F;
+      case (byte) 0xF0 -> b >= (byte) 0x90 && b <= (byte) 0xBF;
+      case (byte) 0xF4 -> b >= (byte) 0x80 && b <= (byte) 0x8F;
+      default -> isValidContinuationByte(b);
+    };
+  }
+
+  private static final byte[] UNICODE_REPLACEMENT_CHARACTER =
+    new byte[] { (byte) 0xEF, (byte) 0xBF, (byte) 0xBD };
+
+  private static void appendReplacementCharacter(ArrayList<Byte> bytes) {
+    for (byte b : UTF8String.UNICODE_REPLACEMENT_CHARACTER) bytes.add(b);
+  }
+
+  /**
+   * Returns a validated version of the current UTF-8 string by replacing invalid UTF-8 sequences
+   * with the Unicode replacement character (U+FFFD), as per the rules defined in the Unicode
+   * standard 15, Section 3.9, Paragraph D86, Table 3-7. This behaviour is consistent with the
+   * behaviour of `UnicodeString` in ICU4C.
+   *
+   * @return A new UTF8String that is a valid UTF8 string.
+   */
+  public UTF8String makeValid() {
+    ArrayList<Byte> bytes = new ArrayList<>();
+    int byteIndex = 0;
+    while (byteIndex < numBytes) {
+      // Read the first byte.
+      byte firstByte = getByte(byteIndex);
+      int expectedLen = bytesOfCodePointInUTF8[firstByte & 0xFF];
+      int codePointLen = Math.min(expectedLen, numBytes - byteIndex);
+      // 0B UTF-8 sequence (invalid first byte).
+      if (codePointLen == 0) {
+        appendReplacementCharacter(bytes);
+        ++byteIndex;
+        continue;
+      }
+      // 1B UTF-8 sequence (ASCII or invalid).
Review Comment:
As per your comment, this handles ASCII and invalid (truncated) characters at the end of the string. Maybe this should be reflected in the comment:
```java
// 1B UTF-8 sequence (ASCII or truncated).
```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org