edponce commented on a change in pull request #10317: URL: https://github.com/apache/arrow/pull/10317#discussion_r632630010
########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {} #endif // ARROW_WITH_UTF8PROC +template <typename Type> +struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> { + using Base = StringTransform<Type, AsciiReverse<Type>>; + using offset_type = typename Base::offset_type; + + bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, + uint8_t* output, offset_type* output_written) { + uint8_t utf8_char_found = 0; + for (offset_type i = 0; i < input_string_ncodeunits; i++) { + // if a utf8 char is found, report to utf8_char_found + utf8_char_found |= input[i] & 0x80; + output[input_string_ncodeunits - i - 1] = input[i]; + } + // todo: finalize if L278 check is required or not. If not required, + // simply use the following + // std::reverse_copy(input, input + input_string_ncodeunits, output); + *output_written = input_string_ncodeunits; + return utf8_char_found == 0; + } +}; + +/* + * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1 + */ +const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1, + 0, 0, 0, 0, 2, 2, 3, 4}; + +template <typename Type> +struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> { + using Base = StringTransform<Type, Utf8Reverse<Type>>; + using offset_type = typename Base::offset_type; + + bool Transform(const uint8_t* input, offset_type input_string_ncodeunits, + uint8_t* output, offset_type* output_written) { + offset_type i = 0; + while (i < input_string_ncodeunits) { + uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4]; // right shift leading 4 bits + std::copy(input + i, input + (i + offset), + output + (input_string_ncodeunits - i - offset)); Review comment: Good eye @pitrou. Possibly add a check to ensure `input + i + offset` does not go out of bounds before trying to copy. 
An alternative would be to add an inner loop to traverse the code units (bytes) of a UTF-8 character, but this may be less performant than using the LUT. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org