[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

GitBox Fri, 14 May 2021 08:52:25 -0700


pitrou commented on a change in pull request #10317:
URL: https://github.com/apache/arrow/pull/10317#discussion_r632624475




##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;

Review comment:
       Use `bool` instead.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));

Review comment:
       This can read out of bounds if there is a truncated UTF-8 character at the end of the input.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};

Review comment:
       This deserves to be an inline function in `util/utf8.h`, I think (perhaps it even already exists?).
   
   

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -266,6 +266,52 @@ void EnsureLookupTablesFilled() {}
 
 #endif  // ARROW_WITH_UTF8PROC
 
+template <typename Type>
+struct AsciiReverse : StringTransform<Type, AsciiReverse<Type>> {
+  using Base = StringTransform<Type, AsciiReverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    uint8_t utf8_char_found = 0;
+    for (offset_type i = 0; i < input_string_ncodeunits; i++) {
+      // if a utf8 char is found, report to utf8_char_found
+      utf8_char_found |= input[i] & 0x80;
+      output[input_string_ncodeunits - i - 1] = input[i];
+    }
+    //    todo: finalize if L278 check is required or not. If not required,
+    //    simply use the following
+    //    std::reverse_copy(input, input + input_string_ncodeunits, output);
+    *output_written = input_string_ncodeunits;
+    return utf8_char_found == 0;
+  }
+};
+
+/*
+ * UTF8 codeunit size can be determined by looking at the leading 4 bits of BYTE1
+ */
+const std::array<uint8_t, 16> UTF8_BYTE_SIZE_LUT{1, 1, 1, 1, 1, 1, 1, 1,
+                                                 0, 0, 0, 0, 2, 2, 3, 4};
+
+template <typename Type>
+struct Utf8Reverse : StringTransform<Type, Utf8Reverse<Type>> {
+  using Base = StringTransform<Type, Utf8Reverse<Type>>;
+  using offset_type = typename Base::offset_type;
+
+  bool Transform(const uint8_t* input, offset_type input_string_ncodeunits,
+                 uint8_t* output, offset_type* output_written) {
+    offset_type i = 0;
+    while (i < input_string_ncodeunits) {
+      uint8_t offset = UTF8_BYTE_SIZE_LUT[input[i] >> 4];  // right shift leading 4 bits
+      std::copy(input + i, input + (i + offset),
+                output + (input_string_ncodeunits - i - offset));
+      i += offset;

Review comment:
       There's a problem: `offset` can be 0 in the lookup table (entries 8-11, i.e. when `input[i]` is a continuation byte 0x80-0xBF), and then `i` never advances and you have an infinite loop.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,25 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
+  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));
+}
+
+TYPED_TEST(TestStringKernels, Utf8Reverse) {
+  this->CheckUnary("utf8_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("utf8_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+  this->CheckUnary("utf8_reverse", "[\"aAazZæÆ&\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"&ÆæZzaAa\", null, \"\", \"bbb\"]");
+  this->CheckUnary("utf8_reverse", "[\"ɑɽⱤæÆ&\", null, \"\", \"bbb\"]", this->type(),
+                   "[\"&ÆæⱤɽɑ\", null, \"\", \"bbb\"]");
+}

Review comment:
       Can you add an example of invalid UTF-8? We don't care that it produces useful results, but it should not crash or access memory out of bounds.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -91,6 +91,25 @@ TYPED_TEST(TestStringKernels, AsciiLower) {
                    "[\"aaazzæÆ&\", null, \"\", \"bbb\"]");
 }
 
+TYPED_TEST(TestStringKernels, AsciiReverse) {
+  this->CheckUnary("ascii_reverse", "[]", this->type(), "[]");
+  this->CheckUnary("ascii_reverse", R"(["abcd", null, "", "bbb"])", this->type(),
+                   R"(["dcba", null, "", "bbb"])");
+
+  Datum input = ArrayFromJSON(this->type(), "[\"aAazZæÆ&\", null, \"\", \"bbb\"]");
+  ASSERT_NOT_OK(CallFunction("ascii_reverse", {input}));

Review comment:
       Please test the error return more precisely, instead of only checking 
that it's not ok.




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
users@infra.apache.org

[GitHub] [arrow] pitrou commented on a change in pull request #10317: ARROW-12713 [C++] String reverse kernel

Reply via email to