[GitHub] [arrow] edponce commented on a change in pull request #11298: ARROW-14205: [C++] Add unicode normalization to scalar string

GitBox Mon, 01 Nov 2021 22:39:11 -0700


edponce commented on a change in pull request #11298:
URL: https://github.com/apache/arrow/pull/11298#discussion_r740712348




##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -395,6 +399,17 @@ struct ARROW_EXPORT WeekOptions : public FunctionOptions {
   bool first_week_is_fully_in_year;
 };
 
+struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions {

Review comment:
       I think you should guard this option and kernel with `#ifdef 
ARROW_WITH_UTF8PROC` as these should not be used without such support. 
Everything related to this compute function should be guarded.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc
##########
@@ -933,6 +933,48 @@ TYPED_TEST(TestStringKernels, Utf8Reverse) {
   
ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1]));
 }
 
+TYPED_TEST(TestStringKernels, Utf8Normalize) {
+  Utf8NormalizeOptions nfc_options{Utf8NormalizeOptions::Method::NFC};
+  Utf8NormalizeOptions nfkc_options{Utf8NormalizeOptions::Method::NFKC};
+  Utf8NormalizeOptions nfd_options{Utf8NormalizeOptions::Method::NFD};
+  Utf8NormalizeOptions nfkd_options{Utf8NormalizeOptions::Method::NFKD};
+
+  this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfc_options);
+  this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfkc_options);
+  this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfd_options);
+  this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfkd_options);
+  this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, 
""])",
+                   &nfc_options);
+  this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, 
""])",
+                   &nfkc_options);
+  this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, 
""])",
+                   &nfd_options);
+  this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, 
""])",
+                   &nfkd_options);

Review comment:
       No need to check these case (empty, null).

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public 
FunctionalCaseMappingTransform {
 template <typename Type>
 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
 
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+  using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+  const Utf8NormalizeOptions* options;
+
+  explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+      : options{&options} {}
+
+  int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+                       int64_t input_ncodeunits) override {
+    const auto option = GenerateUtf8NormalizeOption(options->method);

Review comment:
       Since this is a constant value that only depends on the given options, 
we can move the `GenerateUtf8NormalizeOption` to the `Utf8NormalizeOptions` and 
call it in the constructor, and add a `utf8proc_option_t utf8proc_form` data 
member to hold this value. Then, this value can be accessed in the kernel as 
`options_.utf8proc_form`.

##########
File path: cpp/src/arrow/compute/api_scalar.cc
##########
@@ -173,6 +173,30 @@ struct EnumTraits<compute::RoundMode>
     return "<INVALID>";
   }
 };
+
+template <>
+struct EnumTraits<compute::Utf8NormalizeOptions::Method>
+    : BasicEnumTraits<compute::Utf8NormalizeOptions::Method,
+                      compute::Utf8NormalizeOptions::Method::NFC,
+                      compute::Utf8NormalizeOptions::Method::NFKC,
+                      compute::Utf8NormalizeOptions::Method::NFD,
+                      compute::Utf8NormalizeOptions::Method::NFKD> {
+  static std::string name() { return "compute::Utf8NormalizeOptions::Method"; }

Review comment:
       In `name()`, you can remove the `compute::` part from the string-based 
namespace bc the enum is inside the `Utf8NormalizeOptions` class.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public 
FunctionalCaseMappingTransform {
 template <typename Type>
 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
 
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+  using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+  const Utf8NormalizeOptions* options;
+
+  explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+      : options{&options} {}
+
+  int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+                       int64_t input_ncodeunits) override {
+    const auto option = GenerateUtf8NormalizeOption(options->method);
+    const auto n_chars =
+        utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, 
NULL, NULL);
+
+    // convert to byte length
+    return n_chars * 4;
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output, int64_t output_string_ncodeunits) {
+    const auto option = GenerateUtf8NormalizeOption(options->method);
+    const auto n_chars = utf8proc_decompose_custom(
+        input, input_string_ncodeunits, 
reinterpret_cast<utf8proc_int32_t*>(output),
+        output_string_ncodeunits, option, NULL, NULL);
+    if (n_chars < 0) return output_string_ncodeunits;
+
+    const auto n_bytes =
+        utf8proc_reencode(reinterpret_cast<utf8proc_int32_t*>(output), 
n_chars, option);
+    return n_bytes;
+  }

Review comment:
       Several comments:
   1. Why not use [`utf8proc_NFC()` and 
friends](https://juliastrings.github.io/utf8proc/doc/utf8proc_8h.html#a11717c1d4b4725be52102a1cc5cd1d3c)?
 These expect a null-terminated string so these would require copying the input 
string and null-terminating it. If this approach is used, then 
`Utf8NormalizeOptions` do not need to be resolved in constructor but here in 
the kernel.
   2. 
[`utf8proc_reencode()`](https://juliastrings.github.io/utf8proc/doc/utf8proc_8h.html#a5e7debdee382d00b237d75f0932a3e73)
 has a warning message regarding its size and IIUC it creates a 
null-terminating string which you are writing into the output. AFAIK, Arrow 
stores strings contiguously without null-terminations.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public 
FunctionalCaseMappingTransform {
 template <typename Type>
 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
 
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+  using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+  const Utf8NormalizeOptions* options;
+
+  explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+      : options{&options} {}
+
+  int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+                       int64_t input_ncodeunits) override {
+    const auto option = GenerateUtf8NormalizeOption(options->method);
+    const auto n_chars =
+        utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, 
NULL, NULL);
+
+    // convert to byte length
+    return n_chars * 4;
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output, int64_t output_string_ncodeunits) {
+    const auto option = GenerateUtf8NormalizeOption(options->method);
+    const auto n_chars = utf8proc_decompose_custom(

Review comment:
       use `utf8proc_decompose()` instead

##########
File path: cpp/src/arrow/compute/api_scalar.h
##########
@@ -395,6 +399,17 @@ struct ARROW_EXPORT WeekOptions : public FunctionOptions {
   bool first_week_is_fully_in_year;
 };
 
+struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions {
+ public:
+  enum Method { NFC, NFKC, NFD, NFKD };

Review comment:
       I think a more suitable name for the enum is `Form` as these are "normal 
forms" of the string.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public 
FunctionalCaseMappingTransform {
 template <typename Type>
 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
 
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+  using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+  const Utf8NormalizeOptions* options;
+
+  explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+      : options{&options} {}

Review comment:
       `... : options_(options) {}`

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public 
FunctionalCaseMappingTransform {
 template <typename Type>
 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
 
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+  using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+  const Utf8NormalizeOptions* options;
+
+  explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options)
+      : options{&options} {}
+
+  int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs,
+                       int64_t input_ncodeunits) override {
+    const auto option = GenerateUtf8NormalizeOption(options->method);
+    const auto n_chars =
+        utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, 
NULL, NULL);
+
+    // convert to byte length
+    return n_chars * 4;
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output, int64_t output_string_ncodeunits) {
+    const auto option = GenerateUtf8NormalizeOption(options->method);
+    const auto n_chars = utf8proc_decompose_custom(
+        input, input_string_ncodeunits, 
reinterpret_cast<utf8proc_int32_t*>(output),
+        output_string_ncodeunits, option, NULL, NULL);
+    if (n_chars < 0) return output_string_ncodeunits;
+
+    const auto n_bytes =
+        utf8proc_reencode(reinterpret_cast<utf8proc_int32_t*>(output), 
n_chars, option);
+    return n_bytes;
+  }
+
+ private:
+  utf8proc_option_t GenerateUtf8NormalizeOption(Utf8NormalizeOptions::Method 
method) {
+    switch (method) {
+      case Utf8NormalizeOptions::Method::NFC:
+        return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | 
UTF8PROC_COMPOSE);
+      case Utf8NormalizeOptions::Method::NFKC:
+        return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | 
UTF8PROC_COMPOSE |
+                                              UTF8PROC_COMPAT);
+      case Utf8NormalizeOptions::Method::NFD:
+        return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | 
UTF8PROC_DECOMPOSE);
+      case Utf8NormalizeOptions::Method::NFKD:
+        return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | 
UTF8PROC_DECOMPOSE |
+                                              UTF8PROC_COMPAT);
+      default:
+        return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | 
UTF8PROC_COMPOSE);

Review comment:
       use `default: break;` instead.

##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -672,11 +678,64 @@ struct Utf8TitleTransform : public 
FunctionalCaseMappingTransform {
 template <typename Type>
 using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>;
 
+struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform {
+  using State = OptionsWrapper<Utf8NormalizeOptions>;
+
+  const Utf8NormalizeOptions* options;

Review comment:
       `const Utf8NormalizeOptions& options_`




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] edponce commented on a change in pull request #11298: ARROW-14205: [C++] Add unicode normalization to scalar string

Reply via email to