edponce commented on a change in pull request #11298: URL: https://github.com/apache/arrow/pull/11298#discussion_r740712348
########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -395,6 +399,17 @@ struct ARROW_EXPORT WeekOptions : public FunctionOptions { bool first_week_is_fully_in_year; }; +struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions { Review comment: I think you should guard this option and kernel with `#ifdef ARROW_WITH_UTF8PROC` as these should not be used without such support. Everything related to this compute function should be guarded. ########## File path: cpp/src/arrow/compute/kernels/scalar_string_test.cc ########## @@ -933,6 +933,48 @@ TYPED_TEST(TestStringKernels, Utf8Reverse) { ASSERT_TRUE(res->array()->buffers[1]->Equals(*malformed_input->data()->buffers[1])); } +TYPED_TEST(TestStringKernels, Utf8Normalize) { + Utf8NormalizeOptions nfc_options{Utf8NormalizeOptions::Method::NFC}; + Utf8NormalizeOptions nfkc_options{Utf8NormalizeOptions::Method::NFKC}; + Utf8NormalizeOptions nfd_options{Utf8NormalizeOptions::Method::NFD}; + Utf8NormalizeOptions nfkd_options{Utf8NormalizeOptions::Method::NFKD}; + + this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfc_options); + this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfkc_options); + this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfd_options); + this->CheckUnary("utf8_normalize", "[]", this->type(), "[]", &nfkd_options); + this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, ""])", + &nfc_options); + this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, ""])", + &nfkc_options); + this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, ""])", + &nfd_options); + this->CheckUnary("utf8_normalize", R"([null, ""])", this->type(), R"([null, ""])", + &nfkd_options); Review comment: No need to check these case (empty, null). ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform { template <typename Type> using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>; +struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform { + using State = OptionsWrapper<Utf8NormalizeOptions>; + + const Utf8NormalizeOptions* options; + + explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options) + : options{&options} {} + + int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs, + int64_t input_ncodeunits) override { + const auto option = GenerateUtf8NormalizeOption(options->method); Review comment: Since this is a constant value that only depends on the given options, we can move the `GenerateUtf8NormalizeOption` to the `Utf8NormalizeOptions` and call it in the constructor, and add a `utf8proc_option_t utf8proc_form` data member to hold this value. Then, this value can be accessed in the kernel as `options_.utf8proc_form`. ########## File path: cpp/src/arrow/compute/api_scalar.cc ########## @@ -173,6 +173,30 @@ struct EnumTraits<compute::RoundMode> return "<INVALID>"; } }; + +template <> +struct EnumTraits<compute::Utf8NormalizeOptions::Method> + : BasicEnumTraits<compute::Utf8NormalizeOptions::Method, + compute::Utf8NormalizeOptions::Method::NFC, + compute::Utf8NormalizeOptions::Method::NFKC, + compute::Utf8NormalizeOptions::Method::NFD, + compute::Utf8NormalizeOptions::Method::NFKD> { + static std::string name() { return "compute::Utf8NormalizeOptions::Method"; } Review comment: In `name()`, you can remove the `compute::` part from the string-based namespace bc the enum is inside the `Utf8NormalizeOptions` class. ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform { template <typename Type> using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>; +struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform { + using State = OptionsWrapper<Utf8NormalizeOptions>; + + const Utf8NormalizeOptions* options; + + explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options) + : options{&options} {} + + int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs, + int64_t input_ncodeunits) override { + const auto option = GenerateUtf8NormalizeOption(options->method); + const auto n_chars = + utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, NULL, NULL); + + // convert to byte length + return n_chars * 4; + } + + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output, int64_t output_string_ncodeunits) { + const auto option = GenerateUtf8NormalizeOption(options->method); + const auto n_chars = utf8proc_decompose_custom( + input, input_string_ncodeunits, reinterpret_cast<utf8proc_int32_t*>(output), + output_string_ncodeunits, option, NULL, NULL); + if (n_chars < 0) return output_string_ncodeunits; + + const auto n_bytes = + utf8proc_reencode(reinterpret_cast<utf8proc_int32_t*>(output), n_chars, option); + return n_bytes; + } Review comment: Several comments: 1. Why not use [`utf8proc_NFC()` and friends](https://juliastrings.github.io/utf8proc/doc/utf8proc_8h.html#a11717c1d4b4725be52102a1cc5cd1d3c)? These expect a null-terminated string so these would require copying the input string and null-terminating it. If this approach is used, then `Utf8NormalizeOptions` do not need to be resolved in constructor but here in the kernel. 2. [`utf8proc_reencode()`](https://juliastrings.github.io/utf8proc/doc/utf8proc_8h.html#a5e7debdee382d00b237d75f0932a3e73) has a warning message regarding its size and IIUC it creates a null-terminating string which you are writing into the output. AFAIK, Arrow stores strings contiguously without null-terminations. ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform { template <typename Type> using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>; +struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform { + using State = OptionsWrapper<Utf8NormalizeOptions>; + + const Utf8NormalizeOptions* options; + + explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options) + : options{&options} {} + + int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs, + int64_t input_ncodeunits) override { + const auto option = GenerateUtf8NormalizeOption(options->method); + const auto n_chars = + utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, NULL, NULL); + + // convert to byte length + return n_chars * 4; + } + + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output, int64_t output_string_ncodeunits) { + const auto option = GenerateUtf8NormalizeOption(options->method); + const auto n_chars = utf8proc_decompose_custom( Review comment: use `utf8proc_decompose()` instead ########## File path: cpp/src/arrow/compute/api_scalar.h ########## @@ -395,6 +399,17 @@ struct ARROW_EXPORT WeekOptions : public FunctionOptions { bool first_week_is_fully_in_year; }; +struct ARROW_EXPORT Utf8NormalizeOptions : public FunctionOptions { + public: + enum Method { NFC, NFKC, NFD, NFKD }; Review comment: I think a more suitable name for the enum is `Form` as these are "normal forms" of the string. ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform { template <typename Type> using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>; +struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform { + using State = OptionsWrapper<Utf8NormalizeOptions>; + + const Utf8NormalizeOptions* options; + + explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options) + : options{&options} {} Review comment: `... : options_(options) {}` ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform { template <typename Type> using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>; +struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform { + using State = OptionsWrapper<Utf8NormalizeOptions>; + + const Utf8NormalizeOptions* options; + + explicit Utf8NormalizeTransform(const Utf8NormalizeOptions& options) + : options{&options} {} + + int64_t MaxCodeunits(const uint8_t* input, int64_t ninputs, + int64_t input_ncodeunits) override { + const auto option = GenerateUtf8NormalizeOption(options->method); + const auto n_chars = + utf8proc_decompose_custom(input, input_ncodeunits, NULL, 0, option, NULL, NULL); + + // convert to byte length + return n_chars * 4; + } + + int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits, + uint8_t* output, int64_t output_string_ncodeunits) { + const auto option = GenerateUtf8NormalizeOption(options->method); + const auto n_chars = utf8proc_decompose_custom( + input, input_string_ncodeunits, reinterpret_cast<utf8proc_int32_t*>(output), + output_string_ncodeunits, option, NULL, NULL); + if (n_chars < 0) return output_string_ncodeunits; + + const auto n_bytes = + utf8proc_reencode(reinterpret_cast<utf8proc_int32_t*>(output), n_chars, option); + return n_bytes; + } + + private: + utf8proc_option_t GenerateUtf8NormalizeOption(Utf8NormalizeOptions::Method method) { + switch (method) { + case Utf8NormalizeOptions::Method::NFC: + return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | UTF8PROC_COMPOSE); + case Utf8NormalizeOptions::Method::NFKC: + return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | UTF8PROC_COMPOSE | + UTF8PROC_COMPAT); + case Utf8NormalizeOptions::Method::NFD: + return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | UTF8PROC_DECOMPOSE); + case Utf8NormalizeOptions::Method::NFKD: + return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | UTF8PROC_DECOMPOSE | + UTF8PROC_COMPAT); + default: + return static_cast<utf8proc_option_t>(UTF8PROC_STABLE | UTF8PROC_COMPOSE); Review comment: use `default: break;` instead. ########## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ########## @@ -672,11 +678,64 @@ struct Utf8TitleTransform : public FunctionalCaseMappingTransform { template <typename Type> using Utf8Title = StringTransformExec<Type, Utf8TitleTransform>; +struct Utf8NormalizeTransform : public FunctionalCaseMappingTransform { + using State = OptionsWrapper<Utf8NormalizeOptions>; + + const Utf8NormalizeOptions* options; Review comment: `const Utf8NormalizeOptions& options_` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org