pitrou commented on code in PR #46815:
URL: https://github.com/apache/arrow/pull/46815#discussion_r2160896262
##########
cpp/src/arrow/compute/kernels/scalar_string_utf8.cc:
##########
@@ -987,13 +987,77 @@ const FunctionDoc utf8_rpad_doc(
"the given UTF8 codeunit.\nNull values emit null."),
{"strings"}, "PadOptions", /*options_required=*/true);
+struct Utf8ZFillTransform : public StringTransformBase {
+ using State = OptionsWrapper<PadOptions>;
+
+ const PadOptions& options_;
+
+ explicit Utf8ZFillTransform(const PadOptions& options) : options_(options) {}
+
+ Status PreExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out)
override {
+ if (!options_.padding.empty() && options_.padding != " ") {
+ auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
+ auto strlen = options_.padding.size();
+ if (util::UTF8Length(str, str + strlen) != 1) {
+ return Status::Invalid("Padding must be one codepoint, got '",
options_.padding,
+ "'");
+ }
+ } else if (options_.padding.empty()) {
+ return Status::Invalid("Padding must be one codepoint, got ''");
+ }
+ return Status::OK();
+ }
+ int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+ return input_ncodeunits + 4 * ninputs * options_.width;
+ }
+
+ int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+ uint8_t* output) {
+ const int64_t input_width = util::UTF8Length(input, input +
input_string_ncodeunits);
+ if (input_width >= options_.width) {
+ std::copy(input, input + input_string_ncodeunits, output);
+ return input_string_ncodeunits;
+ }
+ const int64_t spaces = options_.width - input_width;
+ uint8_t* start = output;
+ // sign-aware padding:
+ if (input_string_ncodeunits > 0 && (input[0] == '+' || input[0] == '-')) {
+ *output++ = input[0];
+ input++;
+ input_string_ncodeunits--;
+ }
+ int64_t num_zeros = spaces;
+ std::string padding = options_.padding;
+ if (padding == " ") {
+ padding = "0";
+ }
Review Comment:
As @AlenkaF pointed out, we can't do that because it prevents the user from
having a whitespace-filled zero padding.
If we really want "0" as a default, we should therefore use a separate
`ZeroFillOptions`.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]