zanmato1984 commented on code in PR #46815:
URL: https://github.com/apache/arrow/pull/46815#discussion_r2161061235


##########
cpp/src/arrow/compute/kernels/scalar_string_utf8.cc:
##########
@@ -987,13 +987,77 @@ const FunctionDoc utf8_rpad_doc(
      "the given UTF8 codeunit.\nNull values emit null."),
     {"strings"}, "PadOptions", /*options_required=*/true);
 
+struct Utf8ZFillTransform : public StringTransformBase {
+  using State = OptionsWrapper<PadOptions>;
+
+  const PadOptions& options_;
+
+  explicit Utf8ZFillTransform(const PadOptions& options) : options_(options) {}
+
+  Status PreExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) 
override {
+    if (!options_.padding.empty() && options_.padding != " ") {
+      auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
+      auto strlen = options_.padding.size();
+      if (util::UTF8Length(str, str + strlen) != 1) {
+        return Status::Invalid("Padding must be one codepoint, got '", 
options_.padding,
+                               "'");
+      }
+    } else if (options_.padding.empty()) {
+      return Status::Invalid("Padding must be one codepoint, got ''");
+    }
+    return Status::OK();
+  }
+  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+    return input_ncodeunits + 4 * ninputs * options_.width;
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    const int64_t input_width = util::UTF8Length(input, input + 
input_string_ncodeunits);
+    if (input_width >= options_.width) {
+      std::copy(input, input + input_string_ncodeunits, output);
+      return input_string_ncodeunits;
+    }
+    const int64_t spaces = options_.width - input_width;
+    uint8_t* start = output;
+    // sign-aware padding:
+    if (input_string_ncodeunits > 0 && (input[0] == '+' || input[0] == '-')) {
+      *output++ = input[0];
+      input++;
+      input_string_ncodeunits--;
+    }
+    int64_t num_zeros = spaces;
+    std::string padding = options_.padding;
+    if (padding == " ") {
+      padding = "0";
+    }
+    while (num_zeros > 0) {
+      output = std::copy(padding.begin(), padding.end(), output);
+      num_zeros--;
+    }
+    output = std::copy(input, input + input_string_ncodeunits, output);
+    return output - start;
+  }
+};
+
+template <typename Type>
+using Utf8ZFill = StringTransformExecWithState<Type, Utf8ZFillTransform>;
+
+const FunctionDoc utf8_zfill_doc(
+    "Left-pad strings to a given width, preserving leading sign characters",
+    ("For each string in `strings`, emit a string of length `width` by \n"
+     "prepending the given padding character (defaults to '0' if not 
specified). \n"
+     "If the string starts with '+' or '-', the sign is preserved and padding 
\n"
+     "occurs after the sign. Null values emit null."),
+    {"strings"}, "PadOptions", /*options_required=*/true);
+
 void AddUtf8StringPad(FunctionRegistry* registry) {
   MakeUnaryStringBatchKernelWithState<Utf8LPad>("utf8_lpad", registry, 
utf8_lpad_doc);
   MakeUnaryStringBatchKernelWithState<Utf8RPad>("utf8_rpad", registry, 
utf8_rpad_doc);
   MakeUnaryStringBatchKernelWithState<Utf8Center>("utf8_center", registry,
                                                   utf8_center_doc);
+  MakeUnaryStringBatchKernelWithState<Utf8ZFill>("utf8_zfill", registry, 
utf8_zfill_doc);
 }
-

Review Comment:
   Why is this blank line being removed?



##########
cpp/src/arrow/compute/kernels/scalar_string_utf8.cc:
##########
@@ -987,13 +987,77 @@ const FunctionDoc utf8_rpad_doc(
      "the given UTF8 codeunit.\nNull values emit null."),
     {"strings"}, "PadOptions", /*options_required=*/true);
 
+struct Utf8ZFillTransform : public StringTransformBase {
+  using State = OptionsWrapper<PadOptions>;
+
+  const PadOptions& options_;
+
+  explicit Utf8ZFillTransform(const PadOptions& options) : options_(options) {}
+
+  Status PreExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) 
override {
+    if (!options_.padding.empty() && options_.padding != " ") {
+      auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
+      auto strlen = options_.padding.size();
+      if (util::UTF8Length(str, str + strlen) != 1) {
+        return Status::Invalid("Padding must be one codepoint, got '", 
options_.padding,
+                               "'");
+      }
+    } else if (options_.padding.empty()) {
+      return Status::Invalid("Padding must be one codepoint, got ''");
+    }
+    return Status::OK();
+  }
+  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {

Review Comment:
   ```suggestion
   
     int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to