pitrou commented on code in PR #46815:
URL: https://github.com/apache/arrow/pull/46815#discussion_r2164354597


##########
cpp/src/arrow/compute/kernels/scalar_string_utf8.cc:
##########
@@ -987,11 +987,74 @@ const FunctionDoc utf8_rpad_doc(
      "the given UTF8 codeunit.\nNull values emit null."),
     {"strings"}, "PadOptions", /*options_required=*/true);
 
+struct Utf8ZeroFillTransform : public StringTransformBase {
+  using State = OptionsWrapper<ZeroFillOptions>;
+
+  const ZeroFillOptions& options_;
+
+  explicit Utf8ZeroFillTransform(const ZeroFillOptions& options) : 
options_(options) {}
+
+  Status PreExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) 
override {
+    if (!options_.padding.empty()) {
+      auto str = reinterpret_cast<const uint8_t*>(options_.padding.data());
+      auto strlen = options_.padding.size();
+      if (util::UTF8Length(str, str + strlen) != 1) {
+        return Status::Invalid("Padding must be one codepoint, got '", 
options_.padding,
+                               "'");
+      }
+    } else {
+      return Status::Invalid("Padding must be one codepoint, got ''");
+    }
+    return Status::OK();
+  }
+
+  int64_t MaxCodeunits(int64_t ninputs, int64_t input_ncodeunits) override {
+    return input_ncodeunits + 4 * ninputs * options_.width;
+  }
+
+  int64_t Transform(const uint8_t* input, int64_t input_string_ncodeunits,
+                    uint8_t* output) {
+    const int64_t input_width = util::UTF8Length(input, input + 
input_string_ncodeunits);
+    if (input_width >= options_.width) {
+      std::copy(input, input + input_string_ncodeunits, output);
+      return input_string_ncodeunits;
+    }
+    const int64_t spaces = options_.width - input_width;
+    uint8_t* start = output;
+    // sign-aware padding:
+    if (input_string_ncodeunits > 0 && (input[0] == '+' || input[0] == '-')) {
+      *output++ = input[0];
+      input++;
+      input_string_ncodeunits--;
+    }
+    int64_t num_zeros = spaces;

Review Comment:
   Why not call `spaces` `num_zeros` from the start?



##########
python/pyarrow/compute.py:
##########
@@ -338,6 +339,8 @@ def _make_global_functions():
 
 
 _make_global_functions()
+# Alias for consistency
+utf8_zfill = utf8_zero_fill = globals()["utf8_zero_fill"]

Review Comment:
   This can be simplified to:
   ```suggestion
   utf8_zfill = utf8_zero_fill
   ```



##########
python/pyarrow/_compute.pyx:
##########
@@ -1142,6 +1142,36 @@ class PadOptions(_PadOptions):
         self._set_options(width, padding, lean_left_on_odd_padding)
 
 
+cdef class _ZeroFillOptions(FunctionOptions):
+    def _set_options(self, width, padding):
+        self.wrapped.reset(new CZeroFillOptions(width, tobytes(padding)))
+
+
+class ZeroFillOptions(_ZeroFillOptions):
+    """
+    Options for utf8_zero_fill.
+
+    Parameters
+    ----------
+    width : int
+        Desired string length.
+    padding : str, default "0"
+        Padding character. Should be one byte or Unicode codepoint.

Review Comment:
   There is no Ascii version of this function, so we can simplify the 
description just like in C++.
   ```suggestion
           Padding character. Should be one Unicode codepoint.
   ```



##########
python/pyarrow/tests/test_compute.py:
##########
@@ -1051,6 +1052,31 @@ def test_pad():
     assert pc.utf8_rpad(arr, 3).tolist() == [None, 'รก  ', 'abcd']
 
 
+def test_utf8_zfill():
+    assert pc.utf8_zfill is pc.utf8_zero_fill
+    arr = pa.array(['A', 'AB', 'ABC', None])

Review Comment:
   Instead of writing the test examples by hand, perhaps validate that 
`pc.utf8_zero_fill` has the same semantics as `str.zfill`?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to