This is an automated email from the ASF dual-hosted git repository.

raulcd pushed a commit to branch maint-13.0.0
in repository https://gitbox.apache.org/repos/asf/arrow.git

commit 0f747311a4f7a461d36a429a668bd7917fb4fc03
Author: Ben Harkins <[email protected]>
AuthorDate: Tue Jul 11 02:28:43 2023 -0400

    GH-36311: [C++] Fix integer overflows in `utf8_slice_codeunits` (#36575)
    
    
    
    ### Rationale for this change
    
    The default value for the `SliceOptions::stop` is `INT64_MAX`, which isn't 
considered in several internal calculations - resulting in integer overflows 
and unexpected behavior when `stop` isn't provided.
    
    Also note that running the included tests without the fixes should result 
in ubsan errors (it did for me, at least).
    
    ### What changes are included in this PR?
    
    - Adds some logic to `SliceCodunitsTransform` that handles potential 
overflows
    - Adds tests for cases where the `start` param is positive/negative and 
`stop` is the maximum value
    
    **Update**
    Discovered that `utf8_slice_codeunits` deviates from Python array behavior 
when `stop=None` and `step < 0`, so further changes were made:
    - Handles `INT64_MIN` for `SliceOptions::stop` on C++ side, adds more tests.
    - Updates Python bindings for `SliceOptions` so that the default value when 
`stop=None` (`sys.maxsize`) is negated when `step < 0`
    - Adds `None` as a possible `stop` value in Python tests
    
    ### Are these changes tested?
    
    Yes (tests are included)
    
    ### Are there any user-facing changes?
    
    In theory, altering the behavior of `utf8_slice_codepoints` when 
`stop=None` and `step < 0` could be considered a breaking change. That being 
said, the current implementation produces incorrect results whenever `None` is 
even used, so it probably isn't one in practice...
    
    * Closes: #36311
    
    Authored-by: benibus <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 .../arrow/compute/kernels/scalar_string_test.cc    | 35 ++++++++++++++++++++++
 .../arrow/compute/kernels/scalar_string_utf8.cc    | 18 +++++++----
 python/pyarrow/_compute.pyx                        |  2 ++
 python/pyarrow/tests/test_compute.py               |  2 +-
 4 files changed, 51 insertions(+), 6 deletions(-)

diff --git a/cpp/src/arrow/compute/kernels/scalar_string_test.cc 
b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
index 4581e6377a..ff14f5e7a5 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_test.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_test.cc
@@ -2091,6 +2091,14 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsPosPos) {
   options_step_neg.stop = 0;
   this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", 
"𝑓öõḍ","𝑓öõḍš"])",
                    this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", 
&options_step_neg);
+
+  constexpr auto max = std::numeric_limits<int64_t>::max();
+  SliceOptions options_max_step{1, max, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "ö", "ö", "öḍ", "öḍ"])", 
&options_max_step);
+  SliceOptions options_max_step_neg{1, max, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "", "", ""])", 
&options_max_step_neg);
 }
 
 TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) {
@@ -2107,6 +2115,15 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsPosNeg) {
   this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", 
"𝑓öõḍ","𝑓öõḍš"])",
                    this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "ḍö"])",
                    &options_step_neg);
+
+  constexpr auto min = std::numeric_limits<int64_t>::min();
+  SliceOptions options_min_step{2, min, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "", "", ""])", 
&options_min_step);
+  SliceOptions options_min_step_neg{2, min, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "ö", "õ𝑓", "õ𝑓", "õ𝑓"])",
+                   &options_min_step_neg);
 }
 
 TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) {
@@ -2123,6 +2140,15 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsNegNeg) {
   this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
                    this->type(), R"(["", "𝑓", "ö", "õ𝑓", "ḍö", "šõ"])",
                    &options_step_neg);
+
+  constexpr auto min = std::numeric_limits<int64_t>::min();
+  SliceOptions options_min_step{-2, min, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "", "", ""])", 
&options_min_step);
+  SliceOptions options_min_step_neg{-2, min, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "𝑓", "ö", "õ𝑓", "ḍö"])",
+                   &options_min_step_neg);
 }
 
 TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) {
@@ -2138,6 +2164,15 @@ TYPED_TEST(TestStringKernels, SliceCodeunitsNegPos) {
   options_step_neg.stop = 0;
   this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
                    this->type(), R"(["", "", "ö", "õ", "ḍö", "šõ"])", 
&options_step_neg);
+
+  constexpr auto max = std::numeric_limits<int64_t>::max();
+  SliceOptions options_max_step{-3, max, 2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "𝑓", "𝑓", "𝑓õ", "öḍ", "õš"])",
+                   &options_max_step);
+  SliceOptions options_max_step_neg{-3, max, -2};
+  this->CheckUnary("utf8_slice_codeunits", R"(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", 
"𝑓öõḍš"])",
+                   this->type(), R"(["", "", "", "", "", ""])", 
&options_max_step_neg);
 }
 
 #endif  // ARROW_WITH_UTF8PROC
diff --git a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc 
b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
index fb197e13a6..cf8a697fea 100644
--- a/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
+++ b/cpp/src/arrow/compute/kernels/scalar_string_utf8.cc
@@ -1090,7 +1090,8 @@ struct SliceCodeunitsTransform : StringSliceTransformBase 
{
       // on the resulting slice lengths, so return a worst case estimate.
       return input_ncodeunits;
     }
-    int64_t max_slice_codepoints = (opt.stop - opt.start + opt.step - 1) / 
opt.step;
+    int64_t stop = std::clamp(opt.stop, -input_ncodeunits, input_ncodeunits);
+    int64_t max_slice_codepoints = (stop - opt.start + opt.step - 1) / 
opt.step;
     // The maximum UTF8 byte size of a codepoint is 4
     return std::min(input_ncodeunits,
                     4 * ninputs * std::max<int64_t>(0, max_slice_codepoints));
@@ -1133,7 +1134,7 @@ struct SliceCodeunitsTransform : StringSliceTransformBase 
{
       } else if (opt.stop < 0) {
         // or from the end (but we will never need to < begin_sliced)
         RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
-            begin_sliced, end, &end_sliced, -opt.stop));
+            begin_sliced, end, &end_sliced, Negate(opt.stop)));
       } else {
         // zero length slice
         return 0;
@@ -1158,7 +1159,7 @@ struct SliceCodeunitsTransform : StringSliceTransformBase 
{
         // or begin_sliced), but begin_sliced and opt.start can be 'out of 
sync',
         // for instance when start=-100, when the string length is only 10.
         RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
-            begin_sliced, end, &end_sliced, -opt.stop));
+            begin_sliced, end, &end_sliced, Negate(opt.stop)));
       } else {
         // zero length slice
         return 0;
@@ -1214,11 +1215,12 @@ struct SliceCodeunitsTransform : 
StringSliceTransformBase {
 
     // similar to opt.start
     if (opt.stop >= 0) {
+      int64_t length = std::min(opt.stop, std::numeric_limits<int64_t>::max() 
- 1) + 1;
       RETURN_IF_UTF8_ERROR(
-          arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, opt.stop 
+ 1));
+          arrow::util::UTF8AdvanceCodepoints(begin, end, &end_sliced, length));
     } else {
       RETURN_IF_UTF8_ERROR(arrow::util::UTF8AdvanceCodepointsReverse(
-          begin, end, &end_sliced, -opt.stop - 1));
+          begin, end, &end_sliced, Negate(opt.stop) - 1));
     }
     end_sliced--;
 
@@ -1240,6 +1242,12 @@ struct SliceCodeunitsTransform : 
StringSliceTransformBase {
   }
 
 #undef RETURN_IF_UTF8_ERROR
+
+ private:
+  static int64_t Negate(int64_t v) {
+    constexpr auto max = std::numeric_limits<int64_t>::max();
+    return -max > v ? max : -v;
+  }
 };
 
 template <typename Type>
diff --git a/python/pyarrow/_compute.pyx b/python/pyarrow/_compute.pyx
index ab63a7a19f..ac7efeff41 100644
--- a/python/pyarrow/_compute.pyx
+++ b/python/pyarrow/_compute.pyx
@@ -1201,6 +1201,8 @@ class SliceOptions(_SliceOptions):
     def __init__(self, start, stop=None, step=1):
         if stop is None:
             stop = sys.maxsize
+            if step < 0:
+                stop = -stop
         self._set_options(start, stop, step)
 
 
diff --git a/python/pyarrow/tests/test_compute.py 
b/python/pyarrow/tests/test_compute.py
index 865fecc7b2..e47e5d3f3e 100644
--- a/python/pyarrow/tests/test_compute.py
+++ b/python/pyarrow/tests/test_compute.py
@@ -537,7 +537,7 @@ def test_trim():
 def test_slice_compatibility():
     arr = pa.array(["", "𝑓", "𝑓ö", "𝑓öõ", "𝑓öõḍ", "𝑓öõḍš"])
     for start in range(-6, 6):
-        for stop in range(-6, 6):
+        for stop in itertools.chain(range(-6, 6), [None]):
             for step in [-3, -2, -1, 1, 2, 3]:
                 expected = pa.array([k.as_py()[start:stop:step]
                                      for k in arr])

Reply via email to