lidavidm commented on code in PR #14395:
URL: https://github.com/apache/arrow/pull/14395#discussion_r1015933712


##########
python/pyarrow/_compute.pyx:
##########
@@ -1168,6 +1168,45 @@ class SliceOptions(_SliceOptions):
         self._set_options(start, stop, step)
 
 
+cdef class _ListSliceOptions(FunctionOptions):
+    cpdef _set_options(self, start, stop=None, step=1, 
return_fixed_size_list=None):
+        cdef:
+            CListSliceOptions* opts
+        opts = new CListSliceOptions(
+            start,
+            <optional[int64_t]>nullopt if stop is None \
+                else <optional[int64_t]>(<int64_t>stop),
+            step,
+            <optional[c_bool]>nullopt if return_fixed_size_list is None \
+                else <optional[c_bool]>(<c_bool>return_fixed_size_list)
+        )
+        self.wrapped.reset(opts)
+
+
+class ListSliceOptions(_ListSliceOptions):
+    """
+    Options for list array slicing.
+
+    Parameters
+    ----------
+    start : int
+        Index to start slicing inner list elements (inclusive).
+    stop : Optional[int], default None
+        If given, index to stop slicing at (exclusive).
+        If not given, slicing will stop at the end. (NotImplemented)
+    step : int, default 1
+        Slice step.
+    return_fixed_size_list : bool, default None

Review Comment:
   nit: type as `Optional[bool]`?



##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -116,6 +117,125 @@ TEST(TestScalarNested, ListElementInvalid) {
               Raises(StatusCode::Invalid));
 }
 
+TEST(TestScalarNested, ListSliceVariableOutput) {
+  const auto value_types = {float32(), int32()};
+  for (auto value_type : value_types) {
+    /* Variable list size output required variable size list input. */
+    auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], 
null]")};
+    for (auto input : inputs) {

Review Comment:
   nit: why use a loop for only one input?



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,198 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (!opts.stop.has_value()) {
+      // TODO: Support slicing to arbitrary end

Review Comment:
   Do you plan to tackle the TODOs in this PR, or have you filed follow up 
tickets? (If the latter, can they be referenced inline like `TODO(ARROW-NNNNN): 
...`?)



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,198 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (!opts.stop.has_value()) {
+      // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in 
offsets
+      // For fixed size list, this would be the fixed size.
+      return Status::NotImplemented(
+          "Slicing to end not yet implemented, please set `stop` parameter.");
+    }
+    if (opts.start < 0 || opts.start >= opts.stop.value()) {
+      // TODO: support start == stop which should give empty lists
+      return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than 
`stop`(",
+                             opts.stop, ")");
+    }
+    if (opts.step != 1) {
+      // TODO: support step in slicing
+      return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got 
step=",
+          opts.step);
+    }
+
+    const ArraySpan& list_ = batch[0].array;
+    const Type* list_type = checked_cast<const Type*>(list_.type);
+    const auto field_name = list_type->field(0)->name();
+    const auto value_type = list_type->field(0)->WithName(field_name);
+    const auto return_fixed_size_list = opts.return_fixed_size_list.value_or(
+        list_type->id() == arrow::Type::FIXED_SIZE_LIST);
+    std::unique_ptr<ArrayBuilder> builder;
+
+    // construct array values
+    if (return_fixed_size_list) {
+      RETURN_NOT_OK(MakeBuilder(
+          ctx->memory_pool(),
+          fixed_size_list(value_type,
+                          static_cast<int32_t>(opts.stop.value() - 
opts.start)),
+          &builder));
+      RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
+    } else {
+      if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
+        RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
+      } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
+        RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
+      }
+    }
+
+    // build output arrays and set result
+    ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+    out->value = std::move(result->data());
+    return Status::OK();
+  }
+
+  template <typename BuilderType>
+  static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts,
+                           ArrayBuilder& builder) {
+    if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+      RETURN_NOT_OK(BuildArrayFromFixedSizeListType<BuilderType>(batch, opts, 
builder));
+    } else {
+      RETURN_NOT_OK(BuildArrayFromListType<BuilderType>(batch, opts, builder));
+    }
+    return Status::OK();
+  }
+
+  template <typename BuilderType>
+  static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+                                                const ListSliceOptions& opts,
+                                                ArrayBuilder& builder) {
+    const auto list_size =
+        checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+    const ArraySpan& list_ = batch[0].array;

Review Comment:
   nit: it's a little confusing to suffix names with underscores, generally 
that convention is reserved for member variables. (Plus I don't think `list` is 
a keyword or builtin like it is in Python.)



##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -116,6 +117,125 @@ TEST(TestScalarNested, ListElementInvalid) {
               Raises(StatusCode::Invalid));
 }
 
+TEST(TestScalarNested, ListSliceVariableOutput) {
+  const auto value_types = {float32(), int32()};
+  for (auto value_type : value_types) {
+    /* Variable list size output required variable size list input. */
+    auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], 
null]")};
+    for (auto input : inputs) {
+      ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+                            /*return_fixed_size_list=*/false);
+      auto expected = ArrayFromJSON(list(value_type), "[[1, 2], [4, 5], [6], 
null]");
+      CheckScalarUnary("list_slice", input, expected, &args);
+
+      args.start = 1;
+      expected = ArrayFromJSON(list(value_type), "[[2], [5], [], null]");
+      CheckScalarUnary("list_slice", input, expected, &args);
+
+      args.start = 2;
+      args.stop = 4;
+      expected = ArrayFromJSON(list(value_type), "[[3], [], [], null]");
+      CheckScalarUnary("list_slice", input, expected, &args);
+    }
+  }
+
+  // Verify passing `return_fixed_size_list=false` with fixed size input
+  // returns variable size even if stop is beyond list_size
+  ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+                        /*return_fixed_size_list=*/false);
+  auto input = ArrayFromJSON(fixed_size_list(int32(), 1), "[[1]]");
+  auto expected = ArrayFromJSON(list(int32()), "[[1]]");
+  CheckScalarUnary("list_slice", input, expected, &args);
+}
+
+TEST(TestScalarNested, ListSliceFixedOutput) {
+  const auto value_types = {float32(), int32()};
+  for (auto value_type : value_types) {
+    auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], 
null]"),
+                   ArrayFromJSON(fixed_size_list(value_type, 3),
+                                 "[[1, 2, 3], [4, 5, null], [6, null, null], 
null]")};
+    for (auto input : inputs) {
+      ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+                            /*return_fixed_size_list=*/true);
+      auto expected = ArrayFromJSON(fixed_size_list(value_type, 2),
+                                    "[[1, 2], [4, 5], [6, null], null]");
+      CheckScalarUnary("list_slice", input, expected, &args);
+
+      args.start = 1;
+      expected =
+          ArrayFromJSON(fixed_size_list(value_type, 1), "[[2], [5], [null], 
null]");
+      CheckScalarUnary("list_slice", input, expected, &args);
+
+      args.start = 2;
+      args.stop = 4;
+      expected = ArrayFromJSON(fixed_size_list(value_type, 2),
+                               "[[3, null], [null, null], [null, null], 
null]");
+      CheckScalarUnary("list_slice", input, expected, &args);
+    }
+  }
+}
+
+TEST(TestScalarNested, ListSliceOutputEqualsInputType) {
+  // Default is to return same type as the one passed in.
+  auto inputs = {
+      ArrayFromJSON(list(int8()), "[[1, 2, 3], [4, 5], [6, null], null]"),
+      ArrayFromJSON(large_list(int8()), "[[1, 2, 3], [4, 5], [6, null], 
null]"),
+      ArrayFromJSON(fixed_size_list(int8(), 2), "[[1, 2], [4, 5], [6, null], 
null]")};
+  for (auto input : inputs) {
+    ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1);
+    auto expected = ArrayFromJSON(input->type(), "[[1, 2], [4, 5], [6, null], 
null]");
+    CheckScalarUnary("list_slice", input, expected, &args);
+  }
+}
+
+TEST(TestScalarNested, ListSliceBadParameters) {
+  auto input = ArrayFromJSON(list(int32()), "[[1]]");
+
+  // negative start
+  ListSliceOptions args(/*start=*/-1, /*stop=*/1, /*step=*/1,
+                        /*return_fixed_size_list=*/true);
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      ::testing::HasSubstr(
+          "`start`(-1) should be greater than 0 and smaller than `stop`(1)"),
+      CallFunction("list_slice", {input}, &args));
+  // start greater than stop
+  args.start = 1;
+  args.stop = 0;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      ::testing::HasSubstr(
+          "`start`(1) should be greater than 0 and smaller than `stop`(0)"),
+      CallFunction("list_slice", {input}, &args));
+  // start same as stop
+  args.stop = args.start;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      ::testing::HasSubstr(
+          "`start`(1) should be greater than 0 and smaller than `stop`(1)"),
+      CallFunction("list_slice", {input}, &args));
+  // stop not set and FixedSizeList requested
+  args.stop = std::nullopt;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      NotImplemented,
+      ::testing::HasSubstr("NotImplemented: Unable to produce 
FixedSizeListArray without "
+                           "`stop` being set."),
+      CallFunction("list_slice", {input}, &args));
+  // stop not set and ListArray requested
+  args.stop = std::nullopt;
+  args.return_fixed_size_list = false;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      NotImplemented, ::testing::HasSubstr("Slicing to end not yet 
implemented"),
+      CallFunction("list_slice", {input}, &args));
+  // step other than `1` not implmented
+  args.stop = 2;
+  args.step = 2;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      NotImplemented,
+      ::testing::HasSubstr("Setting `step` to anything other than 1 is not 
supported"),
+      CallFunction("list_slice", {input}, &args));
+}
+

Review Comment:
   I'd still like to see what happens when the child array has its own offset.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to