lidavidm commented on code in PR #14395:
URL: https://github.com/apache/arrow/pull/14395#discussion_r1012792310


##########
cpp/src/arrow/util/string_builder.h:
##########
@@ -44,6 +45,11 @@ class ARROW_EXPORT StringStreamWrapper {
 
 }  // namespace detail
 
+template <typename T>
+std::ostream& operator<<(std::ostream& os, std::optional<T> const& opt) {

Review Comment:
   This will affect everyone who includes the header, which I don't think is 
desirable



##########
python/pyarrow/_compute.pyx:
##########
@@ -1168,6 +1168,46 @@ class SliceOptions(_SliceOptions):
         self._set_options(start, stop, step)
 
 
+cdef class _ListSliceOptions(FunctionOptions):
+    cpdef _set_options(self, start, stop=None, step=1, 
return_fixed_size_list=True):
+        # NB: Assigning optional[int64_t] on stack seems available once 
Cython>=3 available.
+        # due to cpp_locals directive:
+        # 
https://cython.readthedocs.io/en/latest/src/userguide/wrapping_CPlusPlus.html#cpp-locals-directive

Review Comment:
   Interesting, but since std::optional has a default/nullary constructor, 
shouldn't we be able to use it on the stack? (The docs talk about a feature 
_implemented via_ optional, but not necessarily optional itself)



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,198 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (!opts.stop.has_value()) {
+      // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in 
offsets
+      // For fixed size list, this would be the fixed size.
+      return Status::NotImplemented(
+          "Slicing to end not yet implemented, please set `stop` parameter.");
+    }
+    if (opts.start < 0 || opts.start >= opts.stop.value()) {
+      // TODO: support start == stop which should give empty lists
+      return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than 
`stop`(",
+                             opts.stop, ")");
+    }
+    if (opts.step != 1) {
+      // TODO: support step in slicing
+      return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got 
step=",
+          opts.step);
+    }
+
+    const ArraySpan& list_ = batch[0].array;
+    const Type* list_type = checked_cast<const Type*>(list_.type);
+    const auto value_type = list_type->value_type();
+
+    std::unique_ptr<ArrayBuilder> builder;
+
+    // construct array values
+    if (opts.return_fixed_size_list) {
+      RETURN_NOT_OK(MakeBuilder(
+          ctx->memory_pool(),
+          fixed_size_list(value_type,
+                          static_cast<int32_t>(opts.stop.value() - 
opts.start)),
+          &builder));
+      RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
+    } else {
+      if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
+        RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
+      } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
+        RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
+      }
+    }
+
+    // build output arrays and set result
+    ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+    out->value = result->data();
+    return Status::OK();
+  }
+
+  template <typename BuilderType>
+  static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts,
+                           ArrayBuilder& builder) {
+    if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+      RETURN_NOT_OK(BuildArrayFromFixedSizeListType<BuilderType>(batch, opts, 
builder));
+    } else {
+      RETURN_NOT_OK(BuildArrayFromListType<BuilderType>(batch, opts, builder));
+    }
+    return Status::OK();
+  }
+
+  template <typename BuilderType>
+  static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+                                                const ListSliceOptions& opts,
+                                                ArrayBuilder& builder) {
+    const auto list_size =
+        checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+    const ArraySpan& list_ = batch[0].array;
+    const ArraySpan& list_values = list_.child_data[0];

Review Comment:
   Are we accounting for either array's offset here?



##########
cpp/src/arrow/compute/kernels/scalar_nested.cc:
##########
@@ -87,6 +89,198 @@ Status GetListElementIndex(const ExecValue& value, T* out) {
   return Status::OK();
 }
 
+template <typename Type, typename IndexType>
+struct ListSlice {
+  using offset_type = typename Type::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const auto opts = OptionsWrapper<ListSliceOptions>::Get(ctx);
+
+    // Invariants
+    if (!opts.stop.has_value()) {
+      // TODO: Support slicing to arbitrary end
+      // For variable size list, this would be the largest difference in 
offsets
+      // For fixed size list, this would be the fixed size.
+      return Status::NotImplemented(
+          "Slicing to end not yet implemented, please set `stop` parameter.");
+    }
+    if (opts.start < 0 || opts.start >= opts.stop.value()) {
+      // TODO: support start == stop which should give empty lists
+      return Status::Invalid("`start`(", opts.start,
+                             ") should be greater than 0 and smaller than 
`stop`(",
+                             opts.stop, ")");
+    }
+    if (opts.step != 1) {
+      // TODO: support step in slicing
+      return Status::NotImplemented(
+          "Setting `step` to anything other than 1 is not supported; got 
step=",
+          opts.step);
+    }
+
+    const ArraySpan& list_ = batch[0].array;
+    const Type* list_type = checked_cast<const Type*>(list_.type);
+    const auto value_type = list_type->value_type();
+
+    std::unique_ptr<ArrayBuilder> builder;
+
+    // construct array values
+    if (opts.return_fixed_size_list) {
+      RETURN_NOT_OK(MakeBuilder(
+          ctx->memory_pool(),
+          fixed_size_list(value_type,
+                          static_cast<int32_t>(opts.stop.value() - 
opts.start)),
+          &builder));
+      RETURN_NOT_OK(BuildArray<FixedSizeListBuilder>(batch, opts, *builder));
+    } else {
+      if constexpr (std::is_same_v<Type, LargeListType>) {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), large_list(value_type), 
&builder));
+        RETURN_NOT_OK(BuildArray<LargeListBuilder>(batch, opts, *builder));
+      } else {
+        RETURN_NOT_OK(MakeBuilder(ctx->memory_pool(), list(value_type), 
&builder));
+        RETURN_NOT_OK(BuildArray<ListBuilder>(batch, opts, *builder));
+      }
+    }
+
+    // build output arrays and set result
+    ARROW_ASSIGN_OR_RAISE(auto result, builder->Finish());
+    out->value = result->data();
+    return Status::OK();
+  }
+
+  template <typename BuilderType>
+  static Status BuildArray(const ExecSpan& batch, const ListSliceOptions& opts,
+                           ArrayBuilder& builder) {
+    if constexpr (std::is_same_v<Type, FixedSizeListType>) {
+      RETURN_NOT_OK(BuildArrayFromFixedSizeListType<BuilderType>(batch, opts, 
builder));
+    } else {
+      RETURN_NOT_OK(BuildArrayFromListType<BuilderType>(batch, opts, builder));
+    }
+    return Status::OK();
+  }
+
+  template <typename BuilderType>
+  static Status BuildArrayFromFixedSizeListType(const ExecSpan& batch,
+                                                const ListSliceOptions& opts,
+                                                ArrayBuilder& builder) {
+    const auto list_size =
+        checked_cast<const FixedSizeListType&>(*batch[0].type()).list_size();
+    const ArraySpan& list_ = batch[0].array;
+    const ArraySpan& list_values = list_.child_data[0];
+    const offset_type n_offsets = list_values.length / list_size;
+
+    auto list_builder = checked_cast<BuilderType*>(&builder);
+    for (offset_type offset = 0; offset < n_offsets * list_size;
+         offset = offset + list_size) {
+      auto next_offset = offset + list_size;
+      if (list_.IsNull(offset / list_size)) {
+        RETURN_NOT_OK(list_builder->AppendNull());
+      } else {
+        RETURN_NOT_OK(SetValues<BuilderType>(list_builder, offset, 
next_offset, &opts,
+                                             &list_values));
+      }
+    }
+    return Status::OK();
+  }
+
+  template <typename BuilderType>
+  static Status BuildArrayFromListType(const ExecSpan& batch,
+                                       const ListSliceOptions& opts,
+                                       ArrayBuilder& builder) {
+    const ArraySpan& list_ = batch[0].array;
+    const offset_type* offsets = list_.GetValues<offset_type>(1);
+    const auto n_offsets = static_cast<offset_type>(
+        static_cast<size_t>(list_.GetBuffer(1)->size()) / sizeof(offset_type));

Review Comment:
   Same here, we don't seem to be accounting for offsets.



##########
cpp/src/arrow/compute/kernels/scalar_nested_test.cc:
##########
@@ -116,6 +117,97 @@ TEST(TestScalarNested, ListElementInvalid) {
               Raises(StatusCode::Invalid));
 }
 
+void CheckListSlice(std::shared_ptr<Array> input, std::shared_ptr<Array> 
expected,
+                    const ListSliceOptions& args) {
+  ASSERT_OK_AND_ASSIGN(auto result, CallFunction("list_slice", {input}, 
&args));
+  ASSERT_EQ(result, expected);
+}
+
+TEST(TestScalarNested, ListSliceVariableOutput) {
+  const auto value_types = {float32(), int32()};
+  for (auto value_type : value_types) {
+    /* Variable list size output required variable size list input. */
+    auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], 
null]")};
+    for (auto input : inputs) {
+      ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+                            /*return_fixed_size_list=*/false);
+      auto expected = ArrayFromJSON(list(value_type), "[[1, 2], [4, 5], [6], 
null]");
+      CheckListSlice(input, expected, args);
+
+      args.start = 1;
+      expected = ArrayFromJSON(list(value_type), "[[2], [5], [], null]");
+      CheckListSlice(input, expected, args);
+
+      args.start = 2;
+      args.stop = 4;
+      expected = ArrayFromJSON(list(value_type), "[[3], [], [], null]");
+      CheckListSlice(input, expected, args);
+    }
+  }
+
+  // Verify passing `return_fixed_size_list=false` with fixed size input
+  // returns variable size even if stop is beyond list_size
+  ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+                        /*return_fixed_size_list=*/false);
+  auto input = ArrayFromJSON(fixed_size_list(int32(), 1), "[[1]]");
+  auto expected = ArrayFromJSON(list(int32()), "[[1]]");
+  CheckListSlice(input, expected, args);
+}
+
+TEST(TestScalarNested, ListSliceFixedOutput) {
+  const auto value_types = {float32(), int32()};
+  for (auto value_type : value_types) {
+    auto inputs = {ArrayFromJSON(list(value_type), "[[1, 2, 3], [4, 5], [6], 
null]"),
+                   ArrayFromJSON(fixed_size_list(value_type, 3),
+                                 "[[1, 2, 3], [4, 5, null], [6, null, null], 
null]")};
+    for (auto input : inputs) {
+      ListSliceOptions args(/*start=*/0, /*stop=*/2, /*step=*/1,
+                            /*return_fixed_size_list=*/true);
+      auto expected = ArrayFromJSON(fixed_size_list(value_type, 2),
+                                    "[[1, 2], [4, 5], [6, null], null]");
+      CheckListSlice(input, expected, args);
+
+      args.start = 1;
+      expected =
+          ArrayFromJSON(fixed_size_list(value_type, 1), "[[2], [5], [null], 
null]");
+      CheckListSlice(input, expected, args);
+
+      args.start = 2;
+      args.stop = 4;
+      expected = ArrayFromJSON(fixed_size_list(value_type, 2),
+                               "[[3, null], [null, null], [null, null], 
null]");
+      CheckListSlice(input, expected, args);
+    }
+  }
+}
+
+TEST(TestScalarNested, ListSliceBadParameters) {
+  auto input = ArrayFromJSON(list(int32()), "[[1]]");
+
+  // negative start
+  SliceOptions args(-1, 1);
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      ::testing::HasSubstr(
+          "`start`(-1) should be greater than 0 and smaller than `stop`(1)"),
+      CallFunction("list_slice", {input}, &args));
+  // start greater than stop
+  args.start = 1;
+  args.stop = 0;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      ::testing::HasSubstr(
+          "`start`(1) should be greater than 0 and smaller than `stop`(0)"),
+      CallFunction("list_slice", {input}, &args));
+  // start same as stop
+  args.stop = args.start;
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid,
+      ::testing::HasSubstr(
+          "`start`(1) should be greater than 0 and smaller than `stop`(1)"),
+      CallFunction("list_slice", {input}, &args));

Review Comment:
   Should probably also add a test for step, if we don't expect to support it 
right now



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to