js8544 commented on code in PR #37292:
URL: https://github.com/apache/arrow/pull/37292#discussion_r1301419907


##########
cpp/src/arrow/compute/kernels/scalar_cast_nested.cc:
##########
@@ -145,6 +153,135 @@ void AddListCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
 }
 
+template <typename DestType>
+struct CastFixedToVarList {
+  using dest_offset_type = typename DestType::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    auto child_type = checked_cast<const DestType&>(*out->type()).value_type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    ArrayData* out_array = out->array_data().get();
+    ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
+                          GetNullBitmapBuffer(in_array, ctx->memory_pool()));
+
+    const auto& in_type = checked_cast<const 
FixedSizeListType&>(*in_array.type);
+    const int32_t list_size = in_type.list_size();
+
+    // Allocate a new offsets buffer
+    ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
+                          ctx->Allocate(sizeof(dest_offset_type) * 
(batch.length + 1)));
+    auto* offsets = out_array->GetMutableValues<dest_offset_type>(1);
+    dest_offset_type offset = 0;
+    for (int64_t i = 0; i <= batch.length; ++i) {
+      offsets[i] = offset;
+      offset += list_size;
+    }
+
+    // Handle values
+    std::shared_ptr<ArrayData> values = in_array.child_data[0].ToArrayData();
+    if (in_array.offset > 0 || (in_array.length * list_size < batch.length)) {

Review Comment:
   ```suggestion
       if (in_array.offset > 0 || (in_array.length * list_size < 
values.length)) {
   ```
   Do you mean to compare with values length? I think `in_array.length` is 
always equal to `batch.length`.



##########
cpp/src/arrow/compute/kernels/scalar_cast_nested.cc:
##########
@@ -145,6 +153,135 @@ void AddListCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
 }
 
+template <typename DestType>
+struct CastFixedToVarList {
+  using dest_offset_type = typename DestType::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    auto child_type = checked_cast<const DestType&>(*out->type()).value_type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    ArrayData* out_array = out->array_data().get();
+    ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
+                          GetNullBitmapBuffer(in_array, ctx->memory_pool()));
+
+    const auto& in_type = checked_cast<const 
FixedSizeListType&>(*in_array.type);
+    const int32_t list_size = in_type.list_size();
+
+    // Allocate a new offsets buffer
+    ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
+                          ctx->Allocate(sizeof(dest_offset_type) * 
(batch.length + 1)));
+    auto* offsets = out_array->GetMutableValues<dest_offset_type>(1);
+    dest_offset_type offset = 0;
+    for (int64_t i = 0; i <= batch.length; ++i) {
+      offsets[i] = offset;
+      offset += list_size;
+    }
+
+    // Handle values
+    std::shared_ptr<ArrayData> values = in_array.child_data[0].ToArrayData();
+    if (in_array.offset > 0 || (in_array.length * list_size < batch.length)) {
+      values = values->Slice(in_array.offset * list_size, in_array.length * 
list_size);
+    }
+    ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                          Cast(values, child_type, options, 
ctx->exec_context()));
+    DCHECK(cast_values.is_array());
+    out_array->child_data.push_back(cast_values.array());
+
+    return Status::OK();
+  }
+};
+
+template <typename SrcType>
+struct CastVarToFixedList {
+  using src_offset_type = typename SrcType::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    auto child_type = checked_cast<const 
FixedSizeListType&>(*out->type()).value_type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    const auto& out_type = checked_cast<const 
FixedSizeListType&>(*out->type());
+    const int32_t list_size = out_type.list_size();
+
+    // Validate lengths by comparing to the expected offsets.
+    const auto* offsets = in_array.GetValues<src_offset_type>(1);
+    src_offset_type expected_offset = offsets[0];
+    if (in_array.GetNullCount() > 0) {
+      for (int64_t i = 0; i <= batch.length; ++i) {
+        if (in_array.IsNull(i)) {
+          expected_offset += offsets[i + 1] + list_size;
+        } else {
+          if (offsets[i] != expected_offset) {
+            return Status::Invalid("ListType can only be casted to 
FixedSizeListType ",
+                                   "if the lists are all the expected size.");
+          }
+          expected_offset += list_size;
+        }
+      }
+    } else {
+      // Don't need to check null slots if there are no nulls
+      for (int64_t i = 0; i <= batch.length; ++i) {
+        if (offsets[i] != expected_offset) {
+          return Status::Invalid("ListType can only be casted to 
FixedSizeListType ",
+                                 "if the lists are all the expected size.");
+        }
+        expected_offset += list_size;
+      }
+    }

Review Comment:
   ```suggestion
       src_offset_type expected_offset = offsets[0] + list_size;
       if (in_array.GetNullCount() > 0) {
         for (int64_t i = 0; i < batch.length; ++i) {
           if (in_array.IsNull(i)) {
             expected_offset = offsets[i + 1] + list_size;
           } else {
             if (offsets[i + 1] != expected_offset) {
               return Status::Invalid("ListType can only be casted to 
FixedSizeListType ",
                                      "if the lists are all the expected 
size.");
             }
             expected_offset += list_size;
           }
         }
       } else {
         // Don't need to check null slots if there are no nulls
         for (int64_t i = 0; i < batch.length; ++i) {
           if (offsets[i + 1] != expected_offset) {
             return Status::Invalid("ListType can only be casted to 
FixedSizeListType ",
                                    "if the lists are all the expected size.");
           }
           expected_offset += list_size;
         }
       }
   ```
   I think `expected_offset` should match the ending index of each slot, i.e. 
`offsets[i + 1]`, not the starting index, isn't it? Otherwise `null`s could 
mess things up.



##########
cpp/src/arrow/compute/kernels/scalar_cast_test.cc:
##########
@@ -2326,6 +2326,76 @@ TEST(Cast, FSLToFSLOptionsPassThru) {
   CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), 
"[[32689]]"), options);
 }
 
+void CheckCastList(const std::shared_ptr<DataType>& from_type,
+                   const std::shared_ptr<DataType>& to_type,
+                   const std::string& json_data) {
+  CheckCast(ArrayFromJSON(from_type, json_data), ArrayFromJSON(to_type, 
json_data));
+}
+
+TEST(Cast, FSLToList) {
+  CheckCastList(fixed_size_list(int16(), 2), list(int16()),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Large variant
+  CheckCastList(fixed_size_list(int16(), 2), large_list(int16()),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Different child types
+  CheckCastList(fixed_size_list(int16(), 2), list(int32()),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // No nulls
+  CheckCastList(fixed_size_list(int16(), 2), list(int32()), "[[0, 1], [2, 3], 
[4, 5]]");
+  // Nested lists
+  CheckCastList(fixed_size_list(list(int16()), 2), list(list(int32())),
+                "[[[0, 1], [2, 3]], [[4, 5], null]]");
+  // Sliced children (top-level slicing handled in CheckCast)
+  auto children_src = ArrayFromJSON(int32(), "[1, 2, null, 4, 5, null]");
+  children_src = children_src->Slice(2);
+  auto from =
+      std::make_shared<FixedSizeListArray>(fixed_size_list(int32(), 2), 2, 
children_src);
+  auto to = ArrayFromJSON(list(int32()), "[[null, 4], [5, null]]");
+  CheckCast(from, to);
+  // Options pass through
+  auto fsl_int32 = ArrayFromJSON(fixed_size_list(int32(), 1), "[[87654321]]");
+  auto options = CastOptions::Safe(list(int16()));
+  CheckCastFails(fsl_int32, options);
+  options.allow_int_overflow = true;
+  CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), 
"[[32689]]"), options);
+}
+
+TEST(Cast, ListToFSL) {
+  CheckCastList(list(int16()), fixed_size_list(int16(), 2),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Large variant
+  CheckCastList(large_list(int16()), fixed_size_list(int16(), 2),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Different child types
+  CheckCastList(list(int32()), fixed_size_list(int16(), 2),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // No nulls
+  CheckCastList(list(int32()), fixed_size_list(int16(), 2), "[[0, 1], [2, 3], 
[4, 5]]");
+  // Nested lists
+  CheckCastList(list(list(int32())), fixed_size_list(list(int16()), 2),
+                "[[[0, 1], [2, 3]], [[4, 5], null]]");
+  // Sliced children (top-level slicing handled in CheckCast)
+  auto children_src = ArrayFromJSON(int32(), "[1, 2, null, 4, 5, null]");
+  children_src = children_src->Slice(2);
+  ASSERT_OK_AND_ASSIGN(
+      auto from,
+      ListArray::FromArrays(*ArrayFromJSON(int32(), "[0, 2, 4]"), 
*children_src));
+  auto to = ArrayFromJSON(fixed_size_list(int32(), 2), "[[null, 4], [5, 
null]]");
+  CheckCast(from, to);
+  // Options pass through
+  auto fsl_int32 = ArrayFromJSON(fixed_size_list(int32(), 1), "[[87654321]]");
+  auto options = CastOptions::Safe(list(int16()));
+  CheckCastFails(fsl_int32, options);
+  options.allow_int_overflow = true;
+  CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), 
"[[32689]]"), options);
+
+  // Invalid fixed_size_list cast if inconsistent size
+  ASSERT_RAISES(Invalid, Cast(ArrayFromJSON(list(int32()), "[[0, 1, 2], null, 
[3, 4]]"),
+                              CastOptions::Safe(fixed_size_list(int32(), 3))))
+      << "Size of FixedList is not the same.";
+}

Review Comment:
   Should we also have tests for:
   1. Inputs with `null`s not at the end.
   2. Inputs with manually constructed offsets where `null` elements have 
non-zero value length. `ArrayFromJSON` always has `offset[i+1]==offset[i]` for 
null elements.



##########
cpp/src/arrow/compute/kernels/scalar_cast_test.cc:
##########
@@ -2326,6 +2326,76 @@ TEST(Cast, FSLToFSLOptionsPassThru) {
   CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), 
"[[32689]]"), options);
 }
 
+void CheckCastList(const std::shared_ptr<DataType>& from_type,
+                   const std::shared_ptr<DataType>& to_type,
+                   const std::string& json_data) {
+  CheckCast(ArrayFromJSON(from_type, json_data), ArrayFromJSON(to_type, 
json_data));
+}
+
+TEST(Cast, FSLToList) {
+  CheckCastList(fixed_size_list(int16(), 2), list(int16()),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Large variant
+  CheckCastList(fixed_size_list(int16(), 2), large_list(int16()),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Different child types
+  CheckCastList(fixed_size_list(int16(), 2), list(int32()),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // No nulls
+  CheckCastList(fixed_size_list(int16(), 2), list(int32()), "[[0, 1], [2, 3], 
[4, 5]]");
+  // Nested lists
+  CheckCastList(fixed_size_list(list(int16()), 2), list(list(int32())),
+                "[[[0, 1], [2, 3]], [[4, 5], null]]");
+  // Sliced children (top-level slicing handled in CheckCast)
+  auto children_src = ArrayFromJSON(int32(), "[1, 2, null, 4, 5, null]");
+  children_src = children_src->Slice(2);
+  auto from =
+      std::make_shared<FixedSizeListArray>(fixed_size_list(int32(), 2), 2, 
children_src);
+  auto to = ArrayFromJSON(list(int32()), "[[null, 4], [5, null]]");
+  CheckCast(from, to);
+  // Options pass through
+  auto fsl_int32 = ArrayFromJSON(fixed_size_list(int32(), 1), "[[87654321]]");
+  auto options = CastOptions::Safe(list(int16()));
+  CheckCastFails(fsl_int32, options);
+  options.allow_int_overflow = true;
+  CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), 
"[[32689]]"), options);
+}
+
+TEST(Cast, ListToFSL) {
+  CheckCastList(list(int16()), fixed_size_list(int16(), 2),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Large variant
+  CheckCastList(large_list(int16()), fixed_size_list(int16(), 2),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // Different child types
+  CheckCastList(list(int32()), fixed_size_list(int16(), 2),
+                "[[0, 1], [2, 3], [null, 5], null]");
+  // No nulls
+  CheckCastList(list(int32()), fixed_size_list(int16(), 2), "[[0, 1], [2, 3], 
[4, 5]]");
+  // Nested lists
+  CheckCastList(list(list(int32())), fixed_size_list(list(int16()), 2),
+                "[[[0, 1], [2, 3]], [[4, 5], null]]");
+  // Sliced children (top-level slicing handled in CheckCast)
+  auto children_src = ArrayFromJSON(int32(), "[1, 2, null, 4, 5, null]");
+  children_src = children_src->Slice(2);
+  ASSERT_OK_AND_ASSIGN(
+      auto from,
+      ListArray::FromArrays(*ArrayFromJSON(int32(), "[0, 2, 4]"), 
*children_src));
+  auto to = ArrayFromJSON(fixed_size_list(int32(), 2), "[[null, 4], [5, 
null]]");
+  CheckCast(from, to);
+  // Options pass through
+  auto fsl_int32 = ArrayFromJSON(fixed_size_list(int32(), 1), "[[87654321]]");
+  auto options = CastOptions::Safe(list(int16()));
+  CheckCastFails(fsl_int32, options);
+  options.allow_int_overflow = true;
+  CheckCast(fsl_int32, ArrayFromJSON(fixed_size_list(int16(), 1), 
"[[32689]]"), options);
+
+  // Invalid fixed_size_list cast if inconsistent size
+  ASSERT_RAISES(Invalid, Cast(ArrayFromJSON(list(int32()), "[[0, 1, 2], null, 
[3, 4]]"),
+                              CastOptions::Safe(fixed_size_list(int32(), 3))))
+      << "Size of FixedList is not the same.";

Review Comment:
   This error message seems to be from the FSL->FSL cast kernel, not the one in 
this PR?



##########
cpp/src/arrow/compute/kernels/scalar_cast_nested.cc:
##########
@@ -145,6 +153,135 @@ void AddListCast(CastFunction* func) {
   DCHECK_OK(func->AddKernel(SrcType::type_id, std::move(kernel)));
 }
 
+template <typename DestType>
+struct CastFixedToVarList {
+  using dest_offset_type = typename DestType::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    auto child_type = checked_cast<const DestType&>(*out->type()).value_type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    ArrayData* out_array = out->array_data().get();
+    ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
+                          GetNullBitmapBuffer(in_array, ctx->memory_pool()));
+
+    const auto& in_type = checked_cast<const 
FixedSizeListType&>(*in_array.type);
+    const int32_t list_size = in_type.list_size();
+
+    // Allocate a new offsets buffer
+    ARROW_ASSIGN_OR_RAISE(out_array->buffers[1],
+                          ctx->Allocate(sizeof(dest_offset_type) * 
(batch.length + 1)));
+    auto* offsets = out_array->GetMutableValues<dest_offset_type>(1);
+    dest_offset_type offset = 0;
+    for (int64_t i = 0; i <= batch.length; ++i) {
+      offsets[i] = offset;
+      offset += list_size;
+    }
+
+    // Handle values
+    std::shared_ptr<ArrayData> values = in_array.child_data[0].ToArrayData();
+    if (in_array.offset > 0 || (in_array.length * list_size < batch.length)) {
+      values = values->Slice(in_array.offset * list_size, in_array.length * 
list_size);
+    }
+    ARROW_ASSIGN_OR_RAISE(Datum cast_values,
+                          Cast(values, child_type, options, 
ctx->exec_context()));
+    DCHECK(cast_values.is_array());
+    out_array->child_data.push_back(cast_values.array());
+
+    return Status::OK();
+  }
+};
+
+template <typename SrcType>
+struct CastVarToFixedList {
+  using src_offset_type = typename SrcType::offset_type;
+
+  static Status Exec(KernelContext* ctx, const ExecSpan& batch, ExecResult* 
out) {
+    const CastOptions& options = CastState::Get(ctx);
+
+    auto child_type = checked_cast<const 
FixedSizeListType&>(*out->type()).value_type();
+
+    const ArraySpan& in_array = batch[0].array;
+
+    const auto& out_type = checked_cast<const 
FixedSizeListType&>(*out->type());
+    const int32_t list_size = out_type.list_size();
+
+    // Validate lengths by comparing to the expected offsets.
+    const auto* offsets = in_array.GetValues<src_offset_type>(1);
+    src_offset_type expected_offset = offsets[0];
+    if (in_array.GetNullCount() > 0) {
+      for (int64_t i = 0; i <= batch.length; ++i) {
+        if (in_array.IsNull(i)) {
+          expected_offset += offsets[i + 1] + list_size;
+        } else {
+          if (offsets[i] != expected_offset) {
+            return Status::Invalid("ListType can only be casted to 
FixedSizeListType ",
+                                   "if the lists are all the expected size.");
+          }
+          expected_offset += list_size;
+        }
+      }
+    } else {
+      // Don't need to check null slots if there are no nulls
+      for (int64_t i = 0; i <= batch.length; ++i) {
+        if (offsets[i] != expected_offset) {
+          return Status::Invalid("ListType can only be casted to 
FixedSizeListType ",
+                                 "if the lists are all the expected size.");
+        }
+        expected_offset += list_size;
+      }
+    }
+
+    ArrayData* out_array = out->array_data().get();
+    ARROW_ASSIGN_OR_RAISE(out_array->buffers[0],
+                          GetNullBitmapBuffer(in_array, ctx->memory_pool()));
+
+    // Handle values
+    std::shared_ptr<ArrayData> values = in_array.child_data[0].ToArrayData();
+    ARROW_ASSIGN_OR_RAISE(Datum cast_values_datum,
+                          Cast(values, child_type, options, 
ctx->exec_context()));
+
+    DCHECK(cast_values_datum.is_array());
+    std::shared_ptr<ArrayData> cast_values = cast_values_datum.array();
+
+    if (in_array.GetNullCount() > 0 && in_array.length > 0) {

Review Comment:
   ```suggestion
       if (in_array.GetNullCount() > 0) {
   ```
   I'm not sure if this check is necessary. If the batch is empty, the executor 
would return an empty array without invoking the kernel: 
https://github.com/apache/arrow/blob/main/cpp/src/arrow/compute/exec.cc#L786



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to