zanmato1984 commented on code in PR #48171:
URL: https://github.com/apache/arrow/pull/48171#discussion_r2604883727
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -304,10 +306,32 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
}
}
- // Start with a zero-copy cast, but change indices to expected size
+ // Start with a zero-copy cast, but change indices to the correct size and
set validity
+ // bitmap and offset if needed.
RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
- return CastBinaryToBinaryOffsets<typename I::offset_type, typename
O::offset_type>(
- ctx, input, out->array_data().get());
+ if constexpr (sizeof(typename I::offset_type) != sizeof(typename
O::offset_type)) {
+ std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
+ ArrayData* output = out->array_data().get();
+
+ // Slice buffers to reduce allocation when casting the offsets buffer
Review Comment:
I think we can be more verbose about this comment - it might not be obvious
for others to see how the allocation is reduced.
##########
cpp/src/arrow/compute/kernels/scalar_cast_test.cc:
##########
@@ -3400,6 +3400,34 @@ TEST(Cast, StringToString) {
}
}
+TEST(Cast, StringToStringWithOffset) {
+ // GH-43660: Check casting String Arrays with nonzero offset
+ for (auto from_type : {utf8(), large_utf8()}) {
+ for (auto to_type : {utf8(), large_utf8()}) {
+ for (int64_t offset : {3, 8, 10, 12}) {
+ auto input_with_nulls = R"([
+ "foo", null, "bar", null, "quu", "foo", "baz", "bar",
+ null, "bar", "baz", null
+ ])";
+
+ auto input_arr_with_nulls = ArrayFromJSON(from_type, input_with_nulls);
+ auto output_arr_with_nulls = ArrayFromJSON(to_type, input_with_nulls);
+ CheckCast(input_arr_with_nulls->Slice(offset),
+ output_arr_with_nulls->Slice(offset));
Review Comment:
Maybe with some slice-with-length cases as well?
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -304,10 +306,32 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
}
}
- // Start with a zero-copy cast, but change indices to expected size
+ // Start with a zero-copy cast, but change indices to the correct size and
set validity
+ // bitmap and offset if needed.
RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
- return CastBinaryToBinaryOffsets<typename I::offset_type, typename
O::offset_type>(
- ctx, input, out->array_data().get());
+ if constexpr (sizeof(typename I::offset_type) != sizeof(typename
O::offset_type)) {
+ std::shared_ptr<ArrayData> input_arr = input.ToArrayData();
+ ArrayData* output = out->array_data().get();
+
+ // Slice buffers to reduce allocation when casting the offsets buffer
+ int64_t input_offset = input_arr->offset;
+ size_t input_offset_type_size = sizeof(typename I::offset_type);
+ if (output->null_count != 0 && output->buffers[0]) {
+ // Avoid reallocation of the validity buffer by allowing some padding
bits
+ output->offset = input_arr->offset % 8;
Review Comment:
```suggestion
output->offset = input_offset % 8;
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]