mapleFU commented on code in PR #44822:
URL: https://github.com/apache/arrow/pull/44822#discussion_r1855369115
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
}
}
- const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
- input.GetValues<BinaryViewType::c_type>(1), input.length);
-
- // TODO(GH-43573): A more efficient implementation that copies the validity
- // bitmap all at once is possible, but would mean we don't delegate all the
- // building logic to the ArrayBuilder implementation for the output type.
- OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
- RETURN_NOT_OK(builder.Resize(input.length));
- RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
- arrow::internal::ArraySpanInlineVisitor<I> visitor;
- RETURN_NOT_OK(visitor.VisitStatus(
- input,
- [&](std::string_view v) {
- // Append valid string view
- return builder.Append(v);
+ ArrayData* output = out->array_data().get();
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+
+ // Set up bitmap
+ if (input.offset == output->offset) {
+ output->buffers[0] = input.GetBuffer(0);
+ } else {
+ if (input.buffers[0].data != NULLPTR) {
Review Comment:
```
// When the offsets are different (e.g., due to a slice operation), we need
// to check that the null bitmap buffer is not null before copying it. The
// null bitmap buffer can be null if the input array does not contain any
// null values.
```
Do we also need a comment here?
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
}
}
- const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
- input.GetValues<BinaryViewType::c_type>(1), input.length);
-
- // TODO(GH-43573): A more efficient implementation that copies the validity
- // bitmap all at once is possible, but would mean we don't delegate all the
- // building logic to the ArrayBuilder implementation for the output type.
- OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
- RETURN_NOT_OK(builder.Resize(input.length));
- RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
- arrow::internal::ArraySpanInlineVisitor<I> visitor;
- RETURN_NOT_OK(visitor.VisitStatus(
- input,
- [&](std::string_view v) {
- // Append valid string view
- return builder.Append(v);
+ ArrayData* output = out->array_data().get();
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+
+ // Set up bitmap
+ if (input.offset == output->offset) {
+ output->buffers[0] = input.GetBuffer(0);
+ } else {
+ if (input.buffers[0].data != NULLPTR) {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[0],
+ arrow::internal::CopyBitmap(ctx->memory_pool(),
input.buffers[0].data,
+ input.offset, input.length));
+ }
+ }
+
+ // Set up offset and data buffer
+ DataBuilder data_builder(ctx->memory_pool());
Review Comment:
The previous implementation calculated `sum_of_binary_view_sizes` and called
`ReserveData` with it. Why doesn't the new code reserve the data the same way?
Would appending to the buffer blindly, with a capacity check on every append,
really be faster?
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
}
}
- const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
- input.GetValues<BinaryViewType::c_type>(1), input.length);
-
- // TODO(GH-43573): A more efficient implementation that copies the validity
- // bitmap all at once is possible, but would mean we don't delegate all the
- // building logic to the ArrayBuilder implementation for the output type.
- OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
- RETURN_NOT_OK(builder.Resize(input.length));
- RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
- arrow::internal::ArraySpanInlineVisitor<I> visitor;
- RETURN_NOT_OK(visitor.VisitStatus(
- input,
- [&](std::string_view v) {
- // Append valid string view
- return builder.Append(v);
+ ArrayData* output = out->array_data().get();
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+
+ // Set up bitmap
Review Comment:
```suggestion
// Set up validity bitmap
```
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
}
}
- const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
- input.GetValues<BinaryViewType::c_type>(1), input.length);
-
- // TODO(GH-43573): A more efficient implementation that copies the validity
- // bitmap all at once is possible, but would mean we don't delegate all the
- // building logic to the ArrayBuilder implementation for the output type.
- OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
- RETURN_NOT_OK(builder.Resize(input.length));
- RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
- arrow::internal::ArraySpanInlineVisitor<I> visitor;
- RETURN_NOT_OK(visitor.VisitStatus(
- input,
- [&](std::string_view v) {
- // Append valid string view
- return builder.Append(v);
+ ArrayData* output = out->array_data().get();
+ output->length = input.length;
+ output->SetNullCount(input.null_count);
+
+ // Set up bitmap
+ if (input.offset == output->offset) {
+ output->buffers[0] = input.GetBuffer(0);
+ } else {
+ if (input.buffers[0].data != NULLPTR) {
+ ARROW_ASSIGN_OR_RAISE(
+ output->buffers[0],
+ arrow::internal::CopyBitmap(ctx->memory_pool(),
input.buffers[0].data,
+ input.offset, input.length));
+ }
+ }
+
+ // Set up offset and data buffer
+ DataBuilder data_builder(ctx->memory_pool());
+ OffsetBuilder offset_builder(ctx->memory_pool());
+ RETURN_NOT_OK(offset_builder.Reserve(batch.length + 1));
+ offset_builder.UnsafeAppend(0); // offsets start at 0
+ RETURN_NOT_OK(VisitArraySpanInline<I>(
+ batch[0].array,
+ [&](std::string_view s) {
+ // for non-null value, append string view to buffer and calculate
offset
+ ARROW_RETURN_NOT_OK(data_builder.Append(
+ reinterpret_cast<const uint8_t*>(s.data()),
static_cast<int64_t>(s.size())));
+
offset_builder.UnsafeAppend(static_cast<offset_type>(data_builder.length()));
+ return Status::OK();
},
[&]() {
- // Append null
- builder.UnsafeAppendNull();
+ // for null value, no need to update buffer
Review Comment:
```suggestion
// for null value, no need to update data buffer
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]