mapleFU commented on code in PR #43302:
URL: https://github.com/apache/arrow/pull/43302#discussion_r1706384422
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,225 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// String View -> Offset String
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
+ input.GetValues<BinaryViewType::c_type>(1), input.length);
+
+ // TODO(GH-43573): A more efficient implementation that copies the validity
+ // bitmap all at once is possible, but would mean we don't delegate all the
+ // building logic to the ArrayBuilder implementation for the output type.
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input,
+ [&](std::string_view v) {
+ // Append valid string view
+ return builder.Append(v);
+ },
+ [&]() {
+ // Append null
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Offset String -> String View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+ auto offsets_buffer = std::move(output->buffers[1]);
+ auto data_buffer = std::move(output->buffers[2]);
Review Comment:
Maybe not directly related, but can we also check for unused variables in CI?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]