bkietz commented on code in PR #43302:
URL: https://github.com/apache/arrow/pull/43302#discussion_r1693596023
##########
cpp/src/arrow/visit_data_inline.h:
##########
@@ -249,7 +249,7 @@ VisitArraySpanInline(const ArraySpan& arr, ValidFunc&&
valid_func, NullFunc&& nu
// The scalar value's type depends on the array data type:
// - the type's `c_type`, if any
// - for boolean arrays, a `bool`
-// - for binary, string and fixed-size binary arrays, a `std::string_view`
+// - for binary, string[-view] and fixed-size binary arrays, a
`std::string_view`
Review Comment:
Nit: inline the other types
```suggestion
// - for binary, string, large binary, large string, binary view, string view,
and fixed-size binary arrays, a `std::string_view`
```
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input, [&](std::string_view v) { return builder.Append(v); },
+ [&]() {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Span -> View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+ auto offsets_buffer = std::move(output->buffers[1]);
+ auto data_buffer = std::move(output->buffers[2]);
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+ auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);
+
+ bool all_entries_are_inline = true;
+ VisitSetBitRunsVoid(
+ validity, output->offset, output->length,
+ [&](int64_t start_offset, int64_t run_length) {
+ for (int64_t i = start_offset; i < start_offset + run_length; i++) {
+ const offset_type data_offset = input_offsets[i];
+ const offset_type data_length = input_offsets[i + 1] - data_offset;
+ auto& out_view = out_views[i];
+ if (data_length <= BinaryViewType::kInlineSize) {
+ out_view.inlined.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.inlined.data.data(), input_data + data_offset,
data_length);
+ } else {
+ out_view.ref.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.ref.prefix.data(), input_data + data_offset,
+ BinaryViewType::kPrefixSize);
+ // out_view.ref.buffer_index = 0;
+ out_view.ref.offset = static_cast<int32_t>(data_offset);
+ // TODO(felipecrv): validate data_offsets can't overflow
+ all_entries_are_inline = false;
+ }
Review Comment:
(If the existing binary view helper functions aren't the helpers you'd like to have, please extract some.)
##########
cpp/src/arrow/type_traits.h:
##########
@@ -1624,6 +1639,16 @@ static inline bool is_binary(const DataType& type) {
return is_binary(type.id())
/// Convenience for checking using the type's id
static inline bool is_string(const DataType& type) { return
is_string(type.id()); }
+/// \brief Check for a binary-view-like type
+///
+/// \param[in] type the type to check
+/// \return whether type is a binary-view-like type
+///
+/// Convenience for checking using the type's id
+static inline bool is_binary_view_like(const DataType& type) {
Review Comment:
I think there's no utility in constexpr type predicates for now. I have a
proposal somewhere for an improved C++17 `visit` function where we might use
constexpr type predicates:
```c++
const DataType& type = ...;
VisitType(type, [&](auto type) {
if constexpr (is_binary_view_like(type)) {
// access to type specific properties can be done here
decltype(type)::TypeClass::is_utf8;
}
});
```
##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -305,19 +310,198 @@ BinaryToBinaryCastExec(KernelContext* ctx, const
ExecSpan& batch, ExecResult* ou
ctx, input, out->array_data().get());
}
+// View -> Span
+template <typename O, typename I>
+enable_if_t<is_binary_view_like_type<I>::value &&
is_base_binary_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using OutputBuilderType = typename TypeTraits<O>::BuilderType;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // XXX: a more efficient implementation that zero-copies the validity bitmap
+ // is possible, but requires a more complex implementation for building the
+ // offsets and data buffers
+ OutputBuilderType builder(options.to_type.GetSharedPtr(),
ctx->memory_pool());
+ RETURN_NOT_OK(builder.Resize(input.length));
+ arrow::internal::ArraySpanInlineVisitor<I> visitor;
+ RETURN_NOT_OK(visitor.VisitStatus(
+ input, [&](std::string_view v) { return builder.Append(v); },
+ [&]() {
+ builder.UnsafeAppendNull();
+ return Status::OK();
+ }));
+
+ std::shared_ptr<ArrayData> output_array;
+ RETURN_NOT_OK(builder.FinishInternal(&output_array));
+ out->value = std::move(output_array);
+ return Status::OK();
+}
+
+// Span -> View
+template <typename O, typename I>
+enable_if_t<is_base_binary_type<I>::value &&
is_binary_view_like_type<O>::value, Status>
+BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult*
out) {
+ using offset_type = typename I::offset_type;
+ const CastOptions& options = checked_cast<const
CastState&>(*ctx->state()).options;
+ const ArraySpan& input = batch[0].array;
+
+ if constexpr (!I::is_utf8 && O::is_utf8) {
+ if (!options.allow_invalid_utf8) {
+ InitializeUTF8();
+ ArraySpanVisitor<I> visitor;
+ Utf8Validator validator;
+ RETURN_NOT_OK(visitor.Visit(input, &validator));
+ }
+ }
+
+ // Start with a zero-copy cast, then reconfigure the view and data buffers
+ RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out));
+ ArrayData* output = out->array_data().get();
+ auto offsets_buffer = std::move(output->buffers[1]);
+ auto data_buffer = std::move(output->buffers[2]);
+
+ const int64_t total_length = input.offset + input.length;
+ const auto* validity = input.GetValues<uint8_t>(0, 0);
+ const auto* input_offsets = input.GetValues<offset_type>(1);
+ const auto* input_data = input.GetValues<uint8_t>(2, 0);
+
+ // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries.
+ ARROW_ASSIGN_OR_RAISE(output->buffers[1],
+ ctx->Allocate(total_length * BinaryViewType::kSize));
+ memset(output->buffers[1]->mutable_data(), 0, total_length *
BinaryViewType::kSize);
+ auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1);
+
+ bool all_entries_are_inline = true;
+ VisitSetBitRunsVoid(
+ validity, output->offset, output->length,
+ [&](int64_t start_offset, int64_t run_length) {
+ for (int64_t i = start_offset; i < start_offset + run_length; i++) {
+ const offset_type data_offset = input_offsets[i];
+ const offset_type data_length = input_offsets[i + 1] - data_offset;
+ auto& out_view = out_views[i];
+ if (data_length <= BinaryViewType::kInlineSize) {
+ out_view.inlined.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.inlined.data.data(), input_data + data_offset,
data_length);
+ } else {
+ out_view.ref.size = static_cast<int32_t>(data_length);
+ memcpy(out_view.ref.prefix.data(), input_data + data_offset,
+ BinaryViewType::kPrefixSize);
+ // out_view.ref.buffer_index = 0;
+ out_view.ref.offset = static_cast<int32_t>(data_offset);
+ // TODO(felipecrv): validate data_offsets can't overflow
+ all_entries_are_inline = false;
+ }
Review Comment:
There are some [helper
functions](https://github.com/bkietz/arrow/blob/7aea8bf7a65d679bd71d973b358f997eb3b6c6af/cpp/src/arrow/util/binary_view_util.h#L40-L41)
which could be reused here:
```suggestion
out_views[i] = ToBinaryView(input_data + data_offset, data_length,
0, static_cast<int32_t>(data_offset));
all_entries_are_inline &= out_views[i].is_inline();
if constexpr (sizeof(data_offset) == sizeof(int64_t)) {
any_overflowed_offsets |= data_offset >
std::numeric_limits<int32_t>::max();
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]