pitrou commented on code in PR #43302: URL: https://github.com/apache/arrow/pull/43302#discussion_r1743972141
########## cpp/src/arrow/compute/kernels/scalar_cast_string.cc: ########## @@ -305,19 +310,243 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou ctx, input, out->array_data().get()); } +// String View -> Offset String +template <typename O, typename I> +enable_if_t<is_binary_view_like_type<I>::value && is_base_binary_type<O>::value, Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + using OutputBuilderType = typename TypeTraits<O>::BuilderType; + const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; + const ArraySpan& input = batch[0].array; + + if constexpr (!I::is_utf8 && O::is_utf8) { + if (!options.allow_invalid_utf8) { + InitializeUTF8(); + ArraySpanVisitor<I> visitor; + Utf8Validator validator; + RETURN_NOT_OK(visitor.Visit(input, &validator)); + } + } + + const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes( + input.GetValues<BinaryViewType::c_type>(1), input.length); + + // TODO(GH-43573): A more efficient implementation that copies the validity + // bitmap all at once is possible, but would mean we don't delegate all the + // building logic to the ArrayBuilder implementation for the output type. + OutputBuilderType builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); + RETURN_NOT_OK(builder.Resize(input.length)); + RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes)); + arrow::internal::ArraySpanInlineVisitor<I> visitor; + RETURN_NOT_OK(visitor.VisitStatus( + input, + [&](std::string_view v) { + // Append valid string view + return builder.Append(v); + }, + [&]() { + // Append null + builder.UnsafeAppendNull(); + return Status::OK(); + })); + + std::shared_ptr<ArrayData> output_array; + RETURN_NOT_OK(builder.FinishInternal(&output_array)); + out->value = std::move(output_array); + return Status::OK(); +} + +// Offset String -> String View +template <typename O, typename I> +enable_if_t<is_base_binary_type<I>::value && is_binary_view_like_type<O>::value, Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + using offset_type = typename I::offset_type; + const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; + const ArraySpan& input = batch[0].array; + + if constexpr (!I::is_utf8 && O::is_utf8) { + if (!options.allow_invalid_utf8) { + InitializeUTF8(); + ArraySpanVisitor<I> visitor; + Utf8Validator validator; + RETURN_NOT_OK(visitor.Visit(input, &validator)); + } + } + + // Start with a zero-copy cast, then reconfigure the view and data buffers + RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); + ArrayData* output = out->array_data().get(); + + const int64_t total_length = input.offset + input.length; + const auto* validity = input.GetValues<uint8_t>(0, 0); + const auto* input_offsets = input.GetValues<offset_type>(1); + const auto* input_data = input.GetValues<uint8_t>(2, 0); + + // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries. + ARROW_ASSIGN_OR_RAISE(output->buffers[1], + ctx->Allocate(total_length * BinaryViewType::kSize)); + memset(output->buffers[1]->mutable_data(), 0, total_length * BinaryViewType::kSize); + + // Check against offset overflow + if constexpr (sizeof(offset_type) > 4) { + if (total_length > 0) { + // Offsets are monotonically increasing, that is, offsets[j] <= offsets[j+1] for + // 0 <= j < length, even for null slots. So we only need to check the last offset. + const int64_t max_data_offset = input_offsets[input.length]; + if (ARROW_PREDICT_FALSE(max_data_offset > std::numeric_limits<int32_t>::max())) { + // A more complicated loop could work by slicing the data buffer into + // more than one variadic buffer, but this is probably overkill for now + // before someone hits this problem in practice. + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", Review Comment: Perhaps make this `Status::CapacityError` though we rarely use it. ########## cpp/src/arrow/compute/kernels/scalar_cast_test.cc: ########## @@ -216,8 +223,15 @@ TEST(Cast, CanCast) { ExpectCannotCast(timestamp(TimeUnit::MICRO), {binary(), large_binary()}); // no formatting supported - ExpectCanCast(fixed_size_binary(3), - {binary(), utf8(), large_binary(), large_utf8(), fixed_size_binary(3)}); + ExpectCanCast(fixed_size_binary(3), { + utf8(), + large_utf8(), + utf8_view(), + binary(), + large_binary(), + binary_view(), Review Comment: You could just use `kBaseBinaryAndViewTypes` here and have a separate `ExpectCanCast` for the identity cast? ########## cpp/src/arrow/compute/kernels/scalar_cast_string.cc: ########## @@ -305,19 +310,243 @@ BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* ou ctx, input, out->array_data().get()); } +// String View -> Offset String +template <typename O, typename I> +enable_if_t<is_binary_view_like_type<I>::value && is_base_binary_type<O>::value, Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + using OutputBuilderType = typename TypeTraits<O>::BuilderType; + const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; + const ArraySpan& input = batch[0].array; + + if constexpr (!I::is_utf8 && O::is_utf8) { + if (!options.allow_invalid_utf8) { + InitializeUTF8(); + ArraySpanVisitor<I> visitor; + Utf8Validator validator; + RETURN_NOT_OK(visitor.Visit(input, &validator)); + } + } + + const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes( + input.GetValues<BinaryViewType::c_type>(1), input.length); + + // TODO(GH-43573): A more efficient implementation that copies the validity + // bitmap all at once is possible, but would mean we don't delegate all the + // building logic to the ArrayBuilder implementation for the output type. + OutputBuilderType builder(options.to_type.GetSharedPtr(), ctx->memory_pool()); + RETURN_NOT_OK(builder.Resize(input.length)); + RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes)); + arrow::internal::ArraySpanInlineVisitor<I> visitor; + RETURN_NOT_OK(visitor.VisitStatus( + input, + [&](std::string_view v) { + // Append valid string view + return builder.Append(v); + }, + [&]() { + // Append null + builder.UnsafeAppendNull(); + return Status::OK(); + })); + + std::shared_ptr<ArrayData> output_array; + RETURN_NOT_OK(builder.FinishInternal(&output_array)); + out->value = std::move(output_array); + return Status::OK(); +} + +// Offset String -> String View +template <typename O, typename I> +enable_if_t<is_base_binary_type<I>::value && is_binary_view_like_type<O>::value, Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + using offset_type = typename I::offset_type; + const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; + const ArraySpan& input = batch[0].array; + + if constexpr (!I::is_utf8 && O::is_utf8) { + if (!options.allow_invalid_utf8) { + InitializeUTF8(); + ArraySpanVisitor<I> visitor; + Utf8Validator validator; + RETURN_NOT_OK(visitor.Visit(input, &validator)); + } + } + + // Start with a zero-copy cast, then reconfigure the view and data buffers + RETURN_NOT_OK(ZeroCopyCastExec(ctx, batch, out)); + ArrayData* output = out->array_data().get(); + + const int64_t total_length = input.offset + input.length; + const auto* validity = input.GetValues<uint8_t>(0, 0); + const auto* input_offsets = input.GetValues<offset_type>(1); + const auto* input_data = input.GetValues<uint8_t>(2, 0); + + // Turn buffers[1] into a buffer of empty BinaryViewType::c_type entries. + ARROW_ASSIGN_OR_RAISE(output->buffers[1], + ctx->Allocate(total_length * BinaryViewType::kSize)); + memset(output->buffers[1]->mutable_data(), 0, total_length * BinaryViewType::kSize); + + // Check against offset overflow + if constexpr (sizeof(offset_type) > 4) { + if (total_length > 0) { + // Offsets are monotonically increasing, that is, offsets[j] <= offsets[j+1] for + // 0 <= j < length, even for null slots. So we only need to check the last offset. + const int64_t max_data_offset = input_offsets[input.length]; + if (ARROW_PREDICT_FALSE(max_data_offset > std::numeric_limits<int32_t>::max())) { + // A more complicated loop could work by slicing the data buffer into + // more than one variadic buffer, but this is probably overkill for now + // before someone hits this problem in practice. + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", + output->type->ToString(), + ": input array too large for efficient conversion."); + } + } + } + + auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1); + + // If all entries are inline, we can drop the extra data buffer for + // large strings in output->buffers[2]. + bool all_entries_are_inline = true; + VisitSetBitRunsVoid( + validity, output->offset, output->length, + [&](int64_t start_offset, int64_t run_length) { + for (int64_t i = start_offset; i < start_offset + run_length; i++) { + const offset_type data_offset = input_offsets[i]; + const offset_type data_length = input_offsets[i + 1] - data_offset; + auto& out_view = out_views[i]; + if (data_length <= BinaryViewType::kInlineSize) { + out_view.inlined.size = static_cast<int32_t>(data_length); + memcpy(out_view.inlined.data.data(), input_data + data_offset, data_length); + } else { + out_view.ref.size = static_cast<int32_t>(data_length); + memcpy(out_view.ref.prefix.data(), input_data + data_offset, + BinaryViewType::kPrefixSize); + // (buffer_index is 0'd by the memset of the buffer 1 above) + // out_view.ref.buffer_index = 0; + out_view.ref.offset = static_cast<int32_t>(data_offset); + all_entries_are_inline = false; + } + } + }); + if (all_entries_are_inline) { + output->buffers[2] = nullptr; + } + return Status::OK(); +} + +// String View -> String View +template <typename O, typename I> +enable_if_t<is_binary_view_like_type<I>::value && is_binary_view_like_type<O>::value, + Status> +BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { + const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; + const ArraySpan& input = batch[0].array; + + if constexpr (!I::is_utf8 && O::is_utf8) { + if (!options.allow_invalid_utf8) { + InitializeUTF8(); + ArraySpanVisitor<I> visitor; + Utf8Validator validator; + RETURN_NOT_OK(visitor.Visit(input, &validator)); + } + } + + return ZeroCopyCastExec(ctx, batch, out); +} + +// Fixed -> String View template <typename O, typename I> enable_if_t<std::is_same<I, FixedSizeBinaryType>::value && - !std::is_same<O, FixedSizeBinaryType>::value, + is_binary_view_like_type<O>::value, Status> BinaryToBinaryCastExec(KernelContext* ctx, const ExecSpan& batch, ExecResult* out) { const CastOptions& options = checked_cast<const CastState&>(*ctx->state()).options; const ArraySpan& input = batch[0].array; - if (O::is_utf8 && !options.allow_invalid_utf8) { - InitializeUTF8(); - ArraySpanVisitor<I> visitor; - Utf8Validator validator; - RETURN_NOT_OK(visitor.Visit(input, &validator)); + if constexpr (!I::is_utf8 && O::is_utf8) { + if (!options.allow_invalid_utf8) { + InitializeUTF8(); + ArraySpanVisitor<I> visitor; + Utf8Validator validator; + RETURN_NOT_OK(visitor.Visit(input, &validator)); + } + } + + const int32_t fixed_size_width = input.type->byte_width(); + const int64_t total_length = input.offset + input.length; + + ArrayData* output = out->array_data().get(); + DCHECK_EQ(output->length, input.length); + output->offset = input.offset; + output->buffers.resize(3); + output->SetNullCount(input.null_count); + // Share the validity bitmap buffer + output->buffers[0] = input.GetBuffer(0); + // Init buffers[1] with input.length empty BinaryViewType::c_type entries. + ARROW_ASSIGN_OR_RAISE(output->buffers[1], + ctx->Allocate(total_length * BinaryViewType::kSize)); + memset(output->buffers[1]->mutable_data(), 0, total_length * BinaryViewType::kSize); + auto* out_views = output->GetMutableValues<BinaryViewType::c_type>(1); + + auto data_buffer = input.GetBuffer(1); + const auto* data = data_buffer->data(); + + // Check against offset overflow + if (total_length > 0) { + const int64_t max_data_offset = (total_length - 1) * fixed_size_width; + if (ARROW_PREDICT_FALSE(max_data_offset > std::numeric_limits<int32_t>::max())) { + // A more complicated loop could work by slicing the data buffer into + // more than one variadic buffer, but this is probably overkill for now + // before someone hits this problem in practice. + return Status::Invalid("Failed casting from ", input.type->ToString(), " to ", Review Comment: Ditto here ########## cpp/src/arrow/compute/kernels/scalar_cast_test.cc: ########## @@ -198,11 +201,15 @@ TEST(Cast, CanCast) { ExpectCannotCast(from_numeric, {null()}); } - for (auto from_base_binary : kBaseBinaryTypes) { + for (auto from_base_binary : kBaseBinaryAndViewTypes) { ExpectCanCast(from_base_binary, {boolean()}); ExpectCanCast(from_base_binary, kNumericTypes); ExpectCanCast(from_base_binary, kBaseBinaryTypes); - ExpectCanCast(dictionary(int64(), from_base_binary), {from_base_binary}); + /// TODO(GH-43010): include is_binary_view_like() types here once array_take + /// can handle string-views Review Comment: Nit: regular comments ```suggestion // TODO(GH-43010): include is_binary_view_like() types here once array_take // can handle string-views ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org