pitrou commented on a change in pull request #8468: URL: https://github.com/apache/arrow/pull/8468#discussion_r596070474
##########
File path: cpp/src/arrow/compute/kernels/scalar_string.cc
##########
@@ -1194,6 +1198,197 @@ void AddSplit(FunctionRegistry* registry) {
 #endif
 }

+// ----------------------------------------------------------------------
+// replace substring
+
+template <typename Type, typename Derived>
+struct ReplaceSubStringBase {
+  using ArrayType = typename TypeTraits<Type>::ArrayType;
+  using ScalarType = typename TypeTraits<Type>::ScalarType;
+  using BuilderType = typename TypeTraits<Type>::BuilderType;
+  using offset_type = typename Type::offset_type;
+  using ValueDataBuilder = TypedBufferBuilder<uint8_t>;
+  using OffsetBuilder = TypedBufferBuilder<offset_type>;
+  using State = OptionsWrapper<ReplaceSubstringOptions>;
+
+  static void Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    Derived derived(ctx, State::Get(ctx));
+    if (ctx->status().ok()) {
+      derived.Replace(ctx, batch, out);
+    }
+  }
+  void Replace(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+    std::shared_ptr<ValueDataBuilder> value_data_builder =
+        std::make_shared<ValueDataBuilder>();
+    std::shared_ptr<OffsetBuilder> offset_builder = std::make_shared<OffsetBuilder>();
+
+    if (batch[0].kind() == Datum::ARRAY) {
+      // We already know how many strings we have, so we can use Reserve/UnsafeAppend
+      KERNEL_RETURN_IF_ERROR(ctx, offset_builder->Reserve(batch[0].array()->length));
+
+      const ArrayData& input = *batch[0].array();
+      KERNEL_RETURN_IF_ERROR(ctx, offset_builder->Append(0));  // offsets start at 0
+      KERNEL_RETURN_IF_ERROR(
+          ctx, VisitArrayDataInline<Type>(
+                   input,
+                   [&](util::string_view s) {
+                     RETURN_NOT_OK(static_cast<Derived&>(*this).ReplaceString(
+                         s, value_data_builder.get()));
+                     offset_builder->UnsafeAppend(
+                         static_cast<offset_type>(value_data_builder->length()));
+                     return Status::OK();
+                   },
+                   [&]() {
+                     // offset for null value
+                     offset_builder->UnsafeAppend(
+                         static_cast<offset_type>(value_data_builder->length()));
+                     return Status::OK();
+                   }));
+      ArrayData* output = out->mutable_array();
+      KERNEL_RETURN_IF_ERROR(ctx, value_data_builder->Finish(&output->buffers[2]));
+      KERNEL_RETURN_IF_ERROR(ctx, offset_builder->Finish(&output->buffers[1]));
+    } else {
+      const auto& input = checked_cast<const ScalarType&>(*batch[0].scalar());
+      auto result = std::make_shared<ScalarType>();
+      if (input.is_valid) {
+        util::string_view s = static_cast<util::string_view>(*input.value);
+        KERNEL_RETURN_IF_ERROR(
+            ctx, static_cast<Derived&>(*this).ReplaceString(s, value_data_builder.get()));
+        KERNEL_RETURN_IF_ERROR(ctx, value_data_builder->Finish(&result->value));
+        result->is_valid = true;
+      }
+      out->value = result;
+    }
+  }
+};
+
+template <typename Type>
+struct ReplaceSubString : ReplaceSubStringBase<Type, ReplaceSubString<Type>> {

Review comment:
Well, you don't need to refactor other kernels for now, but I suppose this one could easily be adapted, no? :-)

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org