pitrou commented on a change in pull request #11793:
URL: https://github.com/apache/arrow/pull/11793#discussion_r761007256
##########
File path: cpp/src/arrow/compute/kernels/codegen_internal.h
##########
@@ -1169,6 +1169,19 @@ ArrayKernelExec
GeneratePhysicalNumeric(detail::GetTypeId get_id) {
}
}
+template <template <typename... Args> class Generator, typename... Args>
+ArrayKernelExec GeneratePhysicalDecimalToPhysicalDecimal(detail::GetTypeId
get_id) {
Review comment:
This name is a bit weird; why not simply `GenerateDecimal`?
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -439,6 +472,330 @@ struct ScalarMinMax {
}
};
+template <typename Type, typename Op>
+struct BinaryScalarMinMax {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ if (!options.skip_nulls) {
+ // any nulls in the input will produce a null output
+ for (const auto& value : batch.values) {
+ if (!value.scalar()->is_valid) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ }
+ }
+ const auto& first_scalar = *batch.values.front().scalar();
+ string_view result = UnboxScalar<Type>::Unbox(first_scalar);
+ bool valid = first_scalar.is_valid;
+ for (size_t i = 1; i < batch.values.size(); i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid) {
+ DCHECK(options.skip_nulls);
+ continue;
+ } else {
+ string_view value = UnboxScalar<Type>::Unbox(scalar);
+ result = !valid ? value : Op::Call(result, value);
+ valid = true;
+ }
+ }
+ if (valid) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size()));
+ std::copy(result.begin(), result.end(), output->value->mutable_data());
+ output->is_valid = true;
+ } else {
+ output->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations, using an upper bound estimation of
final size.
+ int64_t estimated_final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSizeUpperBound(options, batch, i);
+ if (size > 0) estimated_final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(estimated_final_size));
+
+ std::vector<util::optional<string_view>> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0;
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
+ num_valid++;
+ } else {
+ valid_cols[col] = util::nullopt;
+ }
+ } else {
+ const auto& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const auto offsets = array.GetValues<offset_type>(1);
+ const auto data = array.GetValues<uint8_t>(2,
/*absolute_offset=*/0);
+ const int64_t length = offsets[row + 1] - offsets[row];
+ valid_cols[col] =
+ string_view(reinterpret_cast<const char*>(data +
offsets[row]), length);
+ num_valid++;
+ } else {
+ valid_cols[col] = util::nullopt;
+ }
+ }
+ }
+
+ if (num_valid == 0 || (num_valid < batch.values.size() &&
!options.skip_nulls)) {
+ // We had some nulls
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ auto result = valid_cols.front();
+ for (size_t col = 1; col < batch.values.size(); ++col) {
+ const auto value = valid_cols[col];
+ if (!value) {
+ DCHECK(options.skip_nulls);
+ continue;
+ }
+ result = !result ? *value : Op::Call(*result, *value);
+ }
+ if (result) {
+ builder.UnsafeAppend(*result);
+ } else {
+ builder.UnsafeAppendNull();
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ DCHECK_GE(estimated_final_size,
+ checked_cast<const
ArrayType&>(*string_array).total_values_length());
+ return Status::OK();
+ }
+
+ // Compute and upper bound for the length of the output for the given
position,
+ // or -1 if it would be null.
+ static int64_t CalculateRowSizeUpperBound(const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, const
int64_t index) {
+ const auto num_args = batch.values.size();
+ int64_t final_size = 0;
+ for (size_t i = 0; i < num_args; i++) {
+ int64_t element_size = 0;
+ bool valid = true;
+ if (batch[i].is_scalar()) {
+ const auto& scalar = *batch[i].scalar();
+ valid = scalar.is_valid;
+ element_size =
static_cast<int64_t>(UnboxScalar<Type>::Unbox(scalar).size());
+ } else {
+ const auto& array = *batch[i].array();
+ valid = !array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset +
index);
+ const auto offsets = array.GetValues<offset_type>(1);
+ element_size = offsets[index + 1] - offsets[index];
+ }
+ if (!valid) {
+ if (options.skip_nulls) {
+ continue;
+ }
+ return -1;
+ }
+ // Conservative estimation of the element size.
+ final_size = std::max(final_size, element_size);
+ }
+ return final_size;
+ }
+};
+
+template <typename Op>
+struct FixedSizeBinaryScalarMinMax {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ const size_t num_args = batch.values.size();
+
+ const auto batch_type = batch[0].type();
+ const auto binary_type = checked_cast<const
FixedSizeBinaryType*>(batch_type.get());
+ int64_t final_size = CalculateRowSize(options, batch, 0,
binary_type->byte_width());
+ if (final_size < 0) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ string_view result =
+
UnboxScalar<FixedSizeBinaryType>::Unbox(*batch.values.front().scalar());
+ for (size_t i = 1; i < num_args; i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid && options.skip_nulls) {
+ continue;
+ }
+ if (scalar.is_valid) {
+ string_view value = UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
+ result = result.empty() ? value : Op::Call(result, value);
+ }
+ }
+ if (!result.empty()) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
+ uint8_t* buf = output->value->mutable_data();
+ buf = std::copy(result.begin(), result.end(), buf);
+ output->is_valid = true;
+ DCHECK_GE(final_size, buf - output->value->mutable_data());
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ const auto batch_type = batch[0].type();
+ const auto binary_type = checked_cast<const
FixedSizeBinaryType*>(batch_type.get());
+ int32_t byte_width = binary_type->byte_width();
+ // Presize data to avoid reallocations
+ int64_t final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSize(options, batch, i, byte_width);
+ if (size > 0) final_size += size;
+ }
+ FixedSizeBinaryBuilder builder(batch_type);
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(final_size));
+
+ std::vector<string_view> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0;
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
+ num_valid++;
+ } else {
+ valid_cols[col] = string_view();
+ }
+ } else {
+ const auto& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const auto data = array.GetValues<uint8_t>(1,
/*absolute_offset=*/0);
+ valid_cols[col] = string_view(
+ reinterpret_cast<const char*>(data) + row * byte_width,
byte_width);
+ num_valid++;
+ } else {
+ valid_cols[col] = string_view();
+ }
+ }
+ }
+
+ if (num_valid < batch.values.size() && !options.skip_nulls) {
+ // We had some nulls
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ auto result = valid_cols.front();
+ for (size_t col = 1; col < batch.values.size(); ++col) {
+ const auto value = valid_cols[col];
+ if (value.empty()) {
+ DCHECK(options.skip_nulls);
+ continue;
+ }
+ result = result.empty() ? value : Op::Call(result, value);
+ }
+ if (result.empty()) {
+ builder.UnsafeAppendNull();
+ } else {
+ builder.UnsafeAppend(result);
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ return Status::OK();
+ }
+
+ // Compute the length of the output for the given position, or -1 if it
would be null.
+ static int64_t CalculateRowSize(const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, const int64_t index,
+ int32_t byte_width) {
+ const auto num_args = batch.values.size();
+ int32_t final_size = 0;
+ for (size_t i = 0; i < num_args; i++) {
+ bool valid = true;
+ if (batch[i].is_scalar()) {
+ const auto& scalar = *batch[i].scalar();
+ valid = scalar.is_valid;
+ } else {
+ const auto& array = *batch[i].array();
+ valid = !array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset +
index);
+ }
+ if (!valid) {
+ if (options.skip_nulls) {
+ continue;
+ }
+ return -1;
+ }
+ final_size = std::max(final_size, byte_width);
+ }
+ return final_size;
+ }
+};
+
+Result<ValueDescr> ResolveMinOrMaxOutputType(KernelContext*,
+ const std::vector<ValueDescr>&
args) {
+ if (args.empty()) {
+ return null();
+ }
+ auto first_type = args[0].type;
+ for (size_t i = 1; i < args.size(); ++i) {
+ auto type = args[i].type;
+ if (*type != *first_type) {
+ return Status::NotImplemented(
+ "Different decimal types not implemented for {min,
max}_element_wise");
Review comment:
I think the error message is too specific, as you could have fixed-size
binary types with different byte widths as well.
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -439,6 +472,330 @@ struct ScalarMinMax {
}
};
+template <typename Type, typename Op>
+struct BinaryScalarMinMax {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ if (!options.skip_nulls) {
+ // any nulls in the input will produce a null output
+ for (const auto& value : batch.values) {
+ if (!value.scalar()->is_valid) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ }
+ }
+ const auto& first_scalar = *batch.values.front().scalar();
+ string_view result = UnboxScalar<Type>::Unbox(first_scalar);
+ bool valid = first_scalar.is_valid;
+ for (size_t i = 1; i < batch.values.size(); i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid) {
+ DCHECK(options.skip_nulls);
+ continue;
+ } else {
+ string_view value = UnboxScalar<Type>::Unbox(scalar);
+ result = !valid ? value : Op::Call(result, value);
+ valid = true;
+ }
+ }
+ if (valid) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size()));
+ std::copy(result.begin(), result.end(), output->value->mutable_data());
+ output->is_valid = true;
+ } else {
+ output->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations, using an upper bound estimation of
final size.
+ int64_t estimated_final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSizeUpperBound(options, batch, i);
+ if (size > 0) estimated_final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(estimated_final_size));
+
+ std::vector<util::optional<string_view>> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
Review comment:
Please just use `int64_t`.
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -439,6 +472,330 @@ struct ScalarMinMax {
}
};
+template <typename Type, typename Op>
+struct BinaryScalarMinMax {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ if (!options.skip_nulls) {
+ // any nulls in the input will produce a null output
+ for (const auto& value : batch.values) {
+ if (!value.scalar()->is_valid) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ }
+ }
+ const auto& first_scalar = *batch.values.front().scalar();
+ string_view result = UnboxScalar<Type>::Unbox(first_scalar);
+ bool valid = first_scalar.is_valid;
+ for (size_t i = 1; i < batch.values.size(); i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid) {
+ DCHECK(options.skip_nulls);
+ continue;
+ } else {
+ string_view value = UnboxScalar<Type>::Unbox(scalar);
+ result = !valid ? value : Op::Call(result, value);
+ valid = true;
+ }
+ }
+ if (valid) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size()));
+ std::copy(result.begin(), result.end(), output->value->mutable_data());
+ output->is_valid = true;
+ } else {
+ output->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations, using an upper bound estimation of
final size.
+ int64_t estimated_final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSizeUpperBound(options, batch, i);
+ if (size > 0) estimated_final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(estimated_final_size));
+
+ std::vector<util::optional<string_view>> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0;
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
+ num_valid++;
+ } else {
+ valid_cols[col] = util::nullopt;
+ }
+ } else {
+ const auto& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const auto offsets = array.GetValues<offset_type>(1);
+ const auto data = array.GetValues<uint8_t>(2,
/*absolute_offset=*/0);
+ const int64_t length = offsets[row + 1] - offsets[row];
+ valid_cols[col] =
+ string_view(reinterpret_cast<const char*>(data +
offsets[row]), length);
+ num_valid++;
+ } else {
+ valid_cols[col] = util::nullopt;
+ }
+ }
+ }
+
+ if (num_valid == 0 || (num_valid < batch.values.size() &&
!options.skip_nulls)) {
+ // We had some nulls
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ auto result = valid_cols.front();
+ for (size_t col = 1; col < batch.values.size(); ++col) {
+ const auto value = valid_cols[col];
+ if (!value) {
+ DCHECK(options.skip_nulls);
+ continue;
+ }
+ result = !result ? *value : Op::Call(*result, *value);
+ }
+ if (result) {
+ builder.UnsafeAppend(*result);
+ } else {
+ builder.UnsafeAppendNull();
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ DCHECK_GE(estimated_final_size,
+ checked_cast<const
ArrayType&>(*string_array).total_values_length());
+ return Status::OK();
+ }
+
+ // Compute and upper bound for the length of the output for the given
position,
Review comment:
Typo: "and" should be "an" (i.e. "Compute an upper bound for the length ...").
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -439,6 +472,330 @@ struct ScalarMinMax {
}
};
+template <typename Type, typename Op>
+struct BinaryScalarMinMax {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ if (!options.skip_nulls) {
+ // any nulls in the input will produce a null output
+ for (const auto& value : batch.values) {
+ if (!value.scalar()->is_valid) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ }
+ }
+ const auto& first_scalar = *batch.values.front().scalar();
+ string_view result = UnboxScalar<Type>::Unbox(first_scalar);
+ bool valid = first_scalar.is_valid;
+ for (size_t i = 1; i < batch.values.size(); i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid) {
+ DCHECK(options.skip_nulls);
+ continue;
+ } else {
+ string_view value = UnboxScalar<Type>::Unbox(scalar);
+ result = !valid ? value : Op::Call(result, value);
+ valid = true;
+ }
+ }
+ if (valid) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size()));
+ std::copy(result.begin(), result.end(), output->value->mutable_data());
+ output->is_valid = true;
+ } else {
+ output->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations, using an upper bound estimation of
final size.
+ int64_t estimated_final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSizeUpperBound(options, batch, i);
Review comment:
I wonder how desirable it is to bear the cost of going through all the
data a second time instead of simply sizing the output dynamically (you could
presize it using a heuristic that doesn't have to be a strict upper bound, for
example the largest data length among the input arrays). @lidavidm Thoughts?
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -439,6 +472,330 @@ struct ScalarMinMax {
}
};
+template <typename Type, typename Op>
+struct BinaryScalarMinMax {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ if (!options.skip_nulls) {
+ // any nulls in the input will produce a null output
+ for (const auto& value : batch.values) {
+ if (!value.scalar()->is_valid) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ }
+ }
+ const auto& first_scalar = *batch.values.front().scalar();
+ string_view result = UnboxScalar<Type>::Unbox(first_scalar);
+ bool valid = first_scalar.is_valid;
+ for (size_t i = 1; i < batch.values.size(); i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid) {
+ DCHECK(options.skip_nulls);
+ continue;
+ } else {
+ string_view value = UnboxScalar<Type>::Unbox(scalar);
+ result = !valid ? value : Op::Call(result, value);
+ valid = true;
+ }
+ }
+ if (valid) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size()));
+ std::copy(result.begin(), result.end(), output->value->mutable_data());
+ output->is_valid = true;
+ } else {
+ output->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations, using an upper bound estimation of
final size.
+ int64_t estimated_final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSizeUpperBound(options, batch, i);
+ if (size > 0) estimated_final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(estimated_final_size));
+
+ std::vector<util::optional<string_view>> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0;
Review comment:
You could write an internal helper to update the result e.g.:
```c++
util::optional<string_view> result;
auto visit_value = [&](string_view v) {
result = !result ? v : Op::Call(*result, v);
};
for (size_t col = 0; col < batch.values.size(); col++) {
if (batch[col].is_scalar()) {
const auto& scalar = *batch[col].scalar();
if (scalar.is_valid) {
visit_value(UnboxScalar<Type>::Unbox(scalar));
} else if (!options.skip_nulls) {
break;
}
} else {
const auto& array = *batch[col].array();
if (!array.MayHaveNulls() ||
bit_util::GetBit(array.buffers[0]->data(), array.offset +
row)) {
const auto offsets = array.GetValues<offset_type>(1);
const auto data = array.GetValues<uint8_t>(2,
/*absolute_offset=*/0);
const int64_t length = offsets[row + 1] - offsets[row];
visit_value(
string_view(reinterpret_cast<const char*>(data +
offsets[row]), length));
num_valid++;
} else if (!options.skip_nulls) {
break;
}
}
}
if (result) {
builder.UnsafeAppend(*result);
} else {
builder.UnsafeAppendNull();
}
```
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare_test.cc
##########
@@ -1430,6 +1817,262 @@ TYPED_TEST(TestVarArgsCompareParametricTemporal,
MaxElementWise) {
{this->array("[1, null, 3, 4]"), this->array("[2, 2, null,
2]")});
}
+TYPED_TEST(TestVarArgsCompareBinary, MaxElementWise) {
+ this->AssertNullScalar(MaxElementWise, {});
+ this->AssertNullScalar(MaxElementWise, {this->scalar("null"),
this->scalar("null")});
+
+ this->Assert(MaxElementWise, this->scalar(R"("0")"),
{this->scalar(R"("0")")});
+ this->Assert(MaxElementWise, this->scalar(R"("2")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")")});
+ this->Assert(MaxElementWise, this->scalar(R"("2")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MaxElementWise, this->scalar(R"("1")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MaxElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar(R"("")")});
+ this->Assert(MaxElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar("null")});
+ this->Assert(MaxElementWise, this->scalar(R"("2")"),
+ {this->scalar(R"("2")"), this->scalar(R"("")")});
+ this->Assert(MaxElementWise, this->scalar(R"("2")"),
+ {this->scalar(R"("")"), this->scalar(R"("2")")});
+ this->Assert(MaxElementWise, (this->array("[]")), {this->array("[]")});
+ this->Assert(MaxElementWise, this->array(R"(["1", "2", "3", null])"),
+ {this->array(R"(["1", "2", "3", null])")});
+ this->Assert(MaxElementWise, this->array(R"(["1", "2"])"),
+ {this->array(R"(["1", ""])"), this->array(R"(["", "2"])")});
+
+ this->Assert(MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MaxElementWise, this->array(R"(["4", "4", "4", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+
+ this->Assert(MaxElementWise, this->array(R"(["4", "2", null, "6"])"),
+ {this->array(R"(["1", "2", null, null])"),
+ this->array(R"(["4", null, null, "6"])")});
+ this->Assert(MaxElementWise, this->array(R"(["4", "2", null, "6"])"),
+ {this->array(R"(["4", null, null, "6"])"),
+ this->array(R"(["1", "2", null, null])")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array("[null, null, null,
null]")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array("[null, null, null, null]"), this->array(R"(["1", "2", "3",
"4"])")});
+
+ this->Assert(MaxElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->scalar(R"("1")"), this->array(R"(["1", "2", "3",
"4"])")});
+ this->Assert(MaxElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MaxElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array("[null, null, null,
null]")});
+
+ this->Assert(MaxElementWise, this->scalar(R"("ab")"),
{this->scalar(R"("ab")")});
+ this->Assert(
+ MaxElementWise, this->scalar(R"("c")"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")")});
+ this->Assert(MaxElementWise, this->scalar(R"("c")"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")"),
+ this->scalar("null")});
+ this->Assert(MaxElementWise, this->scalar(R"("aa")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("aa")"),
+ this->scalar("null")});
+
+ this->Assert(MaxElementWise, this->array(R"(["aaa", "b", "cc", null])"),
+ {this->array(R"(["aaa", "b", "cc", null])")});
+ this->Assert(MaxElementWise, this->array(R"(["bb", "bb", "cc", "dddd"])"),
+ {this->array(R"(["aaa", "bb", "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MaxElementWise, this->array(R"(["bb", "bb", "cc", "dddd"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MaxElementWise, this->array(R"(["dddd", "dddd", "dddd",
"dddd"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")"),
+ this->scalar(R"("dddd")")});
+ this->Assert(MaxElementWise, this->array(R"(["bb", "bb", "cc", "dddd"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar("null"),
+ this->scalar(R"("bb")")});
+
+ this->Assert(MaxElementWise, this->array(R"(["gg", "bar", "h", "iii"])"),
+ {this->array(R"([null, "a", "bb", "cccc"])"),
+ this->array(R"(["gg", null, "h", "iii"])"),
+ this->array(R"(["foo", "bar", null, "bb"])")});
+
+ // Test null handling
+ this->element_wise_aggregate_options_.skip_nulls = false;
+ this->AssertNullScalar(MaxElementWise, {this->scalar("null"),
this->scalar("null")});
+ this->AssertNullScalar(MaxElementWise, {this->scalar(R"("0")"),
this->scalar("null")});
+
+ this->Assert(MaxElementWise, this->array(R"(["4", null, "4", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", null, "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+
+ this->Assert(MaxElementWise, this->scalar("null"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")"),
+ this->scalar("null")});
+ this->Assert(MaxElementWise, this->scalar("null"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("aa")"),
+ this->scalar("null")});
+
+ this->Assert(MaxElementWise, this->array(R"(["bb", null, "cc", "dddd"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MaxElementWise, this->array(R"(["dddd", null, "dddd",
"dddd"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")"),
+ this->scalar(R"("dddd")")});
+ this->Assert(MaxElementWise, this->array(R"([null, null, null, null])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar("null"),
+ this->scalar(R"("bb")")});
+
+ this->Assert(MaxElementWise, this->array(R"([null, null, null, "iii"])"),
+ {this->array(R"([null, "a", "bb", "cccc"])"),
+ this->array(R"(["gg", null, "h", "iii"])"),
+ this->array(R"(["foo", "bar", null, "bb"])")});
+}
+
+TYPED_TEST(TestVarArgsCompareFixedSizeBinary, MaxElementWise) {
+ this->AssertNullScalar(MaxElementWise, {});
+ this->AssertNullScalar(MaxElementWise, {this->scalar("null"),
this->scalar("null")});
+
+ this->Assert(MaxElementWise, this->scalar(R"("0")"),
{this->scalar(R"("0")")});
+ this->Assert(MaxElementWise, this->scalar(R"("2")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")")});
+ this->Assert(MaxElementWise, this->scalar(R"("2")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MaxElementWise, this->scalar(R"("1")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+
+ this->Assert(MaxElementWise, (this->array("[]")), {this->array("[]")});
+ this->Assert(MaxElementWise, this->array(R"(["1", "2", "3", null])"),
+ {this->array(R"(["1", "2", "3", null])")});
+
+ this->Assert(MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MaxElementWise, this->array(R"(["4", "4", "4", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", "2", "3", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+
+ this->Assert(MaxElementWise, this->array(R"(["4", "2", null, "6"])"),
+ {this->array(R"(["1", "2", null, null])"),
+ this->array(R"(["4", null, null, "6"])")});
+ this->Assert(MaxElementWise, this->array(R"(["4", "2", null, "6"])"),
+ {this->array(R"(["4", null, null, "6"])"),
+ this->array(R"(["1", "2", null, null])")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array("[null, null, null,
null]")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array("[null, null, null, null]"), this->array(R"(["1", "2", "3",
"4"])")});
+
+ this->Assert(MaxElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->scalar(R"("1")"), this->array(R"(["1", "2", "3",
"4"])")});
+ this->Assert(MaxElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MaxElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array("[null, null, null,
null]")});
+
+ this->Assert(MaxElementWise,
+ this->array(R"(["abc", "abd", "abd", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", "abc", "abd", null, "abc"])",
/*byte_width=*/3),
+ this->array(R"(["abc", "abd", "abc", "abc", null])",
/*byte_width=*/3)});
+ this->Assert(MaxElementWise, this->scalar(R"("abe")", /*byte_width=*/3),
+ {this->scalar(R"("abe")", /*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3),
+ this->scalar(R"("abd")", /*byte_width=*/3)});
+
+ this->Assert(MaxElementWise,
+ this->array(R"(["abc", "abc", "abd", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", "abc", "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3)});
+ this->Assert(MaxElementWise,
+ this->array(R"(["abc", "abc", "abd", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", null, "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3)});
+ this->Assert(MaxElementWise,
+ this->array(R"(["abd", "abd", "abd", "abd", "abd"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", null, "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3),
+ this->scalar(R"("abd")", /*byte_width=*/3)});
+ this->Assert(MaxElementWise,
+ this->array(R"(["abc", "abc", "abd", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", null, "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar("null", /*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3)});
+
+ // Test null handling
+ this->element_wise_aggregate_options_.skip_nulls = false;
+ this->AssertNullScalar(MaxElementWise, {this->scalar("null"),
this->scalar("null")});
+ this->AssertNullScalar(MaxElementWise, {this->scalar(R"("0")"),
this->scalar("null")});
+
+ this->Assert(MaxElementWise, this->array(R"(["4", null, "4", "4"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+ this->Assert(
+ MaxElementWise, this->array(R"(["2", null, "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MaxElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+
+ this->Assert(MaxElementWise,
+ this->array(R"(["abc", "abd", "abd", null, null])",
/*byte_width=*/3),
+ {this->array(R"(["abc", "abc", "abd", null, "abc"])",
/*byte_width=*/3),
+ this->array(R"(["abc", "abd", "abc", "abc", null])",
/*byte_width=*/3)});
+}
+
TEST(TestMaxElementWiseMinElementWise, CommonTemporal) {
Review comment:
Can you add a test somewhere that checks that an error is returned if the input
types are different (e.g. different decimal types)?
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -439,6 +472,330 @@ struct ScalarMinMax {
}
};
+template <typename Type, typename Op>
+struct BinaryScalarMinMax {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ if (!options.skip_nulls) {
+ // any nulls in the input will produce a null output
+ for (const auto& value : batch.values) {
+ if (!value.scalar()->is_valid) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ }
+ }
+ const auto& first_scalar = *batch.values.front().scalar();
+ string_view result = UnboxScalar<Type>::Unbox(first_scalar);
+ bool valid = first_scalar.is_valid;
+ for (size_t i = 1; i < batch.values.size(); i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid) {
+ DCHECK(options.skip_nulls);
+ continue;
+ } else {
+ string_view value = UnboxScalar<Type>::Unbox(scalar);
+ result = !valid ? value : Op::Call(result, value);
+ valid = true;
+ }
+ }
+ if (valid) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size()));
+ std::copy(result.begin(), result.end(), output->value->mutable_data());
+ output->is_valid = true;
+ } else {
+ output->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations, using an upper bound estimation of
final size.
+ int64_t estimated_final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSizeUpperBound(options, batch, i);
+ if (size > 0) estimated_final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(estimated_final_size));
+
+ std::vector<util::optional<string_view>> valid_cols(batch.values.size());
Review comment:
I'm not sure why you're going through this intermediate vector. You can
probably do without it.
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare_test.cc
##########
@@ -1310,6 +1392,262 @@ TYPED_TEST(TestVarArgsCompareParametricTemporal,
MinElementWise) {
{this->array("[1, null, 3, 4]"), this->array("[2, 2, null,
2]")});
}
+TYPED_TEST(TestVarArgsCompareBinary, MinElementWise) {
+ this->AssertNullScalar(MinElementWise, {});
+ this->AssertNullScalar(MinElementWise, {this->scalar("null"),
this->scalar("null")});
+
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
{this->scalar(R"("0")")});
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")")});
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("1")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar(R"("")")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("2")"), this->scalar(R"("")")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar(R"("2")")});
+ this->Assert(MinElementWise, (this->array("[]")), {this->array("[]")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "3", null])"),
+ {this->array(R"(["1", "2", "3", null])")});
+ this->Assert(MinElementWise, this->array(R"(["", ""])"),
+ {this->array(R"(["1", ""])"), this->array(R"(["", "2"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "2", null, "6"])"),
+ {this->array(R"(["1", "2", null, null])"),
+ this->array(R"(["4", null, null, "6"])")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", null, "6"])"),
+ {this->array(R"(["4", null, null, "6"])"),
+ this->array(R"(["1", "2", null, null])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array("[null, null, null,
null]")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array("[null, null, null, null]"), this->array(R"(["1", "2", "3",
"4"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array(R"(["1", "2", "3",
"4"])")});
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array("[null, null, null,
null]")});
+
+ this->Assert(MinElementWise, this->scalar(R"("ab")"),
{this->scalar(R"("ab")")});
+ this->Assert(
+ MinElementWise, this->scalar(R"("aaa")"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")")});
+ this->Assert(MinElementWise, this->scalar(R"("aaa")"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("aa")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("aa")"),
+ this->scalar("null")});
+
+ this->Assert(MinElementWise, this->array(R"(["aaa", "b", "cc", null])"),
+ {this->array(R"(["aaa", "b", "cc", null])")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", "bb", "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")"),
+ this->scalar(R"("dddd")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar("null"),
+ this->scalar(R"("bb")")});
+
+ this->Assert(MinElementWise, this->array(R"(["foo", "a", "bb", "bb"])"),
+ {this->array(R"([null, "a", "bb", "cccc"])"),
+ this->array(R"(["gg", null, "h", "iii"])"),
+ this->array(R"(["foo", "bar", null, "bb"])")});
+
+ // Test null handling
+ this->element_wise_aggregate_options_.skip_nulls = false;
+ this->AssertNullScalar(MinElementWise, {this->scalar("null"),
this->scalar("null")});
+ this->AssertNullScalar(MinElementWise, {this->scalar(R"("0")"),
this->scalar("null")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", null, "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", null, "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+
+ this->Assert(MinElementWise, this->scalar("null"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar("null"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("aa")"),
+ this->scalar("null")});
+
+ this->Assert(MinElementWise, this->array(R"(["aaa", null, "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", null, "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")"),
+ this->scalar(R"("dddd")")});
+ this->Assert(MinElementWise, this->array(R"([null, null, null, null])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar("null"),
+ this->scalar(R"("bb")")});
+
+ this->Assert(MinElementWise, this->array(R"([null, null, null, "bb"])"),
+ {this->array(R"([null, "a", "bb", "cccc"])"),
+ this->array(R"(["gg", null, "h", "iii"])"),
+ this->array(R"(["foo", "bar", null, "bb"])")});
Review comment:
Is there a reason for having so many hand-written tests? Or can this be
shrunk a bit?
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare_test.cc
##########
@@ -1310,6 +1392,262 @@ TYPED_TEST(TestVarArgsCompareParametricTemporal,
MinElementWise) {
{this->array("[1, null, 3, 4]"), this->array("[2, 2, null,
2]")});
}
+TYPED_TEST(TestVarArgsCompareBinary, MinElementWise) {
+ this->AssertNullScalar(MinElementWise, {});
+ this->AssertNullScalar(MinElementWise, {this->scalar("null"),
this->scalar("null")});
+
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
{this->scalar(R"("0")")});
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")")});
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("1")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar(R"("")")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("2")"), this->scalar(R"("")")});
+ this->Assert(MinElementWise, this->scalar(R"("")"),
+ {this->scalar(R"("")"), this->scalar(R"("2")")});
+ this->Assert(MinElementWise, (this->array("[]")), {this->array("[]")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "3", null])"),
+ {this->array(R"(["1", "2", "3", null])")});
+ this->Assert(MinElementWise, this->array(R"(["", ""])"),
+ {this->array(R"(["1", ""])"), this->array(R"(["", "2"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "2", null, "6"])"),
+ {this->array(R"(["1", "2", null, null])"),
+ this->array(R"(["4", null, null, "6"])")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", null, "6"])"),
+ {this->array(R"(["4", null, null, "6"])"),
+ this->array(R"(["1", "2", null, null])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array("[null, null, null,
null]")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array("[null, null, null, null]"), this->array(R"(["1", "2", "3",
"4"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array(R"(["1", "2", "3",
"4"])")});
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array("[null, null, null,
null]")});
+
+ this->Assert(MinElementWise, this->scalar(R"("ab")"),
{this->scalar(R"("ab")")});
+ this->Assert(
+ MinElementWise, this->scalar(R"("aaa")"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")")});
+ this->Assert(MinElementWise, this->scalar(R"("aaa")"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("aa")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("aa")"),
+ this->scalar("null")});
+
+ this->Assert(MinElementWise, this->array(R"(["aaa", "b", "cc", null])"),
+ {this->array(R"(["aaa", "b", "cc", null])")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", "bb", "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")"),
+ this->scalar(R"("dddd")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", "bb", "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar("null"),
+ this->scalar(R"("bb")")});
+
+ this->Assert(MinElementWise, this->array(R"(["foo", "a", "bb", "bb"])"),
+ {this->array(R"([null, "a", "bb", "cccc"])"),
+ this->array(R"(["gg", null, "h", "iii"])"),
+ this->array(R"(["foo", "bar", null, "bb"])")});
+
+ // Test null handling
+ this->element_wise_aggregate_options_.skip_nulls = false;
+ this->AssertNullScalar(MinElementWise, {this->scalar("null"),
this->scalar("null")});
+ this->AssertNullScalar(MinElementWise, {this->scalar(R"("0")"),
this->scalar("null")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", null, "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", null, "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+
+ this->Assert(MinElementWise, this->scalar("null"),
+ {this->scalar(R"("bb")"), this->scalar(R"("aaa")"),
this->scalar(R"("c")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar("null"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("aa")"),
+ this->scalar("null")});
+
+ this->Assert(MinElementWise, this->array(R"(["aaa", null, "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")")});
+ this->Assert(MinElementWise, this->array(R"(["aaa", null, "bb", "bb"])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar(R"("bb")"),
+ this->scalar(R"("dddd")")});
+ this->Assert(MinElementWise, this->array(R"([null, null, null, null])"),
+ {this->array(R"(["aaa", null, "cc", "dddd"])"),
this->scalar("null"),
+ this->scalar(R"("bb")")});
+
+ this->Assert(MinElementWise, this->array(R"([null, null, null, "bb"])"),
+ {this->array(R"([null, "a", "bb", "cccc"])"),
+ this->array(R"(["gg", null, "h", "iii"])"),
+ this->array(R"(["foo", "bar", null, "bb"])")});
+}
+
+TYPED_TEST(TestVarArgsCompareFixedSizeBinary, MinElementWise) {
+ this->AssertNullScalar(MinElementWise, {});
+ this->AssertNullScalar(MinElementWise, {this->scalar("null"),
this->scalar("null")});
+
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
{this->scalar(R"("0")")});
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")")});
+ this->Assert(MinElementWise, this->scalar(R"("0")"),
+ {this->scalar(R"("2")"), this->scalar(R"("0")"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+ this->Assert(MinElementWise, this->scalar(R"("1")"),
+ {this->scalar("null"), this->scalar("null"),
this->scalar(R"("1")"),
+ this->scalar("null")});
+
+ this->Assert(MinElementWise, (this->array("[]")), {this->array("[]")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "3", null])"),
+ {this->array(R"(["1", "2", "3", null])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"),
this->scalar(R"("2")"),
+ this->scalar(R"("4")")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->scalar("null"),
+ this->scalar(R"("2")")});
+
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array(R"(["2", null, "2",
"2"])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "2", "2"])"),
+ {this->array(R"(["1", null, "3", "4"])"), this->array(R"(["2", "2", "2",
"2"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "2", null, "6"])"),
+ {this->array(R"(["1", "2", null, null])"),
+ this->array(R"(["4", null, null, "6"])")});
+ this->Assert(MinElementWise, this->array(R"(["1", "2", null, "6"])"),
+ {this->array(R"(["4", null, null, "6"])"),
+ this->array(R"(["1", "2", null, null])")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array(R"(["1", "2", "3", "4"])"), this->array("[null, null, null,
null]")});
+ this->Assert(
+ MinElementWise, this->array(R"(["1", "2", "3", "4"])"),
+ {this->array("[null, null, null, null]"), this->array(R"(["1", "2", "3",
"4"])")});
+
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array(R"(["1", "2", "3",
"4"])")});
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar(R"("1")"), this->array("[null, null, null,
null]")});
+ this->Assert(MinElementWise, this->array(R"(["1", "1", "1", "1"])"),
+ {this->scalar("null"), this->array(R"(["1", "1", "1", "1"])")});
+ this->Assert(MinElementWise, this->array("[null, null, null, null]"),
+ {this->scalar("null"), this->array("[null, null, null,
null]")});
+
+ this->Assert(MinElementWise,
+ this->array(R"(["abc", "abc", "abc", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", "abc", "abd", null, "abc"])",
/*byte_width=*/3),
+ this->array(R"(["abc", "abd", "abc", "abc", null])",
/*byte_width=*/3)});
+ this->Assert(MinElementWise, this->scalar(R"("abc")", /*byte_width=*/3),
+ {this->scalar(R"("abe")", /*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3),
+ this->scalar(R"("abd")", /*byte_width=*/3)});
+
+ this->Assert(MinElementWise,
+ this->array(R"(["abc", "abc", "abc", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", "abc", "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3)});
+ this->Assert(MinElementWise,
+ this->array(R"(["abc", "abc", "abc", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", null, "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3)});
+ this->Assert(MinElementWise,
+ this->array(R"(["abc", "abc", "abc", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", null, "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3),
+ this->scalar(R"("abd")", /*byte_width=*/3)});
+ this->Assert(MinElementWise,
+ this->array(R"(["abc", "abc", "abc", "abc", "abc"])",
/*byte_width=*/3),
+ {this->array(R"(["abc", null, "abd", null, "abc"])",
/*byte_width=*/3),
+ this->scalar("null", /*byte_width=*/3),
+ this->scalar(R"("abc")", /*byte_width=*/3)});
Review comment:
I don't think it makes sense to test with several byte widths. Just use
a single one (e.g. 3) and you'll be able to cut down on the number of
hand-written tests.
##########
File path: cpp/src/arrow/compute/kernels/scalar_compare.cc
##########
@@ -439,6 +472,330 @@ struct ScalarMinMax {
}
};
+template <typename Type, typename Op>
+struct BinaryScalarMinMax {
+ using ArrayType = typename TypeTraits<Type>::ArrayType;
+ using BuilderType = typename TypeTraits<Type>::BuilderType;
+ using offset_type = typename Type::offset_type;
+
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ if (!options.skip_nulls) {
+ // any nulls in the input will produce a null output
+ for (const auto& value : batch.values) {
+ if (!value.scalar()->is_valid) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ }
+ }
+ const auto& first_scalar = *batch.values.front().scalar();
+ string_view result = UnboxScalar<Type>::Unbox(first_scalar);
+ bool valid = first_scalar.is_valid;
+ for (size_t i = 1; i < batch.values.size(); i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid) {
+ DCHECK(options.skip_nulls);
+ continue;
+ } else {
+ string_view value = UnboxScalar<Type>::Unbox(scalar);
+ result = !valid ? value : Op::Call(result, value);
+ valid = true;
+ }
+ }
+ if (valid) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(result.size()));
+ std::copy(result.begin(), result.end(), output->value->mutable_data());
+ output->is_valid = true;
+ } else {
+ output->is_valid = false;
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ // Presize data to avoid reallocations, using an upper bound estimation of
final size.
+ int64_t estimated_final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSizeUpperBound(options, batch, i);
+ if (size > 0) estimated_final_size += size;
+ }
+ BuilderType builder(ctx->memory_pool());
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(estimated_final_size));
+
+ std::vector<util::optional<string_view>> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0;
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<Type>::Unbox(scalar);
+ num_valid++;
+ } else {
+ valid_cols[col] = util::nullopt;
+ }
+ } else {
+ const auto& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const auto offsets = array.GetValues<offset_type>(1);
+ const auto data = array.GetValues<uint8_t>(2,
/*absolute_offset=*/0);
+ const int64_t length = offsets[row + 1] - offsets[row];
+ valid_cols[col] =
+ string_view(reinterpret_cast<const char*>(data +
offsets[row]), length);
+ num_valid++;
+ } else {
+ valid_cols[col] = util::nullopt;
+ }
+ }
+ }
+
+ if (num_valid == 0 || (num_valid < batch.values.size() &&
!options.skip_nulls)) {
+ // We had some nulls
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ auto result = valid_cols.front();
+ for (size_t col = 1; col < batch.values.size(); ++col) {
+ const auto value = valid_cols[col];
+ if (!value) {
+ DCHECK(options.skip_nulls);
+ continue;
+ }
+ result = !result ? *value : Op::Call(*result, *value);
+ }
+ if (result) {
+ builder.UnsafeAppend(*result);
+ } else {
+ builder.UnsafeAppendNull();
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ DCHECK_GE(estimated_final_size,
+ checked_cast<const
ArrayType&>(*string_array).total_values_length());
+ return Status::OK();
+ }
+
+ // Compute an upper bound for the length of the output for the given
position,
+ // or -1 if it would be null.
+ static int64_t CalculateRowSizeUpperBound(const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, const
int64_t index) {
+ const auto num_args = batch.values.size();
+ int64_t final_size = 0;
+ for (size_t i = 0; i < num_args; i++) {
+ int64_t element_size = 0;
+ bool valid = true;
+ if (batch[i].is_scalar()) {
+ const auto& scalar = *batch[i].scalar();
+ valid = scalar.is_valid;
+ element_size =
static_cast<int64_t>(UnboxScalar<Type>::Unbox(scalar).size());
+ } else {
+ const auto& array = *batch[i].array();
+ valid = !array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset +
index);
+ const auto offsets = array.GetValues<offset_type>(1);
+ element_size = offsets[index + 1] - offsets[index];
+ }
+ if (!valid) {
+ if (options.skip_nulls) {
+ continue;
+ }
+ return -1;
+ }
+ // Conservative estimation of the element size.
+ final_size = std::max(final_size, element_size);
+ }
+ return final_size;
+ }
+};
+
+template <typename Op>
+struct FixedSizeBinaryScalarMinMax {
+ static Status Exec(KernelContext* ctx, const ExecBatch& batch, Datum* out) {
+ const ElementWiseAggregateOptions& options = MinMaxState::Get(ctx);
+ if (std::all_of(batch.values.begin(), batch.values.end(),
+ [](const Datum& d) { return d.is_scalar(); })) {
+ return ExecOnlyScalar(ctx, options, batch, out);
+ }
+ return ExecContainingArrays(ctx, options, batch, out);
+ }
+
+ static Status ExecOnlyScalar(KernelContext* ctx,
+ const ElementWiseAggregateOptions& options,
+ const ExecBatch& batch, Datum* out) {
+ if (batch.values.empty()) {
+ return Status::OK();
+ }
+ auto output = checked_cast<BaseBinaryScalar*>(out->scalar().get());
+ const size_t num_args = batch.values.size();
+
+ const auto batch_type = batch[0].type();
+ const auto binary_type = checked_cast<const
FixedSizeBinaryType*>(batch_type.get());
+ int64_t final_size = CalculateRowSize(options, batch, 0,
binary_type->byte_width());
+ if (final_size < 0) {
+ output->is_valid = false;
+ return Status::OK();
+ }
+ string_view result =
+
UnboxScalar<FixedSizeBinaryType>::Unbox(*batch.values.front().scalar());
+ for (size_t i = 1; i < num_args; i++) {
+ const auto& scalar = *batch[i].scalar();
+ if (!scalar.is_valid && options.skip_nulls) {
+ continue;
+ }
+ if (scalar.is_valid) {
+ string_view value = UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
+ result = result.empty() ? value : Op::Call(result, value);
+ }
+ }
+ if (!result.empty()) {
+ ARROW_ASSIGN_OR_RAISE(output->value, ctx->Allocate(final_size));
+ uint8_t* buf = output->value->mutable_data();
+ buf = std::copy(result.begin(), result.end(), buf);
+ output->is_valid = true;
+ DCHECK_GE(final_size, buf - output->value->mutable_data());
+ }
+ return Status::OK();
+ }
+
+ static Status ExecContainingArrays(KernelContext* ctx,
+ const ElementWiseAggregateOptions&
options,
+ const ExecBatch& batch, Datum* out) {
+ const auto batch_type = batch[0].type();
+ const auto binary_type = checked_cast<const
FixedSizeBinaryType*>(batch_type.get());
+ int32_t byte_width = binary_type->byte_width();
+ // Presize data to avoid reallocations
+ int64_t final_size = 0;
+ for (int64_t i = 0; i < batch.length; i++) {
+ auto size = CalculateRowSize(options, batch, i, byte_width);
+ if (size > 0) final_size += size;
+ }
+ FixedSizeBinaryBuilder builder(batch_type);
+ RETURN_NOT_OK(builder.Reserve(batch.length));
+ RETURN_NOT_OK(builder.ReserveData(final_size));
+
+ std::vector<string_view> valid_cols(batch.values.size());
+ for (size_t row = 0; row < static_cast<size_t>(batch.length); row++) {
+ size_t num_valid = 0;
+ for (size_t col = 0; col < batch.values.size(); col++) {
+ if (batch[col].is_scalar()) {
+ const auto& scalar = *batch[col].scalar();
+ if (scalar.is_valid) {
+ valid_cols[col] = UnboxScalar<FixedSizeBinaryType>::Unbox(scalar);
+ num_valid++;
+ } else {
+ valid_cols[col] = string_view();
+ }
+ } else {
+ const auto& array = *batch[col].array();
+ if (!array.MayHaveNulls() ||
+ bit_util::GetBit(array.buffers[0]->data(), array.offset + row)) {
+ const auto data = array.GetValues<uint8_t>(1,
/*absolute_offset=*/0);
+ valid_cols[col] = string_view(
+ reinterpret_cast<const char*>(data) + row * byte_width,
byte_width);
+ num_valid++;
+ } else {
+ valid_cols[col] = string_view();
+ }
+ }
+ }
+
+ if (num_valid < batch.values.size() && !options.skip_nulls) {
+ // We had some nulls
+ builder.UnsafeAppendNull();
+ continue;
+ }
+ auto result = valid_cols.front();
+ for (size_t col = 1; col < batch.values.size(); ++col) {
+ const auto value = valid_cols[col];
+ if (value.empty()) {
+ DCHECK(options.skip_nulls);
+ continue;
+ }
+ result = result.empty() ? value : Op::Call(result, value);
+ }
+ if (result.empty()) {
+ builder.UnsafeAppendNull();
+ } else {
+ builder.UnsafeAppend(result);
+ }
+ }
+
+ std::shared_ptr<Array> string_array;
+ RETURN_NOT_OK(builder.Finish(&string_array));
+ *out = *string_array->data();
+ out->mutable_array()->type = batch[0].type();
+ DCHECK_EQ(batch.length, out->array()->length);
+ return Status::OK();
+ }
+
+ // Compute the length of the output for the given position, or -1 if it
would be null.
+ static int64_t CalculateRowSize(const ElementWiseAggregateOptions& options,
Review comment:
Why do you need this function? The input types should all have the same
width (it is ensured by `ResolveMinOrMaxOutputType`).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]