pitrou commented on code in PR #49375:
URL: https://github.com/apache/arrow/pull/49375#discussion_r2878855237
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,93 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+Result<std::shared_ptr<Array>> MakeAllocatedVarBinaryArray(
+ const std::shared_ptr<DataType>& type, int64_t data_length) {
+ ARROW_ASSIGN_OR_RAISE(auto values, AllocateBuffer(data_length));
+ if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+ if (data_length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("data_length exceeds int32 offset range");
+ }
+ auto offsets =
+ Buffer::FromVector(std::vector<int32_t>{0,
static_cast<int32_t>(data_length)});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+ }
+ if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+ return Status::TypeError("unsupported var-binary type for helper: ",
*type);
+ }
+ auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+TEST_F(TestIfElseKernel, IfElseStringAndLargeStringAAAOverflowBehavior) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+
+ auto check = [&](const std::shared_ptr<DataType>& type, bool
expect_capacity_error) {
+ ASSERT_OK_AND_ASSIGN(auto left, MakeAllocatedVarBinaryArray(type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right, MakeAllocatedVarBinaryArray(type,
kPerSide));
+
+ auto maybe_out = CallFunction("if_else", {cond, left, right});
+ if (expect_capacity_error) {
+ ASSERT_TRUE(maybe_out.status().IsCapacityError()) << maybe_out.status();
+ ASSERT_THAT(
+ maybe_out.status().message(),
+ ::testing::HasSubstr("Result may exceed offset capacity for this
type"));
+ } else {
+ ASSERT_OK(maybe_out.status()) << maybe_out.status();
+ }
+ };
+
+ check(TypeTraits<StringType>::type_singleton(), true);
Review Comment:
`TypeTraits<StringType>::type_singleton()` is simply `utf8()`.
##########
cpp/src/arrow/compute/kernels/scalar_if_else.cc:
##########
@@ -817,13 +839,15 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
static Status Call(KernelContext* ctx, const ArraySpan& cond, const Scalar&
left,
const Scalar& right, ExecResult* out) {
std::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
- auto left_size = static_cast<OffsetType>(left_data.size());
std::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
- auto right_size = static_cast<OffsetType>(right_data.size());
Review Comment:
Same here too :)
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,93 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+Result<std::shared_ptr<Array>> MakeAllocatedVarBinaryArray(
+ const std::shared_ptr<DataType>& type, int64_t data_length) {
+ ARROW_ASSIGN_OR_RAISE(auto values, AllocateBuffer(data_length));
+ if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+ if (data_length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("data_length exceeds int32 offset range");
+ }
+ auto offsets =
+ Buffer::FromVector(std::vector<int32_t>{0,
static_cast<int32_t>(data_length)});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+ }
+ if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+ return Status::TypeError("unsupported var-binary type for helper: ",
*type);
+ }
+ auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+TEST_F(TestIfElseKernel, IfElseStringAndLargeStringAAAOverflowBehavior) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+
+ auto check = [&](const std::shared_ptr<DataType>& type, bool
expect_capacity_error) {
+ ASSERT_OK_AND_ASSIGN(auto left, MakeAllocatedVarBinaryArray(type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right, MakeAllocatedVarBinaryArray(type,
kPerSide));
+
+ auto maybe_out = CallFunction("if_else", {cond, left, right});
+ if (expect_capacity_error) {
+ ASSERT_TRUE(maybe_out.status().IsCapacityError()) << maybe_out.status();
+ ASSERT_THAT(
+ maybe_out.status().message(),
+ ::testing::HasSubstr("Result may exceed offset capacity for this
type"));
+ } else {
+ ASSERT_OK(maybe_out.status()) << maybe_out.status();
+ }
+ };
+
+ check(TypeTraits<StringType>::type_singleton(), true);
+ check(TypeTraits<LargeStringType>::type_singleton(), false);
+}
+
+TEST_F(TestIfElseKernel, IfElseBinaryCapacityErrorASA) {
+ constexpr int32_t capacity_limit = std::numeric_limits<int32_t>::max();
+
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+ auto left = Datum(std::make_shared<BinaryScalar>("x"));
+ ASSERT_OK_AND_ASSIGN(
+ auto right,
MakeAllocatedVarBinaryArray(TypeTraits<StringType>::type_singleton(),
+ capacity_limit));
Review Comment:
Should also check large_string here.
##########
cpp/src/arrow/compute/kernels/scalar_if_else.cc:
##########
@@ -745,18 +760,21 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
ctx->Allocate(offset_length));
std::memcpy(out_data->buffers[1]->mutable_data(), right_offsets,
offset_length);
- auto right_data_length = right_offsets[right.length] - right_offsets[0];
+ int64_t right_data_length =
static_cast<int64_t>(right_offsets[right.length]) -
+ static_cast<int64_t>(right_offsets[0]);
ARROW_ASSIGN_OR_RAISE(out_data->buffers[2],
ctx->Allocate(right_data_length));
std::memcpy(out_data->buffers[2]->mutable_data(), right_data,
right_data_length);
return Status::OK();
}
std::string_view left_data = internal::UnboxScalar<Type>::Unbox(left);
- auto left_size = static_cast<OffsetType>(left_data.size());
Review Comment:
I think you could have kept this here, as `right` is already known to
satisfy the max capacity (the scalar is valid for the given type).
##########
cpp/src/arrow/compute/kernels/scalar_if_else.cc:
##########
@@ -785,18 +803,22 @@ struct IfElseFunctor<Type, enable_if_base_binary<Type>> {
ARROW_ASSIGN_OR_RAISE(out_data->buffers[1],
ctx->Allocate(offset_length));
std::memcpy(out_data->buffers[1]->mutable_data(), left_offsets,
offset_length);
- auto left_data_length = left_offsets[left.length] - left_offsets[0];
+ int64_t left_data_length =
static_cast<int64_t>(left_offsets[left.length]) -
+ static_cast<int64_t>(left_offsets[0]);
+ ARROW_RETURN_NOT_OK(ValidateCapacityForOffsetType(left_data_length));
ARROW_ASSIGN_OR_RAISE(out_data->buffers[2],
ctx->Allocate(left_data_length));
std::memcpy(out_data->buffers[2]->mutable_data(), left_data,
left_data_length);
return Status::OK();
}
std::string_view right_data = internal::UnboxScalar<Type>::Unbox(right);
- auto right_size = static_cast<OffsetType>(right_data.size());
Review Comment:
Same here.
##########
cpp/src/arrow/compute/kernels/scalar_if_else_test.cc:
##########
@@ -608,6 +609,93 @@ TYPED_TEST(TestIfElseBaseBinary, IfElseBaseBinaryRand) {
CheckIfElseOutput(cond, left, right, expected_data);
}
+Result<std::shared_ptr<Array>> MakeAllocatedVarBinaryArray(
+ const std::shared_ptr<DataType>& type, int64_t data_length) {
+ ARROW_ASSIGN_OR_RAISE(auto values, AllocateBuffer(data_length));
+ if (type->id() == Type::STRING || type->id() == Type::BINARY) {
+ if (data_length > std::numeric_limits<int32_t>::max()) {
+ return Status::Invalid("data_length exceeds int32 offset range");
+ }
+ auto offsets =
+ Buffer::FromVector(std::vector<int32_t>{0,
static_cast<int32_t>(data_length)});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+ }
+ if (type->id() != Type::LARGE_STRING && type->id() != Type::LARGE_BINARY) {
+ return Status::TypeError("unsupported var-binary type for helper: ",
*type);
+ }
+ auto offsets = Buffer::FromVector(std::vector<int64_t>{0, data_length});
+ return MakeArray(
+ ArrayData::Make(type, 1, {nullptr, std::move(offsets),
std::move(values)}));
+}
+
+TEST_F(TestIfElseKernel, IfElseStringAndLargeStringAAAOverflowBehavior) {
+ constexpr int64_t kPerSide =
+ static_cast<int64_t>(std::numeric_limits<int32_t>::max() / 2) + 4096;
+ auto cond = ArrayFromJSON(boolean(), "[true]");
+
+ auto check = [&](const std::shared_ptr<DataType>& type, bool
expect_capacity_error) {
+ ASSERT_OK_AND_ASSIGN(auto left, MakeAllocatedVarBinaryArray(type,
kPerSide));
+ ASSERT_OK_AND_ASSIGN(auto right, MakeAllocatedVarBinaryArray(type,
kPerSide));
+
+ auto maybe_out = CallFunction("if_else", {cond, left, right});
+ if (expect_capacity_error) {
+ ASSERT_TRUE(maybe_out.status().IsCapacityError()) << maybe_out.status();
+ ASSERT_THAT(
+ maybe_out.status().message(),
+ ::testing::HasSubstr("Result may exceed offset capacity for this
type"));
+ } else {
+ ASSERT_OK(maybe_out.status()) << maybe_out.status();
+ }
Review Comment:
I think you should factor out this part into a common helper than can also
be used in the scalar tests below.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]