This is an automated email from the ASF dual-hosted git repository.
felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new a1376879ce GH-43185: [C++] Suggest a cast when Concatenate fails due
to offsets overflow (#43190)
a1376879ce is described below
commit a1376879ced6c0bc14dcc1e27c0c23c6ad1554a9
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Wed Jul 17 13:39:22 2024 -0300
GH-43185: [C++] Suggest a cast when Concatenate fails due to offsets
overflow (#43190)
## Rationale for this change
When arrays using 32-bit offsets into data buffers are concatenated and the
data buffers of the results grow beyond 2GB, `Concatenate` returns a bad
`Status` with a very simple message:
`"offset overflow while concatenating arrays"`
The contract that `Concatenate` honors is very simple: arrays of input type
T lead to output of the same type T, so we can't, for instance, return a
`LARGE_STRING` [1] array when the input is `STRING`.
But we can **suggest a cast** to the caller in case an overflow error is
detected. Either programatically (by taking an output parameter) or by giving a
better error message to users.
[1] `LARGE_STRING` can use 64-bit offsets
### What changes are included in this PR?
- Suggest casts when concatenation of the values of an FSL fail due to
overflow
- Suggest casts when concatenation of [LARGE_]LIST_VIEW array fails due to
overflow
- Suggest casts when concatenation of [LARGE_]LIST array fails due to
overflow
- Suggest a cast to LARGE_(BINARY|STRING) when offsets overflow
### Are these changes tested?
Yes.
* GitHub Issue: #43185
Lead-authored-by: Felipe Oliveira Carvalho <[email protected]>
Co-authored-by: Benjamin Kietzman <[email protected]>
Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
cpp/src/arrow/array/concatenate.cc | 262 +++++++++++++++++++++++++-------
cpp/src/arrow/array/concatenate.h | 16 ++
cpp/src/arrow/array/concatenate_test.cc | 103 ++++++++++++-
3 files changed, 316 insertions(+), 65 deletions(-)
diff --git a/cpp/src/arrow/array/concatenate.cc
b/cpp/src/arrow/array/concatenate.cc
index 87e55246c7..b4638dd659 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -75,6 +75,31 @@ struct Bitmap {
bool AllSet() const { return data == nullptr; }
};
+enum class OffsetBufferOpOutcome {
+ kOk,
+ kOffsetOverflow,
+};
+
+Status OffsetOverflowStatus() {
+ return Status::Invalid("offset overflow while concatenating arrays");
+}
+
+#define RETURN_IF_NOT_OK_OUTCOME(outcome) \
+ switch (outcome) { \
+ case OffsetBufferOpOutcome::kOk: \
+ break; \
+ case OffsetBufferOpOutcome::kOffsetOverflow: \
+ return OffsetOverflowStatus(); \
+ }
+
+struct ErrorHints {
+ /// \brief Suggested cast to avoid overflow during concatenation.
+ ///
+ /// If the concatenation of offsets overflows, this field might be set to the
+ /// a type that uses larger offsets (e.g. large_utf8, large_list).
+ std::shared_ptr<DataType> suggested_cast;
+};
+
// Allocate a buffer and concatenate bitmaps into it.
Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
std::shared_ptr<Buffer>* out) {
@@ -112,15 +137,16 @@ int64_t SumBufferSizesInBytes(const BufferVector&
buffers) {
// Write offsets in src into dst, adjusting them such that first_offset
// will be the first offset written.
template <typename Offset>
-Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst,
- Range* values_range);
+Result<OffsetBufferOpOutcome> PutOffsets(const Buffer& src, Offset
first_offset,
+ Offset* dst, Range* values_range);
// Concatenate buffers holding offsets into a single buffer of offsets,
// also computing the ranges of values spanned by each buffer of offsets.
template <typename Offset>
-Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
- std::shared_ptr<Buffer>* out,
- std::vector<Range>* values_ranges) {
+Result<OffsetBufferOpOutcome> ConcatenateOffsets(const BufferVector& buffers,
+ MemoryPool* pool,
+ std::shared_ptr<Buffer>* out,
+ std::vector<Range>*
values_ranges) {
values_ranges->resize(buffers.size());
// allocate output buffer
@@ -133,26 +159,30 @@ Status ConcatenateOffsets(const BufferVector& buffers,
MemoryPool* pool,
for (size_t i = 0; i < buffers.size(); ++i) {
// the first offset from buffers[i] will be adjusted to values_length
// (the cumulative length of values spanned by offsets in previous buffers)
- RETURN_NOT_OK(PutOffsets<Offset>(*buffers[i], values_length,
- out_data + elements_length,
&(*values_ranges)[i]));
+ ARROW_ASSIGN_OR_RAISE(auto outcome, PutOffsets<Offset>(*buffers[i],
values_length,
+ out_data +
elements_length,
+
&(*values_ranges)[i]));
+ if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) {
+ return outcome;
+ }
elements_length += buffers[i]->size() / sizeof(Offset);
values_length += static_cast<Offset>((*values_ranges)[i].length);
}
// the final element in out_data is the length of all values spanned by the
offsets
out_data[out_size_in_bytes / sizeof(Offset)] = values_length;
- return Status::OK();
+ return OffsetBufferOpOutcome::kOk;
}
template <typename Offset>
-Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst,
- Range* values_range) {
+Result<OffsetBufferOpOutcome> PutOffsets(const Buffer& src, Offset
first_offset,
+ Offset* dst, Range* values_range) {
if (src.size() == 0) {
// It's allowed to have an empty offsets buffer for a 0-length array
// (see Array::Validate)
values_range->offset = 0;
values_range->length = 0;
- return Status::OK();
+ return OffsetBufferOpOutcome::kOk;
}
// Get the range of offsets to transfer from src
@@ -162,8 +192,9 @@ Status PutOffsets(const Buffer& src, Offset first_offset,
Offset* dst,
// Compute the range of values which is spanned by this range of offsets
values_range->offset = src_begin[0];
values_range->length = *src_end - values_range->offset;
- if (first_offset > std::numeric_limits<Offset>::max() -
values_range->length) {
- return Status::Invalid("offset overflow while concatenating arrays");
+ if (ARROW_PREDICT_FALSE(first_offset >
+ std::numeric_limits<Offset>::max() -
values_range->length)) {
+ return OffsetBufferOpOutcome::kOffsetOverflow;
}
// Write offsets into dst, ensuring that the first offset written is
@@ -175,12 +206,14 @@ Status PutOffsets(const Buffer& src, Offset first_offset,
Offset* dst,
std::transform(src_begin, src_end, dst, [displacement](Offset offset) {
return SafeSignedAdd(offset, displacement);
});
- return Status::OK();
+ return OffsetBufferOpOutcome::kOk;
}
template <typename offset_type>
-Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const
Buffer& src,
- offset_type displacement, offset_type* dst);
+Result<OffsetBufferOpOutcome> PutListViewOffsets(const ArrayData& input,
+ offset_type* sizes, const
Buffer& src,
+ offset_type displacement,
+ offset_type* dst);
// Concatenate buffers holding list-view offsets into a single buffer of
offsets
//
@@ -198,10 +231,10 @@ Status PutListViewOffsets(const ArrayData& input,
offset_type* sizes, const Buff
// \param[in] in The child arrays
// \param[in,out] sizes The concatenated sizes buffer
template <typename offset_type>
-Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type*
sizes,
- const BufferVector& offset_buffers,
- const std::vector<Range>& value_ranges,
- MemoryPool* pool, std::shared_ptr<Buffer>*
out) {
+Result<OffsetBufferOpOutcome> ConcatenateListViewOffsets(
+ const ArrayDataVector& in, offset_type* sizes, const BufferVector&
offset_buffers,
+ const std::vector<Range>& value_ranges, MemoryPool* pool,
+ std::shared_ptr<Buffer>* out) {
DCHECK_EQ(offset_buffers.size(), value_ranges.size());
// Allocate resulting offsets buffer and initialize it with zeros
@@ -216,26 +249,32 @@ Status ConcatenateListViewOffsets(const ArrayDataVector&
in, offset_type* sizes,
for (size_t i = 0; i < offset_buffers.size(); ++i) {
const auto displacement =
static_cast<offset_type>(num_child_values - value_ranges[i].offset);
- RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length,
- /*src=*/*offset_buffers[i], displacement,
- /*dst=*/out_offsets + elements_length));
+ ARROW_ASSIGN_OR_RAISE(auto outcome,
+ PutListViewOffsets(*in[i], /*sizes=*/sizes +
elements_length,
+ /*src=*/*offset_buffers[i],
displacement,
+ /*dst=*/out_offsets +
elements_length));
+ if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) {
+ return outcome;
+ }
elements_length += offset_buffers[i]->size() / sizeof(offset_type);
num_child_values += value_ranges[i].length;
if (num_child_values > std::numeric_limits<offset_type>::max()) {
- return Status::Invalid("offset overflow while concatenating arrays");
+ return OffsetBufferOpOutcome::kOffsetOverflow;
}
}
DCHECK_EQ(elements_length,
static_cast<int64_t>(out_size_in_bytes / sizeof(offset_type)));
- return Status::OK();
+ return OffsetBufferOpOutcome::kOk;
}
template <typename offset_type>
-Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const
Buffer& src,
- offset_type displacement, offset_type* dst) {
+Result<OffsetBufferOpOutcome> PutListViewOffsets(const ArrayData& input,
+ offset_type* sizes, const
Buffer& src,
+ offset_type displacement,
+ offset_type* dst) {
if (src.size() == 0) {
- return Status::OK();
+ return OffsetBufferOpOutcome::kOk;
}
const auto& validity_buffer = input.buffers[0];
if (validity_buffer) {
@@ -291,7 +330,7 @@ Status PutListViewOffsets(const ArrayData& input,
offset_type* sizes, const Buff
}
}
}
- return Status::OK();
+ return OffsetBufferOpOutcome::kOk;
}
class ConcatenateImpl {
@@ -316,11 +355,17 @@ class ConcatenateImpl {
}
}
- Status Concatenate(std::shared_ptr<ArrayData>* out) && {
+ Status Concatenate(std::shared_ptr<ArrayData>* out, ErrorHints* out_hints)
&& {
if (out_->null_count != 0 &&
internal::may_have_validity_bitmap(out_->type->id())) {
RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0]));
}
- RETURN_NOT_OK(VisitTypeInline(*out_->type, this));
+ auto status = VisitTypeInline(*out_->type, this);
+ if (!status.ok()) {
+ if (out_hints) {
+ out_hints->suggested_cast = std::move(suggested_cast_);
+ }
+ return status;
+ }
*out = std::move(out_);
return Status::OK();
}
@@ -337,11 +382,29 @@ class ConcatenateImpl {
return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]);
}
- Status Visit(const BinaryType&) {
+ Status Visit(const BinaryType& input_type) {
std::vector<Range> value_ranges;
ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
- RETURN_NOT_OK(ConcatenateOffsets<int32_t>(index_buffers, pool_,
&out_->buffers[1],
- &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(
+ auto outcome, ConcatenateOffsets<int32_t>(index_buffers, pool_,
&out_->buffers[1],
+ &value_ranges));
+ switch (outcome) {
+ case OffsetBufferOpOutcome::kOk:
+ break;
+ case OffsetBufferOpOutcome::kOffsetOverflow:
+ switch (input_type.id()) {
+ case Type::BINARY:
+ suggested_cast_ = large_binary();
+ break;
+ case Type::STRING:
+ suggested_cast_ = large_utf8();
+ break;
+ default:
+ DCHECK(false) << "unexpected type id from BinaryType: " <<
input_type;
+ break;
+ }
+ return OffsetOverflowStatus();
+ }
ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges));
return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
}
@@ -349,8 +412,10 @@ class ConcatenateImpl {
Status Visit(const LargeBinaryType&) {
std::vector<Range> value_ranges;
ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t)));
- RETURN_NOT_OK(ConcatenateOffsets<int64_t>(index_buffers, pool_,
&out_->buffers[1],
- &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(
+ auto outcome, ConcatenateOffsets<int64_t>(index_buffers, pool_,
&out_->buffers[1],
+ &value_ranges));
+ RETURN_IF_NOT_OK_OUTCOME(outcome);
ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges));
return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
}
@@ -394,22 +459,44 @@ class ConcatenateImpl {
return Status::OK();
}
- Status Visit(const ListType&) {
+ Status Visit(const ListType& input_type) {
std::vector<Range> value_ranges;
ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
- RETURN_NOT_OK(ConcatenateOffsets<int32_t>(index_buffers, pool_,
&out_->buffers[1],
- &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(auto offsets_outcome,
+ ConcatenateOffsets<int32_t>(index_buffers, pool_,
+ &out_->buffers[1],
&value_ranges));
+ switch (offsets_outcome) {
+ case OffsetBufferOpOutcome::kOk:
+ break;
+ case OffsetBufferOpOutcome::kOffsetOverflow:
+ suggested_cast_ = large_list(input_type.value_type());
+ return OffsetOverflowStatus();
+ }
ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges));
- return ConcatenateImpl(child_data,
pool_).Concatenate(&out_->child_data[0]);
+ ErrorHints child_error_hints;
+ auto status = ConcatenateImpl(child_data, pool_)
+ .Concatenate(&out_->child_data[0], &child_error_hints);
+ if (!status.ok() && child_error_hints.suggested_cast) {
+ suggested_cast_ = list(std::move(child_error_hints.suggested_cast));
+ }
+ return status;
}
Status Visit(const LargeListType&) {
std::vector<Range> value_ranges;
ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t)));
- RETURN_NOT_OK(ConcatenateOffsets<int64_t>(index_buffers, pool_,
&out_->buffers[1],
- &value_ranges));
+ ARROW_ASSIGN_OR_RAISE(
+ auto outcome, ConcatenateOffsets<int64_t>(index_buffers, pool_,
&out_->buffers[1],
+ &value_ranges));
+ RETURN_IF_NOT_OK_OUTCOME(outcome);
ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges));
- return ConcatenateImpl(child_data,
pool_).Concatenate(&out_->child_data[0]);
+ ErrorHints child_error_hints;
+ auto status = ConcatenateImpl(child_data, pool_)
+ .Concatenate(&out_->child_data[0], &child_error_hints);
+ if (!status.ok() && child_error_hints.suggested_cast) {
+ suggested_cast_ =
large_list(std::move(child_error_hints.suggested_cast));
+ }
+ return status;
}
template <typename T>
@@ -430,8 +517,17 @@ class ConcatenateImpl {
}
// Concatenate the values
+ ErrorHints child_error_hints;
ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0,
value_ranges));
- RETURN_NOT_OK(ConcatenateImpl(value_data,
pool_).Concatenate(&out_->child_data[0]));
+ auto values_status = ConcatenateImpl(value_data, pool_)
+ .Concatenate(&out_->child_data[0],
&child_error_hints);
+ if (!values_status.ok()) {
+ if (child_error_hints.suggested_cast) {
+ suggested_cast_ = std::make_shared<std::remove_reference_t<T>>(
+ std::move(child_error_hints.suggested_cast));
+ }
+ return values_status;
+ }
out_->child_data[0]->type = type.value_type();
// Concatenate the sizes first
@@ -440,22 +536,39 @@ class ConcatenateImpl {
// Concatenate the offsets
ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1,
sizeof(offset_type)));
- RETURN_NOT_OK(ConcatenateListViewOffsets<offset_type>(
- in_, /*sizes=*/out_->buffers[2]->mutable_data_as<offset_type>(),
offset_buffers,
- value_ranges, pool_, &out_->buffers[1]));
-
+ ARROW_ASSIGN_OR_RAISE(
+ auto outcome, ConcatenateListViewOffsets<offset_type>(
+ in_,
/*sizes=*/out_->buffers[2]->mutable_data_as<offset_type>(),
+ offset_buffers, value_ranges, pool_,
&out_->buffers[1]));
+ switch (outcome) {
+ case OffsetBufferOpOutcome::kOk:
+ break;
+ case OffsetBufferOpOutcome::kOffsetOverflow:
+ if constexpr (T::type_id == Type::LIST_VIEW) {
+ suggested_cast_ = large_list_view(type.value_type());
+ }
+ return OffsetOverflowStatus();
+ }
return Status::OK();
}
- Status Visit(const FixedSizeListType& fixed_size_list) {
- ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0,
fixed_size_list.list_size()));
- return ConcatenateImpl(child_data,
pool_).Concatenate(&out_->child_data[0]);
+ Status Visit(const FixedSizeListType& fsl_type) {
+ ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fsl_type.list_size()));
+ ErrorHints hints;
+ auto status =
+ ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0],
&hints);
+ if (!status.ok() && hints.suggested_cast) {
+ suggested_cast_ =
+ fixed_size_list(std::move(hints.suggested_cast),
fsl_type.list_size());
+ }
+ return status;
}
Status Visit(const StructType& s) {
for (int i = 0; i < s.num_fields(); ++i) {
ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
- RETURN_NOT_OK(ConcatenateImpl(child_data,
pool_).Concatenate(&out_->child_data[i]));
+ RETURN_NOT_OK(ConcatenateImpl(child_data, pool_)
+ .Concatenate(&out_->child_data[i], /*hints=*/nullptr));
}
return Status::OK();
}
@@ -570,8 +683,8 @@ class ConcatenateImpl {
case UnionMode::SPARSE: {
for (int i = 0; i < u.num_fields(); i++) {
ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
- RETURN_NOT_OK(
- ConcatenateImpl(child_data,
pool_).Concatenate(&out_->child_data[i]));
+ RETURN_NOT_OK(ConcatenateImpl(child_data, pool_)
+ .Concatenate(&out_->child_data[i],
/*hints=*/nullptr));
}
break;
}
@@ -581,8 +694,8 @@ class ConcatenateImpl {
for (size_t j = 0; j < in_.size(); j++) {
child_data[j] = in_[j]->child_data[i];
}
- RETURN_NOT_OK(
- ConcatenateImpl(child_data,
pool_).Concatenate(&out_->child_data[i]));
+ RETURN_NOT_OK(ConcatenateImpl(child_data, pool_)
+ .Concatenate(&out_->child_data[i],
/*hints=*/nullptr));
}
break;
}
@@ -666,7 +779,8 @@ class ConcatenateImpl {
storage_data[i]->type = e.storage_type();
}
std::shared_ptr<ArrayData> out_storage;
- RETURN_NOT_OK(ConcatenateImpl(storage_data,
pool_).Concatenate(&out_storage));
+ RETURN_NOT_OK(ConcatenateImpl(storage_data, pool_)
+ .Concatenate(&out_storage, /*hints=*/nullptr));
out_storage->type = in_[0]->type;
out_ = std::move(out_storage);
return Status::OK();
@@ -797,11 +911,18 @@ class ConcatenateImpl {
const ArrayDataVector& in_;
MemoryPool* pool_;
std::shared_ptr<ArrayData> out_;
+ std::shared_ptr<DataType> suggested_cast_;
};
} // namespace
-Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
MemoryPool* pool) {
+namespace internal {
+
+Result<std::shared_ptr<Array>> Concatenate(
+ const ArrayVector& arrays, MemoryPool* pool,
+ std::shared_ptr<DataType>* out_suggested_cast) {
+ DCHECK(out_suggested_cast);
+ *out_suggested_cast = nullptr;
if (arrays.size() == 0) {
return Status::Invalid("Must pass at least one array");
}
@@ -818,8 +939,31 @@ Result<std::shared_ptr<Array>> Concatenate(const
ArrayVector& arrays, MemoryPool
}
std::shared_ptr<ArrayData> out_data;
- RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data));
+ ErrorHints hints;
+ auto status = ConcatenateImpl(data, pool).Concatenate(&out_data, &hints);
+ if (!status.ok()) {
+ if (hints.suggested_cast) {
+ DCHECK(status.IsInvalid());
+ *out_suggested_cast = std::move(hints.suggested_cast);
+ }
+ return status;
+ }
return MakeArray(std::move(out_data));
}
+} // namespace internal
+
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
MemoryPool* pool) {
+ std::shared_ptr<DataType> suggested_cast;
+ auto result = internal::Concatenate(arrays, pool, &suggested_cast);
+ if (!result.ok() && suggested_cast && arrays.size() > 0) {
+ DCHECK(result.status().IsInvalid());
+ return Status::Invalid(result.status().message(), ", consider casting
input from `",
+ *arrays[0]->type(), "` to `", *suggested_cast, "`
first.");
+ }
+ return result;
+}
+
+#undef RETURN_IF_NOT_OK_OUTCOME
+
} // namespace arrow
diff --git a/cpp/src/arrow/array/concatenate.h
b/cpp/src/arrow/array/concatenate.h
index e7597aad81..aada5624d6 100644
--- a/cpp/src/arrow/array/concatenate.h
+++ b/cpp/src/arrow/array/concatenate.h
@@ -24,6 +24,22 @@
#include "arrow/util/visibility.h"
namespace arrow {
+namespace internal {
+
+/// \brief Concatenate arrays
+///
+/// \param[in] arrays a vector of arrays to be concatenated
+/// \param[in] pool memory to store the result will be allocated from this
memory pool
+/// \param[out] out_suggested_cast if a non-OK Result is returned, the
function might set
+/// out_suggested_cast to a cast suggestion that would allow concatenating
the arrays
+/// without overflow of offsets (e.g. string to large_string)
+///
+/// \return the concatenated array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays,
MemoryPool* pool,
+ std::shared_ptr<DataType>*
out_suggested_cast);
+
+} // namespace internal
/// \brief Concatenate arrays
///
diff --git a/cpp/src/arrow/array/concatenate_test.cc
b/cpp/src/arrow/array/concatenate_test.cc
index af595e897f..aea5311575 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -29,6 +29,7 @@
#include <utility>
#include <vector>
+#include <gmock/gmock-matchers.h>
#include <gtest/gtest.h>
#include "arrow/array.h"
@@ -42,6 +43,7 @@
#include "arrow/testing/util.h"
#include "arrow/type.h"
#include "arrow/util/list_util.h"
+#include "arrow/util/unreachable.h"
namespace arrow {
@@ -661,14 +663,103 @@ TEST_F(ConcatenateTest, ExtensionType) {
});
}
+std::shared_ptr<DataType> LargeVersionOfType(const std::shared_ptr<DataType>&
type) {
+ switch (type->id()) {
+ case Type::BINARY:
+ return large_binary();
+ case Type::STRING:
+ return large_utf8();
+ case Type::LIST:
+ return large_list(static_cast<const ListType&>(*type).value_type());
+ case Type::LIST_VIEW:
+ return large_list_view(static_cast<const
ListViewType&>(*type).value_type());
+ case Type::LARGE_BINARY:
+ case Type::LARGE_STRING:
+ case Type::LARGE_LIST:
+ case Type::LARGE_LIST_VIEW:
+ return type;
+ default:
+ Unreachable();
+ }
+}
+
+std::shared_ptr<DataType> fixed_size_list_of_1(std::shared_ptr<DataType> type)
{
+ return fixed_size_list(std::move(type), 1);
+}
+
TEST_F(ConcatenateTest, OffsetOverflow) {
- auto fake_long = ArrayFromJSON(utf8(), "[\"\"]");
- fake_long->data()->GetMutableValues<int32_t>(1)[1] =
+ using TypeFactory = std::shared_ptr<DataType> (*)(std::shared_ptr<DataType>);
+ static const std::vector<TypeFactory> kNestedTypeFactories = {
+ list, large_list, list_view, large_list_view, fixed_size_list_of_1,
+ };
+
+ auto* pool = default_memory_pool();
+ std::shared_ptr<DataType> suggested_cast;
+ for (auto& ty : {binary(), utf8()}) {
+ auto large_ty = LargeVersionOfType(ty);
+
+ auto fake_long = ArrayFromJSON(ty, "[\"\"]");
+ fake_long->data()->GetMutableValues<int32_t>(1)[1] =
+ std::numeric_limits<int32_t>::max();
+ // XXX: since the data fake_long claims to own isn't there, this would
+ // segfault if Concatenate didn't detect overflow and raise an error.
+ auto concatenate_status = Concatenate({fake_long, fake_long});
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid,
+ ::testing::StrEq("Invalid: offset overflow while concatenating arrays,
"
+ "consider casting input from `" +
+ ty->ToString() + "` to `large_" + ty->ToString() + "`
first."),
+ concatenate_status);
+
+ concatenate_status =
+ internal::Concatenate({fake_long, fake_long}, pool, &suggested_cast);
+ // Message is doesn't contain the suggested cast type when the caller
+ // asks for it by passing the output parameter.
+ EXPECT_RAISES_WITH_MESSAGE_THAT(
+ Invalid, ::testing::StrEq("Invalid: offset overflow while
concatenating arrays"),
+ concatenate_status);
+ ASSERT_TRUE(large_ty->Equals(*suggested_cast));
+
+ // Check that the suggested cast is correct when concatenation
+ // fails due to the child array being too large.
+ for (auto factory : kNestedTypeFactories) {
+ auto nested_ty = factory(ty);
+ auto expected_suggestion = factory(large_ty);
+ auto fake_long_list = ArrayFromJSON(nested_ty, "[[\"\"]]");
+ fake_long_list->data()->child_data[0] = fake_long->data();
+
+ ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list,
fake_long_list}, pool,
+ &suggested_cast)
+ .status());
+ ASSERT_TRUE(suggested_cast->Equals(*expected_suggestion));
+ }
+ }
+
+ auto list_ty = list(utf8());
+ auto fake_long_list = ArrayFromJSON(list_ty, "[[\"Hello\"]]");
+ fake_long_list->data()->GetMutableValues<int32_t>(1)[1] =
std::numeric_limits<int32_t>::max();
- std::shared_ptr<Array> concatenated;
- // XX since the data fake_long claims to own isn't there, this will segfault
if
- // Concatenate doesn't detect overflow and raise an error.
- ASSERT_RAISES(Invalid, Concatenate({fake_long, fake_long}).status());
+ ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list,
fake_long_list}, pool,
+ &suggested_cast)
+ .status());
+ ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_ty)));
+
+ auto list_view_ty = list_view(null());
+ auto fake_long_list_view = ArrayFromJSON(list_view_ty, "[[], []]");
+ {
+ constexpr int kInt32Max = std::numeric_limits<int32_t>::max();
+ auto* values = fake_long_list_view->data()->child_data[0].get();
+ auto* mutable_offsets =
fake_long_list_view->data()->GetMutableValues<int32_t>(1);
+ auto* mutable_sizes =
fake_long_list_view->data()->GetMutableValues<int32_t>(2);
+ values->length = 2 * static_cast<int64_t>(kInt32Max);
+ mutable_offsets[1] = kInt32Max;
+ mutable_offsets[0] = kInt32Max;
+ mutable_sizes[0] = kInt32Max;
+ }
+ ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list_view,
fake_long_list_view},
+ pool, &suggested_cast)
+ .status());
+ ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_view_ty)));
}
TEST_F(ConcatenateTest, DictionaryConcatenateWithEmptyUint16) {