This is an automated email from the ASF dual-hosted git repository.

felipecrv pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new a1376879ce GH-43185: [C++] Suggest a cast when Concatenate fails due 
to offsets overflow (#43190)
a1376879ce is described below

commit a1376879ced6c0bc14dcc1e27c0c23c6ad1554a9
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Wed Jul 17 13:39:22 2024 -0300

    GH-43185: [C++] Suggest a cast when Concatenate fails due to offsets 
overflow (#43190)
    
    ## Rationale for this change
    
    When arrays using 32-bit offsets into data buffers are concatenated and the 
data buffers of the results grow beyond 2GB, `Concatenate` returns a bad 
`Status` with a very simple message:
    
    `"offset overflow while concatenating arrays"`
    
    The contract that `Concatenate` honors is very simple: arrays of input type 
T lead to output of the same type T, so we can't, for instance, return a 
`LARGE_STRING` [1] array when the input is `STRING`.
    
    But we can **suggest a cast** to the caller in case an overflow error is 
detected. Either programatically (by taking an output parameter) or by giving a 
better error message to users.
    
    [1] `LARGE_STRING` can use 64-bit offsets
    
    ### What changes are included in this PR?
    
     - Suggest casts when concatenation of the values of an FSL fail due to 
overflow
     - Suggest casts when concatenation of [LARGE_]LIST_VIEW array fails due to 
overflow
     - Suggest casts when concatenation of [LARGE_]LIST array fails due to 
overflow
     - Suggest a cast to LARGE_(BINARY|STRING) when offsets overflow
    
    ### Are these changes tested?
    
    Yes.
    * GitHub Issue: #43185
    
    Lead-authored-by: Felipe Oliveira Carvalho <[email protected]>
    Co-authored-by: Benjamin Kietzman <[email protected]>
    Signed-off-by: Felipe Oliveira Carvalho <[email protected]>
---
 cpp/src/arrow/array/concatenate.cc      | 262 +++++++++++++++++++++++++-------
 cpp/src/arrow/array/concatenate.h       |  16 ++
 cpp/src/arrow/array/concatenate_test.cc | 103 ++++++++++++-
 3 files changed, 316 insertions(+), 65 deletions(-)

diff --git a/cpp/src/arrow/array/concatenate.cc 
b/cpp/src/arrow/array/concatenate.cc
index 87e55246c7..b4638dd659 100644
--- a/cpp/src/arrow/array/concatenate.cc
+++ b/cpp/src/arrow/array/concatenate.cc
@@ -75,6 +75,31 @@ struct Bitmap {
   bool AllSet() const { return data == nullptr; }
 };
 
+enum class OffsetBufferOpOutcome {
+  kOk,
+  kOffsetOverflow,
+};
+
+Status OffsetOverflowStatus() {
+  return Status::Invalid("offset overflow while concatenating arrays");
+}
+
+#define RETURN_IF_NOT_OK_OUTCOME(outcome)        \
+  switch (outcome) {                             \
+    case OffsetBufferOpOutcome::kOk:             \
+      break;                                     \
+    case OffsetBufferOpOutcome::kOffsetOverflow: \
+      return OffsetOverflowStatus();             \
+  }
+
+struct ErrorHints {
+  /// \brief Suggested cast to avoid overflow during concatenation.
+  ///
+  /// If the concatenation of offsets overflows, this field might be set to the
+  /// a type that uses larger offsets (e.g. large_utf8, large_list).
+  std::shared_ptr<DataType> suggested_cast;
+};
+
 // Allocate a buffer and concatenate bitmaps into it.
 Status ConcatenateBitmaps(const std::vector<Bitmap>& bitmaps, MemoryPool* pool,
                           std::shared_ptr<Buffer>* out) {
@@ -112,15 +137,16 @@ int64_t SumBufferSizesInBytes(const BufferVector& 
buffers) {
 // Write offsets in src into dst, adjusting them such that first_offset
 // will be the first offset written.
 template <typename Offset>
-Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst,
-                  Range* values_range);
+Result<OffsetBufferOpOutcome> PutOffsets(const Buffer& src, Offset 
first_offset,
+                                         Offset* dst, Range* values_range);
 
 // Concatenate buffers holding offsets into a single buffer of offsets,
 // also computing the ranges of values spanned by each buffer of offsets.
 template <typename Offset>
-Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool,
-                          std::shared_ptr<Buffer>* out,
-                          std::vector<Range>* values_ranges) {
+Result<OffsetBufferOpOutcome> ConcatenateOffsets(const BufferVector& buffers,
+                                                 MemoryPool* pool,
+                                                 std::shared_ptr<Buffer>* out,
+                                                 std::vector<Range>* 
values_ranges) {
   values_ranges->resize(buffers.size());
 
   // allocate output buffer
@@ -133,26 +159,30 @@ Status ConcatenateOffsets(const BufferVector& buffers, 
MemoryPool* pool,
   for (size_t i = 0; i < buffers.size(); ++i) {
     // the first offset from buffers[i] will be adjusted to values_length
     // (the cumulative length of values spanned by offsets in previous buffers)
-    RETURN_NOT_OK(PutOffsets<Offset>(*buffers[i], values_length,
-                                     out_data + elements_length, 
&(*values_ranges)[i]));
+    ARROW_ASSIGN_OR_RAISE(auto outcome, PutOffsets<Offset>(*buffers[i], 
values_length,
+                                                           out_data + 
elements_length,
+                                                           
&(*values_ranges)[i]));
+    if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) {
+      return outcome;
+    }
     elements_length += buffers[i]->size() / sizeof(Offset);
     values_length += static_cast<Offset>((*values_ranges)[i].length);
   }
 
   // the final element in out_data is the length of all values spanned by the 
offsets
   out_data[out_size_in_bytes / sizeof(Offset)] = values_length;
-  return Status::OK();
+  return OffsetBufferOpOutcome::kOk;
 }
 
 template <typename Offset>
-Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst,
-                  Range* values_range) {
+Result<OffsetBufferOpOutcome> PutOffsets(const Buffer& src, Offset 
first_offset,
+                                         Offset* dst, Range* values_range) {
   if (src.size() == 0) {
     // It's allowed to have an empty offsets buffer for a 0-length array
     // (see Array::Validate)
     values_range->offset = 0;
     values_range->length = 0;
-    return Status::OK();
+    return OffsetBufferOpOutcome::kOk;
   }
 
   // Get the range of offsets to transfer from src
@@ -162,8 +192,9 @@ Status PutOffsets(const Buffer& src, Offset first_offset, 
Offset* dst,
   // Compute the range of values which is spanned by this range of offsets
   values_range->offset = src_begin[0];
   values_range->length = *src_end - values_range->offset;
-  if (first_offset > std::numeric_limits<Offset>::max() - 
values_range->length) {
-    return Status::Invalid("offset overflow while concatenating arrays");
+  if (ARROW_PREDICT_FALSE(first_offset >
+                          std::numeric_limits<Offset>::max() - 
values_range->length)) {
+    return OffsetBufferOpOutcome::kOffsetOverflow;
   }
 
   // Write offsets into dst, ensuring that the first offset written is
@@ -175,12 +206,14 @@ Status PutOffsets(const Buffer& src, Offset first_offset, 
Offset* dst,
   std::transform(src_begin, src_end, dst, [displacement](Offset offset) {
     return SafeSignedAdd(offset, displacement);
   });
-  return Status::OK();
+  return OffsetBufferOpOutcome::kOk;
 }
 
 template <typename offset_type>
-Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const 
Buffer& src,
-                          offset_type displacement, offset_type* dst);
+Result<OffsetBufferOpOutcome> PutListViewOffsets(const ArrayData& input,
+                                                 offset_type* sizes, const 
Buffer& src,
+                                                 offset_type displacement,
+                                                 offset_type* dst);
 
 // Concatenate buffers holding list-view offsets into a single buffer of 
offsets
 //
@@ -198,10 +231,10 @@ Status PutListViewOffsets(const ArrayData& input, 
offset_type* sizes, const Buff
 // \param[in] in The child arrays
 // \param[in,out] sizes The concatenated sizes buffer
 template <typename offset_type>
-Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* 
sizes,
-                                  const BufferVector& offset_buffers,
-                                  const std::vector<Range>& value_ranges,
-                                  MemoryPool* pool, std::shared_ptr<Buffer>* 
out) {
+Result<OffsetBufferOpOutcome> ConcatenateListViewOffsets(
+    const ArrayDataVector& in, offset_type* sizes, const BufferVector& 
offset_buffers,
+    const std::vector<Range>& value_ranges, MemoryPool* pool,
+    std::shared_ptr<Buffer>* out) {
   DCHECK_EQ(offset_buffers.size(), value_ranges.size());
 
   // Allocate resulting offsets buffer and initialize it with zeros
@@ -216,26 +249,32 @@ Status ConcatenateListViewOffsets(const ArrayDataVector& 
in, offset_type* sizes,
   for (size_t i = 0; i < offset_buffers.size(); ++i) {
     const auto displacement =
         static_cast<offset_type>(num_child_values - value_ranges[i].offset);
-    RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length,
-                                     /*src=*/*offset_buffers[i], displacement,
-                                     /*dst=*/out_offsets + elements_length));
+    ARROW_ASSIGN_OR_RAISE(auto outcome,
+                          PutListViewOffsets(*in[i], /*sizes=*/sizes + 
elements_length,
+                                             /*src=*/*offset_buffers[i], 
displacement,
+                                             /*dst=*/out_offsets + 
elements_length));
+    if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) {
+      return outcome;
+    }
     elements_length += offset_buffers[i]->size() / sizeof(offset_type);
     num_child_values += value_ranges[i].length;
     if (num_child_values > std::numeric_limits<offset_type>::max()) {
-      return Status::Invalid("offset overflow while concatenating arrays");
+      return OffsetBufferOpOutcome::kOffsetOverflow;
     }
   }
   DCHECK_EQ(elements_length,
             static_cast<int64_t>(out_size_in_bytes / sizeof(offset_type)));
 
-  return Status::OK();
+  return OffsetBufferOpOutcome::kOk;
 }
 
 template <typename offset_type>
-Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const 
Buffer& src,
-                          offset_type displacement, offset_type* dst) {
+Result<OffsetBufferOpOutcome> PutListViewOffsets(const ArrayData& input,
+                                                 offset_type* sizes, const 
Buffer& src,
+                                                 offset_type displacement,
+                                                 offset_type* dst) {
   if (src.size() == 0) {
-    return Status::OK();
+    return OffsetBufferOpOutcome::kOk;
   }
   const auto& validity_buffer = input.buffers[0];
   if (validity_buffer) {
@@ -291,7 +330,7 @@ Status PutListViewOffsets(const ArrayData& input, 
offset_type* sizes, const Buff
       }
     }
   }
-  return Status::OK();
+  return OffsetBufferOpOutcome::kOk;
 }
 
 class ConcatenateImpl {
@@ -316,11 +355,17 @@ class ConcatenateImpl {
     }
   }
 
-  Status Concatenate(std::shared_ptr<ArrayData>* out) && {
+  Status Concatenate(std::shared_ptr<ArrayData>* out, ErrorHints* out_hints) 
&& {
     if (out_->null_count != 0 && 
internal::may_have_validity_bitmap(out_->type->id())) {
       RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0]));
     }
-    RETURN_NOT_OK(VisitTypeInline(*out_->type, this));
+    auto status = VisitTypeInline(*out_->type, this);
+    if (!status.ok()) {
+      if (out_hints) {
+        out_hints->suggested_cast = std::move(suggested_cast_);
+      }
+      return status;
+    }
     *out = std::move(out_);
     return Status::OK();
   }
@@ -337,11 +382,29 @@ class ConcatenateImpl {
     return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]);
   }
 
-  Status Visit(const BinaryType&) {
+  Status Visit(const BinaryType& input_type) {
     std::vector<Range> value_ranges;
     ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
-    RETURN_NOT_OK(ConcatenateOffsets<int32_t>(index_buffers, pool_, 
&out_->buffers[1],
-                                              &value_ranges));
+    ARROW_ASSIGN_OR_RAISE(
+        auto outcome, ConcatenateOffsets<int32_t>(index_buffers, pool_, 
&out_->buffers[1],
+                                                  &value_ranges));
+    switch (outcome) {
+      case OffsetBufferOpOutcome::kOk:
+        break;
+      case OffsetBufferOpOutcome::kOffsetOverflow:
+        switch (input_type.id()) {
+          case Type::BINARY:
+            suggested_cast_ = large_binary();
+            break;
+          case Type::STRING:
+            suggested_cast_ = large_utf8();
+            break;
+          default:
+            DCHECK(false) << "unexpected type id from BinaryType: " << 
input_type;
+            break;
+        }
+        return OffsetOverflowStatus();
+    }
     ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges));
     return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
   }
@@ -349,8 +412,10 @@ class ConcatenateImpl {
   Status Visit(const LargeBinaryType&) {
     std::vector<Range> value_ranges;
     ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t)));
-    RETURN_NOT_OK(ConcatenateOffsets<int64_t>(index_buffers, pool_, 
&out_->buffers[1],
-                                              &value_ranges));
+    ARROW_ASSIGN_OR_RAISE(
+        auto outcome, ConcatenateOffsets<int64_t>(index_buffers, pool_, 
&out_->buffers[1],
+                                                  &value_ranges));
+    RETURN_IF_NOT_OK_OUTCOME(outcome);
     ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges));
     return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
   }
@@ -394,22 +459,44 @@ class ConcatenateImpl {
     return Status::OK();
   }
 
-  Status Visit(const ListType&) {
+  Status Visit(const ListType& input_type) {
     std::vector<Range> value_ranges;
     ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t)));
-    RETURN_NOT_OK(ConcatenateOffsets<int32_t>(index_buffers, pool_, 
&out_->buffers[1],
-                                              &value_ranges));
+    ARROW_ASSIGN_OR_RAISE(auto offsets_outcome,
+                          ConcatenateOffsets<int32_t>(index_buffers, pool_,
+                                                      &out_->buffers[1], 
&value_ranges));
+    switch (offsets_outcome) {
+      case OffsetBufferOpOutcome::kOk:
+        break;
+      case OffsetBufferOpOutcome::kOffsetOverflow:
+        suggested_cast_ = large_list(input_type.value_type());
+        return OffsetOverflowStatus();
+    }
     ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges));
-    return ConcatenateImpl(child_data, 
pool_).Concatenate(&out_->child_data[0]);
+    ErrorHints child_error_hints;
+    auto status = ConcatenateImpl(child_data, pool_)
+                      .Concatenate(&out_->child_data[0], &child_error_hints);
+    if (!status.ok() && child_error_hints.suggested_cast) {
+      suggested_cast_ = list(std::move(child_error_hints.suggested_cast));
+    }
+    return status;
   }
 
   Status Visit(const LargeListType&) {
     std::vector<Range> value_ranges;
     ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t)));
-    RETURN_NOT_OK(ConcatenateOffsets<int64_t>(index_buffers, pool_, 
&out_->buffers[1],
-                                              &value_ranges));
+    ARROW_ASSIGN_OR_RAISE(
+        auto outcome, ConcatenateOffsets<int64_t>(index_buffers, pool_, 
&out_->buffers[1],
+                                                  &value_ranges));
+    RETURN_IF_NOT_OK_OUTCOME(outcome);
     ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges));
-    return ConcatenateImpl(child_data, 
pool_).Concatenate(&out_->child_data[0]);
+    ErrorHints child_error_hints;
+    auto status = ConcatenateImpl(child_data, pool_)
+                      .Concatenate(&out_->child_data[0], &child_error_hints);
+    if (!status.ok() && child_error_hints.suggested_cast) {
+      suggested_cast_ = 
large_list(std::move(child_error_hints.suggested_cast));
+    }
+    return status;
   }
 
   template <typename T>
@@ -430,8 +517,17 @@ class ConcatenateImpl {
     }
 
     // Concatenate the values
+    ErrorHints child_error_hints;
     ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0, 
value_ranges));
-    RETURN_NOT_OK(ConcatenateImpl(value_data, 
pool_).Concatenate(&out_->child_data[0]));
+    auto values_status = ConcatenateImpl(value_data, pool_)
+                             .Concatenate(&out_->child_data[0], 
&child_error_hints);
+    if (!values_status.ok()) {
+      if (child_error_hints.suggested_cast) {
+        suggested_cast_ = std::make_shared<std::remove_reference_t<T>>(
+            std::move(child_error_hints.suggested_cast));
+      }
+      return values_status;
+    }
     out_->child_data[0]->type = type.value_type();
 
     // Concatenate the sizes first
@@ -440,22 +536,39 @@ class ConcatenateImpl {
 
     // Concatenate the offsets
     ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, 
sizeof(offset_type)));
-    RETURN_NOT_OK(ConcatenateListViewOffsets<offset_type>(
-        in_, /*sizes=*/out_->buffers[2]->mutable_data_as<offset_type>(), 
offset_buffers,
-        value_ranges, pool_, &out_->buffers[1]));
-
+    ARROW_ASSIGN_OR_RAISE(
+        auto outcome, ConcatenateListViewOffsets<offset_type>(
+                          in_, 
/*sizes=*/out_->buffers[2]->mutable_data_as<offset_type>(),
+                          offset_buffers, value_ranges, pool_, 
&out_->buffers[1]));
+    switch (outcome) {
+      case OffsetBufferOpOutcome::kOk:
+        break;
+      case OffsetBufferOpOutcome::kOffsetOverflow:
+        if constexpr (T::type_id == Type::LIST_VIEW) {
+          suggested_cast_ = large_list_view(type.value_type());
+        }
+        return OffsetOverflowStatus();
+    }
     return Status::OK();
   }
 
-  Status Visit(const FixedSizeListType& fixed_size_list) {
-    ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, 
fixed_size_list.list_size()));
-    return ConcatenateImpl(child_data, 
pool_).Concatenate(&out_->child_data[0]);
+  Status Visit(const FixedSizeListType& fsl_type) {
+    ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fsl_type.list_size()));
+    ErrorHints hints;
+    auto status =
+        ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0], 
&hints);
+    if (!status.ok() && hints.suggested_cast) {
+      suggested_cast_ =
+          fixed_size_list(std::move(hints.suggested_cast), 
fsl_type.list_size());
+    }
+    return status;
   }
 
   Status Visit(const StructType& s) {
     for (int i = 0; i < s.num_fields(); ++i) {
       ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
-      RETURN_NOT_OK(ConcatenateImpl(child_data, 
pool_).Concatenate(&out_->child_data[i]));
+      RETURN_NOT_OK(ConcatenateImpl(child_data, pool_)
+                        .Concatenate(&out_->child_data[i], /*hints=*/nullptr));
     }
     return Status::OK();
   }
@@ -570,8 +683,8 @@ class ConcatenateImpl {
       case UnionMode::SPARSE: {
         for (int i = 0; i < u.num_fields(); i++) {
           ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i));
-          RETURN_NOT_OK(
-              ConcatenateImpl(child_data, 
pool_).Concatenate(&out_->child_data[i]));
+          RETURN_NOT_OK(ConcatenateImpl(child_data, pool_)
+                            .Concatenate(&out_->child_data[i], 
/*hints=*/nullptr));
         }
         break;
       }
@@ -581,8 +694,8 @@ class ConcatenateImpl {
           for (size_t j = 0; j < in_.size(); j++) {
             child_data[j] = in_[j]->child_data[i];
           }
-          RETURN_NOT_OK(
-              ConcatenateImpl(child_data, 
pool_).Concatenate(&out_->child_data[i]));
+          RETURN_NOT_OK(ConcatenateImpl(child_data, pool_)
+                            .Concatenate(&out_->child_data[i], 
/*hints=*/nullptr));
         }
         break;
       }
@@ -666,7 +779,8 @@ class ConcatenateImpl {
       storage_data[i]->type = e.storage_type();
     }
     std::shared_ptr<ArrayData> out_storage;
-    RETURN_NOT_OK(ConcatenateImpl(storage_data, 
pool_).Concatenate(&out_storage));
+    RETURN_NOT_OK(ConcatenateImpl(storage_data, pool_)
+                      .Concatenate(&out_storage, /*hints=*/nullptr));
     out_storage->type = in_[0]->type;
     out_ = std::move(out_storage);
     return Status::OK();
@@ -797,11 +911,18 @@ class ConcatenateImpl {
   const ArrayDataVector& in_;
   MemoryPool* pool_;
   std::shared_ptr<ArrayData> out_;
+  std::shared_ptr<DataType> suggested_cast_;
 };
 
 }  // namespace
 
-Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, 
MemoryPool* pool) {
+namespace internal {
+
+Result<std::shared_ptr<Array>> Concatenate(
+    const ArrayVector& arrays, MemoryPool* pool,
+    std::shared_ptr<DataType>* out_suggested_cast) {
+  DCHECK(out_suggested_cast);
+  *out_suggested_cast = nullptr;
   if (arrays.size() == 0) {
     return Status::Invalid("Must pass at least one array");
   }
@@ -818,8 +939,31 @@ Result<std::shared_ptr<Array>> Concatenate(const 
ArrayVector& arrays, MemoryPool
   }
 
   std::shared_ptr<ArrayData> out_data;
-  RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data));
+  ErrorHints hints;
+  auto status = ConcatenateImpl(data, pool).Concatenate(&out_data, &hints);
+  if (!status.ok()) {
+    if (hints.suggested_cast) {
+      DCHECK(status.IsInvalid());
+      *out_suggested_cast = std::move(hints.suggested_cast);
+    }
+    return status;
+  }
   return MakeArray(std::move(out_data));
 }
 
+}  // namespace internal
+
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, 
MemoryPool* pool) {
+  std::shared_ptr<DataType> suggested_cast;
+  auto result = internal::Concatenate(arrays, pool, &suggested_cast);
+  if (!result.ok() && suggested_cast && arrays.size() > 0) {
+    DCHECK(result.status().IsInvalid());
+    return Status::Invalid(result.status().message(), ", consider casting 
input from `",
+                           *arrays[0]->type(), "` to `", *suggested_cast, "` 
first.");
+  }
+  return result;
+}
+
+#undef RETURN_IF_NOT_OK_OUTCOME
+
 }  // namespace arrow
diff --git a/cpp/src/arrow/array/concatenate.h 
b/cpp/src/arrow/array/concatenate.h
index e7597aad81..aada5624d6 100644
--- a/cpp/src/arrow/array/concatenate.h
+++ b/cpp/src/arrow/array/concatenate.h
@@ -24,6 +24,22 @@
 #include "arrow/util/visibility.h"
 
 namespace arrow {
+namespace internal {
+
+/// \brief Concatenate arrays
+///
+/// \param[in] arrays a vector of arrays to be concatenated
+/// \param[in] pool memory to store the result will be allocated from this 
memory pool
+/// \param[out] out_suggested_cast if a non-OK Result is returned, the 
function might set
+///   out_suggested_cast to a cast suggestion that would allow concatenating 
the arrays
+///   without overflow of offsets (e.g. string to large_string)
+///
+/// \return the concatenated array
+ARROW_EXPORT
+Result<std::shared_ptr<Array>> Concatenate(const ArrayVector& arrays, 
MemoryPool* pool,
+                                           std::shared_ptr<DataType>* 
out_suggested_cast);
+
+}  // namespace internal
 
 /// \brief Concatenate arrays
 ///
diff --git a/cpp/src/arrow/array/concatenate_test.cc 
b/cpp/src/arrow/array/concatenate_test.cc
index af595e897f..aea5311575 100644
--- a/cpp/src/arrow/array/concatenate_test.cc
+++ b/cpp/src/arrow/array/concatenate_test.cc
@@ -29,6 +29,7 @@
 #include <utility>
 #include <vector>
 
+#include <gmock/gmock-matchers.h>
 #include <gtest/gtest.h>
 
 #include "arrow/array.h"
@@ -42,6 +43,7 @@
 #include "arrow/testing/util.h"
 #include "arrow/type.h"
 #include "arrow/util/list_util.h"
+#include "arrow/util/unreachable.h"
 
 namespace arrow {
 
@@ -661,14 +663,103 @@ TEST_F(ConcatenateTest, ExtensionType) {
   });
 }
 
+std::shared_ptr<DataType> LargeVersionOfType(const std::shared_ptr<DataType>& 
type) {
+  switch (type->id()) {
+    case Type::BINARY:
+      return large_binary();
+    case Type::STRING:
+      return large_utf8();
+    case Type::LIST:
+      return large_list(static_cast<const ListType&>(*type).value_type());
+    case Type::LIST_VIEW:
+      return large_list_view(static_cast<const 
ListViewType&>(*type).value_type());
+    case Type::LARGE_BINARY:
+    case Type::LARGE_STRING:
+    case Type::LARGE_LIST:
+    case Type::LARGE_LIST_VIEW:
+      return type;
+    default:
+      Unreachable();
+  }
+}
+
+std::shared_ptr<DataType> fixed_size_list_of_1(std::shared_ptr<DataType> type) 
{
+  return fixed_size_list(std::move(type), 1);
+}
+
 TEST_F(ConcatenateTest, OffsetOverflow) {
-  auto fake_long = ArrayFromJSON(utf8(), "[\"\"]");
-  fake_long->data()->GetMutableValues<int32_t>(1)[1] =
+  using TypeFactory = std::shared_ptr<DataType> (*)(std::shared_ptr<DataType>);
+  static const std::vector<TypeFactory> kNestedTypeFactories = {
+      list, large_list, list_view, large_list_view, fixed_size_list_of_1,
+  };
+
+  auto* pool = default_memory_pool();
+  std::shared_ptr<DataType> suggested_cast;
+  for (auto& ty : {binary(), utf8()}) {
+    auto large_ty = LargeVersionOfType(ty);
+
+    auto fake_long = ArrayFromJSON(ty, "[\"\"]");
+    fake_long->data()->GetMutableValues<int32_t>(1)[1] =
+        std::numeric_limits<int32_t>::max();
+    // XXX: since the data fake_long claims to own isn't there, this would
+    // segfault if Concatenate didn't detect overflow and raise an error.
+    auto concatenate_status = Concatenate({fake_long, fake_long});
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid,
+        ::testing::StrEq("Invalid: offset overflow while concatenating arrays, 
"
+                         "consider casting input from `" +
+                         ty->ToString() + "` to `large_" + ty->ToString() + "` 
first."),
+        concatenate_status);
+
+    concatenate_status =
+        internal::Concatenate({fake_long, fake_long}, pool, &suggested_cast);
+    // Message is doesn't contain the suggested cast type when the caller
+    // asks for it by passing the output parameter.
+    EXPECT_RAISES_WITH_MESSAGE_THAT(
+        Invalid, ::testing::StrEq("Invalid: offset overflow while 
concatenating arrays"),
+        concatenate_status);
+    ASSERT_TRUE(large_ty->Equals(*suggested_cast));
+
+    // Check that the suggested cast is correct when concatenation
+    // fails due to the child array being too large.
+    for (auto factory : kNestedTypeFactories) {
+      auto nested_ty = factory(ty);
+      auto expected_suggestion = factory(large_ty);
+      auto fake_long_list = ArrayFromJSON(nested_ty, "[[\"\"]]");
+      fake_long_list->data()->child_data[0] = fake_long->data();
+
+      ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, 
fake_long_list}, pool,
+                                                   &suggested_cast)
+                                 .status());
+      ASSERT_TRUE(suggested_cast->Equals(*expected_suggestion));
+    }
+  }
+
+  auto list_ty = list(utf8());
+  auto fake_long_list = ArrayFromJSON(list_ty, "[[\"Hello\"]]");
+  fake_long_list->data()->GetMutableValues<int32_t>(1)[1] =
       std::numeric_limits<int32_t>::max();
-  std::shared_ptr<Array> concatenated;
-  // XX since the data fake_long claims to own isn't there, this will segfault 
if
-  // Concatenate doesn't detect overflow and raise an error.
-  ASSERT_RAISES(Invalid, Concatenate({fake_long, fake_long}).status());
+  ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, 
fake_long_list}, pool,
+                                               &suggested_cast)
+                             .status());
+  ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_ty)));
+
+  auto list_view_ty = list_view(null());
+  auto fake_long_list_view = ArrayFromJSON(list_view_ty, "[[], []]");
+  {
+    constexpr int kInt32Max = std::numeric_limits<int32_t>::max();
+    auto* values = fake_long_list_view->data()->child_data[0].get();
+    auto* mutable_offsets = 
fake_long_list_view->data()->GetMutableValues<int32_t>(1);
+    auto* mutable_sizes = 
fake_long_list_view->data()->GetMutableValues<int32_t>(2);
+    values->length = 2 * static_cast<int64_t>(kInt32Max);
+    mutable_offsets[1] = kInt32Max;
+    mutable_offsets[0] = kInt32Max;
+    mutable_sizes[0] = kInt32Max;
+  }
+  ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list_view, 
fake_long_list_view},
+                                               pool, &suggested_cast)
+                             .status());
+  ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_view_ty)));
 }
 
 TEST_F(ConcatenateTest, DictionaryConcatenateWithEmptyUint16) {

Reply via email to