This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 16806a1486 GH-36776: [C++] Make ListArray::FromArrays() handle sliced 
offsets Arrays containing nulls (#36780)
16806a1486 is described below

commit 16806a1486f644093b9578243cc55ac051c60056
Author: Felipe Oliveira Carvalho <felipe...@gmail.com>
AuthorDate: Tue Jul 25 10:56:09 2023 -0300

    GH-36776: [C++] Make ListArray::FromArrays() handle sliced offsets Arrays 
containing nulls (#36780)
    
    ### Rationale for this change
    
    It fixes the issue described in #36776
    
    ### What changes are included in this PR?
    
    The fix and a simplifications of the interaction between 
`ListArrayFromArrays` and `CleanOffsets` which is rather involved.
    
    ### Are these changes tested?
    
    Yes. I added a test that reproduces the issue before adding the fixes.
    * Closes: #36776
    
    Authored-by: Felipe Oliveira Carvalho <felipe...@gmail.com>
    Signed-off-by: Antoine Pitrou <anto...@python.org>
---
 cpp/src/arrow/array/array_list_test.cc | 57 ++++++++++++++++++++++
 cpp/src/arrow/array/array_nested.cc    | 87 +++++++++++++++++++---------------
 2 files changed, 106 insertions(+), 38 deletions(-)

diff --git a/cpp/src/arrow/array/array_list_test.cc 
b/cpp/src/arrow/array/array_list_test.cc
index 2a00cadcab..a3a2f99851 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -215,17 +215,20 @@ class TestListArray : public ::testing::Test {
     // Offsets with nulls will match.
     ASSERT_OK_AND_ASSIGN(auto result,
                          ArrayType::FromArrays(*offsets_w_nulls, *values, 
pool_));
+    ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *expected);
 
     // Offets without nulls, will replace null with empty list
     ASSERT_OK_AND_ASSIGN(result,
                          ArrayType::FromArrays(*offsets_wo_nulls, *values, 
pool_));
+    ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *std::dynamic_pointer_cast<ArrayType>(
                                    ArrayFromJSON(type, "[[0], [], [0, null], 
[0]]")));
 
     // Specify non-null offsets with null_bitmap
     ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, 
*values, pool_,
                                                        
expected->null_bitmap()));
+    ASSERT_OK(result->ValidateFull());
     AssertArraysEqual(*result, *expected);
 
     // Cannot specify both null offsets with null_bitmap
@@ -233,6 +236,58 @@ class TestListArray : public ::testing::Test {
                                                  expected->null_bitmap()));
   }
 
+  void TestFromArraysWithSlicedOffsets() {
+    std::vector<offset_type> offsets = {-1, -1, 0, 1, 2, 4};
+
+    std::shared_ptr<Array> offsets_wo_nulls;
+    ArrayFromVector<OffsetType, offset_type>(offsets, &offsets_wo_nulls);
+
+    auto type = std::make_shared<T>(int32());
+    auto expected = std::dynamic_pointer_cast<ArrayType>(
+        ArrayFromJSON(type, "[[0], [1], [0, null]]"));
+    auto values = expected->values();
+
+    // Apply an offset to the offsets array
+    auto sliced_offsets = offsets_wo_nulls->Slice(2, 4);
+    ASSERT_OK_AND_ASSIGN(auto result,
+                         ArrayType::FromArrays(*sliced_offsets, *values, 
pool_));
+    ASSERT_OK(result->ValidateFull());
+    AssertArraysEqual(*result, *expected);
+
+    // Non-zero starter offset
+    sliced_offsets = offsets_wo_nulls->Slice(3, 3);
+    ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, 
*values, pool_));
+    ASSERT_OK(result->ValidateFull());
+    AssertArraysEqual(*result, *expected->Slice(1, 2));
+  }
+
+  void TestFromArraysWithSlicedNullOffsets() {
+    std::vector<offset_type> offsets = {-1, -1, 0, 1, 1, 3};
+    std::vector<bool> offsets_w_nulls_is_valid = {true, true, true, false, 
true, true};
+
+    std::shared_ptr<Array> offsets_w_nulls;
+    ArrayFromVector<OffsetType, offset_type>(offsets_w_nulls_is_valid, offsets,
+                                             &offsets_w_nulls);
+
+    auto type = std::make_shared<T>(int32());
+    auto expected = std::dynamic_pointer_cast<ArrayType>(
+        ArrayFromJSON(type, "[[0], null, [0, null]]"));
+    auto values = expected->values();
+
+    // Apply an offset to the offsets array with nulls (GH-36776)
+    auto sliced_offsets = offsets_w_nulls->Slice(2, 4);
+    ASSERT_OK_AND_ASSIGN(auto result,
+                         ArrayType::FromArrays(*sliced_offsets, *values, 
pool_));
+    ASSERT_OK(result->ValidateFull());
+    AssertArraysEqual(*result, *expected);
+
+    // Non-zero starter offset
+    sliced_offsets = offsets_w_nulls->Slice(3, 3);
+    ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, 
*values, pool_));
+    ASSERT_OK(result->ValidateFull());
+    AssertArraysEqual(*result, *expected->Slice(1, 2));
+  }
+
   void TestFromArrays() {
     std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, offsets5, 
values;
 
@@ -586,6 +641,8 @@ TYPED_TEST(TestListArray, FromArrays) { 
this->TestFromArrays(); }
 
 TYPED_TEST(TestListArray, FromArraysWithNullBitMap) {
   this->TestFromArraysWithNullBitMap();
+  this->TestFromArraysWithSlicedOffsets();
+  this->TestFromArraysWithSlicedNullOffsets();
 }
 
 TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); }
diff --git a/cpp/src/arrow/array/array_nested.cc 
b/cpp/src/arrow/array/array_nested.cc
index 61eeb496e5..df60074c78 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -52,6 +52,9 @@ using internal::CopyBitmap;
 
 namespace {
 
+/// \brief Clean offsets when their null_count is greater than 0
+///
+/// \pre offsets.null_count() > 0
 template <typename TYPE>
 Result<BufferVector> CleanListOffsets(const std::shared_ptr<Buffer>& 
validity_buffer,
                                       const Array& offsets, MemoryPool* pool) {
@@ -59,43 +62,36 @@ Result<BufferVector> CleanListOffsets(const 
std::shared_ptr<Buffer>& validity_bu
   using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
   using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
 
-  const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
+  DCHECK_GT(offsets.null_count(), 0);
   const int64_t num_offsets = offsets.length();
 
-  DCHECK(validity_buffer == nullptr || offsets.null_count() == 0)
-      << "When a validity_buffer is passed, offsets must have no nulls";
+  if (!offsets.IsValid(num_offsets - 1)) {
+    return Status::Invalid("Last list offset should be non-null");
+  }
 
-  if (offsets.null_count() > 0) {
-    if (!offsets.IsValid(num_offsets - 1)) {
-      return Status::Invalid("Last list offset should be non-null");
-    }
+  ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
+                        AllocateBuffer(num_offsets * sizeof(offset_type), 
pool));
 
-    ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
-                          AllocateBuffer(num_offsets * sizeof(offset_type), 
pool));
+  // Copy valid bits, ignoring the final offset (since for a length N list 
array,
+  // we have N + 1 offsets)
+  ARROW_ASSIGN_OR_RAISE(
+      auto clean_validity_buffer,
+      CopyBitmap(pool, offsets.null_bitmap()->data(), offsets.offset(), 
num_offsets - 1));
 
-    // Copy valid bits, ignoring the final offset (since for a length N list 
array,
-    // we have N + 1 offsets)
-    ARROW_ASSIGN_OR_RAISE(
-        auto clean_validity_buffer,
-        offsets.null_bitmap()->CopySlice(0, bit_util::BytesForBits(num_offsets 
- 1)));
-
-    const offset_type* raw_offsets = typed_offsets.raw_values();
-    auto clean_raw_offsets =
-        reinterpret_cast<offset_type*>(clean_offsets->mutable_data());
-
-    // Must work backwards so we can tell how many values were in the last 
non-null value
-    offset_type current_offset = raw_offsets[num_offsets - 1];
-    for (int64_t i = num_offsets - 1; i >= 0; --i) {
-      if (offsets.IsValid(i)) {
-        current_offset = raw_offsets[i];
-      }
-      clean_raw_offsets[i] = current_offset;
-    }
+  const offset_type* raw_offsets =
+      checked_cast<const OffsetArrayType&>(offsets).raw_values();
+  auto clean_raw_offsets = 
reinterpret_cast<offset_type*>(clean_offsets->mutable_data());
 
-    return BufferVector({std::move(clean_validity_buffer), 
std::move(clean_offsets)});
+  // Must work backwards so we can tell how many values were in the last 
non-null value
+  offset_type current_offset = raw_offsets[num_offsets - 1];
+  for (int64_t i = num_offsets - 1; i >= 0; --i) {
+    if (offsets.IsValid(i)) {
+      current_offset = raw_offsets[i];
+    }
+    clean_raw_offsets[i] = current_offset;
   }
 
-  return BufferVector({validity_buffer, typed_offsets.values()});
+  return BufferVector({std::move(clean_validity_buffer), 
std::move(clean_offsets)});
 }
 
 template <typename TYPE>
@@ -124,14 +120,21 @@ Result<std::shared_ptr<typename 
TypeTraits<TYPE>::ArrayType>> ListArrayFromArray
     return Status::NotImplemented("Null bitmap with offsets slice not 
supported.");
   }
 
-  std::shared_ptr<Buffer> offset_buf, validity_buf;
-  ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets<TYPE>(null_bitmap, 
offsets, pool));
-  int64_t null_count_ = null_bitmap ? null_count : offsets.null_count();
+  // Clean the offsets if they contain nulls.
+  if (offsets.null_count() > 0) {
+    ARROW_ASSIGN_OR_RAISE(auto buffers,
+                          CleanListOffsets<TYPE>(null_bitmap, offsets, pool));
+    auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers),
+                                {values.data()}, offsets.null_count(), 
/*offset=*/0);
+    return std::make_shared<ArrayType>(std::move(data));
+  }
 
-  std::shared_ptr<arrow::ArrayData> internal_data = ArrayData::Make(
-      type, offsets.length() - 1, std::move(buffers), null_count_, 
offsets.offset());
-  internal_data->child_data.push_back(values.data());
-  return std::make_shared<ArrayType>(internal_data);
+  using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
+  const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
+  auto buffers = BufferVector({std::move(null_bitmap), 
typed_offsets.values()});
+  auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers),
+                              {values.data()}, null_count, offsets.offset());
+  return std::make_shared<ArrayType>(std::move(data));
 }
 
 static std::shared_ptr<Array> SliceArrayWithOffsets(const Array& array, 
int64_t begin,
@@ -374,10 +377,18 @@ Result<std::shared_ptr<Array>> 
MapArray::FromArraysInternal(
     return Status::Invalid("Map key and item arrays must be equal length");
   }
 
-  ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets<MapType>(NULLPTR, 
*offsets, pool));
+  if (offsets->null_count() > 0) {
+    ARROW_ASSIGN_OR_RAISE(auto buffers,
+                          CleanListOffsets<MapType>(NULLPTR, *offsets, pool));
+    return std::make_shared<MapArray>(type, offsets->length() - 1, 
std::move(buffers),
+                                      keys, items, offsets->null_count(), 0);
+  }
 
+  using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
+  const auto& typed_offsets = checked_cast<const OffsetArrayType&>(*offsets);
+  auto buffers = BufferVector({nullptr, typed_offsets.values()});
   return std::make_shared<MapArray>(type, offsets->length() - 1, 
std::move(buffers), keys,
-                                    items, offsets->null_count(), 
offsets->offset());
+                                    items, /*null_count=*/0, 
offsets->offset());
 }
 
 Result<std::shared_ptr<Array>> MapArray::FromArrays(const 
std::shared_ptr<Array>& offsets,

Reply via email to