This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push: new 16806a1486 GH-36776: [C++] Make ListArray::FromArrays() handle sliced offsets Arrays containing nulls (#36780) 16806a1486 is described below commit 16806a1486f644093b9578243cc55ac051c60056 Author: Felipe Oliveira Carvalho <felipe...@gmail.com> AuthorDate: Tue Jul 25 10:56:09 2023 -0300 GH-36776: [C++] Make ListArray::FromArrays() handle sliced offsets Arrays containing nulls (#36780) ### Rationale for this change It fixes the issue described in #36776 ### What changes are included in this PR? The fix and a simplifications of the interaction between `ListArrayFromArrays` and `CleanOffsets` which is rather involved. ### Are these changes tested? Yes. I added a test that reproduces the issue before adding the fixes. * Closes: #36776 Authored-by: Felipe Oliveira Carvalho <felipe...@gmail.com> Signed-off-by: Antoine Pitrou <anto...@python.org> --- cpp/src/arrow/array/array_list_test.cc | 57 ++++++++++++++++++++++ cpp/src/arrow/array/array_nested.cc | 87 +++++++++++++++++++--------------- 2 files changed, 106 insertions(+), 38 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index 2a00cadcab..a3a2f99851 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -215,17 +215,20 @@ class TestListArray : public ::testing::Test { // Offsets with nulls will match. ASSERT_OK_AND_ASSIGN(auto result, ArrayType::FromArrays(*offsets_w_nulls, *values, pool_)); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Offets without nulls, will replace null with empty list ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_)); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *std::dynamic_pointer_cast<ArrayType>( ArrayFromJSON(type, "[[0], [], [0, null], [0]]"))); // Specify non-null offsets with null_bitmap ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls, *values, pool_, expected->null_bitmap())); + ASSERT_OK(result->ValidateFull()); AssertArraysEqual(*result, *expected); // Cannot specify both null offsets with null_bitmap @@ -233,6 +236,58 @@ class TestListArray : public ::testing::Test { expected->null_bitmap())); } + void TestFromArraysWithSlicedOffsets() { + std::vector<offset_type> offsets = {-1, -1, 0, 1, 2, 4}; + + std::shared_ptr<Array> offsets_wo_nulls; + ArrayFromVector<OffsetType, offset_type>(offsets, &offsets_wo_nulls); + + auto type = std::make_shared<T>(int32()); + auto expected = std::dynamic_pointer_cast<ArrayType>( + ArrayFromJSON(type, "[[0], [1], [0, null]]")); + auto values = expected->values(); + + // Apply an offset to the offsets array + auto sliced_offsets = offsets_wo_nulls->Slice(2, 4); + ASSERT_OK_AND_ASSIGN(auto result, + ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Non-zero starter offset + sliced_offsets = offsets_wo_nulls->Slice(3, 3); + ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected->Slice(1, 2)); + } + + void TestFromArraysWithSlicedNullOffsets() { + std::vector<offset_type> offsets = {-1, -1, 0, 1, 1, 3}; + std::vector<bool> offsets_w_nulls_is_valid = {true, true, true, false, true, true}; + + std::shared_ptr<Array> offsets_w_nulls; + ArrayFromVector<OffsetType, offset_type>(offsets_w_nulls_is_valid, offsets, + &offsets_w_nulls); + + auto type = std::make_shared<T>(int32()); + auto expected = std::dynamic_pointer_cast<ArrayType>( + ArrayFromJSON(type, "[[0], null, [0, null]]")); + auto values = expected->values(); + + // Apply an offset to the offsets array with nulls (GH-36776) + auto sliced_offsets = offsets_w_nulls->Slice(2, 4); + ASSERT_OK_AND_ASSIGN(auto result, + ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected); + + // Non-zero starter offset + sliced_offsets = offsets_w_nulls->Slice(3, 3); + ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets, *values, pool_)); + ASSERT_OK(result->ValidateFull()); + AssertArraysEqual(*result, *expected->Slice(1, 2)); + } + void TestFromArrays() { std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, offsets5, values; @@ -586,6 +641,8 @@ TYPED_TEST(TestListArray, FromArrays) { this->TestFromArrays(); } TYPED_TEST(TestListArray, FromArraysWithNullBitMap) { this->TestFromArraysWithNullBitMap(); + this->TestFromArraysWithSlicedOffsets(); + this->TestFromArraysWithSlicedNullOffsets(); } TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); } diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 61eeb496e5..df60074c78 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -52,6 +52,9 @@ using internal::CopyBitmap; namespace { +/// \brief Clean offsets when their null_count is greater than 0 +/// +/// \pre offsets.null_count() > 0 template <typename TYPE> Result<BufferVector> CleanListOffsets(const std::shared_ptr<Buffer>& validity_buffer, const Array& offsets, MemoryPool* pool) { @@ -59,43 +62,36 @@ Result<BufferVector> CleanListOffsets(const std::shared_ptr<Buffer>& validity_bu using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType; using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType; - const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets); + DCHECK_GT(offsets.null_count(), 0); const int64_t num_offsets = offsets.length(); - DCHECK(validity_buffer == nullptr || offsets.null_count() == 0) - << "When a validity_buffer is passed, offsets must have no nulls"; + if (!offsets.IsValid(num_offsets - 1)) { + return Status::Invalid("Last list offset should be non-null"); + } - if (offsets.null_count() > 0) { - if (!offsets.IsValid(num_offsets - 1)) { - return Status::Invalid("Last list offset should be non-null"); - } + ARROW_ASSIGN_OR_RAISE(auto clean_offsets, + AllocateBuffer(num_offsets * sizeof(offset_type), pool)); - ARROW_ASSIGN_OR_RAISE(auto clean_offsets, - AllocateBuffer(num_offsets * sizeof(offset_type), pool)); + // Copy valid bits, ignoring the final offset (since for a length N list array, + // we have N + 1 offsets) + ARROW_ASSIGN_OR_RAISE( + auto clean_validity_buffer, + CopyBitmap(pool, offsets.null_bitmap()->data(), offsets.offset(), num_offsets - 1)); - // Copy valid bits, ignoring the final offset (since for a length N list array, - // we have N + 1 offsets) - ARROW_ASSIGN_OR_RAISE( - auto clean_validity_buffer, - offsets.null_bitmap()->CopySlice(0, bit_util::BytesForBits(num_offsets - 1))); - - const offset_type* raw_offsets = typed_offsets.raw_values(); - auto clean_raw_offsets = - reinterpret_cast<offset_type*>(clean_offsets->mutable_data()); - - // Must work backwards so we can tell how many values were in the last non-null value - offset_type current_offset = raw_offsets[num_offsets - 1]; - for (int64_t i = num_offsets - 1; i >= 0; --i) { - if (offsets.IsValid(i)) { - current_offset = raw_offsets[i]; - } - clean_raw_offsets[i] = current_offset; - } + const offset_type* raw_offsets = + checked_cast<const OffsetArrayType&>(offsets).raw_values(); + auto clean_raw_offsets = reinterpret_cast<offset_type*>(clean_offsets->mutable_data()); - return BufferVector({std::move(clean_validity_buffer), std::move(clean_offsets)}); + // Must work backwards so we can tell how many values were in the last non-null value + offset_type current_offset = raw_offsets[num_offsets - 1]; + for (int64_t i = num_offsets - 1; i >= 0; --i) { + if (offsets.IsValid(i)) { + current_offset = raw_offsets[i]; + } + clean_raw_offsets[i] = current_offset; } - return BufferVector({validity_buffer, typed_offsets.values()}); + return BufferVector({std::move(clean_validity_buffer), std::move(clean_offsets)}); } template <typename TYPE> @@ -124,14 +120,21 @@ Result<std::shared_ptr<typename TypeTraits<TYPE>::ArrayType>> ListArrayFromArray return Status::NotImplemented("Null bitmap with offsets slice not supported."); } - std::shared_ptr<Buffer> offset_buf, validity_buf; - ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets<TYPE>(null_bitmap, offsets, pool)); - int64_t null_count_ = null_bitmap ? null_count : offsets.null_count(); + // Clean the offsets if they contain nulls. + if (offsets.null_count() > 0) { + ARROW_ASSIGN_OR_RAISE(auto buffers, + CleanListOffsets<TYPE>(null_bitmap, offsets, pool)); + auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers), + {values.data()}, offsets.null_count(), /*offset=*/0); + return std::make_shared<ArrayType>(std::move(data)); + } - std::shared_ptr<arrow::ArrayData> internal_data = ArrayData::Make( - type, offsets.length() - 1, std::move(buffers), null_count_, offsets.offset()); - internal_data->child_data.push_back(values.data()); - return std::make_shared<ArrayType>(internal_data); + using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType; + const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets); + auto buffers = BufferVector({std::move(null_bitmap), typed_offsets.values()}); + auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers), + {values.data()}, null_count, offsets.offset()); + return std::make_shared<ArrayType>(std::move(data)); } static std::shared_ptr<Array> SliceArrayWithOffsets(const Array& array, int64_t begin, @@ -374,10 +377,18 @@ Result<std::shared_ptr<Array>> MapArray::FromArraysInternal( return Status::Invalid("Map key and item arrays must be equal length"); } - ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets<MapType>(NULLPTR, *offsets, pool)); + if (offsets->null_count() > 0) { + ARROW_ASSIGN_OR_RAISE(auto buffers, + CleanListOffsets<MapType>(NULLPTR, *offsets, pool)); + return std::make_shared<MapArray>(type, offsets->length() - 1, std::move(buffers), + keys, items, offsets->null_count(), 0); + } + using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType; + const auto& typed_offsets = checked_cast<const OffsetArrayType&>(*offsets); + auto buffers = BufferVector({nullptr, typed_offsets.values()}); return std::make_shared<MapArray>(type, offsets->length() - 1, std::move(buffers), keys, - items, offsets->null_count(), offsets->offset()); + items, /*null_count=*/0, offsets->offset()); } Result<std::shared_ptr<Array>> MapArray::FromArrays(const std::shared_ptr<Array>& offsets,