This is an automated email from the ASF dual-hosted git repository.
apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 16806a1486 GH-36776: [C++] Make ListArray::FromArrays() handle sliced
offsets Arrays containing nulls (#36780)
16806a1486 is described below
commit 16806a1486f644093b9578243cc55ac051c60056
Author: Felipe Oliveira Carvalho <[email protected]>
AuthorDate: Tue Jul 25 10:56:09 2023 -0300
GH-36776: [C++] Make ListArray::FromArrays() handle sliced offsets Arrays
containing nulls (#36780)
### Rationale for this change
It fixes the issue described in #36776
### What changes are included in this PR?
The fix and a simplifications of the interaction between
`ListArrayFromArrays` and `CleanOffsets` which is rather involved.
### Are these changes tested?
Yes. I added a test that reproduces the issue before adding the fixes.
* Closes: #36776
Authored-by: Felipe Oliveira Carvalho <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
---
cpp/src/arrow/array/array_list_test.cc | 57 ++++++++++++++++++++++
cpp/src/arrow/array/array_nested.cc | 87 +++++++++++++++++++---------------
2 files changed, 106 insertions(+), 38 deletions(-)
diff --git a/cpp/src/arrow/array/array_list_test.cc
b/cpp/src/arrow/array/array_list_test.cc
index 2a00cadcab..a3a2f99851 100644
--- a/cpp/src/arrow/array/array_list_test.cc
+++ b/cpp/src/arrow/array/array_list_test.cc
@@ -215,17 +215,20 @@ class TestListArray : public ::testing::Test {
// Offsets with nulls will match.
ASSERT_OK_AND_ASSIGN(auto result,
ArrayType::FromArrays(*offsets_w_nulls, *values,
pool_));
+ ASSERT_OK(result->ValidateFull());
AssertArraysEqual(*result, *expected);
// Offets without nulls, will replace null with empty list
ASSERT_OK_AND_ASSIGN(result,
ArrayType::FromArrays(*offsets_wo_nulls, *values,
pool_));
+ ASSERT_OK(result->ValidateFull());
AssertArraysEqual(*result, *std::dynamic_pointer_cast<ArrayType>(
ArrayFromJSON(type, "[[0], [], [0, null],
[0]]")));
// Specify non-null offsets with null_bitmap
ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*offsets_wo_nulls,
*values, pool_,
expected->null_bitmap()));
+ ASSERT_OK(result->ValidateFull());
AssertArraysEqual(*result, *expected);
// Cannot specify both null offsets with null_bitmap
@@ -233,6 +236,58 @@ class TestListArray : public ::testing::Test {
expected->null_bitmap()));
}
+ void TestFromArraysWithSlicedOffsets() {
+ std::vector<offset_type> offsets = {-1, -1, 0, 1, 2, 4};
+
+ std::shared_ptr<Array> offsets_wo_nulls;
+ ArrayFromVector<OffsetType, offset_type>(offsets, &offsets_wo_nulls);
+
+ auto type = std::make_shared<T>(int32());
+ auto expected = std::dynamic_pointer_cast<ArrayType>(
+ ArrayFromJSON(type, "[[0], [1], [0, null]]"));
+ auto values = expected->values();
+
+ // Apply an offset to the offsets array
+ auto sliced_offsets = offsets_wo_nulls->Slice(2, 4);
+ ASSERT_OK_AND_ASSIGN(auto result,
+ ArrayType::FromArrays(*sliced_offsets, *values,
pool_));
+ ASSERT_OK(result->ValidateFull());
+ AssertArraysEqual(*result, *expected);
+
+ // Non-zero starter offset
+ sliced_offsets = offsets_wo_nulls->Slice(3, 3);
+ ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets,
*values, pool_));
+ ASSERT_OK(result->ValidateFull());
+ AssertArraysEqual(*result, *expected->Slice(1, 2));
+ }
+
+ void TestFromArraysWithSlicedNullOffsets() {
+ std::vector<offset_type> offsets = {-1, -1, 0, 1, 1, 3};
+ std::vector<bool> offsets_w_nulls_is_valid = {true, true, true, false,
true, true};
+
+ std::shared_ptr<Array> offsets_w_nulls;
+ ArrayFromVector<OffsetType, offset_type>(offsets_w_nulls_is_valid, offsets,
+ &offsets_w_nulls);
+
+ auto type = std::make_shared<T>(int32());
+ auto expected = std::dynamic_pointer_cast<ArrayType>(
+ ArrayFromJSON(type, "[[0], null, [0, null]]"));
+ auto values = expected->values();
+
+ // Apply an offset to the offsets array with nulls (GH-36776)
+ auto sliced_offsets = offsets_w_nulls->Slice(2, 4);
+ ASSERT_OK_AND_ASSIGN(auto result,
+ ArrayType::FromArrays(*sliced_offsets, *values,
pool_));
+ ASSERT_OK(result->ValidateFull());
+ AssertArraysEqual(*result, *expected);
+
+ // Non-zero starter offset
+ sliced_offsets = offsets_w_nulls->Slice(3, 3);
+ ASSERT_OK_AND_ASSIGN(result, ArrayType::FromArrays(*sliced_offsets,
*values, pool_));
+ ASSERT_OK(result->ValidateFull());
+ AssertArraysEqual(*result, *expected->Slice(1, 2));
+ }
+
void TestFromArrays() {
std::shared_ptr<Array> offsets1, offsets2, offsets3, offsets4, offsets5,
values;
@@ -586,6 +641,8 @@ TYPED_TEST(TestListArray, FromArrays) {
this->TestFromArrays(); }
TYPED_TEST(TestListArray, FromArraysWithNullBitMap) {
this->TestFromArraysWithNullBitMap();
+ this->TestFromArraysWithSlicedOffsets();
+ this->TestFromArraysWithSlicedNullOffsets();
}
TYPED_TEST(TestListArray, AppendNull) { this->TestAppendNull(); }
diff --git a/cpp/src/arrow/array/array_nested.cc
b/cpp/src/arrow/array/array_nested.cc
index 61eeb496e5..df60074c78 100644
--- a/cpp/src/arrow/array/array_nested.cc
+++ b/cpp/src/arrow/array/array_nested.cc
@@ -52,6 +52,9 @@ using internal::CopyBitmap;
namespace {
+/// \brief Clean offsets when their null_count is greater than 0
+///
+/// \pre offsets.null_count() > 0
template <typename TYPE>
Result<BufferVector> CleanListOffsets(const std::shared_ptr<Buffer>&
validity_buffer,
const Array& offsets, MemoryPool* pool) {
@@ -59,43 +62,36 @@ Result<BufferVector> CleanListOffsets(const
std::shared_ptr<Buffer>& validity_bu
using OffsetArrowType = typename CTypeTraits<offset_type>::ArrowType;
using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
- const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
+ DCHECK_GT(offsets.null_count(), 0);
const int64_t num_offsets = offsets.length();
- DCHECK(validity_buffer == nullptr || offsets.null_count() == 0)
- << "When a validity_buffer is passed, offsets must have no nulls";
+ if (!offsets.IsValid(num_offsets - 1)) {
+ return Status::Invalid("Last list offset should be non-null");
+ }
- if (offsets.null_count() > 0) {
- if (!offsets.IsValid(num_offsets - 1)) {
- return Status::Invalid("Last list offset should be non-null");
- }
+ ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
+ AllocateBuffer(num_offsets * sizeof(offset_type),
pool));
- ARROW_ASSIGN_OR_RAISE(auto clean_offsets,
- AllocateBuffer(num_offsets * sizeof(offset_type),
pool));
+ // Copy valid bits, ignoring the final offset (since for a length N list
array,
+ // we have N + 1 offsets)
+ ARROW_ASSIGN_OR_RAISE(
+ auto clean_validity_buffer,
+ CopyBitmap(pool, offsets.null_bitmap()->data(), offsets.offset(),
num_offsets - 1));
- // Copy valid bits, ignoring the final offset (since for a length N list
array,
- // we have N + 1 offsets)
- ARROW_ASSIGN_OR_RAISE(
- auto clean_validity_buffer,
- offsets.null_bitmap()->CopySlice(0, bit_util::BytesForBits(num_offsets
- 1)));
-
- const offset_type* raw_offsets = typed_offsets.raw_values();
- auto clean_raw_offsets =
- reinterpret_cast<offset_type*>(clean_offsets->mutable_data());
-
- // Must work backwards so we can tell how many values were in the last
non-null value
- offset_type current_offset = raw_offsets[num_offsets - 1];
- for (int64_t i = num_offsets - 1; i >= 0; --i) {
- if (offsets.IsValid(i)) {
- current_offset = raw_offsets[i];
- }
- clean_raw_offsets[i] = current_offset;
- }
+ const offset_type* raw_offsets =
+ checked_cast<const OffsetArrayType&>(offsets).raw_values();
+ auto clean_raw_offsets =
reinterpret_cast<offset_type*>(clean_offsets->mutable_data());
- return BufferVector({std::move(clean_validity_buffer),
std::move(clean_offsets)});
+ // Must work backwards so we can tell how many values were in the last
non-null value
+ offset_type current_offset = raw_offsets[num_offsets - 1];
+ for (int64_t i = num_offsets - 1; i >= 0; --i) {
+ if (offsets.IsValid(i)) {
+ current_offset = raw_offsets[i];
+ }
+ clean_raw_offsets[i] = current_offset;
}
- return BufferVector({validity_buffer, typed_offsets.values()});
+ return BufferVector({std::move(clean_validity_buffer),
std::move(clean_offsets)});
}
template <typename TYPE>
@@ -124,14 +120,21 @@ Result<std::shared_ptr<typename
TypeTraits<TYPE>::ArrayType>> ListArrayFromArray
return Status::NotImplemented("Null bitmap with offsets slice not
supported.");
}
- std::shared_ptr<Buffer> offset_buf, validity_buf;
- ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets<TYPE>(null_bitmap,
offsets, pool));
- int64_t null_count_ = null_bitmap ? null_count : offsets.null_count();
+ // Clean the offsets if they contain nulls.
+ if (offsets.null_count() > 0) {
+ ARROW_ASSIGN_OR_RAISE(auto buffers,
+ CleanListOffsets<TYPE>(null_bitmap, offsets, pool));
+ auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers),
+ {values.data()}, offsets.null_count(),
/*offset=*/0);
+ return std::make_shared<ArrayType>(std::move(data));
+ }
- std::shared_ptr<arrow::ArrayData> internal_data = ArrayData::Make(
- type, offsets.length() - 1, std::move(buffers), null_count_,
offsets.offset());
- internal_data->child_data.push_back(values.data());
- return std::make_shared<ArrayType>(internal_data);
+ using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
+ const auto& typed_offsets = checked_cast<const OffsetArrayType&>(offsets);
+ auto buffers = BufferVector({std::move(null_bitmap),
typed_offsets.values()});
+ auto data = ArrayData::Make(type, offsets.length() - 1, std::move(buffers),
+ {values.data()}, null_count, offsets.offset());
+ return std::make_shared<ArrayType>(std::move(data));
}
static std::shared_ptr<Array> SliceArrayWithOffsets(const Array& array,
int64_t begin,
@@ -374,10 +377,18 @@ Result<std::shared_ptr<Array>>
MapArray::FromArraysInternal(
return Status::Invalid("Map key and item arrays must be equal length");
}
- ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets<MapType>(NULLPTR,
*offsets, pool));
+ if (offsets->null_count() > 0) {
+ ARROW_ASSIGN_OR_RAISE(auto buffers,
+ CleanListOffsets<MapType>(NULLPTR, *offsets, pool));
+ return std::make_shared<MapArray>(type, offsets->length() - 1,
std::move(buffers),
+ keys, items, offsets->null_count(), 0);
+ }
+ using OffsetArrayType = typename TypeTraits<OffsetArrowType>::ArrayType;
+ const auto& typed_offsets = checked_cast<const OffsetArrayType&>(*offsets);
+ auto buffers = BufferVector({nullptr, typed_offsets.values()});
return std::make_shared<MapArray>(type, offsets->length() - 1,
std::move(buffers), keys,
- items, offsets->null_count(),
offsets->offset());
+ items, /*null_count=*/0,
offsets->offset());
}
Result<std::shared_ptr<Array>> MapArray::FromArrays(const
std::shared_ptr<Array>& offsets,