This is an automated email from the ASF dual-hosted git repository. apitrou pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 66d0ad1 ARROW-2315: [C++/Python] Flatten struct array 66d0ad1 is described below commit 66d0ad179ec9b43fc5233c20850aef5d983eab57 Author: Antoine Pitrou <anto...@python.org> AuthorDate: Tue Apr 17 17:10:04 2018 +0200 ARROW-2315: [C++/Python] Flatten struct array Also adds tests for BitmapReader and BitmapWriter. Author: Antoine Pitrou <anto...@python.org> Closes #1755 from pitrou/ARROW-2315-struct-flatten and squashes the following commits: c14d492 <Antoine Pitrou> Address review comments @cpcloud f1ecb99 <Antoine Pitrou> Avoid C++14 binary literals 88b14c6 <Antoine Pitrou> Fix sliced flattening cced334 <Antoine Pitrou> ARROW-1886: Flatten struct array --- cpp/src/arrow/array.cc | 47 ++++++++++++ cpp/src/arrow/array.h | 6 ++ cpp/src/arrow/util/bit-util-test.cc | 144 +++++++++++++++++++++++++++++++++++ cpp/src/arrow/util/bit-util.cc | 54 +++++++++++++ cpp/src/arrow/util/bit-util.h | 5 ++ python/pyarrow/array.pxi | 24 ++++++ python/pyarrow/includes/libarrow.pxd | 2 + python/pyarrow/lib.pxd | 4 + python/pyarrow/tests/test_array.py | 38 +++++++++ 9 files changed, 324 insertions(+) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 80d64c8..c3fb04b 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -372,6 +372,53 @@ std::shared_ptr<Array> StructArray::field(int i) const { return boxed_fields_[i]; } +Status StructArray::Flatten(MemoryPool* pool, ArrayVector* out) const { + ArrayVector flattened; + std::shared_ptr<Buffer> null_bitmap = data_->buffers[0]; + + for (auto& child_data : data_->child_data) { + std::shared_ptr<Buffer> flattened_null_bitmap; + int64_t flattened_null_count = kUnknownNullCount; + + // Need to adjust for parent offset + if (data_->offset != 0 || data_->length != child_data->length) { + child_data = SliceData(*child_data, data_->offset, data_->length); + } + std::shared_ptr<Buffer> child_null_bitmap = child_data->buffers[0]; + const int64_t child_offset = child_data->offset; + + // The validity of a flattened datum is the logical AND of the struct + // element's validity and the individual field element's validity. + if (null_bitmap && child_null_bitmap) { + RETURN_NOT_OK(BitmapAnd(pool, child_null_bitmap->data(), child_offset, + null_bitmap_data_, data_->offset, data_->length, + child_offset, &flattened_null_bitmap)); + } else if (child_null_bitmap) { + flattened_null_bitmap = child_null_bitmap; + flattened_null_count = child_data->null_count; + } else if (null_bitmap) { + if (child_offset == data_->offset) { + flattened_null_bitmap = null_bitmap; + } else { + RETURN_NOT_OK(CopyBitmap(pool, null_bitmap_data_, data_->offset, data_->length, + &flattened_null_bitmap)); + } + flattened_null_count = data_->null_count; + } else { + flattened_null_count = 0; + } + + auto flattened_data = child_data->Copy(); + flattened_data->buffers[0] = flattened_null_bitmap; + flattened_data->null_count = flattened_null_count; + + flattened.push_back(MakeArray(flattened_data)); + } + + *out = flattened; + return Status::OK(); +} + // ---------------------------------------------------------------------- // UnionArray diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 07e7a13..0005c48 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -610,6 +610,12 @@ class ARROW_EXPORT StructArray : public Array { // count adjusted. std::shared_ptr<Array> field(int pos) const; + /// \brief Flatten this array as a vector of arrays, one for each field + /// + /// \param[in] pool The pool to allocate null bitmaps from, if necessary + /// \param[out] out The resulting vector of arrays + Status Flatten(MemoryPool* pool, ArrayVector* out) const; + private: // For caching boxed child data mutable std::vector<std::shared_ptr<Array>> boxed_fields_; diff --git a/cpp/src/arrow/util/bit-util-test.cc b/cpp/src/arrow/util/bit-util-test.cc index 4c64dea..ba4d8a3 100644 --- a/cpp/src/arrow/util/bit-util-test.cc +++ b/cpp/src/arrow/util/bit-util-test.cc @@ -18,6 +18,8 @@ #include <climits> #include <cstdint> #include <cstring> +#include <functional> +#include <initializer_list> #include <limits> #include <memory> #include <vector> @@ -41,6 +43,58 @@ static void EnsureCpuInfoInitialized() { } } +void WriteVectorToWriter(internal::BitmapWriter& writer, const std::vector<int> values) { + for (const auto& value : values) { + if (value) { + writer.Set(); + } else { + writer.Clear(); + } + writer.Next(); + } + writer.Finish(); +} + +void BitmapFromVector(const std::vector<int>& values, int64_t bit_offset, + std::shared_ptr<Buffer>* out_buffer, int64_t* out_length) { + const int64_t length = values.size(); + *out_length = length; + ASSERT_OK(GetEmptyBitmap(default_memory_pool(), length + bit_offset, out_buffer)); + auto writer = internal::BitmapWriter((*out_buffer)->mutable_data(), bit_offset, length); + WriteVectorToWriter(writer, values); +} + +#define ASSERT_READER_SET(reader) \ + do { \ + ASSERT_TRUE(reader.IsSet()); \ + ASSERT_FALSE(reader.IsNotSet()); \ + reader.Next(); \ + } while (false) + +#define ASSERT_READER_NOT_SET(reader) \ + do { \ + ASSERT_FALSE(reader.IsSet()); \ + ASSERT_TRUE(reader.IsNotSet()); \ + reader.Next(); \ + } while (false) + +// Assert that a BitmapReader yields the given bit values +void ASSERT_READER_VALUES(internal::BitmapReader& reader, std::vector<int> values) { + for (const auto& value : values) { + if (value) { + ASSERT_READER_SET(reader); + } else { + ASSERT_READER_NOT_SET(reader); + } + } +} + +// Assert equal contents of a memory area and a vector of bytes +void ASSERT_BYTES_EQ(const uint8_t* left, const std::vector<uint8_t>& right) { + auto left_array = std::vector<uint8_t>(left, left + right.size()); + ASSERT_EQ(std::vector<uint8_t>(std::begin(left_array), std::end(left_array)), right); +} + TEST(BitUtilTests, TestIsMultipleOf64) { using BitUtil::IsMultipleOf64; EXPECT_TRUE(IsMultipleOf64(64)); @@ -71,6 +125,20 @@ TEST(BitUtilTests, TestNextPower2) { ASSERT_EQ(1LL << 62, NextPower2((1LL << 62) - 1)); } +TEST(BitmapReader, NormalOperation) { + std::shared_ptr<Buffer> buffer; + int64_t length; + + for (int64_t offset : {0, 1, 3, 5, 7, 8, 12, 13, 21, 38, 75, 120}) { + BitmapFromVector({0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1}, offset, &buffer, + &length); + ASSERT_EQ(length, 14); + + auto reader = internal::BitmapReader(buffer->mutable_data(), offset, length); + ASSERT_READER_VALUES(reader, {0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1}); + } +} + TEST(BitmapReader, DoesNotReadOutOfBounds) { uint8_t bitmap[16] = {0}; @@ -95,6 +163,37 @@ TEST(BitmapReader, DoesNotReadOutOfBounds) { internal::BitmapReader r3(nullptr, 0, 0); } +TEST(BitmapWriter, NormalOperation) { + { + uint8_t bitmap[] = {0, 0, 0, 0}; + auto writer = internal::BitmapWriter(bitmap, 0, 12); + WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); + // {0b00110110, 0b1010, 0, 0} + ASSERT_BYTES_EQ(bitmap, {0x36, 0x0a, 0, 0}); + } + { + uint8_t bitmap[] = {0xff, 0xff, 0xff, 0xff}; + auto writer = internal::BitmapWriter(bitmap, 0, 12); + WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); + // {0b00110110, 0b11111010, 0xff, 0xff} + ASSERT_BYTES_EQ(bitmap, {0x36, 0xfa, 0xff, 0xff}); + } + { + uint8_t bitmap[] = {0, 0, 0, 0}; + auto writer = internal::BitmapWriter(bitmap, 3, 12); + WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); + // {0b10110000, 0b01010001, 0, 0} + ASSERT_BYTES_EQ(bitmap, {0xb0, 0x51, 0, 0}); + } + { + uint8_t bitmap[] = {0, 0, 0, 0}; + auto writer = internal::BitmapWriter(bitmap, 20, 12); + WriteVectorToWriter(writer, {0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1}); + // {0, 0, 0b01100000, 0b10100011} + ASSERT_BYTES_EQ(bitmap, {0, 0, 0x60, 0xa3}); + } +} + TEST(BitmapWriter, DoesNotWriteOutOfBounds) { uint8_t bitmap[16] = {0}; @@ -128,6 +227,51 @@ TEST(BitmapWriter, DoesNotWriteOutOfBounds) { ASSERT_EQ((length - 5), num_values); } +TEST(BitmapAnd, Aligned) { + std::shared_ptr<Buffer> left, right, out; + int64_t length; + + for (int64_t left_offset : {0, 1, 3, 5, 7, 8, 13, 21, 38, 75, 120}) { + BitmapFromVector({0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1}, left_offset, &left, + &length); + for (int64_t right_offset : {left_offset, left_offset + 8, left_offset + 40}) { + BitmapFromVector({0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0}, right_offset, &right, + &length); + for (int64_t out_offset : {left_offset, left_offset + 16, left_offset + 24}) { + ASSERT_OK(BitmapAnd(default_memory_pool(), left->mutable_data(), left_offset, + right->mutable_data(), right_offset, length, out_offset, + &out)); + auto reader = internal::BitmapReader(out->mutable_data(), out_offset, length); + ASSERT_READER_VALUES(reader, {0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}); + } + } + } +} + +TEST(BitmapAnd, Unaligned) { + std::shared_ptr<Buffer> left, right, out; + int64_t length; + auto offset_values = {0, 1, 3, 5, 7, 8, 13, 21, 38, 75, 120}; + + for (int64_t left_offset : offset_values) { + BitmapFromVector({0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1}, left_offset, &left, + &length); + + for (int64_t right_offset : offset_values) { + BitmapFromVector({0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0}, right_offset, &right, + &length); + + for (int64_t out_offset : offset_values) { + ASSERT_OK(BitmapAnd(default_memory_pool(), left->mutable_data(), left_offset, + right->mutable_data(), right_offset, length, out_offset, + &out)); + auto reader = internal::BitmapReader(out->mutable_data(), out_offset, length); + ASSERT_READER_VALUES(reader, {0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0}); + } + } + } +} + static inline int64_t SlowCountBits(const uint8_t* data, int64_t bit_offset, int64_t length) { int64_t count = 0; diff --git a/cpp/src/arrow/util/bit-util.cc b/cpp/src/arrow/util/bit-util.cc index c77f0d0..d3b6bb3 100644 --- a/cpp/src/arrow/util/bit-util.cc +++ b/cpp/src/arrow/util/bit-util.cc @@ -172,4 +172,58 @@ bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right return true; } +namespace { + +void AlignedBitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right, + int64_t right_offset, uint8_t* out, int64_t out_offset, + int64_t length) { + DCHECK_EQ(left_offset % 8, right_offset % 8); + DCHECK_EQ(left_offset % 8, out_offset % 8); + + const int64_t nbytes = BitUtil::BytesForBits(length + left_offset); + left += left_offset / 8; + right += right_offset / 8; + out += out_offset / 8; + for (int64_t i = 0; i < nbytes; ++i) { + out[i] = left[i] & right[i]; + } +} + +void UnalignedBitmapAnd(const uint8_t* left, int64_t left_offset, const uint8_t* right, + int64_t right_offset, uint8_t* out, int64_t out_offset, + int64_t length) { + auto left_reader = internal::BitmapReader(left, left_offset, length); + auto right_reader = internal::BitmapReader(right, right_offset, length); + auto writer = internal::BitmapWriter(out, out_offset, length); + for (int64_t i = 0; i < length; ++i) { + if (left_reader.IsSet() && right_reader.IsSet()) { + writer.Set(); + } + left_reader.Next(); + right_reader.Next(); + writer.Next(); + } + writer.Finish(); +} + +} // namespace + +Status BitmapAnd(MemoryPool* pool, const uint8_t* left, int64_t left_offset, + const uint8_t* right, int64_t right_offset, int64_t length, + int64_t out_offset, std::shared_ptr<Buffer>* out_buffer) { + if ((out_offset % 8 == left_offset % 8) && (out_offset % 8 == right_offset % 8)) { + // Fast case: can use bytewise AND + const int64_t phys_bits = length + out_offset; + RETURN_NOT_OK(GetEmptyBitmap(pool, phys_bits, out_buffer)); + AlignedBitmapAnd(left, left_offset, right, right_offset, + (*out_buffer)->mutable_data(), out_offset, length); + } else { + // Unaligned + RETURN_NOT_OK(GetEmptyBitmap(pool, length + out_offset, out_buffer)); + UnalignedBitmapAnd(left, left_offset, right, right_offset, + (*out_buffer)->mutable_data(), out_offset, length); + } + return Status::OK(); +} + } // namespace arrow diff --git a/cpp/src/arrow/util/bit-util.h b/cpp/src/arrow/util/bit-util.h index 6649d45..5f171a0 100644 --- a/cpp/src/arrow/util/bit-util.h +++ b/cpp/src/arrow/util/bit-util.h @@ -531,6 +531,11 @@ ARROW_EXPORT bool BitmapEquals(const uint8_t* left, int64_t left_offset, const uint8_t* right, int64_t right_offset, int64_t bit_length); +ARROW_EXPORT +Status BitmapAnd(MemoryPool* pool, const uint8_t* left, int64_t left_offset, + const uint8_t* right, int64_t right_offset, int64_t length, + int64_t out_offset, std::shared_ptr<Buffer>* out_buffer); + } // namespace arrow #endif // ARROW_UTIL_BIT_UTIL_H diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 490a37b..c79157d 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -1013,6 +1013,30 @@ cdef class DictionaryArray(Array): cdef class StructArray(Array): + def flatten(self, MemoryPool memory_pool=None): + """ + Flatten this StructArray, returning one individual array for each + field in the struct. + + Parameters + ---------- + memory_pool : MemoryPool, default None + For memory allocations, if required, otherwise use default pool + + Returns + ------- + result : List[Array] + """ + cdef: + vector[shared_ptr[CArray]] arrays + CMemoryPool* pool = maybe_unbox_memory_pool(memory_pool) + CStructArray* sarr = <CStructArray*> self.ap + + with nogil: + check_status(sarr.Flatten(pool, &arrays)) + + return [pyarrow_wrap_array(arr) for arr in arrays] + @staticmethod def from_arrays(arrays, names=None): """ diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 8654c9c..70eb9cb 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -399,6 +399,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] field(int pos) + CStatus Flatten(CMemoryPool* pool, vector[shared_ptr[CArray]]* out) + CStatus ValidateArray(const CArray& array) cdef cppclass CChunkedArray" arrow::ChunkedArray": diff --git a/python/pyarrow/lib.pxd b/python/pyarrow/lib.pxd index 6f4100f..998eeaf 100644 --- a/python/pyarrow/lib.pxd +++ b/python/pyarrow/lib.pxd @@ -254,6 +254,10 @@ cdef class Decimal128Array(FixedSizeBinaryArray): pass +cdef class StructArray(Array): + pass + + cdef class ListArray(Array): pass diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 4a441fb..dbba508 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -778,3 +778,41 @@ def test_invalid_tensor_operation(): t = pa.Tensor() with pytest.raises(TypeError): t.to_numpy() + + +def test_struct_array_flatten(): + ty = pa.struct([pa.field('x', pa.int16()), + pa.field('y', pa.float32())]) + a = pa.array([(1, 2.5), (3, 4.5), (5, 6.5)], type=ty) + xs, ys = a.flatten() + assert xs.type == pa.int16() + assert ys.type == pa.float32() + assert xs.to_pylist() == [1, 3, 5] + assert ys.to_pylist() == [2.5, 4.5, 6.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [3, 5] + assert ys.to_pylist() == [4.5, 6.5] + + a = pa.array([(1, 2.5), None, (3, 4.5)], type=ty) + xs, ys = a.flatten() + assert xs.to_pylist() == [1, None, 3] + assert ys.to_pylist() == [2.5, None, 4.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [None, 3] + assert ys.to_pylist() == [None, 4.5] + + a = pa.array([(1, None), (2, 3.5), (None, 4.5)], type=ty) + xs, ys = a.flatten() + assert xs.to_pylist() == [1, 2, None] + assert ys.to_pylist() == [None, 3.5, 4.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [2, None] + assert ys.to_pylist() == [3.5, 4.5] + + a = pa.array([(1, None), None, (None, 2.5)], type=ty) + xs, ys = a.flatten() + assert xs.to_pylist() == [1, None, None] + assert ys.to_pylist() == [None, None, 2.5] + xs, ys = a[1:].flatten() + assert xs.to_pylist() == [None, None] + assert ys.to_pylist() == [None, 2.5] -- To stop receiving notification emails like this one, please contact apit...@apache.org.