Repository: arrow Updated Branches: refs/heads/master 0396240b5 -> 1541a08c7
ARROW-1177: [C++] Check for int32 offset overflow in ListBuilder, BinaryBuilder I also refactored BinaryBuilder to not inherit from ListBuilder, which is a bit cleaner. I added a draft of ARROW-507; it needs a unit test and to handle the case where some passed offsets are null (so they need to be sanitized) Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #853 from wesm/ARROW-1177 and squashes the following commits: f6be04f [Wes McKinney] Fix DCHECKs in ListBuilder, BinaryBuilder 28f17ab [Wes McKinney] Use binary strings for py2.7 c9e7502 [Wes McKinney] Fix some off-by-one errors 5a8be84 [Wes McKinney] Fix another warning 23adefc [Wes McKinney] Fix compiler warning 35ab4f2 [Wes McKinney] Refactoring BinaryBuilder. Add check for int32 offset overflow for List, Binary, String. Add basic ListArray::FromArrays method, add Python binding Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/1541a08c Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/1541a08c Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/1541a08c Branch: refs/heads/master Commit: 1541a08c795f03b2ce5884a00fc83d1fb79a448e Parents: 0396240 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Mon Jul 17 18:32:30 2017 +0200 Committer: Uwe L. Korn <uw...@xhochy.com> Committed: Mon Jul 17 18:32:30 2017 +0200 ---------------------------------------------------------------------- cpp/src/arrow/array-test.cc | 11 +- cpp/src/arrow/array.cc | 26 ++++ cpp/src/arrow/array.h | 13 ++ cpp/src/arrow/buffer.h | 73 ++++++----- cpp/src/arrow/builder.cc | 168 ++++++++++++++++---------- cpp/src/arrow/builder.h | 60 ++++++--- cpp/src/arrow/ipc/ipc-json-test.cc | 4 +- cpp/src/arrow/ipc/ipc-read-write-test.cc | 5 +- cpp/src/arrow/ipc/test-common.h | 21 +++- cpp/src/arrow/test-util.h | 9 +- python/pyarrow/array.pxi | 36 +++++- python/pyarrow/includes/libarrow.pxd | 6 + python/pyarrow/tests/test_array.py | 12 ++ 13 files changed, 313 insertions(+), 131 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 01bef03..acb4819 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -71,13 +71,14 @@ Status MakeArrayFromValidBytes( std::shared_ptr<Buffer> null_buf; RETURN_NOT_OK(BitUtil::BytesToBits(v, &null_buf)); - BufferBuilder value_builder(pool); + TypedBufferBuilder<int32_t> value_builder(pool); for (size_t i = 0; i < v.size(); ++i) { - RETURN_NOT_OK(value_builder.Append<int32_t>(0)); + RETURN_NOT_OK(value_builder.Append(0)); } - *out = std::make_shared<Int32Array>( - v.size(), value_builder.Finish(), null_buf, null_count); + std::shared_ptr<Buffer> values; + RETURN_NOT_OK(value_builder.Finish(&values)); + *out = std::make_shared<Int32Array>(v.size(), values, null_buf, null_count); return Status::OK(); } @@ -996,7 +997,7 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) { vector<uint8_t> is_null = {0, 0, 0, 1, 0}; int N = static_cast<int>(strings.size()); - int reps = 1000; + int reps = 10; for (int j = 0; j < reps; ++j) { for (int i = 0; i < N; ++i) { http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 48a3bd5..f20b849 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -190,6 +190,32 @@ ListArray::ListArray(const std::shared_ptr<DataType>& type, int64_t length, SetData(internal_data); } +Status ListArray::FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr<Array>* out) { + if (ARROW_PREDICT_FALSE(offsets.length() == 0)) { + return Status::Invalid("List offsets must have non-zero length"); + } + + if (ARROW_PREDICT_FALSE(offsets.null_count() > 0)) { + return Status::Invalid("Null offsets in ListArray::FromArrays not yet implemented"); + } + + if (ARROW_PREDICT_FALSE(offsets.type_id() != Type::INT32)) { + return Status::Invalid("List offsets must be signed int32"); + } + + BufferVector buffers = { + offsets.null_bitmap(), static_cast<const Int32Array&>(offsets).values()}; + + auto list_type = list(values.type()); + auto internal_data = std::make_shared<internal::ArrayData>(list_type, + offsets.length() - 1, std::move(buffers), offsets.null_count(), offsets.offset()); + internal_data->child_data.push_back(values.data()); + + *out = std::make_shared<ListArray>(internal_data); + return Status::OK(); +} + void ListArray::SetData(const std::shared_ptr<ArrayData>& data) { this->Array::SetData(data); auto value_offsets = data->buffers[1]; http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 80284cd..2da0b54 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -371,6 +371,19 @@ class ARROW_EXPORT ListArray : public Array { const std::shared_ptr<Buffer>& null_bitmap = nullptr, int64_t null_count = 0, int64_t offset = 0); + /// \brief Construct ListArray from array of offsets and child value array + /// + /// Note: does not validate input beyond sanity checks. Use + /// arrow::ValidateArray if you need stronger validation of inputs + /// + /// \param[in] offsets Array containing n + 1 offsets encoding length and size + /// \param[in] values Array containing + /// \param[in] pool MemoryPool in case new offsets array needs to be + /// allocated because of null values + /// \param[out] out Will have length equal to offsets.length() - 1 + static Status FromArrays(const Array& offsets, const Array& values, MemoryPool* pool, + std::shared_ptr<Array>* out); + /// \brief Return array object containing the list's values std::shared_ptr<Array> values() const; http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/buffer.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index b117b24..488a4c0 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -212,58 +212,71 @@ class ARROW_EXPORT BufferBuilder { return Status::OK(); } - template <typename T> + // Unsafe methods don't check existing size + void UnsafeAppend(const uint8_t* data, int64_t length) { + memcpy(data_ + size_, data, static_cast<size_t>(length)); + size_ += length; + } + + Status Finish(std::shared_ptr<Buffer>* out) { + // Do not shrink to fit to avoid unneeded realloc + if (size_ > 0) { RETURN_NOT_OK(buffer_->Resize(size_, false)); } + *out = buffer_; + Reset(); + return Status::OK(); + } + + void Reset() { + buffer_ = nullptr; + capacity_ = size_ = 0; + } + + int64_t capacity() const { return capacity_; } + int64_t length() const { return size_; } + const uint8_t* data() const { return data_; } + + protected: + std::shared_ptr<PoolBuffer> buffer_; + MemoryPool* pool_; + uint8_t* data_; + int64_t capacity_; + int64_t size_; +}; + +template <typename T> +class ARROW_EXPORT TypedBufferBuilder : public BufferBuilder { + public: + explicit TypedBufferBuilder(MemoryPool* pool) : BufferBuilder(pool) {} + Status Append(T arithmetic_value) { static_assert(std::is_arithmetic<T>::value, "Convenience buffer append only supports arithmetic types"); - return Append(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T)); + return BufferBuilder::Append( + reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T)); } - template <typename T> Status Append(const T* arithmetic_values, int64_t num_elements) { static_assert(std::is_arithmetic<T>::value, "Convenience buffer append only supports arithmetic types"); - return Append( + return BufferBuilder::Append( reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T)); } - // Unsafe methods don't check existing size - void UnsafeAppend(const uint8_t* data, int64_t length) { - memcpy(data_ + size_, data, static_cast<size_t>(length)); - size_ += length; - } - - template <typename T> void UnsafeAppend(T arithmetic_value) { static_assert(std::is_arithmetic<T>::value, "Convenience buffer append only supports arithmetic types"); - UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T)); + BufferBuilder::UnsafeAppend(reinterpret_cast<uint8_t*>(&arithmetic_value), sizeof(T)); } - template <typename T> void UnsafeAppend(const T* arithmetic_values, int64_t num_elements) { static_assert(std::is_arithmetic<T>::value, "Convenience buffer append only supports arithmetic types"); - UnsafeAppend( + BufferBuilder::UnsafeAppend( reinterpret_cast<const uint8_t*>(arithmetic_values), num_elements * sizeof(T)); } - std::shared_ptr<Buffer> Finish() { - auto result = buffer_; - buffer_ = nullptr; - capacity_ = size_ = 0; - return result; - } - int64_t capacity() const { return capacity_; } - int64_t length() const { return size_; } - const uint8_t* data() const { return data_; } - - private: - std::shared_ptr<PoolBuffer> buffer_; - MemoryPool* pool_; - uint8_t* data_; - int64_t capacity_; - int64_t size_; + const T* data() const { return reinterpret_cast<const T*>(data_); } + int64_t length() const { return size_ / sizeof(T); } }; /// Allocate a new mutable buffer from a memory pool http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 885c650..a2f24a7 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -21,6 +21,8 @@ #include <cstdint> #include <cstring> #include <limits> +#include <sstream> +#include <vector> #include "arrow/array.h" #include "arrow/buffer.h" @@ -99,21 +101,17 @@ Status ArrayBuilder::Reserve(int64_t elements) { return Status::OK(); } +void ArrayBuilder::Reset() { + capacity_ = length_ = null_count_ = 0; + null_bitmap_ = nullptr; +} + Status ArrayBuilder::SetNotNull(int64_t length) { RETURN_NOT_OK(Reserve(length)); UnsafeSetNotNull(length); return Status::OK(); } -void ArrayBuilder::UnsafeAppendToBitmap(bool is_valid) { - if (is_valid) { - BitUtil::SetBit(null_bitmap_data_, length_); - } else { - ++null_count_; - } - ++length_; -} - void ArrayBuilder::UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length) { if (valid_bytes == nullptr) { UnsafeSetNotNull(length); @@ -971,7 +969,8 @@ Status DecimalBuilder::Resize(int64_t capacity) { } Status DecimalBuilder::Finish(std::shared_ptr<Array>* out) { - std::shared_ptr<Buffer> data = byte_builder_.Finish(); + std::shared_ptr<Buffer> data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); /// TODO(phillipc): not sure where to get the offset argument here *out = std::make_shared<DecimalArray>( @@ -987,65 +986,66 @@ ListBuilder::ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_b : ArrayBuilder(pool, type ? type : std::static_pointer_cast<DataType>( std::make_shared<ListType>(value_builder->type()))), - offset_builder_(pool), + offsets_builder_(pool), value_builder_(std::move(value_builder)) {} -ListBuilder::ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values, - const std::shared_ptr<DataType>& type) - : ArrayBuilder(pool, - type ? type : std::static_pointer_cast<DataType>( - std::make_shared<ListType>(values->type()))), - offset_builder_(pool), - values_(values) {} - Status ListBuilder::Append( const int32_t* offsets, int64_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); UnsafeAppendToBitmap(valid_bytes, length); - offset_builder_.UnsafeAppend<int32_t>(offsets, length); + offsets_builder_.UnsafeAppend(offsets, length); return Status::OK(); } +Status ListBuilder::AppendNextOffset() { + int64_t num_values = value_builder_->length(); + if (ARROW_PREDICT_FALSE(num_values >= std::numeric_limits<int32_t>::max())) { + std::stringstream ss; + ss << "ListArray cannot contain more then INT32_MAX - 1 child elements," + << " have " << num_values; + return Status::Invalid(ss.str()); + } + return offsets_builder_.Append(static_cast<int32_t>(num_values)); +} + Status ListBuilder::Append(bool is_valid) { RETURN_NOT_OK(Reserve(1)); UnsafeAppendToBitmap(is_valid); - RETURN_NOT_OK( - offset_builder_.Append<int32_t>(static_cast<int32_t>(value_builder_->length()))); - return Status::OK(); + return AppendNextOffset(); } Status ListBuilder::Init(int64_t elements) { - DCHECK_LT(elements, std::numeric_limits<int64_t>::max()); + DCHECK_LT(elements, std::numeric_limits<int32_t>::max()); RETURN_NOT_OK(ArrayBuilder::Init(elements)); // one more then requested for offsets - return offset_builder_.Resize((elements + 1) * sizeof(int64_t)); + return offsets_builder_.Resize((elements + 1) * sizeof(int64_t)); } Status ListBuilder::Resize(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits<int64_t>::max()); + DCHECK_LT(capacity, std::numeric_limits<int32_t>::max()); // one more then requested for offsets - RETURN_NOT_OK(offset_builder_.Resize((capacity + 1) * sizeof(int64_t))); + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); return ArrayBuilder::Resize(capacity); } Status ListBuilder::Finish(std::shared_ptr<Array>* out) { + RETURN_NOT_OK(AppendNextOffset()); + + std::shared_ptr<Buffer> offsets; + RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + std::shared_ptr<Array> items = values_; if (!items) { RETURN_NOT_OK(value_builder_->Finish(&items)); } - RETURN_NOT_OK(offset_builder_.Append<int64_t>(items->length())); - std::shared_ptr<Buffer> offsets = offset_builder_.Finish(); - *out = std::make_shared<ListArray>( type_, length_, offsets, items, null_bitmap_, null_count_); Reset(); - return Status::OK(); } void ListBuilder::Reset() { - capacity_ = length_ = null_count_ = 0; - null_bitmap_ = nullptr; + ArrayBuilder::Reset(); values_ = nullptr; } @@ -1057,52 +1057,97 @@ ArrayBuilder* ListBuilder::value_builder() const { // ---------------------------------------------------------------------- // String and binary -BinaryBuilder::BinaryBuilder(MemoryPool* pool) - : ListBuilder(pool, std::unique_ptr<ArrayBuilder>(new UInt8Builder(pool, uint8())), - binary()) { - byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get()); +BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type) + : ArrayBuilder(pool, type), offsets_builder_(pool), value_data_builder_(pool) {} + +BinaryBuilder::BinaryBuilder(MemoryPool* pool) : BinaryBuilder(pool, binary()) {} + +Status BinaryBuilder::Init(int64_t elements) { + DCHECK_LT(elements, std::numeric_limits<int32_t>::max()); + RETURN_NOT_OK(ArrayBuilder::Init(elements)); + // one more then requested for offsets + return offsets_builder_.Resize((elements + 1) * sizeof(int64_t)); +} + +Status BinaryBuilder::Resize(int64_t capacity) { + DCHECK_LT(capacity, std::numeric_limits<int32_t>::max()); + // one more then requested for offsets + RETURN_NOT_OK(offsets_builder_.Resize((capacity + 1) * sizeof(int64_t))); + return ArrayBuilder::Resize(capacity); } -BinaryBuilder::BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type) - : ListBuilder( - pool, std::unique_ptr<ArrayBuilder>(new UInt8Builder(pool, uint8())), type) { - byte_builder_ = static_cast<UInt8Builder*>(value_builder_.get()); +Status BinaryBuilder::AppendNextOffset() { + const int64_t num_bytes = value_data_builder_.length(); + if (ARROW_PREDICT_FALSE(num_bytes > kMaximumCapacity)) { + std::stringstream ss; + ss << "BinaryArray cannot contain more than " << kMaximumCapacity << " bytes, have " + << num_bytes; + return Status::Invalid(ss.str()); + } + return offsets_builder_.Append(static_cast<int32_t>(num_bytes)); } -Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) { - std::shared_ptr<Array> result; - RETURN_NOT_OK(ListBuilder::Finish(&result)); +Status BinaryBuilder::Append(const uint8_t* value, int32_t length) { + RETURN_NOT_OK(Reserve(1)); + RETURN_NOT_OK(AppendNextOffset()); + RETURN_NOT_OK(value_data_builder_.Append(value, length)); + UnsafeAppendToBitmap(true); + return Status::OK(); +} - const auto list = std::dynamic_pointer_cast<ListArray>(result); - auto values = std::dynamic_pointer_cast<UInt8Array>(list->values()); +Status BinaryBuilder::AppendNull() { + RETURN_NOT_OK(AppendNextOffset()); + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); +} + +Status BinaryBuilder::FinishInternal(std::shared_ptr<internal::ArrayData>* out) { + // Write final offset (values length) + RETURN_NOT_OK(AppendNextOffset()); + std::shared_ptr<Buffer> offsets, value_data; - *out = std::make_shared<BinaryArray>(list->length(), list->value_offsets(), - values->values(), list->null_bitmap(), list->null_count()); + RETURN_NOT_OK(offsets_builder_.Finish(&offsets)); + RETURN_NOT_OK(value_data_builder_.Finish(&value_data)); + + BufferVector buffers = {null_bitmap_, offsets, value_data}; + *out = std::make_shared<internal::ArrayData>( + type_, length_, std::move(buffers), null_count_, 0); return Status::OK(); } +Status BinaryBuilder::Finish(std::shared_ptr<Array>* out) { + std::shared_ptr<internal::ArrayData> data; + RETURN_NOT_OK(FinishInternal(&data)); + *out = std::make_shared<BinaryArray>(data); + Reset(); + return Status::OK(); +} + +void BinaryBuilder::Reset() { + ArrayBuilder::Reset(); + offsets_builder_.Reset(); + value_data_builder_.Reset(); +} + const uint8_t* BinaryBuilder::GetValue(int64_t i, int32_t* out_length) const { - const int32_t* offsets = reinterpret_cast<const int32_t*>(offset_builder_.data()); + const int32_t* offsets = offsets_builder_.data(); int32_t offset = offsets[i]; if (i == (length_ - 1)) { - *out_length = static_cast<int32_t>(value_builder_->length()) - offset; + *out_length = static_cast<int32_t>(value_data_builder_.length()) - offset; } else { *out_length = offsets[i + 1] - offset; } - return byte_builder_->data()->data() + offset; + return value_data_builder_.data() + offset; } StringBuilder::StringBuilder(MemoryPool* pool) : BinaryBuilder(pool, utf8()) {} Status StringBuilder::Finish(std::shared_ptr<Array>* out) { - std::shared_ptr<Array> result; - RETURN_NOT_OK(ListBuilder::Finish(&result)); - - const auto list = std::dynamic_pointer_cast<ListArray>(result); - auto values = std::dynamic_pointer_cast<UInt8Array>(list->values()); - - *out = std::make_shared<StringArray>(list->length(), list->value_offsets(), - values->values(), list->null_bitmap(), list->null_count()); + std::shared_ptr<internal::ArrayData> data; + RETURN_NOT_OK(FinishInternal(&data)); + *out = std::make_shared<StringArray>(data); + Reset(); return Status::OK(); } @@ -1139,19 +1184,18 @@ Status FixedSizeBinaryBuilder::AppendNull() { } Status FixedSizeBinaryBuilder::Init(int64_t elements) { - DCHECK_LT(elements, std::numeric_limits<int64_t>::max()); RETURN_NOT_OK(ArrayBuilder::Init(elements)); return byte_builder_.Resize(elements * byte_width_); } Status FixedSizeBinaryBuilder::Resize(int64_t capacity) { - DCHECK_LT(capacity, std::numeric_limits<int64_t>::max()); RETURN_NOT_OK(byte_builder_.Resize(capacity * byte_width_)); return ArrayBuilder::Resize(capacity); } Status FixedSizeBinaryBuilder::Finish(std::shared_ptr<Array>* out) { - std::shared_ptr<Buffer> data = byte_builder_.Finish(); + std::shared_ptr<Buffer> data; + RETURN_NOT_OK(byte_builder_.Finish(&data)); *out = std::make_shared<FixedSizeBinaryArray>( type_, length_, data, null_bitmap_, null_count_); return Status::OK(); http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index c0a075b..6b54c9f 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -38,6 +38,12 @@ namespace arrow { class Array; +namespace internal { + +struct ArrayData; + +} // namespace internal + namespace decimal { template <typename T> @@ -127,12 +133,20 @@ class ARROW_EXPORT ArrayBuilder { // Child value array builders. These are owned by this class std::vector<std::unique_ptr<ArrayBuilder>> children_; - // + void Reset(); + // Unsafe operations (don't check capacity/don't resize) - // // Append to null bitmap. - void UnsafeAppendToBitmap(bool is_valid); + void UnsafeAppendToBitmap(bool is_valid) { + if (is_valid) { + BitUtil::SetBit(null_bitmap_data_, length_); + } else { + ++null_count_; + } + ++length_; + } + // Vector append. Treat each zero byte as a nullzero. If valid_bytes is null // assume all of length bits are valid. void UnsafeAppendToBitmap(const uint8_t* valid_bytes, int64_t length); @@ -494,7 +508,8 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { // ---------------------------------------------------------------------- // List builder -/// Builder class for variable-length list array value types +/// \class ListBuilder +/// \brief Builder class for variable-length list array value types /// /// To use this class, you must append values to the child array builder and use /// the Append function to delimit each distinct list value (once the values @@ -513,22 +528,18 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { ListBuilder(MemoryPool* pool, std::unique_ptr<ArrayBuilder> value_builder, const std::shared_ptr<DataType>& type = nullptr); - /// Use this constructor to build the list with a pre-existing values array - ListBuilder(MemoryPool* pool, std::shared_ptr<Array> values, - const std::shared_ptr<DataType>& type = nullptr); - Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; Status Finish(std::shared_ptr<Array>* out) override; - /// Vector append + /// \brief Vector append /// /// If passed, valid_bytes is of equal length to values, and any zero byte /// will be considered as a null for that slot Status Append( const int32_t* offsets, int64_t length, const uint8_t* valid_bytes = nullptr); - /// Start a new variable-length list slot + /// \brief Start a new variable-length list slot /// /// This function should be called before beginning to append elements to the /// value builder @@ -539,25 +550,26 @@ class ARROW_EXPORT ListBuilder : public ArrayBuilder { ArrayBuilder* value_builder() const; protected: - BufferBuilder offset_builder_; + TypedBufferBuilder<int32_t> offsets_builder_; std::unique_ptr<ArrayBuilder> value_builder_; std::shared_ptr<Array> values_; + Status AppendNextOffset(); + void Reset(); }; // ---------------------------------------------------------------------- // Binary and String -class ARROW_EXPORT BinaryBuilder : public ListBuilder { +/// \class BinaryBuilder +/// \brief Builder class for variable-length binary data +class ARROW_EXPORT BinaryBuilder : public ArrayBuilder { public: explicit BinaryBuilder(MemoryPool* pool); explicit BinaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type); - Status Append(const uint8_t* value, int32_t length) { - RETURN_NOT_OK(ListBuilder::Append()); - return byte_builder_->Append(value, length); - } + Status Append(const uint8_t* value, int32_t length); Status Append(const char* value, int32_t length) { return Append(reinterpret_cast<const uint8_t*>(value), length); @@ -567,6 +579,10 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { return Append(value.c_str(), static_cast<int32_t>(value.size())); } + Status AppendNull(); + + Status Init(int64_t elements) override; + Status Resize(int64_t capacity) override; Status Finish(std::shared_ptr<Array>* out) override; /// Temporary access to a value. @@ -575,10 +591,18 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { const uint8_t* GetValue(int64_t i, int32_t* out_length) const; protected: - UInt8Builder* byte_builder_; + TypedBufferBuilder<int32_t> offsets_builder_; + TypedBufferBuilder<uint8_t> value_data_builder_; + + static constexpr int64_t kMaximumCapacity = std::numeric_limits<int32_t>::max() - 1; + + Status AppendNextOffset(); + Status FinishInternal(std::shared_ptr<internal::ArrayData>* out); + void Reset(); }; -// String builder +/// \class StringBuilder +/// \brief Builder class for UTF8 strings class ARROW_EXPORT StringBuilder : public BinaryBuilder { public: using BinaryBuilder::BinaryBuilder; http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/ipc-json-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 318e318..79344df 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -169,7 +169,7 @@ TEST(TestJsonArrayWriter, NestedTypes) { std::vector<int32_t> offsets = {0, 0, 0, 1, 4, 7}; std::shared_ptr<Buffer> list_bitmap; - ASSERT_OK(test::GetBitmapFromBoolVector(list_is_valid, &list_bitmap)); + ASSERT_OK(test::GetBitmapFromVector(list_is_valid, &list_bitmap)); std::shared_ptr<Buffer> offsets_buffer = test::GetBufferFromVector(offsets); ListArray list_array(list(value_type), 5, offsets_buffer, values_array, list_bitmap, 1); @@ -179,7 +179,7 @@ TEST(TestJsonArrayWriter, NestedTypes) { // Struct std::vector<bool> struct_is_valid = {true, false, true, true, true, false, true}; std::shared_ptr<Buffer> struct_bitmap; - ASSERT_OK(test::GetBitmapFromBoolVector(struct_is_valid, &struct_bitmap)); + ASSERT_OK(test::GetBitmapFromVector(struct_is_valid, &struct_bitmap)); auto struct_type = struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())}); http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/ipc-read-write-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/ipc-read-write-test.cc b/cpp/src/arrow/ipc/ipc-read-write-test.cc index 42f14b0..2119ff7 100644 --- a/cpp/src/arrow/ipc/ipc-read-write-test.cc +++ b/cpp/src/arrow/ipc/ipc-read-write-test.cc @@ -343,7 +343,7 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { auto union_type = union_({field("f0", a0->type())}, {0}); std::vector<int32_t> type_ids(a0->length()); std::shared_ptr<Buffer> ids_buffer; - ASSERT_OK(test::CopyBufferFromVector(type_ids, &ids_buffer)); + ASSERT_OK(test::CopyBufferFromVector(type_ids, default_memory_pool(), &ids_buffer)); a1 = std::make_shared<UnionArray>(union_type, a0->length(), struct_children, ids_buffer); CheckArray(a1); @@ -355,7 +355,8 @@ TEST_F(TestWriteRecordBatch, SliceTruncatesBuffers) { type_offsets.push_back(i); } std::shared_ptr<Buffer> offsets_buffer; - ASSERT_OK(test::CopyBufferFromVector(type_offsets, &offsets_buffer)); + ASSERT_OK( + test::CopyBufferFromVector(type_offsets, default_memory_pool(), &offsets_buffer)); a1 = std::make_shared<UnionArray>( dense_union_type, a0->length(), struct_children, ids_buffer, offsets_buffer); CheckArray(a1); http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/ipc/test-common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index a542c87..67a41ba 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -139,9 +139,16 @@ Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_li std::replace_if(offsets.begin(), offsets.end(), [child_length](int32_t offset) { return offset > child_length; }, child_length); } - ListBuilder builder(pool, child_array); - RETURN_NOT_OK(builder.Append(offsets.data(), num_lists, valid_lists.data())); - RETURN_NOT_OK(builder.Finish(out)); + + offsets[num_lists] = static_cast<int32_t>(child_array->length()); + + /// TODO(wesm): Implement support for nulls in ListArray::FromArrays + std::shared_ptr<Buffer> null_bitmap, offsets_buffer; + RETURN_NOT_OK(test::GetBitmapFromVector(valid_lists, &null_bitmap)); + RETURN_NOT_OK(test::CopyBufferFromVector(offsets, pool, &offsets_buffer)); + + *out = std::make_shared<ListArray>(list(child_array->type()), num_lists, offsets_buffer, + child_array, null_bitmap, kUnknownNullCount); return ValidateArray(**out); } @@ -398,7 +405,8 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) { std::shared_ptr<Buffer> type_ids_buffer; std::vector<uint8_t> type_ids = {5, 10, 5, 5, 10, 10, 5}; - RETURN_NOT_OK(test::CopyBufferFromVector(type_ids, &type_ids_buffer)); + RETURN_NOT_OK( + test::CopyBufferFromVector(type_ids, default_memory_pool(), &type_ids_buffer)); std::vector<int32_t> u0_values = {0, 1, 2, 3, 4, 5, 6}; ArrayFromVector<Int32Type, int32_t>(u0_values, &sparse_children[0]); @@ -415,7 +423,8 @@ Status MakeUnion(std::shared_ptr<RecordBatch>* out) { std::shared_ptr<Buffer> offsets_buffer; std::vector<int32_t> offsets = {0, 0, 1, 2, 1, 2, 3}; - RETURN_NOT_OK(test::CopyBufferFromVector(offsets, &offsets_buffer)); + RETURN_NOT_OK( + test::CopyBufferFromVector(offsets, default_memory_pool(), &offsets_buffer)); std::vector<uint8_t> null_bytes(length, 1); null_bytes[2] = 0; @@ -479,7 +488,7 @@ Status MakeDictionary(std::shared_ptr<RecordBatch>* out) { ArrayFromVector<Int8Type, int8_t>(is_valid3, indices3_values, &indices3); std::shared_ptr<Buffer> null_bitmap; - RETURN_NOT_OK(test::GetBitmapFromBoolVector(is_valid, &null_bitmap)); + RETURN_NOT_OK(test::GetBitmapFromVector(is_valid, &null_bitmap)); std::shared_ptr<Array> a3 = std::make_shared<ListArray>(f3_type, length, std::static_pointer_cast<PrimitiveArray>(offsets)->values(), http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/cpp/src/arrow/test-util.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 2cff97a..2bc6625 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -102,10 +102,10 @@ std::shared_ptr<Buffer> GetBufferFromVector(const std::vector<T>& values) { template <typename T> inline Status CopyBufferFromVector( - const std::vector<T>& values, std::shared_ptr<Buffer>* result) { + const std::vector<T>& values, MemoryPool* pool, std::shared_ptr<Buffer>* result) { int64_t nbytes = static_cast<int>(values.size()) * sizeof(T); - auto buffer = std::make_shared<PoolBuffer>(default_memory_pool()); + auto buffer = std::make_shared<PoolBuffer>(pool); RETURN_NOT_OK(buffer->Resize(nbytes)); memcpy(buffer->mutable_data(), values.data(), nbytes); @@ -113,8 +113,9 @@ inline Status CopyBufferFromVector( return Status::OK(); } -static inline Status GetBitmapFromBoolVector( - const std::vector<bool>& is_valid, std::shared_ptr<Buffer>* result) { +template <typename T> +static inline Status GetBitmapFromVector( + const std::vector<T>& is_valid, std::shared_ptr<Buffer>* result) { size_t length = is_valid.size(); std::shared_ptr<MutableBuffer> buffer; http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/array.pxi ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index f8bccc7..c7a4415 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -40,7 +40,6 @@ cdef maybe_coerce_datetime64(values, dtype, DataType type, return values, type - def array(object sequence, DataType type=None, MemoryPool memory_pool=None, size=None): """ @@ -302,6 +301,19 @@ cdef class Array: """ return [x.as_py() for x in self] + def validate(self): + """ + Perform any validation checks implemented by + arrow::ValidateArray. Raises exception with error message if array does + not validate + + Raises + ------ + ArrowInvalid + """ + with nogil: + check_status(ValidateArray(deref(self.ap))) + cdef class Tensor: @@ -478,7 +490,27 @@ cdef class DecimalArray(FixedSizeBinaryArray): cdef class ListArray(Array): - pass + + @staticmethod + def from_arrays(Array offsets, Array values, MemoryPool pool=None): + """ + Construct ListArray from arrays of int32 offsets and values + + Parameters + ---------- + offset : Array (int32 type) + values : Array (any type) + + Returns + ------- + list_array : ListArray + """ + cdef shared_ptr[CArray] out + cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) + with nogil: + check_status(CListArray.FromArrays( + deref(offsets.ap), deref(values.ap), cpool, &out)) + return pyarrow_wrap_array(out) cdef class StringArray(Array): http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index dd791cd..44d83da 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -283,6 +283,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: c_string FormatValue(int i) cdef cppclass CListArray" arrow::ListArray"(CArray): + @staticmethod + CStatus FromArrays(const CArray& offsets, const CArray& values, + CMemoryPool* pool, shared_ptr[CArray]* out) + const int32_t* raw_value_offsets() int32_t value_offset(int i) int32_t value_length(int i) @@ -304,6 +308,8 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] field(int pos) const vector[shared_ptr[CArray]] fields() + CStatus ValidateArray(const CArray& array) + cdef cppclass CChunkedArray" arrow::ChunkedArray": int64_t length() int64_t null_count() http://git-wip-us.apache.org/repos/asf/arrow/blob/1541a08c/python/pyarrow/tests/test_array.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index 1a0ee61..5a373b4 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -199,6 +199,18 @@ def test_dictionary_with_pandas(): tm.assert_series_equal(pd.Series(pandas2), pd.Series(ex_pandas2)) +def test_list_from_arrays(): + offsets_arr = np.array([0, 2, 5, 8], dtype='i4') + offsets = pa.Array.from_pandas(offsets_arr, type=pa.int32()) + pyvalues = [b'a', b'b', b'c', b'd', b'e', b'f', b'g', b'h'] + values = pa.array(pyvalues, type=pa.binary()) + + result = pa.ListArray.from_arrays(offsets, values) + expected = pa.array([pyvalues[:2], pyvalues[2:5], pyvalues[5:8]]) + + assert result.equals(expected) + + def test_simple_type_construction(): result = pa.lib.TimestampType() with pytest.raises(TypeError):