Repository: arrow Updated Branches: refs/heads/master 83a4405ea -> c398fda75
ARROW-462: [C++] Implement in-memory conversions between non-nested primitive types and DictionaryArray equivalent Simple usage: ``` Array primitive_array; DictionaryBuilder<Type> builder; RETURN_NOT_OK(builder.AppendArray(primitive_array)); std::shared_ptr<Array> dict_encoded_array; RETURN_NOT_OK(builder.Finish(&dict_encoded_array)); ``` Author: Wes McKinney <wes.mckin...@twosigma.com> Author: Uwe L. Korn <uw...@xhochy.com> Closes #812 from xhochy/ARROW-462 and squashes the following commits: 9e289b6a [Wes McKinney] Add a couple doxygen strings. Place WrappedBinary, DictionaryScalar in internal namespace. Add BinaryDictionaryBuilder convenience 4147bdbe [Uwe L. Korn] ARROW-462: [C++] Implement in-memory conversions between non-nested primitive types and DictionaryArray equivalent Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c398fda7 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c398fda7 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c398fda7 Branch: refs/heads/master Commit: c398fda75a90fc83bafb2aafb895760b44ecd436 Parents: 83a4405 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Thu Jul 6 20:18:38 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Thu Jul 6 20:18:38 2017 -0400 ---------------------------------------------------------------------- cpp/src/arrow/array-test.cc | 34 ++++++++ cpp/src/arrow/array.h | 2 +- cpp/src/arrow/builder.cc | 165 +++++++++++++++++++++++++++------------ cpp/src/arrow/builder.h | 80 +++++++++++++++---- 4 files changed, 216 insertions(+), 65 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/array-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc index 64672d4..7ae03cf 100644 --- a/cpp/src/arrow/array-test.cc +++ b/cpp/src/arrow/array-test.cc @@ -1538,6 +1538,40 @@ TYPED_TEST(TestDictionaryBuilder, Basic) { ASSERT_TRUE(expected.Equals(result)); } +TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { + NumericBuilder<TypeParam> builder(default_memory_pool()); + // DictionaryBuilder<TypeParam> builder(default_memory_pool()); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2))); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); + + std::shared_ptr<Array> intermediate_result; + ASSERT_OK(builder.Finish(&intermediate_result)); + DictionaryBuilder<TypeParam> dictionary_builder(default_memory_pool()); + ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result)); + std::shared_ptr<Array> result; + ASSERT_OK(dictionary_builder.Finish(&result)); + + // Build expected data + NumericBuilder<TypeParam> dict_builder(default_memory_pool()); + ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1))); + ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2))); + std::shared_ptr<Array> dict_array; + ASSERT_OK(dict_builder.Finish(&dict_array)); + auto dtype = + std::make_shared<DictionaryType>(std::make_shared<TypeParam>(), dict_array); + + UInt8Builder int_builder(default_memory_pool()); + ASSERT_OK(int_builder.Append(0)); + ASSERT_OK(int_builder.Append(1)); + ASSERT_OK(int_builder.Append(0)); + std::shared_ptr<Array> int_array; + ASSERT_OK(int_builder.Finish(&int_array)); + + DictionaryArray expected(dtype, int_array); + ASSERT_TRUE(expected.Equals(result)); +} + TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) { using Scalar = typename TypeParam::c_type; // Skip this test for (u)int8 http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/array.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 1c9769f..59269ad 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -478,7 +478,7 @@ class ARROW_EXPORT DictionaryArray : public Array { std::shared_ptr<Array> indices() const { return indices_; } std::shared_ptr<Array> dictionary() const; - const DictionaryType* dict_type() { return dict_type_; } + const DictionaryType* dict_type() const { return dict_type_; } std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override; http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/builder.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 16f252c..a57f75a 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -660,8 +660,8 @@ Status BooleanBuilder::Append( // ---------------------------------------------------------------------- // DictionaryBuilder -template <typename T, typename Scalar> -DictionaryBuilder<T, Scalar>::DictionaryBuilder( +template <typename T> +DictionaryBuilder<T>::DictionaryBuilder( MemoryPool* pool, const std::shared_ptr<DataType>& type) : ArrayBuilder(pool, type), hash_table_(new PoolBuffer(pool)), @@ -671,8 +671,8 @@ DictionaryBuilder<T, Scalar>::DictionaryBuilder( if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); } } -template <typename T, typename Scalar> -Status DictionaryBuilder<T, Scalar>::Init(int64_t elements) { +template <typename T> +Status DictionaryBuilder<T>::Init(int64_t elements) { RETURN_NOT_OK(ArrayBuilder::Init(elements)); // Fill the initial hash table @@ -685,8 +685,8 @@ Status DictionaryBuilder<T, Scalar>::Init(int64_t elements) { return values_builder_.Init(elements); } -template <typename T, typename Scalar> -Status DictionaryBuilder<T, Scalar>::Resize(int64_t capacity) { +template <typename T> +Status DictionaryBuilder<T>::Resize(int64_t capacity) { if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } if (capacity_ == 0) { @@ -696,8 +696,8 @@ Status DictionaryBuilder<T, Scalar>::Resize(int64_t capacity) { } } -template <typename T, typename Scalar> -Status DictionaryBuilder<T, Scalar>::Finish(std::shared_ptr<Array>* out) { +template <typename T> +Status DictionaryBuilder<T>::Finish(std::shared_ptr<Array>* out) { std::shared_ptr<Array> dictionary; RETURN_NOT_OK(dict_builder_.Finish(&dictionary)); auto type = std::make_shared<DictionaryType>(type_, dictionary); @@ -709,8 +709,8 @@ Status DictionaryBuilder<T, Scalar>::Finish(std::shared_ptr<Array>* out) { return Status::OK(); } -template <typename T, typename Scalar> -Status DictionaryBuilder<T, Scalar>::Append(const Scalar& value) { +template <typename T> +Status DictionaryBuilder<T>::Append(const Scalar& value) { RETURN_NOT_OK(Reserve(1)); // Based on DictEncoder<DType>::Put int j = HashValue(value) & mod_bitmask_; @@ -741,8 +741,26 @@ Status DictionaryBuilder<T, Scalar>::Append(const Scalar& value) { return Status::OK(); } -template <typename T, typename Scalar> -Status DictionaryBuilder<T, Scalar>::DoubleTableSize() { +template <typename T> +Status DictionaryBuilder<T>::AppendArray(const Array& array) { + const NumericArray<T>& numeric_array = static_cast<const NumericArray<T>&>(array); + for (int64_t i = 0; i < array.length(); i++) { + if (array.IsNull(i)) { + RETURN_NOT_OK(AppendNull()); + } else { + RETURN_NOT_OK(Append(numeric_array.Value(i))); + } + } + return Status::OK(); +} + +template <typename T> +Status DictionaryBuilder<T>::AppendNull() { + return values_builder_.AppendNull(); +} + +template <typename T> +Status DictionaryBuilder<T>::DoubleTableSize() { int new_size = hash_table_size_ * 2; auto new_hash_table = std::make_shared<PoolBuffer>(pool_); @@ -782,56 +800,71 @@ Status DictionaryBuilder<T, Scalar>::DoubleTableSize() { return Status::OK(); } -template <typename T, typename Scalar> -Scalar DictionaryBuilder<T, Scalar>::GetDictionaryValue(int64_t index) { +template <typename T> +typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue( + int64_t index) { const Scalar* data = reinterpret_cast<const Scalar*>(dict_builder_.data()->data()); return data[index]; } -template <typename T, typename Scalar> -int DictionaryBuilder<T, Scalar>::HashValue(const Scalar& value) { +template <typename T> +int DictionaryBuilder<T>::HashValue(const Scalar& value) { return HashUtil::Hash(&value, sizeof(Scalar), 0); } -template <typename T, typename Scalar> -bool DictionaryBuilder<T, Scalar>::SlotDifferent(hash_slot_t index, const Scalar& value) { +template <typename T> +bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& value) { const Scalar other = GetDictionaryValue(static_cast<int64_t>(index)); return other != value; } -template <typename T, typename Scalar> -Status DictionaryBuilder<T, Scalar>::AppendDictionary(const Scalar& value) { +template <typename T> +Status DictionaryBuilder<T>::AppendDictionary(const Scalar& value) { return dict_builder_.Append(value); } -#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ - template <> \ - WrappedBinary DictionaryBuilder<Type, WrappedBinary>::GetDictionaryValue( \ - int64_t index) { \ - int32_t v_len; \ - const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), &v_len); \ - return WrappedBinary(v, v_len); \ - } \ - \ - template <> \ - int DictionaryBuilder<Type, WrappedBinary>::HashValue(const WrappedBinary& value) { \ - return HashUtil::Hash(value.ptr_, value.length_, 0); \ - } \ - \ - template <> \ - bool DictionaryBuilder<Type, WrappedBinary>::SlotDifferent( \ - hash_slot_t index, const WrappedBinary& value) { \ - int32_t other_length; \ - const uint8_t* other_value = \ - dict_builder_.GetValue(static_cast<int64_t>(index), &other_length); \ - return !(other_length == value.length_ && \ - 0 == memcmp(other_value, value.ptr_, value.length_)); \ - } \ - \ - template <> \ - Status DictionaryBuilder<Type, WrappedBinary>::AppendDictionary( \ - const WrappedBinary& value) { \ - return dict_builder_.Append(value.ptr_, value.length_); \ +#define BINARY_DICTIONARY_SPECIALIZATIONS(Type) \ + template <> \ + internal::WrappedBinary DictionaryBuilder<Type>::GetDictionaryValue(int64_t index) { \ + int32_t v_len; \ + const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), &v_len); \ + return internal::WrappedBinary(v, v_len); \ + } \ + \ + template <> \ + Status DictionaryBuilder<Type>::AppendDictionary( \ + const internal::WrappedBinary& value) { \ + return dict_builder_.Append(value.ptr_, value.length_); \ + } \ + \ + template <> \ + Status DictionaryBuilder<Type>::AppendArray(const Array& array) { \ + const BinaryArray& binary_array = static_cast<const BinaryArray&>(array); \ + internal::WrappedBinary value(nullptr, 0); \ + for (int64_t i = 0; i < array.length(); i++) { \ + if (array.IsNull(i)) { \ + RETURN_NOT_OK(AppendNull()); \ + } else { \ + value.ptr_ = binary_array.GetValue(i, &value.length_); \ + RETURN_NOT_OK(Append(value)); \ + } \ + } \ + return Status::OK(); \ + } \ + \ + template <> \ + int DictionaryBuilder<Type>::HashValue(const internal::WrappedBinary& value) { \ + return HashUtil::Hash(value.ptr_, value.length_, 0); \ + } \ + \ + template <> \ + bool DictionaryBuilder<Type>::SlotDifferent( \ + hash_slot_t index, const internal::WrappedBinary& value) { \ + int32_t other_length; \ + const uint8_t* other_value = \ + dict_builder_.GetValue(static_cast<int64_t>(index), &other_length); \ + return !(other_length == value.length_ && \ + 0 == memcmp(other_value, value.ptr_, value.length_)); \ } BINARY_DICTIONARY_SPECIALIZATIONS(StringType); @@ -852,8 +885,8 @@ template class DictionaryBuilder<Time64Type>; template class DictionaryBuilder<TimestampType>; template class DictionaryBuilder<FloatType>; template class DictionaryBuilder<DoubleType>; -template class DictionaryBuilder<BinaryType, WrappedBinary>; -template class DictionaryBuilder<StringType, WrappedBinary>; +template class DictionaryBuilder<BinaryType>; +template class DictionaryBuilder<StringType>; // ---------------------------------------------------------------------- // DecimalBuilder @@ -1193,4 +1226,36 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, } } +#define DICTIONARY_BUILDER_CASE(ENUM, BuilderType) \ + case Type::ENUM: \ + out->reset(new BuilderType(pool, type)); \ + return Status::OK(); + +Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, + std::shared_ptr<ArrayBuilder>* out) { + switch (type->id()) { + DICTIONARY_BUILDER_CASE(UINT8, DictionaryBuilder<UInt8Type>); + DICTIONARY_BUILDER_CASE(INT8, DictionaryBuilder<Int8Type>); + DICTIONARY_BUILDER_CASE(UINT16, DictionaryBuilder<UInt16Type>); + DICTIONARY_BUILDER_CASE(INT16, DictionaryBuilder<Int16Type>); + DICTIONARY_BUILDER_CASE(UINT32, DictionaryBuilder<UInt32Type>); + DICTIONARY_BUILDER_CASE(INT32, DictionaryBuilder<Int32Type>); + DICTIONARY_BUILDER_CASE(UINT64, DictionaryBuilder<UInt64Type>); + DICTIONARY_BUILDER_CASE(INT64, DictionaryBuilder<Int64Type>); + DICTIONARY_BUILDER_CASE(DATE32, DictionaryBuilder<Date32Type>); + DICTIONARY_BUILDER_CASE(DATE64, DictionaryBuilder<Date64Type>); + DICTIONARY_BUILDER_CASE(TIME32, DictionaryBuilder<Time32Type>); + DICTIONARY_BUILDER_CASE(TIME64, DictionaryBuilder<Time64Type>); + DICTIONARY_BUILDER_CASE(TIMESTAMP, DictionaryBuilder<TimestampType>); + DICTIONARY_BUILDER_CASE(FLOAT, DictionaryBuilder<FloatType>); + DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder<DoubleType>); + DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder); + DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder); + // DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); + // DICTIONARY_BUILDER_CASE(DECIMAL, DecimalBuilder); + default: + return Status::NotImplemented(type->ToString()); + } +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/builder.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h index 5acefa1..12f3683 100644 --- a/cpp/src/arrow/builder.h +++ b/cpp/src/arrow/builder.h @@ -688,18 +688,55 @@ static constexpr hash_slot_t kHashSlotEmpty = std::numeric_limits<int32_t>::max( // The maximum load factor for the hash table before resizing. static constexpr double kMaxHashTableLoad = 0.7; -template <typename T, typename Scalar = typename T::c_type> +namespace internal { + +// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here. +struct WrappedBinary { + WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), length_(length) {} + + const uint8_t* ptr_; + int32_t length_; +}; + +template <typename T> +struct DictionaryScalar { + using type = typename T::c_type; +}; + +template <> +struct DictionaryScalar<BinaryType> { + using type = WrappedBinary; +}; + +template <> +struct DictionaryScalar<StringType> { + using type = WrappedBinary; +}; + +} // namespace internal + +/// \brief Array builder for created encoded DictionaryArray from dense array +/// data +template <typename T> class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { public: + using Scalar = typename internal::DictionaryScalar<T>::type; explicit DictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type); - template <typename T1 = T, typename Scalar1 = Scalar> + template <typename T1 = T> explicit DictionaryBuilder( typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool) - : DictionaryBuilder<T1, Scalar1>(pool, TypeTraits<T1>::type_singleton()) {} + : DictionaryBuilder<T1>(pool, TypeTraits<T1>::type_singleton()) {} + /// \brief Append a scalar value Status Append(const Scalar& value); + /// \brief Append a scalar null value + Status AppendNull(); + + /// \brief Append a whole dense array to the builder + Status AppendArray(const Array& array); + Status Init(int64_t elements) override; Status Resize(int64_t capacity) override; Status Finish(std::shared_ptr<Array>* out) override; @@ -725,31 +762,43 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { AdaptiveUIntBuilder values_builder_; }; -// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here. -struct WrappedBinary { - WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), length_(length) {} +class ARROW_EXPORT BinaryDictionaryBuilder : public DictionaryBuilder<BinaryType> { + public: + using DictionaryBuilder::DictionaryBuilder; + using DictionaryBuilder::Append; - const uint8_t* ptr_; - int32_t length_; + Status Append(const uint8_t* value, int32_t length) { + return Append(internal::WrappedBinary(value, length)); + } + + Status Append(const char* value, int32_t length) { + return Append( + internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length)); + } + + Status Append(const std::string& value) { + return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()), + static_cast<int32_t>(value.size()))); + } }; -class ARROW_EXPORT StringDictionaryBuilder - : public DictionaryBuilder<StringType, WrappedBinary> { +/// \brief Dictionary array builder with convenience methods for strings +class ARROW_EXPORT StringDictionaryBuilder : public DictionaryBuilder<StringType> { public: using DictionaryBuilder::DictionaryBuilder; - using DictionaryBuilder::Append; Status Append(const uint8_t* value, int32_t length) { - return Append(WrappedBinary(value, length)); + return Append(internal::WrappedBinary(value, length)); } Status Append(const char* value, int32_t length) { - return Append(WrappedBinary(reinterpret_cast<const uint8_t*>(value), length)); + return Append( + internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), length)); } Status Append(const std::string& value) { - return Append(WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()), + return Append(internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value.c_str()), static_cast<int32_t>(value.size()))); } }; @@ -760,6 +809,9 @@ class ARROW_EXPORT StringDictionaryBuilder Status ARROW_EXPORT MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, std::unique_ptr<ArrayBuilder>* out); +Status ARROW_EXPORT MakeDictionaryBuilder(MemoryPool* pool, + const std::shared_ptr<DataType>& type, std::shared_ptr<ArrayBuilder>* out); + } // namespace arrow #endif // ARROW_BUILDER_H_