This is an automated email from the ASF dual-hosted git repository. uwe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 65d0e19 ARROW-4776: [C++] Add DictionaryBuilder constructor which takes a dictionary array 65d0e19 is described below commit 65d0e1959a3a2a52acae85e9f5b465f345585476 Author: Benjamin Kietzman <bengil...@gmail.com> AuthorDate: Wed Mar 13 16:53:05 2019 +0100 ARROW-4776: [C++] Add DictionaryBuilder constructor which takes a dictionary array DictionaryBuilder can now be constructed with a user provided dictionary. This is also available through MakeBuilder Author: Benjamin Kietzman <bengil...@gmail.com> Author: Uwe L. Korn <xho...@users.noreply.github.com> Closes #3867 from bkietz/ARROW-4776-dictionarybuilder-should-support-bootstrapping-from-an-existing-dict-type and squashes the following commits: d2551c99 <Benjamin Kietzman> Use {String,Binary}DictionaryBuilder for those types 91da93b9 <Uwe L. Korn> remove unused std::stringstream 4514860b <Benjamin Kietzman> Add tests for MakeBuilder(dictionary type) 5fcbe263 <Benjamin Kietzman> Add dictionary constructor to DictionaryBuilder --- cpp/src/arrow/array-dict-test.cc | 136 +++++++++++++++++++++++++++++++++++- cpp/src/arrow/array/builder_dict.cc | 38 ++++++++-- cpp/src/arrow/array/builder_dict.h | 4 ++ cpp/src/arrow/builder.cc | 41 +++++++++++ 4 files changed, 210 insertions(+), 9 deletions(-) diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc index fcc9fea..8289aee 100644 --- a/cpp/src/arrow/array-dict-test.cc +++ b/cpp/src/arrow/array-dict-test.cc @@ -63,17 +63,67 @@ TYPED_TEST(TestDictionaryBuilder, Basic) { ASSERT_EQ(builder.length(), 4); ASSERT_EQ(builder.null_count(), 1); + // Build expected data + auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]"); + auto dict_type = dictionary(int8(), dict_array); + std::shared_ptr<Array> result; ASSERT_OK(builder.Finish(&result)); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]"); + DictionaryArray expected(dict_type, int_array); + + ASSERT_TRUE(expected.Equals(result)); +} + +TYPED_TEST(TestDictionaryBuilder, ArrayInit) { + auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]"); + auto dict_type = dictionary(int8(), dict_array); + + DictionaryBuilder<TypeParam> builder(dict_array, default_memory_pool()); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2))); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); + ASSERT_OK(builder.AppendNull()); + + ASSERT_EQ(builder.length(), 4); + ASSERT_EQ(builder.null_count(), 1); + // Build expected data + + std::shared_ptr<Array> result; + ASSERT_OK(builder.Finish(&result)); + + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]"); + DictionaryArray expected(dict_type, int_array); + + AssertArraysEqual(expected, *result); +} + +TYPED_TEST(TestDictionaryBuilder, MakeBuilder) { auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]"); - auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array); + auto dict_type = dictionary(int8(), dict_array); + std::unique_ptr<ArrayBuilder> boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); + auto& builder = checked_cast<DictionaryBuilder<TypeParam>&>(*boxed_builder); + + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2))); + ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1))); + ASSERT_OK(builder.AppendNull()); + + ASSERT_EQ(builder.length(), 4); + ASSERT_EQ(builder.null_count(), 1); + + // Build expected data + + std::shared_ptr<Array> result; + ASSERT_OK(builder.Finish(&result)); auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]"); DictionaryArray expected(dict_type, int_array); - ASSERT_TRUE(expected.Equals(result)); + AssertArraysEqual(expected, *result); } TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { @@ -87,7 +137,7 @@ TYPED_TEST(TestDictionaryBuilder, ArrayConversion) { // Build expected data auto dict_array = ArrayFromJSON(type, "[1, 2]"); - auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array); + auto dict_type = dictionary(int8(), dict_array); auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); DictionaryArray expected(dict_type, int_array); @@ -239,6 +289,47 @@ TEST(TestStringDictionaryBuilder, Basic) { ASSERT_TRUE(expected.Equals(result)); } +TEST(TestStringDictionaryBuilder, ArrayInit) { + auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])"); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + + // Build the dictionary Array + StringDictionaryBuilder builder(dict_array, default_memory_pool()); + ASSERT_OK(builder.Append("test")); + ASSERT_OK(builder.Append("test2")); + ASSERT_OK(builder.Append("test")); + + std::shared_ptr<Array> result; + ASSERT_OK(builder.Finish(&result)); + + // Build expected data + DictionaryArray expected(dictionary(int8(), dict_array), int_array); + + AssertArraysEqual(expected, *result); +} + +TEST(TestStringDictionaryBuilder, MakeBuilder) { + auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])"); + auto dict_type = dictionary(int8(), dict_array); + auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]"); + std::unique_ptr<ArrayBuilder> boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); + auto& builder = checked_cast<StringDictionaryBuilder&>(*boxed_builder); + + // Build the dictionary Array + ASSERT_OK(builder.Append("test")); + ASSERT_OK(builder.Append("test2")); + ASSERT_OK(builder.Append("test")); + + std::shared_ptr<Array> result; + ASSERT_OK(builder.Finish(&result)); + + // Build expected data + DictionaryArray expected(dict_type, int_array); + + AssertArraysEqual(expected, *result); +} + // ARROW-4367 TEST(TestStringDictionaryBuilder, OnlyNull) { // Build the dictionary Array @@ -443,6 +534,45 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) { ASSERT_TRUE(expected.Equals(result)); } +TEST(TestFixedSizeBinaryDictionaryBuilder, ArrayInit) { + // Build the dictionary Array + auto dict_array = ArrayFromJSON(fixed_size_binary(4), R"(["abcd", "wxyz"])"); + util::string_view test = "abcd", test2 = "wxyz"; + DictionaryBuilder<FixedSizeBinaryType> builder(dict_array, default_memory_pool()); + ASSERT_OK(builder.Append(test)); + ASSERT_OK(builder.Append(test2)); + ASSERT_OK(builder.Append(test)); + + std::shared_ptr<Array> result; + FinishAndCheckPadding(&builder, &result); + + // Build expected data + auto indices = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dictionary(int8(), dict_array), indices); + AssertArraysEqual(expected, *result); +} + +TEST(TestFixedSizeBinaryDictionaryBuilder, MakeBuilder) { + // Build the dictionary Array + auto dict_array = ArrayFromJSON(fixed_size_binary(4), R"(["abcd", "wxyz"])"); + auto dict_type = dictionary(int8(), dict_array); + std::unique_ptr<ArrayBuilder> boxed_builder; + ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder)); + auto& builder = checked_cast<DictionaryBuilder<FixedSizeBinaryType>&>(*boxed_builder); + util::string_view test = "abcd", test2 = "wxyz"; + ASSERT_OK(builder.Append(test)); + ASSERT_OK(builder.Append(test2)); + ASSERT_OK(builder.Append(test)); + + std::shared_ptr<Array> result; + FinishAndCheckPadding(&builder, &result); + + // Build expected data + auto indices = ArrayFromJSON(int8(), "[0, 1, 0]"); + DictionaryArray expected(dict_type, indices); + AssertArraysEqual(expected, *result); +} + TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) { // Build the dictionary Array DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4), diff --git a/cpp/src/arrow/array/builder_dict.cc b/cpp/src/arrow/array/builder_dict.cc index cfc3d3d..b5fa0d6 100644 --- a/cpp/src/arrow/array/builder_dict.cc +++ b/cpp/src/arrow/array/builder_dict.cc @@ -51,10 +51,8 @@ struct UnifyDictionaryValues { Status Visit(const DataType&, void* = nullptr) { // Default implementation for non-dictionary-supported datatypes - std::stringstream ss; - ss << "Unification of " << value_type_->ToString() - << " dictionaries is not implemented"; - return Status::NotImplemented(ss.str()); + return Status::NotImplemented("Unification of ", value_type_, + " dictionaries is not implemented"); } template <typename T> @@ -153,12 +151,32 @@ class DictionaryBuilder<T>::MemoTableImpl public: using MemoTableType = typename internal::HashTraits<T>::MemoTableType; using MemoTableType::MemoTableType; + + MemoTableImpl(const std::shared_ptr<Array>& dictionary) + : MemoTableImpl(dictionary->length()) { + const auto& values = + static_cast<const typename TypeTraits<T>::ArrayType&>(*dictionary); + for (int64_t i = 0; i < values.length(); ++i) { + ARROW_IGNORE_EXPR(this->GetOrInsert(values.GetView(i))); + } + } }; template <typename T> DictionaryBuilder<T>::~DictionaryBuilder() {} template <typename T> +DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<Array>& dictionary, + MemoryPool* pool) + : ArrayBuilder(dictionary->type(), pool), + memo_table_(new MemoTableImpl(dictionary)), + delta_offset_(0), + byte_width_(-1), + values_builder_(pool) { + DCHECK_EQ(T::type_id, type_->id()) << "inconsistent type passed to DictionaryBuilder"; +} + +template <typename T> DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool) : ArrayBuilder(type, pool), @@ -175,6 +193,12 @@ DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<DataType>& DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to DictionaryBuilder"; } +DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<Array>& dictionary, + MemoryPool* pool) + : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) { + DCHECK_EQ(Type::NA, type_->id()) << "inconsistent type passed to DictionaryBuilder"; +} + template <> DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder( const std::shared_ptr<DataType>& type, MemoryPool* pool) @@ -202,7 +226,8 @@ Status DictionaryBuilder<T>::Resize(int64_t capacity) { delta_offset_ = 0; } RETURN_NOT_OK(values_builder_.Resize(capacity)); - return ArrayBuilder::Resize(capacity); + capacity_ = values_builder_.capacity(); + return Status::OK(); } Status DictionaryBuilder<NullType>::Resize(int64_t capacity) { @@ -210,7 +235,8 @@ Status DictionaryBuilder<NullType>::Resize(int64_t capacity) { capacity = std::max(capacity, kMinBuilderCapacity); RETURN_NOT_OK(values_builder_.Resize(capacity)); - return ArrayBuilder::Resize(capacity); + capacity_ = values_builder_.capacity(); + return Status::OK(); } template <typename T> diff --git a/cpp/src/arrow/array/builder_dict.h b/cpp/src/arrow/array/builder_dict.h index 6f02716..204d609 100644 --- a/cpp/src/arrow/array/builder_dict.h +++ b/cpp/src/arrow/array/builder_dict.h @@ -68,6 +68,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder { // The DictionaryType is instantiated on the Finish() call. DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool); + DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* pool); + template <typename T1 = T> explicit DictionaryBuilder( typename std::enable_if<TypeTraits<T1>::is_parameter_free, MemoryPool*>::type pool) @@ -122,6 +124,8 @@ class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder { DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool); explicit DictionaryBuilder(MemoryPool* pool); + DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* pool); + /// \brief Append a scalar null value Status AppendNull(); diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc index 8d0ab19..9669c08 100644 --- a/cpp/src/arrow/builder.cc +++ b/cpp/src/arrow/builder.cc @@ -25,6 +25,8 @@ #include "arrow/status.h" #include "arrow/type.h" #include "arrow/util/checked_cast.h" +#include "arrow/util/hashing.h" +#include "arrow/visitor_inline.h" namespace arrow { @@ -38,6 +40,40 @@ class MemoryPool; out->reset(new BuilderType(type, pool)); \ return Status::OK(); +struct DictionaryBuilderCase { + template <typename ValueType> + Status Visit(const ValueType&, typename ValueType::c_type* = nullptr) { + return CreateFor<ValueType>(); + } + + Status Visit(const BinaryType&) { return Create<BinaryDictionaryBuilder>(); } + Status Visit(const StringType&) { return Create<StringDictionaryBuilder>(); } + Status Visit(const FixedSizeBinaryType&) { return CreateFor<FixedSizeBinaryType>(); } + + Status Visit(const DataType& value_type) { return NotImplemented(value_type); } + Status Visit(const HalfFloatType& value_type) { return NotImplemented(value_type); } + Status NotImplemented(const DataType& value_type) { + return Status::NotImplemented( + "MakeBuilder: cannot construct builder for dictionaries with value type ", + value_type); + } + + template <typename ValueType> + Status CreateFor() { + return Create<DictionaryBuilder<ValueType>>(); + } + + template <typename BuilderType> + Status Create() { + out->reset(new BuilderType(dict_type.dictionary(), pool)); + return Status::OK(); + } + + MemoryPool* pool; + const DictionaryType& dict_type; + std::unique_ptr<ArrayBuilder>* out; +}; + // Initially looked at doing this with vtables, but shared pointers makes it // difficult // @@ -70,6 +106,11 @@ Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type, BUILDER_CASE(BINARY, BinaryBuilder); BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder); BUILDER_CASE(DECIMAL, Decimal128Builder); + case Type::DICTIONARY: { + const auto& dict_type = static_cast<const DictionaryType&>(*type); + DictionaryBuilderCase visitor = {pool, dict_type, out}; + return VisitTypeInline(*dict_type.dictionary()->type(), &visitor); + } case Type::LIST: { std::unique_ptr<ArrayBuilder> value_builder; std::shared_ptr<DataType> value_type =