This is an automated email from the ASF dual-hosted git repository.

uwe pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 65d0e19  ARROW-4776: [C++] Add DictionaryBuilder constructor which 
takes a dictionary array
65d0e19 is described below

commit 65d0e1959a3a2a52acae85e9f5b465f345585476
Author: Benjamin Kietzman <bengil...@gmail.com>
AuthorDate: Wed Mar 13 16:53:05 2019 +0100

    ARROW-4776: [C++] Add DictionaryBuilder constructor which takes a 
dictionary array
    
    DictionaryBuilder can now be constructed with a user provided dictionary.
    
    This is also available through MakeBuilder
    
    Author: Benjamin Kietzman <bengil...@gmail.com>
    Author: Uwe L. Korn <xho...@users.noreply.github.com>
    
    Closes #3867 from 
bkietz/ARROW-4776-dictionarybuilder-should-support-bootstrapping-from-an-existing-dict-type
 and squashes the following commits:
    
    d2551c99 <Benjamin Kietzman> Use {String,Binary}DictionaryBuilder for those 
types
    91da93b9 <Uwe L. Korn> remove unused std::stringstream
    4514860b <Benjamin Kietzman> Add tests for MakeBuilder(dictionary type)
    5fcbe263 <Benjamin Kietzman> Add dictionary constructor to DictionaryBuilder
---
 cpp/src/arrow/array-dict-test.cc    | 136 +++++++++++++++++++++++++++++++++++-
 cpp/src/arrow/array/builder_dict.cc |  38 ++++++++--
 cpp/src/arrow/array/builder_dict.h  |   4 ++
 cpp/src/arrow/builder.cc            |  41 +++++++++++
 4 files changed, 210 insertions(+), 9 deletions(-)

diff --git a/cpp/src/arrow/array-dict-test.cc b/cpp/src/arrow/array-dict-test.cc
index fcc9fea..8289aee 100644
--- a/cpp/src/arrow/array-dict-test.cc
+++ b/cpp/src/arrow/array-dict-test.cc
@@ -63,17 +63,67 @@ TYPED_TEST(TestDictionaryBuilder, Basic) {
   ASSERT_EQ(builder.length(), 4);
   ASSERT_EQ(builder.null_count(), 1);
 
+  // Build expected data
+  auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
+  auto dict_type = dictionary(int8(), dict_array);
+
   std::shared_ptr<Array> result;
   ASSERT_OK(builder.Finish(&result));
 
+  auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
+  DictionaryArray expected(dict_type, int_array);
+
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TYPED_TEST(TestDictionaryBuilder, ArrayInit) {
+  auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
+  auto dict_type = dictionary(int8(), dict_array);
+
+  DictionaryBuilder<TypeParam> builder(dict_array, default_memory_pool());
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.AppendNull());
+
+  ASSERT_EQ(builder.length(), 4);
+  ASSERT_EQ(builder.null_count(), 1);
+
   // Build expected data
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
+  DictionaryArray expected(dict_type, int_array);
+
+  AssertArraysEqual(expected, *result);
+}
+
+TYPED_TEST(TestDictionaryBuilder, MakeBuilder) {
   auto dict_array = ArrayFromJSON(std::make_shared<TypeParam>(), "[1, 2]");
-  auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array);
+  auto dict_type = dictionary(int8(), dict_array);
+  std::unique_ptr<ArrayBuilder> boxed_builder;
+  ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
+  auto& builder = checked_cast<DictionaryBuilder<TypeParam>&>(*boxed_builder);
+
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.AppendNull());
+
+  ASSERT_EQ(builder.length(), 4);
+  ASSERT_EQ(builder.null_count(), 1);
+
+  // Build expected data
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
 
   auto int_array = ArrayFromJSON(int8(), "[0, 1, 0, null]");
   DictionaryArray expected(dict_type, int_array);
 
-  ASSERT_TRUE(expected.Equals(result));
+  AssertArraysEqual(expected, *result);
 }
 
 TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
@@ -87,7 +137,7 @@ TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
 
   // Build expected data
   auto dict_array = ArrayFromJSON(type, "[1, 2]");
-  auto dict_type = std::make_shared<DictionaryType>(int8(), dict_array);
+  auto dict_type = dictionary(int8(), dict_array);
 
   auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
   DictionaryArray expected(dict_type, int_array);
@@ -239,6 +289,47 @@ TEST(TestStringDictionaryBuilder, Basic) {
   ASSERT_TRUE(expected.Equals(result));
 }
 
+TEST(TestStringDictionaryBuilder, ArrayInit) {
+  auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
+  auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
+
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(dict_array, default_memory_pool());
+  ASSERT_OK(builder.Append("test"));
+  ASSERT_OK(builder.Append("test2"));
+  ASSERT_OK(builder.Append("test"));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  DictionaryArray expected(dictionary(int8(), dict_array), int_array);
+
+  AssertArraysEqual(expected, *result);
+}
+
+TEST(TestStringDictionaryBuilder, MakeBuilder) {
+  auto dict_array = ArrayFromJSON(utf8(), R"(["test", "test2"])");
+  auto dict_type = dictionary(int8(), dict_array);
+  auto int_array = ArrayFromJSON(int8(), "[0, 1, 0]");
+  std::unique_ptr<ArrayBuilder> boxed_builder;
+  ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
+  auto& builder = checked_cast<StringDictionaryBuilder&>(*boxed_builder);
+
+  // Build the dictionary Array
+  ASSERT_OK(builder.Append("test"));
+  ASSERT_OK(builder.Append("test2"));
+  ASSERT_OK(builder.Append("test"));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  DictionaryArray expected(dict_type, int_array);
+
+  AssertArraysEqual(expected, *result);
+}
+
 // ARROW-4367
 TEST(TestStringDictionaryBuilder, OnlyNull) {
   // Build the dictionary Array
@@ -443,6 +534,45 @@ TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
   ASSERT_TRUE(expected.Equals(result));
 }
 
+TEST(TestFixedSizeBinaryDictionaryBuilder, ArrayInit) {
+  // Build the dictionary Array
+  auto dict_array = ArrayFromJSON(fixed_size_binary(4), R"(["abcd", "wxyz"])");
+  util::string_view test = "abcd", test2 = "wxyz";
+  DictionaryBuilder<FixedSizeBinaryType> builder(dict_array, 
default_memory_pool());
+  ASSERT_OK(builder.Append(test));
+  ASSERT_OK(builder.Append(test2));
+  ASSERT_OK(builder.Append(test));
+
+  std::shared_ptr<Array> result;
+  FinishAndCheckPadding(&builder, &result);
+
+  // Build expected data
+  auto indices = ArrayFromJSON(int8(), "[0, 1, 0]");
+  DictionaryArray expected(dictionary(int8(), dict_array), indices);
+  AssertArraysEqual(expected, *result);
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, MakeBuilder) {
+  // Build the dictionary Array
+  auto dict_array = ArrayFromJSON(fixed_size_binary(4), R"(["abcd", "wxyz"])");
+  auto dict_type = dictionary(int8(), dict_array);
+  std::unique_ptr<ArrayBuilder> boxed_builder;
+  ASSERT_OK(MakeBuilder(default_memory_pool(), dict_type, &boxed_builder));
+  auto& builder = 
checked_cast<DictionaryBuilder<FixedSizeBinaryType>&>(*boxed_builder);
+  util::string_view test = "abcd", test2 = "wxyz";
+  ASSERT_OK(builder.Append(test));
+  ASSERT_OK(builder.Append(test2));
+  ASSERT_OK(builder.Append(test));
+
+  std::shared_ptr<Array> result;
+  FinishAndCheckPadding(&builder, &result);
+
+  // Build expected data
+  auto indices = ArrayFromJSON(int8(), "[0, 1, 0]");
+  DictionaryArray expected(dict_type, indices);
+  AssertArraysEqual(expected, *result);
+}
+
 TEST(TestFixedSizeBinaryDictionaryBuilder, DeltaDictionary) {
   // Build the dictionary Array
   DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
diff --git a/cpp/src/arrow/array/builder_dict.cc 
b/cpp/src/arrow/array/builder_dict.cc
index cfc3d3d..b5fa0d6 100644
--- a/cpp/src/arrow/array/builder_dict.cc
+++ b/cpp/src/arrow/array/builder_dict.cc
@@ -51,10 +51,8 @@ struct UnifyDictionaryValues {
 
   Status Visit(const DataType&, void* = nullptr) {
     // Default implementation for non-dictionary-supported datatypes
-    std::stringstream ss;
-    ss << "Unification of " << value_type_->ToString()
-       << " dictionaries is not implemented";
-    return Status::NotImplemented(ss.str());
+    return Status::NotImplemented("Unification of ", value_type_,
+                                  " dictionaries is not implemented");
   }
 
   template <typename T>
@@ -153,12 +151,32 @@ class DictionaryBuilder<T>::MemoTableImpl
  public:
   using MemoTableType = typename internal::HashTraits<T>::MemoTableType;
   using MemoTableType::MemoTableType;
+
+  MemoTableImpl(const std::shared_ptr<Array>& dictionary)
+      : MemoTableImpl(dictionary->length()) {
+    const auto& values =
+        static_cast<const typename TypeTraits<T>::ArrayType&>(*dictionary);
+    for (int64_t i = 0; i < values.length(); ++i) {
+      ARROW_IGNORE_EXPR(this->GetOrInsert(values.GetView(i)));
+    }
+  }
 };
 
 template <typename T>
 DictionaryBuilder<T>::~DictionaryBuilder() {}
 
 template <typename T>
+DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<Array>& 
dictionary,
+                                        MemoryPool* pool)
+    : ArrayBuilder(dictionary->type(), pool),
+      memo_table_(new MemoTableImpl(dictionary)),
+      delta_offset_(0),
+      byte_width_(-1),
+      values_builder_(pool) {
+  DCHECK_EQ(T::type_id, type_->id()) << "inconsistent type passed to 
DictionaryBuilder";
+}
+
+template <typename T>
 DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
                                         MemoryPool* pool)
     : ArrayBuilder(type, pool),
@@ -175,6 +193,12 @@ DictionaryBuilder<NullType>::DictionaryBuilder(const 
std::shared_ptr<DataType>&
   DCHECK_EQ(Type::NA, type->id()) << "inconsistent type passed to 
DictionaryBuilder";
 }
 
+DictionaryBuilder<NullType>::DictionaryBuilder(const std::shared_ptr<Array>& 
dictionary,
+                                               MemoryPool* pool)
+    : ArrayBuilder(dictionary->type(), pool), values_builder_(pool) {
+  DCHECK_EQ(Type::NA, type_->id()) << "inconsistent type passed to 
DictionaryBuilder";
+}
+
 template <>
 DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
     const std::shared_ptr<DataType>& type, MemoryPool* pool)
@@ -202,7 +226,8 @@ Status DictionaryBuilder<T>::Resize(int64_t capacity) {
     delta_offset_ = 0;
   }
   RETURN_NOT_OK(values_builder_.Resize(capacity));
-  return ArrayBuilder::Resize(capacity);
+  capacity_ = values_builder_.capacity();
+  return Status::OK();
 }
 
 Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
@@ -210,7 +235,8 @@ Status DictionaryBuilder<NullType>::Resize(int64_t 
capacity) {
   capacity = std::max(capacity, kMinBuilderCapacity);
 
   RETURN_NOT_OK(values_builder_.Resize(capacity));
-  return ArrayBuilder::Resize(capacity);
+  capacity_ = values_builder_.capacity();
+  return Status::OK();
 }
 
 template <typename T>
diff --git a/cpp/src/arrow/array/builder_dict.h 
b/cpp/src/arrow/array/builder_dict.h
index 6f02716..204d609 100644
--- a/cpp/src/arrow/array/builder_dict.h
+++ b/cpp/src/arrow/array/builder_dict.h
@@ -68,6 +68,8 @@ class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
   // The DictionaryType is instantiated on the Finish() call.
   DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
 
+  DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* 
pool);
+
   template <typename T1 = T>
   explicit DictionaryBuilder(
       typename std::enable_if<TypeTraits<T1>::is_parameter_free, 
MemoryPool*>::type pool)
@@ -122,6 +124,8 @@ class ARROW_EXPORT DictionaryBuilder<NullType> : public 
ArrayBuilder {
   DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
   explicit DictionaryBuilder(MemoryPool* pool);
 
+  DictionaryBuilder(const std::shared_ptr<Array>& dictionary, MemoryPool* 
pool);
+
   /// \brief Append a scalar null value
   Status AppendNull();
 
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 8d0ab19..9669c08 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -25,6 +25,8 @@
 #include "arrow/status.h"
 #include "arrow/type.h"
 #include "arrow/util/checked_cast.h"
+#include "arrow/util/hashing.h"
+#include "arrow/visitor_inline.h"
 
 namespace arrow {
 
@@ -38,6 +40,40 @@ class MemoryPool;
     out->reset(new BuilderType(type, pool)); \
     return Status::OK();
 
+struct DictionaryBuilderCase {
+  template <typename ValueType>
+  Status Visit(const ValueType&, typename ValueType::c_type* = nullptr) {
+    return CreateFor<ValueType>();
+  }
+
+  Status Visit(const BinaryType&) { return Create<BinaryDictionaryBuilder>(); }
+  Status Visit(const StringType&) { return Create<StringDictionaryBuilder>(); }
+  Status Visit(const FixedSizeBinaryType&) { return 
CreateFor<FixedSizeBinaryType>(); }
+
+  Status Visit(const DataType& value_type) { return 
NotImplemented(value_type); }
+  Status Visit(const HalfFloatType& value_type) { return 
NotImplemented(value_type); }
+  Status NotImplemented(const DataType& value_type) {
+    return Status::NotImplemented(
+        "MakeBuilder: cannot construct builder for dictionaries with value 
type ",
+        value_type);
+  }
+
+  template <typename ValueType>
+  Status CreateFor() {
+    return Create<DictionaryBuilder<ValueType>>();
+  }
+
+  template <typename BuilderType>
+  Status Create() {
+    out->reset(new BuilderType(dict_type.dictionary(), pool));
+    return Status::OK();
+  }
+
+  MemoryPool* pool;
+  const DictionaryType& dict_type;
+  std::unique_ptr<ArrayBuilder>* out;
+};
+
 // Initially looked at doing this with vtables, but shared pointers makes it
 // difficult
 //
@@ -70,6 +106,11 @@ Status MakeBuilder(MemoryPool* pool, const 
std::shared_ptr<DataType>& type,
       BUILDER_CASE(BINARY, BinaryBuilder);
       BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder);
       BUILDER_CASE(DECIMAL, Decimal128Builder);
+    case Type::DICTIONARY: {
+      const auto& dict_type = static_cast<const DictionaryType&>(*type);
+      DictionaryBuilderCase visitor = {pool, dict_type, out};
+      return VisitTypeInline(*dict_type.dictionary()->type(), &visitor);
+    }
     case Type::LIST: {
       std::unique_ptr<ArrayBuilder> value_builder;
       std::shared_ptr<DataType> value_type =

Reply via email to