[ 
https://issues.apache.org/jira/browse/ARROW-1882?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16279576#comment-16279576
 ] 

ASF GitHub Bot commented on ARROW-1882:
---------------------------------------

wesm closed pull request #1388: ARROW-1882: [C++] Reintroduce DictionaryBuilder
URL: https://github.com/apache/arrow/pull/1388
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt
index 94705781f..d645cca22 100644
--- a/cpp/src/arrow/CMakeLists.txt
+++ b/cpp/src/arrow/CMakeLists.txt
@@ -38,6 +38,7 @@ set(ARROW_SRCS
   util/compression.cc
   util/cpu-info.cc
   util/decimal.cc
+  util/hash.cc
   util/key_value_metadata.cc
 )
 
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index d894df131..7ff3261ec 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1619,6 +1619,353 @@ TEST_F(TestAdaptiveUIntBuilder, TestAppendVector) {
   ASSERT_TRUE(expected_->Equals(result_));
 }
 
+// ----------------------------------------------------------------------
+// Dictionary tests
+
+template <typename Type>
+class TestDictionaryBuilder : public TestBuilder {};
+
+typedef ::testing::Types<Int8Type, UInt8Type, Int16Type, UInt16Type, Int32Type,
+                         UInt32Type, Int64Type, UInt64Type, FloatType, 
DoubleType>
+    PrimitiveDictionaries;
+
+TYPED_TEST_CASE(TestDictionaryBuilder, PrimitiveDictionaries);
+
+TYPED_TEST(TestDictionaryBuilder, Basic) {
+  DictionaryBuilder<TypeParam> builder(default_memory_pool());
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  NumericBuilder<TypeParam> dict_builder;
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array;
+  ASSERT_OK(dict_builder.Finish(&dict_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), dict_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
+  NumericBuilder<TypeParam> builder;
+  // DictionaryBuilder<TypeParam> builder;
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+
+  std::shared_ptr<Array> intermediate_result;
+  ASSERT_OK(builder.Finish(&intermediate_result));
+  DictionaryBuilder<TypeParam> dictionary_builder(default_memory_pool());
+  ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result));
+  std::shared_ptr<Array> result;
+  ASSERT_OK(dictionary_builder.Finish(&result));
+
+  // Build expected data
+  NumericBuilder<TypeParam> dict_builder;
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array;
+  ASSERT_OK(dict_builder.Finish(&dict_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), dict_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
+  using Scalar = typename TypeParam::c_type;
+  // Skip this test for (u)int8
+  if (sizeof(Scalar) > 1) {
+    // Build the dictionary Array
+    DictionaryBuilder<TypeParam> builder(default_memory_pool());
+    // Build expected data
+    NumericBuilder<TypeParam> dict_builder;
+    Int16Builder int_builder;
+
+    // Fill with 1024 different values
+    for (int64_t i = 0; i < 1024; i++) {
+      ASSERT_OK(builder.Append(static_cast<Scalar>(i)));
+      ASSERT_OK(dict_builder.Append(static_cast<Scalar>(i)));
+      ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+    }
+    // Fill with an already existing value
+    for (int64_t i = 0; i < 1024; i++) {
+      ASSERT_OK(builder.Append(static_cast<Scalar>(1)));
+      ASSERT_OK(int_builder.Append(1));
+    }
+
+    // Finalize result
+    std::shared_ptr<Array> result;
+    ASSERT_OK(builder.Finish(&result));
+
+    // Finalize expected data
+    std::shared_ptr<Array> dict_array;
+    ASSERT_OK(dict_builder.Finish(&dict_array));
+    auto dtype = std::make_shared<DictionaryType>(int16(), dict_array);
+    std::shared_ptr<Array> int_array;
+    ASSERT_OK(int_builder.Finish(&int_array));
+
+    DictionaryArray expected(dtype, int_array);
+    ASSERT_TRUE(expected.Equals(result));
+  }
+}
+
+TEST(TestStringDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+  ASSERT_OK(builder.Append("test"));
+  ASSERT_OK(builder.Append("test2"));
+  ASSERT_OK(builder.Append("test"));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  StringBuilder str_builder;
+  ASSERT_OK(str_builder.Append("test"));
+  ASSERT_OK(str_builder.Append("test2"));
+  std::shared_ptr<Array> str_array;
+  ASSERT_OK(str_builder.Finish(&str_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), str_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestStringDictionaryBuilder, DoubleTableSize) {
+  // Build the dictionary Array
+  StringDictionaryBuilder builder(default_memory_pool());
+  // Build expected data
+  StringBuilder str_builder;
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    std::stringstream ss;
+    ss << "test" << i;
+    ASSERT_OK(builder.Append(ss.str()));
+    ASSERT_OK(str_builder.Append(ss.str()));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append("test1"));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> str_array;
+  ASSERT_OK(str_builder.Finish(&str_array));
+  auto dtype = std::make_shared<DictionaryType>(int16(), str_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  std::vector<uint8_t> test{12, 12, 11, 12};
+  std::vector<uint8_t> test2{12, 12, 11, 11};
+  ASSERT_OK(builder.Append(test.data()));
+  ASSERT_OK(builder.Append(test2.data()));
+  ASSERT_OK(builder.Append(test.data()));
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+  ASSERT_OK(fsb_builder.Append(test.data()));
+  ASSERT_OK(fsb_builder.Append(test2.data()));
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+  auto dtype = std::make_shared<DictionaryType>(int8(), fsb_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, DoubleTableSize) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(4));
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    std::vector<uint8_t> value{12, 12, static_cast<uint8_t>(i / 128),
+                               static_cast<uint8_t>(i % 128)};
+    ASSERT_OK(builder.Append(value.data()));
+    ASSERT_OK(fsb_builder.Append(value.data()));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  std::vector<uint8_t> known_value{12, 12, 0, 1};
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append(known_value.data()));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+  auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestFixedSizeBinaryDictionaryBuilder, InvalidTypeAppend) {
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(arrow::fixed_size_binary(4),
+                                                 default_memory_pool());
+  // Build an array with different byte width
+  FixedSizeBinaryBuilder fsb_builder(arrow::fixed_size_binary(5));
+  std::vector<uint8_t> value{100, 1, 1, 1, 1};
+  ASSERT_OK(fsb_builder.Append(value.data()));
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+  ASSERT_RAISES(Invalid, builder.AppendArray(*fsb_array));
+}
+
+TEST(TestDecimalDictionaryBuilder, Basic) {
+  // Build the dictionary Array
+  const auto& decimal_type = arrow::decimal(2, 0);
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, 
default_memory_pool());
+
+  // Test data
+  std::vector<Decimal128> test{12, 12, 11, 12};
+  for (const auto& value : test) {
+    ASSERT_OK(builder.Append(value.ToBytes().data()));
+  }
+
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Build expected data
+  FixedSizeBinaryBuilder decimal_builder(decimal_type);
+  ASSERT_OK(decimal_builder.Append(Decimal128(12).ToBytes()));
+  ASSERT_OK(decimal_builder.Append(Decimal128(11).ToBytes()));
+
+  std::shared_ptr<Array> decimal_array;
+  ASSERT_OK(decimal_builder.Finish(&decimal_array));
+  auto dtype = arrow::dictionary(int8(), decimal_array);
+
+  Int8Builder int_builder;
+  ASSERT_OK(int_builder.Append({0, 0, 1, 0}));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
+TEST(TestDecimalDictionaryBuilder, DoubleTableSize) {
+  const auto& decimal_type = arrow::decimal(21, 0);
+
+  // Build the dictionary Array
+  DictionaryBuilder<FixedSizeBinaryType> builder(decimal_type, 
default_memory_pool());
+
+  // Build expected data
+  FixedSizeBinaryBuilder fsb_builder(decimal_type);
+  Int16Builder int_builder;
+
+  // Fill with 1024 different values
+  for (int64_t i = 0; i < 1024; i++) {
+    const uint8_t bytes[] = {0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             0,
+                             12,
+                             12,
+                             static_cast<uint8_t>(i / 128),
+                             static_cast<uint8_t>(i % 128)};
+    ASSERT_OK(builder.Append(bytes));
+    ASSERT_OK(fsb_builder.Append(bytes));
+    ASSERT_OK(int_builder.Append(static_cast<uint16_t>(i)));
+  }
+  // Fill with an already existing value
+  const uint8_t known_value[] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 12, 12, 
0, 1};
+  for (int64_t i = 0; i < 1024; i++) {
+    ASSERT_OK(builder.Append(known_value));
+    ASSERT_OK(int_builder.Append(1));
+  }
+
+  // Finalize result
+  std::shared_ptr<Array> result;
+  ASSERT_OK(builder.Finish(&result));
+
+  // Finalize expected data
+  std::shared_ptr<Array> fsb_array;
+  ASSERT_OK(fsb_builder.Finish(&fsb_array));
+
+  auto dtype = std::make_shared<DictionaryType>(int16(), fsb_array);
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
 // ----------------------------------------------------------------------
 // List tests
 
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 4d7fd5ff8..de132b5f6 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -34,6 +34,7 @@
 #include "arrow/util/cpu-info.h"
 #include "arrow/util/decimal.h"
 #include "arrow/util/hash-util.h"
+#include "arrow/util/hash.h"
 #include "arrow/util/logging.h"
 
 namespace arrow {
@@ -805,6 +806,305 @@ Status BooleanBuilder::Append(const std::vector<bool>& 
values) {
   return Status::OK();
 }
 
+// ----------------------------------------------------------------------
+// DictionaryBuilder
+
+using internal::WrappedBinary;
+
+template <typename T>
+DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
+                                        MemoryPool* pool)
+    : ArrayBuilder(type, pool),
+      hash_slots_(nullptr),
+      dict_builder_(type, pool),
+      values_builder_(pool),
+      byte_width_(-1) {
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+DictionaryBuilder<NullType>::DictionaryBuilder(const 
std::shared_ptr<DataType>& type,
+                                               MemoryPool* pool)
+    : ArrayBuilder(type, pool), values_builder_(pool) {
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+DictionaryBuilder<NullType>::~DictionaryBuilder() {}
+
+template <>
+DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool)
+    : ArrayBuilder(type, pool),
+      hash_slots_(nullptr),
+      dict_builder_(type, pool),
+      values_builder_(pool),
+      byte_width_(static_cast<const FixedSizeBinaryType&>(*type).byte_width()) 
{
+  if (!::arrow::CpuInfo::initialized()) {
+    ::arrow::CpuInfo::Init();
+  }
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+
+  // Fill the initial hash table
+  RETURN_NOT_OK(internal::NewHashTable(kInitialHashTableSize, pool_, 
&hash_table_));
+  hash_slots_ = reinterpret_cast<int32_t*>(hash_table_->mutable_data());
+  hash_table_size_ = kInitialHashTableSize;
+  mod_bitmask_ = kInitialHashTableSize - 1;
+  hash_table_load_threshold_ =
+      static_cast<int64_t>(static_cast<double>(elements) * kMaxHashTableLoad);
+
+  return values_builder_.Init(elements);
+}
+
+Status DictionaryBuilder<NullType>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  return values_builder_.Init(elements);
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  std::shared_ptr<Array> dictionary;
+  RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* 
out) {
+  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Append(const Scalar& value) {
+  RETURN_NOT_OK(Reserve(1));
+  // Based on DictEncoder<DType>::Put
+  int64_t j = HashValue(value) & mod_bitmask_;
+  hash_slot_t index = hash_slots_[j];
+
+  // Find an empty slot
+  while (kHashSlotEmpty != index && SlotDifferent(index, value)) {
+    // Linear probing
+    ++j;
+    if (j == hash_table_size_) {
+      j = 0;
+    }
+    index = hash_slots_[j];
+  }
+
+  if (index == kHashSlotEmpty) {
+    // Not in the hash table, so we insert it now
+    index = static_cast<hash_slot_t>(dict_builder_.length());
+    hash_slots_[j] = index;
+    RETURN_NOT_OK(AppendDictionary(value));
+
+    if (ARROW_PREDICT_FALSE(static_cast<int32_t>(dict_builder_.length()) >
+                            hash_table_load_threshold_)) {
+      RETURN_NOT_OK(DoubleTableSize());
+    }
+  }
+
+  RETURN_NOT_OK(values_builder_.Append(index));
+
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::AppendArray(const Array& array) {
+  const auto& numeric_array = static_cast<const NumericArray<T>&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+    }
+  }
+  return Status::OK();
+}
+
+Status DictionaryBuilder<NullType>::AppendArray(const Array& array) {
+  for (int64_t i = 0; i < array.length(); i++) {
+    RETURN_NOT_OK(AppendNull());
+  }
+  return Status::OK();
+}
+
+template <>
+Status DictionaryBuilder<FixedSizeBinaryType>::AppendArray(const Array& array) 
{
+  if (!type_->Equals(*array.type())) {
+    return Status::Invalid("Cannot append FixedSizeBinary array with 
non-matching type");
+  }
+
+  const auto& numeric_array = static_cast<const FixedSizeBinaryArray&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+    }
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::AppendNull() {
+  return values_builder_.AppendNull();
+}
+
+Status DictionaryBuilder<NullType>::AppendNull() { return 
values_builder_.AppendNull(); }
+
+template <typename T>
+Status DictionaryBuilder<T>::DoubleTableSize() {
+#define INNER_LOOP                                                \
+  Scalar value = GetDictionaryValue(static_cast<int64_t>(index)); \
+  int64_t j = HashValue(value) & new_mod_bitmask;
+
+  DOUBLE_TABLE_SIZE(, INNER_LOOP);
+
+  return Status::OK();
+}
+
+template <typename T>
+typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue(
+    int64_t index) {
+  const Scalar* data = reinterpret_cast<const 
Scalar*>(dict_builder_.data()->data());
+  return data[index];
+}
+
+template <>
+const uint8_t* 
DictionaryBuilder<FixedSizeBinaryType>::GetDictionaryValue(int64_t index) {
+  return dict_builder_.GetValue(index);
+}
+
+template <typename T>
+int64_t DictionaryBuilder<T>::HashValue(const Scalar& value) {
+  return HashUtil::Hash(&value, sizeof(Scalar), 0);
+}
+
+template <>
+int64_t DictionaryBuilder<FixedSizeBinaryType>::HashValue(const Scalar& value) 
{
+  return HashUtil::Hash(value, byte_width_, 0);
+}
+
+template <typename T>
+bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& 
value) {
+  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
+  return other != value;
+}
+
+template <>
+bool DictionaryBuilder<FixedSizeBinaryType>::SlotDifferent(hash_slot_t index,
+                                                           const Scalar& 
value) {
+  int32_t width = static_cast<const FixedSizeBinaryType&>(*type_).byte_width();
+  const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
+  return memcmp(other, value, width) != 0;
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::AppendDictionary(const Scalar& value) {
+  return dict_builder_.Append(value);
+}
+
+#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                                
     \
+  template <>                                                                  
     \
+  WrappedBinary DictionaryBuilder<Type>::GetDictionaryValue(int64_t index) {   
     \
+    int32_t v_len;                                                             
     \
+    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), 
&v_len); \
+    return WrappedBinary(v, v_len);                                            
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  Status DictionaryBuilder<Type>::AppendDictionary(const WrappedBinary& value) 
{    \
+    return dict_builder_.Append(value.ptr_, value.length_);                    
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {            
     \
+    const BinaryArray& binary_array = static_cast<const BinaryArray&>(array);  
     \
+    WrappedBinary value(nullptr, 0);                                           
     \
+    for (int64_t i = 0; i < array.length(); i++) {                             
     \
+      if (array.IsNull(i)) {                                                   
     \
+        RETURN_NOT_OK(AppendNull());                                           
     \
+      } else {                                                                 
     \
+        value.ptr_ = binary_array.GetValue(i, &value.length_);                 
     \
+        RETURN_NOT_OK(Append(value));                                          
     \
+      }                                                                        
     \
+    }                                                                          
     \
+    return Status::OK();                                                       
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  int64_t DictionaryBuilder<Type>::HashValue(const WrappedBinary& value) {     
     \
+    return HashUtil::Hash(value.ptr_, value.length_, 0);                       
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  bool DictionaryBuilder<Type>::SlotDifferent(hash_slot_t index,               
     \
+                                              const WrappedBinary& value) {    
     \
+    int32_t other_length;                                                      
     \
+    const uint8_t* other_value =                                               
     \
+        dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);    
     \
+    return !(other_length == value.length_ &&                                  
     \
+             0 == memcmp(other_value, value.ptr_, value.length_));             
     \
+  }
+
+BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
+BINARY_DICTIONARY_SPECIALIZATIONS(BinaryType);
+
+template class DictionaryBuilder<UInt8Type>;
+template class DictionaryBuilder<UInt16Type>;
+template class DictionaryBuilder<UInt32Type>;
+template class DictionaryBuilder<UInt64Type>;
+template class DictionaryBuilder<Int8Type>;
+template class DictionaryBuilder<Int16Type>;
+template class DictionaryBuilder<Int32Type>;
+template class DictionaryBuilder<Int64Type>;
+template class DictionaryBuilder<Date32Type>;
+template class DictionaryBuilder<Date64Type>;
+template class DictionaryBuilder<Time32Type>;
+template class DictionaryBuilder<Time64Type>;
+template class DictionaryBuilder<TimestampType>;
+template class DictionaryBuilder<FloatType>;
+template class DictionaryBuilder<DoubleType>;
+template class DictionaryBuilder<FixedSizeBinaryType>;
+template class DictionaryBuilder<BinaryType>;
+template class DictionaryBuilder<StringType>;
+
 // ----------------------------------------------------------------------
 // Decimal128Builder
 
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index e59e16658..ce7b8cd19 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -32,6 +32,7 @@
 #include "arrow/type.h"
 #include "arrow/type_traits.h"
 #include "arrow/util/bit-util.h"
+#include "arrow/util/hash.h"
 #include "arrow/util/macros.h"
 #include "arrow/util/visibility.h"
 
@@ -809,6 +810,158 @@ class ARROW_EXPORT StructBuilder : public ArrayBuilder {
   std::vector<std::unique_ptr<ArrayBuilder>> field_builders_;
 };
 
+// ----------------------------------------------------------------------
+// Dictionary builder
+
+namespace internal {
+
+// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
+struct WrappedBinary {
+  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), 
length_(length) {}
+
+  const uint8_t* ptr_;
+  int32_t length_;
+};
+
+template <typename T>
+struct DictionaryScalar {
+  using type = typename T::c_type;
+};
+
+template <>
+struct DictionaryScalar<BinaryType> {
+  using type = WrappedBinary;
+};
+
+template <>
+struct DictionaryScalar<StringType> {
+  using type = WrappedBinary;
+};
+
+template <>
+struct DictionaryScalar<FixedSizeBinaryType> {
+  using type = uint8_t const*;
+};
+
+}  // namespace internal
+
+/// \brief Array builder for created encoded DictionaryArray from dense array
+/// data
+template <typename T>
+class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
+ public:
+  using Scalar = typename internal::DictionaryScalar<T>::type;
+
+  ~DictionaryBuilder() {}
+
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+
+  template <typename T1 = T>
+  explicit DictionaryBuilder(
+      typename std::enable_if<TypeTraits<T1>::is_parameter_free, 
MemoryPool*>::type pool)
+      : DictionaryBuilder<T1>(TypeTraits<T1>::type_singleton(), pool) {}
+
+  /// \brief Append a scalar value
+  Status Append(const Scalar& value);
+
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
+  Status Init(int64_t elements) override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ protected:
+  Status DoubleTableSize();
+  Scalar GetDictionaryValue(int64_t index);
+  int64_t HashValue(const Scalar& value);
+  bool SlotDifferent(hash_slot_t slot, const Scalar& value);
+  Status AppendDictionary(const Scalar& value);
+
+  std::shared_ptr<Buffer> hash_table_;
+  int32_t* hash_slots_;
+
+  /// Size of the table. Must be a power of 2.
+  int64_t hash_table_size_;
+
+  // Store hash_table_size_ - 1, so that j & mod_bitmask_ is equivalent to j %
+  // hash_table_size_, but uses far fewer CPU cycles
+  int64_t mod_bitmask_;
+
+  typename TypeTraits<T>::BuilderType dict_builder_;
+  AdaptiveIntBuilder values_builder_;
+  int32_t byte_width_;
+
+  /// Size at which we decide to resize
+  int64_t hash_table_load_threshold_;
+};
+
+template <>
+class ARROW_EXPORT DictionaryBuilder<NullType> : public ArrayBuilder {
+ public:
+  ~DictionaryBuilder();
+
+  DictionaryBuilder(const std::shared_ptr<DataType>& type, MemoryPool* pool);
+  explicit DictionaryBuilder(MemoryPool* pool);
+
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
+  Status Init(int64_t elements) override;
+  Status Resize(int64_t capacity) override;
+  Status FinishInternal(std::shared_ptr<ArrayData>* out) override;
+
+ protected:
+  AdaptiveIntBuilder values_builder_;
+};
+
+class ARROW_EXPORT BinaryDictionaryBuilder : public 
DictionaryBuilder<BinaryType> {
+ public:
+  using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(internal::WrappedBinary(value, length));
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(
+        internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), 
length));
+  }
+
+  Status Append(const std::string& value) {
+    return Append(internal::WrappedBinary(reinterpret_cast<const 
uint8_t*>(value.c_str()),
+                                          static_cast<int32_t>(value.size())));
+  }
+};
+
+/// \brief Dictionary array builder with convenience methods for strings
+class ARROW_EXPORT StringDictionaryBuilder : public 
DictionaryBuilder<StringType> {
+ public:
+  using DictionaryBuilder::Append;
+  using DictionaryBuilder::DictionaryBuilder;
+
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(internal::WrappedBinary(value, length));
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(
+        internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), 
length));
+  }
+
+  Status Append(const std::string& value) {
+    return Append(internal::WrappedBinary(reinterpret_cast<const 
uint8_t*>(value.c_str()),
+                                          static_cast<int32_t>(value.size())));
+  }
+};
+
 // ----------------------------------------------------------------------
 // Helper functions
 
diff --git a/cpp/src/arrow/compute/kernels/hash.cc 
b/cpp/src/arrow/compute/kernels/hash.cc
index 750f1d36a..1face78bd 100644
--- a/cpp/src/arrow/compute/kernels/hash.cc
+++ b/cpp/src/arrow/compute/kernels/hash.cc
@@ -30,21 +30,13 @@
 #include "arrow/compute/kernel.h"
 #include "arrow/compute/kernels/util-internal.h"
 #include "arrow/util/hash-util.h"
+#include "arrow/util/hash.h"
 
 namespace arrow {
 namespace compute {
 
 namespace {
 
-// Initially 1024 elements
-static constexpr int64_t kInitialHashTableSize = 1 << 10;
-
-typedef int32_t hash_slot_t;
-static constexpr hash_slot_t kHashSlotEmpty = 
std::numeric_limits<int32_t>::max();
-
-// The maximum load factor for the hash table before resizing.
-static constexpr double kMaxHashTableLoad = 0.5;
-
 enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
 
 #define CHECK_IMPLEMENTED(KERNEL, FUNCNAME, TYPE)                  \
@@ -54,17 +46,6 @@ enum class SIMDMode : char { NOSIMD, SSE4, AVX2 };
     return Status::NotImplemented(ss.str());                       \
   }
 
-Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr<Buffer>* 
out) {
-  auto hash_table = std::make_shared<PoolBuffer>(pool);
-
-  RETURN_NOT_OK(hash_table->Resize(sizeof(hash_slot_t) * size));
-  int32_t* slots = reinterpret_cast<hash_slot_t*>(hash_table->mutable_data());
-  std::fill(slots, slots + size, kHashSlotEmpty);
-
-  *out = hash_table;
-  return Status::OK();
-}
-
 // This is a slight design concession -- some hash actions have the possibility
 // of failure. Rather than introduce extra error checking into all actions, we
 // will raise an internal exception so that only the actions where errors can
@@ -129,7 +110,7 @@ class HashTable {
 
 Status HashTable::Init(int64_t elements) {
   DCHECK_EQ(elements, BitUtil::NextPower2(elements));
-  RETURN_NOT_OK(NewHashTable(elements, pool_, &hash_table_));
+  RETURN_NOT_OK(internal::NewHashTable(elements, pool_, &hash_table_));
   hash_slots_ = reinterpret_cast<hash_slot_t*>(hash_table_->mutable_data());
   hash_table_size_ = elements;
   hash_table_load_threshold_ =
@@ -238,44 +219,6 @@ struct HashDictionary<Type, enable_if_has_c_type<Type>> {
     }                                                                          
          \
   }
 
-#define DOUBLE_TABLE_SIZE(SETUP_CODE, COMPUTE_HASH)                            
  \
-  do {                                                                         
  \
-    int64_t new_size = hash_table_size_ * 2;                                   
  \
-                                                                               
  \
-    std::shared_ptr<Buffer> new_hash_table;                                    
  \
-    RETURN_NOT_OK(NewHashTable(new_size, pool_, &new_hash_table));             
  \
-    int32_t* new_hash_slots =                                                  
  \
-        reinterpret_cast<hash_slot_t*>(new_hash_table->mutable_data());        
  \
-    int64_t new_mod_bitmask = new_size - 1;                                    
  \
-                                                                               
  \
-    SETUP_CODE;                                                                
  \
-                                                                               
  \
-    for (int i = 0; i < hash_table_size_; ++i) {                               
  \
-      hash_slot_t index = hash_slots_[i];                                      
  \
-                                                                               
  \
-      if (index == kHashSlotEmpty) {                                           
  \
-        continue;                                                              
  \
-      }                                                                        
  \
-                                                                               
  \
-      COMPUTE_HASH;                                                            
  \
-      while (kHashSlotEmpty != new_hash_slots[j]) {                            
  \
-        ++j;                                                                   
  \
-        if (ARROW_PREDICT_FALSE(j == new_size)) {                              
  \
-          j = 0;                                                               
  \
-        }                                                                      
  \
-      }                                                                        
  \
-                                                                               
  \
-      new_hash_slots[j] = index;                                               
  \
-    }                                                                          
  \
-                                                                               
  \
-    hash_table_ = new_hash_table;                                              
  \
-    hash_slots_ = reinterpret_cast<hash_slot_t*>(hash_table_->mutable_data()); 
  \
-    hash_table_size_ = new_size;                                               
  \
-    hash_table_load_threshold_ =                                               
  \
-        static_cast<int64_t>(static_cast<double>(new_size) * 
kMaxHashTableLoad); \
-    mod_bitmask_ = new_size - 1;                                               
  \
-  } while (false)
-
 template <typename Type, typename Action>
 class HashTableKernel<Type, Action, enable_if_has_c_type<Type>> : public 
HashTable {
  public:
diff --git a/cpp/src/arrow/util/CMakeLists.txt 
b/cpp/src/arrow/util/CMakeLists.txt
index 29b18a935..42613d6a5 100644
--- a/cpp/src/arrow/util/CMakeLists.txt
+++ b/cpp/src/arrow/util/CMakeLists.txt
@@ -34,6 +34,7 @@ install(FILES
   cpu-info.h
   decimal.h
   hash-util.h
+  hash.h
   key_value_metadata.h
   logging.h
   macros.h
diff --git a/cpp/src/arrow/util/hash.cc b/cpp/src/arrow/util/hash.cc
new file mode 100644
index 000000000..94ba52456
--- /dev/null
+++ b/cpp/src/arrow/util/hash.cc
@@ -0,0 +1,38 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "arrow/util/hash.h"
+
+#include "arrow/buffer.h"
+#include "arrow/status.h"
+
+namespace arrow {
+namespace internal {
+
+Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr<Buffer>* 
out) {
+  auto hash_table = std::make_shared<PoolBuffer>(pool);
+
+  RETURN_NOT_OK(hash_table->Resize(sizeof(hash_slot_t) * size));
+  int32_t* slots = reinterpret_cast<hash_slot_t*>(hash_table->mutable_data());
+  std::fill(slots, slots + size, kHashSlotEmpty);
+
+  *out = hash_table;
+  return Status::OK();
+}
+
+}  // namespace internal
+}  // namespace arrow
diff --git a/cpp/src/arrow/util/hash.h b/cpp/src/arrow/util/hash.h
new file mode 100644
index 000000000..359734271
--- /dev/null
+++ b/cpp/src/arrow/util/hash.h
@@ -0,0 +1,85 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#ifndef ARROW_UTIL_HASH_H
+#define ARROW_UTIL_HASH_H
+
+#include <cstdint>
+#include <limits>
+#include <memory>
+
+namespace arrow {
+
+class Buffer;
+class MemoryPool;
+class Status;
+
+typedef int32_t hash_slot_t;
+static constexpr hash_slot_t kHashSlotEmpty = 
std::numeric_limits<int32_t>::max();
+
+// Initially 1024 elements
+static constexpr int kInitialHashTableSize = 1 << 10;
+
+// The maximum load factor for the hash table before resizing.
+static constexpr double kMaxHashTableLoad = 0.5;
+
+namespace internal {
+
+#define DOUBLE_TABLE_SIZE(SETUP_CODE, COMPUTE_HASH)                            
  \
+  do {                                                                         
  \
+    int64_t new_size = hash_table_size_ * 2;                                   
  \
+                                                                               
  \
+    std::shared_ptr<Buffer> new_hash_table;                                    
  \
+    RETURN_NOT_OK(internal::NewHashTable(new_size, pool_, &new_hash_table));   
  \
+    int32_t* new_hash_slots =                                                  
  \
+        reinterpret_cast<hash_slot_t*>(new_hash_table->mutable_data());        
  \
+    int64_t new_mod_bitmask = new_size - 1;                                    
  \
+                                                                               
  \
+    SETUP_CODE;                                                                
  \
+                                                                               
  \
+    for (int i = 0; i < hash_table_size_; ++i) {                               
  \
+      hash_slot_t index = hash_slots_[i];                                      
  \
+                                                                               
  \
+      if (index == kHashSlotEmpty) {                                           
  \
+        continue;                                                              
  \
+      }                                                                        
  \
+                                                                               
  \
+      COMPUTE_HASH;                                                            
  \
+      while (kHashSlotEmpty != new_hash_slots[j]) {                            
  \
+        ++j;                                                                   
  \
+        if (ARROW_PREDICT_FALSE(j == new_size)) {                              
  \
+          j = 0;                                                               
  \
+        }                                                                      
  \
+      }                                                                        
  \
+                                                                               
  \
+      new_hash_slots[j] = index;                                               
  \
+    }                                                                          
  \
+                                                                               
  \
+    hash_table_ = new_hash_table;                                              
  \
+    hash_slots_ = reinterpret_cast<hash_slot_t*>(hash_table_->mutable_data()); 
  \
+    hash_table_size_ = new_size;                                               
  \
+    hash_table_load_threshold_ =                                               
  \
+        static_cast<int64_t>(static_cast<double>(new_size) * 
kMaxHashTableLoad); \
+    mod_bitmask_ = new_size - 1;                                               
  \
+  } while (false)
+
+Status NewHashTable(int64_t size, MemoryPool* pool, std::shared_ptr<Buffer>* 
out);
+
+}  // namespace internal
+}  // namespace arrow
+
+#endif  // ARROW_UTIL_HASH_H


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Reintroduce DictionaryBuilder
> -----------------------------------
>
>                 Key: ARROW-1882
>                 URL: https://issues.apache.org/jira/browse/ARROW-1882
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++
>            Reporter: Uwe L. Korn
>            Assignee: Uwe L. Korn
>            Priority: Critical
>              Labels: pull-request-available
>             Fix For: 0.8.0
>
>
> We need the {{DictionaryBuilder}} to incrementally build Arrow Arrays of 
> {{DictionaryType}}. The kernels only support en-bloc conversions of Arrays 
> which yields a higher memory usage.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to