Repository: arrow
Updated Branches:
  refs/heads/master 83a4405ea -> c398fda75


ARROW-462: [C++] Implement in-memory conversions between non-nested primitive 
types and DictionaryArray equivalent

Simple usage:

```
Array primitive_array;
DictionaryBuilder<Type> builder;
RETURN_NOT_OK(builder.AppendArray(primitive_array));
std::shared_ptr<Array> dict_encoded_array;
RETURN_NOT_OK(builder.Finish(&dict_encoded_array));
```

Author: Wes McKinney <wes.mckin...@twosigma.com>
Author: Uwe L. Korn <uw...@xhochy.com>

Closes #812 from xhochy/ARROW-462 and squashes the following commits:

9e289b6a [Wes McKinney] Add a couple doxygen strings. Place WrappedBinary, 
DictionaryScalar in internal namespace. Add BinaryDictionaryBuilder convenience
4147bdbe [Uwe L. Korn] ARROW-462: [C++] Implement in-memory conversions between 
non-nested primitive types and DictionaryArray equivalent


Project: http://git-wip-us.apache.org/repos/asf/arrow/repo
Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/c398fda7
Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/c398fda7
Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/c398fda7

Branch: refs/heads/master
Commit: c398fda75a90fc83bafb2aafb895760b44ecd436
Parents: 83a4405
Author: Wes McKinney <wes.mckin...@twosigma.com>
Authored: Thu Jul 6 20:18:38 2017 -0400
Committer: Wes McKinney <wes.mckin...@twosigma.com>
Committed: Thu Jul 6 20:18:38 2017 -0400

----------------------------------------------------------------------
 cpp/src/arrow/array-test.cc |  34 ++++++++
 cpp/src/arrow/array.h       |   2 +-
 cpp/src/arrow/builder.cc    | 165 +++++++++++++++++++++++++++------------
 cpp/src/arrow/builder.h     |  80 +++++++++++++++----
 4 files changed, 216 insertions(+), 65 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/array-test.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index 64672d4..7ae03cf 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1538,6 +1538,40 @@ TYPED_TEST(TestDictionaryBuilder, Basic) {
   ASSERT_TRUE(expected.Equals(result));
 }
 
+TYPED_TEST(TestDictionaryBuilder, ArrayConversion) {
+  NumericBuilder<TypeParam> builder(default_memory_pool());
+  // DictionaryBuilder<TypeParam> builder(default_memory_pool());
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  ASSERT_OK(builder.Append(static_cast<typename TypeParam::c_type>(1)));
+
+  std::shared_ptr<Array> intermediate_result;
+  ASSERT_OK(builder.Finish(&intermediate_result));
+  DictionaryBuilder<TypeParam> dictionary_builder(default_memory_pool());
+  ASSERT_OK(dictionary_builder.AppendArray(*intermediate_result));
+  std::shared_ptr<Array> result;
+  ASSERT_OK(dictionary_builder.Finish(&result));
+
+  // Build expected data
+  NumericBuilder<TypeParam> dict_builder(default_memory_pool());
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(1)));
+  ASSERT_OK(dict_builder.Append(static_cast<typename TypeParam::c_type>(2)));
+  std::shared_ptr<Array> dict_array;
+  ASSERT_OK(dict_builder.Finish(&dict_array));
+  auto dtype =
+      std::make_shared<DictionaryType>(std::make_shared<TypeParam>(), 
dict_array);
+
+  UInt8Builder int_builder(default_memory_pool());
+  ASSERT_OK(int_builder.Append(0));
+  ASSERT_OK(int_builder.Append(1));
+  ASSERT_OK(int_builder.Append(0));
+  std::shared_ptr<Array> int_array;
+  ASSERT_OK(int_builder.Finish(&int_array));
+
+  DictionaryArray expected(dtype, int_array);
+  ASSERT_TRUE(expected.Equals(result));
+}
+
 TYPED_TEST(TestDictionaryBuilder, DoubleTableSize) {
   using Scalar = typename TypeParam::c_type;
   // Skip this test for (u)int8

http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/array.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h
index 1c9769f..59269ad 100644
--- a/cpp/src/arrow/array.h
+++ b/cpp/src/arrow/array.h
@@ -478,7 +478,7 @@ class ARROW_EXPORT DictionaryArray : public Array {
   std::shared_ptr<Array> indices() const { return indices_; }
   std::shared_ptr<Array> dictionary() const;
 
-  const DictionaryType* dict_type() { return dict_type_; }
+  const DictionaryType* dict_type() const { return dict_type_; }
 
   std::shared_ptr<Array> Slice(int64_t offset, int64_t length) const override;
 

http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/builder.cc
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.cc b/cpp/src/arrow/builder.cc
index 16f252c..a57f75a 100644
--- a/cpp/src/arrow/builder.cc
+++ b/cpp/src/arrow/builder.cc
@@ -660,8 +660,8 @@ Status BooleanBuilder::Append(
 // ----------------------------------------------------------------------
 // DictionaryBuilder
 
-template <typename T, typename Scalar>
-DictionaryBuilder<T, Scalar>::DictionaryBuilder(
+template <typename T>
+DictionaryBuilder<T>::DictionaryBuilder(
     MemoryPool* pool, const std::shared_ptr<DataType>& type)
     : ArrayBuilder(pool, type),
       hash_table_(new PoolBuffer(pool)),
@@ -671,8 +671,8 @@ DictionaryBuilder<T, Scalar>::DictionaryBuilder(
   if (!::arrow::CpuInfo::initialized()) { ::arrow::CpuInfo::Init(); }
 }
 
-template <typename T, typename Scalar>
-Status DictionaryBuilder<T, Scalar>::Init(int64_t elements) {
+template <typename T>
+Status DictionaryBuilder<T>::Init(int64_t elements) {
   RETURN_NOT_OK(ArrayBuilder::Init(elements));
 
   // Fill the initial hash table
@@ -685,8 +685,8 @@ Status DictionaryBuilder<T, Scalar>::Init(int64_t elements) 
{
   return values_builder_.Init(elements);
 }
 
-template <typename T, typename Scalar>
-Status DictionaryBuilder<T, Scalar>::Resize(int64_t capacity) {
+template <typename T>
+Status DictionaryBuilder<T>::Resize(int64_t capacity) {
   if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; }
 
   if (capacity_ == 0) {
@@ -696,8 +696,8 @@ Status DictionaryBuilder<T, Scalar>::Resize(int64_t 
capacity) {
   }
 }
 
-template <typename T, typename Scalar>
-Status DictionaryBuilder<T, Scalar>::Finish(std::shared_ptr<Array>* out) {
+template <typename T>
+Status DictionaryBuilder<T>::Finish(std::shared_ptr<Array>* out) {
   std::shared_ptr<Array> dictionary;
   RETURN_NOT_OK(dict_builder_.Finish(&dictionary));
   auto type = std::make_shared<DictionaryType>(type_, dictionary);
@@ -709,8 +709,8 @@ Status DictionaryBuilder<T, 
Scalar>::Finish(std::shared_ptr<Array>* out) {
   return Status::OK();
 }
 
-template <typename T, typename Scalar>
-Status DictionaryBuilder<T, Scalar>::Append(const Scalar& value) {
+template <typename T>
+Status DictionaryBuilder<T>::Append(const Scalar& value) {
   RETURN_NOT_OK(Reserve(1));
   // Based on DictEncoder<DType>::Put
   int j = HashValue(value) & mod_bitmask_;
@@ -741,8 +741,26 @@ Status DictionaryBuilder<T, Scalar>::Append(const Scalar& 
value) {
   return Status::OK();
 }
 
-template <typename T, typename Scalar>
-Status DictionaryBuilder<T, Scalar>::DoubleTableSize() {
+template <typename T>
+Status DictionaryBuilder<T>::AppendArray(const Array& array) {
+  const NumericArray<T>& numeric_array = static_cast<const 
NumericArray<T>&>(array);
+  for (int64_t i = 0; i < array.length(); i++) {
+    if (array.IsNull(i)) {
+      RETURN_NOT_OK(AppendNull());
+    } else {
+      RETURN_NOT_OK(Append(numeric_array.Value(i)));
+    }
+  }
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::AppendNull() {
+  return values_builder_.AppendNull();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::DoubleTableSize() {
   int new_size = hash_table_size_ * 2;
   auto new_hash_table = std::make_shared<PoolBuffer>(pool_);
 
@@ -782,56 +800,71 @@ Status DictionaryBuilder<T, Scalar>::DoubleTableSize() {
   return Status::OK();
 }
 
-template <typename T, typename Scalar>
-Scalar DictionaryBuilder<T, Scalar>::GetDictionaryValue(int64_t index) {
+template <typename T>
+typename DictionaryBuilder<T>::Scalar DictionaryBuilder<T>::GetDictionaryValue(
+    int64_t index) {
   const Scalar* data = reinterpret_cast<const 
Scalar*>(dict_builder_.data()->data());
   return data[index];
 }
 
-template <typename T, typename Scalar>
-int DictionaryBuilder<T, Scalar>::HashValue(const Scalar& value) {
+template <typename T>
+int DictionaryBuilder<T>::HashValue(const Scalar& value) {
   return HashUtil::Hash(&value, sizeof(Scalar), 0);
 }
 
-template <typename T, typename Scalar>
-bool DictionaryBuilder<T, Scalar>::SlotDifferent(hash_slot_t index, const 
Scalar& value) {
+template <typename T>
+bool DictionaryBuilder<T>::SlotDifferent(hash_slot_t index, const Scalar& 
value) {
   const Scalar other = GetDictionaryValue(static_cast<int64_t>(index));
   return other != value;
 }
 
-template <typename T, typename Scalar>
-Status DictionaryBuilder<T, Scalar>::AppendDictionary(const Scalar& value) {
+template <typename T>
+Status DictionaryBuilder<T>::AppendDictionary(const Scalar& value) {
   return dict_builder_.Append(value);
 }
 
-#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                                
       \
-  template <>                                                                  
       \
-  WrappedBinary DictionaryBuilder<Type, WrappedBinary>::GetDictionaryValue(    
       \
-      int64_t index) {                                                         
       \
-    int32_t v_len;                                                             
       \
-    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), 
&v_len);   \
-    return WrappedBinary(v, v_len);                                            
       \
-  }                                                                            
       \
-                                                                               
       \
-  template <>                                                                  
       \
-  int DictionaryBuilder<Type, WrappedBinary>::HashValue(const WrappedBinary& 
value) { \
-    return HashUtil::Hash(value.ptr_, value.length_, 0);                       
       \
-  }                                                                            
       \
-                                                                               
       \
-  template <>                                                                  
       \
-  bool DictionaryBuilder<Type, WrappedBinary>::SlotDifferent(                  
       \
-      hash_slot_t index, const WrappedBinary& value) {                         
       \
-    int32_t other_length;                                                      
       \
-    const uint8_t* other_value =                                               
       \
-        dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);    
       \
-    return !(other_length == value.length_ &&                                  
       \
-             0 == memcmp(other_value, value.ptr_, value.length_));             
       \
-  }                                                                            
       \
-                                                                               
       \
-  template <>                                                                  
       \
-  Status DictionaryBuilder<Type, WrappedBinary>::AppendDictionary(             
       \
-      const WrappedBinary& value) {                                            
       \
-    return dict_builder_.Append(value.ptr_, value.length_);                    
       \
+#define BINARY_DICTIONARY_SPECIALIZATIONS(Type)                                
        \
+  template <>                                                                  
        \
+  internal::WrappedBinary DictionaryBuilder<Type>::GetDictionaryValue(int64_t 
index) { \
+    int32_t v_len;                                                             
        \
+    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), 
&v_len);    \
+    return internal::WrappedBinary(v, v_len);                                  
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  Status DictionaryBuilder<Type>::AppendDictionary(                            
        \
+      const internal::WrappedBinary& value) {                                  
        \
+    return dict_builder_.Append(value.ptr_, value.length_);                    
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  Status DictionaryBuilder<Type>::AppendArray(const Array& array) {            
        \
+    const BinaryArray& binary_array = static_cast<const BinaryArray&>(array);  
        \
+    internal::WrappedBinary value(nullptr, 0);                                 
        \
+    for (int64_t i = 0; i < array.length(); i++) {                             
        \
+      if (array.IsNull(i)) {                                                   
        \
+        RETURN_NOT_OK(AppendNull());                                           
        \
+      } else {                                                                 
        \
+        value.ptr_ = binary_array.GetValue(i, &value.length_);                 
        \
+        RETURN_NOT_OK(Append(value));                                          
        \
+      }                                                                        
        \
+    }                                                                          
        \
+    return Status::OK();                                                       
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  int DictionaryBuilder<Type>::HashValue(const internal::WrappedBinary& value) 
{       \
+    return HashUtil::Hash(value.ptr_, value.length_, 0);                       
        \
+  }                                                                            
        \
+                                                                               
        \
+  template <>                                                                  
        \
+  bool DictionaryBuilder<Type>::SlotDifferent(                                 
        \
+      hash_slot_t index, const internal::WrappedBinary& value) {               
        \
+    int32_t other_length;                                                      
        \
+    const uint8_t* other_value =                                               
        \
+        dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);    
        \
+    return !(other_length == value.length_ &&                                  
        \
+             0 == memcmp(other_value, value.ptr_, value.length_));             
        \
   }
 
 BINARY_DICTIONARY_SPECIALIZATIONS(StringType);
@@ -852,8 +885,8 @@ template class DictionaryBuilder<Time64Type>;
 template class DictionaryBuilder<TimestampType>;
 template class DictionaryBuilder<FloatType>;
 template class DictionaryBuilder<DoubleType>;
-template class DictionaryBuilder<BinaryType, WrappedBinary>;
-template class DictionaryBuilder<StringType, WrappedBinary>;
+template class DictionaryBuilder<BinaryType>;
+template class DictionaryBuilder<StringType>;
 
 // ----------------------------------------------------------------------
 // DecimalBuilder
@@ -1193,4 +1226,36 @@ Status MakeBuilder(MemoryPool* pool, const 
std::shared_ptr<DataType>& type,
   }
 }
 
+#define DICTIONARY_BUILDER_CASE(ENUM, BuilderType) \
+  case Type::ENUM:                                 \
+    out->reset(new BuilderType(pool, type));       \
+    return Status::OK();
+
+Status MakeDictionaryBuilder(MemoryPool* pool, const 
std::shared_ptr<DataType>& type,
+    std::shared_ptr<ArrayBuilder>* out) {
+  switch (type->id()) {
+    DICTIONARY_BUILDER_CASE(UINT8, DictionaryBuilder<UInt8Type>);
+    DICTIONARY_BUILDER_CASE(INT8, DictionaryBuilder<Int8Type>);
+    DICTIONARY_BUILDER_CASE(UINT16, DictionaryBuilder<UInt16Type>);
+    DICTIONARY_BUILDER_CASE(INT16, DictionaryBuilder<Int16Type>);
+    DICTIONARY_BUILDER_CASE(UINT32, DictionaryBuilder<UInt32Type>);
+    DICTIONARY_BUILDER_CASE(INT32, DictionaryBuilder<Int32Type>);
+    DICTIONARY_BUILDER_CASE(UINT64, DictionaryBuilder<UInt64Type>);
+    DICTIONARY_BUILDER_CASE(INT64, DictionaryBuilder<Int64Type>);
+    DICTIONARY_BUILDER_CASE(DATE32, DictionaryBuilder<Date32Type>);
+    DICTIONARY_BUILDER_CASE(DATE64, DictionaryBuilder<Date64Type>);
+    DICTIONARY_BUILDER_CASE(TIME32, DictionaryBuilder<Time32Type>);
+    DICTIONARY_BUILDER_CASE(TIME64, DictionaryBuilder<Time64Type>);
+    DICTIONARY_BUILDER_CASE(TIMESTAMP, DictionaryBuilder<TimestampType>);
+    DICTIONARY_BUILDER_CASE(FLOAT, DictionaryBuilder<FloatType>);
+    DICTIONARY_BUILDER_CASE(DOUBLE, DictionaryBuilder<DoubleType>);
+    DICTIONARY_BUILDER_CASE(STRING, StringDictionaryBuilder);
+    DICTIONARY_BUILDER_CASE(BINARY, BinaryDictionaryBuilder);
+    // DICTIONARY_BUILDER_CASE(FIXED_SIZE_BINARY, FixedSizeBinaryBuilder);
+    // DICTIONARY_BUILDER_CASE(DECIMAL, DecimalBuilder);
+    default:
+      return Status::NotImplemented(type->ToString());
+  }
+}
+
 }  // namespace arrow

http://git-wip-us.apache.org/repos/asf/arrow/blob/c398fda7/cpp/src/arrow/builder.h
----------------------------------------------------------------------
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index 5acefa1..12f3683 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -688,18 +688,55 @@ static constexpr hash_slot_t kHashSlotEmpty = 
std::numeric_limits<int32_t>::max(
 // The maximum load factor for the hash table before resizing.
 static constexpr double kMaxHashTableLoad = 0.7;
 
-template <typename T, typename Scalar = typename T::c_type>
+namespace internal {
+
+// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
+struct WrappedBinary {
+  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), 
length_(length) {}
+
+  const uint8_t* ptr_;
+  int32_t length_;
+};
+
+template <typename T>
+struct DictionaryScalar {
+  using type = typename T::c_type;
+};
+
+template <>
+struct DictionaryScalar<BinaryType> {
+  using type = WrappedBinary;
+};
+
+template <>
+struct DictionaryScalar<StringType> {
+  using type = WrappedBinary;
+};
+
+}  // namespace internal
+
+/// \brief Array builder for created encoded DictionaryArray from dense array
+/// data
+template <typename T>
 class ARROW_EXPORT DictionaryBuilder : public ArrayBuilder {
  public:
+  using Scalar = typename internal::DictionaryScalar<T>::type;
   explicit DictionaryBuilder(MemoryPool* pool, const 
std::shared_ptr<DataType>& type);
 
-  template <typename T1 = T, typename Scalar1 = Scalar>
+  template <typename T1 = T>
   explicit DictionaryBuilder(
       typename std::enable_if<TypeTraits<T1>::is_parameter_free, 
MemoryPool*>::type pool)
-      : DictionaryBuilder<T1, Scalar1>(pool, TypeTraits<T1>::type_singleton()) 
{}
+      : DictionaryBuilder<T1>(pool, TypeTraits<T1>::type_singleton()) {}
 
+  /// \brief Append a scalar value
   Status Append(const Scalar& value);
 
+  /// \brief Append a scalar null value
+  Status AppendNull();
+
+  /// \brief Append a whole dense array to the builder
+  Status AppendArray(const Array& array);
+
   Status Init(int64_t elements) override;
   Status Resize(int64_t capacity) override;
   Status Finish(std::shared_ptr<Array>* out) override;
@@ -725,31 +762,43 @@ class ARROW_EXPORT DictionaryBuilder : public 
ArrayBuilder {
   AdaptiveUIntBuilder values_builder_;
 };
 
-// TODO(ARROW-1176): Use Tensorflow's StringPiece instead of this here.
-struct WrappedBinary {
-  WrappedBinary(const uint8_t* ptr, int32_t length) : ptr_(ptr), 
length_(length) {}
+class ARROW_EXPORT BinaryDictionaryBuilder : public 
DictionaryBuilder<BinaryType> {
+ public:
+  using DictionaryBuilder::DictionaryBuilder;
+  using DictionaryBuilder::Append;
 
-  const uint8_t* ptr_;
-  int32_t length_;
+  Status Append(const uint8_t* value, int32_t length) {
+    return Append(internal::WrappedBinary(value, length));
+  }
+
+  Status Append(const char* value, int32_t length) {
+    return Append(
+        internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), 
length));
+  }
+
+  Status Append(const std::string& value) {
+    return Append(internal::WrappedBinary(reinterpret_cast<const 
uint8_t*>(value.c_str()),
+        static_cast<int32_t>(value.size())));
+  }
 };
 
-class ARROW_EXPORT StringDictionaryBuilder
-    : public DictionaryBuilder<StringType, WrappedBinary> {
+/// \brief Dictionary array builder with convenience methods for strings
+class ARROW_EXPORT StringDictionaryBuilder : public 
DictionaryBuilder<StringType> {
  public:
   using DictionaryBuilder::DictionaryBuilder;
-
   using DictionaryBuilder::Append;
 
   Status Append(const uint8_t* value, int32_t length) {
-    return Append(WrappedBinary(value, length));
+    return Append(internal::WrappedBinary(value, length));
   }
 
   Status Append(const char* value, int32_t length) {
-    return Append(WrappedBinary(reinterpret_cast<const uint8_t*>(value), 
length));
+    return Append(
+        internal::WrappedBinary(reinterpret_cast<const uint8_t*>(value), 
length));
   }
 
   Status Append(const std::string& value) {
-    return Append(WrappedBinary(reinterpret_cast<const 
uint8_t*>(value.c_str()),
+    return Append(internal::WrappedBinary(reinterpret_cast<const 
uint8_t*>(value.c_str()),
         static_cast<int32_t>(value.size())));
   }
 };
@@ -760,6 +809,9 @@ class ARROW_EXPORT StringDictionaryBuilder
 Status ARROW_EXPORT MakeBuilder(MemoryPool* pool, const 
std::shared_ptr<DataType>& type,
     std::unique_ptr<ArrayBuilder>* out);
 
+Status ARROW_EXPORT MakeDictionaryBuilder(MemoryPool* pool,
+    const std::shared_ptr<DataType>& type, std::shared_ptr<ArrayBuilder>* out);
+
 }  // namespace arrow
 
 #endif  // ARROW_BUILDER_H_

Reply via email to