[ 
https://issues.apache.org/jira/browse/ARROW-1559?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16225491#comment-16225491
 ] 

ASF GitHub Bot commented on ARROW-1559:
---------------------------------------

wesm commented on a change in pull request #1266: WIP: ARROW-1559: Add unique 
kernel
URL: https://github.com/apache/arrow/pull/1266#discussion_r147791018
 
 

 ##########
 File path: cpp/src/arrow/builder.cc
 ##########
 @@ -1110,20 +1034,202 @@ Status DictionaryBuilder<T>::AppendDictionary(const 
Scalar& value) {
   }                                                                            
     \
                                                                                
     \
   template <>                                                                  
     \
-  int DictionaryBuilder<Type>::HashValue(const WrappedBinary& value) {         
     \
+  WrappedBinary UniqueBuilder<Type>::GetDictionaryValue(int64_t index) {       
     \
+    int32_t v_len;                                                             
     \
+    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), 
&v_len); \
+    return WrappedBinary(v, v_len);                                            
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  Status UniqueBuilder<Type>::AppendDictionary(const WrappedBinary& value) {   
     \
+    return dict_builder_.Append(value.ptr_, value.length_);                    
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  int UniqueBuilder<Type>::HashValue(const WrappedBinary& value) {             
     \
     return HashUtil::Hash(value.ptr_, value.length_, 0);                       
     \
   }                                                                            
     \
                                                                                
     \
   template <>                                                                  
     \
-  bool DictionaryBuilder<Type>::SlotDifferent(hash_slot_t index,               
     \
-                                              const WrappedBinary& value) {    
     \
+  bool UniqueBuilder<Type>::SlotDifferent(hash_slot_t index,                   
     \
+                                          const WrappedBinary& value) {        
     \
     int32_t other_length;                                                      
     \
     const uint8_t* other_value =                                               
     \
         dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);    
     \
     return !(other_length == value.length_ &&                                  
     \
              0 == memcmp(other_value, value.ptr_, value.length_));             
     \
   }
 
+BINARY_UNIQUE_SPECIALIZATIONS(StringType);
+BINARY_UNIQUE_SPECIALIZATIONS(BinaryType);
+
+template <typename T>
+Status UniqueBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  return dict_builder_.FinishInternal(out);
+}
+
+template class UniqueBuilder<UInt8Type>;
+template class UniqueBuilder<UInt16Type>;
+template class UniqueBuilder<UInt32Type>;
+template class UniqueBuilder<UInt64Type>;
+template class UniqueBuilder<Int8Type>;
+template class UniqueBuilder<Int16Type>;
+template class UniqueBuilder<Int32Type>;
+template class UniqueBuilder<Int64Type>;
+template class UniqueBuilder<Date32Type>;
+template class UniqueBuilder<Date64Type>;
+template class UniqueBuilder<Time32Type>;
+template class UniqueBuilder<Time64Type>;
+template class UniqueBuilder<TimestampType>;
+template class UniqueBuilder<FloatType>;
+template class UniqueBuilder<DoubleType>;
+template class UniqueBuilder<FixedSizeBinaryType>;
+template class UniqueBuilder<BinaryType>;
+template class UniqueBuilder<StringType>;
+
+// ----------------------------------------------------------------------
+// DictionaryBuilder
+
+template <typename T>
+DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
+                                        MemoryPool* pool)
+    : ArrayBuilder(type, pool), unique_builder_(type, pool), 
values_builder_(pool) {}
+
+DictionaryBuilder<NullType>::DictionaryBuilder(const 
std::shared_ptr<DataType>& type,
+                                               MemoryPool* pool)
+    : ArrayBuilder(type, pool), values_builder_(pool) {}
+
+DictionaryBuilder<NullType>::~DictionaryBuilder() {}
+
+template <>
+DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool)
+    : ArrayBuilder(type, pool), unique_builder_(type, pool), 
values_builder_(pool) {}
+
+template <typename T>
+Status DictionaryBuilder<T>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  RETURN_NOT_OK(unique_builder_.Init(elements));
+  return values_builder_.Init(elements);
+}
+
+Status DictionaryBuilder<NullType>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  return values_builder_.Init(elements);
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    RETURN_NOT_OK(unique_builder_.Resize(capacity));
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  std::shared_ptr<Array> dictionary;
+  RETURN_NOT_OK(unique_builder_.Finish(&dictionary));
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* 
out) {
+  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Append(const Scalar& value) {
+  RETURN_NOT_OK(Reserve(1));
+  int32_t index;
+  RETURN_NOT_OK(unique_builder_.Append(value, &index));
 
 Review comment:
   In my experience with implementing these things in the past, we are probably 
going to want to create a base hash table pass class that accepts a functor 
whose action can be inlined in the innermost loop of the hash table pass. This 
functor would accumulate the results of the hash table pass, whether it's one 
of:
   
   * isin (values_to_test, value_set) -> boolean array
   * match (values_to_test, value_set) -> integer array (positional version of 
isin)
   * unique (values) -> unique values
   * dictionary-encode / factorize (values) -> unique values + category indices
   * value-counts(values) -> unique values + counts
   
   We should write functors for each of these "actions" and then instantiate 
kernels for each input type and functor.
   
   On this note, probably all of this code (including the DictionaryBuilder 
code -- such that it does not disrupt parquet-cpp) should move to 
src/arrow/compute.
   
   I also question whether we need to preserve a scalar append API when instead 
we could do hash table passes in batches in the places where incremental 
dictionary-encoding is important

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Kernel implementations for "unique" (compute distinct elements of array)
> ------------------------------------------------------------------------------
>
>                 Key: ARROW-1559
>                 URL: https://issues.apache.org/jira/browse/ARROW-1559
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: C++
>            Reporter: Wes McKinney
>            Assignee: Uwe L. Korn
>              Labels: Analytics, pull-request-available
>             Fix For: 0.8.0
>
>




--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to