[ 
https://issues.apache.org/jira/browse/ARROW-1559?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16225663#comment-16225663
 ] 

ASF GitHub Bot commented on ARROW-1559:
---------------------------------------

wesm commented on a change in pull request #1266: WIP: ARROW-1559: Add unique 
kernel
URL: https://github.com/apache/arrow/pull/1266#discussion_r147820948
 
 

 ##########
 File path: cpp/src/arrow/builder.cc
 ##########
 @@ -1110,20 +1034,202 @@ Status DictionaryBuilder<T>::AppendDictionary(const 
Scalar& value) {
   }                                                                            
     \
                                                                                
     \
   template <>                                                                  
     \
-  int DictionaryBuilder<Type>::HashValue(const WrappedBinary& value) {         
     \
+  WrappedBinary UniqueBuilder<Type>::GetDictionaryValue(int64_t index) {       
     \
+    int32_t v_len;                                                             
     \
+    const uint8_t* v = dict_builder_.GetValue(static_cast<int64_t>(index), 
&v_len); \
+    return WrappedBinary(v, v_len);                                            
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  Status UniqueBuilder<Type>::AppendDictionary(const WrappedBinary& value) {   
     \
+    return dict_builder_.Append(value.ptr_, value.length_);                    
     \
+  }                                                                            
     \
+                                                                               
     \
+  template <>                                                                  
     \
+  int UniqueBuilder<Type>::HashValue(const WrappedBinary& value) {             
     \
     return HashUtil::Hash(value.ptr_, value.length_, 0);                       
     \
   }                                                                            
     \
                                                                                
     \
   template <>                                                                  
     \
-  bool DictionaryBuilder<Type>::SlotDifferent(hash_slot_t index,               
     \
-                                              const WrappedBinary& value) {    
     \
+  bool UniqueBuilder<Type>::SlotDifferent(hash_slot_t index,                   
     \
+                                          const WrappedBinary& value) {        
     \
     int32_t other_length;                                                      
     \
     const uint8_t* other_value =                                               
     \
         dict_builder_.GetValue(static_cast<int64_t>(index), &other_length);    
     \
     return !(other_length == value.length_ &&                                  
     \
              0 == memcmp(other_value, value.ptr_, value.length_));             
     \
   }
 
+BINARY_UNIQUE_SPECIALIZATIONS(StringType);
+BINARY_UNIQUE_SPECIALIZATIONS(BinaryType);
+
+template <typename T>
+Status UniqueBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  return dict_builder_.FinishInternal(out);
+}
+
+template class UniqueBuilder<UInt8Type>;
+template class UniqueBuilder<UInt16Type>;
+template class UniqueBuilder<UInt32Type>;
+template class UniqueBuilder<UInt64Type>;
+template class UniqueBuilder<Int8Type>;
+template class UniqueBuilder<Int16Type>;
+template class UniqueBuilder<Int32Type>;
+template class UniqueBuilder<Int64Type>;
+template class UniqueBuilder<Date32Type>;
+template class UniqueBuilder<Date64Type>;
+template class UniqueBuilder<Time32Type>;
+template class UniqueBuilder<Time64Type>;
+template class UniqueBuilder<TimestampType>;
+template class UniqueBuilder<FloatType>;
+template class UniqueBuilder<DoubleType>;
+template class UniqueBuilder<FixedSizeBinaryType>;
+template class UniqueBuilder<BinaryType>;
+template class UniqueBuilder<StringType>;
+
+// ----------------------------------------------------------------------
+// DictionaryBuilder
+
+template <typename T>
+DictionaryBuilder<T>::DictionaryBuilder(const std::shared_ptr<DataType>& type,
+                                        MemoryPool* pool)
+    : ArrayBuilder(type, pool), unique_builder_(type, pool), 
values_builder_(pool) {}
+
+DictionaryBuilder<NullType>::DictionaryBuilder(const 
std::shared_ptr<DataType>& type,
+                                               MemoryPool* pool)
+    : ArrayBuilder(type, pool), values_builder_(pool) {}
+
+DictionaryBuilder<NullType>::~DictionaryBuilder() {}
+
+template <>
+DictionaryBuilder<FixedSizeBinaryType>::DictionaryBuilder(
+    const std::shared_ptr<DataType>& type, MemoryPool* pool)
+    : ArrayBuilder(type, pool), unique_builder_(type, pool), 
values_builder_(pool) {}
+
+template <typename T>
+Status DictionaryBuilder<T>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  RETURN_NOT_OK(unique_builder_.Init(elements));
+  return values_builder_.Init(elements);
+}
+
+Status DictionaryBuilder<NullType>::Init(int64_t elements) {
+  RETURN_NOT_OK(ArrayBuilder::Init(elements));
+  return values_builder_.Init(elements);
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    RETURN_NOT_OK(unique_builder_.Resize(capacity));
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+Status DictionaryBuilder<NullType>::Resize(int64_t capacity) {
+  if (capacity < kMinBuilderCapacity) {
+    capacity = kMinBuilderCapacity;
+  }
+
+  if (capacity_ == 0) {
+    return Init(capacity);
+  } else {
+    return ArrayBuilder::Resize(capacity);
+  }
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::FinishInternal(std::shared_ptr<ArrayData>* out) {
+  std::shared_ptr<Array> dictionary;
+  RETURN_NOT_OK(unique_builder_.Finish(&dictionary));
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+Status DictionaryBuilder<NullType>::FinishInternal(std::shared_ptr<ArrayData>* 
out) {
+  std::shared_ptr<Array> dictionary = std::make_shared<NullArray>(0);
+
+  RETURN_NOT_OK(values_builder_.FinishInternal(out));
+  (*out)->type = std::make_shared<DictionaryType>((*out)->type, dictionary);
+  return Status::OK();
+}
+
+template <typename T>
+Status DictionaryBuilder<T>::Append(const Scalar& value) {
+  RETURN_NOT_OK(Reserve(1));
+  int32_t index;
+  RETURN_NOT_OK(unique_builder_.Append(value, &index));
 
 Review comment:
   As part of this we need to also define a stateful API so that we can keep 
pushing more data through the kernel for the ones that need to maintain state. 
Some of the "modes" of the hash table will write their results into output 
memory, where others (e.g. unique) will keep accumulating the result

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [C++] Kernel implementations for "unique" (compute distinct elements of array)
> ------------------------------------------------------------------------------
>
>                 Key: ARROW-1559
>                 URL: https://issues.apache.org/jira/browse/ARROW-1559
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: C++
>            Reporter: Wes McKinney
>            Assignee: Uwe L. Korn
>              Labels: Analytics, pull-request-available
>             Fix For: 0.8.0
>
>




--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to