icexelloss commented on code in PR #34311: URL: https://github.com/apache/arrow/pull/34311#discussion_r1127032532
########## cpp/src/arrow/compute/row/grouper.cc: ########## @@ -39,12 +43,521 @@ namespace arrow { using internal::checked_cast; +using internal::PrimitiveScalarBase; namespace compute { namespace { -struct GrouperImpl : Grouper { +constexpr uint32_t kNoGroupId = std::numeric_limits<uint32_t>::max(); + +using group_id_t = std::remove_const<decltype(kNoGroupId)>::type; +using GroupIdType = CTypeTraits<group_id_t>::ArrowType; +auto group_id_type = std::make_shared<GroupIdType>(); + +inline const uint8_t* GetValuesAsBytes(const ArrayData& data, int64_t offset = 0) { + DCHECK_GT(data.type->byte_width(), 0); + int64_t absolute_byte_offset = (data.offset + offset) * data.type->byte_width(); + return data.GetValues<uint8_t>(1, absolute_byte_offset); +} + +inline const uint8_t* GetValuesAsBytes(const ArraySpan& data, int64_t offset = 0) { + DCHECK_GT(data.type->byte_width(), 0); + int64_t absolute_byte_offset = (data.offset + offset) * data.type->byte_width(); + return data.GetValues<uint8_t>(1, absolute_byte_offset); +} + +template <typename Value> +Status CheckForGetNextSegment(const std::vector<Value>& values, int64_t length, + int64_t offset, const std::vector<TypeHolder>& key_types) { + if (offset < 0 || offset > length) { + return Status::Invalid("invalid grouping segmenter offset: ", offset); + } + if (values.size() != key_types.size()) { + return Status::Invalid("expected batch size ", key_types.size(), " but got ", + values.size()); + } + for (size_t i = 0; i < key_types.size(); i++) { + const auto& value = values[i]; + const auto& key_type = key_types[i]; + if (*value.type() != *key_type.type) { + return Status::Invalid("expected batch value ", i, " of type ", *key_type.type, + " but got ", *value.type()); + } + } + return Status::OK(); +} + +template <typename Batch> +enable_if_t<std::is_same<Batch, ExecSpan>::value || std::is_same<Batch, ExecBatch>::value, + Status> +CheckForGetNextSegment(const Batch& batch, int64_t offset, + const std::vector<TypeHolder>& 
key_types) { + return CheckForGetNextSegment(batch.values, batch.length, offset, key_types); +} + +struct BaseGroupingSegmenter : public GroupingSegmenter { + explicit BaseGroupingSegmenter(const std::vector<TypeHolder>& key_types) + : key_types_(key_types) {} + + const std::vector<TypeHolder>& key_types() const override { return key_types_; } + + std::vector<TypeHolder> key_types_; +}; + +GroupingSegment MakeSegment(int64_t batch_length, int64_t offset, int64_t length, + bool extends) { + return GroupingSegment{offset, length, offset + length >= batch_length, extends}; +} + +int64_t GetMatchLength(const uint8_t* match_bytes, int64_t match_width, Review Comment: I feel we are writing algorithms for the wrong use cases here. `A, A, B, B, A, A, A, A, A, A, A` is not our main use case to support and we should optimize/write the code for "ordered data" rather than "segment but not ordered data". Even in the AnyKeysSegmenter case I think we can do binary search instead of linear search for finding segment boundaries. Let's create a follow-up to fix this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org