pitrou commented on code in PR #50248:
URL: https://github.com/apache/arrow/pull/50248#discussion_r3472786404


##########
cpp/src/arrow/compute/kernels/vector_sort_internal.h:
##########
@@ -106,382 +106,315 @@ int CompareTypeValues(Value&& left, Value&& right, 
SortOrder order,
 }
 
 template <typename IndexType>
-struct GenericNullPartitionResult {
-  IndexType* non_nulls_begin;
-  IndexType* non_nulls_end;
-  IndexType* nulls_begin;
-  IndexType* nulls_end;
+struct GenericPartitionResultByNullLikeness {

Review Comment:
   Let's just keep the old name? Or name it `GenericNullLikePartition` which is 
a bit shorter?



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -679,33 +656,36 @@ class TableSorter {
     for (int64_t i = 0; i < num_batches; ++i) {
       const auto& batch = *batches_[i];
       end_offset += batch.num_rows();
-      RadixRecordBatchSorter sorter(indices_begin_ + begin_offset,
-                                    indices_begin_ + end_offset, batch, 
options_);
+      RadixRecordBatchSorter sorter(
+          {indices_.data() + begin_offset, indices_.data() + end_offset}, 
batch,
+          options_);
       ARROW_ASSIGN_OR_RAISE(sorted[i], sorter.Sort(begin_offset));
-      DCHECK_EQ(sorted[i].overall_begin(), indices_begin_ + begin_offset);
-      DCHECK_EQ(sorted[i].overall_end(), indices_begin_ + end_offset);
-      DCHECK_EQ(sorted[i].non_null_count() + sorted[i].null_count(), 
batch.num_rows());
+      DCHECK_EQ(sorted[i].overall_begin(), indices_.data() + begin_offset);
+      DCHECK_EQ(sorted[i].overall_end(), indices_.data() + end_offset);
+      DCHECK_EQ(static_cast<int64_t>(sorted[i].non_null_like_range.size() +
+                                     sorted[i].null_range.size()),
+                batch.num_rows());
       begin_offset = end_offset;
       // XXX this is an upper bound on the true null count

Review Comment:
   Is this XXX still true? Presumably it implied that `null_count` could also 
account for nan values, but that is not the case anymore?



##########
cpp/src/arrow/compute/kernels/vector_sort.cc:
##########
@@ -679,33 +656,36 @@ class TableSorter {
     for (int64_t i = 0; i < num_batches; ++i) {
       const auto& batch = *batches_[i];
       end_offset += batch.num_rows();
-      RadixRecordBatchSorter sorter(indices_begin_ + begin_offset,
-                                    indices_begin_ + end_offset, batch, 
options_);
+      RadixRecordBatchSorter sorter(
+          {indices_.data() + begin_offset, indices_.data() + end_offset}, 
batch,
+          options_);
       ARROW_ASSIGN_OR_RAISE(sorted[i], sorter.Sort(begin_offset));
-      DCHECK_EQ(sorted[i].overall_begin(), indices_begin_ + begin_offset);
-      DCHECK_EQ(sorted[i].overall_end(), indices_begin_ + end_offset);
-      DCHECK_EQ(sorted[i].non_null_count() + sorted[i].null_count(), 
batch.num_rows());
+      DCHECK_EQ(sorted[i].overall_begin(), indices_.data() + begin_offset);
+      DCHECK_EQ(sorted[i].overall_end(), indices_.data() + end_offset);
+      DCHECK_EQ(static_cast<int64_t>(sorted[i].non_null_like_range.size() +
+                                     sorted[i].null_range.size()),
+                batch.num_rows());

Review Comment:
   Shouldn't we also add `nan_range.size()` here?



##########
cpp/src/arrow/compute/kernels/vector_sort_internal.h:
##########
@@ -106,382 +106,315 @@ int CompareTypeValues(Value&& left, Value&& right, 
SortOrder order,
 }
 
 template <typename IndexType>
-struct GenericNullPartitionResult {
-  IndexType* non_nulls_begin;
-  IndexType* non_nulls_end;
-  IndexType* nulls_begin;
-  IndexType* nulls_end;
+struct GenericPartitionResultByNullLikeness {
+  std::span<IndexType> non_null_like_range;
+  std::span<IndexType> nan_range;
+  std::span<IndexType> null_range;
 
-  IndexType* overall_begin() const { return std::min(nulls_begin, 
non_nulls_begin); }
-
-  IndexType* overall_end() const { return std::max(nulls_end, non_nulls_end); }
-
-  int64_t non_null_count() const { return non_nulls_end - non_nulls_begin; }
+  IndexType* overall_begin() const {
+    return std::min(non_null_like_range.data(), null_range.data());
+  }
 
-  int64_t null_count() const { return nulls_end - nulls_begin; }
+  IndexType* overall_end() const {
+    return std::max(non_null_like_range.data() + non_null_like_range.size(),
+                    null_range.data() + null_range.size());
+  }
 
-  static GenericNullPartitionResult NoNulls(IndexType* indices_begin,
-                                            IndexType* indices_end,
-                                            NullPlacement null_placement) {
-    if (null_placement == NullPlacement::AtStart) {
-      return {indices_begin, indices_end, indices_begin, indices_begin};
+  template <typename TargetIndexType>
+  GenericPartitionResultByNullLikeness<TargetIndexType> TranslateTo(
+      IndexType* indices_begin, TargetIndexType* target_indices_begin) const {
+    return {.non_null_like_range = {(non_null_like_range.data() - 
indices_begin) +
+                                        target_indices_begin,
+                                    non_null_like_range.size()},
+            .nan_range = {(nan_range.data() - indices_begin) + 
target_indices_begin,
+                          nan_range.size()},
+            .null_range = {(null_range.data() - indices_begin) + 
target_indices_begin,
+                           null_range.size()}};
+  }
+
+  static GenericPartitionResultByNullLikeness fromCounts(std::span<IndexType> 
indices,

Review Comment:
   Style nit: `FromCounts`



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to