benibus commented on code in PR #35280:
URL: https://github.com/apache/arrow/pull/35280#discussion_r1181839289


##########
cpp/src/arrow/compute/kernels/vector_array_sort.cc:
##########
@@ -173,6 +173,116 @@ class ArrayCompareSorter {
   }
 };
 
+template <>
+class ArrayCompareSorter<DictionaryType> {
+ public:
+  Result<NullPartitionResult> operator()(uint64_t* indices_begin, uint64_t* 
indices_end,
+                                         const Array& array, int64_t offset,
+                                         const ArraySortOptions& options,
+                                         ExecContext* ctx) {
+    const auto& dict_array = checked_cast<const DictionaryArray&>(array);
+    // TODO: These methods should probably return a const&? They seem capable.
+    auto dict_values = dict_array.dictionary();
+    auto dict_indices = dict_array.indices();
+
+    // Algorithm:
+    // 1) Use the Rank function to get an exactly-equivalent-order array
+    //    of the dictionary values, but with a datatype that's friendlier to
+    //    sorting (uint64).
+    // 2) Act as if we were sorting a dictionary array with the same indices,
+    //    but with the ranks as dictionary values.
+    // 2a) Dictionary-decode the ranks by calling Take.
+    // 2b) Sort the decoded ranks. Not only those are uint64, they are dense
+    //     in a [0, k) range where k is the number of unique dictionary values.
+    //     Therefore, unless the dictionary is very large, a fast counting sort
+    //     will be used.
+    //
+    // The bottom line is that performance will usually be much better
+    // (potentially an order of magnitude faster) than by naively decoding
+    // the original dictionary and sorting the decoded version.
+
+    std::shared_ptr<Array> decoded_ranks;
+    // Skip the rank/take steps for cases with only nulls
+    if (IsAllNulls(dict_array)) {
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            MakeNullUInt64Array(dict_array.length(), 
ctx->memory_pool()));
+    } else {
+      ARROW_ASSIGN_OR_RAISE(auto ranks, RanksWithNulls(dict_values, ctx));
+
+      ARROW_ASSIGN_OR_RAISE(decoded_ranks,
+                            Take(*ranks, *dict_indices, 
TakeOptions::Defaults(), ctx));
+    }
+
+    DCHECK_EQ(decoded_ranks->type_id(), Type::UINT64);
+    DCHECK_EQ(decoded_ranks->length(), dict_array.length());
+    ARROW_ASSIGN_OR_RAISE(auto rank_sorter, 
GetArraySorter(*decoded_ranks->type()));
+
+    return rank_sorter(indices_begin, indices_end, *decoded_ranks, offset, 
options, ctx);
+  }
+
+ private:
+  static bool IsAllNulls(const DictionaryArray& dict_array) {
+    auto indices = dict_array.indices();
+    auto values = dict_array.dictionary();
+    if (values->null_count() == values->length() ||
+        indices->null_count() == indices->length()) {
+      return true;
+    }
+    for (int64_t i = 0; i < dict_array.length(); ++i) {
+      // TODO: Special handling for dictionary types in Array::IsValid/Null?

Review Comment:
   Yeah, basically just an equivalent of the next few lines. In practice, it 
would need to be added to `Array` itself since `IsValid` isn't virtual.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to