This is an automated email from the ASF dual-hosted git repository.

apitrou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 1567be0d74 GH-26648: [C++] Optimize union equality comparison (#45384)
1567be0d74 is described below

commit 1567be0d74c5ee90c3d989f6fcefb32c402a736c
Author: Shawn <[email protected]>
AuthorDate: Tue Feb 4 11:44:38 2025 -0600

    GH-26648: [C++] Optimize union equality comparison (#45384)
    
    
    
    ### Rationale for this change
    
    #26648 proposes an optimization in checking sparse array equality by 
detecting contiguous runs, this PR implements that change
    
    ### What changes are included in this PR?
    
    previously, sparse array comparison was checked one by one, in this change, 
contiguous runs are detected and compared by checking equality of current and 
previous child_ids
    
    ### Are these changes tested?
    
    already covered by existing unit tests
    
    ### Are there any user-facing changes?
    
    no
    
    * GitHub Issue: #26648
    
    Lead-authored-by: shawn <[email protected]>
    Co-authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Antoine Pitrou <[email protected]>
---
 cpp/src/arrow/array/array_union_test.cc | 30 ++++++++++++++++++++++
 cpp/src/arrow/compare.cc                | 44 +++++++++++++++++++++++++++------
 2 files changed, 66 insertions(+), 8 deletions(-)

diff --git a/cpp/src/arrow/array/array_union_test.cc 
b/cpp/src/arrow/array/array_union_test.cc
index 545425c264..77ba247779 100644
--- a/cpp/src/arrow/array/array_union_test.cc
+++ b/cpp/src/arrow/array/array_union_test.cc
@@ -166,6 +166,36 @@ TEST(TestSparseUnionArray, Validate) {
   ASSERT_RAISES(Invalid, arr->ValidateFull());
 }
 
+TEST(TestSparseUnionArray, Comparison) {
+  auto ints1 = ArrayFromJSON(int32(), "[1, 2, 3, 4, 5, 6]");
+  auto ints2 = ArrayFromJSON(int32(), "[1, 2, -3, 4, -5, 6]");
+  auto strs1 = ArrayFromJSON(utf8(), R"(["a", "b", "c", "d", "e", "f"])");
+  auto strs2 = ArrayFromJSON(utf8(), R"(["a", "*", "c", "d", "e", "*"])");
+  std::vector<int8_t> type_codes{8, 42};
+
+  auto check_equality = [&](const std::string& type_ids_json1,
+                            const std::string& type_ids_json2, bool 
expected_equals) {
+    auto type_ids1 = ArrayFromJSON(int8(), type_ids_json1);
+    auto type_ids2 = ArrayFromJSON(int8(), type_ids_json2);
+    ASSERT_OK_AND_ASSIGN(auto arr1,
+                         SparseUnionArray::Make(*type_ids1, {ints1, strs1}, 
type_codes));
+    ASSERT_OK_AND_ASSIGN(auto arr2,
+                         SparseUnionArray::Make(*type_ids2, {ints2, strs2}, 
type_codes));
+    ASSERT_EQ(arr1->Equals(arr2), expected_equals);
+    ASSERT_EQ(arr2->Equals(arr1), expected_equals);
+  };
+
+  // Same type ids
+  check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", true);
+  check_equality("[8, 8, 42, 42, 42, 42]", "[8, 8, 42, 42, 42, 42]", false);
+  check_equality("[8, 8, 8, 42, 42, 8]", "[8, 8, 8, 42, 42, 8]", false);
+  check_equality("[8, 42, 42, 42, 42, 8]", "[8, 42, 42, 42, 42, 8]", false);
+
+  // Different type ids
+  check_equality("[42, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 8]", false);
+  check_equality("[8, 8, 42, 42, 42, 8]", "[8, 8, 42, 42, 42, 42]", false);
+}
+
 // -------------------------------------------------------------------------
 // Tests for MakeDense and MakeSparse
 
diff --git a/cpp/src/arrow/compare.cc b/cpp/src/arrow/compare.cc
index 23a921cc5a..e0e6d18339 100644
--- a/cpp/src/arrow/compare.cc
+++ b/cpp/src/arrow/compare.cc
@@ -381,21 +381,49 @@ class RangeDataEqualsImpl {
     const int8_t* right_codes = right_.GetValues<int8_t>(1);
 
     // Unions don't have a null bitmap
+    int64_t run_start = 0;  // Start index of the current run
+
     for (int64_t i = 0; i < range_length_; ++i) {
-      const auto type_id = left_codes[left_start_idx_ + i];
-      if (type_id != right_codes[right_start_idx_ + i]) {
+      const auto current_type_id = left_codes[left_start_idx_ + i];
+
+      if (current_type_id != right_codes[right_start_idx_ + i]) {
         result_ = false;
         break;
       }
-      const auto child_num = child_ids[type_id];
-      // XXX can we instead detect runs of same-child union values?
+      // Check if the current element breaks the run
+      if (i > 0 && current_type_id != left_codes[left_start_idx_ + i - 1]) {
+        // Compare the previous run
+        const auto previous_child_num = child_ids[left_codes[left_start_idx_ + 
i - 1]];
+        int64_t run_length = i - run_start;
+
+        RangeDataEqualsImpl impl(
+            options_, floating_approximate_, 
*left_.child_data[previous_child_num],
+            *right_.child_data[previous_child_num],
+            left_start_idx_ + left_.offset + run_start,
+            right_start_idx_ + right_.offset + run_start, run_length);
+
+        if (!impl.Compare()) {
+          result_ = false;
+          break;
+        }
+
+        // Start a new run
+        run_start = i;
+      }
+    }
+
+    // Handle the final run
+    if (result_) {
+      const auto final_child_num = child_ids[left_codes[left_start_idx_ + 
run_start]];
+      int64_t final_run_length = range_length_ - run_start;
+
       RangeDataEqualsImpl impl(
-          options_, floating_approximate_, *left_.child_data[child_num],
-          *right_.child_data[child_num], left_start_idx_ + left_.offset + i,
-          right_start_idx_ + right_.offset + i, 1);
+          options_, floating_approximate_, *left_.child_data[final_child_num],
+          *right_.child_data[final_child_num], left_start_idx_ + left_.offset 
+ run_start,
+          right_start_idx_ + right_.offset + run_start, final_run_length);
+
       if (!impl.Compare()) {
         result_ = false;
-        break;
       }
     }
     return Status::OK();

Reply via email to