Re: [PR] Improve StringArray(Utf8) sort performance (1.2x faster) [arrow-rs]

via GitHub Thu, 03 Jul 2025 09:36:25 -0700


alamb commented on code in PR #7860:
URL: https://github.com/apache/arrow-rs/pull/7860#discussion_r2183227193



##########
arrow/benches/sort_kernel.rs:
##########
@@ -113,6 +113,16 @@ fn add_benchmark(c: &mut Criterion) {
         b.iter(|| bench_sort_to_indices(&arr, None))
     });
 
+    let arr = create_string_array::<i32>(2usize.pow(12), 0.0);

Review Comment:
   if you add this benchmark as a separate PR it would be easier for me to 
automate running benchmarks



##########
arrow-ord/src/sort.rs:
##########
@@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>(
     options: SortOptions,
     limit: Option<usize>,
 ) -> UInt32Array {
-    let mut valids = value_indices
+    // 1. construct a list of (index, raw_bytes, length)
+    let mut valids: Vec<(u32, &[u8], usize)> = value_indices
         .into_iter()
-        .map(|index| (index, values.value(index as usize).as_ref()))
-        .collect::<Vec<(u32, &[u8])>>();
+        .map(|idx| {
+            let slice: &[u8] = values.value(idx as usize).as_ref();
+            (idx, slice, slice.len())
+        })
+        .collect();
 
-    sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
+    // 2. compute the number of non-null entries to partially sort
+    let vlimit = match (limit, options.nulls_first) {
+        (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()),
+        _ => valids.len(),
+    };
+
+    // 3. comparator function for mixed byte views

Review Comment:
   what does "byte view" mean in this context?



##########
arrow-ord/src/sort.rs:
##########
@@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>(
     options: SortOptions,
     limit: Option<usize>,
 ) -> UInt32Array {
-    let mut valids = value_indices
+    // 1. construct a list of (index, raw_bytes, length)
+    let mut valids: Vec<(u32, &[u8], usize)> = value_indices
         .into_iter()
-        .map(|index| (index, values.value(index as usize).as_ref()))
-        .collect::<Vec<(u32, &[u8])>>();
+        .map(|idx| {
+            let slice: &[u8] = values.value(idx as usize).as_ref();
+            (idx, slice, slice.len())
+        })
+        .collect();
 
-    sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
+    // 2. compute the number of non-null entries to partially sort
+    let vlimit = match (limit, options.nulls_first) {
+        (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()),
+        _ => valids.len(),
+    };
+
+    // 3. comparator function for mixed byte views
+    let cmp_bytes = |a: &(u32, &[u8], usize), b: &(u32, &[u8], usize)| {
+        let (_, a_bytes, a_len) = *a;
+        let (_, b_bytes, b_len) = *b;
+        let min_len = a_len.min(b_len);
+
+        // 3. compare the prefix of the first 4 bytes
+        let pref = min_len.min(4);
+        let mut pa = 0u32;
+        let mut pb = 0u32;
+        for i in 0..pref {
+            pa = (pa << 8) | (a_bytes[i] as u32);
+            pb = (pb << 8) | (b_bytes[i] as u32);
+        }
+        if pa != pb {
+            return pa.cmp(&pb);
+        }
+
+        // 3.2 Use 8 bytes to compare one by one if the prefix is equal
+        let mut i = pref;
+        while i + 8 <= min_len {
+            let raw_a = unsafe { 
std::ptr::read_unaligned(a_bytes.as_ptr().add(i) as *const u64) };

Review Comment:
   How do we know that `a_bytes` is aligned to an 8 byte boundary? Can't be be 
any arbitrary offset in the data buffer?



##########
arrow-ord/src/sort.rs:
##########
@@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>(
     options: SortOptions,
     limit: Option<usize>,
 ) -> UInt32Array {
-    let mut valids = value_indices
+    // 1. construct a list of (index, raw_bytes, length)
+    let mut valids: Vec<(u32, &[u8], usize)> = value_indices
         .into_iter()
-        .map(|index| (index, values.value(index as usize).as_ref()))
-        .collect::<Vec<(u32, &[u8])>>();
+        .map(|idx| {
+            let slice: &[u8] = values.value(idx as usize).as_ref();
+            (idx, slice, slice.len())
+        })
+        .collect();
 
-    sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
+    // 2. compute the number of non-null entries to partially sort
+    let vlimit = match (limit, options.nulls_first) {
+        (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()),
+        _ => valids.len(),
+    };
+
+    // 3. comparator function for mixed byte views
+    let cmp_bytes = |a: &(u32, &[u8], usize), b: &(u32, &[u8], usize)| {
+        let (_, a_bytes, a_len) = *a;
+        let (_, b_bytes, b_len) = *b;
+        let min_len = a_len.min(b_len);
+
+        // 3. compare the prefix of the first 4 bytes
+        let pref = min_len.min(4);
+        let mut pa = 0u32;
+        let mut pb = 0u32;
+        for i in 0..pref {
+            pa = (pa << 8) | (a_bytes[i] as u32);
+            pb = (pb << 8) | (b_bytes[i] as u32);
+        }
+        if pa != pb {
+            return pa.cmp(&pb);
+        }
+
+        // 3.2 Use 8 bytes to compare one by one if the prefix is equal

Review Comment:
   I am quite surprised the built in implementation of `cmp` doesn't already 
have this optimization -- this code is basically some sort of manual 
vectorization -- I woud have expected that `a_bytes` and `b_bytes` would 
already do it.
   
   



##########
arrow-ord/src/sort.rs:
##########
@@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>(
     options: SortOptions,
     limit: Option<usize>,
 ) -> UInt32Array {
-    let mut valids = value_indices
+    // 1. construct a list of (index, raw_bytes, length)
+    let mut valids: Vec<(u32, &[u8], usize)> = value_indices
         .into_iter()
-        .map(|index| (index, values.value(index as usize).as_ref()))
-        .collect::<Vec<(u32, &[u8])>>();
+        .map(|idx| {
+            let slice: &[u8] = values.value(idx as usize).as_ref();
+            (idx, slice, slice.len())
+        })
+        .collect();
 
-    sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
+    // 2. compute the number of non-null entries to partially sort
+    let vlimit = match (limit, options.nulls_first) {
+        (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()),
+        _ => valids.len(),
+    };
+
+    // 3. comparator function for mixed byte views
+    let cmp_bytes = |a: &(u32, &[u8], usize), b: &(u32, &[u8], usize)| {
+        let (_, a_bytes, a_len) = *a;
+        let (_, b_bytes, b_len) = *b;
+        let min_len = a_len.min(b_len);
+
+        // 3. compare the prefix of the first 4 bytes
+        let pref = min_len.min(4);

Review Comment:
   Could you add some comment about why checking the first 4 bytes specially is 
worthwhile? Is it an optimization for small strings?
   
   Giveen the loop below uses `read_unaligned` for 8 byte pointers, maybe we 
should do the same thing for the 4 byte initial read?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Improve StringArray(Utf8) sort performance (1.2x faster) [arrow-rs]

Reply via email to