Re: [PR] Improve StringArray(Utf8) sort performance (~2-4x faster) [arrow-rs]

via GitHub Thu, 10 Jul 2025 06:23:36 -0700


zhuqi-lucas commented on code in PR #7860:
URL: https://github.com/apache/arrow-rs/pull/7860#discussion_r2197727619



##########
arrow-ord/src/sort.rs:
##########
@@ -295,12 +295,85 @@ fn sort_bytes<T: ByteArrayType>(
     options: SortOptions,
     limit: Option<usize>,
 ) -> UInt32Array {
-    let mut valids = value_indices
+    // Note: Why do we use 4‑byte prefix?
+    // Compute the 4‑byte prefix in BE order, or left‑pad if shorter.
+    // Most byte‐sequences differ in their first few bytes, so by
+    // comparing up to 4 bytes as a single u32 we avoid the overhead
+    // of a full lexicographical compare for the vast majority of cases.
+
+    // 1. Build a vector of (index, prefix, length) tuples
+    let mut valids: Vec<(u32, u32, u64)> = value_indices
         .into_iter()
-        .map(|index| (index, values.value(index as usize).as_ref()))
-        .collect::<Vec<(u32, &[u8])>>();
+        .map(|idx| {
+            let slice: &[u8] = values.value(idx as usize).as_ref();
+            let len = slice.len() as u64;
+            // Compute the 4‑byte prefix in BE order, or left‑pad if shorter
+            let prefix = if slice.len() >= 4 {
+                let raw = unsafe { std::ptr::read_unaligned(slice.as_ptr() as 
*const u32) };
+                u32::from_be(raw)
+            } else {
+                let mut v = 0u32;
+                for &b in slice {
+                    v = (v << 8) | (b as u32);
+                }
+                v << (8 * (4 - slice.len()))
+            };
+            (idx, prefix, len)

Review Comment:
   Thank you @Dandandan for this idea, i tried now, but it show 30% performance 
decrease: 
   
   ```patch
   diff --git a/arrow-ord/src/sort.rs b/arrow-ord/src/sort.rs
   index 093c52d867..29800663a0 100644
   --- a/arrow-ord/src/sort.rs
   +++ b/arrow-ord/src/sort.rs
   @@ -301,14 +301,15 @@ fn sort_bytes<T: ByteArrayType>(
        // comparing up to 4 bytes as a single u32 we avoid the overhead
        // of a full lexicographical compare for the vast majority of cases.
    
   -    // 1. Build a vector of (index, prefix, length) tuples
   -    let mut valids: Vec<(u32, u32, u64)> = value_indices
   +    // 1. Build a vector of (idx, prefix, is_small, slice) tuples
   +    let mut valids: Vec<(u32, u32, bool, &[u8])> = value_indices
            .into_iter()
            .map(|idx| {
                let slice: &[u8] = values.value(idx as usize).as_ref();
   -            let len = slice.len() as u64;
   -            // Compute the 4‑byte prefix in BE order, or left‑pad if shorter
   -            let prefix = if slice.len() >= 4 {
   +            // store the bool for whether the slice is smaller than 4 bytes
   +            let is_small = slice.len() < 4;
   +            // prefix: if the slice is smaller than 4 bytes, left-pad it 
with zeros,
   +            let prefix = if !is_small {
                    let raw = unsafe { std::ptr::read_unaligned(slice.as_ptr() 
as *const u32) };
                    u32::from_be(raw)
                } else {
   @@ -318,7 +319,7 @@ fn sort_bytes<T: ByteArrayType>(
                    }
                    v << (8 * (4 - slice.len()))
                };
   -            (idx, prefix, len)
   +            (idx, prefix, is_small, slice)
            })
            .collect();
    
   @@ -328,27 +329,24 @@ fn sort_bytes<T: ByteArrayType>(
            _ => valids.len(),
        };
    
   -    // 3. Comparator: compare prefix, then (when both slices shorter than 
4) length, otherwise full slice
   -    let cmp_bytes = |a: &(u32, u32, u64), b: &(u32, u32, u64)| {
   -        let (ia, pa, la) = *a;
   -        let (ib, pb, lb) = *b;
   -        // 3.1 prefix (first 4 bytes)
   -        let ord = pa.cmp(&pb);
   -        if ord != Ordering::Equal {
   -            return ord;
   +    // 3. Comparator: compare prefix first, then for both “small” slices 
compare length, and finally full lexicographical compare
   +    let cmp_bytes = |a: &(u32, u32, bool, &[u8]), b: &(u32, u32, bool, 
&[u8])| {
   +        let (_ia, pa, sa_small, sa) = a;
   +        let (_ib, pb, sb_small, sb) = b;
   +        // 3.1 Compare the 4‑byte prefix
   +        match pa.cmp(&pb) {
   +            Ordering::Equal => (),
   +            non_eq => return non_eq,
            }
   -        // 3.2 only if both slices had length < 4 (so prefix was padded)
   -        // length compare only when prefix was padded (i.e. original length 
< 4)
   -        if la < 4 || lb < 4 {
   -            let ord = la.cmp(&lb);
   -            if ord != Ordering::Equal {
   -                return ord;
   +        // 3.2 If both slices were shorter than 4 bytes, compare their 
actual lengths
   +        if *sa_small && *sb_small {
   +            match sa.len().cmp(&sb.len()) {
   +                Ordering::Equal => (),
   +                non_eq => return non_eq,
                }
            }
   -        // 3.3 full lexicographical compare
   -        let a_bytes: &[u8] = values.value(ia as usize).as_ref();
   -        let b_bytes: &[u8] = values.value(ib as usize).as_ref();
   -        a_bytes.cmp(b_bytes)
   +        // 3.3 Otherwise, do a full byte‑wise lexicographical comparison
   +        sa.cmp(sb)
        };
    
        // 4. Partially sort according to ascending/descending
   @@ -366,9 +364,9 @@ fn sort_bytes<T: ByteArrayType>(
        if options.nulls_first {
            out.extend_from_slice(&nulls[..nulls.len().min(out_limit)]);
            let rem = out_limit - out.len();
   -        out.extend(valids.iter().map(|&(i, _, _)| i).take(rem));
   +        out.extend(valids.iter().map(|&(i, _, _, _)| i).take(rem));
        } else {
   -        out.extend(valids.iter().map(|&(i, _, _)| i).take(out_limit));
   +        out.extend(valids.iter().map(|&(i, _, _, _)| i).take(out_limit));
            let rem = out_limit - out.len();
            out.extend_from_slice(&nulls[..rem]);
        }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Re: [PR] Improve StringArray(Utf8) sort performance (~2-4x faster) [arrow-rs]

Reply via email to