zhuqi-lucas commented on code in PR #7860:
URL: https://github.com/apache/arrow-rs/pull/7860#discussion_r2249179444


##########
arrow-ord/src/sort.rs:
##########
@@ -295,12 +295,85 @@ fn sort_bytes<T: ByteArrayType>(
     options: SortOptions,
     limit: Option<usize>,
 ) -> UInt32Array {
-    let mut valids = value_indices
+    // Note: Why do we use 4‑byte prefix?
+    // Compute the 4‑byte prefix in BE order, or left‑pad if shorter.
+    // Most byte‐sequences differ in their first few bytes, so by
+    // comparing up to 4 bytes as a single u32 we avoid the overhead
+    // of a full lexicographical compare for the vast majority of cases.
+
+    // 1. Build a vector of (index, prefix, length) tuples
+    let mut valids: Vec<(u32, u32, u64)> = value_indices
         .into_iter()
-        .map(|index| (index, values.value(index as usize).as_ref()))
-        .collect::<Vec<(u32, &[u8])>>();
+        .map(|idx| {
+            let slice: &[u8] = values.value(idx as usize).as_ref();
+            let len = slice.len() as u64;
+            // Compute the 4‑byte prefix in BE order, or left‑pad if shorter
+            let prefix = if slice.len() >= 4 {
+                let raw = unsafe { std::ptr::read_unaligned(slice.as_ptr() as 
*const u32) };
+                u32::from_be(raw)
+            } else {
+                let mut v = 0u32;
+                for &b in slice {
+                    v = (v << 8) | (b as u32);
+                }
+                v << (8 * (4 - slice.len()))
+            };
+            (idx, prefix, len)
+        })
+        .collect();
 
-    sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into()
+    // 2. compute the number of non-null entries to partially sort
+    let vlimit = match (limit, options.nulls_first) {
+        (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()),
+        _ => valids.len(),
+    };
+
+    // 3. Comparator: compare prefix, then (when both slices shorter than 4) 
length, otherwise full slice
+    let cmp_bytes = |a: &(u32, u32, u64), b: &(u32, u32, u64)| {
+        let (ia, pa, la) = *a;
+        let (ib, pb, lb) = *b;
+        // 3.1 prefix (first 4 bytes)
+        let ord = pa.cmp(&pb);
+        if ord != Ordering::Equal {
+            return ord;
+        }
+        // 3.2 only if both slices had length < 4 (so prefix was padded)
+        // length compare only when prefix was padded (i.e. original length < 
4)
+        if la < 4 || lb < 4 {
+            let ord = la.cmp(&lb);
+            if ord != Ordering::Equal {
+                return ord;
+            }
+        }
+        // 3.3 full lexicographical compare
+        let a_bytes: &[u8] = values.value(ia as usize).as_ref();

Review Comment:
   Updated. Regarding point 2 (storing the start offset and length): that approach
   also causes a performance regression of more than 30%, the same as noted in the
   comments here:
   https://github.com/apache/arrow-rs/pull/7860#discussion_r2197727619
   
   So I did not address this. I suspect the regression is due to L1 cache misses:
   storing too much data per entry hurts cache locality during the sort.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to