Dandandan commented on code in PR #7860: URL: https://github.com/apache/arrow-rs/pull/7860#discussion_r2191576240
########## arrow-ord/src/sort.rs: ########## @@ -295,12 +295,85 @@ fn sort_bytes<T: ByteArrayType>( options: SortOptions, limit: Option<usize>, ) -> UInt32Array { - let mut valids = value_indices + // Note: Why do we use 4‑byte prefix? + // Compute the 4‑byte prefix in BE order, or left‑pad if shorter. + // Most byte‐sequences differ in their first few bytes, so by + // comparing up to 4 bytes as a single u32 we avoid the overhead + // of a full lexicographical compare for the vast majority of cases. + + // 1. Build a vector of (index, prefix, length) tuples + let mut valids: Vec<(u32, u32, u64)> = value_indices .into_iter() - .map(|index| (index, values.value(index as usize).as_ref())) - .collect::<Vec<(u32, &[u8])>>(); + .map(|idx| { + let slice: &[u8] = values.value(idx as usize).as_ref(); + let len = slice.len() as u64; + // Compute the 4‑byte prefix in BE order, or left‑pad if shorter + let prefix = if slice.len() >= 4 { + let raw = unsafe { std::ptr::read_unaligned(slice.as_ptr() as *const u32) }; + u32::from_be(raw) + } else { + let mut v = 0u32; + for &b in slice { + v = (v << 8) | (b as u32); + } + v << (8 * (4 - slice.len())) + }; + (idx, prefix, len) Review Comment: Alternatively or additionaly we could store the `&[u8]` instead of the index so it doesn't have to retrieve it via `values.value` again in the sort. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org