Dandandan commented on code in PR #7748: URL: https://github.com/apache/arrow-rs/pull/7748#discussion_r2166493183
########## arrow-array/src/array/byte_view_array.rs: ########## @@ -537,17 +538,37 @@ impl<T: ByteViewType + ?Sized> GenericByteViewArray<T> { left_idx: usize, right: &GenericByteViewArray<T>, right_idx: usize, - ) -> std::cmp::Ordering { + ) -> Ordering { let l_view = left.views().get_unchecked(left_idx); let l_len = *l_view as u32; let r_view = right.views().get_unchecked(right_idx); let r_len = *r_view as u32; if l_len <= 12 && r_len <= 12 { - let l_data = unsafe { GenericByteViewArray::<T>::inline_value(l_view, l_len as usize) }; - let r_data = unsafe { GenericByteViewArray::<T>::inline_value(r_view, r_len as usize) }; - return l_data.cmp(r_data); + // Remove the length bits, leaving only the data + let l_data = *l_view >> 32; + let r_data = *r_view >> 32; + + // The data is stored in little-endian order. To compare lexicographically, + // convert to big-endian: + let l_be = l_data.swap_bytes(); + let r_be = r_data.swap_bytes(); + + // Compare only the first min_len bytes + let min_len = l_len.min(r_len); + // We have all 12 bytes in the high bits, but only want the top min_len + let shift = (12 - min_len) * 8; + let l_partial = l_be >> shift; + let r_partial = r_be >> shift; + if l_partial < r_partial { + return Ordering::Less; + } else if l_partial > r_partial { + return Ordering::Greater; + } + + // If the prefixes are equal, the shorter one is considered smaller + return l_len.cmp(&r_len); } // one of the string is larger than 12 bytes, Review Comment: You can also change the code below to use `(l_view >> 32) as u32` (or `ByteView`, if it generates the same code). That seems to be a bit faster for the prefix comparison: ``` lt scalar StringViewArray time: [34.533 ms 34.567 ms 34.601 ms] change: [−11.030% −10.827% −10.620%] (p = 0.00 < 0.05) Performance has improved. Found 4 outliers among 100 measurements (4.00%) ``` -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org