alamb commented on code in PR #7860: URL: https://github.com/apache/arrow-rs/pull/7860#discussion_r2183227193
########## arrow/benches/sort_kernel.rs: ########## @@ -113,6 +113,16 @@ fn add_benchmark(c: &mut Criterion) { b.iter(|| bench_sort_to_indices(&arr, None)) }); + let arr = create_string_array::<i32>(2usize.pow(12), 0.0); Review Comment: if you add this benchmark as a separate PR it would be easier for me to automate running benchmarks ########## arrow-ord/src/sort.rs: ########## @@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>( options: SortOptions, limit: Option<usize>, ) -> UInt32Array { - let mut valids = value_indices + // 1. construct a list of (index, raw_bytes, length) + let mut valids: Vec<(u32, &[u8], usize)> = value_indices .into_iter() - .map(|index| (index, values.value(index as usize).as_ref())) - .collect::<Vec<(u32, &[u8])>>(); + .map(|idx| { + let slice: &[u8] = values.value(idx as usize).as_ref(); + (idx, slice, slice.len()) + }) + .collect(); - sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() + // 2. compute the number of non-null entries to partially sort + let vlimit = match (limit, options.nulls_first) { + (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), + _ => valids.len(), + }; + + // 3. comparator function for mixed byte views Review Comment: what does "byte view" mean in this context? ########## arrow-ord/src/sort.rs: ########## @@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>( options: SortOptions, limit: Option<usize>, ) -> UInt32Array { - let mut valids = value_indices + // 1. construct a list of (index, raw_bytes, length) + let mut valids: Vec<(u32, &[u8], usize)> = value_indices .into_iter() - .map(|index| (index, values.value(index as usize).as_ref())) - .collect::<Vec<(u32, &[u8])>>(); + .map(|idx| { + let slice: &[u8] = values.value(idx as usize).as_ref(); + (idx, slice, slice.len()) + }) + .collect(); - sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() + // 2. compute the number of non-null entries to partially sort + let vlimit = match (limit, options.nulls_first) { + (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), + _ => valids.len(), + }; + + // 3. comparator function for mixed byte views + let cmp_bytes = |a: &(u32, &[u8], usize), b: &(u32, &[u8], usize)| { + let (_, a_bytes, a_len) = *a; + let (_, b_bytes, b_len) = *b; + let min_len = a_len.min(b_len); + + // 3. compare the prefix of the first 4 bytes + let pref = min_len.min(4); + let mut pa = 0u32; + let mut pb = 0u32; + for i in 0..pref { + pa = (pa << 8) | (a_bytes[i] as u32); + pb = (pb << 8) | (b_bytes[i] as u32); + } + if pa != pb { + return pa.cmp(&pb); + } + + // 3.2 Use 8 bytes to compare one by one if the prefix is equal + let mut i = pref; + while i + 8 <= min_len { + let raw_a = unsafe { std::ptr::read_unaligned(a_bytes.as_ptr().add(i) as *const u64) }; Review Comment: How do we know that `a_bytes` is aligned to an 8 byte boundary? Can't be be any arbitrary offset in the data buffer? ########## arrow-ord/src/sort.rs: ########## @@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>( options: SortOptions, limit: Option<usize>, ) -> UInt32Array { - let mut valids = value_indices + // 1. construct a list of (index, raw_bytes, length) + let mut valids: Vec<(u32, &[u8], usize)> = value_indices .into_iter() - .map(|index| (index, values.value(index as usize).as_ref())) - .collect::<Vec<(u32, &[u8])>>(); + .map(|idx| { + let slice: &[u8] = values.value(idx as usize).as_ref(); + (idx, slice, slice.len()) + }) + .collect(); - sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() + // 2. compute the number of non-null entries to partially sort + let vlimit = match (limit, options.nulls_first) { + (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), + _ => valids.len(), + }; + + // 3. comparator function for mixed byte views + let cmp_bytes = |a: &(u32, &[u8], usize), b: &(u32, &[u8], usize)| { + let (_, a_bytes, a_len) = *a; + let (_, b_bytes, b_len) = *b; + let min_len = a_len.min(b_len); + + // 3. compare the prefix of the first 4 bytes + let pref = min_len.min(4); + let mut pa = 0u32; + let mut pb = 0u32; + for i in 0..pref { + pa = (pa << 8) | (a_bytes[i] as u32); + pb = (pb << 8) | (b_bytes[i] as u32); + } + if pa != pb { + return pa.cmp(&pb); + } + + // 3.2 Use 8 bytes to compare one by one if the prefix is equal Review Comment: I am quite surprised the built in implementation of `cmp` doesn't already have this optimization -- this code is basically some sort of manual vectorization -- I woud have expected that `a_bytes` and `b_bytes` would already do it. ########## arrow-ord/src/sort.rs: ########## @@ -295,12 +295,89 @@ fn sort_bytes<T: ByteArrayType>( options: SortOptions, limit: Option<usize>, ) -> UInt32Array { - let mut valids = value_indices + // 1. construct a list of (index, raw_bytes, length) + let mut valids: Vec<(u32, &[u8], usize)> = value_indices .into_iter() - .map(|index| (index, values.value(index as usize).as_ref())) - .collect::<Vec<(u32, &[u8])>>(); + .map(|idx| { + let slice: &[u8] = values.value(idx as usize).as_ref(); + (idx, slice, slice.len()) + }) + .collect(); - sort_impl(options, &mut valids, &nulls, limit, Ord::cmp).into() + // 2. compute the number of non-null entries to partially sort + let vlimit = match (limit, options.nulls_first) { + (Some(l), true) => l.saturating_sub(nulls.len()).min(valids.len()), + _ => valids.len(), + }; + + // 3. comparator function for mixed byte views + let cmp_bytes = |a: &(u32, &[u8], usize), b: &(u32, &[u8], usize)| { + let (_, a_bytes, a_len) = *a; + let (_, b_bytes, b_len) = *b; + let min_len = a_len.min(b_len); + + // 3. compare the prefix of the first 4 bytes + let pref = min_len.min(4); Review Comment: Could you add some comment about why checking the first 4 bytes specially is worthwhile? Is it an optimization for small strings? Giveen the loop below uses `read_unaligned` for 8 byte pointers, maybe we should do the same thing for the 4 byte initial read? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org