alamb commented on a change in pull request #716: URL: https://github.com/apache/arrow-rs/pull/716#discussion_r697690544
########## File path: arrow/src/array/transform/utils.rs ########## @@ -35,15 +42,37 @@ pub(super) fn set_bits( offset_read: usize, len: usize, ) -> usize { - let mut count = 0; - (0..len).for_each(|i| { - if bit_util::get_bit(data, offset_read + i) { - bit_util::set_bit(write_data, offset_write + i); - } else { - count += 1; - } + let mut null_count = 0; + + let mut bits_to_align = offset_write % 8; + if bits_to_align > 0 { + bits_to_align = std::cmp::min(len, 8 - bits_to_align); + } + let mut byte_index = ceil(offset_write + bits_to_align, 8); Review comment: Maybe callig this `write_byte_index` would help make it clearer what it represented ########## File path: arrow/src/array/transform/utils.rs ########## @@ -35,15 +42,37 @@ pub(super) fn set_bits( offset_read: usize, len: usize, ) -> usize { - let mut count = 0; - (0..len).for_each(|i| { - if bit_util::get_bit(data, offset_read + i) { - bit_util::set_bit(write_data, offset_write + i); - } else { - count += 1; - } + let mut null_count = 0; + + let mut bits_to_align = offset_write % 8; + if bits_to_align > 0 { + bits_to_align = std::cmp::min(len, 8 - bits_to_align); + } + let mut byte_index = ceil(offset_write + bits_to_align, 8); + + // Set full bytes provided by bit chunk iterator + let chunks = BitChunks::new(data, offset_read + bits_to_align, len - bits_to_align); + chunks.iter().for_each(|chunk| { + null_count += chunk.count_zeros(); + chunk.to_le_bytes().iter().for_each(|b| { Review comment: 👍 bit chunk iterator seems to use little endian as well: https://github.com/mathiaspeters-sig/arrow-rs/blob/improve-set-bits/arrow/src/util/bit_chunk_iterator.rs#L138 ########## File path: arrow/src/array/transform/utils.rs ########## @@ -74,3 +103,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>( debug_assert!(prefix.is_empty() && suffix.is_empty()); *offsets.get_unchecked(offsets.len() - 1) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_set_bits_aligned() { + let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let source: &[u8] = &[ + 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, + 0b11100111, 0b11100111, Review comment: I think one danger of using the same values for all of the source data is that it may mask offset calculation errors What would you think of using a different pattern for each byte? Perhaps something like ```suggestion 0b10101010, 0b11001100, 0b11100011, 0b1111000, 0b11111000, 0b11111100, 0b11111110, 0b1111111, ``` Or some other pattern where having "off by 8 errors" is not as likely to be masked? ########## File path: arrow/src/array/transform/utils.rs ########## @@ -74,3 +103,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>( debug_assert!(prefix.is_empty() && suffix.is_empty()); *offsets.get_unchecked(offsets.len() - 1) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_set_bits_aligned() { + let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let source: &[u8] = &[ + 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, + 0b11100111, 0b11100111, + ]; + + let destination_offset = 8; + let source_offset = 0; + + let len = 64; + + let expected_data: &[u8] = &[ + 0, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, + 0b11100111, 0b11100111, 0, + ]; + let expected_null_count = 16; + let result = set_bits( + destination.as_mut_slice(), + source, + destination_offset, + source_offset, + len, + ); + + assert_eq!(destination, expected_data); + assert_eq!(result, expected_null_count); + } + + #[test] + fn test_set_bits_unaligned_destination_start() { + let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let source: &[u8] = &[ + 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, + 0b11100111, 0b11100111, + ]; + + let destination_offset = 3; + let source_offset = 0; + + let len = 64; + + let expected_data: &[u8] = &[ + 0b00111000, 0b00111111, 0b00111111, 0b00111111, 0b00111111, 0b00111111, + 0b00111111, 0b00111111, 0b00000111, 0b00000000, Review comment: i think the 3rd byte looks suspicious -- should it be? ```suggestion 0b00111111, 0b00111111, 0b00111100, 0b00000000, ``` ########## File path: arrow/src/array/transform/utils.rs ########## @@ -35,15 +42,37 @@ pub(super) fn set_bits( offset_read: usize, len: usize, ) -> usize { Review comment: It would be cool to update the doc strings here too to explain what the return value of `set_bits` is supposed to be (seems like it is supposed to be the total number of `0` bits in `data[offset_read..offset_read+len]` ########## File path: arrow/src/array/transform/utils.rs ########## @@ -35,15 +42,37 @@ pub(super) fn set_bits( offset_read: usize, len: usize, ) -> usize { - let mut count = 0; - (0..len).for_each(|i| { - if bit_util::get_bit(data, offset_read + i) { - bit_util::set_bit(write_data, offset_write + i); - } else { - count += 1; - } + let mut null_count = 0; + + let mut bits_to_align = offset_write % 8; + if bits_to_align > 0 { + bits_to_align = std::cmp::min(len, 8 - bits_to_align); + } + let mut byte_index = ceil(offset_write + bits_to_align, 8); + + // Set full bytes provided by bit chunk iterator Review comment: ```suggestion // Set full bytes provided by bit chunk iterator (which iterates in 64 bits at a time) ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org