alamb commented on a change in pull request #716:
URL: https://github.com/apache/arrow-rs/pull/716#discussion_r697690544



##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
     offset_read: usize,
     len: usize,
 ) -> usize {
-    let mut count = 0;
-    (0..len).for_each(|i| {
-        if bit_util::get_bit(data, offset_read + i) {
-            bit_util::set_bit(write_data, offset_write + i);
-        } else {
-            count += 1;
-        }
+    let mut null_count = 0;
+
+    let mut bits_to_align = offset_write % 8;
+    if bits_to_align > 0 {
+        bits_to_align = std::cmp::min(len, 8 - bits_to_align);
+    }
+    let mut byte_index = ceil(offset_write + bits_to_align, 8);

Review comment:
       Maybe callig this `write_byte_index` would help make it clearer what it 
represented

##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
     offset_read: usize,
     len: usize,
 ) -> usize {
-    let mut count = 0;
-    (0..len).for_each(|i| {
-        if bit_util::get_bit(data, offset_read + i) {
-            bit_util::set_bit(write_data, offset_write + i);
-        } else {
-            count += 1;
-        }
+    let mut null_count = 0;
+
+    let mut bits_to_align = offset_write % 8;
+    if bits_to_align > 0 {
+        bits_to_align = std::cmp::min(len, 8 - bits_to_align);
+    }
+    let mut byte_index = ceil(offset_write + bits_to_align, 8);
+
+    // Set full bytes provided by bit chunk iterator
+    let chunks = BitChunks::new(data, offset_read + bits_to_align, len - 
bits_to_align);
+    chunks.iter().for_each(|chunk| {
+        null_count += chunk.count_zeros();
+        chunk.to_le_bytes().iter().for_each(|b| {

Review comment:
       👍  bit chunk iterator seems to use little endian as well: 
https://github.com/mathiaspeters-sig/arrow-rs/blob/improve-set-bits/arrow/src/util/bit_chunk_iterator.rs#L138

##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -74,3 +103,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>(
     debug_assert!(prefix.is_empty() && suffix.is_empty());
     *offsets.get_unchecked(offsets.len() - 1)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_set_bits_aligned() {
+        let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let source: &[u8] = &[
+            0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 
0b11100111,
+            0b11100111, 0b11100111,

Review comment:
       I think one danger of using the same values for all of the source data 
is that it may mask offset calculation errors
   
   What would you think of using a different pattern for each byte? Perhaps 
something like
   
   
   ```suggestion
               0b10101010, 0b11001100, 0b11100011, 0b1111000, 0b11111000, 
0b11111100,
               0b11111110, 0b1111111,
   ```
   
   Or some other pattern where having "off by 8 errors" is not as likely to be 
masked?

##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -74,3 +103,130 @@ pub(super) unsafe fn get_last_offset<T: OffsetSizeTrait>(
     debug_assert!(prefix.is_empty() && suffix.is_empty());
     *offsets.get_unchecked(offsets.len() - 1)
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_set_bits_aligned() {
+        let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let source: &[u8] = &[
+            0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 
0b11100111,
+            0b11100111, 0b11100111,
+        ];
+
+        let destination_offset = 8;
+        let source_offset = 0;
+
+        let len = 64;
+
+        let expected_data: &[u8] = &[
+            0, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 
0b11100111,
+            0b11100111, 0b11100111, 0,
+        ];
+        let expected_null_count = 16;
+        let result = set_bits(
+            destination.as_mut_slice(),
+            source,
+            destination_offset,
+            source_offset,
+            len,
+        );
+
+        assert_eq!(destination, expected_data);
+        assert_eq!(result, expected_null_count);
+    }
+
+    #[test]
+    fn test_set_bits_unaligned_destination_start() {
+        let mut destination: Vec<u8> = vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let source: &[u8] = &[
+            0b11100111, 0b11100111, 0b11100111, 0b11100111, 0b11100111, 
0b11100111,
+            0b11100111, 0b11100111,
+        ];
+
+        let destination_offset = 3;
+        let source_offset = 0;
+
+        let len = 64;
+
+        let expected_data: &[u8] = &[
+            0b00111000, 0b00111111, 0b00111111, 0b00111111, 0b00111111, 
0b00111111,
+            0b00111111, 0b00111111, 0b00000111, 0b00000000,

Review comment:
       i think the 3rd byte looks suspicious -- should it be?
   
   ```suggestion
               0b00111111, 0b00111111, 0b00111100, 0b00000000,
   ```

##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
     offset_read: usize,
     len: usize,
 ) -> usize {

Review comment:
       It would be cool to update the doc strings here too to explain what the 
return value of `set_bits` is supposed to be (seems like it is supposed to be 
the total number of `0` bits in  `data[offset_read..offset_read+len]`

##########
File path: arrow/src/array/transform/utils.rs
##########
@@ -35,15 +42,37 @@ pub(super) fn set_bits(
     offset_read: usize,
     len: usize,
 ) -> usize {
-    let mut count = 0;
-    (0..len).for_each(|i| {
-        if bit_util::get_bit(data, offset_read + i) {
-            bit_util::set_bit(write_data, offset_write + i);
-        } else {
-            count += 1;
-        }
+    let mut null_count = 0;
+
+    let mut bits_to_align = offset_write % 8;
+    if bits_to_align > 0 {
+        bits_to_align = std::cmp::min(len, 8 - bits_to_align);
+    }
+    let mut byte_index = ceil(offset_write + bits_to_align, 8);
+
+    // Set full bytes provided by bit chunk iterator

Review comment:
       ```suggestion
       // Set full bytes provided by bit chunk iterator (which iterates in 64 
bits at a time)
   ```




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to