This is an automated email from the ASF dual-hosted git repository.

kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 2ba9016  ARROW-2483: [Rust] use bit-packing for boolean vectors
2ba9016 is described below

commit 2ba90167ed8c6bfef4ba0ec1e30ae7d8ac8995a1
Author: Chao Sun <[email protected]>
AuthorDate: Wed Oct 17 12:40:42 2018 +0200

    ARROW-2483: [Rust] use bit-packing for boolean vectors
    
    Author: Chao Sun <[email protected]>
    
    Closes #2746 from sunchao/ARROW-2483 and squashes the following commits:
    
    0845dd90 <Chao Sun> Fix bug
    4083311c <Chao Sun> Address comments
    c8f3a9ff <Chao Sun> Address comments
    1a3bfad8 <Chao Sun> Fix lint
    0b078cc6 <Chao Sun> ARROW-2483:  use bit-packing for boolean vectors
---
 rust/src/array.rs         | 292 ++++++++++++++++++++++++++++++++++++++--------
 rust/src/buffer.rs        |  23 +++-
 rust/src/util/bit_util.rs |  63 +++++++++-
 3 files changed, 325 insertions(+), 53 deletions(-)

diff --git a/rust/src/array.rs b/rust/src/array.rs
index 5eb90f1..2da7104 100644
--- a/rust/src/array.rs
+++ b/rust/src/array.rs
@@ -18,8 +18,8 @@
 ///! Array types
 use std::any::Any;
 use std::convert::From;
+use std::io::Write;
 use std::mem;
-use std::ptr;
 use std::sync::Arc;
 
 use array_data::*;
@@ -208,30 +208,28 @@ macro_rules! def_primitive_array {
                 const TY_SIZE: usize = mem::size_of::<$native_ty>();
                 const NULL: [u8; TY_SIZE] = [0; TY_SIZE];
 
-                let data_len = data.len() as i64;
-                let size = bit_util::round_upto_multiple_of_64(data_len) as 
usize;
-                let mut null_buffer = Vec::with_capacity(size);
-                unsafe {
-                    ptr::write_bytes(null_buffer.as_mut_ptr(), 0, size);
-                    null_buffer.set_len(size);
-                }
-                let mut value_buffer: Vec<u8> = Vec::with_capacity(size * 
TY_SIZE);
-
-                let mut i = 0;
-                for n in data {
-                    if let Some(v) = n {
-                        bit_util::set_bit(&mut null_buffer[..], i);
-                        value_buffer.extend_from_slice(&v.to_byte_slice());
-                    } else {
-                        value_buffer.extend_from_slice(&NULL);
+                let data_len = data.len();
+                let mut null_buf = 
MutableBuffer::new(data_len).with_bitset(data_len, false);
+                let mut val_buf = MutableBuffer::new(data_len * TY_SIZE);
+
+                {
+                    let null_slice = null_buf.data_mut();
+                    for (i, v) in data.iter().enumerate() {
+                        if let Some(n) = v {
+                            bit_util::set_bit(null_slice, i as i64);
+                            // unwrap() in the following should be safe here 
since we've
+                            // made sure enough space is allocated for the 
values.
+                            val_buf.write(&n.to_byte_slice()).unwrap();
+                        } else {
+                            val_buf.write(&NULL).unwrap();
+                        }
                     }
-                    i += 1;
                 }
 
                 let array_data = ArrayData::builder($data_ty)
-                    .len(data_len)
-                    .add_buffer(Buffer::from(Buffer::from(value_buffer)))
-                    .null_bit_buffer(Buffer::from(null_buffer))
+                    .len(data_len as i64)
+                    .add_buffer(val_buf.freeze())
+                    .null_bit_buffer(null_buf.freeze())
                     .build();
                 PrimitiveArray::from(array_data)
             }
@@ -273,7 +271,6 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
     }
 }
 
-def_primitive_array!(DataType::Boolean, bool);
 def_primitive_array!(DataType::UInt8, u8);
 def_primitive_array!(DataType::UInt16, u16);
 def_primitive_array!(DataType::UInt32, u32);
@@ -285,6 +282,123 @@ def_primitive_array!(DataType::Int64, i64);
 def_primitive_array!(DataType::Float32, f32);
 def_primitive_array!(DataType::Float64, f64);
 
+/// Array whose elements are of boolean types.
+pub struct BooleanArray {
+    data: ArrayDataRef,
+    raw_values: RawPtrBox<u8>,
+}
+
+impl BooleanArray {
+    pub fn new(length: i64, values: Buffer, null_count: i64, offset: i64) -> 
Self {
+        let array_data = ArrayData::builder(DataType::Boolean)
+            .len(length)
+            .add_buffer(values)
+            .null_count(null_count)
+            .offset(offset)
+            .build();
+        BooleanArray::from(array_data)
+    }
+
+    /// Returns a `Buffer` holds all the values of this array.
+    ///
+    /// Note this doesn't take account into the offset of this array.
+    pub fn values(&self) -> Buffer {
+        self.data.buffers()[0].clone()
+    }
+
+    /// Returns the boolean value at index `i`.
+    pub fn value(&self, i: i64) -> bool {
+        let offset = i + self.offset();
+        assert!(offset < self.data.len() as i64);
+        unsafe { bit_util::get_bit_raw(self.raw_values.get(), offset) }
+    }
+}
+
+/// Constructs a boolean array from a vector. Should only be used for testing.
+impl From<Vec<bool>> for BooleanArray {
+    fn from(data: Vec<bool>) -> Self {
+        let num_byte = bit_util::ceil(data.len() as i64, 8) as usize;
+        let mut mut_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, 
false);
+        {
+            let mut_slice = mut_buf.data_mut();
+            for (i, b) in data.iter().enumerate() {
+                if *b {
+                    bit_util::set_bit(mut_slice, i as i64);
+                }
+            }
+        }
+        let array_data = ArrayData::builder(DataType::Boolean)
+            .len(data.len() as i64)
+            .add_buffer(mut_buf.freeze())
+            .build();
+        BooleanArray::from(array_data)
+    }
+}
+
+impl From<Vec<Option<bool>>> for BooleanArray {
+    fn from(data: Vec<Option<bool>>) -> Self {
+        let data_len = data.len() as i64;
+        let num_byte = bit_util::ceil(data_len, 8) as usize;
+        let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, 
false);
+        let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte, 
false);
+
+        {
+            let null_slice = null_buf.data_mut();
+            let val_slice = val_buf.data_mut();
+
+            for (i, v) in data.iter().enumerate() {
+                if let Some(b) = v {
+                    bit_util::set_bit(null_slice, i as i64);
+                    if *b {
+                        bit_util::set_bit(val_slice, i as i64);
+                    }
+                }
+            }
+        }
+
+        let array_data = ArrayData::builder(DataType::Boolean)
+            .len(data_len)
+            .add_buffer(val_buf.freeze())
+            .null_bit_buffer(null_buf.freeze())
+            .build();
+        BooleanArray::from(array_data)
+    }
+}
+
+/// Constructs a `BooleanArray` from an array data reference.
+impl From<ArrayDataRef> for BooleanArray {
+    fn from(data: ArrayDataRef) -> Self {
+        assert_eq!(
+            data.buffers().len(),
+            1,
+            "BooleanArray data should contain a single buffer only (values 
buffer)"
+        );
+        let raw_values = data.buffers()[0].raw_data();
+        assert!(
+            memory::is_aligned::<u8>(raw_values, mem::align_of::<u8>()),
+            "memory is not aligned"
+        );
+        Self {
+            data,
+            raw_values: RawPtrBox::new(raw_values),
+        }
+    }
+}
+
+impl Array for BooleanArray {
+    fn as_any(&self) -> &Any {
+        self
+    }
+
+    fn data(&self) -> ArrayDataRef {
+        self.data.clone()
+    }
+
+    fn data_ref(&self) -> &ArrayDataRef {
+        &self.data
+    }
+}
+
 /// A list array where each element is a variable-sized sequence of values 
with the same
 /// type.
 pub struct ListArray {
@@ -554,7 +668,7 @@ impl From<Vec<(Field, ArrayRef)>> for StructArray {
 mod tests {
     use std::thread;
 
-    use super::{Array, BinaryArray, ListArray, PrimitiveArray, StructArray};
+    use super::*;
     use array_data::ArrayData;
     use buffer::Buffer;
     use datatypes::{DataType, Field, ToByteSlice};
@@ -565,35 +679,35 @@ mod tests {
     fn test_primitive_array_from_vec() {
         let buf = Buffer::from(&[0, 1, 2, 3, 4].to_byte_slice());
         let buf2 = buf.clone();
-        let pa = PrimitiveArray::<i32>::new(5, buf, 0, 0);
-        let slice = unsafe { ::std::slice::from_raw_parts(pa.raw_values(), 5) 
};
-        assert_eq!(buf2, pa.values());
+        let arr = PrimitiveArray::<i32>::new(5, buf, 0, 0);
+        let slice = unsafe { ::std::slice::from_raw_parts(arr.raw_values(), 5) 
};
+        assert_eq!(buf2, arr.values());
         assert_eq!(&[0, 1, 2, 3, 4], slice);
-        assert_eq!(5, pa.len());
-        assert_eq!(0, pa.offset());
-        assert_eq!(0, pa.null_count());
+        assert_eq!(5, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(0, arr.null_count());
         for i in 0..5 {
-            assert!(!pa.is_null(i));
-            assert!(pa.is_valid(i));
-            assert_eq!(i as i32, pa.value(i));
+            assert!(!arr.is_null(i));
+            assert!(arr.is_valid(i));
+            assert_eq!(i as i32, arr.value(i));
         }
     }
 
     #[test]
     fn test_primitive_array_from_vec_option() {
         // Test building a primitive array with null values
-        let pa = PrimitiveArray::<i32>::from(vec![Some(0), None, Some(2), 
None, Some(4)]);
-        assert_eq!(5, pa.len());
-        assert_eq!(0, pa.offset());
-        assert_eq!(2, pa.null_count());
+        let arr = PrimitiveArray::<i32>::from(vec![Some(0), None, Some(2), 
None, Some(4)]);
+        assert_eq!(5, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(2, arr.null_count());
         for i in 0..5 {
             if i % 2 == 0 {
-                assert!(!pa.is_null(i));
-                assert!(pa.is_valid(i));
-                assert_eq!(i as i32, pa.value(i));
+                assert!(!arr.is_null(i));
+                assert!(arr.is_valid(i));
+                assert_eq!(i as i32, arr.value(i));
             } else {
-                assert!(pa.is_null(i));
-                assert!(!pa.is_valid(i));
+                assert!(arr.is_null(i));
+                assert!(!arr.is_valid(i));
             }
         }
     }
@@ -608,12 +722,12 @@ mod tests {
             .offset(2)
             .add_buffer(buf)
             .build();
-        let pa = PrimitiveArray::<i32>::from(data);
-        assert_eq!(buf2, pa.values());
-        assert_eq!(5, pa.len());
-        assert_eq!(0, pa.null_count());
+        let arr = PrimitiveArray::<i32>::from(data);
+        assert_eq!(buf2, arr.values());
+        assert_eq!(5, arr.len());
+        assert_eq!(0, arr.null_count());
         for i in 0..3 {
-            assert_eq!((i + 2) as i32, pa.value(i));
+            assert_eq!((i + 2) as i32, arr.value(i));
         }
     }
 
@@ -627,6 +741,88 @@ mod tests {
     }
 
     #[test]
+    fn test_boolean_array_new() {
+        // 00000010 01001000
+        let buf = Buffer::from([72_u8, 2_u8]);
+        let buf2 = buf.clone();
+        let arr = BooleanArray::new(10, buf, 0, 0);
+        assert_eq!(buf2, arr.values());
+        assert_eq!(10, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(0, arr.null_count());
+        for i in 0..10 {
+            assert!(!arr.is_null(i));
+            assert!(arr.is_valid(i));
+            assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at 
{}", i)
+        }
+    }
+
+    #[test]
+    fn test_boolean_array_from_vec() {
+        let buf = Buffer::from([10_u8]);
+        let arr = BooleanArray::from(vec![false, true, false, true]);
+        assert_eq!(buf, arr.values());
+        assert_eq!(4, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(0, arr.null_count());
+        for i in 0..4 {
+            assert!(!arr.is_null(i));
+            assert!(arr.is_valid(i));
+            assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i)
+        }
+    }
+
+    #[test]
+    fn test_boolean_array_from_vec_option() {
+        let buf = Buffer::from([10_u8]);
+        let arr = BooleanArray::from(vec![Some(false), Some(true), None, 
Some(true)]);
+        assert_eq!(buf, arr.values());
+        assert_eq!(4, arr.len());
+        assert_eq!(0, arr.offset());
+        assert_eq!(1, arr.null_count());
+        for i in 0..4 {
+            if i == 2 {
+                assert!(arr.is_null(i));
+                assert!(!arr.is_valid(i));
+            } else {
+                assert!(!arr.is_null(i));
+                assert!(arr.is_valid(i));
+                assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i)
+            }
+        }
+    }
+
+    #[test]
+    fn test_boolean_array_builder() {
+        // Test building a boolean array with ArrayData builder and offset
+        // 000011011
+        let buf = Buffer::from([27_u8]);
+        let buf2 = buf.clone();
+        let data = ArrayData::builder(DataType::Boolean)
+            .len(5)
+            .offset(2)
+            .add_buffer(buf)
+            .build();
+        let arr = BooleanArray::from(data);
+        assert_eq!(buf2, arr.values());
+        assert_eq!(5, arr.len());
+        assert_eq!(2, arr.offset());
+        assert_eq!(0, arr.null_count());
+        for i in 0..3 {
+            assert_eq!(i != 0, arr.value(i), "failed at {}", i);
+        }
+    }
+
+    #[test]
+    #[should_panic(
+        expected = "BooleanArray data should contain a single buffer only 
(values buffer)"
+    )]
+    fn test_boolean_array_invalid_buffer_len() {
+        let data = ArrayData::builder(DataType::Boolean).len(5).build();
+        BooleanArray::from(data);
+    }
+
+    #[test]
     fn test_list_array() {
         // Construct a value array
         let value_data = ArrayData::builder(DataType::Int32)
@@ -832,7 +1028,7 @@ mod tests {
     fn test_struct_array_from() {
         let boolean_data = ArrayData::builder(DataType::Boolean)
             .len(4)
-            .add_buffer(Buffer::from([false, false, true, 
true].to_byte_slice()))
+            .add_buffer(Buffer::from([12_u8]))
             .build();
         let int_data = ArrayData::builder(DataType::Int32)
             .len(4)
@@ -841,7 +1037,7 @@ mod tests {
         let struct_array = StructArray::from(vec![
             (
                 Field::new("b", DataType::Boolean, false),
-                Arc::new(PrimitiveArray::from(vec![false, false, true, true])) 
as Arc<Array>,
+                Arc::new(BooleanArray::from(vec![false, false, true, true])) 
as Arc<Array>,
             ),
             (
                 Field::new("c", DataType::Int32, false),
diff --git a/rust/src/buffer.rs b/rust/src/buffer.rs
index 827d2d6..85133d9 100644
--- a/rust/src/buffer.rs
+++ b/rust/src/buffer.rs
@@ -40,7 +40,7 @@ struct BufferData {
     /// The raw pointer into the buffer bytes
     ptr: *const u8,
 
-    /// The length of the buffer
+    /// The length (num of bytes) of the buffer
     len: usize,
 }
 
@@ -160,6 +160,22 @@ impl MutableBuffer {
         }
     }
 
+    /// Set the bits in the range of `[0, end)` to 0 (if `val` is false), or 1 
(if `val`
+    /// is true). Also extend the length of this buffer to be `end`.
+    ///
+    /// This is useful when one wants to clear (or set) the bits and then 
manipulate
+    /// the buffer directly (e.g., modifying the buffer by holding a mutable 
reference
+    /// from `data_mut()`).
+    pub fn with_bitset(mut self, end: usize, val: bool) -> Self {
+        assert!(end < self.capacity);
+        let v = if val { 1 } else { 0 };
+        unsafe {
+            ::std::ptr::write_bytes(self.data, v, end);
+            self.len = end;
+        }
+        self
+    }
+
     /// Adjust the capacity of this buffer to be at least `new_capacity`.
     ///
     /// If the `new_capacity` is less than the current capacity, nothing is 
done and `Ok`
@@ -203,6 +219,11 @@ impl MutableBuffer {
         unsafe { ::std::slice::from_raw_parts(self.raw_data(), self.len()) }
     }
 
+    /// Returns the data stored in this buffer as a mutable slice.
+    pub fn data_mut(&mut self) -> &mut [u8] {
+        unsafe { ::std::slice::from_raw_parts_mut(self.raw_data() as *mut u8, 
self.len()) }
+    }
+
     /// Returns a raw pointer for this buffer.
     ///
     /// Note that this should be used cautiously, and the returned pointer 
should not be
diff --git a/rust/src/util/bit_util.rs b/rust/src/util/bit_util.rs
index 8d9a805..5e4c35c 100644
--- a/rust/src/util/bit_util.rs
+++ b/rust/src/util/bit_util.rs
@@ -44,13 +44,22 @@ fn round_upto_power_of_2(num: i64, factor: i64) -> i64 {
 /// Returns whether bit at position `i` in `data` is set or not
 #[inline]
 pub fn get_bit(data: &[u8], i: i64) -> bool {
-    (data[(i / 8) as usize] & BIT_MASK[(i % 8) as usize]) != 0
+    (data[(i >> 3) as usize] & BIT_MASK[(i & 7) as usize]) != 0
+}
+
+/// Returns whether bit at position `i` in `data` is set or not.
+///
+/// Note this doesn't do any bound checking, for performance reason. The 
caller is
+/// responsible to guarantee that `i` is within bounds.
+#[inline]
+pub unsafe fn get_bit_raw(data: *const u8, i: i64) -> bool {
+    (*data.offset((i >> 3) as isize) & BIT_MASK[(i & 7) as usize]) != 0
 }
 
 /// Sets bit at position `i` for `data`
 #[inline]
 pub fn set_bit(data: &mut [u8], i: i64) {
-    data[(i / 8) as usize] |= BIT_MASK[(i % 8) as usize]
+    data[(i >> 3) as usize] |= BIT_MASK[(i & 7) as usize]
 }
 
 /// Returns the number of 1-bits in `data`
@@ -68,8 +77,8 @@ pub fn count_set_bits(data: &[u8]) -> i64 {
 pub fn count_set_bits_offset(data: &[u8], offset: i64) -> i64 {
     debug_assert!(offset <= (data.len() * 8) as i64);
 
-    let start_byte_pos = (offset / 8) as usize;
-    let start_bit_pos = offset % 8;
+    let start_byte_pos = (offset >> 3) as usize;
+    let start_bit_pos = offset & 7;
 
     if start_bit_pos == 0 {
         count_set_bits(&data[start_byte_pos..])
@@ -85,6 +94,16 @@ pub fn count_set_bits_offset(data: &[u8], offset: i64) -> 
i64 {
     }
 }
 
+/// Returns the ceil of `value`/`divisor`
+#[inline]
+pub fn ceil(value: i64, divisor: i64) -> i64 {
+    let mut result = value / divisor;
+    if value % divisor != 0 {
+        result += 1
+    };
+    result
+}
+
 #[cfg(test)]
 mod tests {
     use rand::{thread_rng, Rng};
@@ -130,6 +149,28 @@ mod tests {
     }
 
     #[test]
+    fn test_get_bit_raw() {
+        const NUM_BYTE: usize = 10;
+        let mut buf = vec![0; NUM_BYTE];
+        let mut expected = vec![];
+        let mut rng = thread_rng();
+        for i in 0..8 * NUM_BYTE {
+            let b = rng.gen_bool(0.5);
+            expected.push(b);
+            if b {
+                set_bit(&mut buf[..], i as i64)
+            }
+        }
+
+        let raw_ptr = buf.as_ptr();
+        for (i, b) in expected.iter().enumerate() {
+            unsafe {
+                assert_eq!(*b, get_bit_raw(raw_ptr, i as i64));
+            }
+        }
+    }
+
+    #[test]
     fn test_set_bit() {
         let mut b = [0b00000000];
         set_bit(&mut b, 0);
@@ -178,4 +219,18 @@ mod tests {
         assert_eq!(0, count_set_bits_offset(&[0b11111111, 0b11111111], 16));
     }
 
+    #[test]
+    fn test_ceil() {
+        assert_eq!(ceil(0, 1), 0);
+        assert_eq!(ceil(1, 1), 1);
+        assert_eq!(ceil(1, 2), 1);
+        assert_eq!(ceil(1, 8), 1);
+        assert_eq!(ceil(7, 8), 1);
+        assert_eq!(ceil(8, 8), 1);
+        assert_eq!(ceil(9, 8), 2);
+        assert_eq!(ceil(9, 9), 1);
+        assert_eq!(ceil(10000000000, 10), 1000000000);
+        assert_eq!(ceil(10, 10000000000), 1);
+        assert_eq!(ceil(10000000000, 1000000000), 10);
+    }
 }

Reply via email to