This is an automated email from the ASF dual-hosted git repository.
kszucs pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 2ba9016 ARROW-2483: [Rust] use bit-packing for boolean vectors
2ba9016 is described below
commit 2ba90167ed8c6bfef4ba0ec1e30ae7d8ac8995a1
Author: Chao Sun <[email protected]>
AuthorDate: Wed Oct 17 12:40:42 2018 +0200
ARROW-2483: [Rust] use bit-packing for boolean vectors
Author: Chao Sun <[email protected]>
Closes #2746 from sunchao/ARROW-2483 and squashes the following commits:
0845dd90 <Chao Sun> Fix bug
4083311c <Chao Sun> Address comments
c8f3a9ff <Chao Sun> Address comments
1a3bfad8 <Chao Sun> Fix lint
0b078cc6 <Chao Sun> ARROW-2483: use bit-packing for boolean vectors
---
rust/src/array.rs | 292 ++++++++++++++++++++++++++++++++++++++--------
rust/src/buffer.rs | 23 +++-
rust/src/util/bit_util.rs | 63 +++++++++-
3 files changed, 325 insertions(+), 53 deletions(-)
diff --git a/rust/src/array.rs b/rust/src/array.rs
index 5eb90f1..2da7104 100644
--- a/rust/src/array.rs
+++ b/rust/src/array.rs
@@ -18,8 +18,8 @@
///! Array types
use std::any::Any;
use std::convert::From;
+use std::io::Write;
use std::mem;
-use std::ptr;
use std::sync::Arc;
use array_data::*;
@@ -208,30 +208,28 @@ macro_rules! def_primitive_array {
const TY_SIZE: usize = mem::size_of::<$native_ty>();
const NULL: [u8; TY_SIZE] = [0; TY_SIZE];
- let data_len = data.len() as i64;
- let size = bit_util::round_upto_multiple_of_64(data_len) as
usize;
- let mut null_buffer = Vec::with_capacity(size);
- unsafe {
- ptr::write_bytes(null_buffer.as_mut_ptr(), 0, size);
- null_buffer.set_len(size);
- }
- let mut value_buffer: Vec<u8> = Vec::with_capacity(size *
TY_SIZE);
-
- let mut i = 0;
- for n in data {
- if let Some(v) = n {
- bit_util::set_bit(&mut null_buffer[..], i);
- value_buffer.extend_from_slice(&v.to_byte_slice());
- } else {
- value_buffer.extend_from_slice(&NULL);
+ let data_len = data.len();
+ let mut null_buf =
MutableBuffer::new(data_len).with_bitset(data_len, false);
+ let mut val_buf = MutableBuffer::new(data_len * TY_SIZE);
+
+ {
+ let null_slice = null_buf.data_mut();
+ for (i, v) in data.iter().enumerate() {
+ if let Some(n) = v {
+ bit_util::set_bit(null_slice, i as i64);
+ // unwrap() in the following should be safe here
since we've
+ // made sure enough space is allocated for the
values.
+ val_buf.write(&n.to_byte_slice()).unwrap();
+ } else {
+ val_buf.write(&NULL).unwrap();
+ }
}
- i += 1;
}
let array_data = ArrayData::builder($data_ty)
- .len(data_len)
- .add_buffer(Buffer::from(Buffer::from(value_buffer)))
- .null_bit_buffer(Buffer::from(null_buffer))
+ .len(data_len as i64)
+ .add_buffer(val_buf.freeze())
+ .null_bit_buffer(null_buf.freeze())
.build();
PrimitiveArray::from(array_data)
}
@@ -273,7 +271,6 @@ impl<T: ArrowPrimitiveType> Array for PrimitiveArray<T> {
}
}
-def_primitive_array!(DataType::Boolean, bool);
def_primitive_array!(DataType::UInt8, u8);
def_primitive_array!(DataType::UInt16, u16);
def_primitive_array!(DataType::UInt32, u32);
@@ -285,6 +282,123 @@ def_primitive_array!(DataType::Int64, i64);
def_primitive_array!(DataType::Float32, f32);
def_primitive_array!(DataType::Float64, f64);
+/// Array whose elements are of boolean types.
+pub struct BooleanArray {
+ data: ArrayDataRef,
+ raw_values: RawPtrBox<u8>,
+}
+
+impl BooleanArray {
+ pub fn new(length: i64, values: Buffer, null_count: i64, offset: i64) ->
Self {
+ let array_data = ArrayData::builder(DataType::Boolean)
+ .len(length)
+ .add_buffer(values)
+ .null_count(null_count)
+ .offset(offset)
+ .build();
+ BooleanArray::from(array_data)
+ }
+
+ /// Returns a `Buffer` holds all the values of this array.
+ ///
+ /// Note this doesn't take account into the offset of this array.
+ pub fn values(&self) -> Buffer {
+ self.data.buffers()[0].clone()
+ }
+
+ /// Returns the boolean value at index `i`.
+ pub fn value(&self, i: i64) -> bool {
+ let offset = i + self.offset();
+ assert!(offset < self.data.len() as i64);
+ unsafe { bit_util::get_bit_raw(self.raw_values.get(), offset) }
+ }
+}
+
+/// Constructs a boolean array from a vector. Should only be used for testing.
+impl From<Vec<bool>> for BooleanArray {
+ fn from(data: Vec<bool>) -> Self {
+ let num_byte = bit_util::ceil(data.len() as i64, 8) as usize;
+ let mut mut_buf = MutableBuffer::new(num_byte).with_bitset(num_byte,
false);
+ {
+ let mut_slice = mut_buf.data_mut();
+ for (i, b) in data.iter().enumerate() {
+ if *b {
+ bit_util::set_bit(mut_slice, i as i64);
+ }
+ }
+ }
+ let array_data = ArrayData::builder(DataType::Boolean)
+ .len(data.len() as i64)
+ .add_buffer(mut_buf.freeze())
+ .build();
+ BooleanArray::from(array_data)
+ }
+}
+
+impl From<Vec<Option<bool>>> for BooleanArray {
+ fn from(data: Vec<Option<bool>>) -> Self {
+ let data_len = data.len() as i64;
+ let num_byte = bit_util::ceil(data_len, 8) as usize;
+ let mut null_buf = MutableBuffer::new(num_byte).with_bitset(num_byte,
false);
+ let mut val_buf = MutableBuffer::new(num_byte).with_bitset(num_byte,
false);
+
+ {
+ let null_slice = null_buf.data_mut();
+ let val_slice = val_buf.data_mut();
+
+ for (i, v) in data.iter().enumerate() {
+ if let Some(b) = v {
+ bit_util::set_bit(null_slice, i as i64);
+ if *b {
+ bit_util::set_bit(val_slice, i as i64);
+ }
+ }
+ }
+ }
+
+ let array_data = ArrayData::builder(DataType::Boolean)
+ .len(data_len)
+ .add_buffer(val_buf.freeze())
+ .null_bit_buffer(null_buf.freeze())
+ .build();
+ BooleanArray::from(array_data)
+ }
+}
+
+/// Constructs a `BooleanArray` from an array data reference.
+impl From<ArrayDataRef> for BooleanArray {
+ fn from(data: ArrayDataRef) -> Self {
+ assert_eq!(
+ data.buffers().len(),
+ 1,
+ "BooleanArray data should contain a single buffer only (values
buffer)"
+ );
+ let raw_values = data.buffers()[0].raw_data();
+ assert!(
+ memory::is_aligned::<u8>(raw_values, mem::align_of::<u8>()),
+ "memory is not aligned"
+ );
+ Self {
+ data,
+ raw_values: RawPtrBox::new(raw_values),
+ }
+ }
+}
+
+impl Array for BooleanArray {
+ fn as_any(&self) -> &Any {
+ self
+ }
+
+ fn data(&self) -> ArrayDataRef {
+ self.data.clone()
+ }
+
+ fn data_ref(&self) -> &ArrayDataRef {
+ &self.data
+ }
+}
+
/// A list array where each element is a variable-sized sequence of values
with the same
/// type.
pub struct ListArray {
@@ -554,7 +668,7 @@ impl From<Vec<(Field, ArrayRef)>> for StructArray {
mod tests {
use std::thread;
- use super::{Array, BinaryArray, ListArray, PrimitiveArray, StructArray};
+ use super::*;
use array_data::ArrayData;
use buffer::Buffer;
use datatypes::{DataType, Field, ToByteSlice};
@@ -565,35 +679,35 @@ mod tests {
fn test_primitive_array_from_vec() {
let buf = Buffer::from(&[0, 1, 2, 3, 4].to_byte_slice());
let buf2 = buf.clone();
- let pa = PrimitiveArray::<i32>::new(5, buf, 0, 0);
- let slice = unsafe { ::std::slice::from_raw_parts(pa.raw_values(), 5)
};
- assert_eq!(buf2, pa.values());
+ let arr = PrimitiveArray::<i32>::new(5, buf, 0, 0);
+ let slice = unsafe { ::std::slice::from_raw_parts(arr.raw_values(), 5)
};
+ assert_eq!(buf2, arr.values());
assert_eq!(&[0, 1, 2, 3, 4], slice);
- assert_eq!(5, pa.len());
- assert_eq!(0, pa.offset());
- assert_eq!(0, pa.null_count());
+ assert_eq!(5, arr.len());
+ assert_eq!(0, arr.offset());
+ assert_eq!(0, arr.null_count());
for i in 0..5 {
- assert!(!pa.is_null(i));
- assert!(pa.is_valid(i));
- assert_eq!(i as i32, pa.value(i));
+ assert!(!arr.is_null(i));
+ assert!(arr.is_valid(i));
+ assert_eq!(i as i32, arr.value(i));
}
}
#[test]
fn test_primitive_array_from_vec_option() {
// Test building a primitive array with null values
- let pa = PrimitiveArray::<i32>::from(vec![Some(0), None, Some(2),
None, Some(4)]);
- assert_eq!(5, pa.len());
- assert_eq!(0, pa.offset());
- assert_eq!(2, pa.null_count());
+ let arr = PrimitiveArray::<i32>::from(vec![Some(0), None, Some(2),
None, Some(4)]);
+ assert_eq!(5, arr.len());
+ assert_eq!(0, arr.offset());
+ assert_eq!(2, arr.null_count());
for i in 0..5 {
if i % 2 == 0 {
- assert!(!pa.is_null(i));
- assert!(pa.is_valid(i));
- assert_eq!(i as i32, pa.value(i));
+ assert!(!arr.is_null(i));
+ assert!(arr.is_valid(i));
+ assert_eq!(i as i32, arr.value(i));
} else {
- assert!(pa.is_null(i));
- assert!(!pa.is_valid(i));
+ assert!(arr.is_null(i));
+ assert!(!arr.is_valid(i));
}
}
}
@@ -608,12 +722,12 @@ mod tests {
.offset(2)
.add_buffer(buf)
.build();
- let pa = PrimitiveArray::<i32>::from(data);
- assert_eq!(buf2, pa.values());
- assert_eq!(5, pa.len());
- assert_eq!(0, pa.null_count());
+ let arr = PrimitiveArray::<i32>::from(data);
+ assert_eq!(buf2, arr.values());
+ assert_eq!(5, arr.len());
+ assert_eq!(0, arr.null_count());
for i in 0..3 {
- assert_eq!((i + 2) as i32, pa.value(i));
+ assert_eq!((i + 2) as i32, arr.value(i));
}
}
@@ -627,6 +741,88 @@ mod tests {
}
#[test]
+ fn test_boolean_array_new() {
+ // 00000010 01001000
+ let buf = Buffer::from([72_u8, 2_u8]);
+ let buf2 = buf.clone();
+ let arr = BooleanArray::new(10, buf, 0, 0);
+ assert_eq!(buf2, arr.values());
+ assert_eq!(10, arr.len());
+ assert_eq!(0, arr.offset());
+ assert_eq!(0, arr.null_count());
+ for i in 0..10 {
+ assert!(!arr.is_null(i));
+ assert!(arr.is_valid(i));
+ assert_eq!(i == 3 || i == 6 || i == 9, arr.value(i), "failed at
{}", i)
+ }
+ }
+
+ #[test]
+ fn test_boolean_array_from_vec() {
+ let buf = Buffer::from([10_u8]);
+ let arr = BooleanArray::from(vec![false, true, false, true]);
+ assert_eq!(buf, arr.values());
+ assert_eq!(4, arr.len());
+ assert_eq!(0, arr.offset());
+ assert_eq!(0, arr.null_count());
+ for i in 0..4 {
+ assert!(!arr.is_null(i));
+ assert!(arr.is_valid(i));
+ assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i)
+ }
+ }
+
+ #[test]
+ fn test_boolean_array_from_vec_option() {
+ let buf = Buffer::from([10_u8]);
+ let arr = BooleanArray::from(vec![Some(false), Some(true), None,
Some(true)]);
+ assert_eq!(buf, arr.values());
+ assert_eq!(4, arr.len());
+ assert_eq!(0, arr.offset());
+ assert_eq!(1, arr.null_count());
+ for i in 0..4 {
+ if i == 2 {
+ assert!(arr.is_null(i));
+ assert!(!arr.is_valid(i));
+ } else {
+ assert!(!arr.is_null(i));
+ assert!(arr.is_valid(i));
+ assert_eq!(i == 1 || i == 3, arr.value(i), "failed at {}", i)
+ }
+ }
+ }
+
+ #[test]
+ fn test_boolean_array_builder() {
+ // Test building a boolean array with ArrayData builder and offset
+ // 000011011
+ let buf = Buffer::from([27_u8]);
+ let buf2 = buf.clone();
+ let data = ArrayData::builder(DataType::Boolean)
+ .len(5)
+ .offset(2)
+ .add_buffer(buf)
+ .build();
+ let arr = BooleanArray::from(data);
+ assert_eq!(buf2, arr.values());
+ assert_eq!(5, arr.len());
+ assert_eq!(2, arr.offset());
+ assert_eq!(0, arr.null_count());
+ for i in 0..3 {
+ assert_eq!(i != 0, arr.value(i), "failed at {}", i);
+ }
+ }
+
+ #[test]
+ #[should_panic(
+ expected = "BooleanArray data should contain a single buffer only
(values buffer)"
+ )]
+ fn test_boolean_array_invalid_buffer_len() {
+ let data = ArrayData::builder(DataType::Boolean).len(5).build();
+ BooleanArray::from(data);
+ }
+
+ #[test]
fn test_list_array() {
// Construct a value array
let value_data = ArrayData::builder(DataType::Int32)
@@ -832,7 +1028,7 @@ mod tests {
fn test_struct_array_from() {
let boolean_data = ArrayData::builder(DataType::Boolean)
.len(4)
- .add_buffer(Buffer::from([false, false, true,
true].to_byte_slice()))
+ .add_buffer(Buffer::from([12_u8]))
.build();
let int_data = ArrayData::builder(DataType::Int32)
.len(4)
@@ -841,7 +1037,7 @@ mod tests {
let struct_array = StructArray::from(vec![
(
Field::new("b", DataType::Boolean, false),
- Arc::new(PrimitiveArray::from(vec![false, false, true, true]))
as Arc<Array>,
+ Arc::new(BooleanArray::from(vec![false, false, true, true]))
as Arc<Array>,
),
(
Field::new("c", DataType::Int32, false),
diff --git a/rust/src/buffer.rs b/rust/src/buffer.rs
index 827d2d6..85133d9 100644
--- a/rust/src/buffer.rs
+++ b/rust/src/buffer.rs
@@ -40,7 +40,7 @@ struct BufferData {
/// The raw pointer into the buffer bytes
ptr: *const u8,
- /// The length of the buffer
+ /// The length (num of bytes) of the buffer
len: usize,
}
@@ -160,6 +160,22 @@ impl MutableBuffer {
}
}
+ /// Set the bits in the range of `[0, end)` to 0 (if `val` is false), or 1
(if `val`
+ /// is true). Also extend the length of this buffer to be `end`.
+ ///
+ /// This is useful when one wants to clear (or set) the bits and then
manipulate
+ /// the buffer directly (e.g., modifying the buffer by holding a mutable
reference
+ /// from `data_mut()`).
+ pub fn with_bitset(mut self, end: usize, val: bool) -> Self {
+ assert!(end < self.capacity);
+ let v = if val { 1 } else { 0 };
+ unsafe {
+ ::std::ptr::write_bytes(self.data, v, end);
+ self.len = end;
+ }
+ self
+ }
+
/// Adjust the capacity of this buffer to be at least `new_capacity`.
///
/// If the `new_capacity` is less than the current capacity, nothing is
done and `Ok`
@@ -203,6 +219,11 @@ impl MutableBuffer {
unsafe { ::std::slice::from_raw_parts(self.raw_data(), self.len()) }
}
+ /// Returns the data stored in this buffer as a mutable slice.
+ pub fn data_mut(&mut self) -> &mut [u8] {
+ unsafe { ::std::slice::from_raw_parts_mut(self.raw_data() as *mut u8,
self.len()) }
+ }
+
/// Returns a raw pointer for this buffer.
///
/// Note that this should be used cautiously, and the returned pointer
should not be
diff --git a/rust/src/util/bit_util.rs b/rust/src/util/bit_util.rs
index 8d9a805..5e4c35c 100644
--- a/rust/src/util/bit_util.rs
+++ b/rust/src/util/bit_util.rs
@@ -44,13 +44,22 @@ fn round_upto_power_of_2(num: i64, factor: i64) -> i64 {
/// Returns whether bit at position `i` in `data` is set or not
#[inline]
pub fn get_bit(data: &[u8], i: i64) -> bool {
- (data[(i / 8) as usize] & BIT_MASK[(i % 8) as usize]) != 0
+ (data[(i >> 3) as usize] & BIT_MASK[(i & 7) as usize]) != 0
+}
+
+/// Returns whether bit at position `i` in `data` is set or not.
+///
+/// Note this doesn't do any bound checking, for performance reason. The
caller is
+/// responsible to guarantee that `i` is within bounds.
+#[inline]
+pub unsafe fn get_bit_raw(data: *const u8, i: i64) -> bool {
+ (*data.offset((i >> 3) as isize) & BIT_MASK[(i & 7) as usize]) != 0
}
/// Sets bit at position `i` for `data`
#[inline]
pub fn set_bit(data: &mut [u8], i: i64) {
- data[(i / 8) as usize] |= BIT_MASK[(i % 8) as usize]
+ data[(i >> 3) as usize] |= BIT_MASK[(i & 7) as usize]
}
/// Returns the number of 1-bits in `data`
@@ -68,8 +77,8 @@ pub fn count_set_bits(data: &[u8]) -> i64 {
pub fn count_set_bits_offset(data: &[u8], offset: i64) -> i64 {
debug_assert!(offset <= (data.len() * 8) as i64);
- let start_byte_pos = (offset / 8) as usize;
- let start_bit_pos = offset % 8;
+ let start_byte_pos = (offset >> 3) as usize;
+ let start_bit_pos = offset & 7;
if start_bit_pos == 0 {
count_set_bits(&data[start_byte_pos..])
@@ -85,6 +94,16 @@ pub fn count_set_bits_offset(data: &[u8], offset: i64) ->
i64 {
}
}
+/// Returns the ceil of `value`/`divisor`
+#[inline]
+pub fn ceil(value: i64, divisor: i64) -> i64 {
+ let mut result = value / divisor;
+ if value % divisor != 0 {
+ result += 1
+ };
+ result
+}
+
#[cfg(test)]
mod tests {
use rand::{thread_rng, Rng};
@@ -130,6 +149,28 @@ mod tests {
}
#[test]
+ fn test_get_bit_raw() {
+ const NUM_BYTE: usize = 10;
+ let mut buf = vec![0; NUM_BYTE];
+ let mut expected = vec![];
+ let mut rng = thread_rng();
+ for i in 0..8 * NUM_BYTE {
+ let b = rng.gen_bool(0.5);
+ expected.push(b);
+ if b {
+ set_bit(&mut buf[..], i as i64)
+ }
+ }
+
+ let raw_ptr = buf.as_ptr();
+ for (i, b) in expected.iter().enumerate() {
+ unsafe {
+ assert_eq!(*b, get_bit_raw(raw_ptr, i as i64));
+ }
+ }
+ }
+
+ #[test]
fn test_set_bit() {
let mut b = [0b00000000];
set_bit(&mut b, 0);
@@ -178,4 +219,18 @@ mod tests {
assert_eq!(0, count_set_bits_offset(&[0b11111111, 0b11111111], 16));
}
+ #[test]
+ fn test_ceil() {
+ assert_eq!(ceil(0, 1), 0);
+ assert_eq!(ceil(1, 1), 1);
+ assert_eq!(ceil(1, 2), 1);
+ assert_eq!(ceil(1, 8), 1);
+ assert_eq!(ceil(7, 8), 1);
+ assert_eq!(ceil(8, 8), 1);
+ assert_eq!(ceil(9, 8), 2);
+ assert_eq!(ceil(9, 9), 1);
+ assert_eq!(ceil(10000000000, 10), 1000000000);
+ assert_eq!(ceil(10, 10000000000), 1);
+ assert_eq!(ceil(10000000000, 1000000000), 10);
+ }
}