This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-datafusion.git
The following commit(s) were added to refs/heads/master by this push:
new 454203174 Write faster kernel for is_distinct (#4560)
454203174 is described below
commit 454203174a49ac8f36fc64da59f43d20c1400348
Author: comphead <[email protected]>
AuthorDate: Wed Dec 14 12:28:25 2022 -0800
Write faster kernel for is_distinct (#4560)
* Optimize is_distinct
* added tests
* added 1 more test
---
.../src/expressions/binary/kernels_arrow.rs | 138 +++++++++++++++++----
1 file changed, 116 insertions(+), 22 deletions(-)
diff --git a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs
b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs
index d1c927284..2523a56df 100644
--- a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs
+++ b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs
@@ -55,11 +55,48 @@ pub(crate) fn is_distinct_from<T>(
where
T: ArrowNumericType,
{
- Ok(left
- .iter()
- .zip(right.iter())
- .map(|(x, y)| Some(x != y))
- .collect())
+ let left_data = left.data();
+ let right_data = right.data();
+ let array_len = left_data.len().min(right_data.len());
+
+ let left_values = left.values();
+ let right_values = right.values();
+
+ let distinct = arrow_buffer::MutableBuffer::collect_bool(array_len, |i| {
+ left_data.is_null(i) != right_data.is_null(i) || left_values[i] !=
right_values[i]
+ });
+
+ let array_data = ArrayData::builder(arrow_schema::DataType::Boolean)
+ .len(array_len)
+ .add_buffer(distinct.into());
+
+ Ok(BooleanArray::from(unsafe { array_data.build_unchecked() }))
+}
+
+pub(crate) fn is_not_distinct_from<T>(
+ left: &PrimitiveArray<T>,
+ right: &PrimitiveArray<T>,
+) -> Result<BooleanArray>
+where
+ T: ArrowNumericType,
+{
+ let left_data = left.data();
+ let right_data = right.data();
+ let array_len = left_data.len().min(right_data.len());
+
+ let left_values = left.values();
+ let right_values = right.values();
+
+ let distinct = arrow_buffer::MutableBuffer::collect_bool(array_len, |i| {
+ !(left_data.is_null(i) != right_data.is_null(i)
+ || left_values[i] != right_values[i])
+ });
+
+ let array_data = ArrayData::builder(arrow_schema::DataType::Boolean)
+ .len(array_len)
+ .add_buffer(distinct.into());
+
+ Ok(BooleanArray::from(unsafe { array_data.build_unchecked() }))
}
pub(crate) fn is_distinct_from_utf8<OffsetSize: OffsetSizeTrait>(
@@ -93,20 +130,6 @@ fn make_boolean_array(length: usize, value: bool) ->
Result<BooleanArray> {
Ok((0..length).into_iter().map(|_| Some(value)).collect())
}
-pub(crate) fn is_not_distinct_from<T>(
- left: &PrimitiveArray<T>,
- right: &PrimitiveArray<T>,
-) -> Result<BooleanArray>
-where
- T: ArrowNumericType,
-{
- Ok(left
- .iter()
- .zip(right.iter())
- .map(|(x, y)| Some(x == y))
- .collect())
-}
-
pub(crate) fn is_not_distinct_from_utf8<OffsetSize: OffsetSizeTrait>(
left: &GenericStringArray<OffsetSize>,
right: &GenericStringArray<OffsetSize>,
@@ -329,6 +352,15 @@ mod tests {
.unwrap()
}
+ fn create_int_array(array: &[Option<i32>]) -> Int32Array {
+ let mut int_builder = Int32Builder::with_capacity(array.len());
+
+ for value in array.iter().copied() {
+ int_builder.append_option(value)
+ }
+ int_builder.finish()
+ }
+
#[test]
fn comparison_decimal_op_test() -> Result<()> {
let value_i128: i128 = 123;
@@ -355,14 +387,13 @@ mod tests {
);
// is_distinct: left distinct right
- let result = is_distinct_from_decimal(&left_decimal_array,
&right_decimal_array)?;
+ let result = is_distinct_from(&left_decimal_array,
&right_decimal_array)?;
assert_eq!(
BooleanArray::from(vec![Some(true), Some(true), Some(true),
Some(false)]),
result
);
// is_distinct: left distinct right
- let result =
- is_not_distinct_from_decimal(&left_decimal_array,
&right_decimal_array)?;
+ let result = is_not_distinct_from(&left_decimal_array,
&right_decimal_array)?;
assert_eq!(
BooleanArray::from(vec![Some(false), Some(false), Some(false),
Some(true)]),
result
@@ -482,4 +513,67 @@ mod tests {
let err = modulus_decimal_scalar(&left_decimal_array, 0).unwrap_err();
assert_eq!("Arrow error: Divide by zero error", err.to_string());
}
+
+ #[test]
+ fn is_distinct_from_non_nulls() -> Result<()> {
+ let left_int_array =
+ create_int_array(&[Some(0), Some(1), Some(2), Some(3), Some(4)]);
+ let right_int_array =
+ create_int_array(&[Some(4), Some(3), Some(2), Some(1), Some(0)]);
+
+ assert_eq!(
+ BooleanArray::from(vec![
+ Some(true),
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(true),
+ ]),
+ is_distinct_from(&left_int_array, &right_int_array)?
+ );
+ assert_eq!(
+ BooleanArray::from(vec![
+ Some(false),
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(false),
+ ]),
+ is_not_distinct_from(&left_int_array, &right_int_array)?
+ );
+ Ok(())
+ }
+
+ #[test]
+ fn is_distinct_from_nulls() -> Result<()> {
+ let left_int_array =
+ create_int_array(&[Some(0), Some(0), None, Some(3), Some(0),
Some(0)]);
+ let right_int_array =
+ create_int_array(&[Some(0), None, None, None, Some(0), None]);
+
+ assert_eq!(
+ BooleanArray::from(vec![
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true)
+ ]),
+ is_distinct_from(&left_int_array, &right_int_array)?
+ );
+
+ assert_eq!(
+ BooleanArray::from(vec![
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false),
+ Some(true),
+ Some(false)
+ ]),
+ is_not_distinct_from(&left_int_array, &right_int_array)?
+ );
+ Ok(())
+ }
}