alamb commented on code in PR #18449:
URL: https://github.com/apache/datafusion/pull/18449#discussion_r2508160386
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -1514,4 +1528,648 @@ mod tests {
assert_snapshot!(display_string, @"a@0 NOT IN (SET) ([a, b, NULL])");
Ok(())
}
+
+ #[test]
+ fn in_list_struct() -> Result<()> {
Review Comment:
Can we also please add some .slt level tests for `IN` on a `set`?
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -80,92 +75,85 @@ struct ArrayHashSet {
map: HashMap<usize, (), ()>,
}
-struct ArraySet<T> {
- array: T,
- hash_set: ArrayHashSet,
-}
-
-impl<T> ArraySet<T>
-where
- T: Array + From<ArrayData>,
-{
- fn new(array: &T, hash_set: ArrayHashSet) -> Self {
- Self {
- array: downcast_array(array),
- hash_set,
+impl ArrayHashSet {
+ /// Checks if values in `v` are contained in the `in_array` using this
hash set for lookup.
+ fn contains(
+ &self,
+ v: &dyn Array,
+ in_array: &dyn Array,
+ negated: bool,
+ ) -> Result<BooleanArray> {
+ // Null type comparisons always return null (SQL three-valued logic)
+ if v.data_type() == &DataType::Null || in_array.data_type() ==
&DataType::Null {
+ return Ok(BooleanArray::from(vec![None; v.len()]));
}
- }
-}
-impl<T> Set for ArraySet<T>
-where
- T: Array + 'static,
- for<'a> &'a T: ArrayAccessor,
- for<'a> <&'a T as ArrayAccessor>::Item: IsEqual,
-{
- fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray> {
downcast_dictionary_array! {
v => {
- let values_contains = self.contains(v.values().as_ref(),
negated)?;
+ let values_contains = self.contains(v.values().as_ref(),
in_array, negated)?;
let result = take(&values_contains, v.keys(), None)?;
return Ok(downcast_array(result.as_ref()))
}
_ => {}
}
- let v = v.as_any().downcast_ref::<T>().unwrap();
- let in_array = &self.array;
let has_nulls = in_array.null_count() != 0;
- Ok(ArrayIter::new(v)
- .map(|v| {
- v.and_then(|v| {
- let hash = v.hash_one(&self.hash_set.state);
- let contains = self
- .hash_set
- .map
- .raw_entry()
- .from_hash(hash, |idx|
in_array.value(*idx).is_equal(&v))
- .is_some();
-
- match contains {
- true => Some(!negated),
- false if has_nulls => None,
- false => Some(negated),
- }
- })
+ let mut hashes_buf = vec![0u64; v.len()];
Review Comment:
As a follow on PR we could potentially look into reusing this hashes_buf --
aka rather than reallocating each invocations of `contains` instead make a
field (probably needs to be a `Mutex` or something) that is a Vec
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -80,92 +75,85 @@ struct ArrayHashSet {
map: HashMap<usize, (), ()>,
}
-struct ArraySet<T> {
- array: T,
- hash_set: ArrayHashSet,
-}
-
-impl<T> ArraySet<T>
-where
- T: Array + From<ArrayData>,
-{
- fn new(array: &T, hash_set: ArrayHashSet) -> Self {
- Self {
- array: downcast_array(array),
- hash_set,
+impl ArrayHashSet {
+ /// Checks if values in `v` are contained in the `in_array` using this
hash set for lookup.
+ fn contains(
+ &self,
+ v: &dyn Array,
+ in_array: &dyn Array,
+ negated: bool,
+ ) -> Result<BooleanArray> {
+ // Null type comparisons always return null (SQL three-valued logic)
+ if v.data_type() == &DataType::Null || in_array.data_type() ==
&DataType::Null {
+ return Ok(BooleanArray::from(vec![None; v.len()]));
}
- }
-}
-impl<T> Set for ArraySet<T>
-where
- T: Array + 'static,
- for<'a> &'a T: ArrayAccessor,
- for<'a> <&'a T as ArrayAccessor>::Item: IsEqual,
-{
- fn contains(&self, v: &dyn Array, negated: bool) -> Result<BooleanArray> {
downcast_dictionary_array! {
v => {
- let values_contains = self.contains(v.values().as_ref(),
negated)?;
+ let values_contains = self.contains(v.values().as_ref(),
in_array, negated)?;
let result = take(&values_contains, v.keys(), None)?;
return Ok(downcast_array(result.as_ref()))
}
_ => {}
}
- let v = v.as_any().downcast_ref::<T>().unwrap();
- let in_array = &self.array;
let has_nulls = in_array.null_count() != 0;
- Ok(ArrayIter::new(v)
- .map(|v| {
- v.and_then(|v| {
- let hash = v.hash_one(&self.hash_set.state);
- let contains = self
- .hash_set
- .map
- .raw_entry()
- .from_hash(hash, |idx|
in_array.value(*idx).is_equal(&v))
- .is_some();
-
- match contains {
- true => Some(!negated),
- false if has_nulls => None,
- false => Some(negated),
- }
- })
+ let mut hashes_buf = vec![0u64; v.len()];
+ create_hashes([v], &self.state, &mut hashes_buf)?;
+ let cmp = make_comparator(v, in_array, SortOptions::default())?;
Review Comment:
the comparator is some dynamic function -- the overhead of using the dynamic
dispatch in the critical path may be substantial).
If it turns out to be too slow, we can potentially create specializations
for comparisons (aka make a speicalized hash set for the different physical
array types, and fall back to the dynamic comparator)
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -474,16 +427,46 @@ pub fn in_list(
)))
}
+/// Create a new InList expression directly from an array, bypassing
expression evaluation.
+///
+/// This is more efficient than `in_list()` when you already have the list as
an array,
+/// as it avoids the conversion: ArrayRef -> Vec<PhysicalExpr> -> ArrayRef ->
ArraySet.
+/// Instead it goes directly: ArrayRef -> ArraySet.
+///
+/// The `list` field will be empty when using this constructor, as the array
is stored
+/// directly in the static filter.
+pub fn in_list_from_array(
Review Comment:
I wonder if it would be more discoverable if this was a method on `InList`
rather than a free function
Something like
```rust
impl InLIst
fn new_from_array( expr: Arc<dyn PhysicalExpr>,
array: ArrayRef,
negated: bool,
) -> Result<Self> {
...
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]