alamb commented on code in PR #3975:
URL: https://github.com/apache/arrow-datafusion/pull/3975#discussion_r1006183923
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -714,188 +911,31 @@ impl PhysicalExpr for InListExpr {
};
match value_data_type {
- DataType::Float32 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Float32,
- Float32Array
- )
- }
- DataType::Float64 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Float64,
- Float64Array
- )
- }
- DataType::Int16 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Int16,
- Int16Array
- )
- }
- DataType::Int32 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Int32,
- Int32Array
- )
- }
- DataType::Int64 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Int64,
- Int64Array
- )
- }
- DataType::Int8 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Int8,
- Int8Array
- )
- }
- DataType::UInt16 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- UInt16,
- UInt16Array
- )
- }
- DataType::UInt32 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- UInt32,
- UInt32Array
- )
- }
- DataType::UInt64 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- UInt64,
- UInt64Array
- )
- }
- DataType::UInt8 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- UInt8,
- UInt8Array
- )
- }
- DataType::Date32 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Date32,
- Date32Array
- )
- }
- DataType::Date64 => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- Date64,
- Date64Array
- )
- }
- DataType::Boolean => Ok(make_contains!(
- array,
- list_values,
- self.negated,
- Boolean,
- BooleanArray
- )),
- DataType::Utf8 => {
- self.compare_utf8::<i32>(array, list_values, self.negated)
- }
- DataType::LargeUtf8 => {
- self.compare_utf8::<i64>(array, list_values, self.negated)
- }
- DataType::Binary => {
- self.compare_binary::<i32>(array, list_values,
self.negated)
- }
- DataType::LargeBinary => {
- self.compare_binary::<i64>(array, list_values,
self.negated)
- }
- DataType::Null => {
- let null_array = new_null_array(&DataType::Boolean,
array.len());
- Ok(ColumnarValue::Array(Arc::new(null_array)))
- }
- DataType::Decimal128(_, _) => {
- let decimal_array =
-
array.as_any().downcast_ref::<Decimal128Array>().unwrap();
- Ok(make_list_contains_decimal(
- decimal_array,
- list_values,
- self.negated,
- ))
- }
- DataType::Timestamp(unit, _) => match unit {
- TimeUnit::Second => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- TimestampSecond,
- TimestampSecondArray
- )
- }
- TimeUnit::Millisecond => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- TimestampMillisecond,
- TimestampMillisecondArray
- )
- }
- TimeUnit::Microsecond => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- TimestampMicrosecond,
- TimestampMicrosecondArray
- )
- }
- TimeUnit::Nanosecond => {
- make_contains_primitive!(
- array,
- list_values,
- self.negated,
- TimestampNanosecond,
- TimestampNanosecondArray
- )
+ DataType::Dictionary(_key_type, value_type) => {
+ // Get values from the dictionary that include nulls for
none values
+ let dict_array = array
Review Comment:
I feel like you should be able to evaluate the IN list only on the
dictionary values rather than continually looking up the same elements over and
over again in the dictionary and use 'take'
https://docs.rs/arrow/latest/arrow/compute/kernels/take/fn.take.html to form
the final array
Something like thus pseudo code maybe
```rust
let values = dict_array.values();
// recursively evaluate IN <..> on the value array
let values_result = evaluate_set(values, list_values);
// Then form the final boolean array by calling take on the indices
compute::take(values_result, dict_array.keys())
```
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -436,6 +436,13 @@ impl InListExpr {
ScalarValue::Utf8(None) => None,
ScalarValue::LargeUtf8(Some(v)) => Some(v.as_str()),
ScalarValue::LargeUtf8(None) => None,
+ ScalarValue::Dictionary(_, v) => match v.as_ref() {
Review Comment:
It allows getting the underlying value out of a (typed)
`ScalarValue::Dictionary`
##########
datafusion/core/tests/sql/predicates.rs:
##########
@@ -428,9 +428,8 @@ async fn csv_in_set_test() -> Result<()> {
}
#[tokio::test]
-#[ignore]
// https://github.com/apache/arrow-datafusion/issues/3936
Review Comment:
```suggestion
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]