alamb commented on code in PR #3324: URL: https://github.com/apache/arrow-datafusion/pull/3324#discussion_r962143914
########## datafusion/physical-expr/src/expressions/in_list.rs: ########## @@ -398,6 +357,26 @@ fn set_contains_utf8<OffsetSize: OffsetSizeTrait>( collection_contains_check!(array, native_set, negated, contains_null) } +fn set_contains_binary<OffsetSize: OffsetSizeTrait>( + array: &GenericBinaryArray<OffsetSize>, + set: &HashSet<ScalarValue>, + negated: bool, +) -> ColumnarValue { + let contains_null = set.iter().any(|v| v.is_null()); + let native_array = set + .iter() + .flat_map(|v| match v { + Binary(v) | LargeBinary(v) => v.as_deref(), + datatype => { + unreachable!("InList can't reach other data type {} for {}.", datatype, v) + } + }) + .collect::<Vec<_>>(); + let native_set: HashSet<&[u8]> = HashSet::from_iter(native_array); Review Comment: Can you skip this intermediate `Vec` and instead go directly to HashSet? Like `.collect::<HashSet<_>>()` ? ########## datafusion/physical-expr/src/expressions/in_list.rs: ########## @@ -471,37 +450,50 @@ impl InListExpr { }) .collect::<Vec<&str>>(); - if negated { - if contains_null { - Ok(ColumnarValue::Array(Arc::new( - array - .iter() - .map(|x| match x.map(|v| !values.contains(&v)) { - Some(true) => None, - x => x, - }) - .collect::<BooleanArray>(), - ))) - } else { - Ok(ColumnarValue::Array(Arc::new(not_in_list_utf8( - array, &values, - )?))) - } - } else if contains_null { - Ok(ColumnarValue::Array(Arc::new( - array - .iter() - .map(|x| match x.map(|v| values.contains(&v)) { - Some(false) => None, - x => x, - }) - .collect::<BooleanArray>(), - ))) - } else { - Ok(ColumnarValue::Array(Arc::new(in_list_utf8( - array, &values, - )?))) - } + Ok(collection_contains_check!( + array, + values, + negated, + contains_null + )) + } + + fn compare_binary<T: OffsetSizeTrait>( + &self, + array: ArrayRef, + list_values: Vec<ColumnarValue>, + negated: bool, + ) -> Result<ColumnarValue> { + let array = array + .as_any() + .downcast_ref::<GenericBinaryArray<T>>() + .unwrap(); + + let contains_null = list_values + .iter() + .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null())); + let values = list_values + .iter() + .flat_map(|expr| match expr { + ColumnarValue::Scalar(s) => match s { + ScalarValue::Binary(Some(v)) | ScalarValue::LargeBinary(Some(v)) => { + Some(v.as_slice()) + } + ScalarValue::Binary(None) | ScalarValue::LargeBinary(None) => None, + datatype => unimplemented!("Unexpected type {} for InList", datatype), + }, + ColumnarValue::Array(_) => { + unimplemented!("InList does not yet support nested columns.") Review Comment: I think this case is when one of the arguments to `InList` is a column (rather than a constant) So like `col1 IN ('1', '2', col2)` type thing (no change needed I was just trying to clarify) ########## datafusion/physical-expr/src/expressions/in_list.rs: ########## @@ -49,30 +48,6 @@ use datafusion_expr::ColumnarValue; /// TODO: add switch codeGen in In_List static OPTIMIZER_INSET_THRESHOLD: usize = 30; -macro_rules! compare_op_scalar { Review Comment: ❤️ -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org