alamb commented on code in PR #2764:
URL: https://github.com/apache/arrow-datafusion/pull/2764#discussion_r905386562
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -160,6 +165,7 @@ macro_rules! make_contains_primitive {
ColumnarValue::Scalar(s) => match s {
ScalarValue::$SCALAR_VALUE(Some(v)) => Some(*v),
ScalarValue::$SCALAR_VALUE(None) => None,
+ // TODO this is bug, for primitive the expr list should be
cast to the same data type
Review Comment:
Is it worth filing a follow on ticket to address? I am not 100% sure I
understand what the end user facing bug would be (is it null is the wrong type?)
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -504,6 +634,11 @@ impl PhysicalExpr for InListExpr {
.unwrap();
set_contains_with_negated!(array, set, self.negated)
}
+ DataType::Decimal(_, _) => {
+ let array =
array.as_any().downcast_ref::<DecimalArray>().unwrap();
Review Comment:
Stylistic suggestion: you can also use
https://docs.rs/arrow/16.0.0/arrow/array/fn.as_decimal_array.html here too
```suggestion
let array = as_decimal_array(array);
```
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -300,6 +306,130 @@ fn cast_static_filter_to_set(list: &[Arc<dyn
PhysicalExpr>]) -> HashSet<ScalarVa
}))
}
+fn make_list_contains_decimal(
+ array: &DecimalArray,
+ list: Vec<ColumnarValue>,
+ negated: bool,
+) -> BooleanArray {
+ let contains_null = list
+ .iter()
+ .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
+ let values = list
+ .iter()
+ .flat_map(|v| match v {
+ ColumnarValue::Scalar(s) => match s {
+ Decimal128(v128op, _, _) => v128op.map(|v128| v128),
Review Comment:
I don't understand what this `map` does. Is this the same?
```suggestion
Decimal128(v128op, _, _) => *v128op,
```
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -640,13 +785,62 @@ impl PhysicalExpr for InListExpr {
}
}
+type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);
+
/// Creates a unary expression InList
pub fn in_list(
expr: Arc<dyn PhysicalExpr>,
list: Vec<Arc<dyn PhysicalExpr>>,
negated: &bool,
+ input_schema: &Schema,
) -> Result<Arc<dyn PhysicalExpr>> {
- Ok(Arc::new(InListExpr::new(expr, list, *negated)))
+ let (cast_expr, cast_list) = in_list_cast(expr, list, input_schema)?;
+ Ok(Arc::new(InListExpr::new(cast_expr, cast_list, *negated)))
+}
+
+fn in_list_cast(
+ expr: Arc<dyn PhysicalExpr>,
+ list: Vec<Arc<dyn PhysicalExpr>>,
+ input_schema: &Schema,
+) -> Result<InListCastResult> {
+ let expr_type = &expr.data_type(input_schema)?;
+ let list_types: Vec<DataType> = list
+ .iter()
+ .map(|list_expr| list_expr.data_type(input_schema).unwrap())
+ .collect();
+ // TODO in the arrow-rs, should support NULL type to Decimal Data type
+ // TODO support in the arrow-rs, NULL value cast to Decimal Value
+ let result_type = get_coerce_type(expr_type, &list_types);
+ match result_type {
+ None => Err(DataFusionError::Internal(format!(
+ "In expr can find the coerced type for {:?} in {:?}",
+ expr_type, list_types
+ ))),
+ Some(data_type) => {
+ // find the coerced type
+ let cast_expr = try_cast(expr, input_schema, data_type.clone())?;
+ let cast_list_expr = list
+ .into_iter()
+ .map(|list_expr| {
+ try_cast(list_expr, input_schema,
data_type.clone()).unwrap()
+ })
+ .collect();
+ Ok((cast_expr, cast_list_expr))
+ }
+ }
+}
+
+fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) ->
Option<DataType> {
+ // get the equal coerced data type
+ list_type
+ .iter()
+ .fold(Some(expr_type.clone()), |left, right_type| {
+ match left {
+ None => None,
+ // TODO refactor a framework to do the data type coercion
Review Comment:
👍 I agree it could be much nicer
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -300,6 +306,130 @@ fn cast_static_filter_to_set(list: &[Arc<dyn
PhysicalExpr>]) -> HashSet<ScalarVa
}))
}
+fn make_list_contains_decimal(
+ array: &DecimalArray,
+ list: Vec<ColumnarValue>,
+ negated: bool,
+) -> BooleanArray {
+ let contains_null = list
+ .iter()
+ .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
+ let values = list
+ .iter()
+ .flat_map(|v| match v {
+ ColumnarValue::Scalar(s) => match s {
+ Decimal128(v128op, _, _) => v128op.map(|v128| v128),
+ _ => {
+ unreachable!(
+ "InList can't reach other data type for decimal data
type."
+ )
+ }
+ },
+ ColumnarValue::Array(_) => {
+ unimplemented!("InList does not yet support nested columns.")
+ }
+ })
+ .collect::<Vec<_>>();
+
+ if !negated {
+ // In
+ array
+ .iter()
+ .map(|v| match v {
+ None => None,
+ Some(v128) => {
+ if values.contains(&v128) {
+ Some(true)
+ } else {
+ Some(false)
+ }
+ }
+ })
+ .collect::<BooleanArray>()
+ } else {
+ // Not in
+ if contains_null {
+ // if the expr is NOT IN and the list contains NULL value
+ // All the result must be NONE
+ vec![None; array.len()]
Review Comment:
I think using `new_null_array` here might be both simpler and easier to read:
https://docs.rs/arrow/16.0.0/arrow/array/fn.new_null_array.html
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -300,6 +306,130 @@ fn cast_static_filter_to_set(list: &[Arc<dyn
PhysicalExpr>]) -> HashSet<ScalarVa
}))
}
+fn make_list_contains_decimal(
+ array: &DecimalArray,
+ list: Vec<ColumnarValue>,
+ negated: bool,
+) -> BooleanArray {
+ let contains_null = list
+ .iter()
+ .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
+ let values = list
+ .iter()
+ .flat_map(|v| match v {
+ ColumnarValue::Scalar(s) => match s {
+ Decimal128(v128op, _, _) => v128op.map(|v128| v128),
+ _ => {
+ unreachable!(
+ "InList can't reach other data type for decimal data
type."
+ )
+ }
+ },
+ ColumnarValue::Array(_) => {
+ unimplemented!("InList does not yet support nested columns.")
+ }
+ })
+ .collect::<Vec<_>>();
+
+ if !negated {
+ // In
+ array
+ .iter()
+ .map(|v| match v {
+ None => None,
+ Some(v128) => {
+ if values.contains(&v128) {
+ Some(true)
+ } else {
+ Some(false)
+ }
+ }
+ })
+ .collect::<BooleanArray>()
Review Comment:
```suggestion
array
.iter()
.map(|v| v.map(|v128| values.contains(&v128)))
.collect::<BooleanArray>()
```
Is a bit less verbose and perhaps easier to follow
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -300,6 +306,130 @@ fn cast_static_filter_to_set(list: &[Arc<dyn
PhysicalExpr>]) -> HashSet<ScalarVa
}))
}
+fn make_list_contains_decimal(
+ array: &DecimalArray,
+ list: Vec<ColumnarValue>,
+ negated: bool,
+) -> BooleanArray {
+ let contains_null = list
+ .iter()
+ .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
+ let values = list
+ .iter()
+ .flat_map(|v| match v {
+ ColumnarValue::Scalar(s) => match s {
+ Decimal128(v128op, _, _) => v128op.map(|v128| v128),
+ _ => {
+ unreachable!(
+ "InList can't reach other data type for decimal data
type."
+ )
+ }
+ },
+ ColumnarValue::Array(_) => {
+ unimplemented!("InList does not yet support nested columns.")
+ }
+ })
+ .collect::<Vec<_>>();
+
+ if !negated {
+ // In
+ array
+ .iter()
+ .map(|v| match v {
+ None => None,
+ Some(v128) => {
+ if values.contains(&v128) {
+ Some(true)
+ } else {
+ Some(false)
+ }
+ }
+ })
+ .collect::<BooleanArray>()
+ } else {
+ // Not in
+ if contains_null {
+ // if the expr is NOT IN and the list contains NULL value
+ // All the result must be NONE
+ vec![None; array.len()]
+ .into_iter()
+ .collect::<BooleanArray>()
+ } else {
+ array
+ .iter()
+ .map(|v| match v {
+ None => None,
+ Some(v128) => {
+ if !values.contains(&v128) {
+ Some(true)
+ } else {
+ Some(false)
+ }
+ }
+ })
+ .collect::<BooleanArray>()
+ }
+ }
+}
+
+fn make_set_contains_decimal(
Review Comment:
The same comments apply here as do `make_list_contains_decimal` as they are
largely copy/paste of each other.
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -300,6 +306,130 @@ fn cast_static_filter_to_set(list: &[Arc<dyn
PhysicalExpr>]) -> HashSet<ScalarVa
}))
}
+fn make_list_contains_decimal(
+ array: &DecimalArray,
+ list: Vec<ColumnarValue>,
+ negated: bool,
+) -> BooleanArray {
+ let contains_null = list
+ .iter()
+ .any(|v| matches!(v, ColumnarValue::Scalar(s) if s.is_null()));
+ let values = list
+ .iter()
+ .flat_map(|v| match v {
+ ColumnarValue::Scalar(s) => match s {
+ Decimal128(v128op, _, _) => v128op.map(|v128| v128),
+ _ => {
+ unreachable!(
+ "InList can't reach other data type for decimal data
type."
+ )
+ }
+ },
+ ColumnarValue::Array(_) => {
+ unimplemented!("InList does not yet support nested columns.")
+ }
+ })
+ .collect::<Vec<_>>();
+
+ if !negated {
+ // In
+ array
+ .iter()
+ .map(|v| match v {
+ None => None,
+ Some(v128) => {
+ if values.contains(&v128) {
+ Some(true)
+ } else {
+ Some(false)
+ }
+ }
+ })
+ .collect::<BooleanArray>()
Review Comment:
The same could be applied to the `NOT IN` branch
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -640,13 +785,62 @@ impl PhysicalExpr for InListExpr {
}
}
+type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);
+
/// Creates a unary expression InList
pub fn in_list(
expr: Arc<dyn PhysicalExpr>,
list: Vec<Arc<dyn PhysicalExpr>>,
negated: &bool,
+ input_schema: &Schema,
) -> Result<Arc<dyn PhysicalExpr>> {
- Ok(Arc::new(InListExpr::new(expr, list, *negated)))
+ let (cast_expr, cast_list) = in_list_cast(expr, list, input_schema)?;
+ Ok(Arc::new(InListExpr::new(cast_expr, cast_list, *negated)))
+}
+
+fn in_list_cast(
+ expr: Arc<dyn PhysicalExpr>,
+ list: Vec<Arc<dyn PhysicalExpr>>,
+ input_schema: &Schema,
+) -> Result<InListCastResult> {
+ let expr_type = &expr.data_type(input_schema)?;
+ let list_types: Vec<DataType> = list
+ .iter()
+ .map(|list_expr| list_expr.data_type(input_schema).unwrap())
+ .collect();
+ // TODO in the arrow-rs, should support NULL type to Decimal Data type
+ // TODO support in the arrow-rs, NULL value cast to Decimal Value
+ let result_type = get_coerce_type(expr_type, &list_types);
+ match result_type {
+ None => Err(DataFusionError::Internal(format!(
+ "In expr can find the coerced type for {:?} in {:?}",
+ expr_type, list_types
+ ))),
+ Some(data_type) => {
+ // find the coerced type
+ let cast_expr = try_cast(expr, input_schema, data_type.clone())?;
+ let cast_list_expr = list
+ .into_iter()
+ .map(|list_expr| {
+ try_cast(list_expr, input_schema,
data_type.clone()).unwrap()
+ })
+ .collect();
+ Ok((cast_expr, cast_list_expr))
+ }
+ }
+}
+
+fn get_coerce_type(expr_type: &DataType, list_type: &[DataType]) ->
Option<DataType> {
+ // get the equal coerced data type
+ list_type
+ .iter()
+ .fold(Some(expr_type.clone()), |left, right_type| {
+ match left {
+ None => None,
+ // TODO refactor a framework to do the data type coercion
Review Comment:
Maybe you could start by putting this code in
https://github.com/apache/arrow-datafusion/blob/master/datafusion/physical-expr/src/type_coercion.rs
or
https://github.com/apache/arrow-datafusion/blob/master/datafusion/expr/src/type_coercion.rs
🤔
##########
datafusion/physical-expr/src/expressions/in_list.rs:
##########
@@ -640,13 +785,62 @@ impl PhysicalExpr for InListExpr {
}
}
+type InListCastResult = (Arc<dyn PhysicalExpr>, Vec<Arc<dyn PhysicalExpr>>);
+
/// Creates a unary expression InList
pub fn in_list(
expr: Arc<dyn PhysicalExpr>,
list: Vec<Arc<dyn PhysicalExpr>>,
negated: &bool,
+ input_schema: &Schema,
) -> Result<Arc<dyn PhysicalExpr>> {
- Ok(Arc::new(InListExpr::new(expr, list, *negated)))
+ let (cast_expr, cast_list) = in_list_cast(expr, list, input_schema)?;
+ Ok(Arc::new(InListExpr::new(cast_expr, cast_list, *negated)))
+}
+
+fn in_list_cast(
+ expr: Arc<dyn PhysicalExpr>,
+ list: Vec<Arc<dyn PhysicalExpr>>,
+ input_schema: &Schema,
+) -> Result<InListCastResult> {
+ let expr_type = &expr.data_type(input_schema)?;
+ let list_types: Vec<DataType> = list
+ .iter()
+ .map(|list_expr| list_expr.data_type(input_schema).unwrap())
+ .collect();
+ // TODO in the arrow-rs, should support NULL type to Decimal Data type
+ // TODO support in the arrow-rs, NULL value cast to Decimal Value
+ let result_type = get_coerce_type(expr_type, &list_types);
Review Comment:
```suggestion
// https://github.com/apache/arrow-datafusion/issues/2759
let result_type = get_coerce_type(expr_type, &list_types);
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]