alamb commented on code in PR #3422:
URL: https://github.com/apache/arrow-datafusion/pull/3422#discussion_r967635359
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -487,6 +492,7 @@ fn rewrite_expr_to_prunable(
column_expr: &Expr,
op: Operator,
scalar_expr: &Expr,
+ schema: DFSchema,
Review Comment:
Could you please update the list of rules in the docstring of this function
to include the new logic for casts?
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -552,6 +577,25 @@ fn is_compare_op(op: Operator) -> bool {
)
}
+// The pruning logic is based on the comparing the min/max bounds.
+// Must make sure the two type has order.
+// For example, casts from string to numbers is not correct.
+// Because the "13" is less than "3" with UTF8 comparison order.
+fn support_type_for_prune(from_type: &DataType, to_type: &DataType) -> bool {
+ // TODO: support other data type
Review Comment:
I think it would be good to file a follow on ticket listing the other types
(you did so in
https://github.com/apache/arrow-datafusion/issues/3414#issuecomment-1242659478)
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -429,11 +429,16 @@ impl<'a> PruningExpressionBuilder<'a> {
}
};
- let (column_expr, correct_operator, scalar_expr) =
- match rewrite_expr_to_prunable(column_expr, correct_operator,
scalar_expr) {
- Ok(ret) => ret,
- Err(e) => return Err(e),
- };
+ let df_schema = DFSchema::try_from(schema.clone())?;
+ let (column_expr, correct_operator, scalar_expr) = match
rewrite_expr_to_prunable(
+ column_expr,
+ correct_operator,
+ scalar_expr,
+ df_schema,
+ ) {
+ Ok(ret) => ret,
+ Err(e) => return Err(e),
+ };
Review Comment:
Very minor suggestion (can do this as a follow on too)
```suggestion
let (column_expr, correct_operator, scalar_expr) =
rewrite_expr_to_prunable(
column_expr,
correct_operator,
scalar_expr,
df_schema,
)?;
```
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1995,4 +2039,66 @@ mod tests {
let result = p.prune(&statistics).unwrap();
assert_eq!(result, expected_ret);
}
+
+ #[test]
+ fn test_rewrite_expr_to_prunable() {
+ let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+ let df_schema = DFSchema::try_from(schema).unwrap();
+ // column op lit
+ let left_input = col("a");
+ let right_input = lit(ScalarValue::Int32(Some(12)));
+ let (result_left, _, result_right) = rewrite_expr_to_prunable(
+ &left_input,
+ Operator::Eq,
+ &right_input,
+ df_schema.clone(),
+ )
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // cast op lit
+ let left_input = cast(col("a"), DataType::Decimal128(20, 3));
+ let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3));
+ let (result_left, _, result_right) = rewrite_expr_to_prunable(
+ &left_input,
+ Operator::Gt,
+ &right_input,
+ df_schema.clone(),
+ )
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // try_cast op lit
+ let left_input = try_cast(col("a"), DataType::Int64);
+ let right_input = lit(ScalarValue::Int64(Some(12)));
+ let (result_left, _, result_right) =
+ rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input,
df_schema)
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // TODO: add test for other case and op
Review Comment:
Rather than extending this test for other ops, I think we should add tests
which actually run the pruning expressions on values (see my other comments).
Would it be helpful for me to write some more tests? I can do so tomorrow
morning
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -552,6 +577,25 @@ fn is_compare_op(op: Operator) -> bool {
)
}
+// The pruning logic is based on the comparing the min/max bounds.
+// Must make sure the two type has order.
+// For example, casts from string to numbers is not correct.
Review Comment:
👍 thank you for these comments
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -495,21 +501,40 @@ fn rewrite_expr_to_prunable(
}
match column_expr {
+ // col op lit()
+ Expr::Column(_) => Ok((column_expr.clone(), op, scalar_expr.clone())),
// `cast(col) op lit()`
Expr::Cast { expr, data_type } => {
- let (left, op, right) = rewrite_expr_to_prunable(expr, op,
scalar_expr)?;
- Ok((cast(left, data_type.clone()), op, right))
+ let from_type = expr.get_type(&schema)?;
+ if support_type_for_prune(&from_type, data_type) {
+ let (left, op, right) =
+ rewrite_expr_to_prunable(expr, op, scalar_expr, schema)?;
+ Ok((cast(left, data_type.clone()), op, right))
+ } else {
+ return Err(DataFusionError::Plan(format!(
+ "Cast with from type {} to type {} is not supported",
+ from_type, data_type
+ )));
+ }
Review Comment:
We could probably avoid the duplication of the error generation by moving
the error return into `support_type_for_prune` like:
```suggestion
verify_support_type_for_prune(&from_type, data_type)?;
let (left, op, right) = rewrite_expr_to_prunable(expr, op,
scalar_expr)?;
Ok((cast(left, data_type.clone()), op, right))
```
This is not needed for correctness, just a code structure suggestion
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1995,4 +2039,66 @@ mod tests {
let result = p.prune(&statistics).unwrap();
assert_eq!(result, expected_ret);
}
+
+ #[test]
+ fn test_rewrite_expr_to_prunable() {
+ let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+ let df_schema = DFSchema::try_from(schema).unwrap();
+ // column op lit
+ let left_input = col("a");
+ let right_input = lit(ScalarValue::Int32(Some(12)));
+ let (result_left, _, result_right) = rewrite_expr_to_prunable(
+ &left_input,
+ Operator::Eq,
+ &right_input,
+ df_schema.clone(),
+ )
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // cast op lit
+ let left_input = cast(col("a"), DataType::Decimal128(20, 3));
+ let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3));
+ let (result_left, _, result_right) = rewrite_expr_to_prunable(
+ &left_input,
+ Operator::Gt,
+ &right_input,
+ df_schema.clone(),
+ )
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // try_cast op lit
+ let left_input = try_cast(col("a"), DataType::Int64);
+ let right_input = lit(ScalarValue::Int64(Some(12)));
+ let (result_left, _, result_right) =
+ rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input,
df_schema)
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // TODO: add test for other case and op
+ }
+
+ #[test]
+ fn test_rewrite_expr_to_prunable_error() {
+ // cast string value to numeric value
+ // this cast is not supported
+ let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+ let df_schema = DFSchema::try_from(schema).unwrap();
+ let left_input = cast(col("a"), DataType::Int64);
+ let right_input = lit(ScalarValue::Int64(Some(12)));
+ let result = rewrite_expr_to_prunable(
+ &left_input,
+ Operator::Gt,
+ &right_input,
+ df_schema.clone(),
+ );
+ assert!(result.is_err());
+ // other expr
+ let left_input = is_null(col("a"));
+ let right_input = lit(ScalarValue::Int64(Some(12)));
+ let result =
+ rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input,
df_schema);
+ assert!(result.is_err());
+ }
}
Review Comment:
I think we should add some tests that exercise the pruning code with actual
values too
For example, perhaps you can take `prune_int32_cast` from
https://github.com/apache/arrow-datafusion/pull/3378?
##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1995,4 +2039,66 @@ mod tests {
let result = p.prune(&statistics).unwrap();
assert_eq!(result, expected_ret);
}
+
+ #[test]
+ fn test_rewrite_expr_to_prunable() {
+ let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+ let df_schema = DFSchema::try_from(schema).unwrap();
+ // column op lit
+ let left_input = col("a");
+ let right_input = lit(ScalarValue::Int32(Some(12)));
+ let (result_left, _, result_right) = rewrite_expr_to_prunable(
+ &left_input,
+ Operator::Eq,
+ &right_input,
+ df_schema.clone(),
+ )
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // cast op lit
+ let left_input = cast(col("a"), DataType::Decimal128(20, 3));
+ let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3));
+ let (result_left, _, result_right) = rewrite_expr_to_prunable(
+ &left_input,
+ Operator::Gt,
+ &right_input,
+ df_schema.clone(),
+ )
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // try_cast op lit
+ let left_input = try_cast(col("a"), DataType::Int64);
+ let right_input = lit(ScalarValue::Int64(Some(12)));
+ let (result_left, _, result_right) =
+ rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input,
df_schema)
+ .unwrap();
+ assert_eq!(result_left, left_input);
+ assert_eq!(result_right, right_input);
+ // TODO: add test for other case and op
+ }
+
+ #[test]
+ fn test_rewrite_expr_to_prunable_error() {
+ // cast string value to numeric value
+ // this cast is not supported
Review Comment:
👍
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]