alamb commented on code in PR #3422:
URL: https://github.com/apache/arrow-datafusion/pull/3422#discussion_r967635359


##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -487,6 +492,7 @@ fn rewrite_expr_to_prunable(
     column_expr: &Expr,
     op: Operator,
     scalar_expr: &Expr,
+    schema: DFSchema,

Review Comment:
   Could you please update the list of rules in the docstring of this function 
to include the new logic for casts?



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -552,6 +577,25 @@ fn is_compare_op(op: Operator) -> bool {
     )
 }
 
+// The pruning logic is based on the comparing the min/max bounds.
+// Must make sure the two type has order.
+// For example, casts from string to numbers is not correct.
+// Because the "13" is less than "3" with UTF8 comparison order.
+fn support_type_for_prune(from_type: &DataType, to_type: &DataType) -> bool {
+    // TODO: support other data type

Review Comment:
   I think it would be good to file a follow on ticket listing the other types 
(you did so in 
https://github.com/apache/arrow-datafusion/issues/3414#issuecomment-1242659478)



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -429,11 +429,16 @@ impl<'a> PruningExpressionBuilder<'a> {
                 }
             };
 
-        let (column_expr, correct_operator, scalar_expr) =
-            match rewrite_expr_to_prunable(column_expr, correct_operator, 
scalar_expr) {
-                Ok(ret) => ret,
-                Err(e) => return Err(e),
-            };
+        let df_schema = DFSchema::try_from(schema.clone())?;
+        let (column_expr, correct_operator, scalar_expr) = match 
rewrite_expr_to_prunable(
+            column_expr,
+            correct_operator,
+            scalar_expr,
+            df_schema,
+        ) {
+            Ok(ret) => ret,
+            Err(e) => return Err(e),
+        };

Review Comment:
   Very minor suggestion (can do this as a follow on too)
   
   ```suggestion
           let (column_expr, correct_operator, scalar_expr) = 
rewrite_expr_to_prunable(
               column_expr,
               correct_operator,
               scalar_expr,
               df_schema,
           )?;
   ```



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1995,4 +2039,66 @@ mod tests {
         let result = p.prune(&statistics).unwrap();
         assert_eq!(result, expected_ret);
     }
+
+    #[test]
+    fn test_rewrite_expr_to_prunable() {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let df_schema = DFSchema::try_from(schema).unwrap();
+        // column op lit
+        let left_input = col("a");
+        let right_input = lit(ScalarValue::Int32(Some(12)));
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Eq,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // cast op lit
+        let left_input = cast(col("a"), DataType::Decimal128(20, 3));
+        let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3));
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Gt,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // try_cast op lit
+        let left_input = try_cast(col("a"), DataType::Int64);
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let (result_left, _, result_right) =
+            rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, 
df_schema)
+                .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // TODO: add test for other case and op

Review Comment:
   Rather than extending this test for other ops, I think we should add tests 
which actually run the pruning expressions on values (see my other comments).
   
   Would it be helpful for me to write some more tests? I can do so tomorrow 
morning 



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -552,6 +577,25 @@ fn is_compare_op(op: Operator) -> bool {
     )
 }
 
+// The pruning logic is based on the comparing the min/max bounds.
+// Must make sure the two type has order.
+// For example, casts from string to numbers is not correct.

Review Comment:
   👍  thank you for these comments



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -495,21 +501,40 @@ fn rewrite_expr_to_prunable(
     }
 
     match column_expr {
+        // col op lit()
+        Expr::Column(_) => Ok((column_expr.clone(), op, scalar_expr.clone())),
         // `cast(col) op lit()`
         Expr::Cast { expr, data_type } => {
-            let (left, op, right) = rewrite_expr_to_prunable(expr, op, 
scalar_expr)?;
-            Ok((cast(left, data_type.clone()), op, right))
+            let from_type = expr.get_type(&schema)?;
+            if support_type_for_prune(&from_type, data_type) {
+                let (left, op, right) =
+                    rewrite_expr_to_prunable(expr, op, scalar_expr, schema)?;
+                Ok((cast(left, data_type.clone()), op, right))
+            } else {
+                return Err(DataFusionError::Plan(format!(
+                    "Cast with from type {} to type {} is not supported",
+                    from_type, data_type
+                )));
+            }

Review Comment:
   We could probably avoid the duplication of the error generation by moving 
the error return into `support_type_for_prune` like:
   
   ```suggestion
               verify_support_type_for_prune(&from_type, data_type)?;
               let (left, op, right) = rewrite_expr_to_prunable(expr, op, 
scalar_expr)?;
               Ok((cast(left, data_type.clone()), op, right))
   ```
   
   This is not needed for correctness, just a code structure suggestion
   
   



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1995,4 +2039,66 @@ mod tests {
         let result = p.prune(&statistics).unwrap();
         assert_eq!(result, expected_ret);
     }
+
+    #[test]
+    fn test_rewrite_expr_to_prunable() {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let df_schema = DFSchema::try_from(schema).unwrap();
+        // column op lit
+        let left_input = col("a");
+        let right_input = lit(ScalarValue::Int32(Some(12)));
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Eq,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // cast op lit
+        let left_input = cast(col("a"), DataType::Decimal128(20, 3));
+        let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3));
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Gt,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // try_cast op lit
+        let left_input = try_cast(col("a"), DataType::Int64);
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let (result_left, _, result_right) =
+            rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, 
df_schema)
+                .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // TODO: add test for other case and op
+    }
+
+    #[test]
+    fn test_rewrite_expr_to_prunable_error() {
+        // cast string value to numeric value
+        // this cast is not supported
+        let schema = Schema::new(vec![Field::new("a", DataType::Utf8, true)]);
+        let df_schema = DFSchema::try_from(schema).unwrap();
+        let left_input = cast(col("a"), DataType::Int64);
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let result = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Gt,
+            &right_input,
+            df_schema.clone(),
+        );
+        assert!(result.is_err());
+        // other expr
+        let left_input = is_null(col("a"));
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let result =
+            rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, 
df_schema);
+        assert!(result.is_err());
+    }
 }

Review Comment:
   I think we should add some tests that exercise the pruning code with actual 
values too
   
   For example, perhaps you can take `prune_int32_cast` from 
https://github.com/apache/arrow-datafusion/pull/3378?



##########
datafusion/core/src/physical_optimizer/pruning.rs:
##########
@@ -1995,4 +2039,66 @@ mod tests {
         let result = p.prune(&statistics).unwrap();
         assert_eq!(result, expected_ret);
     }
+
+    #[test]
+    fn test_rewrite_expr_to_prunable() {
+        let schema = Schema::new(vec![Field::new("a", DataType::Int32, true)]);
+        let df_schema = DFSchema::try_from(schema).unwrap();
+        // column op lit
+        let left_input = col("a");
+        let right_input = lit(ScalarValue::Int32(Some(12)));
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Eq,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // cast op lit
+        let left_input = cast(col("a"), DataType::Decimal128(20, 3));
+        let right_input = lit(ScalarValue::Decimal128(Some(12), 20, 3));
+        let (result_left, _, result_right) = rewrite_expr_to_prunable(
+            &left_input,
+            Operator::Gt,
+            &right_input,
+            df_schema.clone(),
+        )
+        .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // try_cast op lit
+        let left_input = try_cast(col("a"), DataType::Int64);
+        let right_input = lit(ScalarValue::Int64(Some(12)));
+        let (result_left, _, result_right) =
+            rewrite_expr_to_prunable(&left_input, Operator::Gt, &right_input, 
df_schema)
+                .unwrap();
+        assert_eq!(result_left, left_input);
+        assert_eq!(result_right, right_input);
+        // TODO: add test for other case and op
+    }
+
+    #[test]
+    fn test_rewrite_expr_to_prunable_error() {
+        // cast string value to numeric value
+        // this cast is not supported

Review Comment:
   👍 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to