alamb commented on a change in pull request #500:
URL: https://github.com/apache/arrow-datafusion/pull/500#discussion_r645121254
##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -383,6 +454,46 @@ fn rewrite_column_expr(
utils::rewrite_expression(&expr, &expressions)
}
+/// Given a column reference to `column_name`, returns a pruning
+/// expression in terms of the min and max that will evaluate to true
+/// if the column may contain values, and false if it definitely does not
+/// contain values
+fn build_single_column_expr(
+ column_name: &str,
+ schema: &Schema,
+ required_columns: &mut RequiredStatColumns,
+ is_not: bool, // if true, treat as !col
+) -> Option<Expr> {
+ use crate::logical_plan;
+ let field = schema.field_with_name(column_name).ok()?;
+
+ if matches!(field.data_type(), &DataType::Boolean) {
Review comment:
Here is the actual logic / rules
##########
File path: datafusion/src/physical_optimizer/pruning.rs
##########
@@ -953,4 +1111,92 @@ mod tests {
assert_eq!(result, expected);
}
+
+ /// Creates setup for boolean chunk pruning
+ ///
+ /// For predicate "b1" (boolean expr)
+ /// b1 [false, false] ==> no rows can pass (not keep)
+ /// b1 [false, true] ==> some rows could pass (must keep)
+ /// b1 [true, true] ==> all rows must pass (must keep)
+ /// b1 [NULL, NULL] ==> unknown (must keep)
+ /// b1 [false, NULL] ==> unknown (must keep)
+ ///
+ /// For predicate "!b1" (boolean expr)
+ /// b1 [false, false] ==> all rows pass (must keep)
+ /// b1 [false, true] ==> some rows could pass (must keep)
+ /// b1 [true, true] ==> no rows can pass (not keep)
+ /// b1 [NULL, NULL] ==> unknown (must keep)
+ /// b1 [false, NULL] ==> unknown (must keep)
+ fn bool_setup() -> (SchemaRef, TestStatistics, Vec<bool>, Vec<bool>) {
+ let schema =
+ Arc::new(Schema::new(vec![Field::new("b1", DataType::Boolean, true)]));
+
+ let statistics = TestStatistics::new().with(
+ "b1",
+ ContainerStats::new_bool(
+ vec![Some(false), Some(false), Some(true), None, Some(false)], // min
+ vec![Some(false), Some(true), Some(true), None, None], // max
+ ),
+ );
+ let expected_true = vec![false, true, true, true, true];
+ let expected_false = vec![true, true, false, true, true];
+
+ (schema, statistics, expected_true, expected_false)
+ }
+
+ #[test]
+ fn prune_bool_column() {
+ let (schema, statistics, expected_true, _) = bool_setup();
+
+ // b1
+ let expr = col("b1");
+ let p = PruningPredicate::try_new(&expr, schema).unwrap();
+ let result = p.prune(&statistics).unwrap();
+ assert_eq!(result, expected_true);
+ }
+
+ #[test]
+ fn prune_bool_not_column() {
+ let (schema, statistics, _, expected_false) = bool_setup();
+
+ // !b1
+ let expr = col("b1").not();
+ let p = PruningPredicate::try_new(&expr, schema).unwrap();
+ let result = p.prune(&statistics).unwrap();
+ assert_eq!(result, expected_false);
+ }
+
+ #[test]
+ fn prune_bool_column_eq_true() {
+ let (schema, statistics, _, _) = bool_setup();
+
+ // b1 = true
+ let expr = col("b1").eq(lit(true));
+ let p = PruningPredicate::try_new(&expr, schema).unwrap();
+ let result = p.prune(&statistics).unwrap_err();
+ assert!(
+ result.to_string().contains(
+ "Data type Boolean not supported for scalar operation on dyn array"
Review comment:
These aren't great messages, but they are what happens on master today,
and I figured I would document them for posterity (and maybe inspire people to
help fix them).
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
users@infra.apache.org