adriangb commented on code in PR #16424: URL: https://github.com/apache/datafusion/pull/16424#discussion_r2157631212
########## datafusion/datasource-parquet/src/opener.rs: ########## @@ -524,6 +498,99 @@ fn should_enable_page_index( .unwrap_or(false) } +/// Prune based on partition values and file-level statistics. +pub struct FilePruner { + predicate_generation: u64, + predicate: Arc<dyn PhysicalExpr>, + /// Schema used for pruning, which combines the file schema and partition fields. + /// Partition fields are always at the end, as they are during scans. + pruning_schema: Arc<Schema>, + file: PartitionedFile, + partition_fields: Vec<FieldRef>, + predicate_creation_errors: Count, +} + +impl FilePruner { + pub fn new( + predicate: Arc<dyn PhysicalExpr>, + logical_file_schema: &SchemaRef, + partition_fields: Vec<FieldRef>, + file: PartitionedFile, + predicate_creation_errors: Count, + ) -> Result<Self> { + // Build a pruning schema that combines the file fields and partition fields. + // Partition fields are always at the end. + let pruning_schema = Arc::new( + Schema::new( + logical_file_schema + .fields() + .iter() + .cloned() + .chain(partition_fields.iter().cloned()) + .collect_vec(), + ) + .with_metadata(logical_file_schema.metadata().clone()), + ); + Ok(Self { + // Initialize the predicate generation to 0 so that the first time we call `should_prune` we actually check the predicate Review Comment: ```suggestion // Initialize the predicate generation to 0 so that the first time we call `should_prune` we actually check the predicate // This also means that no pruning will happen unless there is a dynamic filter present. // See [`snapshot_generation`] for more info. ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org