suremarc commented on code in PR #9593: URL: https://github.com/apache/datafusion/pull/9593#discussion_r1585938974
########## datafusion/core/src/datasource/physical_plan/mod.rs: ########## @@ -473,15 +468,281 @@ fn get_projected_output_ordering( // since rest of the orderings are violated break; } + // do not push empty entries // otherwise we may have `Some(vec![])` at the output ordering. - if !new_ordering.is_empty() { - all_orderings.push(new_ordering); + if new_ordering.is_empty() { + continue; + } + + // Check if any file groups are not sorted + if base_config.file_groups.iter().any(|group| { + if group.len() <= 1 { + // File groups with <= 1 files are always sorted + return false; + } + + let statistics = match MinMaxStatistics::new_from_files( + &new_ordering, + projected_schema, + base_config.projection.as_deref(), + group, + ) { + Ok(statistics) => statistics, + Err(e) => { + log::trace!("Error fetching statistics for file group: {e}"); + // we can't prove that it's ordered, so we have to reject it + return true; + } + }; + + !statistics.is_sorted() + }) { + debug!( + "Skipping specified output ordering {:?}. \ + Some file groups couldn't be determined to be sorted: {:?}", + base_config.output_ordering[0], base_config.file_groups + ); + continue; } + + all_orderings.push(new_ordering); } all_orderings } +/// A normalized representation of file min/max statistics that allows for efficient sorting & comparison. +/// The min/max values are ordered by [`Self::sort_order`]. +/// Furthermore, any columns that are reversed in the sort order have their min/max values swapped. +pub(crate) struct MinMaxStatistics { + min_by_sort_order: arrow::row::Rows, + max_by_sort_order: arrow::row::Rows, + sort_order: Vec<PhysicalSortExpr>, +} + +impl MinMaxStatistics { + #[allow(unused)] + fn sort_order(&self) -> &[PhysicalSortExpr] { + &self.sort_order + } + + fn new_from_files<'a>( + projected_sort_order: &[PhysicalSortExpr], // Sort order with respect to projected schema + projected_schema: &SchemaRef, // Projected schema + projection: Option<&[usize]>, // Indices of projection in full table schema (None = all columns) Review Comment: I am not sure what you mean, but `projection` is what was used to produce `projected_schema`. It tells us what position the columns of `projected_schema` would be in the full schema. Does that make it more clear? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org