suremarc commented on code in PR #9593:
URL: https://github.com/apache/datafusion/pull/9593#discussion_r1585938974


##########
datafusion/core/src/datasource/physical_plan/mod.rs:
##########
@@ -473,15 +468,281 @@ fn get_projected_output_ordering(
             // since rest of the orderings are violated
             break;
         }
+
         // do not push empty entries
         // otherwise we may have `Some(vec![])` at the output ordering.
-        if !new_ordering.is_empty() {
-            all_orderings.push(new_ordering);
+        if new_ordering.is_empty() {
+            continue;
+        }
+
+        // Check if any file groups are not sorted
+        if base_config.file_groups.iter().any(|group| {
+            if group.len() <= 1 {
+                // File groups with <= 1 files are always sorted
+                return false;
+            }
+
+            let statistics = match MinMaxStatistics::new_from_files(
+                &new_ordering,
+                projected_schema,
+                base_config.projection.as_deref(),
+                group,
+            ) {
+                Ok(statistics) => statistics,
+                Err(e) => {
+                    log::trace!("Error fetching statistics for file group: 
{e}");
+                    // we can't prove that it's ordered, so we have to reject 
it
+                    return true;
+                }
+            };
+
+            !statistics.is_sorted()
+        }) {
+            debug!(
+                "Skipping specified output ordering {:?}. \
+                Some file groups couldn't be determined to be sorted: {:?}",
+                base_config.output_ordering[0], base_config.file_groups
+            );
+            continue;
         }
+
+        all_orderings.push(new_ordering);
     }
     all_orderings
 }
 
+/// A normalized representation of file min/max statistics that allows for 
efficient sorting & comparison.
+/// The min/max values are ordered by [`Self::sort_order`].
+/// Furthermore, any columns that are reversed in the sort order have their 
min/max values swapped.
+pub(crate) struct MinMaxStatistics {
+    min_by_sort_order: arrow::row::Rows,
+    max_by_sort_order: arrow::row::Rows,
+    sort_order: Vec<PhysicalSortExpr>,
+}
+
+impl MinMaxStatistics {
+    #[allow(unused)]
+    fn sort_order(&self) -> &[PhysicalSortExpr] {
+        &self.sort_order
+    }
+
+    fn new_from_files<'a>(
+        projected_sort_order: &[PhysicalSortExpr], // Sort order with respect 
to projected schema
+        projected_schema: &SchemaRef,              // Projected schema
+        projection: Option<&[usize]>, // Indices of projection in full table 
schema (None = all columns)

Review Comment:
   I am not sure what you mean, but `projection` is what was used to produce 
`projected_schema`. It tells us what position the columns of `projected_schema` 
would be in the full schema. Does that make it more clear?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Reply via email to