zhuqi-lucas commented on code in PR #19064: URL: https://github.com/apache/datafusion/pull/19064#discussion_r2616857849
########## datafusion/datasource-parquet/src/sort.rs: ########## @@ -0,0 +1,318 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Sort-related utilities for Parquet scanning + +use datafusion_common::Result; +use parquet::arrow::arrow_reader::{RowSelection, RowSelector}; +use parquet::file::metadata::ParquetMetaData; +use std::collections::HashMap; + +/// Reverse a row selection to match reversed row group order. +/// +/// When scanning row groups in reverse order, we need to adjust the row selection +/// to account for the new ordering. This function: +/// 1. Maps each selection to its corresponding row group +/// 2. Reverses the order of row groups +/// 3. 
Reconstructs the row selection for the new order +/// +/// # Arguments +/// * `row_selection` - Original row selection +/// * `parquet_metadata` - Metadata containing row group information +/// +/// # Returns +/// A new `RowSelection` adjusted for reversed row group order +pub fn reverse_row_selection( + row_selection: &RowSelection, + parquet_metadata: &ParquetMetaData, +) -> Result<RowSelection> { + let rg_metadata = parquet_metadata.row_groups(); + + // Build a mapping of row group index to its row range in the file + let mut rg_row_ranges: Vec<(usize, usize, usize)> = + Vec::with_capacity(rg_metadata.len()); + let mut current_row = 0; + for (rg_idx, rg) in rg_metadata.iter().enumerate() { + let num_rows = rg.num_rows() as usize; + rg_row_ranges.push((rg_idx, current_row, current_row + num_rows)); + current_row += num_rows; + } + + // Map selections to row groups + let mut rg_selections: HashMap<usize, Vec<RowSelector>> = HashMap::new(); + + let mut current_file_row = 0; + for selector in row_selection.iter() { + let selector_end = current_file_row + selector.row_count; + + // Find which row groups this selector spans + for (rg_idx, rg_start, rg_end) in rg_row_ranges.iter() { + if current_file_row < *rg_end && selector_end > *rg_start { + // This selector overlaps with this row group + let overlap_start = current_file_row.max(*rg_start); + let overlap_end = selector_end.min(*rg_end); + let overlap_count = overlap_end - overlap_start; + + if overlap_count > 0 { + let entry = rg_selections.entry(*rg_idx).or_default(); + if selector.skip { + entry.push(RowSelector::skip(overlap_count)); + } else { + entry.push(RowSelector::select(overlap_count)); + } + } + } + } + + current_file_row = selector_end; + } + + // Build new selection for reversed row group order + let mut reversed_selectors = Vec::new(); + for rg_idx in (0..rg_metadata.len()).rev() { + if let Some(selectors) = rg_selections.get(&rg_idx) { + reversed_selectors.extend(selectors.iter().cloned()); + } else 
{ + // No specific selection for this row group means select all Review Comment: Good catch, @alamb, thank you. I will add more tests to cover this. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
