adriangb commented on code in PR #19557:
URL: https://github.com/apache/datafusion/pull/19557#discussion_r2653151616
##########
datafusion/datasource-parquet/src/sort.rs:
##########
@@ -101,67 +120,165 @@ pub fn reverse_row_selection(
#[cfg(test)]
mod tests {
- use super::*;
+ use crate::ParquetAccessPlan;
+ use crate::RowGroupAccess;
+ use crate::opener::PreparedAccessPlan;
use arrow::datatypes::{DataType, Field, Schema};
use bytes::Bytes;
use parquet::arrow::ArrowWriter;
+ use parquet::arrow::arrow_reader::{RowSelection, RowSelector};
use parquet::file::reader::FileReader;
use parquet::file::serialized_reader::SerializedFileReader;
use std::sync::Arc;
/// Helper function to create a ParquetMetaData with specified row group
sizes
/// by actually writing a parquet file in memory
- fn create_test_metadata(row_group_sizes: Vec<i64>) -> ParquetMetaData {
- // Create a simple schema
+ fn create_test_metadata(
+ row_group_sizes: Vec<i64>,
+ ) -> parquet::file::metadata::ParquetMetaData {
let schema = Arc::new(Schema::new(vec![Field::new("a",
DataType::Int32, false)]));
-
- // Create in-memory parquet file with the specified row groups
let mut buffer = Vec::new();
{
- let props = parquet::file::properties::WriterProperties::builder()
- .set_max_row_group_size(row_group_sizes[0] as usize)
- .build();
-
+ let props =
parquet::file::properties::WriterProperties::builder().build();
let mut writer =
ArrowWriter::try_new(&mut buffer, schema.clone(),
Some(props)).unwrap();
for &size in &row_group_sizes {
- // Create a batch with the specified number of rows
let array = arrow::array::Int32Array::from(vec![1; size as
usize]);
let batch = arrow::record_batch::RecordBatch::try_new(
schema.clone(),
vec![Arc::new(array)],
)
.unwrap();
writer.write(&batch).unwrap();
+ writer.flush().unwrap();
}
writer.close().unwrap();
}
- // Read back the metadata
let bytes = Bytes::from(buffer);
let reader = SerializedFileReader::new(bytes).unwrap();
reader.metadata().clone()
}
#[test]
- fn test_reverse_simple_selection() {
- // 3 row groups with 100 rows each
+ fn test_prepared_access_plan_reverse_simple() {
+ // Test: all row groups are scanned, no row selection
+ let metadata = create_test_metadata(vec![100, 100, 100]);
+
+ let access_plan = ParquetAccessPlan::new_all(3);
+ let rg_metadata = metadata.row_groups();
+
+ let prepared_plan =
+ PreparedAccessPlan::from_access_plan(access_plan, rg_metadata)
+ .expect("Failed to create PreparedAccessPlan");
+
+ // Verify original plan
+ assert_eq!(prepared_plan.row_group_indexes, vec![0, 1, 2]);
+
+ let reversed_plan = prepared_plan
+ .reverse(&metadata)
+ .expect("Failed to reverse PreparedAccessPlan");
+
+ // Verify row groups are reversed
+ assert_eq!(reversed_plan.row_group_indexes, vec![2, 1, 0]);
+
+ // If no selection originally, after reversal should still select all
rows
+ if let Some(selection) = reversed_plan.row_selection {
+ let total_selected: usize = selection
Review Comment:
Is this conditional necessary? Don't we know if there is/isn't a row
selection since we just created it? I.e. we can `.unwrap()`
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]