This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git


The following commit(s) were added to refs/heads/master by this push:
     new d48010e571 Workaround for missing Parquet page indexes in 
`ParquetMetadaReader` (#6450)
d48010e571 is described below

commit d48010e5717a1fb54822ad3a8c41f084cff05c92
Author: Ed Seidl <etse...@users.noreply.github.com>
AuthorDate: Wed Sep 25 16:55:21 2024 -0700

    Workaround for missing Parquet page indexes in `ParquetMetadaReader` (#6450)
    
    * workaround for missing page indexes
    
    * remove empty line
    
    * Apply suggestions from code review
    
    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>
    
    * fmt
    
    ---------
    
    Co-authored-by: Andrew Lamb <and...@nerdnetworks.org>
---
 parquet/src/arrow/arrow_reader/mod.rs | 26 ++++++--------------------
 parquet/src/file/metadata/reader.rs   | 28 ++++++++++++++++++++++++----
 2 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/parquet/src/arrow/arrow_reader/mod.rs 
b/parquet/src/arrow/arrow_reader/mod.rs
index 253625117c..a109851f72 100644
--- a/parquet/src/arrow/arrow_reader/mod.rs
+++ b/parquet/src/arrow/arrow_reader/mod.rs
@@ -34,9 +34,7 @@ use 
crate::arrow::schema::{parquet_to_arrow_schema_and_fields, ParquetField};
 use crate::arrow::{parquet_to_arrow_field_levels, FieldLevels, ProjectionMask};
 use crate::column::page::{PageIterator, PageReader};
 use crate::errors::{ParquetError, Result};
-use crate::file::footer;
-use crate::file::metadata::ParquetMetaData;
-use crate::file::page_index::index_reader;
+use crate::file::metadata::{ParquetMetaData, ParquetMetaDataReader};
 use crate::file::reader::{ChunkReader, SerializedPageReader};
 use crate::schema::types::SchemaDescriptor;
 
@@ -382,23 +380,9 @@ impl ArrowReaderMetadata {
     /// `Self::metadata` is missing the page index, this function will attempt
     /// to load the page index by making an object store request.
     pub fn load<T: ChunkReader>(reader: &T, options: ArrowReaderOptions) -> 
Result<Self> {
-        let mut metadata = footer::parse_metadata(reader)?;
-        if options.page_index {
-            let column_index = metadata
-                .row_groups()
-                .iter()
-                .map(|rg| index_reader::read_columns_indexes(reader, 
rg.columns()))
-                .collect::<Result<Vec<_>>>()?;
-            metadata.set_column_index(Some(column_index));
-
-            let offset_index = metadata
-                .row_groups()
-                .iter()
-                .map(|rg| index_reader::read_offset_indexes(reader, 
rg.columns()))
-                .collect::<Result<Vec<_>>>()?;
-
-            metadata.set_offset_index(Some(offset_index))
-        }
+        let metadata = ParquetMetaDataReader::new()
+            .with_page_indexes(options.page_index)
+            .parse_and_finish(reader)?;
         Self::try_new(Arc::new(metadata), options)
     }
 
@@ -3498,6 +3482,8 @@ mod tests {
             .unwrap();
             // Although `Vec<Vec<PageLoacation>>` of each row group is empty,
             // we should read the file successfully.
+            // FIXME: this test will fail when metadata parsing returns `None` 
for missing page
+            // indexes. https://github.com/apache/arrow-rs/issues/6447
             assert!(builder.metadata().offset_index().unwrap()[0].is_empty());
             let reader = builder.build().unwrap();
             let batches = reader.collect::<Result<Vec<_>, _>>().unwrap();
diff --git a/parquet/src/file/metadata/reader.rs 
b/parquet/src/file/metadata/reader.rs
index 333e0fd6e9..4a5a1bc93c 100644
--- a/parquet/src/file/metadata/reader.rs
+++ b/parquet/src/file/metadata/reader.rs
@@ -201,6 +201,7 @@ impl ParquetMetaDataReader {
             // need for more data. This is not it's intended use. The plan is 
to add a NeedMoreData
             // value to the enum, but this would be a breaking change. This 
will be done as
             // 54.0.0 draws nearer.
+            // https://github.com/apache/arrow-rs/issues/6447
             Err(ParquetError::IndexOutOfBound(needed, _)) => {
                 // If reader is the same length as `file_size` then presumably 
there is no more to
                 // read, so return an EOF error.
@@ -247,13 +248,18 @@ impl ParquetMetaDataReader {
             ));
         }
 
-        // TODO(ets): what is the correct behavior for missing page indexes? 
MetadataLoader would
-        // leave them as `None`, while the parser in 
`index_reader::read_columns_indexes` returns a
-        // vector of empty vectors.
-        // I think it's best to leave them as `None`.
+        // FIXME: there are differing implementations in the case where page 
indexes are missing
+        // from the file. `MetadataLoader` will leave them as `None`, while 
the parser in
+        // `index_reader::read_columns_indexes` returns a vector of empty 
vectors.
+        // It is best for this function to replicate the latter behavior for 
now, but in a future
+        // breaking release, the two paths to retrieve metadata should be made 
consistent. Note that this is only
+        // an issue if the user requested page indexes, so there is no need to 
provide empty
+        // vectors in `try_parse_sized()`.
+        // https://github.com/apache/arrow-rs/issues/6447
 
         // Get bounds needed for page indexes (if any are present in the file).
         let Some(range) = self.range_for_page_index() else {
+            self.empty_page_indexes();
             return Ok(());
         };
 
@@ -412,6 +418,20 @@ impl ParquetMetaDataReader {
         Ok(())
     }
 
+    /// Set the column_index and offset_indexes to empty `Vec` for backwards 
compatibility
+    ///
+    /// See <https://github.com/apache/arrow-rs/pull/6451>  for details
+    fn empty_page_indexes(&mut self) {
+        let metadata = self.metadata.as_mut().unwrap();
+        let num_row_groups = metadata.num_row_groups();
+        if self.column_index {
+            metadata.set_column_index(Some(vec![vec![]; num_row_groups]));
+        }
+        if self.offset_index {
+            metadata.set_offset_index(Some(vec![vec![]; num_row_groups]));
+        }
+    }
+
     fn range_for_page_index(&self) -> Option<Range<usize>> {
         // sanity check
         self.metadata.as_ref()?;

Reply via email to