This is an automated email from the ASF dual-hosted git repository.
alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git
The following commit(s) were added to refs/heads/main by this push:
new b07d8ca48d Skip page should also support skip dict page (#7409)
b07d8ca48d is described below
commit b07d8ca48d375dc5c4b1119e0b4aa0bf3e255772
Author: Qi Zhu <[email protected]>
AuthorDate: Thu Apr 17 01:02:59 2025 +0800
Skip page should also support skip dict page (#7409)
---
parquet/src/file/serialized_reader.rs | 60 +++++++++++++++++++++++++++++++++--
1 file changed, 58 insertions(+), 2 deletions(-)
diff --git a/parquet/src/file/serialized_reader.rs
b/parquet/src/file/serialized_reader.rs
index ec2cd38c13..c42b00625d 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -981,8 +981,18 @@ impl<R: ChunkReader> PageReader for
SerializedPageReader<R> {
}
Ok(())
}
- SerializedPageReaderState::Pages { page_locations, .. } => {
- page_locations.pop_front();
+ SerializedPageReaderState::Pages {
+ page_locations,
+ dictionary_page,
+ ..
+ } => {
+ if dictionary_page.is_some() {
+ // If a dictionary page exists, consume it by taking it
(sets to None)
+ dictionary_page.take();
+ } else {
+ // If no dictionary page exists, simply pop the data page
from page_locations
+ page_locations.pop_front();
+ }
Ok(())
}
@@ -1895,6 +1905,52 @@ mod tests {
)
}
+ #[test]
+ fn test_skip_next_page_with_dictionary_page() {
+ let test_file = get_test_file("alltypes_tiny_pages.parquet");
+ let builder = ReadOptionsBuilder::new();
+ // Enable reading of the page index
+ let options = builder.with_page_index().build();
+ let reader_result = SerializedFileReader::new_with_options(test_file,
options);
+ let reader = reader_result.unwrap();
+
+ let row_group_reader = reader.get_row_group(0).unwrap();
+
+ // Column 9 is 'string_col' (boundary order: UNORDERED): 352 data pages
and 1 dictionary page.
+ let mut column_page_reader =
row_group_reader.get_column_page_reader(9).unwrap();
+
+ let mut vec = vec![];
+
+ // Step 1: Peek at the first page and confirm it is reported as a dictionary page
+ let meta = column_page_reader.peek_next_page().unwrap().unwrap();
+ assert!(meta.is_dict);
+
+ // Step 2: Call skip_next_page, which must consume the dictionary page (not a data page)
+ column_page_reader.skip_next_page().unwrap();
+
+ // Step 3: Read the first data page (1 of 352) now that the dictionary page is skipped
+ let page = column_page_reader.get_next_page().unwrap().unwrap();
+ assert!(matches!(page.page_type(), basic::PageType::DATA_PAGE));
+
+ // Step 4: Peek and then read each of the remaining 351 data pages
+ for _i in 0..351 {
+ // 352 data pages in total; the first one was already read in Step 3
+ let meta = column_page_reader.peek_next_page().unwrap().unwrap();
+ assert!(!meta.is_dict); // Verify no dictionary page here
+ vec.push(meta);
+
+ let page = column_page_reader.get_next_page().unwrap().unwrap();
+ assert!(matches!(page.page_type(), basic::PageType::DATA_PAGE));
+ }
+
+ // Step 5: Confirm the reader is exhausted (both peek and get return None)
+ assert!(column_page_reader.peek_next_page().unwrap().is_none());
+ assert!(column_page_reader.get_next_page().unwrap().is_none());
+
+ // Step 6: Verify the count of metadata peeked in Step 4 (should be 351 data
pages)
+ assert_eq!(vec.len(), 351);
+ }
+
#[test]
fn test_skip_page_with_offset_index() {
let test_file = get_test_file("alltypes_tiny_pages_plain.parquet");