(arrow-rs) branch main updated: Skip page should also support skip dict page (#7409)

alamb Wed, 16 Apr 2025 10:12:32 -0700

This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-rs.git



The following commit(s) were added to refs/heads/main by this push:
     new b07d8ca48d Skip page should also support skip dict page (#7409)
b07d8ca48d is described below

commit b07d8ca48d375dc5c4b1119e0b4aa0bf3e255772
Author: Qi Zhu <[email protected]>
AuthorDate: Thu Apr 17 01:02:59 2025 +0800

    Skip page should also support skip dict page (#7409)
---
 parquet/src/file/serialized_reader.rs | 60 +++++++++++++++++++++++++++++++++--
 1 file changed, 58 insertions(+), 2 deletions(-)

diff --git a/parquet/src/file/serialized_reader.rs b/parquet/src/file/serialized_reader.rs
index ec2cd38c13..c42b00625d 100644
--- a/parquet/src/file/serialized_reader.rs
+++ b/parquet/src/file/serialized_reader.rs
@@ -981,8 +981,18 @@ impl<R: ChunkReader> PageReader for SerializedPageReader<R> {
                 }
                 Ok(())
             }
-            SerializedPageReaderState::Pages { page_locations, .. } => {
-                page_locations.pop_front();
+            SerializedPageReaderState::Pages {
+                page_locations,
+                dictionary_page,
+                ..
+            } => {
+                if dictionary_page.is_some() {
+                    // If a dictionary page exists, consume it by taking it (sets to None)
+                    dictionary_page.take();
+                } else {
+                    // If no dictionary page exists, simply pop the data page from page_locations
+                    page_locations.pop_front();
+                }
 
                 Ok(())
             }
@@ -1895,6 +1905,52 @@ mod tests {
         )
     }
 
+    #[test]
+    fn test_skip_next_page_with_dictionary_page() {
+        let test_file = get_test_file("alltypes_tiny_pages.parquet");
+        let builder = ReadOptionsBuilder::new();
+        // enable read page index
+        let options = builder.with_page_index().build();
+        let reader_result = SerializedFileReader::new_with_options(test_file, options);
+        let reader = reader_result.unwrap();
+
+        let row_group_reader = reader.get_row_group(0).unwrap();
+
+        // use 'string_col', Boundary order: UNORDERED, total 352 data pages and 1 dictionary page.
+        let mut column_page_reader = row_group_reader.get_column_page_reader(9).unwrap();
+
+        let mut vec = vec![];
+
+        // Step 1: Peek and ensure dictionary page is correctly identified
+        let meta = column_page_reader.peek_next_page().unwrap().unwrap();
+        assert!(meta.is_dict);
+
+        // Step 2: Call skip_next_page to skip the dictionary page
+        column_page_reader.skip_next_page().unwrap();
+
+        // Step 3: Read the next data page after skipping the dictionary page
+        let page = column_page_reader.get_next_page().unwrap().unwrap();
+        assert!(matches!(page.page_type(), basic::PageType::DATA_PAGE));
+
+        // Step 4: Continue reading remaining data pages and verify correctness
+        for _i in 0..351 {
+            // 352 total pages, 1 dictionary page is skipped
+            let meta = column_page_reader.peek_next_page().unwrap().unwrap();
+            assert!(!meta.is_dict); // Verify no dictionary page here
+            vec.push(meta);
+
+            let page = column_page_reader.get_next_page().unwrap().unwrap();
+            assert!(matches!(page.page_type(), basic::PageType::DATA_PAGE));
+        }
+
+        // Step 5: Check if all pages are read
+        assert!(column_page_reader.peek_next_page().unwrap().is_none());
+        assert!(column_page_reader.get_next_page().unwrap().is_none());
+
+        // Step 6: Verify the number of data pages read (should be 351 data pages)
+        assert_eq!(vec.len(), 351);
+    }
+
     #[test]
     fn test_skip_page_with_offset_index() {
         let test_file = get_test_file("alltypes_tiny_pages_plain.parquet");

(arrow-rs) branch main updated: Skip page should also support skip dict page (#7409)

Reply via email to