adriangb commented on code in PR #6319:
URL: https://github.com/apache/arrow-rs/pull/6319#discussion_r1735059501


##########
parquet/src/file/metadata/writer.rs:
##########
@@ -450,19 +451,47 @@ mod tests {
             true,
         )]));
 
-        let a: ArrayRef = Arc::new(Int32Array::from(vec![Some(1), None, 
Some(2)]));
+        // build row groups / pages that exercise different combinations of 
nulls and values
+        // note that below we set the row group and page sizes to 4 and 2 
respectively
+        // so that these "groupings" make sense
+        let a: ArrayRef = Arc::new(Int32Array::from(vec![
+            // a row group that has all values
+            Some(i32::MIN), Some(-1),
+            Some(1), Some(i32::MAX),
+            // a row group with a page of all nulls and a page of all values
+            None, None,
+            Some(2), Some(3),
+            // a row group that has all null pages
+            None, None,
+            None, None,
+            // a row group having 1 page with all values and 1 page with some 
nulls
+            Some(4), Some(5),
+            None, Some(6),
+            // a row group having 1 page with all nulls and 1 page with some 
nulls
+            None, None,
+            Some(7), None,
+            // a row group having all pages with some nulls
+            None, Some(8),
+            Some(9), None,
+        ]));
 
         let batch = RecordBatch::try_from_iter(vec![("a", a)]).unwrap();
 
-        let writer_props = match write_page_index {
+        let writer_props_builder = match write_page_index {
             true => WriterProperties::builder()
-                .set_statistics_enabled(EnabledStatistics::Page)
-                .build(),
+                .set_statistics_enabled(EnabledStatistics::Page),
             false => WriterProperties::builder()
-                .set_statistics_enabled(EnabledStatistics::Chunk)
-                .build(),
+                .set_statistics_enabled(EnabledStatistics::Chunk),
         };
 
+        // tune the size or pages to the data above
+        // to make sure we exercise code paths where all items in a page are 
null, etc.
+        let writer_props = writer_props_builder
+            .set_max_row_group_size(4)
+            .set_data_page_row_count_limit(2)
+            .set_write_batch_size(2)

Review Comment:
   I recognize that this is forcing things a bit. I leaned towards this 
approach because (1) it felt like a small step from where we are now and (2) 
I'm not that familiar with the lower level APIs that would be required if we 
wanted to build a `ParquetMetadata` by hand and use that for tests instead of 
relying on file writing to build one for us. I think a reasonable piece of 
feedback to this PR would be that now is the time to refactor the tests to 
build the `ParquetMetadata` by hand since we now care about getting pages of a 
certain size, etc.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to