alamb commented on code in PR #9972:
URL: https://github.com/apache/arrow-rs/pull/9972#discussion_r3357602241
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -4904,6 +4904,124 @@ mod tests {
assert_eq!(get_dict_page_size(col1_meta), 1024 * 1024 * 4);
}
+ #[test]
+ fn test_arrow_writer_granular_mode_roundtrip() {
+ // Granular mode subdivides chunks and writes more pages than
+ // `main`. Make sure the data we write back is bit-identical to
Review Comment:
Comments comparing to `main` are confusing, I think, as once this PR merges it
will become `main`.
##########
parquet/tests/arrow_writer_layout.rs:
##########
@@ -599,3 +607,344 @@ fn test_per_column_data_page_size_limit() {
assert_eq!(col_a_page_count, 16);
assert_eq!(col_b_page_count, 1);
}
+
+#[test]
+fn test_fixed_size_binary() {
+ // FixedSizeBinary values larger than the data page byte limit.
+ let value_size = 1024usize;
+ let num_rows = 64usize;
+ let values: Vec<u8> = (0..num_rows)
+ .flat_map(|i| vec![i as u8; value_size])
+ .collect();
+ let array =
+ Arc::new(FixedSizeBinaryArray::try_new(value_size as i32,
values.into(), None).unwrap())
+ as _;
+ let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
+
+ let props = WriterProperties::builder()
+ .set_dictionary_enabled(false)
+ .set_data_page_size_limit(4096)
+ .set_write_page_header_statistics(true)
+ .build();
+
+ do_test(LayoutTest {
+ props,
+ batches: vec![batch],
+ layout: Layout {
+ row_groups: vec![RowGroup {
+ columns: vec![ColumnChunk {
+ // 12 pages of 5 values (5 * 1024 = 5120 B, the boundary
+ // value pushes each page just past the 4096 B limit) plus
+ // a final page with the remaining 4 values.
+ pages: (0..12)
+ .map(|_| Page {
+ rows: 5,
+ page_header_size: 157,
+ compressed_size: 5120,
+ encoding: Encoding::PLAIN,
+ page_type: PageType::DATA_PAGE,
+ })
+ .chain(std::iter::once(Page {
+ rows: 4,
+ page_header_size: 157,
+ compressed_size: 4096,
+ encoding: Encoding::PLAIN,
+ page_type: PageType::DATA_PAGE,
+ }))
+ .collect(),
+ dictionary_page: None,
+ }],
+ }],
+ },
+ });
+}
+
+#[test]
+fn test_dictionary() {
+ // Arrow `DictionaryArray<Int32, Utf8>` input.
+ let num_rows = 2000;
+ let dict_values = StringArray::from_iter_values(["alpha", "beta", "gamma",
"delta"]);
+ let keys = Int32Array::from_iter_values((0..num_rows).map(|i| i % 4));
+ let array =
+ Arc::new(DictionaryArray::<Int32Type>::try_new(keys,
Arc::new(dict_values)).unwrap()) as _;
+ let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
+
+ let props = WriterProperties::builder()
+ .set_dictionary_enabled(true)
+ .set_dictionary_page_size_limit(1000)
+ .set_data_page_size_limit(1000)
+ .set_write_batch_size(10)
+ .set_write_page_header_statistics(true)
+ .build();
+
+ do_test(LayoutTest {
+ props,
+ batches: vec![batch],
+ layout: Layout {
+ row_groups: vec![RowGroup {
+ columns: vec![ColumnChunk {
+ pages: vec![Page {
+ rows: 2000,
+ page_header_size: 40,
+ compressed_size: 505,
+ encoding: Encoding::RLE_DICTIONARY,
+ page_type: PageType::DATA_PAGE,
+ }],
+ dictionary_page: Some(Page {
+ rows: 4,
+ page_header_size: 38,
+ compressed_size: 35,
+ encoding: Encoding::PLAIN,
+ page_type: PageType::DICTIONARY_PAGE,
+ }),
+ }],
+ }],
+ },
+ });
+}
+
+#[test]
+fn test_large_string() {
+ // Large `Utf8` values (64 KiB each) with a 16 KiB data page limit.
+ //
+ // Each value already exceeds the page byte budget, so the byte-budget
+ // chunker in `ByteArrayEncoder` (the offsets-buffer scan in
+ // `count_within_budget_offsets`) cuts one value per page instead of
+ // buffering the whole ~2 MiB column into a single page. This drives the
+ // real `ArrowWriter` user path; the lower-level column writer is covered
+ // by `test_column_writer_caps_page_size_for_large_byte_array_values`.
+ let value_size = 64 * 1024;
+ let strings: Vec<String> = (0..32).map(|_|
"x".repeat(value_size)).collect();
+ let array = Arc::new(StringArray::from(strings)) as _;
+ let batch = RecordBatch::try_from_iter([("col", array)]).unwrap();
+ let props = WriterProperties::builder()
+ .set_dictionary_enabled(false)
+ .set_data_page_size_limit(16 * 1024)
+ // Disable statistics so page headers stay small and the layout is
+ // determined purely by the page-splitting logic under test.
+ .set_statistics_enabled(EnabledStatistics::None)
+ .build();
+
+ do_test(LayoutTest {
+ props,
+ batches: vec![batch],
+ layout: Layout {
+ row_groups: vec![RowGroup {
+ columns: vec![ColumnChunk {
+ // One 64 KiB value per page (4-byte length prefix +
value).
Review Comment:
nice
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]