rluvaton commented on code in PR #9357:
URL: https://github.com/apache/arrow-rs/pull/9357#discussion_r2770305033
##########
parquet/src/arrow/arrow_writer/mod.rs:
##########
@@ -4518,4 +4575,185 @@ mod tests {
assert_eq!(get_dict_page_size(col0_meta), 1024 * 1024);
assert_eq!(get_dict_page_size(col1_meta), 1024 * 1024 * 4);
}
+
+ /// Helper to create a test batch with the given number of rows.
+ /// Each row is approximately 4 bytes (one i32).
+ fn create_test_batch(num_rows: usize) -> RecordBatch {
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "int",
+ ArrowDataType::Int32,
+ false,
+ )]));
+ let array = Int32Array::from((0..num_rows as i32).collect::<Vec<_>>());
+ RecordBatch::try_new(schema, vec![Arc::new(array)]).unwrap()
+ }
+
+ #[test]
+ fn test_row_group_limit_none_writes_single_row_group() {
+ // When both limits are None, all data should go into a single row group
+ let batch = create_test_batch(1000);
+
+ let props = WriterProperties::builder()
+ .set_max_row_group_row_count(None)
+ .set_max_row_group_bytes(None)
+ .build();
+
+ let file = tempfile::tempfile().unwrap();
+ let mut writer =
+ ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), Some(props)).unwrap();
+
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+
+ let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+ assert_eq!(
+ &row_group_sizes(builder.metadata()),
+ &[1000],
+ "With no limits, all rows should be in a single row group"
+ );
+ }
+
+ #[test]
+ fn test_row_group_limit_rows_only() {
+ // When only max_row_group_size is set, respect the row limit
+ let batch = create_test_batch(1000);
+
+ let props = WriterProperties::builder()
+ .set_max_row_group_size(300)
+ .set_max_row_group_bytes(None)
+ .build();
+
+ let file = tempfile::tempfile().unwrap();
+ let mut writer =
+ ArrowWriter::try_new(file.try_clone().unwrap(), batch.schema(), Some(props)).unwrap();
+
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+
+ let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+ assert_eq!(
+ &row_group_sizes(builder.metadata()),
+ &[300, 300, 300, 100],
+ "Row groups should be split by row count"
+ );
+ }
+
+ #[test]
+ fn test_row_group_limit_bytes_only() {
+ // When only max_row_group_bytes is set, respect the byte limit
+ // Create batches with string data for more predictable byte sizes
+ // Write in multiple small batches so byte-based splitting can work
+ // (first batch establishes the avg row size, subsequent batches are split)
+ let schema = Arc::new(Schema::new(vec![Field::new(
+ "str",
+ ArrowDataType::Utf8,
+ false,
+ )]));
+
+ // Set byte limit to approximately fit ~30 rows worth of data (~100 bytes each)
+ let props = WriterProperties::builder()
+ .set_max_row_group_row_count(None)
+ .set_max_row_group_bytes(Some(3500))
+ .build();
+
+ let file = tempfile::tempfile().unwrap();
+ let mut writer =
+ ArrowWriter::try_new(file.try_clone().unwrap(), schema.clone(), Some(props)).unwrap();
+
+ // Write 10 batches of 10 rows each (100 rows total)
+ // Each string is ~100 bytes
+ for batch_idx in 0..10 {
+ let strings: Vec<String> = (0..10)
+ .map(|i| format!("{:0>100}", batch_idx * 10 + i))
+ .collect();
+ let array = StringArray::from(strings);
+ let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(array)]).unwrap();
+ writer.write(&batch).unwrap();
+ }
+ writer.close().unwrap();
+
+ let builder = ParquetRecordBatchReaderBuilder::try_new(file).unwrap();
+ let sizes = row_group_sizes(builder.metadata());
+
+ assert!(
+ sizes.len() > 1,
+ "Should have multiple row groups due to byte limit, got {sizes:?}",
+ );
+
+ let total_rows: i64 = sizes.iter().sum();
+ assert_eq!(total_rows, 100, "Total rows should be preserved");
+ }
+
+ #[test]
+ fn test_row_group_limit_both_row_wins() {
Review Comment:
You don't need statistics; you can calculate it from the data types you need to encode.
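
For instance, something along these lines (a rough sketch only, not code from this PR; `approx_row_bytes` is a made-up name, and it assumes the `Schema` / `ArrowDataType` aliases already imported in this test module):

```rust
/// Hypothetical helper: derive the approximate bytes per row from the
/// schema's fixed-width types, so a test can pick a byte limit without
/// reading any statistics back from the writer.
fn approx_row_bytes(schema: &Schema) -> usize {
    schema
        .fields()
        .iter()
        .map(|f| match f.data_type() {
            ArrowDataType::Int32 => std::mem::size_of::<i32>(), // 4 bytes per value
            ArrowDataType::Int64 => std::mem::size_of::<i64>(), // 8 bytes per value
            // Variable-width types (e.g. Utf8) would need the actual value
            // lengths instead of a fixed width.
            other => unimplemented!("width not needed for {other:?} in these tests"),
        })
        .sum()
}

// e.g. budget roughly 300 Int32 rows per row group:
// .set_max_row_group_bytes(Some(300 * approx_row_bytes(&schema)))
```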