alamb commented on code in PR #10950: URL: https://github.com/apache/datafusion/pull/10950#discussion_r1642804890
########## datafusion/core/benches/parquet_statistic.rs: ########## @@ -82,11 +103,18 @@ fn create_parquet_file(dtype: TestTypes, row_groups: usize) -> NamedTempFile { for _ in 0..row_groups { let batch = match dtype { TestTypes::UInt64 => make_uint64_batch(), + TestTypes::Int64 => make_int64_batch(), TestTypes::F64 => make_f64_batch(), TestTypes::String => make_string_batch(), TestTypes::Dictionary => make_dict_batch(), }; - writer.write(&batch).unwrap(); + if data_page_row_count_limit.is_some() { Review Comment: Perhaps we can add a comment here explaining why we write a batch at a time (basically because the page limit is only checked on record batch boundaries) ########## datafusion/core/benches/parquet_statistic.rs: ########## @@ -70,7 +84,14 @@ fn create_parquet_file(dtype: TestTypes, row_groups: usize) -> NamedTempFile { )])), }; - let props = WriterProperties::builder().build(); + let mut props = WriterProperties::builder(); Review Comment: 🤔 while looking at this code, it seems like the benchmark for row groups doesn't actually make multiple row groups (instead it encodes `row_groups` batches) We should probably also set https://docs.rs/parquet/latest/parquet/file/properties/struct.WriterProperties.html#method.max_row_group_size as well 🤔 ########## datafusion/core/benches/parquet_statistic.rs: ########## @@ -150,37 +195,93 @@ fn make_dict_batch() -> RecordBatch { fn criterion_benchmark(c: &mut Criterion) { let row_groups = 100; use TestTypes::*; - let types = vec![UInt64, F64, String, Dictionary]; + let types = vec![Int64, UInt64, F64, String, Dictionary]; + let data_page_row_count_limits = vec![None, Some(1)]; for dtype in types { - let file = create_parquet_file(dtype.clone(), row_groups); - let file = file.reopen().unwrap(); - let reader = ArrowReaderBuilder::try_new(file).unwrap(); - let metadata = reader.metadata(); - let row_groups = metadata.row_groups(); - - let mut group = - c.benchmark_group(format!("Extract statistics for {}", 
dtype.clone())); - group.bench_function( - BenchmarkId::new("extract_statistics", dtype.clone()), - |b| { - b.iter(|| { - let converter = StatisticsConverter::try_new( - "col", - reader.schema(), - reader.parquet_schema(), - ) - .unwrap(); - - let _ = converter.row_group_mins(row_groups.iter()).unwrap(); - let _ = converter.row_group_maxes(row_groups.iter()).unwrap(); - let _ = converter.row_group_null_counts(row_groups.iter()).unwrap(); - let _ = StatisticsConverter::row_group_row_counts(row_groups.iter()) + for data_page_row_count_limit in &data_page_row_count_limits { + let file = + create_parquet_file(dtype.clone(), row_groups, data_page_row_count_limit); + let file = file.reopen().unwrap(); + let options = ArrowReaderOptions::new().with_page_index(true); + let reader = ArrowReaderBuilder::try_new_with_options(file, options).unwrap(); + let metadata = reader.metadata(); + let row_groups = metadata.row_groups(); + let row_group_indices = row_groups Review Comment: I think we could get the correct indices using the following potentially shorter code: ```rust let row_group_indices: Vec<_> = (0..row_groups.len()).collect(); ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org For additional commands, e-mail: github-h...@datafusion.apache.org