alamb commented on code in PR #10950:
URL: https://github.com/apache/datafusion/pull/10950#discussion_r1642804890


##########
datafusion/core/benches/parquet_statistic.rs:
##########
@@ -82,11 +103,18 @@ fn create_parquet_file(dtype: TestTypes, row_groups: usize) -> NamedTempFile {
     for _ in 0..row_groups {
         let batch = match dtype {
             TestTypes::UInt64 => make_uint64_batch(),
+            TestTypes::Int64 => make_int64_batch(),
             TestTypes::F64 => make_f64_batch(),
             TestTypes::String => make_string_batch(),
             TestTypes::Dictionary => make_dict_batch(),
         };
-        writer.write(&batch).unwrap();
+        if data_page_row_count_limit.is_some() {

Review Comment:
   Perhaps we can add a comment here explaining why we write a batch at a time (basically because the page limit is only checked on record batch boundaries).
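   To make that concrete, here is a minimal sketch (not the PR's actual diff) of such a write loop with the explanatory comment; the `write_batch` helper and its wiring are assumptions based on the benchmark's setup:
   
   ```rust
   use arrow::record_batch::RecordBatch;
   use parquet::arrow::ArrowWriter;
   use std::fs::File;
   
   // Hypothetical helper, not the PR's code: `data_page_row_count_limit`
   // mirrors the benchmark's option; everything else is assumed context.
   fn write_batch(
       writer: &mut ArrowWriter<File>,
       batch: &RecordBatch,
       data_page_row_count_limit: Option<usize>,
   ) -> parquet::errors::Result<()> {
       if data_page_row_count_limit.is_some() {
           // The writer only checks the data page row count limit on record
           // batch boundaries, so write one row per record batch to actually
           // get many small pages.
           for i in 0..batch.num_rows() {
               writer.write(&batch.slice(i, 1))?;
           }
       } else {
           // No page limit in play: write the whole batch at once.
           writer.write(batch)?;
       }
       Ok(())
   }
   ```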



##########
datafusion/core/benches/parquet_statistic.rs:
##########
@@ -70,7 +84,14 @@ fn create_parquet_file(dtype: TestTypes, row_groups: usize) -> NamedTempFile {
         )])),
     };
 
-    let props = WriterProperties::builder().build();
+    let mut props = WriterProperties::builder();

Review Comment:
    🤔 While looking at this code, it seems like the benchmark for row groups doesn't actually make multiple row groups (it instead just encodes `row_groups` batches).
   
    We should probably also set https://docs.rs/parquet/latest/parquet/file/properties/struct.WriterProperties.html#method.max_row_group_size 🤔
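   
    For illustration, a hedged sketch of what setting both limits might look like; `rows_per_batch` and the unconditional row group cap are assumptions, not the PR's change:
   
    ```rust
    use parquet::file::properties::WriterProperties;
   
    // Illustrative only, not the PR's code. Capping max_row_group_size at the
    // per-batch row count makes each written batch close its own row group,
    // so writing `row_groups` batches really produces `row_groups` row groups.
    fn benchmark_writer_props(
        rows_per_batch: usize,
        data_page_row_count_limit: Option<usize>,
    ) -> WriterProperties {
        let mut builder =
            WriterProperties::builder().set_max_row_group_size(rows_per_batch);
        if let Some(limit) = data_page_row_count_limit {
            builder = builder.set_data_page_row_count_limit(limit);
        }
        builder.build()
    }
    ```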



##########
datafusion/core/benches/parquet_statistic.rs:
##########
@@ -150,37 +195,93 @@ fn make_dict_batch() -> RecordBatch {
 fn criterion_benchmark(c: &mut Criterion) {
     let row_groups = 100;
     use TestTypes::*;
-    let types = vec![UInt64, F64, String, Dictionary];
+    let types = vec![Int64, UInt64, F64, String, Dictionary];
+    let data_page_row_count_limits = vec![None, Some(1)];
 
     for dtype in types {
-        let file = create_parquet_file(dtype.clone(), row_groups);
-        let file = file.reopen().unwrap();
-        let reader = ArrowReaderBuilder::try_new(file).unwrap();
-        let metadata = reader.metadata();
-        let row_groups = metadata.row_groups();
-
-        let mut group =
-            c.benchmark_group(format!("Extract statistics for {}", dtype.clone()));
-        group.bench_function(
-            BenchmarkId::new("extract_statistics", dtype.clone()),
-            |b| {
-                b.iter(|| {
-                    let converter = StatisticsConverter::try_new(
-                        "col",
-                        reader.schema(),
-                        reader.parquet_schema(),
-                    )
-                    .unwrap();
-
-                    let _ = converter.row_group_mins(row_groups.iter()).unwrap();
-                    let _ = converter.row_group_maxes(row_groups.iter()).unwrap();
-                    let _ = converter.row_group_null_counts(row_groups.iter()).unwrap();
-                    let _ = StatisticsConverter::row_group_row_counts(row_groups.iter())
+        for data_page_row_count_limit in &data_page_row_count_limits {
+            let file =
+                create_parquet_file(dtype.clone(), row_groups, data_page_row_count_limit);
+            let file = file.reopen().unwrap();
+            let options = ArrowReaderOptions::new().with_page_index(true);
+            let reader = ArrowReaderBuilder::try_new_with_options(file, options).unwrap();
+            let metadata = reader.metadata();
+            let row_groups = metadata.row_groups();
+            let row_group_indices = row_groups

Review Comment:
   I think we could get the correct indices using the following, potentially shorter, code:
   
   ```rust
   let row_group_indices: Vec<_> = (0..row_groups.len()).collect();
   ```
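   
   A trivial standalone check (separate from the benchmark) that the suggested one-liner produces the expected indices:
   
   ```rust
   fn main() {
       // Stand-in for `row_groups.len()`; the indices are simply 0..len().
       let num_row_groups = 3;
       let row_group_indices: Vec<usize> = (0..num_row_groups).collect();
       assert_eq!(row_group_indices, vec![0, 1, 2]);
   }
   ```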


