etseidl commented on code in PR #8598:
URL: https://github.com/apache/arrow-rs/pull/8598#discussion_r2426852577


##########
parquet/benches/metadata.rs:
##########
@@ -15,86 +15,66 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use parquet::file::metadata::ParquetMetaDataReader;
+use std::sync::Arc;
+
+use parquet::basic::{Encoding, PageType, Type as PhysicalType};
+use parquet::file::metadata::{
+    ColumnChunkMetaData, FileMetaData, PageEncodingStats, ParquetMetaData, 
ParquetMetaDataReader,
+    ParquetMetaDataWriter, RowGroupMetaData,
+};
+use parquet::file::writer::TrackedWrite;
+use parquet::schema::parser::parse_message_type;
+use parquet::schema::types::{
+    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescriptor, Type as 
SchemaType,
+};
 use rand::Rng;
-use thrift::protocol::TCompactOutputProtocol;
 
 use arrow::util::test_util::seedable_rng;
 use bytes::Bytes;
 use criterion::*;
 use parquet::file::reader::SerializedFileReader;
 use parquet::file::serialized_reader::ReadOptionsBuilder;
-use parquet::format::{
-    ColumnChunk, ColumnMetaData, CompressionCodec, Encoding, 
FieldRepetitionType, FileMetaData,
-    PageEncodingStats, PageType, RowGroup, SchemaElement, Type,
-};
-use parquet::thrift::TSerializable;
 
 const NUM_COLUMNS: usize = 10_000;
 const NUM_ROW_GROUPS: usize = 10;
 
 fn encoded_meta() -> Vec<u8> {
     let mut rng = seedable_rng();
 
-    let mut schema = Vec::with_capacity(NUM_COLUMNS + 1);
-    schema.push(SchemaElement {
-        type_: None,
-        type_length: None,
-        repetition_type: None,
-        name: Default::default(),
-        num_children: Some(NUM_COLUMNS as _),
-        converted_type: None,
-        scale: None,
-        precision: None,
-        field_id: None,
-        logical_type: None,
-    });
+    let mut message_type = "message test_schema {".to_string();
     for i in 0..NUM_COLUMNS {
-        schema.push(SchemaElement {
-            type_: Some(Type::FLOAT),
-            type_length: None,
-            repetition_type: Some(FieldRepetitionType::REQUIRED),
-            name: i.to_string(),
-            num_children: None,
-            converted_type: None,
-            scale: None,
-            precision: None,
-            field_id: None,
-            logical_type: None,
-        })
+        message_type.push_str(&format!("REQUIRED FLOAT {};", i));
     }
+    message_type.push('}');
 
-    let stats = parquet::format::Statistics {
-        min: None,
-        max: None,
-        null_count: Some(0),
-        distinct_count: None,
-        max_value: Some(vec![rng.random(); 8]),
-        min_value: Some(vec![rng.random(); 8]),
-        is_max_value_exact: Some(true),
-        is_min_value_exact: Some(true),
-    };
+    let schema_descr = parse_message_type(&message_type)
+        .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t))))
+        .unwrap();
 
     let row_groups = (0..NUM_ROW_GROUPS)
         .map(|i| {
             let columns = (0..NUM_COLUMNS)
-                .map(|_| ColumnChunk {
-                    file_path: None,
-                    file_offset: 0,
-                    meta_data: Some(ColumnMetaData {
-                        type_: Type::FLOAT,
-                        encodings: vec![Encoding::PLAIN, 
Encoding::RLE_DICTIONARY],
-                        path_in_schema: vec![],
-                        codec: CompressionCodec::UNCOMPRESSED,
-                        num_values: rng.random_range(1..1000000),
-                        total_uncompressed_size: 
rng.random_range(100000..100000000),
-                        total_compressed_size: 
rng.random_range(50000..5000000),
-                        key_value_metadata: None,
-                        data_page_offset: rng.random_range(4..2000000000),
-                        index_page_offset: None,
-                        dictionary_page_offset: 
Some(rng.random_range(4..2000000000)),
-                        statistics: Some(stats.clone()),
-                        encoding_stats: Some(vec![
+                .map(|j| {
+                    let column_desc_ptr = 
ColumnDescPtr::new(ColumnDescriptor::new(
+                        Arc::new(
+                            SchemaType::primitive_type_builder(&j.to_string(), 
PhysicalType::FLOAT)
+                                .build()
+                                .unwrap(),
+                        ),
+                        0,
+                        0,
+                        ColumnPath::new(vec![]),
+                    ));
+
+                    ColumnChunkMetaData::builder(column_desc_ptr)
+                        .set_encodings(vec![Encoding::PLAIN, 
Encoding::RLE_DICTIONARY])
+                        
.set_compression(parquet::basic::Compression::UNCOMPRESSED)
+                        .set_num_values(rng.random_range(1..1000000))
+                        
.set_total_compressed_size(rng.random_range(50000..5000000))
+                        .set_data_page_offset(rng.random_range(4..2000000000))
+                        
.set_dictionary_page_offset(Some(rng.random_range(4..2000000000)))
+                        //.set_statistics(Sta)

Review Comment:
   ```suggestion
                           .set_statistics(stats.clone())
   ```
   where
   
   ```rust
   let stats = Statistics::float(Some(rng.random()), Some(rng.random()), None, 
Some(0), false);
   ```
   
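   We need the statistics set so that the benchmark encodes and decodes the same metadata as before, keeping its timings comparable with the old results.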



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to