etseidl commented on code in PR #8598:
URL: https://github.com/apache/arrow-rs/pull/8598#discussion_r2427125928
##########
parquet/benches/metadata.rs:
##########
@@ -15,86 +15,69 @@
// specific language governing permissions and limitations
// under the License.
-use parquet::file::metadata::ParquetMetaDataReader;
+use std::sync::Arc;
+
+use parquet::basic::{Encoding, PageType, Type as PhysicalType};
+use parquet::file::metadata::{
+ ColumnChunkMetaData, FileMetaData, PageEncodingStats, ParquetMetaData, ParquetMetaDataReader,
+ ParquetMetaDataWriter, RowGroupMetaData,
+};
+use parquet::file::statistics::Statistics;
+use parquet::file::writer::TrackedWrite;
+use parquet::schema::parser::parse_message_type;
+use parquet::schema::types::{
+ ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescriptor, Type as SchemaType,
+};
use rand::Rng;
-use thrift::protocol::TCompactOutputProtocol;
use arrow::util::test_util::seedable_rng;
use bytes::Bytes;
use criterion::*;
use parquet::file::reader::SerializedFileReader;
use parquet::file::serialized_reader::ReadOptionsBuilder;
-use parquet::format::{
- ColumnChunk, ColumnMetaData, CompressionCodec, Encoding, FieldRepetitionType, FileMetaData,
- PageEncodingStats, PageType, RowGroup, SchemaElement, Type,
-};
-use parquet::thrift::TSerializable;
const NUM_COLUMNS: usize = 10_000;
const NUM_ROW_GROUPS: usize = 10;
fn encoded_meta() -> Vec<u8> {
let mut rng = seedable_rng();
- let mut schema = Vec::with_capacity(NUM_COLUMNS + 1);
- schema.push(SchemaElement {
- type_: None,
- type_length: None,
- repetition_type: None,
- name: Default::default(),
- num_children: Some(NUM_COLUMNS as _),
- converted_type: None,
- scale: None,
- precision: None,
- field_id: None,
- logical_type: None,
- });
+ let mut message_type = "message test_schema {".to_string();
for i in 0..NUM_COLUMNS {
- schema.push(SchemaElement {
- type_: Some(Type::FLOAT),
- type_length: None,
- repetition_type: Some(FieldRepetitionType::REQUIRED),
- name: i.to_string(),
- num_children: None,
- converted_type: None,
- scale: None,
- precision: None,
- field_id: None,
- logical_type: None,
- })
+ message_type.push_str(&format!("REQUIRED FLOAT {};", i));
}
+ message_type.push('}');
- let stats = parquet::format::Statistics {
- min: None,
- max: None,
- null_count: Some(0),
- distinct_count: None,
- max_value: Some(vec![rng.random(); 8]),
- min_value: Some(vec![rng.random(); 8]),
- is_max_value_exact: Some(true),
- is_min_value_exact: Some(true),
- };
+ let schema_descr = parse_message_type(&message_type)
+ .map(|t| Arc::new(SchemaDescriptor::new(Arc::new(t))))
+ .unwrap();
+
+ let stats = Statistics::float(Some(rng.random()), Some(rng.random()), None, Some(0), false);
let row_groups = (0..NUM_ROW_GROUPS)
.map(|i| {
let columns = (0..NUM_COLUMNS)
Review Comment:
Rather than creating new column descriptors, you could reuse the ones already held by `schema_descr` and replace this with:
```rust
let columns = schema_descr
.columns()
.iter()
.map(|column_desc_ptr| {
ColumnChunkMetaData::builder(column_desc_ptr.clone())
...
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]