tustvold commented on PR #5135:
URL: https://github.com/apache/arrow-rs/pull/5135#issuecomment-1830663668
OK, sorry for going back and forth on this; I hadn't quite grasped what the
issue here was.
```
/// Writes an empty three-column batch to a temporary Parquet file, then
/// compares the schemas observed through the synchronous reader and the
/// asynchronous stream.
///
/// For each flavour the builder is expected to report the full file schema
/// (three fields plus the `key`/`value` metadata), while the reader built with
/// a projection over leaves 1 and 2 is expected to report two fields and
/// empty metadata.
#[tokio::test]
async fn test_projection_schema() {
    // Schema-level metadata attached to the written file.
    let mut metadata = HashMap::with_capacity(1);
    metadata.insert("key".to_string(), "value".to_string());

    let fields = Fields::from(vec![
        Field::new("a", DataType::Int32, true),
        Field::new("c", DataType::UInt64, true),
        Field::new("d", DataType::Float32, true),
    ]);
    let schema = Arc::new(Schema::new(fields).with_metadata(metadata.clone()));

    // Write a single empty batch so the file carries the schema but no rows.
    let mut file = tempfile().unwrap();
    let mut writer = ArrowWriter::try_new(&mut file, schema.clone(), None).unwrap();
    let empty_batch = RecordBatch::new_empty(schema.clone());
    writer.write(&empty_batch).unwrap();
    writer.close().unwrap();

    // --- Synchronous reader ---
    let sync_builder =
        ParquetRecordBatchReaderBuilder::try_new(file.try_clone().unwrap()).unwrap();
    let sync_file_schema = sync_builder.schema().clone();
    let sync_mask = ProjectionMask::leaves(&sync_builder.parquet_schema(), [1, 2]);
    let sync_reader = sync_builder.with_projection(sync_mask).build().unwrap();
    let sync_reader_schema = sync_reader.schema();

    assert_eq!(sync_file_schema.fields.len(), 3);
    assert_eq!(sync_file_schema.metadata, metadata);
    assert_eq!(sync_reader_schema.fields.len(), 2);
    assert_eq!(sync_reader_schema.metadata, HashMap::default());

    // --- Asynchronous stream over the same underlying file ---
    let async_file = tokio::fs::File::from(file);
    let async_builder = ParquetRecordBatchStreamBuilder::new(async_file)
        .await
        .unwrap();
    let async_file_schema = async_builder.schema().clone();
    let async_mask = ProjectionMask::leaves(&async_builder.parquet_schema(), [1, 2]);
    let async_stream = async_builder.with_projection(async_mask).build().unwrap();
    let async_reader_schema = async_stream.schema();

    assert_eq!(async_file_schema.fields.len(), 3);
    assert_eq!(async_file_schema.metadata, metadata);
    assert_eq!(async_reader_schema.fields.len(), 2);
    assert_eq!(async_reader_schema.metadata, HashMap::default());
}
```
I think this demonstrates the issue: in particular, the schema returned by
ParquetRecordBatchStream is incorrect; it should return the projected schema
with the metadata removed.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]