tustvold commented on code in PR #1682:
URL: https://github.com/apache/arrow-rs/pull/1682#discussion_r872206833
##########
parquet/src/arrow/array_reader/builder.rs:
##########
@@ -52,657 +50,278 @@ pub fn build_array_reader<T>(
where
T: IntoIterator<Item = usize>,
{
- let mut leaves = HashMap::<*const Type, usize>::new();
-
- let mut filtered_root_names = HashSet::<String>::new();
-
- for c in column_indices {
- let column = parquet_schema.column(c).self_type() as *const Type;
-
- leaves.insert(column, c);
-
- let root = parquet_schema.get_column_root_ptr(c);
- filtered_root_names.insert(root.name().to_string());
+ let field = convert_schema(
+ parquet_schema.as_ref(),
+ column_indices,
+ Some(arrow_schema.as_ref()),
+ )?;
+
+ match &field {
+ Some(field) => build_reader(field, row_groups.as_ref()),
+ None => Ok(make_empty_array_reader(row_groups.num_rows())),
}
-
- // Only pass root fields that take part in the projection
- // to avoid traversal of columns that are not read.
- // TODO: also prune unread parts of the tree in child structures
- let filtered_root_fields = parquet_schema
- .root_schema()
- .get_fields()
- .iter()
- .filter(|field| filtered_root_names.contains(field.name()))
- .cloned()
- .collect::<Vec<_>>();
-
- let proj = Type::GroupType {
- basic_info: parquet_schema.root_schema().get_basic_info().clone(),
- fields: filtered_root_fields,
- };
-
- ArrayReaderBuilder::new(Arc::new(proj), arrow_schema, Arc::new(leaves), row_groups)
- .build_array_reader()
}
-/// Used to build array reader.
-struct ArrayReaderBuilder {
- root_schema: TypePtr,
- arrow_schema: Arc<Schema>,
- // Key: columns that need to be included in final array builder
- // Value: column index in schema
- columns_included: Arc<HashMap<*const Type, usize>>,
- row_groups: Box<dyn RowGroupCollection>,
+fn build_reader(
Review Comment:
As there wasn't any "state" to associate with a builder, and there shouldn't
ever be, it seemed redundant to create an empty `Builder` struct.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]