Jefffrey commented on code in PR #5136:
URL: https://github.com/apache/arrow-rs/pull/5136#discussion_r1411897792
##########
arrow-ipc/src/reader.rs:
##########
@@ -498,61 +498,39 @@ pub fn read_dictionary(
Ok(())
}
-/// Arrow File reader
-pub struct FileReader<R: Read + Seek> {
- /// Buffered file reader that supports reading and seeking
- reader: BufReader<R>,
-
- /// The schema that is read from the file header
- schema: SchemaRef,
-
- /// The blocks in the file
- ///
- /// A block indicates the regions in the file to read to get data
- blocks: Vec<crate::Block>,
-
- /// A counter to keep track of the current block that should be read
- current_block: usize,
-
- /// The total number of blocks, which may contain record batches and other types
- total_blocks: usize,
+/// Build an Arrow [`FileReader`] with custom options.
+#[derive(Debug, Default)]
+pub struct FileReaderBuilder {
+ /// Optional projection for which columns to load (zero-based column indices)
+ projection: Option<Vec<usize>>,
+ /// Flatbuffers options for parsing footer
+ verifier_options: VerifierOptions,
+}
- /// Optional dictionaries for each schema field.
+impl FileReaderBuilder {
+ /// Options for creating a new [`FileReader`].
///
- /// Dictionaries may be appended to in the streaming format.
- dictionaries_by_id: HashMap<i64, ArrayRef>,
-
- /// Metadata version
- metadata_version: crate::MetadataVersion,
-
- /// User defined metadata
- custom_metadata: HashMap<String, String>,
+ /// To convert a builder into a reader, call [`FileReaderBuilder::build`].
+ pub fn new() -> Self {
+ Self::default()
+ }
- /// Optional projection and projected_schema
- projection: Option<(Vec<usize>, Schema)>,
-}
+ /// Optional projection for which columns to load (zero-based column indices).
+ pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
+ self.projection = Some(projection);
+ self
+ }
-impl<R: Read + Seek> fmt::Debug for FileReader<R> {
- fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> {
- f.debug_struct("FileReader<R>")
- .field("reader", &"BufReader<..>")
- .field("schema", &self.schema)
- .field("blocks", &self.blocks)
- .field("current_block", &self.current_block)
- .field("total_blocks", &self.total_blocks)
- .field("dictionaries_by_id", &self.dictionaries_by_id)
- .field("metadata_version", &self.metadata_version)
- .field("projection", &self.projection)
- .finish()
+ /// Flatbuffers options for parsing footer. Useful if needing to parse a file containing
+ /// millions of columns, in which case can up the value for `max_tables` to accommodate parsing
+ /// such a file.
+ pub fn with_verifier_options(mut self, verifier_options: VerifierOptions) -> Self {
+ self.verifier_options = verifier_options;
+ self
Review Comment:
Considering that keys in the schema custom metadata can also contribute to the table count in the footer flatbuffer, I'm not sure naming it something like `with_max_columns()` would be accurate.
Is it possible to simply abstract over those flatbuffer settings without exposing the inner flatbuffer struct? For example:
```rust
.with_flatbuffers_max_tables(10000000)
.with_flatbuffers_max_depth(100)
```
- In case a user has a file with a deeply nested schema and wants to tune this parameter as well, unlikely as that might be
These methods could then be documented to explain what effect tuning them has on the file reader, etc.
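As a rough illustration only, here is a minimal sketch of what that abstraction could look like, assuming the builder keeps a `flatbuffers::VerifierOptions` internally as in the diff above; the method names follow the suggestion in this comment and are hypothetical, not an existing API:
```rust
use flatbuffers::VerifierOptions;

/// Sketch: same builder as in the diff, but the flatbuffers type is never
/// exposed in the public API.
#[derive(Debug, Default)]
pub struct FileReaderBuilder {
    /// Optional projection for which columns to load (zero-based column indices)
    projection: Option<Vec<usize>>,
    /// Flatbuffers options for parsing the footer (kept private)
    verifier_options: VerifierOptions,
}

impl FileReaderBuilder {
    /// Maximum number of flatbuffer tables allowed when verifying the footer.
    /// Raising this allows parsing footers of files with a very large number
    /// of columns (or many custom metadata entries).
    pub fn with_flatbuffers_max_tables(mut self, max_tables: usize) -> Self {
        self.verifier_options.max_tables = max_tables;
        self
    }

    /// Maximum nesting depth allowed when verifying the footer, for files
    /// with deeply nested schemas.
    pub fn with_flatbuffers_max_depth(mut self, max_depth: usize) -> Self {
        self.verifier_options.max_depth = max_depth;
        self
    }
}
```
That way the flatbuffers crate stays out of the public surface, and each knob can carry its own documentation about when and why it needs raising.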