Re: [PR] Add FileReaderBuilder for arrow-ipc to allow reading large no. of column files [arrow-rs]

via GitHub Wed, 29 Nov 2023 06:05:01 -0800


tustvold commented on code in PR #5136:
URL: https://github.com/apache/arrow-rs/pull/5136#discussion_r1409328603



##########
arrow-ipc/src/reader.rs:
##########
@@ -498,61 +498,39 @@ pub fn read_dictionary(
     Ok(())
 }
 
-/// Arrow File reader
-pub struct FileReader<R: Read + Seek> {
-    /// Buffered file reader that supports reading and seeking
-    reader: BufReader<R>,
-
-    /// The schema that is read from the file header
-    schema: SchemaRef,
-
-    /// The blocks in the file
-    ///
-    /// A block indicates the regions in the file to read to get data
-    blocks: Vec<crate::Block>,
-
-    /// A counter to keep track of the current block that should be read
-    current_block: usize,
-
-    /// The total number of blocks, which may contain record batches and other types
-    total_blocks: usize,
+/// Build an Arrow [`FileReader`] with custom options.
+#[derive(Debug, Default)]
+pub struct FileReaderBuilder {
+    /// Optional projection for which columns to load (zero-based column indices)
+    projection: Option<Vec<usize>>,
+    /// Flatbuffers options for parsing footer
+    verifier_options: VerifierOptions,
+}
 
-    /// Optional dictionaries for each schema field.
+impl FileReaderBuilder {
+    /// Options for creating a new [`FileReader`].
     ///
-    /// Dictionaries may be appended to in the streaming format.
-    dictionaries_by_id: HashMap<i64, ArrayRef>,
-
-    /// Metadata version
-    metadata_version: crate::MetadataVersion,
-
-    /// User defined metadata
-    custom_metadata: HashMap<String, String>,
+    /// To convert a builder into a reader, call [`FileReaderBuilder::build`].
+    pub fn new() -> Self {
+        Self::default()
+    }
 
-    /// Optional projection and projected_schema
-    projection: Option<(Vec<usize>, Schema)>,
-}
+    /// Optional projection for which columns to load (zero-based column indices).
+    pub fn with_projection(mut self, projection: Vec<usize>) -> Self {
+        self.projection = Some(projection);
+        self
+    }
 
-impl<R: Read + Seek> fmt::Debug for FileReader<R> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::result::Result<(), fmt::Error> {
-        f.debug_struct("FileReader<R>")
-            .field("reader", &"BufReader<..>")
-            .field("schema", &self.schema)
-            .field("blocks", &self.blocks)
-            .field("current_block", &self.current_block)
-            .field("total_blocks", &self.total_blocks)
-            .field("dictionaries_by_id", &self.dictionaries_by_id)
-            .field("metadata_version", &self.metadata_version)
-            .field("projection", &self.projection)
-            .finish()
+    /// Flatbuffers options for parsing footer. Useful if needing to parse a file containing
+    /// millions of columns, in which case can up the value for `max_tables` to accommodate parsing
+    /// such a file.
+    pub fn with_verifier_options(mut self, verifier_options: VerifierOptions) -> Self {
+        self.verifier_options = verifier_options;
+        self

Review Comment:
   I think I would prefer the max columns option, as it both avoids exposing
flatbuffers types in our public API and makes it more obvious to users why it
might be relevant to them.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] Add FileReaderBuilder for arrow-ipc to allow reading large no. of column files [arrow-rs]

Reply via email to