sdd commented on code in PR #515:
URL: https://github.com/apache/iceberg-rust/pull/515#discussion_r1715801644
##########
crates/iceberg/src/arrow/reader.rs:
##########
@@ -44,25 +43,39 @@ use crate::error::Result;
 use crate::expr::visitors::bound_predicate_visitor::{visit, BoundPredicateVisitor};
 use crate::expr::{BoundPredicate, BoundReference};
 use crate::io::{FileIO, FileMetadata, FileRead};
-use crate::scan::{ArrowRecordBatchStream, FileScanTaskStream};
+use crate::runtime::spawn;
+use crate::scan::{ArrowRecordBatchStream, FileScanTask, FileScanTaskStream};
 use crate::spec::{Datum, Schema};
 use crate::{Error, ErrorKind};

 /// Builder to create ArrowReader
 pub struct ArrowReaderBuilder {
     batch_size: Option<usize>,
     file_io: FileIO,
+    concurrency_limit_data_files: usize,
 }

 impl ArrowReaderBuilder {
     /// Create a new ArrowReaderBuilder
     pub(crate) fn new(file_io: FileIO) -> Self {
+        let num_cpus = std::thread::available_parallelism()
+            .expect("failed to get number of CPUs")
Review Comment:
Since the same function is queried in both `plan_files` and `read`, I've
created a `utils.rs` file and moved the logic you suggested into it for
re-use. Let me know if you have a better suggestion for where to put it. I'll
address the `TODO` around logging / tracing once we have a more concrete plan
for logging / tracing / metrics.
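
For context, a minimal sketch of what that shared helper could look like,
assuming it lives in `crates/iceberg/src/utils.rs`; the constant name and
fallback value here are illustrative, not the merged code:

```rust
use std::num::NonZeroUsize;

// Illustrative fallback used when the OS cannot report a CPU count.
const DEFAULT_PARALLELISM: usize = 1;

/// Returns the available parallelism of the host, falling back to a
/// default instead of panicking when it cannot be determined.
pub(crate) fn available_parallelism() -> NonZeroUsize {
    std::thread::available_parallelism().unwrap_or_else(|_err| {
        // TODO: log/trace that the fallback was taken, once there is a
        // concrete plan for logging / tracing / metrics.
        NonZeroUsize::new(DEFAULT_PARALLELISM).unwrap()
    })
}
```

With a helper like this, `ArrowReaderBuilder::new` can call
`available_parallelism().get()` instead of `expect`ing, and `plan_files` can
share the same call.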
##########
crates/iceberg/src/arrow/reader.rs:
##########
@@ -84,73 +98,113 @@ impl ArrowReaderBuilder {
 pub struct ArrowReader {
     batch_size: Option<usize>,
     file_io: FileIO,
+
+    /// the maximum number of data files that can be fetched at the same time
+    concurrency_limit_data_files: usize,
 }

 impl ArrowReader {
     /// Take a stream of FileScanTasks and reads all the files.
     /// Returns a stream of Arrow RecordBatches containing the data from the files
-    pub fn read(self, mut tasks: FileScanTaskStream) -> crate::Result<ArrowRecordBatchStream> {
+    pub fn read(self, tasks: FileScanTaskStream) -> Result<ArrowRecordBatchStream> {
         let file_io = self.file_io.clone();
-
-        Ok(try_stream! {
-            while let Some(task_result) = tasks.next().await {
-                match task_result {
-                    Ok(task) => {
-                        // Collect Parquet column indices from field ids
-                        let mut collector = CollectFieldIdVisitor {
-                            field_ids: HashSet::default(),
-                        };
-                        if let Some(predicates) = task.predicate() {
-                            visit(&mut collector, predicates)?;
+        let batch_size = self.batch_size;
+        let max_concurrent_fetching_datafiles = self.concurrency_limit_data_files;
+
+        let (tx, rx) = channel(10);
Review Comment:
Fixed.
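
For anyone skimming the thread, here is a self-contained sketch of the
fan-out pattern the new `read` relies on: a bounded mpsc channel carries
results back to the caller while `try_for_each_concurrent` caps how many
tasks run at once. It uses `tokio::spawn` and stand-in task/error types, so
it only approximates the PR's runtime-agnostic `spawn` and real
`FileScanTask` handling:

```rust
// Requires the `futures` and `tokio` crates.
use futures::channel::mpsc::channel;
use futures::{SinkExt, StreamExt, TryStreamExt};

#[tokio::main]
async fn main() {
    let concurrency_limit = 4;

    // Bounded buffer: a slow consumer of `rx` applies backpressure to the
    // producers, the same role `channel(10)` plays in `read`.
    let (tx, rx) = channel::<Result<u64, String>>(10);

    // Stand-in for the FileScanTaskStream: a fallible stream of task ids.
    let tasks = futures::stream::iter((0..16u64).map(Ok::<u64, String>));

    tokio::spawn(async move {
        let _ = tasks
            .try_for_each_concurrent(concurrency_limit, |task_id| {
                let mut tx = tx.clone();
                async move {
                    // Simulate fetching and decoding one data file, then
                    // forward the result to the output channel.
                    let record_batch = task_id * 2;
                    tx.send(Ok(record_batch)).await.map_err(|e| e.to_string())
                }
            })
            .await;
        // The original `tx` and all clones drop when this task ends,
        // closing the channel so the receiver stream below terminates.
    });

    // The receiver side plays the role of the returned ArrowRecordBatchStream.
    let results: Vec<Result<u64, String>> = rx.collect().await;
    println!("received {} results", results.len());
}
```

The bounded buffer is the key design point: if the consumer of the returned
stream stalls, `tx.send` suspends once the buffer fills, which in turn pauses
the concurrent file fetches instead of buffering unbounded data in memory.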