Re: [PR] Add `FileScanConfig::new()` API [datafusion]

via GitHub Wed, 22 May 2024 10:46:16 -0700


alamb commented on code in PR #10623:
URL: https://github.com/apache/datafusion/pull/10623#discussion_r1610405921



##########
datafusion/core/src/datasource/physical_plan/file_scan_config.rs:
##########
@@ -64,12 +64,43 @@ pub fn wrap_partition_value_in_dict(val: ScalarValue) -> ScalarValue {
 
 /// The base configurations to provide when creating a physical plan for
 /// any given file format.
+///
+/// # Example
+/// ```
+/// # use std::sync::Arc;
+/// # use arrow_schema::Schema;
+/// use datafusion::datasource::listing::PartitionedFile;
+/// # use datafusion::datasource::physical_plan::FileScanConfig;
+/// # use datafusion_execution::object_store::ObjectStoreUrl;
+/// # let file_schema = Arc::new(Schema::empty());
+/// // create FileScan config for reading data from file://
+/// let object_store_url = ObjectStoreUrl::local_filesystem();
+/// let mut config = FileScanConfig::new(object_store_url, file_schema)
+///   .with_limit(Some(1000))            // read only the first 1000 records
+///   .with_projection(Some(vec![2, 3])); // project columns 2 and 3
+///
+/// // Read /tmp/file1.parquet with known size of 1234 bytes in a single group
+/// config.add_file(PartitionedFile::new("file1.parquet", 1234));
+///
+/// // Read /tmp/file2.parquet (56 bytes) and /tmp/file3.parquet (78 bytes)
+/// // in a single file group
+/// config.add_file_group(vec![
+///    PartitionedFile::new("file2.parquet", 56),
+///    PartitionedFile::new("file3.parquet", 78),
+/// ]);
+/// ```
 #[derive(Clone)]
 pub struct FileScanConfig {

Review Comment:
   Here is the new API and its documentation. Note that this is entirely
backwards compatible (I did not change any fields to non-`pub`, etc.).



##########
datafusion/core/src/datasource/physical_plan/file_scan_config.rs:
##########
@@ -101,6 +133,99 @@ pub struct FileScanConfig {
 }
 
 impl FileScanConfig {
+    /// Create a new `FileScanConfig` with default settings for scanning files.
+    ///
+    /// No file groups are added by default. See [`Self::add_file`] and
+    /// [`Self::add_file_group`]
+    ///
+    /// # Parameters:
+    /// * `object_store_url`: See [`Self::object_store_url`]
+    /// * `file_schema`: See [`Self::file_schema`]
+    pub fn new(object_store_url: ObjectStoreUrl, file_schema: SchemaRef) -> Self {
+        let statistics = Statistics::new_unknown(&file_schema);
+        Self {
+            object_store_url,
+            file_schema,
+            file_groups: vec![],
+            statistics,
+            projection: None,
+            limit: None,
+            table_partition_cols: vec![],
+            output_ordering: vec![],
+        }
+    }
+
+    /// Add a new file as a single file group
+    ///
+    /// See [Self::file_groups] for more information
+    pub fn add_file(&mut self, file: PartitionedFile) {
+        self.add_file_group(vec![file])
+    }
+
+    /// Add a new file group
+    ///
+    /// See [Self::file_groups] for more information
+    pub fn add_file_group(&mut self, file_group: Vec<PartitionedFile>) {
+        self.file_groups.push(file_group);
+    }
+
+    /// Set the statistics of the files
+    pub fn with_statistics(mut self, statistics: Statistics) -> Self {
+        self.statistics = statistics;
+        self
+    }
+
+    /// Set the projection of the files
+    pub fn with_projection(mut self, projection: Option<Vec<usize>>) -> Self {
+        self.projection = projection;
+        self
+    }
+
+    /// Set the limit of the files
+    pub fn with_limit(mut self, limit: Option<usize>) -> Self {
+        self.limit = limit;
+        self
+    }
+
+    /// Add a file as a single group

Review Comment:
   Having to wrap a single file in a `vec![vec![..]]` makes sense from the
implementation point of view, but most users shouldn't have to worry about this.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: github-unsubscr...@datafusion.apache.org
For additional commands, e-mail: github-h...@datafusion.apache.org

Re: [PR] Add `FileScanConfig::new()` API [datafusion]

Reply via email to