This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
     new e12ef3ae90 Change default prefetch_hint to 512Kb to reduce number of object store requests when reading parquet files (#18160)
e12ef3ae90 is described below
commit e12ef3ae90677fe4b1bc548feea2b3082eecdaa2
Author: Qi Zhu <[email protected]>
AuthorDate: Tue Oct 28 17:41:21 2025 +0800
    Change default prefetch_hint to 512Kb to reduce number of object store requests when reading parquet files (#18160)
…default (set metadata_size_hint)
## Which issue does this PR close?
- Closes [#18118](https://github.com/apache/datafusion/issues/18118)
## Rationale for this change
Reduce number of object store requests when reading parquet files by default (set metadata_size_hint)
## What changes are included in this PR?
```rust
/// Default setting to 512 KB, which should be sufficient for most parquet files,
/// it can reduce one I/O operation per parquet file. If the metadata is larger than
/// the hint, two reads will still be performed.
pub metadata_size_hint: Option<usize>, default = Some(512 * 1024)
```
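As an illustration of the per-read override this PR adds, here is a minimal sketch (the table names and the `data.parquet` path are hypothetical, not taken from this PR's tests) of registering Parquet files with the new default hint and with an explicitly larger one:

```rust
use datafusion::error::Result;
use datafusion::prelude::{ParquetReadOptions, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    let ctx = SessionContext::new();

    // Relies on the new session default (512 KiB prefetch hint).
    ctx.register_parquet("t_default", "data.parquet", ParquetReadOptions::default())
        .await?;

    // Per-read override via the builder added in this PR: request a larger
    // hint for files whose footer metadata plus PageIndex may exceed 512 KiB.
    let opts = ParquetReadOptions::new().metadata_size_hint(Some(4 * 1024 * 1024));
    ctx.register_parquet("t_large_hint", "data.parquet", opts).await?;

    Ok(())
}
```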
## Are these changes tested?
Yes
## Are there any user-facing changes?
No
---------
Co-authored-by: Daniël Heres <[email protected]>
Co-authored-by: Andrew Lamb <[email protected]>
---
datafusion/common/src/config.rs | 5 +-
.../core/src/datasource/file_format/options.rs | 14 +
.../core/src/datasource/file_format/parquet.rs | 7 +-
.../core/tests/datasource/object_store_access.rs | 291 +++++++++++++++------
.../sqllogictest/test_files/information_schema.slt | 4 +-
docs/source/user-guide/configs.md | 2 +-
6 files changed, 239 insertions(+), 84 deletions(-)
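The session-wide default can also be adjusted without code changes; a hedged sketch (the option key matches the `configs.md` entry below, the 1 MiB value is only an example) using SQL `SET` from Rust:

```rust
use datafusion::error::Result;
use datafusion::prelude::{SessionConfig, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    // Enable information_schema so the effective setting can be inspected.
    let config = SessionConfig::new().with_information_schema(true);
    let ctx = SessionContext::new_with_config(config);

    // Raise the session-wide hint to 1 MiB; files whose footer metadata and
    // PageIndex fit within the hint need a single ranged GET for metadata.
    ctx.sql("SET datafusion.execution.parquet.metadata_size_hint = 1048576")
        .await?;

    // Show the effective value, mirroring the information_schema.slt update below.
    ctx.sql(
        "SELECT name, value FROM information_schema.df_settings \
         WHERE name = 'datafusion.execution.parquet.metadata_size_hint'",
    )
    .await?
    .show()
    .await?;

    Ok(())
}
```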
diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs
index 1713377f8d..10199db1a1 100644
--- a/datafusion/common/src/config.rs
+++ b/datafusion/common/src/config.rs
@@ -621,7 +621,10 @@ config_namespace! {
     /// bytes of the parquet file optimistically. If not specified, two reads are required:
/// One read to fetch the 8-byte parquet footer and
/// another to fetch the metadata length encoded in the footer
- pub metadata_size_hint: Option<usize>, default = None
+ /// Default setting to 512 KiB, which should be sufficient for most parquet files,
+ /// it can reduce one I/O operation per parquet file. If the metadata is larger than
+ /// the hint, two reads will still be performed.
+ pub metadata_size_hint: Option<usize>, default = Some(512 * 1024)
/// (reading) If true, filter expressions are applied during the parquet decoding operation to
/// reduce the number of rows decoded. This optimization is sometimes called "late materialization".
diff --git a/datafusion/core/src/datasource/file_format/options.rs b/datafusion/core/src/datasource/file_format/options.rs
index 8c1bb02ef0..e78c5f0955 100644
--- a/datafusion/core/src/datasource/file_format/options.rs
+++ b/datafusion/core/src/datasource/file_format/options.rs
@@ -269,6 +269,8 @@ pub struct ParquetReadOptions<'a> {
pub file_sort_order: Vec<Vec<SortExpr>>,
/// Properties for decryption of Parquet files that use modular encryption
pub file_decryption_properties: Option<ConfigFileDecryptionProperties>,
+ /// Metadata size hint for Parquet files reading (in bytes)
+ pub metadata_size_hint: Option<usize>,
}
impl Default for ParquetReadOptions<'_> {
@@ -281,6 +283,7 @@ impl Default for ParquetReadOptions<'_> {
schema: None,
file_sort_order: vec![],
file_decryption_properties: None,
+ metadata_size_hint: None,
}
}
}
@@ -340,6 +343,12 @@ impl<'a> ParquetReadOptions<'a> {
self.file_decryption_properties = Some(file_decryption_properties);
self
}
+
+ /// Configure metadata size hint for Parquet files reading (in bytes)
+ pub fn metadata_size_hint(mut self, size_hint: Option<usize>) -> Self {
+ self.metadata_size_hint = size_hint;
+ self
+ }
}
/// Options that control the reading of ARROW files.
@@ -606,6 +615,11 @@ impl ReadOptions<'_> for ParquetReadOptions<'_> {
if let Some(file_decryption_properties) = &self.file_decryption_properties {
options.crypto.file_decryption = Some(file_decryption_properties.clone());
}
+ // This can be overridden per-read in ParquetReadOptions, if set.
+ if let Some(metadata_size_hint) = self.metadata_size_hint {
+ options.global.metadata_size_hint = Some(metadata_size_hint);
+ }
+
let mut file_format = ParquetFormat::new().with_options(options);
if let Some(parquet_pruning) = self.parquet_pruning {
diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs
index 1781ea569d..52c5393e10 100644
--- a/datafusion/core/src/datasource/file_format/parquet.rs
+++ b/datafusion/core/src/datasource/file_format/parquet.rs
@@ -546,7 +546,8 @@ mod tests {
let (files, _file_names) = store_parquet(vec![batch1], false).await?;
let state = SessionContext::new().state();
- let format = ParquetFormat::default();
+ // Make metadata size hint None to keep original behavior
+ let format = ParquetFormat::default().with_metadata_size_hint(None);
let _schema = format.infer_schema(&state, &store.upcast(), &files).await?;
assert_eq!(store.request_count(), 3);
// No increase, cache being used.
@@ -620,7 +621,9 @@ mod tests {
let mut state = SessionContext::new().state();
state = set_view_state(state, force_views);
- let format = ParquetFormat::default().with_force_view_types(force_views);
+ let format = ParquetFormat::default()
+ .with_force_view_types(force_views)
+ .with_metadata_size_hint(None);
let schema = format.infer_schema(&state, &store.upcast(), &files).await?;
assert_eq!(store.request_count(), 6);
diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs
index 6b9585f408..d1592c2147 100644
--- a/datafusion/core/tests/datasource/object_store_access.rs
+++ b/datafusion/core/tests/datasource/object_store_access.rs
@@ -27,7 +27,7 @@
use arrow::array::{ArrayRef, Int32Array, RecordBatch};
use async_trait::async_trait;
use bytes::Bytes;
-use datafusion::prelude::{CsvReadOptions, SessionContext};
+use datafusion::prelude::{CsvReadOptions, ParquetReadOptions, SessionContext};
use futures::stream::BoxStream;
use insta::assert_snapshot;
use object_store::memory::InMemory;
@@ -45,8 +45,9 @@ use url::Url;
#[tokio::test]
async fn create_single_csv_file() {
+ let test = Test::new().with_single_file_csv().await;
assert_snapshot!(
- single_file_csv_test().await.requests(),
+ test.requests(),
@r"
RequestCountingObjectStore()
Total Requests: 2
@@ -58,8 +59,9 @@ async fn create_single_csv_file() {
#[tokio::test]
async fn query_single_csv_file() {
+ let test = Test::new().with_single_file_csv().await;
assert_snapshot!(
- single_file_csv_test().await.query("select * from csv_table").await,
+ test.query("select * from csv_table").await,
@r"
------- Query Output (2 rows) -------
+---------+-------+-------+
@@ -79,8 +81,9 @@ async fn query_single_csv_file() {
#[tokio::test]
async fn create_multi_file_csv_file() {
+ let test = Test::new().with_multi_file_csv().await;
assert_snapshot!(
- multi_file_csv_test().await.requests(),
+ test.requests(),
@r"
RequestCountingObjectStore()
Total Requests: 4
@@ -94,8 +97,9 @@ async fn create_multi_file_csv_file() {
#[tokio::test]
async fn query_multi_csv_file() {
+ let test = Test::new().with_multi_file_csv().await;
assert_snapshot!(
- multi_file_csv_test().await.query("select * from csv_table").await,
+ test.query("select * from csv_table").await,
@r"
------- Query Output (6 rows) -------
+---------+-------+-------+
@@ -120,24 +124,132 @@ async fn query_multi_csv_file() {
}
#[tokio::test]
-async fn create_single_parquet_file() {
+async fn create_single_parquet_file_default() {
+ // The default metadata size hint is 512KB
+ // which is enough to fetch the entire footer metadata and PageIndex
+ // in a single GET request.
+ let test = Test::new().with_single_file_parquet().await;
+ // expect 1 get request which reads the footer metadata and page index
assert_snapshot!(
- single_file_parquet_test().await.requests(),
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - HEAD path=parquet_table.parquet
+ - GET (range) range=0-2994 path=parquet_table.parquet
+ "
+ );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_prefetch() {
+ // Explicitly specify a prefetch hint that is adequate for the footer and page index
+ let test = Test::new()
+ .with_parquet_metadata_size_hint(Some(1000))
+ .with_single_file_parquet()
+ .await;
+ // expect one 1000-byte request which reads the footer metadata and page index
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - HEAD path=parquet_table.parquet
+ - GET (range) range=1994-2994 path=parquet_table.parquet
+ "
+ );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_too_small_prefetch() {
+ // configure a prefetch size that is too small to fetch the footer
+ // metadata
+ //
+ // Using the ranges from the test below (with no_prefetch),
+ // pick a number less than 730:
+ // --------
+ // 2986-2994: (8 bytes) footer + length
+ // 2264-2986: (722 bytes) footer metadata
+ let test = Test::new()
+ .with_parquet_metadata_size_hint(Some(500))
+ .with_single_file_parquet()
+ .await;
+ // expect three get requests:
+ // 1. read the footer (500 bytes per hint, not enough for the footer metadata)
+ // 2. read the footer metadata
+ // 3. read the PageIndex
+ assert_snapshot!(
+ test.requests(),
@r"
RequestCountingObjectStore()
Total Requests: 4
- HEAD path=parquet_table.parquet
- - GET (range) range=2986-2994 path=parquet_table.parquet
+ - GET (range) range=2494-2994 path=parquet_table.parquet
- GET (range) range=2264-2986 path=parquet_table.parquet
- GET (range) range=2124-2264 path=parquet_table.parquet
"
);
}
+#[tokio::test]
+async fn create_single_parquet_file_small_prefetch() {
+ // configure a prefetch size that is large enough for the footer
+ // metadata but **not** the PageIndex
+ //
+ // Using the ranges from the test below (with no_prefetch),
+ // the 730 is determined as follows:
+ // --------
+ // 2986-2994: (8 bytes) footer + length
+ // 2264-2986: (722 bytes) footer metadata
+ let test = Test::new()
+ // 740 is enough to get both the footer + length (8 bytes) and the
+ // footer metadata (722 bytes), but not the entire PageIndex
+ .with_parquet_metadata_size_hint(Some(740))
+ .with_single_file_parquet()
+ .await;
+ // expect two get requests:
+ // 1. read the footer metadata
+ // 2. reads the PageIndex
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - HEAD path=parquet_table.parquet
+ - GET (range) range=2254-2994 path=parquet_table.parquet
+ - GET (range) range=2124-2264 path=parquet_table.parquet
+ "
+ );
+}
+
+#[tokio::test]
+async fn create_single_parquet_file_no_prefetch() {
+ let test = Test::new()
+ // force no prefetch by setting size hint to None
+ .with_parquet_metadata_size_hint(None)
+ .with_single_file_parquet()
+ .await;
+ // Without a metadata size hint, the parquet reader
+ // does *three* range requests to read the footer metadata:
+ // 1. The footer length (last 8 bytes)
+ // 2. The footer metadata
+ // 3. The PageIndex metadata
+ assert_snapshot!(
+ test.requests(),
+ @r"
+ RequestCountingObjectStore()
+ Total Requests: 2
+ - HEAD path=parquet_table.parquet
+ - GET (range) range=0-2994 path=parquet_table.parquet
+ "
+ );
+}
+
#[tokio::test]
async fn query_single_parquet_file() {
+ let test = Test::new().with_single_file_parquet().await;
assert_snapshot!(
- single_file_parquet_test().await.query("select count(distinct a), count(b) from parquet_table").await,
+ test.query("select count(distinct a), count(b) from parquet_table").await,
@r"
------- Query Output (1 rows) -------
+---------------------------------+------------------------+
@@ -157,10 +269,11 @@ async fn query_single_parquet_file() {
#[tokio::test]
async fn query_single_parquet_file_with_single_predicate() {
+ let test = Test::new().with_single_file_parquet().await;
// Note that evaluating predicates requires additional object store requests
// (to evaluate predicates)
assert_snapshot!(
- single_file_parquet_test().await.query("select min(a), max(b) from parquet_table WHERE a > 150").await,
+ test.query("select min(a), max(b) from parquet_table WHERE a > 150").await,
@r"
------- Query Output (1 rows) -------
+----------------------+----------------------+
@@ -179,10 +292,12 @@ async fn query_single_parquet_file_with_single_predicate() {
#[tokio::test]
async fn query_single_parquet_file_multi_row_groups_multiple_predicates() {
+ let test = Test::new().with_single_file_parquet().await;
+
// Note that evaluating predicates requires additional object store requests
// (to evaluate predicates)
assert_snapshot!(
- single_file_parquet_test().await.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await,
+ test.query("select min(a), max(b) from parquet_table WHERE a > 50 AND b < 1150").await,
@r"
------- Query Output (1 rows) -------
+----------------------+----------------------+
@@ -200,75 +315,16 @@ async fn query_single_parquet_file_multi_row_groups_multiple_predicates() {
);
}
-/// Create a test with a single CSV file with three columns and two rows
-async fn single_file_csv_test() -> Test {
- // upload CSV data to object store
- let csv_data = r#"c1,c2,c3
-0.00001,5e-12,true
-0.00002,4e-12,false
-"#;
-
- Test::new()
- .with_bytes("/csv_table.csv", csv_data)
- .await
- .register_csv("csv_table", "/csv_table.csv")
- .await
-}
-
-/// Create a test with three CSV files in a directory
-async fn multi_file_csv_test() -> Test {
- let mut test = Test::new();
- // upload CSV data to object store
- for i in 0..3 {
- let csv_data1 = format!(
- r#"c1,c2,c3
-0.0000{i},{i}e-12,true
-0.00003,5e-12,false
-"#
- );
- test = test
- .with_bytes(&format!("/data/file_{i}.csv"), csv_data1)
- .await;
- }
- // register table
- test.register_csv("csv_table", "/data/").await
-}
-
-/// Create a test with a single parquet file that has two
-/// columns and two row groups
-///
-/// Column "a": Int32 with values 0-100] in row group 1
-/// and [101-200] in row group 2
-///
-/// Column "b": Int32 with values 1000-1100] in row group 1
-/// and [1101-1200] in row group 2
-async fn single_file_parquet_test() -> Test {
- // Create parquet bytes
- let a: ArrayRef = Arc::new(Int32Array::from_iter_values(0..200));
- let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200));
- let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap();
-
- let mut buffer = vec![];
- let props = parquet::file::properties::WriterProperties::builder()
- .set_max_row_group_size(100)
- .build();
- let mut writer =
- parquet::arrow::ArrowWriter::try_new(&mut buffer, batch.schema(), Some(props))
- .unwrap();
- writer.write(&batch).unwrap();
- writer.close().unwrap();
-
- Test::new()
- .with_bytes("/parquet_table.parquet", buffer)
- .await
- .register_parquet("parquet_table", "/parquet_table.parquet")
- .await
-}
-
/// Runs tests with a request counting object store
struct Test {
object_store: Arc<RequestCountingObjectStore>,
session_context: SessionContext,
+ /// metadata size hint to use when registering parquet files
+ ///
+ /// * `None`: uses the default (does not set a size_hint)
+ /// * `Some(None)`: set prefetch hint to None (no prefetching)
+ /// * `Some(Some(size))`: set prefetch hint to size
+ parquet_metadata_size_hint: Option<Option<usize>>,
}
impl Test {
@@ -281,9 +337,16 @@ impl Test {
Self {
object_store,
session_context,
+ parquet_metadata_size_hint: None,
}
}
+ /// Specify the metadata size hint to use when registering parquet files
+ fn with_parquet_metadata_size_hint(mut self, size_hint: Option<usize>) -> Self {
+ self.parquet_metadata_size_hint = Some(size_hint);
+ self
+ }
+
/// Returns a string representation of all recorded requests thus far
fn requests(&self) -> String {
format!("{}", self.object_store)
@@ -312,16 +375,88 @@ impl Test {
self
}
- /// Register a CSV file at the given path relative to the [`datafusion_test_data`] directory
+ /// Register a Parquet file at the given path relative to the
+ /// [`datafusion_test_data`] directory
async fn register_parquet(self, table_name: &str, path: &str) -> Self {
let path = format!("mem://{path}");
+ let mut options: ParquetReadOptions<'_> = ParquetReadOptions::new();
+
+ // If a metadata size hint was specified, apply it
+ if let Some(parquet_metadata_size_hint) = self.parquet_metadata_size_hint {
+ options = options.metadata_size_hint(parquet_metadata_size_hint);
+ }
+
self.session_context
- .register_parquet(table_name, path, Default::default())
+ .register_parquet(table_name, path, options)
.await
.unwrap();
self
}
+ /// Register a single CSV file with three columns and two rows named
+ /// `csv_table`
+ async fn with_single_file_csv(self) -> Test {
+ // upload CSV data to object store
+ let csv_data = r#"c1,c2,c3
+0.00001,5e-12,true
+0.00002,4e-12,false
+"#;
+ self.with_bytes("/csv_table.csv", csv_data)
+ .await
+ .register_csv("csv_table", "/csv_table.csv")
+ .await
+ }
+
+ /// Register three CSV files in a directory, called `csv_table`
+ async fn with_multi_file_csv(mut self) -> Test {
+ // upload CSV data to object store
+ for i in 0..3 {
+ let csv_data1 = format!(
+ r#"c1,c2,c3
+0.0000{i},{i}e-12,true
+0.00003,5e-12,false
+"#
+ );
+ self = self
+ .with_bytes(&format!("/data/file_{i}.csv"), csv_data1)
+ .await;
+ }
+ // register table
+ self.register_csv("csv_table", "/data/").await
+ }
+
+ /// Add a single parquet file that has two columns and two row groups named `parquet_table`
+ ///
+ /// Column "a": Int32 with values [0-100] in row group 1
+ /// and [101-200] in row group 2
+ ///
+ /// Column "b": Int32 with values [1000-1100] in row group 1
+ /// and [1101-1200] in row group 2
+ async fn with_single_file_parquet(self) -> Test {
+ // Create parquet bytes
+ let a: ArrayRef = Arc::new(Int32Array::from_iter_values(0..200));
+ let b: ArrayRef = Arc::new(Int32Array::from_iter_values(1000..1200));
+ let batch = RecordBatch::try_from_iter([("a", a), ("b", b)]).unwrap();
+
+ let mut buffer = vec![];
+ let props = parquet::file::properties::WriterProperties::builder()
+ .set_max_row_group_size(100)
+ .build();
+ let mut writer = parquet::arrow::ArrowWriter::try_new(
+ &mut buffer,
+ batch.schema(),
+ Some(props),
+ )
+ .unwrap();
+ writer.write(&batch).unwrap();
+ writer.close().unwrap();
+
+ self.with_bytes("/parquet_table.parquet", buffer)
+ .await
+ .register_parquet("parquet_table", "/parquet_table.parquet")
+ .await
+ }
+
/// Runs the specified query and returns a string representation of the results
/// suitable for comparison with insta snapshots
///
diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt
index b15ec02637..f1cc4c7a0c 100644
--- a/datafusion/sqllogictest/test_files/information_schema.slt
+++ b/datafusion/sqllogictest/test_files/information_schema.slt
@@ -246,7 +246,7 @@ datafusion.execution.parquet.max_predicate_cache_size NULL
datafusion.execution.parquet.max_row_group_size 1048576
datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2
datafusion.execution.parquet.maximum_parallel_row_group_writers 1
-datafusion.execution.parquet.metadata_size_hint NULL
+datafusion.execution.parquet.metadata_size_hint 524288
datafusion.execution.parquet.pruning true
datafusion.execution.parquet.pushdown_filters false
datafusion.execution.parquet.reorder_filters false
@@ -366,7 +366,7 @@ datafusion.execution.parquet.max_predicate_cache_size NULL (reading) The maximum
datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read.
datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-me [...]
datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory dat [...]
-datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer
+datafusion.execution.parquet.metadata_size_hint 524288 (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per parquet file. If the metadata is larger than the [...]
datafusion.execution.parquet.pruning true (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file
datafusion.execution.parquet.pushdown_filters false (reading) If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization".
datafusion.execution.parquet.reorder_filters false (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index fbf55a5605..7ca5eb8f7b 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -81,7 +81,7 @@ The following configuration settings are available:
| datafusion.execution.parquet.enable_page_index | true | (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. [...]
| datafusion.execution.parquet.pruning | true | (reading) If true, the parquet reader attempts to skip entire row groups based on the predicate in the query and the metadata (min/max values) stored in the parquet file [...]
| datafusion.execution.parquet.skip_metadata | true | (reading) If true, the parquet reader skips the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata [...]
-| datafusion.execution.parquet.metadata_size_hint | NULL | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer [...]
+| datafusion.execution.parquet.metadata_size_hint | 524288 | (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer Default setting to 512 KiB, which should be sufficient for most parquet files, it can reduce one I/O operation per [...]
| datafusion.execution.parquet.pushdown_filters | false | (reading) If true, filter expressions are applied during the parquet decoding operation to reduce the number of rows decoded. This optimization is sometimes called "late materialization". [...]
| datafusion.execution.parquet.reorder_filters | false | (reading) If true, filter expressions evaluated during the parquet decoding operation will be reordered heuristically to minimize the cost of evaluation. If false, the filters are applied in the same order as written in the query [...]
| datafusion.execution.parquet.schema_force_view_types | true | (reading) If true, parquet reader will read columns of `Utf8/Utf8Large` with `Utf8View`, and `Binary/BinaryLarge` with `BinaryView`. [...]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]