This is an automated email from the ASF dual-hosted git repository.

github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new 4e45c19d10 Enables DefaultListFilesCache by default (#19366)
4e45c19d10 is described below

commit 4e45c19d10600fe12442e6855c52ab3afcaa8493
Author: Blake Orth <[email protected]>
AuthorDate: Tue Dec 30 17:45:15 2025 -0700

    Enables DefaultListFilesCache by default (#19366)
    
    ## Which issue does this PR close?
    
    - Closes https://github.com/apache/datafusion/issues/18827
    - Closes https://github.com/apache/datafusion/issues/9654
    
    ## Rationale for this change
    
    Now that the `DefaultListFilesCache` can be configured by users, it's
    safe to enable it by default and fix the tests that caching broke!
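
    The two runtime settings that control this cache are shown below; both
    appear in the diff in this commit, and the values here are illustrative:

    ```sql
    -- Cap the memory used for cached LIST results; setting "0K" disables caching
    SET datafusion.runtime.list_files_cache_limit = '1M';
    -- Optionally expire cached LIST results after a duration
    SET datafusion.runtime.list_files_cache_ttl = '90s';
    ```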
    
    ## What changes are included in this PR?
    
    - Enables the `DefaultListFilesCache` by default
    - Adds additional object store access tests to show the list caching
      behavior
    - Adds sqllogictest cases for setting and reading the new variables
    - Updates tests that relied on COPY commands to disable caching, so
      changes can be detected for each query (see the snippet below)
    - Updates docs to help users upgrade
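
    The tests that depend on COPY now disable the listing cache up front; the
    snippet below is the pattern used in the affected sqllogictest files (it
    is taken verbatim from the parquet.slt change in this diff):

    ```
    # disable the listing cache so DataFusion picks up changes from COPY statements
    statement ok
    set datafusion.runtime.list_files_cache_limit = "0K";
    ```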
    
    
    ## Are these changes tested?
    
    Yes, additional test cases have been added to demonstrate the caching
    behavior.
    
    ## Are there any user-facing changes?
    
    Yes, this changes the default behavior of DataFusion; however, the
    change is already documented in the upgrade guide.
    
    cc @alamb
    
    ---------
    
    Co-authored-by: Andrew Lamb <[email protected]>
---
 .../core/tests/datasource/object_store_access.rs   | 76 +++++++++++++++++-----
 datafusion/execution/src/cache/cache_manager.rs    | 27 +++++---
 datafusion/sqllogictest/test_files/parquet.slt     |  4 ++
 .../sqllogictest/test_files/repartition_scan.slt   |  4 ++
 .../sqllogictest/test_files/set_variable.slt       | 18 +++++
 docs/source/library-user-guide/upgrading.md        | 17 ++++-
 6 files changed, 116 insertions(+), 30 deletions(-)

diff --git a/datafusion/core/tests/datasource/object_store_access.rs b/datafusion/core/tests/datasource/object_store_access.rs
index 2e1b148407..561de21520 100644
--- a/datafusion/core/tests/datasource/object_store_access.rs
+++ b/datafusion/core/tests/datasource/object_store_access.rs
@@ -117,15 +117,40 @@ async fn multi_query_multi_file_csv_file() {
     +---------+-------+-------+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 4
-    - LIST prefix=data
+    Total Requests: 3
     - GET  (opts) path=data/file_0.csv
     - GET  (opts) path=data/file_1.csv
     - GET  (opts) path=data/file_2.csv
     "
     );
 
-    // the second query should re-use the cached LIST results and should not reissue LIST
+    // Force a cache eviction by setting the cache's memory limit to zero
+    assert_snapshot!(
+        test.query("set 
datafusion.runtime.list_files_cache_limit=\"0K\"").await,
+        @r"
+    ------- Query Output (0 rows) -------
+    ++
+    ++
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 0
+    "
+    );
+
+    // Then re-enable the cache
+    assert_snapshot!(
+        test.query("set 
datafusion.runtime.list_files_cache_limit=\"1M\"").await,
+        @r"
+    ------- Query Output (0 rows) -------
+    ++
+    ++
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 0
+    "
+    );
+
+    // this query should list the table since the cache entries were evicted
     assert_snapshot!(
         test.query("select * from csv_table").await,
         @r"
@@ -149,6 +174,30 @@ async fn multi_query_multi_file_csv_file() {
     - GET  (opts) path=data/file_2.csv
     "
     );
+
+    // this query should not list the table since the entries were added in the previous query
+    assert_snapshot!(
+        test.query("select * from csv_table").await,
+        @r"
+    ------- Query Output (6 rows) -------
+    +---------+-------+-------+
+    | c1      | c2    | c3    |
+    +---------+-------+-------+
+    | 0.0     | 0.0   | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00001 | 1e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    | 0.00002 | 2e-12 | true  |
+    | 0.00003 | 5e-12 | false |
+    +---------+-------+-------+
+    ------- Object Store Request Summary -------
+    RequestCountingObjectStore()
+    Total Requests: 3
+    - GET  (opts) path=data/file_0.csv
+    - GET  (opts) path=data/file_1.csv
+    - GET  (opts) path=data/file_2.csv
+    "
+    );
 }
 
 #[tokio::test]
@@ -170,8 +219,7 @@ async fn query_multi_csv_file() {
     +---------+-------+-------+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 4
-    - LIST prefix=data
+    Total Requests: 3
     - GET  (opts) path=data/file_0.csv
     - GET  (opts) path=data/file_1.csv
     - GET  (opts) path=data/file_2.csv
@@ -198,8 +246,7 @@ async fn query_partitioned_csv_file() {
     +---------+-------+-------+---+----+-----+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 4
-    - LIST prefix=data
+    Total Requests: 3
     - GET  (opts) path=data/a=1/b=10/c=100/file_1.csv
     - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
     - GET  (opts) path=data/a=3/b=30/c=300/file_3.csv
@@ -218,8 +265,7 @@ async fn query_partitioned_csv_file() {
     +---------+-------+-------+---+----+-----+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 2
-    - LIST prefix=data/a=2
+    Total Requests: 1
     - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
     "
     );
@@ -236,8 +282,7 @@ async fn query_partitioned_csv_file() {
     +---------+-------+-------+---+----+-----+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 2
-    - LIST prefix=data
+    Total Requests: 1
     - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
     "
     );
@@ -254,8 +299,7 @@ async fn query_partitioned_csv_file() {
     +---------+-------+-------+---+----+-----+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 2
-    - LIST prefix=data
+    Total Requests: 1
     - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
     "
     );
@@ -272,8 +316,7 @@ async fn query_partitioned_csv_file() {
     +---------+-------+-------+---+----+-----+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 2
-    - LIST prefix=data/a=2/b=20
+    Total Requests: 1
     - GET  (opts) path=data/a=2/b=20/c=200/file_2.csv
     "
     );
@@ -290,8 +333,7 @@ async fn query_partitioned_csv_file() {
     +---------+-------+-------+---+----+-----+
     ------- Object Store Request Summary -------
     RequestCountingObjectStore()
-    Total Requests: 2
-    - LIST prefix=data
+    Total Requests: 1
     - GET  (opts) path=data/a=1/b=10/c=100/file_1.csv
     "
     );
diff --git a/datafusion/execution/src/cache/cache_manager.rs b/datafusion/execution/src/cache/cache_manager.rs
index 1ff35f2707..c76a68c651 100644
--- a/datafusion/execution/src/cache/cache_manager.rs
+++ b/datafusion/execution/src/cache/cache_manager.rs
@@ -15,8 +15,8 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use crate::cache::CacheAccessor;
 use crate::cache::cache_unit::DefaultFilesMetadataCache;
+use crate::cache::{CacheAccessor, DefaultListFilesCache};
 use datafusion_common::stats::Precision;
 use datafusion_common::{Result, Statistics};
 use object_store::ObjectMeta;
@@ -190,18 +190,25 @@ impl CacheManager {
         let file_statistic_cache =
             config.table_files_statistics_cache.as_ref().map(Arc::clone);
 
-        let list_files_cache = config
-            .list_files_cache
-            .as_ref()
-            .inspect(|c| {
+        let list_files_cache = match &config.list_files_cache {
+            Some(lfc) if config.list_files_cache_limit > 0 => {
                 // the cache memory limit or ttl might have changed, ensure they are updated
-                c.update_cache_limit(config.list_files_cache_limit);
+                lfc.update_cache_limit(config.list_files_cache_limit);
                 // Only update TTL if explicitly set in config, otherwise preserve the cache's existing TTL
                 if let Some(ttl) = config.list_files_cache_ttl {
-                    c.update_cache_ttl(Some(ttl));
+                    lfc.update_cache_ttl(Some(ttl));
                 }
-            })
-            .map(Arc::clone);
+                Some(Arc::clone(lfc))
+            }
+            None if config.list_files_cache_limit > 0 => {
+                let lfc: Arc<dyn ListFilesCache> = Arc::new(DefaultListFilesCache::new(
+                    config.list_files_cache_limit,
+                    config.list_files_cache_ttl,
+                ));
+                Some(lfc)
+            }
+            _ => None,
+        };
 
         let file_metadata_cache = config
             .file_metadata_cache
@@ -235,7 +242,7 @@ impl CacheManager {
     pub fn get_list_files_cache_limit(&self) -> usize {
         self.list_files_cache
             .as_ref()
-            .map_or(DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, |c| c.cache_limit())
+            .map_or(0, |c| c.cache_limit())
     }
 
     /// Get the TTL (time-to-live) of the list files cache.
diff --git a/datafusion/sqllogictest/test_files/parquet.slt b/datafusion/sqllogictest/test_files/parquet.slt
index c786f7bdc7..be713b963b 100644
--- a/datafusion/sqllogictest/test_files/parquet.slt
+++ b/datafusion/sqllogictest/test_files/parquet.slt
@@ -21,6 +21,10 @@
 statement ok
 set datafusion.execution.target_partitions = 2;
 
+# disable the listing cache so DataFusion picks up changes from COPY statements
+statement ok
+set datafusion.runtime.list_files_cache_limit = "0K";
+
 # Create a table as a data source
 statement ok
 CREATE TABLE src_table (
diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt
index 06ea22761d..c9c2f91257 100644
--- a/datafusion/sqllogictest/test_files/repartition_scan.slt
+++ b/datafusion/sqllogictest/test_files/repartition_scan.slt
@@ -27,6 +27,10 @@ set datafusion.execution.target_partitions = 4;
 statement ok
 set datafusion.optimizer.repartition_file_min_size = 1;
 
+# disable the listing cache so DataFusion picks up changes from COPY statements
+statement ok
+set datafusion.runtime.list_files_cache_limit = "0K";
+
 ###################
 ### Parquet tests
 ###################
diff --git a/datafusion/sqllogictest/test_files/set_variable.slt b/datafusion/sqllogictest/test_files/set_variable.slt
index 8957404799..c444128b18 100644
--- a/datafusion/sqllogictest/test_files/set_variable.slt
+++ b/datafusion/sqllogictest/test_files/set_variable.slt
@@ -416,6 +416,24 @@ SHOW datafusion.runtime.metadata_cache_limit
 ----
 datafusion.runtime.metadata_cache_limit 200M
 
+# Test SET and SHOW runtime.list_files_cache_limit
+statement ok
+SET datafusion.runtime.list_files_cache_limit = '2M'
+
+query TT
+SHOW datafusion.runtime.list_files_cache_limit
+----
+datafusion.runtime.list_files_cache_limit 2M
+
+# Test SET and SHOW runtime.list_files_cache_ttl
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '90s'
+
+query TT
+SHOW datafusion.runtime.list_files_cache_ttl
+----
+datafusion.runtime.list_files_cache_ttl 1m30s
+
 # Note: runtime.temp_directory shows the actual temp directory path with a unique suffix,
 # so we cannot test the exact value. We verify it exists in information_schema instead.
 
diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md
index 39d52bd590..6b24c97ea4 100644
--- a/docs/source/library-user-guide/upgrading.md
+++ b/docs/source/library-user-guide/upgrading.md
@@ -45,15 +45,26 @@ directly on the `Field`. For example:
 In prior versions, `ListingTableProvider` would issue `LIST` commands to
 the underlying object store each time it needed to list files for a query.
 To improve performance, `ListingTableProvider` now caches the results of
-`LIST` commands for the lifetime of the `ListingTableProvider` instance.
+`LIST` commands for the lifetime of the `ListingTableProvider` instance or
+until a cache entry expires.
 
 Note that by default the cache has no expiration time, so if files are added or removed
 from the underlying object store, the `ListingTableProvider` will not see
 those changes until the `ListingTableProvider` instance is dropped and recreated.
 
-You will be able to configure the maximum cache size and cache expiration time via a configuration option:
+You can configure the maximum cache size and cache entry expiration time via configuration options:
 
-See <https://github.com/apache/datafusion/issues/19056> for more details.
+- `datafusion.runtime.list_files_cache_limit` - Limits the size of the cache in bytes
+- `datafusion.runtime.list_files_cache_ttl` - Limits the TTL (time-to-live) of an entry in seconds
+
+Detailed configuration information can be found in the [DataFusion Runtime
+Configuration](https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings) user's guide.
+
+Caching can be disabled by setting the limit to 0:
+
+```sql
+SET datafusion.runtime.list_files_cache_limit TO "0K";
+```
 
 Note that the internal API has changed to use a trait `ListFilesCache` instead of a type alias.
 


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]
