This is an automated email from the ASF dual-hosted git repository.
github-bot pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git
The following commit(s) were added to refs/heads/main by this push:
new 4e45c19d10 Enables DefaultListFilesCache by default (#19366)
4e45c19d10 is described below
commit 4e45c19d10600fe12442e6855c52ab3afcaa8493
Author: Blake Orth <[email protected]>
AuthorDate: Tue Dec 30 17:45:15 2025 -0700
Enables DefaultListFilesCache by default (#19366)
## Which issue does this PR close?
- Closes https://github.com/apache/datafusion/issues/18827
- Closes https://github.com/apache/datafusion/issues/9654
## Rationale for this change
Now that the `DefaultListFilesCache` can be configured by users, it's
safe to enable it by default and fix the tests that caching broke!
## What changes are included in this PR?
- Sets the DefaultListFilesCache to be enabled by default
- Adds additional object store access tests to show list caching
behavior
- Adds variable setting/reading sqllogic test cases
- Updates tests to disable caching when they relied on COPY commands so
changes can be detected for each query
- Updates docs to help users upgrade
## Are these changes tested?
Yes, additional test cases have been added to help show the behavior of
the caching.
## Are there any user-facing changes?
Yes, this changes the default behavior of DataFusion; however, this
information is already captured in the upgrade guide.
##
cc @alamb
---------
Co-authored-by: Andrew Lamb <[email protected]>
---
.../core/tests/datasource/object_store_access.rs | 76 +++++++++++++++++-----
datafusion/execution/src/cache/cache_manager.rs | 27 +++++---
datafusion/sqllogictest/test_files/parquet.slt | 4 ++
.../sqllogictest/test_files/repartition_scan.slt | 4 ++
.../sqllogictest/test_files/set_variable.slt | 18 +++++
docs/source/library-user-guide/upgrading.md | 17 ++++-
6 files changed, 116 insertions(+), 30 deletions(-)
diff --git a/datafusion/core/tests/datasource/object_store_access.rs
b/datafusion/core/tests/datasource/object_store_access.rs
index 2e1b148407..561de21520 100644
--- a/datafusion/core/tests/datasource/object_store_access.rs
+++ b/datafusion/core/tests/datasource/object_store_access.rs
@@ -117,15 +117,40 @@ async fn multi_query_multi_file_csv_file() {
+---------+-------+-------+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 4
- - LIST prefix=data
+ Total Requests: 3
- GET (opts) path=data/file_0.csv
- GET (opts) path=data/file_1.csv
- GET (opts) path=data/file_2.csv
"
);
- // the second query should re-use the cached LIST results and should not
reissue LIST
+ // Force a cache eviction by removing the data limit for the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"0K\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // Then re-enable the cache
+ assert_snapshot!(
+ test.query("set
datafusion.runtime.list_files_cache_limit=\"1M\"").await,
+ @r"
+ ------- Query Output (0 rows) -------
+ ++
+ ++
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 0
+ "
+ );
+
+ // this query should list the table since the cache entries were evicted
assert_snapshot!(
test.query("select * from csv_table").await,
@r"
@@ -149,6 +174,30 @@ async fn multi_query_multi_file_csv_file() {
- GET (opts) path=data/file_2.csv
"
);
+
+ // this query should not list the table since the entries were added in
the previous query
+ assert_snapshot!(
+ test.query("select * from csv_table").await,
+ @r"
+ ------- Query Output (6 rows) -------
+ +---------+-------+-------+
+ | c1 | c2 | c3 |
+ +---------+-------+-------+
+ | 0.0 | 0.0 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00001 | 1e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ | 0.00002 | 2e-12 | true |
+ | 0.00003 | 5e-12 | false |
+ +---------+-------+-------+
+ ------- Object Store Request Summary -------
+ RequestCountingObjectStore()
+ Total Requests: 3
+ - GET (opts) path=data/file_0.csv
+ - GET (opts) path=data/file_1.csv
+ - GET (opts) path=data/file_2.csv
+ "
+ );
}
#[tokio::test]
@@ -170,8 +219,7 @@ async fn query_multi_csv_file() {
+---------+-------+-------+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 4
- - LIST prefix=data
+ Total Requests: 3
- GET (opts) path=data/file_0.csv
- GET (opts) path=data/file_1.csv
- GET (opts) path=data/file_2.csv
@@ -198,8 +246,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 4
- - LIST prefix=data
+ Total Requests: 3
- GET (opts) path=data/a=1/b=10/c=100/file_1.csv
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
- GET (opts) path=data/a=3/b=30/c=300/file_3.csv
@@ -218,8 +265,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 2
- - LIST prefix=data/a=2
+ Total Requests: 1
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
"
);
@@ -236,8 +282,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 2
- - LIST prefix=data
+ Total Requests: 1
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
"
);
@@ -254,8 +299,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 2
- - LIST prefix=data
+ Total Requests: 1
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
"
);
@@ -272,8 +316,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 2
- - LIST prefix=data/a=2/b=20
+ Total Requests: 1
- GET (opts) path=data/a=2/b=20/c=200/file_2.csv
"
);
@@ -290,8 +333,7 @@ async fn query_partitioned_csv_file() {
+---------+-------+-------+---+----+-----+
------- Object Store Request Summary -------
RequestCountingObjectStore()
- Total Requests: 2
- - LIST prefix=data
+ Total Requests: 1
- GET (opts) path=data/a=1/b=10/c=100/file_1.csv
"
);
diff --git a/datafusion/execution/src/cache/cache_manager.rs
b/datafusion/execution/src/cache/cache_manager.rs
index 1ff35f2707..c76a68c651 100644
--- a/datafusion/execution/src/cache/cache_manager.rs
+++ b/datafusion/execution/src/cache/cache_manager.rs
@@ -15,8 +15,8 @@
// specific language governing permissions and limitations
// under the License.
-use crate::cache::CacheAccessor;
use crate::cache::cache_unit::DefaultFilesMetadataCache;
+use crate::cache::{CacheAccessor, DefaultListFilesCache};
use datafusion_common::stats::Precision;
use datafusion_common::{Result, Statistics};
use object_store::ObjectMeta;
@@ -190,18 +190,25 @@ impl CacheManager {
let file_statistic_cache =
config.table_files_statistics_cache.as_ref().map(Arc::clone);
- let list_files_cache = config
- .list_files_cache
- .as_ref()
- .inspect(|c| {
+ let list_files_cache = match &config.list_files_cache {
+ Some(lfc) if config.list_files_cache_limit > 0 => {
// the cache memory limit or ttl might have changed, ensure
they are updated
- c.update_cache_limit(config.list_files_cache_limit);
+ lfc.update_cache_limit(config.list_files_cache_limit);
// Only update TTL if explicitly set in config, otherwise
preserve the cache's existing TTL
if let Some(ttl) = config.list_files_cache_ttl {
- c.update_cache_ttl(Some(ttl));
+ lfc.update_cache_ttl(Some(ttl));
}
- })
- .map(Arc::clone);
+ Some(Arc::clone(lfc))
+ }
+ None if config.list_files_cache_limit > 0 => {
+ let lfc: Arc<dyn ListFilesCache> =
Arc::new(DefaultListFilesCache::new(
+ config.list_files_cache_limit,
+ config.list_files_cache_ttl,
+ ));
+ Some(lfc)
+ }
+ _ => None,
+ };
let file_metadata_cache = config
.file_metadata_cache
@@ -235,7 +242,7 @@ impl CacheManager {
pub fn get_list_files_cache_limit(&self) -> usize {
self.list_files_cache
.as_ref()
- .map_or(DEFAULT_LIST_FILES_CACHE_MEMORY_LIMIT, |c| c.cache_limit())
+ .map_or(0, |c| c.cache_limit())
}
/// Get the TTL (time-to-live) of the list files cache.
diff --git a/datafusion/sqllogictest/test_files/parquet.slt
b/datafusion/sqllogictest/test_files/parquet.slt
index c786f7bdc7..be713b963b 100644
--- a/datafusion/sqllogictest/test_files/parquet.slt
+++ b/datafusion/sqllogictest/test_files/parquet.slt
@@ -21,6 +21,10 @@
statement ok
set datafusion.execution.target_partitions = 2;
+# disable the listing cache so DataFusion picks up changes from COPY statements
+statement ok
+set datafusion.runtime.list_files_cache_limit = "0K";
+
# Create a table as a data source
statement ok
CREATE TABLE src_table (
diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt
b/datafusion/sqllogictest/test_files/repartition_scan.slt
index 06ea22761d..c9c2f91257 100644
--- a/datafusion/sqllogictest/test_files/repartition_scan.slt
+++ b/datafusion/sqllogictest/test_files/repartition_scan.slt
@@ -27,6 +27,10 @@ set datafusion.execution.target_partitions = 4;
statement ok
set datafusion.optimizer.repartition_file_min_size = 1;
+# disable the listing cache so DataFusion picks up changes from COPY statements
+statement ok
+set datafusion.runtime.list_files_cache_limit = "0K";
+
###################
### Parquet tests
###################
diff --git a/datafusion/sqllogictest/test_files/set_variable.slt
b/datafusion/sqllogictest/test_files/set_variable.slt
index 8957404799..c444128b18 100644
--- a/datafusion/sqllogictest/test_files/set_variable.slt
+++ b/datafusion/sqllogictest/test_files/set_variable.slt
@@ -416,6 +416,24 @@ SHOW datafusion.runtime.metadata_cache_limit
----
datafusion.runtime.metadata_cache_limit 200M
+# Test SET and SHOW runtime.list_files_cache_limit
+statement ok
+SET datafusion.runtime.list_files_cache_limit = '2M'
+
+query TT
+SHOW datafusion.runtime.list_files_cache_limit
+----
+datafusion.runtime.list_files_cache_limit 2M
+
+# Test SET and SHOW runtime.list_files_cache_ttl
+statement ok
+SET datafusion.runtime.list_files_cache_ttl = '90s'
+
+query TT
+SHOW datafusion.runtime.list_files_cache_ttl
+----
+datafusion.runtime.list_files_cache_ttl 1m30s
+
# Note: runtime.temp_directory shows the actual temp directory path with a
unique suffix,
# so we cannot test the exact value. We verify it exists in information_schema
instead.
diff --git a/docs/source/library-user-guide/upgrading.md
b/docs/source/library-user-guide/upgrading.md
index 39d52bd590..6b24c97ea4 100644
--- a/docs/source/library-user-guide/upgrading.md
+++ b/docs/source/library-user-guide/upgrading.md
@@ -45,15 +45,26 @@ directly on the `Field`. For example:
In prior versions, `ListingTableProvider` would issue `LIST` commands to
the underlying object store each time it needed to list files for a query.
To improve performance, `ListingTableProvider` now caches the results of
-`LIST` commands for the lifetime of the `ListingTableProvider` instance.
+`LIST` commands for the lifetime of the `ListingTableProvider` instance or
+until a cache entry expires.
Note that by default the cache has no expiration time, so if files are added
or removed
from the underlying object store, the `ListingTableProvider` will not see
those changes until the `ListingTableProvider` instance is dropped and
recreated.
-You will be able to configure the maximum cache size and cache expiration time
via a configuration option:
+You can configure the maximum cache size and cache entry expiration time via
configuration options:
-See <https://github.com/apache/datafusion/issues/19056> for more details.
+- `datafusion.runtime.list_files_cache_limit` - Limits the size of the cache
in bytes
+- `datafusion.runtime.list_files_cache_ttl` - Limits the TTL (time-to-live) of
an entry in seconds
+
+Detailed configuration information can be found in the [DataFusion Runtime
+Configuration](https://datafusion.apache.org/user-guide/configs.html#runtime-configuration-settings)
user's guide.
+
+Caching can be disabled by setting the limit to 0:
+
+```sql
+SET datafusion.runtime.list_files_cache_limit TO "0K";
+```
Note that the internal API has changed to use a trait `ListFilesCache` instead
of a type alias.
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]