This is an automated email from the ASF dual-hosted git repository.

alamb pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/datafusion.git


The following commit(s) were added to refs/heads/main by this push:
     new fe53eaf22e fix: Limit together with pushdown_filters (#13788)
fe53eaf22e is described below

commit fe53eaf22ef6b197eb96ce3723a8ce4b43c9f65b
Author: Qi Zhu <[email protected]>
AuthorDate: Mon Dec 16 23:47:50 2024 +0800

    fix: Limit together with pushdown_filters (#13788)
    
    * fix: Limit together with pushdown_filters
    
    * Fix format
    
    * Address new comments
    
    * Fix testing case to hit the problem
---
 datafusion/core/src/datasource/listing/table.rs    |  7 ++-
 .../sqllogictest/test_files/push_down_filter.slt   | 73 ++++++++++++++++++++++
 2 files changed, 79 insertions(+), 1 deletion(-)

diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs
index ffe49dd2ba..b12f37ed75 100644
--- a/datafusion/core/src/datasource/listing/table.rs
+++ b/datafusion/core/src/datasource/listing/table.rs
@@ -843,8 +843,13 @@ impl TableProvider for ListingTable {
             });
         // TODO (https://github.com/apache/datafusion/issues/11600) remove downcast_ref from here?
         let session_state = state.as_any().downcast_ref::<SessionState>().unwrap();
+
+        // We should not limit the number of partitioned files to scan if there are filters and limit
+        // at the same time. This is because the limit should be applied after the filters are applied.
+        let statistic_file_limit = if filters.is_empty() { limit } else { None };
+
         let (mut partitioned_file_lists, statistics) = self
-            .list_files_for_scan(session_state, &partition_filters, limit)
+            .list_files_for_scan(session_state, &partition_filters, statistic_file_limit)
             .await?;
 
         // if no files need to be read, return an `EmptyExec`
diff --git a/datafusion/sqllogictest/test_files/push_down_filter.slt b/datafusion/sqllogictest/test_files/push_down_filter.slt
index 64cc51b3c4..521aa33409 100644
--- a/datafusion/sqllogictest/test_files/push_down_filter.slt
+++ b/datafusion/sqllogictest/test_files/push_down_filter.slt
@@ -122,3 +122,76 @@ logical_plan
 
 statement ok
 drop table d;
+
+
+# Test push down filter with limit for parquet
+statement ok
+set datafusion.execution.parquet.pushdown_filters = true;
+
+# this one is also required to make DF skip second file due to "sufficient" amount of rows
+statement ok
+set datafusion.execution.collect_statistics = true;
+
+# Create a table as a data source
+statement ok
+CREATE TABLE src_table (
+    part_key INT,
+    value INT
+) AS VALUES(1, 0), (1, 1), (1, 100), (2, 0), (2, 2), (2, 2), (2, 100), (3, 4), (3, 5), (3, 6);
+
+
+# There will be more than 2 records filtered from the table to check that `limit 1` actually applied.
+# Setup 3 files, i.e., as many as there are partitions:
+
+# File 1:
+query I
+COPY (SELECT * FROM src_table where part_key = 1)
+TO 'test_files/scratch/parquet/test_filter_with_limit/part-0.parquet'
+STORED AS PARQUET;
+----
+3
+
+# File 2:
+query I
+COPY (SELECT * FROM src_table where part_key = 2)
+TO 'test_files/scratch/parquet/test_filter_with_limit/part-1.parquet'
+STORED AS PARQUET;
+----
+4
+
+# File 3:
+query I
+COPY (SELECT * FROM src_table where part_key = 3)
+TO 'test_files/scratch/parquet/test_filter_with_limit/part-2.parquet'
+STORED AS PARQUET;
+----
+3
+
+statement ok
+CREATE EXTERNAL TABLE test_filter_with_limit
+(
+  part_key INT,
+  value INT
+)
+STORED AS PARQUET
+LOCATION 'test_files/scratch/parquet/test_filter_with_limit/';
+
+query TT
+explain select * from test_filter_with_limit where value = 2 limit 1;
+----
+logical_plan
+01)Limit: skip=0, fetch=1
+02)--TableScan: test_filter_with_limit projection=[part_key, value], full_filters=[test_filter_with_limit.value = Int32(2)], fetch=1
+
+query II
+select * from test_filter_with_limit where value = 2 limit 1;
+----
+2 2
+
+# Tear down test_filter_with_limit table:
+statement ok
+DROP TABLE test_filter_with_limit;
+
+# Tear down src_table table:
+statement ok
+DROP TABLE src_table;


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to