AndreaBozzo commented on PR #21141: URL: https://github.com/apache/datafusion/pull/21141#issuecomment-4164896115
> Any updates on testing @AndreaBozzo ? LGTM @alamb @hareshkh , i just wanted to run few additional slts (which pass), also leaving them in the dropdown menu <details> <summary>Additional stress tests (click to expand)</summary> ```sql # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. statement ok CREATE TABLE data(id INT, value INT, category TEXT) AS VALUES (1, 10, 'A'), (2, 20, 'B'), (3, 30, 'A'), (4, 40, 'B'), (5, 50, 'A'), (6, 60, 'B'), (7, 70, 'A'), (8, 80, 'B'), (9, 90, 'A'), (10, 100, 'B'); ############################################### # 1. OFFSET (skip) — filter must NOT be pushed past OFFSET ############################################### # Skip first 3 rows (by value ASC: 10,20,30), then filter value < 50. # Correct: after skipping, rows are 40,50,60,70,80,90,100 → filter keeps only 40. # Bug: if filter pushed below OFFSET, it would filter first then skip. query II SELECT id, value FROM (SELECT * FROM data ORDER BY value LIMIT 7 OFFSET 3) sub WHERE sub.value < 50; ---- 4 40 # Verify the plan shape: filter stays above sort+fetch query TT EXPLAIN SELECT id, value FROM (SELECT * FROM data ORDER BY value LIMIT 7 OFFSET 3) sub WHERE sub.value < 50; ---- logical_plan 01)SubqueryAlias: sub 02)--Filter: data.value < Int32(50) 03)----Limit: skip=3, fetch=7 04)------Sort: data.value ASC NULLS LAST, fetch=10 05)--------TableScan: data projection=[id, value] physical_plan 01)FilterExec: value@1 < 50 02)--GlobalLimitExec: skip=3, fetch=7 03)----SortExec: TopK(fetch=10), expr=[value@1 ASC NULLS LAST], preserve_partitioning=[false] 04)------DataSourceExec: partitions=1, partition_sizes=[1] ############################################### # 2. OFFSET only (no LIMIT) — filter must NOT be pushed past OFFSET ############################################### # Skip first 7 rows (values 10..70), then filter value > 90. # Correct result: after skipping, rows are 80,90,100 → filter keeps 100. query II SELECT id, value FROM (SELECT * FROM data ORDER BY value OFFSET 7) sub WHERE sub.value > 90; ---- 10 100 ############################################### # 3. Sort with fetch + filter on a different column ############################################### # Top 5 by value ASC (ids 1-5, values 10-50), then filter by category = 'A'. # Correct: ids 1,3,5 (values 10,30,50). # Bug: if filter pushed below sort, it would filter category='A' first (ids 1,3,5,7,9) # then take top 5 → ids 1,3,5,7,9 — WRONG! query IIT SELECT * FROM (SELECT * FROM data ORDER BY value LIMIT 5) sub WHERE sub.category = 'A'; ---- 1 10 A 3 30 A 5 50 A ############################################### # 4. CTE with LIMIT — filter must not be pushed into CTE ############################################### query II WITH top3 AS (SELECT id, value FROM data ORDER BY value LIMIT 3) SELECT * FROM top3 WHERE top3.value > 15; ---- 2 20 3 30 # Verify plan shape query TT EXPLAIN WITH top3 AS (SELECT id, value FROM data ORDER BY value LIMIT 3) SELECT * FROM top3 WHERE top3.value > 15; ---- logical_plan 01)SubqueryAlias: top3 02)--Filter: data.value > Int32(15) 03)----Sort: data.value ASC NULLS LAST, fetch=3 04)------TableScan: data projection=[id, value] physical_plan 01)FilterExec: value@1 > 15 02)--SortExec: TopK(fetch=3), expr=[value@1 ASC NULLS LAST], preserve_partitioning=[false] 03)----DataSourceExec: partitions=1, partition_sizes=[1] ############################################### # 5. Nested subqueries with multiple limits ############################################### # Inner: top 5 by value ASC (10,20,30,40,50) # Middle: filter value > 20 → (30,'A',40,'B',50,'A') # Outer: take top 2 of that → (30,'A'),(40,'B') query IIT SELECT * FROM ( SELECT * FROM ( SELECT * FROM data ORDER BY value LIMIT 5 ) inner_sub WHERE inner_sub.value > 20 ORDER BY value LIMIT 2 ) outer_sub; ---- 3 30 A 4 40 B ############################################### # 6. Filter after LIMIT on TableScan (no Sort) ############################################### # Without ORDER BY, LIMIT on a TableScan is non-deterministic, # but the filter must still not be pushed past it. # We just verify the plan shape here. query TT EXPLAIN SELECT * FROM (SELECT * FROM data LIMIT 4) sub WHERE sub.value > 30; ---- logical_plan 01)SubqueryAlias: sub 02)--Filter: data.value > Int32(30) 03)----Limit: skip=0, fetch=4 04)------TableScan: data projection=[id, value, category], fetch=4 physical_plan 01)FilterExec: value@1 > 30 02)--DataSourceExec: partitions=1, partition_sizes=[1], fetch=4 ############################################### # 7. Multiple filters — none should be pushed past sort+fetch ############################################### query IIT SELECT * FROM (SELECT * FROM data ORDER BY value LIMIT 6) sub WHERE sub.value > 15 AND sub.category = 'A'; ---- 3 30 A 5 50 A ############################################### # 8. Filter after LIMIT in a JOIN ############################################### # Create a small dimension table statement ok CREATE TABLE dim(category TEXT, label TEXT) AS VALUES ('A', 'Alpha'), ('B', 'Beta'); # Join the top-3 by value with the dimension table, then filter. # Top 3: (1,10,'A'), (2,20,'B'), (3,30,'A') # After join: (1,10,'A','Alpha'), (2,20,'B','Beta'), (3,30,'A','Alpha') # Filter value > 15: (2,20,'B','Beta'), (3,30,'A','Alpha') query IITT SELECT sub.id, sub.value, sub.category, dim.label FROM (SELECT * FROM data ORDER BY value LIMIT 3) sub JOIN dim ON sub.category = dim.category WHERE sub.value > 15 ORDER BY sub.value; ---- 2 20 B Beta 3 30 A Alpha ############################################### # 9. UNION ALL with LIMIT — filter must respect limits ############################################### # Each branch takes top 2; filter must apply AFTER the limit in each branch. # Category A top 2 by value: (1,10), (3,30). Category B top 2: (2,20), (4,40). # Union: (1,10),(3,30),(2,20),(4,40). Filter value > 15: (3,30),(2,20),(4,40). query II SELECT * FROM ( SELECT * FROM (SELECT id, value FROM data WHERE category = 'A' ORDER BY value LIMIT 2) a UNION ALL SELECT * FROM (SELECT id, value FROM data WHERE category = 'B' ORDER BY value LIMIT 2) b ) sub WHERE sub.value > 15 ORDER BY value; ---- 2 20 3 30 4 40 ############################################### # 10. Correctness: verify exact row counts ############################################### # Top 4 by value → {10,20,30,40}, filter >25 → {30,40} → count=2 query I SELECT COUNT(*) FROM (SELECT * FROM data ORDER BY value LIMIT 4) sub WHERE sub.value > 25; ---- 2 # If filter were incorrectly pushed down: filter first gives {30,40,50,60,70,80,90,100}, # then limit 4 → {30,40,50,60} → count=4. So count=2 proves correctness. statement ok DROP TABLE dim; statement ok DROP TABLE data; ``` </details> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
