[jira] [Updated] (IMPALA-13262) Predicate pushdown causes incorrect results in join condition

Fang-Yu Rao (Jira) Fri, 09 Aug 2024 19:24:42 -0700


     [ 
https://issues.apache.org/jira/browse/IMPALA-13262?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Fang-Yu Rao updated IMPALA-13262:
---------------------------------
    Description: 
We found that in some scenario Apache Impala 
([https://github.com/apache/impala/commit/c539874]) could incorrectly push 
predicates to scan nodes, which in turn produces the wrong result. The 
following is a concrete example to reproduce the issue.
{code:sql}
create database impala_13262;
use impala_13262;

create table department ( dept_no integer, dept_rank integer, start_date 
timestamp,end_date timestamp);

insert into department values(1,1,'2024-01-01','2024-01-02');
insert into department values(1,2,'2024-01-02','2024-01-03');
insert into department values(1,3,'2024-01-03','2024-01-03');

create table employee (employee_no integer, depart_no integer);

insert into employee values (1,1);

// The following query should return 0 row. However Apache Impala produces one 
row.

select * from employee t1
inner join (
select * from
(
select dept_no,dept_rank,start_date,end_date
,row_number() over(partition by dept_no order by dept_rank) rn
from department
) t2
where rn=1
) t2
on t1.depart_no=t2.dept_no
where t2.start_date=t2.end_date;

set explain_level=2;

// In the output of the EXPLAIN statement, we found that the predicate 
"start_data = end_date" was pushed
// down to the scan node, which is wrong.

| 01:SCAN HDFS [impala_13262.department, RANDOM]                                
                        |
|    HDFS partitions=1/1 files=3 size=132B                                      
                        |
|    predicates: start_date = end_date                                          
                        |
|    stored statistics:                                                         
                        |
|      table: rows=unavailable size=unavailable                                 
                        |
|      columns: unavailable                                                     
                        |
|    extrapolated-rows=disabled max-scan-range-rows=unavailable                 
                        |
|    mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1           
                        |
|    tuple-ids=1 row-size=40B cardinality=1                                     
                        |
|    in pipelines: 01(GETNEXT)                                                  
                        |
+-------------------------------------------------------------------------------------------------------+
{code}
 

+*Edit:*+

The following is a smaller case to reproduce the issue. The correct result 
should be 0 row but Impala returns 1 row as above.
{code:java}
select * from
(
select dept_no,dept_rank,start_date,end_date
,row_number() over(partition by dept_no order by dept_rank) rn
from department
) t2
where rn=1 and t2.start_date=t2.end_date;
{code}
 

Recall the contents of the inline view '{*}t2{*}' above is as follows.
{code:java}
+---------+-----------+---------------------+---------------------+----+
| dept_no | dept_rank | start_date          | end_date            | rn |
+---------+-----------+---------------------+---------------------+----+
| 1       | 1         | 2024-01-01 00:00:00 | 2024-01-02 00:00:00 | 1  |
| 1       | 2         | 2024-01-02 00:00:00 | 2024-01-03 00:00:00 | 2  |
| 1       | 3         | 2024-01-03 00:00:00 | 2024-01-03 00:00:00 | 3  |
+---------+-----------+---------------------+---------------------+----+
{code}
 

On the other hand, the following query without the conjunct '{*}rn=1{*}' 
returns the correct result, which is the row with '{*}rn{*}' equal to *3* 
above. It almost looks like adding this '{*}rn=1{*}' predicate triggers the 
incorrect pushdown of '{*}t2.start_date=t2.end_date{*}' to the scan node of the 
table '{*}department{*}'.
{code:java}
select * from
(
select dept_no,dept_rank,start_date,end_date
,row_number() over(partition by dept_no order by dept_rank) rn
from department
) t2
where t2.start_date=t2.end_date;
{code}

  was:
We found that in some scenario Apache Impala 
(https://github.com/apache/impala/commit/c539874) could incorrectly push 
predicates to scan nodes, which in turn produces the wrong result. The 
following is a concrete example to reproduce the issue.
{code:sql}
create database impala_13262;
use impala_13262;

create table department ( dept_no integer, dept_rank integer, start_date 
timestamp,end_date timestamp);

insert into department values(1,1,'2024-01-01','2024-01-02');
insert into department values(1,2,'2024-01-02','2024-01-03');
insert into department values(1,3,'2024-01-03','2024-01-03');

create table employee (employee_no integer, depart_no integer);

insert into employee values (1,1);

// The following query should return 0 row. However Apache Impala produces one 
row.

select * from employee t1
inner join (
select * from
(
select dept_no,dept_rank,start_date,end_date
,row_number() over(partition by dept_no order by dept_rank) rn
from department
) t2
where rn=1
) t2
on t1.depart_no=t2.dept_no
where t2.start_date=t2.end_date;

set explain_level=2;

// In the output of the EXPLAIN statement, we found that the predicate 
"start_data = end_date" was pushed
// down to the scan node, which is wrong.

| 01:SCAN HDFS [impala_13262.department, RANDOM]                                
                        |
|    HDFS partitions=1/1 files=3 size=132B                                      
                        |
|    predicates: start_date = end_date                                          
                        |
|    stored statistics:                                                         
                        |
|      table: rows=unavailable size=unavailable                                 
                        |
|      columns: unavailable                                                     
                        |
|    extrapolated-rows=disabled max-scan-range-rows=unavailable                 
                        |
|    mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1           
                        |
|    tuple-ids=1 row-size=40B cardinality=1                                     
                        |
|    in pipelines: 01(GETNEXT)                                                  
                        |
+-------------------------------------------------------------------------------------------------------+
{code}


> Predicate pushdown causes incorrect results in join condition
> -------------------------------------------------------------
>
>                 Key: IMPALA-13262
>                 URL: https://issues.apache.org/jira/browse/IMPALA-13262
>             Project: IMPALA
>          Issue Type: Bug
>            Reporter: Fang-Yu Rao
>            Assignee: Fang-Yu Rao
>            Priority: Major
>              Labels: correctness
>
> We found that in some scenario Apache Impala 
> ([https://github.com/apache/impala/commit/c539874]) could incorrectly push 
> predicates to scan nodes, which in turn produces the wrong result. The 
> following is a concrete example to reproduce the issue.
> {code:sql}
> create database impala_13262;
> use impala_13262;
> create table department ( dept_no integer, dept_rank integer, start_date 
> timestamp,end_date timestamp);
> insert into department values(1,1,'2024-01-01','2024-01-02');
> insert into department values(1,2,'2024-01-02','2024-01-03');
> insert into department values(1,3,'2024-01-03','2024-01-03');
> create table employee (employee_no integer, depart_no integer);
> insert into employee values (1,1);
> // The following query should return 0 row. However Apache Impala produces 
> one row.
> select * from employee t1
> inner join (
> select * from
> (
> select dept_no,dept_rank,start_date,end_date
> ,row_number() over(partition by dept_no order by dept_rank) rn
> from department
> ) t2
> where rn=1
> ) t2
> on t1.depart_no=t2.dept_no
> where t2.start_date=t2.end_date;
> set explain_level=2;
> // In the output of the EXPLAIN statement, we found that the predicate 
> "start_data = end_date" was pushed
> // down to the scan node, which is wrong.
> | 01:SCAN HDFS [impala_13262.department, RANDOM]                              
>                           |
> |    HDFS partitions=1/1 files=3 size=132B                                    
>                           |
> |    predicates: start_date = end_date                                        
>                           |
> |    stored statistics:                                                       
>                           |
> |      table: rows=unavailable size=unavailable                               
>                           |
> |      columns: unavailable                                                   
>                           |
> |    extrapolated-rows=disabled max-scan-range-rows=unavailable               
>                           |
> |    mem-estimate=32.00MB mem-reservation=8.00KB thread-reservation=1         
>                           |
> |    tuple-ids=1 row-size=40B cardinality=1                                   
>                           |
> |    in pipelines: 01(GETNEXT)                                                
>                           |
> +-------------------------------------------------------------------------------------------------------+
> {code}
>  
> +*Edit:*+
> The following is a smaller case to reproduce the issue. The correct result 
> should be 0 row but Impala returns 1 row as above.
> {code:java}
> select * from
> (
> select dept_no,dept_rank,start_date,end_date
> ,row_number() over(partition by dept_no order by dept_rank) rn
> from department
> ) t2
> where rn=1 and t2.start_date=t2.end_date;
> {code}
>  
> Recall the contents of the inline view '{*}t2{*}' above is as follows.
> {code:java}
> +---------+-----------+---------------------+---------------------+----+
> | dept_no | dept_rank | start_date          | end_date            | rn |
> +---------+-----------+---------------------+---------------------+----+
> | 1       | 1         | 2024-01-01 00:00:00 | 2024-01-02 00:00:00 | 1  |
> | 1       | 2         | 2024-01-02 00:00:00 | 2024-01-03 00:00:00 | 2  |
> | 1       | 3         | 2024-01-03 00:00:00 | 2024-01-03 00:00:00 | 3  |
> +---------+-----------+---------------------+---------------------+----+
> {code}
>  
> On the other hand, the following query without the conjunct '{*}rn=1{*}' 
> returns the correct result, which is the row with '{*}rn{*}' equal to *3* 
> above. It almost looks like adding this '{*}rn=1{*}' predicate triggers the 
> incorrect pushdown of '{*}t2.start_date=t2.end_date{*}' to the scan node of 
> the table '{*}department{*}'.
> {code:java}
> select * from
> (
> select dept_no,dept_rank,start_date,end_date
> ,row_number() over(partition by dept_no order by dept_rank) rn
> from department
> ) t2
> where t2.start_date=t2.end_date;
> {code}



--
This message was sent by Atlassian Jira
(v8.20.10#820010)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-all-unsubscr...@impala.apache.org
For additional commands, e-mail: issues-all-h...@impala.apache.org

[jira] [Updated] (IMPALA-13262) Predicate pushdown causes incorrect results in join condition

Reply via email to