[ https://issues.apache.org/jira/browse/HIVE-28513 ]
zhaolong deleted comment on HIVE-28513:
---------------------------------
was (Author: fsilent):
on ( D.column4_4 = 1 ) OR ( T1.column10_2 = '0') should be on ( D.column4_4
= ‘1’) OR ( T1.column10_2 = '0') because D.column4_4 is a string type
> when cbo is false,join with 'or' condition cause wrong result
> -------------------------------------------------------------
>
> Key: HIVE-28513
> URL: https://issues.apache.org/jira/browse/HIVE-28513
> Project: Hive
> Issue Type: Bug
> Components: CBO
> Affects Versions: 4.0.0-beta-1, 4.0.1
> Reporter: lotan
> Priority: Major
> Attachments: Reproduce SQL statements..txt,
> image-2024-09-10-09-56-28-750.png, image-2024-09-10-09-57-42-902.png
>
>
> In the 4 table join scenario, when join on is set to "or" for filtering and
> is a fixed value condition, data is incorrectly listed.
> The problem is reproduced as follows:
> CREATE TABLE `table1`(
> `column1` varchar(90),
> `column2` varchar(11) ,
>
> `column3` varchar(18),
> `column4` varchar(3) ,
>
> `column5` varchar(12) ,
> `column6` varchar(29) ,
>
> `column7` varchar(8) ,
>
> `column8` varchar(11) ,
>
> `column9` varchar(5),
> `column10` varchar(2) )
> PARTITIONED BY (
>
> `pt_dt` varchar(10) )
> ROW FORMAT SERDE
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
>
> WITH SERDEPROPERTIES (
>
> 'field.delim'='27',
>
> 'serialization.format'='27')
>
> STORED AS INPUTFORMAT
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
>
> OUTPUTFORMAT
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
> ;
>
>
> CREATE TABLE `table2`(
> `column2_1` varchar(9) ,
> `column2_2` varchar(30) ,
> `column2_3` varchar(30),
> `column2_4` varchar(90) ,
>
> `column2_5` varchar(15))
> PARTITIONED BY (
>
> `pt_dt` varchar(10))
>
> ROW FORMAT SERDE
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
>
> WITH SERDEPROPERTIES (
>
> 'field.delim'='27',
>
> 'serialization.format'='27')
>
> STORED AS INPUTFORMAT
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
>
> OUTPUTFORMAT
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
> ;
>
>
> CREATE TABLE `table3`(
> `column3_1` varchar(30) ,
> `column3_2` varchar(30) ,
> `column3_3` varchar(30) ,
> `column3_4` varchar(30),
> `column3_5` varchar(30))
>
> ROW FORMAT SERDE
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
>
> WITH SERDEPROPERTIES (
>
> 'field.delim'='27',
>
> 'serialization.format'='27')
>
> STORED AS INPUTFORMAT
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
>
> OUTPUTFORMAT
>
> 'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat' ;
>
>
> CREATE TABLE `table4`(
>
> `column4_2` string ,
> `column4_3` string ,
> `column4_1` string ,
> `column4_4` string )
>
> PARTITIONED BY (
> `pt_dt` string)
> ROW FORMAT SERDE
> 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
> WITH SERDEPROPERTIES (
> 'field.delim'='27',
> 'serialization.format'='27')
> STORED AS INPUTFORMAT
> 'org.apache.hadoop.mapred.TextInputFormat'
> OUTPUTFORMAT
> 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' ;
> Insert Reproduction Data:
> insert into table1 partition (pt_dt='2023-11-30')
> (column6,column3,column7,column2,column5,column10,column1,column8,column9,column4)
> values
> (16160200980000057,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(27120403980000164,0140010000296,00002,0000001,'2022-11-30',1,1,1234567,12345,1),(26040204980001179,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(20100213980049933,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(16070091980002440,0140010000296,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(16070017980004555,0030100000004,00001,0000001,'2022-11-30',1,1,1234567,12345,1),(06060822980002332,0030100000013,00001,0000001,'2022-11-30',1,1,1234567,12345,1);
> insert into table2 partition (pt_dt='2023-11-30')
> (column2_1,column2_4,column2_2,column2_3,column2_5) values
> ('S6',1,016160020000000055,027120000600001061,0161600203),('S6',1,027120000600001061,016160020000000055,0271200006),('S6',1,026040000700001541,026040000700001541,0260400007),('S6',1,020100025900019726,016070001700001229,0201000259),('S6',1,016070009100000471,016070001700001229,0160700091),('S6',1,016070001700001229,016070009100000471,0160700296),('S6',1,006060000500001050,016070009100000471,0060600005);
> Run the reproduction SQL statement:
> SELECT
> T2.column2_5
> ,T2.column2_2
> ,T6.column3_5
> ,T6.column3_2
> ,T6.column3_4
> ,T6.column3_3
> ,T1.column10
> ,T2.column2_4
> FROM (SELECT
> column8
> ,column1
> ,column5
> ,column9
> ,column4
> ,PT_DT
> ,column10
> ,column10 AS column10_2
> FROM table1
> WHERE PT_DT= '2023-11-30'
> ) T1
> INNER JOIN (SELECT column2_4
> ,column2_3
> ,column2_2
> ,column2_5
> FROM table2 A
> WHERE PT_DT = '2023-11-30'
> )T2
> ON T1.column1 = T2.column2_4
> LEFT JOIN (SELECT column4_1,column4_2,column4_3,column4_4
> FROM table4
> WHERE PT_DT='2023-11-30'
> ) D
> on ( D.column4_4 = 1 ) OR ( T1.column10_2 = '0')
> LEFT JOIN ( SELECT
> F.column3_1
> ,F.column3_2
> ,F.column3_3
> ,F.column3_4
> ,F.column3_5
> FROM table3 F
> ) T6
> ON T2.column2_5 = T6.column3_1
> WHERE D.column4_3 IS NULL ;
>
> Expected data:
> !image-2024-09-10-09-56-28-750.png|width=981,height=219!
> Abnormal data:
> !image-2024-09-10-09-57-42-902.png!
>
>
--
This message was sent by Atlassian Jira
(v8.20.10#820010)