Dhanesh Padmanabhan created HIVE-20525:
------------------------------------------
Summary: Hive UNION ALL query gives wrong results with
hive.optimize.skewjoin=true
Key: HIVE-20525
URL: https://issues.apache.org/jira/browse/HIVE-20525
Project: Hive
Issue Type: Bug
Affects Versions: 2.1.1, 1.2.1
Reporter: Dhanesh Padmanabhan
Following query gives 0 rows instead of the expected 3 rows. Tested on dataproc
1.2.47-deb8 with the default hive version 2.1.1. The query throws an
ArrayIndexOutOfBoundsException if hive.auto.convert.join=true.
On HDP2.6.4 sandbox, hive 1.2.1 version this gives 2 rows instead of 3
{code:java}
set hive.optimize.skewjoin=true;
set hive.auto.convert.join=false;
CREATE TABLE t_ins (user_id bigint, app_id string, store string, oem string ,dt
string);
CREATE TABLE t_fa (user_id bigint, app_type string, build_name string, dt
string);
CREATE TABLE t_la (user_id bigint, oem string, dt string);
CREATE TABLE t_tu (user_id bigint, new_user_id bigint, dt string);
CREATE TABLE t_duc (country_code string, user_id bigint);
CREATE TABLE t_app (name string, app_id string, dt string);
INSERT INTO t_ins VALUES (1234567, 'xyzuvw','store1','OEM1','20180814');
INSERT INTO t_app VALUES ('app1','xyzuvw','20180814');
INSERT INTO t_fa VALUES (1234567,'app1','store1','20180814');
INSERT INTO t_la VALUES (1234567,'OEM1','20180814');
INSERT INTO t_tu VALUES (1234567,1111111,'20180814');
INSERT INTO t_duc VALUES ('IN',1234567);
INSERT INTO t_fa VALUES (7654321,'app2','store2','20180814');
INSERT INTO t_la VALUES (7654321,'OEM2','20180814');
INSERT INTO t_tu VALUES (7654321,1111111,'20180814');
INSERT INTO t_duc VALUES ('SE',7654321);
CREATE TABLE `abs`(
`country_code` string,
`app_type` string,
`build_name` string,
`device` string,
`new_user_id` string,
`user_type_1` smallint,
`user_type_2` smallint)
PARTITIONED BY (
`dt` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.ql.io.orc.OrcSerde'
STORED AS INPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat'
;
INSERT OVERWRITE TABLE abs PARTITION (dt=20180814)
SELECT
duc.country_code country_code,
app.name app_type,
i.store build_name,
i.oem device,
tu.new_user_id new_user_id,
1 user_type_1,
0 user_type_2
FROM
t_ins i
JOIN t_app app ON app.app_id = i.app_id AND app.dt = 20180814
LEFT JOIN t_duc duc on i.user_id = duc.user_id
LEFT JOIN t_tu tu on tu.user_id = i.user_id and tu.dt = 20180814
WHERE
i.dt = 20180814
GROUP BY
duc.country_code,
app.name,
i.store,
i.oem,
tu.new_user_id
UNION ALL
SELECT
duc.country_code country_code,
fa.app_type app_type,
fa.build_name build_name,
la.oem device,
tu.new_user_id new_user_id,
0 user_type_1,
1 user_type_2
FROM
t_fa fa
LEFT JOIN t_duc duc on fa.user_id = duc.user_id
LEFT JOIN t_tu tu on tu.user_id = fa.user_id and tu.dt = 20180814
LEFT JOIN t_la la ON fa.user_id = la.user_id AND la.dt = 20180814
WHERE
fa.dt = 20180814
GROUP BY
duc.country_code,
fa.app_type,
fa.build_name,
la.oem,
tu.new_user_id
;
select * from abs where dt=20180814;
{code}
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)