[ 
https://issues.apache.org/jira/browse/HIVE-24198?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

mahesh kumar behera updated HIVE-24198:
---------------------------------------
    Description: 
{code:java}
 CREATE TABLE tbl1_n5(key int, value string) CLUSTERED BY (key) SORTED BY (key) 
INTO 2 BUCKETS ;
 CREATE TABLE tbl2_n4(key int, value string) CLUSTERED BY (key) SORTED BY (key) 
INTO 2 BUCKETS;

 set hive.auto.convert.join=true;
 set hive.optimize.bucketmapjoin = true;
 set hive.optimize.bucketmapjoin.sortedmerge = true;
 set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
 set hive.auto.convert.sortmerge.join=true;
 set hive.auto.convert.sortmerge.join.to.mapjoin=false;
 set hive.auto.convert.join.noconditionaltask.ize=1;
 set hive.optimize.semijoin.conversion = false;

 insert into tbl2_n4 values (2, 'val_2'), (0, 'val_0'), (0, 'val_0'), (0, 
'val_0'), (4, 'val_4') ,(5, 'val_5') ,(5, 'val_5') , (5, 'val_5'), (8, 
'val_8'), (9, 'val_9');

 insert into tbl1_n5 values (2, 'val_2'), (0, 'val_0'), (0, 'val_0'), (0, 
'val_0'), (4, 'val_4') ,(5, 'val_5') ,(5, 'val_5') , (5, 'val_5'), (8, 
'val_8'), (9, 'val_9');{code}
 

 
{code:java}
 Select * from (select b.key as key, count as value from tbl1_n5 b where key < 
6 group by b.key) subq1 join (select a.key as key, a.value as value from 
tbl2_n4 a where key < 6) subq2 on subq1.key = subq2.key;{code}
 

The above select is producing 0,0,0,2,4,5,5,5,5,5,5 instead of 0,0,0,2,4,5,5,5. 
The input format for sorted tables should be set to BucketizedHiveInputFormat 
instead of HiveInputFormat. This is done only for MapWork. But if the root task 
in a MapJoinWork, it is not handled. This is causing the mapper to create 
splits more than the number of buckets and resulting into extra records.

 

  was:
{code:java}
 CREATE TABLE tbl1_n5(key int, value string) CLUSTERED BY (key) SORTED BY (key) 
INTO 2 BUCKETS ;
 CREATE TABLE tbl2_n4(key int, value string) CLUSTERED BY (key) SORTED BY (key) 
INTO 2 BUCKETS;

 set hive.auto.convert.join=true;
 set hive.optimize.bucketmapjoin = true;
 set hive.optimize.bucketmapjoin.sortedmerge = true;
 set hive.input.format = org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
 set hive.auto.convert.sortmerge.join=true;
 set hive.auto.convert.sortmerge.join.to.mapjoin=false;
 set hive.auto.convert.join.noconditionaltask.ize=1;
 set hive.optimize.semijoin.conversion = false;

 insert into tbl2_n4 values (2, 'val_2'), (0, 'val_0'), (0, 'val_0'), (0, 
'val_0'), (4, 'val_4') ,(5, 'val_5') ,(5, 'val_5') , (5, 'val_5'), (8, 
'val_8'), (9, 'val_9');

 insert into tbl1_n5 values (2, 'val_2'), (0, 'val_0'), (0, 'val_0'), (0, 
'val_0'), (4, 'val_4') ,(5, 'val_5') ,(5, 'val_5') , (5, 'val_5'), (8, 
'val_8'), (9, 'val_9');{code}
 

 
{code:java}
 Select * from (select b.key as key, count as value from tbl1_n5 b where key < 
6 group by b.key) subq1 join (select a.key as key, a.value as value from 
tbl2_n4 a where key < 6) subq2 on subq1.key = subq2.key;{code}
The above select is producing 0,0,0,2,4,5,5,5,5,5,5 instead of 0,0,0,2,4,5,5,5.


> Map side SMB join is producing wrong result
> -------------------------------------------
>
>                 Key: HIVE-24198
>                 URL: https://issues.apache.org/jira/browse/HIVE-24198
>             Project: Hive
>          Issue Type: Bug
>          Components: Hive
>            Reporter: mahesh kumar behera
>            Assignee: mahesh kumar behera
>            Priority: Major
>
> {code:java}
>  CREATE TABLE tbl1_n5(key int, value string) CLUSTERED BY (key) SORTED BY 
> (key) INTO 2 BUCKETS ;
>  CREATE TABLE tbl2_n4(key int, value string) CLUSTERED BY (key) SORTED BY 
> (key) INTO 2 BUCKETS;
>  set hive.auto.convert.join=true;
>  set hive.optimize.bucketmapjoin = true;
>  set hive.optimize.bucketmapjoin.sortedmerge = true;
>  set hive.input.format = 
> org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
>  set hive.auto.convert.sortmerge.join=true;
>  set hive.auto.convert.sortmerge.join.to.mapjoin=false;
>  set hive.auto.convert.join.noconditionaltask.ize=1;
>  set hive.optimize.semijoin.conversion = false;
>  insert into tbl2_n4 values (2, 'val_2'), (0, 'val_0'), (0, 'val_0'), (0, 
> 'val_0'), (4, 'val_4') ,(5, 'val_5') ,(5, 'val_5') , (5, 'val_5'), (8, 
> 'val_8'), (9, 'val_9');
>  insert into tbl1_n5 values (2, 'val_2'), (0, 'val_0'), (0, 'val_0'), (0, 
> 'val_0'), (4, 'val_4') ,(5, 'val_5') ,(5, 'val_5') , (5, 'val_5'), (8, 
> 'val_8'), (9, 'val_9');{code}
>  
>  
> {code:java}
>  Select * from (select b.key as key, count as value from tbl1_n5 b where key 
> < 6 group by b.key) subq1 join (select a.key as key, a.value as value from 
> tbl2_n4 a where key < 6) subq2 on subq1.key = subq2.key;{code}
>  
> The above select is producing 0,0,0,2,4,5,5,5,5,5,5 instead of 
> 0,0,0,2,4,5,5,5. The input format for sorted tables should be set to 
> BucketizedHiveInputFormat instead of HiveInputFormat. This is done only for 
> MapWork. But if the root task in a MapJoinWork, it is not handled. This is 
> causing the mapper to create splits more than the number of buckets and 
> resulting into extra records.
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to