[jira] [Updated] (HIVE-16965) SMB join may produce incorrect results

Sergey Shelukhin (JIRA) Mon, 26 Jun 2017 16:01:25 -0700

     [ 
https://issues.apache.org/jira/browse/HIVE-16965?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]


Sergey Shelukhin updated HIVE-16965:
------------------------------------
    Description: 
Running the following on MiniTez
{noformat}
set hive.mapred.mode=nonstrict;
SET hive.vectorized.execution.enabled=true;
SET hive.exec.orc.default.buffer.size=32768;
SET hive.exec.orc.default.row.index.stride=1000;
SET hive.optimize.index.filter=true;
set hive.fetch.task.conversion=none;
set hive.exec.dynamic.partition.mode=nonstrict;

DROP TABLE orc_a;
DROP TABLE orc_b;

CREATE TABLE orc_a (id bigint, cdouble double) partitioned by (y int, q 
smallint)
  CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;
CREATE TABLE orc_b (id bigint, cfloat float)
  CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;

insert into table orc_a partition (y=2000, q)
select cbigint, cdouble, csmallint % 10 from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc;
insert into table orc_a partition (y=2001, q)
select cbigint, cdouble, csmallint % 10 from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc;

insert into table orc_b 
select cbigint, cfloat from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc limit 200;

set hive.cbo.enable=false;

select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;

set hive.enforce.sortmergebucketmapjoin=false;
set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask.size=10;

explain
select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;
select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;

DROP TABLE orc_a;
DROP TABLE orc_b;
{noformat}

Produces different results for the two selects. The SMB one looks incorrect. cc 
[~djaiswal] [~hagleitn]

  was:
Running the following on MiniTez
{noformat}
set hive.mapred.mode=nonstrict;
SET hive.vectorized.execution.enabled=true;
SET hive.exec.orc.default.buffer.size=32768;
SET hive.exec.orc.default.row.index.stride=1000;
SET hive.optimize.index.filter=true;
set hive.fetch.task.conversion=none;
set hive.exec.dynamic.partition.mode=nonstrict;

DROP TABLE orc_a;
DROP TABLE orc_b;

CREATE TABLE orc_a (id bigint, cdouble double) partitioned by (y int, q 
smallint)
  CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;
CREATE TABLE orc_b (id bigint, cfloat float)
  CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;

insert into table orc_a partition (y=2000, q)
select cbigint, cdouble, csmallint % 10 from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc;
insert into table orc_a partition (y=2001, q)
select cbigint, cdouble, csmallint % 10 from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc;

insert into table orc_b 
select cbigint, cfloat from alltypesorc
  where cbigint is not null and csmallint > 0 order by cbigint asc limit 200;

set hive.cbo.enable=false;

select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;

set hive.enforce.sortmergebucketmapjoin=false;
set hive.optimize.bucketmapjoin=true;
set hive.optimize.bucketmapjoin.sortedmerge=true;
set hive.auto.convert.sortmerge.join=true;
set hive.auto.convert.join=true;
set hive.auto.convert.join.noconditionaltask.size=10;

explain
select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;
select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;

DROP TABLE orc_a;
DROP TABLE orc_b;
{noformat}

Produces different results for two selects. The SMB one looks incorrect. cc 
[~djaiswal] [~hagleitn]


> SMB join may produce incorrect results
> --------------------------------------
>
>                 Key: HIVE-16965
>                 URL: https://issues.apache.org/jira/browse/HIVE-16965
>             Project: Hive
>          Issue Type: Bug
>            Reporter: Sergey Shelukhin
>            Assignee: Deepak Jaiswal
>
> Running the following on MiniTez
> {noformat}
> set hive.mapred.mode=nonstrict;
> SET hive.vectorized.execution.enabled=true;
> SET hive.exec.orc.default.buffer.size=32768;
> SET hive.exec.orc.default.row.index.stride=1000;
> SET hive.optimize.index.filter=true;
> set hive.fetch.task.conversion=none;
> set hive.exec.dynamic.partition.mode=nonstrict;
> DROP TABLE orc_a;
> DROP TABLE orc_b;
> CREATE TABLE orc_a (id bigint, cdouble double) partitioned by (y int, q 
> smallint)
>   CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;
> CREATE TABLE orc_b (id bigint, cfloat float)
>   CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc;
> insert into table orc_a partition (y=2000, q)
> select cbigint, cdouble, csmallint % 10 from alltypesorc
>   where cbigint is not null and csmallint > 0 order by cbigint asc;
> insert into table orc_a partition (y=2001, q)
> select cbigint, cdouble, csmallint % 10 from alltypesorc
>   where cbigint is not null and csmallint > 0 order by cbigint asc;
> insert into table orc_b 
> select cbigint, cfloat from alltypesorc
>   where cbigint is not null and csmallint > 0 order by cbigint asc limit 200;
> set hive.cbo.enable=false;
> select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;
> set hive.enforce.sortmergebucketmapjoin=false;
> set hive.optimize.bucketmapjoin=true;
> set hive.optimize.bucketmapjoin.sortedmerge=true;
> set hive.auto.convert.sortmerge.join=true;
> set hive.auto.convert.join=true;
> set hive.auto.convert.join.noconditionaltask.size=10;
> explain
> select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;
> select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q;
> DROP TABLE orc_a;
> DROP TABLE orc_b;
> {noformat}
> Produces different results for the two selects. The SMB one looks incorrect. 
> cc [~djaiswal] [~hagleitn]



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

[jira] [Updated] (HIVE-16965) SMB join may produce incorrect results

Reply via email to