[ https://issues.apache.org/jira/browse/HIVE-16965?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Sergey Shelukhin updated HIVE-16965: ------------------------------------ Description: Running the following on MiniTez {noformat} set hive.mapred.mode=nonstrict; SET hive.vectorized.execution.enabled=true; SET hive.exec.orc.default.buffer.size=32768; SET hive.exec.orc.default.row.index.stride=1000; SET hive.optimize.index.filter=true; set hive.fetch.task.conversion=none; set hive.exec.dynamic.partition.mode=nonstrict; DROP TABLE orc_a; DROP TABLE orc_b; CREATE TABLE orc_a (id bigint, cdouble double) partitioned by (y int, q smallint) CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc; CREATE TABLE orc_b (id bigint, cfloat float) CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc; insert into table orc_a partition (y=2000, q) select cbigint, cdouble, csmallint % 10 from alltypesorc where cbigint is not null and csmallint > 0 order by cbigint asc; insert into table orc_a partition (y=2001, q) select cbigint, cdouble, csmallint % 10 from alltypesorc where cbigint is not null and csmallint > 0 order by cbigint asc; insert into table orc_b select cbigint, cfloat from alltypesorc where cbigint is not null and csmallint > 0 order by cbigint asc limit 200; set hive.cbo.enable=false; select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; set hive.enforce.sortmergebucketmapjoin=false; set hive.optimize.bucketmapjoin=true; set hive.optimize.bucketmapjoin.sortedmerge=true; set hive.auto.convert.sortmerge.join=true; set hive.auto.convert.join=true; set hive.auto.convert.join.noconditionaltask.size=10; explain select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; DROP TABLE orc_a; DROP TABLE orc_b; {noformat} Produces different results for the two selects. The SMB one looks incorrect. cc [~djaiswal] [~hagleitn] was: Running the following on MiniTez {noformat} set hive.mapred.mode=nonstrict; SET hive.vectorized.execution.enabled=true; SET hive.exec.orc.default.buffer.size=32768; SET hive.exec.orc.default.row.index.stride=1000; SET hive.optimize.index.filter=true; set hive.fetch.task.conversion=none; set hive.exec.dynamic.partition.mode=nonstrict; DROP TABLE orc_a; DROP TABLE orc_b; CREATE TABLE orc_a (id bigint, cdouble double) partitioned by (y int, q smallint) CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc; CREATE TABLE orc_b (id bigint, cfloat float) CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc; insert into table orc_a partition (y=2000, q) select cbigint, cdouble, csmallint % 10 from alltypesorc where cbigint is not null and csmallint > 0 order by cbigint asc; insert into table orc_a partition (y=2001, q) select cbigint, cdouble, csmallint % 10 from alltypesorc where cbigint is not null and csmallint > 0 order by cbigint asc; insert into table orc_b select cbigint, cfloat from alltypesorc where cbigint is not null and csmallint > 0 order by cbigint asc limit 200; set hive.cbo.enable=false; select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; set hive.enforce.sortmergebucketmapjoin=false; set hive.optimize.bucketmapjoin=true; set hive.optimize.bucketmapjoin.sortedmerge=true; set hive.auto.convert.sortmerge.join=true; set hive.auto.convert.join=true; set hive.auto.convert.join.noconditionaltask.size=10; explain select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; DROP TABLE orc_a; DROP TABLE orc_b; {noformat} Produces different results for two selects. The SMB one looks incorrect. cc [~djaiswal] [~hagleitn] > SMB join may produce incorrect results > -------------------------------------- > > Key: HIVE-16965 > URL: https://issues.apache.org/jira/browse/HIVE-16965 > Project: Hive > Issue Type: Bug > Reporter: Sergey Shelukhin > Assignee: Deepak Jaiswal > > Running the following on MiniTez > {noformat} > set hive.mapred.mode=nonstrict; > SET hive.vectorized.execution.enabled=true; > SET hive.exec.orc.default.buffer.size=32768; > SET hive.exec.orc.default.row.index.stride=1000; > SET hive.optimize.index.filter=true; > set hive.fetch.task.conversion=none; > set hive.exec.dynamic.partition.mode=nonstrict; > DROP TABLE orc_a; > DROP TABLE orc_b; > CREATE TABLE orc_a (id bigint, cdouble double) partitioned by (y int, q > smallint) > CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc; > CREATE TABLE orc_b (id bigint, cfloat float) > CLUSTERED BY (id) SORTED BY (id) INTO 2 BUCKETS stored as orc; > insert into table orc_a partition (y=2000, q) > select cbigint, cdouble, csmallint % 10 from alltypesorc > where cbigint is not null and csmallint > 0 order by cbigint asc; > insert into table orc_a partition (y=2001, q) > select cbigint, cdouble, csmallint % 10 from alltypesorc > where cbigint is not null and csmallint > 0 order by cbigint asc; > insert into table orc_b > select cbigint, cfloat from alltypesorc > where cbigint is not null and csmallint > 0 order by cbigint asc limit 200; > set hive.cbo.enable=false; > select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; > set hive.enforce.sortmergebucketmapjoin=false; > set hive.optimize.bucketmapjoin=true; > set hive.optimize.bucketmapjoin.sortedmerge=true; > set hive.auto.convert.sortmerge.join=true; > set hive.auto.convert.join=true; > set hive.auto.convert.join.noconditionaltask.size=10; > explain > select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; > select y,q,count(*) from orc_a a join orc_b b on a.id=b.id group by y,q; > DROP TABLE orc_a; > DROP TABLE orc_b; > {noformat} > Produces different results for the two selects. The SMB one looks incorrect. > cc [~djaiswal] [~hagleitn] -- This message was sent by Atlassian JIRA (v6.4.14#64029)