[
https://issues.apache.org/jira/browse/HIVE-3734?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Gang Tim Liu updated HIVE-3734:
-------------------------------
Description:
Static partition DML creates duplicate files and records.
Given the following test case, Hive will return 2 records:
484 val_484
484 val_484
but srcpart returns one record:
484 val_484
If you look at the file system, the DML generates a duplicate file with the same content:
-rw-r--r-- 1 gang THEFACEBOOK\Domain Users 5812 Nov 21 17:55 000000_0
-rwxr-xr-x 1 gang THEFACEBOOK\Domain Users 5812 Nov 21 17:55 000001_0
Test Case
===
set hive.mapred.supports.subdirectories=true;
set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
set hive.merge.mapfiles=false;
set hive.merge.mapredfiles=false;
set mapred.input.dir.recursive=true;
create table testtable (key String, value String) partitioned by (ds String, hr
String) ;
-- list bucketing DML
explain extended
insert overwrite table testtable partition (ds='2008-04-08', hr='11') select
key, value from srcpart where ds='2008-04-08';
insert overwrite table testtable partition (ds='2008-04-08', hr='11') select
key, value from srcpart where ds='2008-04-08';
-- check DML result
desc formatted testtable partition (ds='2008-04-08', hr='11');
select count(1) from srcpart where ds='2008-04-08';
select count(1) from testtable where ds='2008-04-08';
select key, value from srcpart where ds='2008-04-08' and hr='11' and key =
"484";
set hive.optimize.listbucketing=true;
explain extended
select key, value from testtable where ds='2008-04-08' and hr='11' and key =
"484";
select key, value from testtable where ds='2008-04-08' and hr='11' and key =
"484";
===
was:
Static DML create duplicate files and record.
Given the attached test case, hive will return 2 records:
484 val_484
484 val_484
but srcpart returns one record:
484 val_484
If you look at file system, DML generates duplicate file with the same content:
-rw-r--r-- 1 gang THEFACEBOOK\Domain Users 5812 Nov 21 17:55 000000_0
-rwxr-xr-x 1 gang THEFACEBOOK\Domain Users 5812 Nov 21 17:55 000001_0
> Static partition DML create duplicate files and records
> -------------------------------------------------------
>
> Key: HIVE-3734
> URL: https://issues.apache.org/jira/browse/HIVE-3734
> Project: Hive
> Issue Type: Bug
> Components: Query Processor
> Affects Versions: 0.10.0
> Reporter: Gang Tim Liu
>
> Static partition DML creates duplicate files and records.
> Given the following test case, Hive will return 2 records:
> 484 val_484
> 484 val_484
> but srcpart returns one record:
> 484 val_484
> If you look at the file system, the DML generates a duplicate file with the same
> content:
> -rw-r--r-- 1 gang THEFACEBOOK\Domain Users 5812 Nov 21 17:55 000000_0
> -rwxr-xr-x 1 gang THEFACEBOOK\Domain Users 5812 Nov 21 17:55 000001_0
> Test Case
> ===
> set hive.mapred.supports.subdirectories=true;
> set hive.exec.dynamic.partition=true;
> set hive.exec.dynamic.partition.mode=nonstrict;
> set hive.input.format=org.apache.hadoop.hive.ql.io.HiveInputFormat;
> set hive.merge.mapfiles=false;
> set hive.merge.mapredfiles=false;
> set mapred.input.dir.recursive=true;
> create table testtable (key String, value String) partitioned by (ds String,
> hr String) ;
> -- list bucketing DML
> explain extended
> insert overwrite table testtable partition (ds='2008-04-08', hr='11') select
> key, value from srcpart where ds='2008-04-08';
> insert overwrite table testtable partition (ds='2008-04-08', hr='11') select
> key, value from srcpart where ds='2008-04-08';
> -- check DML result
> desc formatted testtable partition (ds='2008-04-08', hr='11');
> select count(1) from srcpart where ds='2008-04-08';
> select count(1) from testtable where ds='2008-04-08';
> select key, value from srcpart where ds='2008-04-08' and hr='11' and key =
> "484";
> set hive.optimize.listbucketing=true;
> explain extended
> select key, value from testtable where ds='2008-04-08' and hr='11' and key =
> "484";
> select key, value from testtable where ds='2008-04-08' and hr='11' and key =
> "484";
> ===
--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators.
For more information on JIRA, see: http://www.atlassian.com/software/jira