[jira] [Updated] (HIVE-28213) Incorrect results after insert-select from similar bucketed source & target table

2024-04-24 Thread Naresh P R (Jira)


 [ 
https://issues.apache.org/jira/browse/HIVE-28213?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Naresh P R updated HIVE-28213:
--
Description: 
Insert-select is not honoring bucketing if both source & target are bucketed on 
same column.

eg., 
{code:java}
CREATE EXTERNAL TABLE bucketing_table1 (id INT)
CLUSTERED BY (id)
SORTED BY (id ASC)
INTO 32 BUCKETS stored as textfile;

INSERT INTO TABLE bucketing_table1 VALUES (1), (2), (3), (4), (5);

CREATE EXTERNAL TABLE bucketing_table2 like bucketing_table1;

INSERT INTO TABLE bucketing_table2 select * from bucketing_table1;{code}
id=1 => murmur_hash(1) %32 should go to 29th bucket file.

bucketing_table1 has id=1 at 29th file,

but bucketing_table2 doesn't have 29th file because Insert-select dint honor 
the bucketing.
{code:java}
SELECT count(*) FROM bucketing_table1 WHERE id = 1;
===
1 //correct result

SELECT count(*) FROM bucketing_table2 WHERE id = 1;   
=== 
0 // incorrect result


select *, INPUT__FILE__NAME from bucketing_table1;
+--++
| bucketing_table1.id  |                 input__file__name                  |
+--++
| 2                    | /bucketing_table1/04_0 |
| 3                    | /bucketing_table1/06_0 |
| 5                    | /bucketing_table1/15_0 |
| 4                    | /bucketing_table1/21_0 |
| 1                    | /bucketing_table1/29_0 |
+--++

select *, INPUT__FILE__NAME from bucketing_table2;
+-++
| bucketing_table2.id  |                 input__file__name                  |
+-++
| 2           | /bucketing_table2/00_0 |
| 3           | /bucketing_table2/01_0 |
| 5           | /bucketing_table2/02_0 |
| 4           | /bucketing_table2/03_0 |
| 1           | /bucketing_table2/04_0 |
+--++{code}
Workaround for read: hive.tez.bucket.pruning=false;

PS: Attaching repro file [^test.q]

  was:
Insert-select is not honoring bucketing if both source & target are bucketed on 
same column.

eg., 
{code:java}
CREATE EXTERNAL TABLE bucketing_table1 (id INT)
CLUSTERED BY (id)
SORTED BY (id ASC)
INTO 32 BUCKETS stored as textfile;

INSERT INTO TABLE bucketing_table1 VALUES (1), (2), (3), (4), (5);

CREATE EXTERNAL TABLE bucketing_table2 like bucketing_table1;

INSERT INTO TABLE bucketing_table2 select * from bucketing_table1;{code}
id=1 => murmur_hash(1) %32 should go to 29th bucket file.

bucketing_table1 has id=1 at 29th file,

but bucketing_table2 doesn't have 29th file because Insert-select dint honor 
the bucketing.
{code:java}
SELECT count(*) FROM bucketing_table1 WHERE id = 1;
===
1 //correct result

SELECT count(*) FROM bucketing_table2 WHERE id = 1;   
=== 
0 // incorrect result


select *, INPUT__FILE__NAME from bucketing_table1;
+--++
| bucketing_table1.id  |                 input__file__name                  |
+--++
| 2                    | /bucketing_table1/04_0 |
| 3                    | /bucketing_table1/06_0 |
| 5                    | /bucketing_table1/15_0 |
| 4                    | /bucketing_table1/21_0 |
| 1                    | /bucketing_table1/29_0 |
+--++

select *, INPUT__FILE__NAME from bucketing_table2;
+-++
| bucketing_table2.id  |                 input__file__name                  |
+-++
| 2           | /bucketing_table2/00_0 |
| 3           | /bucketing_table2/01_0 |
| 5           | /bucketing_table2/02_0 |
| 4           | /bucketing_table2/03_0 |
| 1           | /bucketing_table2/04_0 |
+--++{code}
Query to identify in which bucketFile a particular row should be

 
{code:java}
with t as (select *, murmur_hash(id)%32 as bucket, INPUT__FILE__NAME from 
bucketing_table1)
select id, (case when bucket > 0 then bucket else 32 + bucket end) as 
bucket_number, INPUT__FILE__NAME from t; 
+-+++
| id  | bucket_number  |                 input__file__name                  |
+-+++
| 2   | 4              | /bucketing_table1/04_0 |
| 3   | 6              | /bucketing_table1/06_0 |
| 5   | 15             | /bucketing_ta

[jira] [Updated] (HIVE-28213) Incorrect results after insert-select from similar bucketed source & target table

2024-04-24 Thread Naresh P R (Jira)


 [ 
https://issues.apache.org/jira/browse/HIVE-28213?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Naresh P R updated HIVE-28213:
--
Description: 
Insert-select is not honoring bucketing if both source & target are bucketed on 
same column.

eg., 
{code:java}
CREATE EXTERNAL TABLE bucketing_table1 (id INT)
CLUSTERED BY (id)
SORTED BY (id ASC)
INTO 32 BUCKETS stored as textfile;

INSERT INTO TABLE bucketing_table1 VALUES (1), (2), (3), (4), (5);

CREATE EXTERNAL TABLE bucketing_table2 like bucketing_table1;

INSERT INTO TABLE bucketing_table2 select * from bucketing_table1;{code}
id=1 => murmur_hash(1) %32 should go to 29th bucket file.

bucketing_table1 has id=1 at 29th file,

but bucketing_table2 doesn't have 29th file because Insert-select dint honor 
the bucketing.
{code:java}
SELECT count(*) FROM bucketing_table1 WHERE id = 1;
===
1 //correct result

SELECT count(*) FROM bucketing_table2 WHERE id = 1;   
=== 
0 // incorrect result


select *, INPUT__FILE__NAME from bucketing_table1;
+--++
| bucketing_table1.id  |                 input__file__name                  |
+--++
| 2                    | /bucketing_table1/04_0 |
| 3                    | /bucketing_table1/06_0 |
| 5                    | /bucketing_table1/15_0 |
| 4                    | /bucketing_table1/21_0 |
| 1                    | /bucketing_table1/29_0 |
+--++

select *, INPUT__FILE__NAME from bucketing_table2;
+-++
| bucketing_table2.id  |                 input__file__name                  |
+-++
| 2           | /bucketing_table2/00_0 |
| 3           | /bucketing_table2/01_0 |
| 5           | /bucketing_table2/02_0 |
| 4           | /bucketing_table2/03_0 |
| 1           | /bucketing_table2/04_0 |
+--++{code}
Query to identify in which bucketFile a particular row should be

 
{code:java}
with t as (select *, murmur_hash(id)%32 as bucket, INPUT__FILE__NAME from 
bucketing_table1)
select id, (case when bucket > 0 then bucket else 32 + bucket end) as 
bucket_number, INPUT__FILE__NAME from t; 
+-+++
| id  | bucket_number  |                 input__file__name                  |
+-+++
| 2   | 4              | /bucketing_table1/04_0 |
| 3   | 6              | /bucketing_table1/06_0 |
| 5   | 15             | /bucketing_table1/15_0 |
| 4   | 21             | /bucketing_table1/21_0 |
| 1   | 29             | /bucketing_table1/29_0 |
+-+++{code}
 

Workaround for read: hive.tez.bucket.pruning=false;

PS: Attaching repro file [^test.q]

  was:
Insert-select is not honoring bucketing if both source & target are bucketed on 
same column.

eg., 
{code:java}
CREATE EXTERNAL TABLE bucketing_table1 (id INT)
CLUSTERED BY (id)
SORTED BY (id ASC)
INTO 32 BUCKETS stored as textfile;

INSERT INTO TABLE bucketing_table1 VALUES (1), (2), (3), (4), (5);

CREATE EXTERNAL TABLE bucketing_table2 like bucketing_table1;

INSERT INTO TABLE bucketing_table2 select * from bucketing_table1;{code}
id=1 => murmur_hash(1) %32 should go to 29th bucket file.

bucketing_table1 has id=1 at 29th file,

but bucketing_table2 doesn't have 29th file because Insert-select dint honor 
the bucketing.
{code:java}
SELECT count(*) FROM bucketing_table1 WHERE id = 1;
===
1 //correct result

SELECT count(*) FROM bucketing_table2 WHERE id = 1;   
=== 
0 // incorrect result


select *, INPUT__FILE__NAME from bucketing_table1;
+--++
| bucketing_table1.id  |                 input__file__name                  |
+--++
| 2                    | /bucketing_table1/04_0 |
| 3                    | /bucketing_table1/06_0 |
| 5                    | /bucketing_table1/15_0 |
| 4                    | /bucketing_table1/21_0 |
| 1                    | /bucketing_table1/29_0 |
+--++

select *, INPUT__FILE__NAME from bucketing_table2;
+-++
| bucketing_table2.id  |                 input__file__name                  |
+-++
| 2           | /bucketing_table2/00_0 |
| 3           | /bucketing_table2/01_0 |
| 5     

[jira] [Updated] (HIVE-28213) Incorrect results after insert-select from similar bucketed source & target table

2024-04-24 Thread Naresh P R (Jira)


 [ 
https://issues.apache.org/jira/browse/HIVE-28213?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Naresh P R updated HIVE-28213:
--
Description: 
Insert-select is not honoring bucketing if both source & target are bucketed on 
same column.

eg., 
{code:java}
CREATE EXTERNAL TABLE bucketing_table1 (id INT)
CLUSTERED BY (id)
SORTED BY (id ASC)
INTO 32 BUCKETS stored as textfile;

INSERT INTO TABLE bucketing_table1 VALUES (1), (2), (3), (4), (5);

CREATE EXTERNAL TABLE bucketing_table2 like bucketing_table1;

INSERT INTO TABLE bucketing_table2 select * from bucketing_table1;{code}
id=1 => murmur_hash(1) %32 should go to 29th bucket file.

bucketing_table1 has id=1 at 29th file,

but bucketing_table2 doesn't have 29th file because Insert-select dint honor 
the bucketing.
{code:java}
SELECT count(*) FROM bucketing_table1 WHERE id = 1;
===
1 //correct result

SELECT count(*) FROM bucketing_table2 WHERE id = 1;   
=== 
0 // incorrect result


select *, INPUT__FILE__NAME from bucketing_table1;
+--++
| bucketing_table1.id  |                 input__file__name                  |
+--++
| 2                    | /bucketing_table1/04_0 |
| 3                    | /bucketing_table1/06_0 |
| 5                    | /bucketing_table1/15_0 |
| 4                    | /bucketing_table1/21_0 |
| 1                    | /bucketing_table1/29_0 |
+--++

select *, INPUT__FILE__NAME from bucketing_table2;
+-++
| bucketing_table2.id  |                 input__file__name                  |
+-++
| 2           | /bucketing_table2/00_0 |
| 3           | /bucketing_table2/01_0 |
| 5           | /bucketing_table2/02_0 |
| 4           | /bucketing_table2/03_0 |
| 1           | /bucketing_table2/04_0 |
+--++{code}
Workaround for read: hive.tez.bucket.pruning=false;

PS: Attaching repro file [^test.q]

  was:
Insert-select is not honoring bucketing if both source & target are bucketed on 
same column.

eg., 
{code:java}
CREATE EXTERNAL TABLE bucketing_table1 (id INT)
CLUSTERED BY (id)
SORTED BY (id ASC)
INTO 32 BUCKETS stored as textfile;

INSERT INTO TABLE bucketing_table1 VALUES (1), (2), (3), (4), (5);

CREATE EXTERNAL TABLE bucketing_table2 like bucketing_table1;

INSERT INTO TABLE bucketing_table2 select * from bucketing_table1;{code}
id=1 => murmur_hash(1) %32 should go to 29th bucket file.

bucketing_table1 has id=1 at 29th file,

but bucketing_table2 doesn't have 29th file because Insert-select dint honor 
the bucketing.
{code:java}
SELECT count(*) FROM bucketing_table1 WHERE id = 1;
===
1 //correct result

SELECT count(*) FROM bucketing_table2 WHERE id = 1;  
===
0 // incorrect result{code}
Workaround: hive.tez.bucket.pruning=false;

PS: Attaching repro file [^test.q]


> Incorrect results after insert-select from similar bucketed source & target 
> table
> -
>
> Key: HIVE-28213
> URL: https://issues.apache.org/jira/browse/HIVE-28213
> Project: Hive
>  Issue Type: Bug
>Reporter: Naresh P R
>Priority: Major
> Attachments: test.q
>
>
> Insert-select is not honoring bucketing if both source & target are bucketed 
> on same column.
> eg., 
> {code:java}
> CREATE EXTERNAL TABLE bucketing_table1 (id INT)
> CLUSTERED BY (id)
> SORTED BY (id ASC)
> INTO 32 BUCKETS stored as textfile;
> INSERT INTO TABLE bucketing_table1 VALUES (1), (2), (3), (4), (5);
> CREATE EXTERNAL TABLE bucketing_table2 like bucketing_table1;
> INSERT INTO TABLE bucketing_table2 select * from bucketing_table1;{code}
> id=1 => murmur_hash(1) %32 should go to 29th bucket file.
> bucketing_table1 has id=1 at 29th file,
> but bucketing_table2 doesn't have 29th file because Insert-select dint honor 
> the bucketing.
> {code:java}
> SELECT count(*) FROM bucketing_table1 WHERE id = 1;
> ===
> 1 //correct result
> SELECT count(*) FROM bucketing_table2 WHERE id = 1;   
> === 
> 0 // incorrect result
> select *, INPUT__FILE__NAME from bucketing_table1;
> +--++
> | bucketing_table1.id  |                 input__file__name                  |
> +--++
> | 2                    | /bucketing_table1/04_0 |
> | 3                    | /bucketing_table1/06_0 |
> | 5                    | /bucketing_table1/15_0 |
> | 4                    | /bucketing_table1/21_0 |
> | 1