This is an automated email from the ASF dual-hosted git repository.
okumin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push:
new bfac94f1bcb HIVE-28798: Bucket Map Join partially using partition
transforms (#5670)
bfac94f1bcb is described below
commit bfac94f1bcb36bba620a5a59e9c6e9a087bb6a9b
Author: Shohei Okumiya <[email protected]>
AuthorDate: Fri Jul 18 21:24:38 2025 +0900
HIVE-28798: Bucket Map Join partially using partition transforms (#5670)
---
.../queries/positive/iceberg_bucket_map_join_8.q | 111 +++-
.../positive/llap/iceberg_bucket_map_join_8.q.out | 576 +++++++++++++++++----
.../annotation/OpTraitsRulesProcFactory.java | 105 ++--
3 files changed, 636 insertions(+), 156 deletions(-)
diff --git
a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q
index 91b30c7b231..4852a67cd35 100644
---
a/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q
+++
b/iceberg/iceberg-handler/src/test/queries/positive/iceberg_bucket_map_join_8.q
@@ -4,47 +4,110 @@ set hive.auto.convert.join=true;
set hive.optimize.dynamic.partition.hashjoin=false;
set hive.convert.join.bucket.mapjoin.tez=true;
-CREATE TABLE srcbucket_big(key int, value string, id int)
-PARTITIONED BY SPEC(bucket(4, key)) STORED BY ICEBERG;
+CREATE TABLE srcbucket_big(key1 int, key2 string, value string, id int)
+PARTITIONED BY SPEC(bucket(4, key1), bucket(8, key2)) STORED BY ICEBERG;
INSERT INTO srcbucket_big VALUES
-(101, 'val_101', 1),
-(null, 'val_102', 2),
-(103, 'val_103', 3),
-(104, null, 4),
-(105, 'val_105', 5),
-(null, null, 6);
-
-CREATE TABLE src_small(key int, value string);
+(101, '1001', 'val_101', 1),
+(null, '1002', 'val_102', 2),
+(103, null, 'val_103', 3),
+(104, '1004', null, 4),
+(105, '1005', 'val_105', 5),
+(101, '1001', 'val_101', 6),
+(null, '1002', 'val_102', 7),
+(103, null, 'val_103', 8),
+(104, '1004', null, 9),
+(105, '1005', 'val_105', 10),
+(101, '1001', 'val_101', 11),
+(null, '1002', 'val_102', 12),
+(103, null, 'val_103', 13),
+(104, '1004', null, 14),
+(105, '1005', 'val_105', 15),
+(101, '1001', 'val_101', 16),
+(null, '1002', 'val_102', 17),
+(103, null, 'val_103', 18),
+(104, '1004', null, 19),
+(105, '1005', 'val_105', 20),
+(null, null, null, 21);
+
+CREATE TABLE src_small(key1 int, key2 string, value string);
INSERT INTO src_small VALUES
-(101, 'val_101'),
-(null, 'val_102'),
-(103, 'val_103'),
-(104, null),
-(105, 'val_105'),
-(null, null);
+(101, '1001', 'val_101'),
+(null, '1002', 'val_102'),
+(103, null, 'val_103'),
+(104, '1004', null),
+(105, '1005', 'val_105'),
+(null, null, null);
SELECT * FROM srcbucket_big ORDER BY id;
--- Using the bucket column
+-- key1
+EXPLAIN
+SELECT a.key1, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1
+ORDER BY a.id;
+
+SELECT a.key1, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1
+ORDER BY a.id;
+
+-- key2
EXPLAIN
-SELECT *
+SELECT a.key2, a.id
FROM srcbucket_big a
-JOIN src_small b ON a.key = b.key
+JOIN src_small b ON a.key2 = b.key2
ORDER BY a.id;
-SELECT *
+SELECT a.key2, a.id
FROM srcbucket_big a
-JOIN src_small b ON a.key = b.key
+JOIN src_small b ON a.key2 = b.key2
ORDER BY a.id;
--- Using a non-bucket column
+-- Using a non-partition column
EXPLAIN
-SELECT *
+SELECT a.value, a.id
FROM srcbucket_big a
JOIN src_small b ON a.value = b.value
ORDER BY a.id;
-SELECT *
+SELECT a.value, a.id
FROM srcbucket_big a
JOIN src_small b ON a.value = b.value
ORDER BY a.id;
+
+-- key1 & key2
+EXPLAIN
+SELECT a.key1, a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2
+ORDER BY a.id;
+
+SELECT a.key1, a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2
+ORDER BY a.id;
+
+-- key1 & non-partition column
+EXPLAIN
+SELECT a.key1, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value
+ORDER BY a.id;
+
+SELECT a.key1, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value
+ORDER BY a.id;
+
+-- key1 & key2 & non-partition column
+EXPLAIN
+SELECT a.key1, a.key2, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value
+ORDER BY a.id;
+
+SELECT a.key1, a.key2, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value
+ORDER BY a.id;
diff --git
a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out
b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out
index 6f6ca30e874..e7a677f0628 100644
---
a/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out
+++
b/iceberg/iceberg-handler/src/test/results/positive/llap/iceberg_bucket_map_join_8.q.out
@@ -1,62 +1,93 @@
-PREHOOK: query: CREATE TABLE srcbucket_big(key int, value string, id int)
-PARTITIONED BY SPEC(bucket(4, key)) STORED BY ICEBERG
+PREHOOK: query: CREATE TABLE srcbucket_big(key1 int, key2 string, value
string, id int)
+PARTITIONED BY SPEC(bucket(4, key1), bucket(8, key2)) STORED BY ICEBERG
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@srcbucket_big
-POSTHOOK: query: CREATE TABLE srcbucket_big(key int, value string, id int)
-PARTITIONED BY SPEC(bucket(4, key)) STORED BY ICEBERG
+POSTHOOK: query: CREATE TABLE srcbucket_big(key1 int, key2 string, value
string, id int)
+PARTITIONED BY SPEC(bucket(4, key1), bucket(8, key2)) STORED BY ICEBERG
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@srcbucket_big
PREHOOK: query: INSERT INTO srcbucket_big VALUES
-(101, 'val_101', 1),
-(null, 'val_102', 2),
-(103, 'val_103', 3),
-(104, null, 4),
-(105, 'val_105', 5),
-(null, null, 6)
+(101, '1001', 'val_101', 1),
+(null, '1002', 'val_102', 2),
+(103, null, 'val_103', 3),
+(104, '1004', null, 4),
+(105, '1005', 'val_105', 5),
+(101, '1001', 'val_101', 6),
+(null, '1002', 'val_102', 7),
+(103, null, 'val_103', 8),
+(104, '1004', null, 9),
+(105, '1005', 'val_105', 10),
+(101, '1001', 'val_101', 11),
+(null, '1002', 'val_102', 12),
+(103, null, 'val_103', 13),
+(104, '1004', null, 14),
+(105, '1005', 'val_105', 15),
+(101, '1001', 'val_101', 16),
+(null, '1002', 'val_102', 17),
+(103, null, 'val_103', 18),
+(104, '1004', null, 19),
+(105, '1005', 'val_105', 20),
+(null, null, null, 21)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@srcbucket_big
POSTHOOK: query: INSERT INTO srcbucket_big VALUES
-(101, 'val_101', 1),
-(null, 'val_102', 2),
-(103, 'val_103', 3),
-(104, null, 4),
-(105, 'val_105', 5),
-(null, null, 6)
+(101, '1001', 'val_101', 1),
+(null, '1002', 'val_102', 2),
+(103, null, 'val_103', 3),
+(104, '1004', null, 4),
+(105, '1005', 'val_105', 5),
+(101, '1001', 'val_101', 6),
+(null, '1002', 'val_102', 7),
+(103, null, 'val_103', 8),
+(104, '1004', null, 9),
+(105, '1005', 'val_105', 10),
+(101, '1001', 'val_101', 11),
+(null, '1002', 'val_102', 12),
+(103, null, 'val_103', 13),
+(104, '1004', null, 14),
+(105, '1005', 'val_105', 15),
+(101, '1001', 'val_101', 16),
+(null, '1002', 'val_102', 17),
+(103, null, 'val_103', 18),
+(104, '1004', null, 19),
+(105, '1005', 'val_105', 20),
+(null, null, null, 21)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@srcbucket_big
-PREHOOK: query: CREATE TABLE src_small(key int, value string)
+PREHOOK: query: CREATE TABLE src_small(key1 int, key2 string, value string)
PREHOOK: type: CREATETABLE
PREHOOK: Output: database:default
PREHOOK: Output: default@src_small
-POSTHOOK: query: CREATE TABLE src_small(key int, value string)
+POSTHOOK: query: CREATE TABLE src_small(key1 int, key2 string, value string)
POSTHOOK: type: CREATETABLE
POSTHOOK: Output: database:default
POSTHOOK: Output: default@src_small
PREHOOK: query: INSERT INTO src_small VALUES
-(101, 'val_101'),
-(null, 'val_102'),
-(103, 'val_103'),
-(104, null),
-(105, 'val_105'),
-(null, null)
+(101, '1001', 'val_101'),
+(null, '1002', 'val_102'),
+(103, null, 'val_103'),
+(104, '1004', null),
+(105, '1005', 'val_105'),
+(null, null, null)
PREHOOK: type: QUERY
PREHOOK: Input: _dummy_database@_dummy_table
PREHOOK: Output: default@src_small
POSTHOOK: query: INSERT INTO src_small VALUES
-(101, 'val_101'),
-(null, 'val_102'),
-(103, 'val_103'),
-(104, null),
-(105, 'val_105'),
-(null, null)
+(101, '1001', 'val_101'),
+(null, '1002', 'val_102'),
+(103, null, 'val_103'),
+(104, '1004', null),
+(105, '1005', 'val_105'),
+(null, null, null)
POSTHOOK: type: QUERY
POSTHOOK: Input: _dummy_database@_dummy_table
POSTHOOK: Output: default@src_small
-POSTHOOK: Lineage: src_small.key SCRIPT []
+POSTHOOK: Lineage: src_small.key1 SCRIPT []
+POSTHOOK: Lineage: src_small.key2 SCRIPT []
POSTHOOK: Lineage: src_small.value SCRIPT []
PREHOOK: query: SELECT * FROM srcbucket_big ORDER BY id
PREHOOK: type: QUERY
@@ -66,25 +97,40 @@ POSTHOOK: query: SELECT * FROM srcbucket_big ORDER BY id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@srcbucket_big
#### A masked pattern was here ####
-101 val_101 1
-NULL val_102 2
-103 val_103 3
-104 NULL 4
-105 val_105 5
-NULL NULL 6
+101 1001 val_101 1
+NULL 1002 val_102 2
+103 NULL val_103 3
+104 1004 NULL 4
+105 1005 val_105 5
+101 1001 val_101 6
+NULL 1002 val_102 7
+103 NULL val_103 8
+104 1004 NULL 9
+105 1005 val_105 10
+101 1001 val_101 11
+NULL 1002 val_102 12
+103 NULL val_103 13
+104 1004 NULL 14
+105 1005 val_105 15
+101 1001 val_101 16
+NULL 1002 val_102 17
+103 NULL val_103 18
+104 1004 NULL 19
+105 1005 val_105 20
+NULL NULL NULL 21
PREHOOK: query: EXPLAIN
-SELECT *
+SELECT a.key1, a.id
FROM srcbucket_big a
-JOIN src_small b ON a.key = b.key
+JOIN src_small b ON a.key1 = b.key1
ORDER BY a.id
PREHOOK: type: QUERY
PREHOOK: Input: default@src_small
PREHOOK: Input: default@srcbucket_big
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN
-SELECT *
+SELECT a.key1, a.id
FROM srcbucket_big a
-JOIN src_small b ON a.key = b.key
+JOIN src_small b ON a.key1 = b.key1
ORDER BY a.id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src_small
@@ -93,59 +139,155 @@ POSTHOOK: Input: default@srcbucket_big
Plan optimized by CBO.
Vertex dependency in root stage
-Map 2 <- Map 1 (BROADCAST_EDGE)
-Reducer 3 <- Map 2 (SIMPLE_EDGE)
+Map 1 <- Map 3 (CUSTOM_EDGE)
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
Stage-0
Fetch Operator
limit:-1
Stage-1
- Reducer 3 vectorized, llap
+ Reducer 2 vectorized, llap
File Output Operator [FS_37]
- Select Operator [SEL_36] (rows=4 width=192)
- Output:["_col0","_col1","_col2","_col3","_col4"]
- <-Map 2 [SIMPLE_EDGE] vectorized, llap
+ Select Operator [SEL_36] (rows=16 width=8)
+ Output:["_col0","_col1"]
+ <-Map 1 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_35]
- Map Join Operator [MAPJOIN_34] (rows=4 width=192)
-
Conds:RS_31._col0=SEL_33._col0(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
- <-Map 1 [BROADCAST_EDGE] vectorized, llap
- BROADCAST [RS_31]
+ Map Join Operator [MAPJOIN_34] (rows=16 width=8)
+
BucketMapJoin:true,Conds:SEL_33._col0=RS_31._col0(Inner),Output:["_col0","_col1"]
+ <-Map 3 [CUSTOM_EDGE] vectorized, llap
+ MULTICAST [RS_31]
PartitionCols:_col0
- Select Operator [SEL_30] (rows=4 width=99)
- Output:["_col0","_col1","_col2"]
- Filter Operator [FIL_29] (rows=4 width=99)
- predicate:key is not null
- TableScan [TS_0] (rows=4 width=99)
-
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value","id"]
- <-Select Operator [SEL_33] (rows=4 width=93)
+ Select Operator [SEL_30] (rows=4 width=4)
+ Output:["_col0"]
+ Filter Operator [FIL_29] (rows=4 width=4)
+ predicate:key1 is not null
+ TableScan [TS_3] (rows=6 width=3)
+
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1"]
+ <-Select Operator [SEL_33] (rows=16 width=8)
+ Output:["_col0","_col1"]
+ Filter Operator [FIL_32] (rows=16 width=8)
+ predicate:key1 is not null
+ TableScan [TS_0] (rows=16 width=8)
+ default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping
Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","id"]
+
+PREHOOK: query: SELECT a.key1, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.key1, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+101 1
+103 3
+104 4
+105 5
+101 6
+103 8
+104 9
+105 10
+101 11
+103 13
+104 14
+105 15
+101 16
+103 18
+104 19
+105 20
+PREHOOK: query: EXPLAIN
+SELECT a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key2 = b.key2
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key2 = b.key2
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Map 1 <- Map 3 (CUSTOM_EDGE)
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized, llap
+ File Output Operator [FS_37]
+ Select Operator [SEL_36] (rows=16 width=92)
+ Output:["_col0","_col1"]
+ <-Map 1 [SIMPLE_EDGE] vectorized, llap
+ SHUFFLE [RS_35]
+ Map Join Operator [MAPJOIN_34] (rows=16 width=92)
+
BucketMapJoin:true,Conds:SEL_33._col0=RS_31._col0(Inner),Output:["_col0","_col1"]
+ <-Map 3 [CUSTOM_EDGE] vectorized, llap
+ MULTICAST [RS_31]
+ PartitionCols:_col0
+ Select Operator [SEL_30] (rows=4 width=87)
+ Output:["_col0"]
+ Filter Operator [FIL_29] (rows=4 width=87)
+ predicate:key2 is not null
+ TableScan [TS_3] (rows=6 width=72)
+
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key2"]
+ <-Select Operator [SEL_33] (rows=16 width=92)
Output:["_col0","_col1"]
- Filter Operator [FIL_32] (rows=4 width=93)
- predicate:key is not null
- TableScan [TS_3] (rows=6 width=77)
-
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
+ Filter Operator [FIL_32] (rows=16 width=92)
+ predicate:key2 is not null
+ TableScan [TS_0] (rows=16 width=92)
+ default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping
Num Buckets:8,Grouping Partition Columns:["key2"],Output:["key2","id"]
-PREHOOK: query: SELECT *
+PREHOOK: query: SELECT a.key2, a.id
FROM srcbucket_big a
-JOIN src_small b ON a.key = b.key
+JOIN src_small b ON a.key2 = b.key2
ORDER BY a.id
PREHOOK: type: QUERY
PREHOOK: Input: default@src_small
PREHOOK: Input: default@srcbucket_big
#### A masked pattern was here ####
-POSTHOOK: query: SELECT *
+POSTHOOK: query: SELECT a.key2, a.id
FROM srcbucket_big a
-JOIN src_small b ON a.key = b.key
+JOIN src_small b ON a.key2 = b.key2
ORDER BY a.id
POSTHOOK: type: QUERY
POSTHOOK: Input: default@src_small
POSTHOOK: Input: default@srcbucket_big
#### A masked pattern was here ####
-101 val_101 1 101 val_101
-103 val_103 3 103 val_103
-104 NULL 4 104 NULL
-105 val_105 5 105 val_105
+1001 1
+1002 2
+1004 4
+1005 5
+1001 6
+1002 7
+1004 9
+1005 10
+1001 11
+1002 12
+1004 14
+1005 15
+1001 16
+1002 17
+1004 19
+1005 20
PREHOOK: query: EXPLAIN
-SELECT *
+SELECT a.value, a.id
FROM srcbucket_big a
JOIN src_small b ON a.value = b.value
ORDER BY a.id
@@ -154,7 +296,7 @@ PREHOOK: Input: default@src_small
PREHOOK: Input: default@srcbucket_big
#### A masked pattern was here ####
POSTHOOK: query: EXPLAIN
-SELECT *
+SELECT a.value, a.id
FROM srcbucket_big a
JOIN src_small b ON a.value = b.value
ORDER BY a.id
@@ -174,29 +316,29 @@ Stage-0
Stage-1
Reducer 2 vectorized, llap
File Output Operator [FS_37]
- Select Operator [SEL_36] (rows=4 width=192)
- Output:["_col0","_col1","_col2","_col3","_col4"]
+ Select Operator [SEL_36] (rows=16 width=77)
+ Output:["_col0","_col1"]
<-Map 1 [SIMPLE_EDGE] vectorized, llap
SHUFFLE [RS_35]
- Map Join Operator [MAPJOIN_34] (rows=4 width=192)
-
Conds:SEL_33._col1=RS_31._col1(Inner),Output:["_col0","_col1","_col2","_col3","_col4"]
+ Map Join Operator [MAPJOIN_34] (rows=16 width=77)
+ Conds:SEL_33._col0=RS_31._col0(Inner),Output:["_col0","_col1"]
<-Map 3 [BROADCAST_EDGE] vectorized, llap
BROADCAST [RS_31]
- PartitionCols:_col1
- Select Operator [SEL_30] (rows=4 width=93)
- Output:["_col0","_col1"]
- Filter Operator [FIL_29] (rows=4 width=93)
+ PartitionCols:_col0
+ Select Operator [SEL_30] (rows=4 width=89)
+ Output:["_col0"]
+ Filter Operator [FIL_29] (rows=4 width=89)
predicate:value is not null
- TableScan [TS_3] (rows=6 width=77)
-
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value"]
- <-Select Operator [SEL_33] (rows=4 width=99)
- Output:["_col0","_col1","_col2"]
- Filter Operator [FIL_32] (rows=4 width=99)
+ TableScan [TS_3] (rows=6 width=74)
+
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["value"]
+ <-Select Operator [SEL_33] (rows=16 width=77)
+ Output:["_col0","_col1"]
+ Filter Operator [FIL_32] (rows=16 width=77)
predicate:value is not null
- TableScan [TS_0] (rows=6 width=83)
-
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["key","value","id"]
+ TableScan [TS_0] (rows=21 width=77)
+
default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Output:["value","id"]
-PREHOOK: query: SELECT *
+PREHOOK: query: SELECT a.value, a.id
FROM srcbucket_big a
JOIN src_small b ON a.value = b.value
ORDER BY a.id
@@ -204,7 +346,7 @@ PREHOOK: type: QUERY
PREHOOK: Input: default@src_small
PREHOOK: Input: default@srcbucket_big
#### A masked pattern was here ####
-POSTHOOK: query: SELECT *
+POSTHOOK: query: SELECT a.value, a.id
FROM srcbucket_big a
JOIN src_small b ON a.value = b.value
ORDER BY a.id
@@ -212,7 +354,255 @@ POSTHOOK: type: QUERY
POSTHOOK: Input: default@src_small
POSTHOOK: Input: default@srcbucket_big
#### A masked pattern was here ####
-101 val_101 1 101 val_101
-NULL val_102 2 NULL val_102
-103 val_103 3 103 val_103
-105 val_105 5 105 val_105
+val_101 1
+val_102 2
+val_103 3
+val_105 5
+val_101 6
+val_102 7
+val_103 8
+val_105 10
+val_101 11
+val_102 12
+val_103 13
+val_105 15
+val_101 16
+val_102 17
+val_103 18
+val_105 20
+PREHOOK: query: EXPLAIN
+SELECT a.key1, a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a.key1, a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Map 1 <- Map 3 (CUSTOM_EDGE)
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized, llap
+ File Output Operator [FS_57]
+ Select Operator [SEL_56] (rows=12 width=96)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized, llap
+ SHUFFLE [RS_55]
+ Map Join Operator [MAPJOIN_54] (rows=12 width=96)
+ BucketMapJoin:true,Conds:SEL_53._col0, _col1=RS_51._col0,
_col1(Inner),Output:["_col0","_col1","_col2"]
+ <-Map 3 [CUSTOM_EDGE] vectorized, llap
+ MULTICAST [RS_51]
+ PartitionCols:_col0, _col1
+ Select Operator [SEL_50] (rows=3 width=91)
+ Output:["_col0","_col1"]
+ Filter Operator [FIL_49] (rows=3 width=91)
+ predicate:(key1 is not null and key2 is not null)
+ TableScan [TS_3] (rows=6 width=75)
+
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2"]
+ <-Select Operator [SEL_53] (rows=12 width=96)
+ Output:["_col0","_col1","_col2"]
+ Filter Operator [FIL_52] (rows=12 width=96)
+ predicate:(key1 is not null and key2 is not null)
+ TableScan [TS_0] (rows=12 width=96)
+ default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping
Num Buckets:32,Grouping Partition
Columns:["key1","key2"],Output:["key1","key2","id"]
+
+PREHOOK: query: SELECT a.key1, a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.key1, a.key2, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+101 1001 1
+104 1004 4
+105 1005 5
+101 1001 6
+104 1004 9
+105 1005 10
+101 1001 11
+104 1004 14
+105 1005 15
+101 1001 16
+104 1004 19
+105 1005 20
+PREHOOK: query: EXPLAIN
+SELECT a.key1, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a.key1, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Map 1 <- Map 3 (CUSTOM_EDGE)
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized, llap
+ File Output Operator [FS_57]
+ Select Operator [SEL_56] (rows=9 width=78)
+ Output:["_col0","_col1","_col2"]
+ <-Map 1 [SIMPLE_EDGE] vectorized, llap
+ SHUFFLE [RS_55]
+ Map Join Operator [MAPJOIN_54] (rows=9 width=78)
+ BucketMapJoin:true,Conds:SEL_53._col0, _col1=RS_51._col0,
_col1(Inner),Output:["_col0","_col1","_col2"]
+ <-Map 3 [CUSTOM_EDGE] vectorized, llap
+ MULTICAST [RS_51]
+ PartitionCols:_col0
+ Select Operator [SEL_50] (rows=3 width=93)
+ Output:["_col0","_col1"]
+ Filter Operator [FIL_49] (rows=3 width=93)
+ predicate:(key1 is not null and value is not null)
+ TableScan [TS_3] (rows=6 width=77)
+
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","value"]
+ <-Select Operator [SEL_53] (rows=12 width=83)
+ Output:["_col0","_col1","_col2"]
+ Filter Operator [FIL_52] (rows=12 width=83)
+ predicate:(value is not null and key1 is not null)
+ TableScan [TS_0] (rows=16 width=81)
+ default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping
Num Buckets:4,Grouping Partition Columns:["key1"],Output:["key1","value","id"]
+
+PREHOOK: query: SELECT a.key1, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.key1, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.value = b.value
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+101 val_101 1
+103 val_103 3
+105 val_105 5
+101 val_101 6
+103 val_103 8
+105 val_105 10
+101 val_101 11
+103 val_103 13
+105 val_105 15
+101 val_101 16
+103 val_103 18
+105 val_105 20
+PREHOOK: query: EXPLAIN
+SELECT a.key1, a.key2, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: EXPLAIN
+SELECT a.key1, a.key2, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+Plan optimized by CBO.
+
+Vertex dependency in root stage
+Map 1 <- Map 3 (CUSTOM_EDGE)
+Reducer 2 <- Map 1 (SIMPLE_EDGE)
+
+Stage-0
+ Fetch Operator
+ limit:-1
+ Stage-1
+ Reducer 2 vectorized, llap
+ File Output Operator [FS_67]
+ Select Operator [SEL_66] (rows=5 width=150)
+ Output:["_col0","_col1","_col2","_col3"]
+ <-Map 1 [SIMPLE_EDGE] vectorized, llap
+ SHUFFLE [RS_65]
+ Map Join Operator [MAPJOIN_64] (rows=5 width=150)
+ BucketMapJoin:true,Conds:SEL_63._col0, _col1, _col2=RS_61._col0,
_col1, _col2(Inner),Output:["_col0","_col1","_col2","_col3"]
+ <-Map 3 [CUSTOM_EDGE] vectorized, llap
+ MULTICAST [RS_61]
+ PartitionCols:_col0, _col1
+ Select Operator [SEL_60] (rows=2 width=180)
+ Output:["_col0","_col1","_col2"]
+ Filter Operator [FIL_59] (rows=2 width=180)
+ predicate:(key1 is not null and key2 is not null and value
is not null)
+ TableScan [TS_3] (rows=6 width=150)
+
default@src_small,b,Tbl:COMPLETE,Col:COMPLETE,Output:["key1","key2","value"]
+ <-Select Operator [SEL_63] (rows=8 width=164)
+ Output:["_col0","_col1","_col2","_col3"]
+ Filter Operator [FIL_62] (rows=8 width=164)
+ predicate:(value is not null and key1 is not null and key2
is not null)
+ TableScan [TS_0] (rows=12 width=164)
+ default@srcbucket_big,a,Tbl:COMPLETE,Col:COMPLETE,Grouping
Num Buckets:32,Grouping Partition
Columns:["key1","key2"],Output:["key1","key2","value","id"]
+
+PREHOOK: query: SELECT a.key1, a.key2, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value
+ORDER BY a.id
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src_small
+PREHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+POSTHOOK: query: SELECT a.key1, a.key2, a.value, a.id
+FROM srcbucket_big a
+JOIN src_small b ON a.key1 = b.key1 AND a.key2 = b.key2 AND a.value = b.value
+ORDER BY a.id
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src_small
+POSTHOOK: Input: default@srcbucket_big
+#### A masked pattern was here ####
+101 1001 val_101 1
+105 1005 val_105 5
+101 1001 val_101 6
+105 1005 val_105 10
+101 1001 val_101 11
+105 1005 val_105 15
+101 1001 val_101 16
+105 1005 val_105 20
diff --git
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
index 7d91d7ea216..bf149397efe 100644
---
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
+++
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/metainfo/annotation/OpTraitsRulesProcFactory.java
@@ -381,45 +381,82 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
public static class SelectRule implements SemanticNodeProcessor {
// For bucket columns
- // If all the columns match to the parent, put them in the bucket cols
+ // If the projected columns are compatible with the bucketing requirement,
put them in the bucket cols
// else, add empty list.
+ private void putConvertedColNamesForBucket(
+ List<List<String>> parentColNamesList, List<CustomBucketFunction>
parentBucketFunctions, SelectOperator selOp,
+ List<List<String>> newBucketColNamesList, List<CustomBucketFunction>
newBucketFunctions) {
+ Preconditions.checkState(parentColNamesList.size() ==
parentBucketFunctions.size());
+ for (int i = 0; i < parentColNamesList.size(); i++) {
+ List<String> colNames = parentColNamesList.get(i);
+
+ List<String> newBucketColNames = new ArrayList<>();
+ boolean[] retainedColumns = new boolean[colNames.size()];
+ boolean allFound = true;
+ for (int j = 0; j < colNames.size(); j++) {
+ final String colName = colNames.get(j);
+ Optional<String> newColName = resolveNewColName(colName, selOp);
+ if (newColName.isPresent()) {
+ retainedColumns[j] = true;
+ newBucketColNames.add(newColName.get());
+ } else {
+ retainedColumns[j] = false;
+ allFound = false;
+ }
+ }
+
+ CustomBucketFunction bucketFunction = parentBucketFunctions.get(i);
+ if (allFound) {
+ newBucketColNamesList.add(newBucketColNames);
+ newBucketFunctions.add(bucketFunction);
+ } else if (bucketFunction == null) {
+ // Hive's native bucketing is effective only when all the bucketing
columns are used
+ newBucketColNamesList.add(new ArrayList<>());
+ newBucketFunctions.add(null);
+ } else {
+ Optional<CustomBucketFunction> newBucketFunction =
bucketFunction.select(retainedColumns);
+ if (newBucketFunction.isPresent()) {
+ newBucketColNamesList.add(newBucketColNames);
+ newBucketFunctions.add(newBucketFunction.get());
+ } else {
+ newBucketColNamesList.add(new ArrayList<>());
+ newBucketFunctions.add(null);
+ }
+ }
+ }
+ }
+
// For sort columns
// Keep the subset of all the columns as long as order is maintained.
- public List<List<String>> getConvertedColNames(
- List<List<String>> parentColNames, SelectOperator selOp, boolean
processSortCols) {
+ private List<List<String>> getConvertedColNamesForSort(List<List<String>>
parentColNames, SelectOperator selOp) {
List<List<String>> listBucketCols = new ArrayList<>();
for (List<String> colNames : parentColNames) {
List<String> bucketColNames = new ArrayList<>();
- boolean found = false;
for (String colName : colNames) {
- // Reset found
- found = false;
- for (Entry<String, ExprNodeDesc> entry :
selOp.getColumnExprMap().entrySet()) {
- if ((entry.getValue() instanceof ExprNodeColumnDesc) &&
- (((ExprNodeColumnDesc)
(entry.getValue())).getColumn().equals(colName))) {
- bucketColNames.add(entry.getKey());
- found = true;
- break;
- }
- }
- if (!found) {
+ Optional<String> newColName = resolveNewColName(colName, selOp);
+ if (newColName.isPresent()) {
+ bucketColNames.add(newColName.get());
+ } else {
// Bail out on first missed column.
break;
}
}
- if (!processSortCols && !found) {
- // While processing bucket columns, atleast one bucket column
- // missed. This results in a different bucketing scheme.
- // Add empty list
- listBucketCols.add(new ArrayList<>());
- } else {
- listBucketCols.add(bucketColNames);
- }
+ listBucketCols.add(bucketColNames);
}
return listBucketCols;
}
+ private Optional<String> resolveNewColName(String parentColName,
SelectOperator selOp) {
+ for (Entry<String, ExprNodeDesc> entry :
selOp.getColumnExprMap().entrySet()) {
+ if ((entry.getValue() instanceof ExprNodeColumnDesc) &&
+ (((ExprNodeColumnDesc)
(entry.getValue())).getColumn().equals(parentColName))) {
+ return Optional.of(entry.getKey());
+ }
+ }
+ return Optional.empty();
+ }
+
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
Object... nodeOutputs) throws SemanticException {
@@ -428,28 +465,18 @@ public Object process(Node nd, Stack<Node> stack,
NodeProcessorCtx procCtx,
List<List<String>> parentBucketColNames =
parentOpTraits.getBucketColNames();
List<List<String>> listBucketCols = null;
+ List<CustomBucketFunction> bucketFunctions = null;
List<List<String>> listSortCols = null;
if (selOp.getColumnExprMap() != null) {
if (parentBucketColNames != null) {
- listBucketCols = getConvertedColNames(parentBucketColNames, selOp,
false);
+ listBucketCols = new ArrayList<>();
+ bucketFunctions = new ArrayList<>();
+ putConvertedColNamesForBucket(parentBucketColNames,
parentOpTraits.getCustomBucketFunctions(), selOp,
+ listBucketCols, bucketFunctions);
}
List<List<String>> parentSortColNames = parentOpTraits.getSortCols();
if (parentSortColNames != null) {
- listSortCols = getConvertedColNames(parentSortColNames, selOp, true);
- }
- }
-
- List<CustomBucketFunction> bucketFunctions = null;
- if (listBucketCols != null) {
- Preconditions.checkState(parentBucketColNames.size() ==
listBucketCols.size());
- bucketFunctions = new ArrayList<>();
- for (int i = 0; i < listBucketCols.size(); i++) {
- if (listBucketCols.get(i).isEmpty()) {
- bucketFunctions.add(null);
- } else {
- Preconditions.checkState(listBucketCols.get(i).size() ==
parentBucketColNames.get(i).size());
-
bucketFunctions.add(parentOpTraits.getCustomBucketFunctions().get(i));
- }
+ listSortCols = getConvertedColNamesForSort(parentSortColNames,
selOp);
}
}