This is an automated email from the ASF dual-hosted git repository. zabetak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new f396676b09d HIVE-27658: Error resolving join keys during conversion to dynamic partition hashjoin (Stamatis Zampetakis reviewed by Denys Kuzmenko) f396676b09d is described below commit f396676b09d9dd706b4bff4e1c1d999f9f3b1d2f Author: Stamatis Zampetakis <zabe...@gmail.com> AuthorDate: Thu Dec 7 17:56:08 2023 +0100 HIVE-27658: Error resolving join keys during conversion to dynamic partition hashjoin (Stamatis Zampetakis reviewed by Denys Kuzmenko) Sometimes, when the compiler attempts to convert a Join to a Dynamic Partition HashJoin (DPHJ) and certain assumptions about the shape of the plan do not hold, a SemanticException is thrown. The DPHJ is a performance optimization, so there is no reason to raise a fatal error when the conversion cannot be performed. It is preferable to simply skip the conversion and use a regular join instead of completely blocking the query. The `MapJoinProcessor.getMapJoinDesc` method already returns null in certain cases, so it is safe to add another exit condition. Overview of changes: 1. Return null when join key resolution fails and simply skip conversion to DPHJ. 2. Log a warning instead of throwing a fatal SemanticException. 3. Enrich the error message with more information to improve diagnosability. Bringing the plan into a shape that will allow the DPHJ conversion is still meaningful but can be tracked independently with other tickets. 
Close apache/hive#4930 --- .../hadoop/hive/ql/optimizer/MapJoinProcessor.java | 3 +- .../clientpositive/tez_dynpart_hashjoin_4.q | 24 +++ .../llap/tez_dynpart_hashjoin_4.q.out | 210 +++++++++++++++++++++ 3 files changed, 236 insertions(+), 1 deletion(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java index e922ce47796..adf4fbe1b21 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/MapJoinProcessor.java @@ -1306,7 +1306,8 @@ public class MapJoinProcessor extends Transform { List<ExprNodeDesc> keyExprList = ExprNodeDescUtils.resolveJoinKeysAsRSColumns(mapEntry.getValue(), rsParent); if (keyExprList == null) { - throw new SemanticException("Error resolving join keys"); + LOG.warn("Error resolving join keys {} in {} {}", mapEntry.getValue(), rsParent, rsParent.getColumnExprMap()); + return null; } newKeyExprMap.put(pos, keyExprList); } diff --git a/ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_4.q b/ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_4.q new file mode 100644 index 00000000000..d15307a42de --- /dev/null +++ b/ql/src/test/queries/clientpositive/tez_dynpart_hashjoin_4.q @@ -0,0 +1,24 @@ +CREATE TABLE table_a (start_date date, product_id int); + +ALTER TABLE table_a UPDATE STATISTICS SET('numRows'='200000000','rawDataSize'='0' ); +ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' ); +ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='2500' ); + +CREATE TABLE table_b (start_date date, product_id int, product_sk string); + +ALTER TABLE table_b UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='0' ); +ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_id 
SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' ); +ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='500' ); +ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_sk SET ('numDVs'='300000','numNulls'='0','avgColLen'='10','maxColLen'='10'); + +set hive.optimize.dynamic.partition.hashjoin=true; +set hive.auto.convert.join=true; +set hive.auto.convert.join.noconditionaltask.size=180000000; + +EXPLAIN +SELECT TC.CONST_DATE, TB.PRODUCT_SK +FROM TABLE_A TA +INNER JOIN (SELECT TO_DATE(FROM_UNIXTIME(1701088643)) AS CONST_DATE) TC + ON TA.START_DATE = TC.CONST_DATE +INNER JOIN TABLE_B TB + ON TB.START_DATE = TC.CONST_DATE AND TA.PRODUCT_ID = TB.PRODUCT_ID; diff --git a/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out b/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out new file mode 100644 index 00000000000..5ca79e22b2b --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out @@ -0,0 +1,210 @@ +PREHOOK: query: CREATE TABLE table_a (start_date date, product_id int) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@table_a +POSTHOOK: query: CREATE TABLE table_a (start_date date, product_id int) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_a +PREHOOK: query: ALTER TABLE table_a UPDATE STATISTICS SET('numRows'='200000000','rawDataSize'='0' ) +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@table_a +PREHOOK: Output: default@table_a +POSTHOOK: query: ALTER TABLE table_a UPDATE STATISTICS SET('numRows'='200000000','rawDataSize'='0' ) +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@table_a +POSTHOOK: Output: default@table_a +PREHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN product_id 
SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' ) +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@table_a +PREHOOK: Output: default@table_a +POSTHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' ) +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@table_a +POSTHOOK: Output: default@table_a +PREHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='2500' ) +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@table_a +PREHOOK: Output: default@table_a +POSTHOOK: query: ALTER TABLE table_a UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='2500' ) +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@table_a +POSTHOOK: Output: default@table_a +PREHOOK: query: CREATE TABLE table_b (start_date date, product_id int, product_sk string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@table_b +POSTHOOK: query: CREATE TABLE table_b (start_date date, product_id int, product_sk string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@table_b +PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='0' ) +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@table_b +PREHOOK: Output: default@table_b +POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS SET('numRows'='100000000','rawDataSize'='0' ) +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@table_b +POSTHOOK: Output: default@table_b +PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' ) +PREHOOK: type: 
ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@table_b +PREHOOK: Output: default@table_b +POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_id SET('lowValue'='1000000','highValue'='100000000','numNulls'='0','numDVs'='300000' ) +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@table_b +POSTHOOK: Output: default@table_b +PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='500' ) +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@table_b +PREHOOK: Output: default@table_b +POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN start_date SET('lowValue'='10000','highValue'='20000','numNulls'='0','numDVs'='500' ) +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@table_b +POSTHOOK: Output: default@table_b +PREHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_sk SET ('numDVs'='300000','numNulls'='0','avgColLen'='10','maxColLen'='10') +PREHOOK: type: ALTERTABLE_UPDATETABLESTATS +PREHOOK: Input: default@table_b +PREHOOK: Output: default@table_b +POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_sk SET ('numDVs'='300000','numNulls'='0','avgColLen'='10','maxColLen'='10') +POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS +POSTHOOK: Input: default@table_b +POSTHOOK: Output: default@table_b +PREHOOK: query: EXPLAIN +SELECT TC.CONST_DATE, TB.PRODUCT_SK +FROM TABLE_A TA +INNER JOIN (SELECT TO_DATE(FROM_UNIXTIME(1701088643)) AS CONST_DATE) TC + ON TA.START_DATE = TC.CONST_DATE +INNER JOIN TABLE_B TB + ON TB.START_DATE = TC.CONST_DATE AND TA.PRODUCT_ID = TB.PRODUCT_ID +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Input: default@table_a +PREHOOK: Input: default@table_b +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN +SELECT TC.CONST_DATE, TB.PRODUCT_SK +FROM TABLE_A TA +INNER JOIN (SELECT 
TO_DATE(FROM_UNIXTIME(1701088643)) AS CONST_DATE) TC + ON TA.START_DATE = TC.CONST_DATE +INNER JOIN TABLE_B TB + ON TB.START_DATE = TC.CONST_DATE AND TA.PRODUCT_ID = TB.PRODUCT_ID +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Input: default@table_a +POSTHOOK: Input: default@table_b +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Map 1 <- Map 3 (BROADCAST_EDGE) + Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: ta + filterExpr: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) + Statistics: Num rows: 200000000 Data size: 12000000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) + Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: product_id (type: int) + outputColumnNames: _col1 + Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE + Map Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 DATE'2023-11-27' (type: date) + 1 DATE'2023-11-27' (type: date) + outputColumnNames: _col1, _col2 + input vertices: + 1 Map 3 + Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int), _col2 (type: date) + null sort order: zz + sort order: ++ + Map-reduce partition columns: _col1 (type: int), _col2 (type: date) + Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map 3 + Map Operator Tree: + TableScan + alias: 
_dummy_table + Row Limit Per Split: 1 + Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: DATE'2023-11-27' (type: date) + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: DATE'2023-11-27' (type: date) + null sort order: z + sort order: + + Map-reduce partition columns: DATE'2023-11-27' (type: date) + Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col0 (type: date) + Execution mode: llap + LLAP IO: no inputs + Map 4 + Map Operator Tree: + TableScan + alias: tb + filterExpr: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) + Statistics: Num rows: 100000000 Data size: 15400000000 Basic stats: COMPLETE Column stats: COMPLETE + Filter Operator + predicate: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) + Statistics: Num rows: 50000000 Data size: 7700000000 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: product_id (type: int), product_sk (type: string) + outputColumnNames: _col1, _col2 + Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE + Reduce Output Operator + key expressions: _col1 (type: int), DATE'2023-11-27' (type: date) + null sort order: zz + sort order: ++ + Map-reduce partition columns: _col1 (type: int), DATE'2023-11-27' (type: date) + Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col2 (type: string) + Execution mode: vectorized, llap + LLAP IO: all inputs + Reducer 2 + Execution mode: llap + Reduce Operator Tree: + Merge Join Operator + condition map: + Inner Join 0 to 1 + keys: + 0 _col1 (type: int), _col2 (type: date) + 1 _col1 (type: int), DATE'2023-11-27' (type: date) + outputColumnNames: _col2, _col5 + Statistics: Num 
rows: 16666666666 Data size: 2499999999900 Basic stats: COMPLETE Column stats: COMPLETE + Select Operator + expressions: _col2 (type: date), _col5 (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 16666666666 Data size: 1566666666604 Basic stats: COMPLETE Column stats: COMPLETE + File Output Operator + compressed: false + Statistics: Num rows: 16666666666 Data size: 1566666666604 Basic stats: COMPLETE Column stats: COMPLETE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +