This is an automated email from the ASF dual-hosted git repository. zabetak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
commit 36ce858163a19e29eafe4a8d3307191bc28fc175 Author: Stamatis Zampetakis <zabe...@gmail.com> AuthorDate: Fri Dec 8 13:22:56 2023 +0100 HIVE-27919: Constant reduction in CBO does not work for FROM_UNIXTIME, DATE_ADD, DATE_SUB, TO_UNIX_TIMESTAMP (Stamatis Zampetakis reviewed by Akshat Mathur, Krisztian Kasa) Constant reduction does not work because the functions are declared as dynamic (isDynamicFunction returns true). However, the dynamic declaration is wrong because none of the above depends on context variables; they all operate on concrete parameters and require one or more inputs. Moreover, DATE_ADD, DATE_SUB, and FROM_UNIXTIME are not time functions, so it is wrong to extend the SqlAbstractTimeFunction class. The overrides in SqlAbstractTimeFunction are not correct/relevant to these functions, so the changes here address this as well. Overview of the changes: 1. Turn DATE_ADD, DATE_SUB, and FROM_UNIXTIME into regular SqlFunctions and pass the correct return type inference strategy. The operand type inference and type checker can remain null as they were before since they are not used currently in Hive. 2. Change the type family for FROM_UNIXTIME to reflect that the function returns a string (and not date or time). 3. Create and pass an appropriate operand checker for FROM_UNIXTIME (minor since it's not used at the moment). 4. Remove the isDynamicFunction override from TO_UNIX_TIMESTAMP (which is wrong), to enable constant reduction. 5. Finalize classes and make them non-instantiable. Constant reduction in CBO allows some further optimizations to kick in, such as the removal of the (always true) filter operator in constant_prop_coalesce.q.out and the transformation to dynamic partition hash join (DPHJ) in tez_dynpart_hashjoin_4.q.out. Note that without the changes here the DPHJ transformation for the query in tez_dynpart_hashjoin_4.q fails due to inconsistencies in the way constant folding is performed at the Operator (physical) layer (HIVE-27658). 
Close apache/hive#4932 --- .../reloperators/HiveDateAddSqlOperator.java | 15 +-- .../reloperators/HiveDateSubSqlOperator.java | 15 +-- .../reloperators/HiveFromUnixTimeSqlOperator.java | 29 ++++-- .../HiveToUnixTimestampSqlOperator.java | 12 +-- .../queries/clientpositive/cbo_constantfolding.q | 5 + .../clientpositive/llap/cbo_constantfolding.q.out | 60 ++++++++++++ .../llap/constant_prop_coalesce.q.out | 10 +- .../llap/tez_dynpart_hashjoin_4.q.out | 101 ++++++++++----------- 8 files changed, 165 insertions(+), 82 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateAddSqlOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateAddSqlOperator.java index af9b12ee6f7..ef865e4d22c 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateAddSqlOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateAddSqlOperator.java @@ -18,13 +18,16 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; -import org.apache.calcite.sql.fun.SqlAbstractTimeFunction; -import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.sql.SqlFunction; +import org.apache.calcite.sql.SqlFunctionCategory; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.type.ReturnTypes; -public class HiveDateAddSqlOperator extends SqlAbstractTimeFunction { - public static final HiveDateAddSqlOperator INSTANCE = new HiveDateAddSqlOperator(); +public final class HiveDateAddSqlOperator { + public static final SqlFunction INSTANCE = + new SqlFunction("DATE_ADD", SqlKind.OTHER_FUNCTION, ReturnTypes.DATE_NULLABLE, null, null, + SqlFunctionCategory.TIMEDATE); - protected HiveDateAddSqlOperator() { - super("DATE_ADD", SqlTypeName.DATE); + private HiveDateAddSqlOperator() { } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateSubSqlOperator.java 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateSubSqlOperator.java index 4f737126f02..d1c00211189 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateSubSqlOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveDateSubSqlOperator.java @@ -18,13 +18,16 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; -import org.apache.calcite.sql.fun.SqlAbstractTimeFunction; -import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.sql.SqlFunction; +import org.apache.calcite.sql.SqlFunctionCategory; +import org.apache.calcite.sql.SqlKind; +import org.apache.calcite.sql.type.ReturnTypes; -public class HiveDateSubSqlOperator extends SqlAbstractTimeFunction { - public static final HiveDateSubSqlOperator INSTANCE = new HiveDateSubSqlOperator(); +public final class HiveDateSubSqlOperator { + public static final SqlFunction INSTANCE = + new SqlFunction("DATE_SUB", SqlKind.OTHER_FUNCTION, ReturnTypes.DATE_NULLABLE, null, null, + SqlFunctionCategory.TIMEDATE); - protected HiveDateSubSqlOperator() { - super("DATE_SUB", SqlTypeName.DATE); + private HiveDateSubSqlOperator() { } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveFromUnixTimeSqlOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveFromUnixTimeSqlOperator.java index 22d91b5a537..54c68ec89be 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveFromUnixTimeSqlOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveFromUnixTimeSqlOperator.java @@ -18,15 +18,32 @@ package org.apache.hadoop.hive.ql.optimizer.calcite.reloperators; -import org.apache.calcite.sql.fun.SqlAbstractTimeFunction; +import org.apache.calcite.sql.SqlFunction; +import org.apache.calcite.sql.SqlFunctionCategory; +import org.apache.calcite.sql.SqlKind; +import 
org.apache.calcite.sql.type.OperandTypes; +import org.apache.calcite.sql.type.ReturnTypes; +import org.apache.calcite.sql.type.SqlTypeFamily; import org.apache.calcite.sql.type.SqlTypeName; +import org.apache.calcite.sql.type.SqlTypeTransforms; + +import java.util.Arrays; /** - * Calcite SQL operator mapping to FROM_UNIXTIME Hive UDF + * Calcite SQL operator mapping to FROM_UNIXTIME Hive UDF. + * <p> + * The return type of the function is declared as {@code VARCHAR(100)} since it is highly unlikely that a user will + * request a timestamp format that requires more than 100 characters. + * </p> */ -public class HiveFromUnixTimeSqlOperator extends SqlAbstractTimeFunction { - public static final HiveFromUnixTimeSqlOperator INSTANCE = new HiveFromUnixTimeSqlOperator(); - protected HiveFromUnixTimeSqlOperator() { - super("FROM_UNIXTIME", SqlTypeName.TIMESTAMP); +public final class HiveFromUnixTimeSqlOperator { + public static final SqlFunction INSTANCE = new SqlFunction("FROM_UNIXTIME", + SqlKind.OTHER_FUNCTION, + ReturnTypes.explicit(SqlTypeName.VARCHAR, 100).andThen(SqlTypeTransforms.TO_NULLABLE), + null, + OperandTypes.family(Arrays.asList(SqlTypeFamily.INTEGER, SqlTypeFamily.STRING), number -> number == 1), + SqlFunctionCategory.STRING); + + private HiveFromUnixTimeSqlOperator() { } } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveToUnixTimestampSqlOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveToUnixTimestampSqlOperator.java index 372aa300252..18751ce7392 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveToUnixTimestampSqlOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/reloperators/HiveToUnixTimestampSqlOperator.java @@ -27,15 +27,13 @@ import org.apache.calcite.sql.type.ReturnTypes; /** * Sql UNIX_TIMESTAMP calcite operator. 
*/ -public class HiveToUnixTimestampSqlOperator { +public final class HiveToUnixTimestampSqlOperator { public static final SqlFunction INSTANCE = new SqlFunction("UNIX_TIMESTAMP", SqlKind.OTHER_FUNCTION, ReturnTypes.BIGINT, null, OperandTypes.or(OperandTypes.NILADIC, OperandTypes.or(OperandTypes.STRING, OperandTypes.TIMESTAMP, OperandTypes.DATE), - OperandTypes.STRING_STRING), SqlFunctionCategory.NUMERIC) { - @Override - public boolean isDynamicFunction() { - return true; - } - }; + OperandTypes.STRING_STRING), SqlFunctionCategory.NUMERIC); + + private HiveToUnixTimestampSqlOperator() { + } } diff --git a/ql/src/test/queries/clientpositive/cbo_constantfolding.q b/ql/src/test/queries/clientpositive/cbo_constantfolding.q new file mode 100644 index 00000000000..25f6f1345b7 --- /dev/null +++ b/ql/src/test/queries/clientpositive/cbo_constantfolding.q @@ -0,0 +1,5 @@ +EXPLAIN CBO SELECT DATE_ADD('2023-01-01', 1); +EXPLAIN CBO SELECT DATE_SUB('2023-01-01', 1); +EXPLAIN CBO SELECT FROM_UNIXTIME(1672560000); +EXPLAIN CBO SELECT TO_UNIX_TIMESTAMP(DATE '2023-01-01'); +EXPLAIN CBO SELECT UNIX_TIMESTAMP(DATE '2023-01-01'); diff --git a/ql/src/test/results/clientpositive/llap/cbo_constantfolding.q.out b/ql/src/test/results/clientpositive/llap/cbo_constantfolding.q.out new file mode 100644 index 00000000000..5e316937c46 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/cbo_constantfolding.q.out @@ -0,0 +1,60 @@ +PREHOOK: query: EXPLAIN CBO SELECT DATE_ADD('2023-01-01', 1) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN CBO SELECT DATE_ADD('2023-01-01', 1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +CBO PLAN: +HiveProject(_o__c0=[CAST(2023-01-02:DATE):DATE]) + HiveTableScan(table=[[_dummy_database, _dummy_table]], table:alias=[_dummy_table]) + +PREHOOK: query: EXPLAIN CBO SELECT DATE_SUB('2023-01-01', 1) +PREHOOK: type: QUERY 
+PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN CBO SELECT DATE_SUB('2023-01-01', 1) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +CBO PLAN: +HiveProject(_o__c0=[CAST(2022-12-31:DATE):DATE]) + HiveTableScan(table=[[_dummy_database, _dummy_table]], table:alias=[_dummy_table]) + +PREHOOK: query: EXPLAIN CBO SELECT FROM_UNIXTIME(1672560000) +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN CBO SELECT FROM_UNIXTIME(1672560000) +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +CBO PLAN: +HiveProject(_o__c0=[CAST(_UTF-16LE'2023-01-01 00:00:00':VARCHAR(2147483647) CHARACTER SET "UTF-16LE"):VARCHAR(2147483647) CHARACTER SET "UTF-16LE"]) + HiveTableScan(table=[[_dummy_database, _dummy_table]], table:alias=[_dummy_table]) + +PREHOOK: query: EXPLAIN CBO SELECT TO_UNIX_TIMESTAMP(DATE '2023-01-01') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN CBO SELECT TO_UNIX_TIMESTAMP(DATE '2023-01-01') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +CBO PLAN: +HiveProject(_o__c0=[CAST(1672560000:BIGINT):BIGINT]) + HiveTableScan(table=[[_dummy_database, _dummy_table]], table:alias=[_dummy_table]) + +PREHOOK: query: EXPLAIN CBO SELECT UNIX_TIMESTAMP(DATE '2023-01-01') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN CBO SELECT UNIX_TIMESTAMP(DATE '2023-01-01') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +#### A masked pattern was here #### +CBO PLAN: +HiveProject(_o__c0=[CAST(1672560000:BIGINT):BIGINT]) + HiveTableScan(table=[[_dummy_database, _dummy_table]], table:alias=[_dummy_table]) 
+ diff --git a/ql/src/test/results/clientpositive/llap/constant_prop_coalesce.q.out b/ql/src/test/results/clientpositive/llap/constant_prop_coalesce.q.out index cb0cd4de2c8..43bb312ec5e 100644 --- a/ql/src/test/results/clientpositive/llap/constant_prop_coalesce.q.out +++ b/ql/src/test/results/clientpositive/llap/constant_prop_coalesce.q.out @@ -50,12 +50,10 @@ STAGE PLANS: outputColumnNames: _col0, _col1, _col2 UDTF Operator function name: stack - Filter Operator - predicate: if((col0 = 1), true, true) (type: boolean) - Select Operator - expressions: if((col0 = 1), 20210308L, 20210309L) (type: bigint) - outputColumnNames: _col0 - ListSink + Select Operator + expressions: if((col0 = 1), 20210308L, 20210309L) (type: bigint) + outputColumnNames: _col0 + ListSink PREHOOK: query: select * from ( select diff --git a/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out b/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out index 5ca79e22b2b..9c999893817 100644 --- a/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out +++ b/ql/src/test/results/clientpositive/llap/tez_dynpart_hashjoin_4.q.out @@ -70,6 +70,7 @@ POSTHOOK: query: ALTER TABLE table_b UPDATE STATISTICS FOR COLUMN product_sk SET POSTHOOK: type: ALTERTABLE_UPDATETABLESTATS POSTHOOK: Input: default@table_b POSTHOOK: Output: default@table_b +Warning: Map Join MAPJOIN[31][bigTable=?] 
in task 'Map 1' is a cross product PREHOOK: query: EXPLAIN SELECT TC.CONST_DATE, TB.PRODUCT_SK FROM TABLE_A TA @@ -103,100 +104,98 @@ STAGE PLANS: Tez #### A masked pattern was here #### Edges: - Map 1 <- Map 3 (BROADCAST_EDGE) - Reducer 2 <- Map 1 (SIMPLE_EDGE), Map 4 (SIMPLE_EDGE) + Map 1 <- Map 2 (BROADCAST_EDGE) + Reducer 4 <- Map 1 (CUSTOM_SIMPLE_EDGE), Map 3 (CUSTOM_SIMPLE_EDGE) #### A masked pattern was here #### Vertices: Map 1 Map Operator Tree: TableScan - alias: ta + alias: tb filterExpr: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) - Statistics: Num rows: 200000000 Data size: 12000000000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 100000000 Data size: 15400000000 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) - Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 50000000 Data size: 7700000000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: product_id (type: int) - outputColumnNames: _col1 - Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE + expressions: product_id (type: int), product_sk (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE Map Join Operator condition map: Inner Join 0 to 1 keys: - 0 DATE'2023-11-27' (type: date) - 1 DATE'2023-11-27' (type: date) - outputColumnNames: _col1, _col2 + 0 + 1 + outputColumnNames: _col0, _col1 input vertices: - 1 Map 3 - Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE + 1 Map 2 + Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: _col1 (type: int), _col2 (type: date) - null 
sort order: zz - sort order: ++ - Map-reduce partition columns: _col1 (type: int), _col2 (type: date) - Statistics: Num rows: 100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE + value expressions: _col1 (type: string) Execution mode: vectorized, llap LLAP IO: all inputs - Map 3 + Map 2 Map Operator Tree: TableScan alias: _dummy_table Row Limit Per Split: 1 Statistics: Num rows: 1 Data size: 10 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: DATE'2023-11-27' (type: date) - outputColumnNames: _col0 - Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: DATE'2023-11-27' (type: date) - null sort order: z - sort order: + - Map-reduce partition columns: DATE'2023-11-27' (type: date) - Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col0 (type: date) + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: llap LLAP IO: no inputs - Map 4 + Map 3 Map Operator Tree: TableScan - alias: tb + alias: ta filterExpr: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) - Statistics: Num rows: 100000000 Data size: 15400000000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 200000000 Data size: 12000000000 Basic stats: COMPLETE Column stats: COMPLETE Filter Operator predicate: ((start_date = DATE'2023-11-27') and product_id is not null) (type: boolean) - Statistics: Num rows: 50000000 Data size: 7700000000 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 
100000000 Data size: 6000000000 Basic stats: COMPLETE Column stats: COMPLETE Select Operator - expressions: product_id (type: int), product_sk (type: string) - outputColumnNames: _col1, _col2 - Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE + expressions: product_id (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE Reduce Output Operator - key expressions: _col1 (type: int), DATE'2023-11-27' (type: date) - null sort order: zz - sort order: ++ - Map-reduce partition columns: _col1 (type: int), DATE'2023-11-27' (type: date) - Statistics: Num rows: 50000000 Data size: 4900000000 Basic stats: COMPLETE Column stats: COMPLETE - value expressions: _col2 (type: string) + key expressions: _col0 (type: int) + null sort order: z + sort order: + + Map-reduce partition columns: _col0 (type: int) + Statistics: Num rows: 100000000 Data size: 400000000 Basic stats: COMPLETE Column stats: COMPLETE Execution mode: vectorized, llap LLAP IO: all inputs - Reducer 2 - Execution mode: llap + Reducer 4 + Execution mode: vectorized, llap Reduce Operator Tree: - Merge Join Operator + Map Join Operator condition map: Inner Join 0 to 1 keys: - 0 _col1 (type: int), _col2 (type: date) - 1 _col1 (type: int), DATE'2023-11-27' (type: date) - outputColumnNames: _col2, _col5 - Statistics: Num rows: 16666666666 Data size: 2499999999900 Basic stats: COMPLETE Column stats: COMPLETE + 0 KEY.reducesinkkey0 (type: int) + 1 KEY.reducesinkkey0 (type: int) + outputColumnNames: _col1 + input vertices: + 0 Map 1 + Statistics: Num rows: 16666666666 Data size: 1566666666604 Basic stats: COMPLETE Column stats: COMPLETE + DynamicPartitionHashJoin: true Select Operator - expressions: _col2 (type: date), _col5 (type: string) + expressions: DATE'2023-11-27' (type: date), _col1 (type: string) outputColumnNames: _col0, _col1 - Statistics: Num rows: 16666666666 Data size: 
1566666666604 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 16666666666 Data size: 2499999999900 Basic stats: COMPLETE Column stats: COMPLETE File Output Operator compressed: false - Statistics: Num rows: 16666666666 Data size: 1566666666604 Basic stats: COMPLETE Column stats: COMPLETE + Statistics: Num rows: 16666666666 Data size: 2499999999900 Basic stats: COMPLETE Column stats: COMPLETE table: input format: org.apache.hadoop.mapred.SequenceFileInputFormat output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat