This is an automated email from the ASF dual-hosted git repository. zabetak pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
commit 2d855b27d31db6476f18870651db6987816bb5e3 Author: okumin <g...@okumin.com> AuthorDate: Sun Apr 21 22:55:34 2024 +0900 HIVE-28207: NullPointerException is thrown when checking column uniqueness (Shohei Okumiya reviewed by Stamatis Zampetakis, Ayush Saxena) Close apache/hive#5207 --- .../calcite/rules/HiveAggregateSplitRule.java | 4 +- .../calcite/rules/HiveJoinConstraintsRule.java | 3 +- .../optimizer/calcite/stats/HiveRelMdRowCount.java | 3 +- .../queries/clientpositive/cbo_join_constraints.q | 11 ++ .../cbo_row_count_non_simple_filter.q | 26 ++++ .../clientpositive/llap/cbo_join_constraints.q.out | 78 ++++++++++ .../llap/cbo_row_count_non_simple_filter.q.out | 159 +++++++++++++++++++++ 7 files changed, 281 insertions(+), 3 deletions(-) diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateSplitRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateSplitRule.java index b956201f133..8534981b19f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateSplitRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveAggregateSplitRule.java @@ -86,7 +86,9 @@ public class HiveAggregateSplitRule extends RelOptRule { aggregateCall.name)); } - if (aggregate.getCluster().getMetadataQuery().areColumnsUnique(aggregate.getInput(), bottomAggregateGroupSet)) { + final Boolean isUnique = + aggregate.getCluster().getMetadataQuery().areColumnsUnique(aggregate.getInput(), bottomAggregateGroupSet); + if (isUnique != null && isUnique) { // Nothing to do, probably already pushed return; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java index d60d0ee3d84..58eb2011ed0 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/rules/HiveJoinConstraintsRule.java @@ -274,7 +274,8 @@ public class HiveJoinConstraintsRule extends RelOptRule { } } - if (!call.getMetadataQuery().areColumnsUnique(nonFkInput, fkJoinColBuilder.build())) { + final Boolean isUnique = call.getMetadataQuery().areColumnsUnique(nonFkInput, fkJoinColBuilder.build()); + if (isUnique == null || !isUnique) { return; } diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java index b4c649871ea..254a5ed8c83 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/calcite/stats/HiveRelMdRowCount.java @@ -638,7 +638,8 @@ public class HiveRelMdRowCount extends RelMdRowCount { private boolean isSimple(ImmutableBitSet condBits, Filter filter, RelMetadataQuery mq) { // Returns whether the filter is only applied on the key columns if (constraintsBased) { - return mq.areColumnsUnique(filter, condBits); + final Boolean isUnique = mq.areColumnsUnique(filter, condBits); + return isUnique != null && isUnique; } return isKey(condBits, filter, mq); } diff --git a/ql/src/test/queries/clientpositive/cbo_join_constraints.q b/ql/src/test/queries/clientpositive/cbo_join_constraints.q new file mode 100644 index 00000000000..2bba038d621 --- /dev/null +++ b/ql/src/test/queries/clientpositive/cbo_join_constraints.q @@ -0,0 +1,11 @@ +CREATE TABLE `store_sales` (`ss_item_sk` bigint); +CREATE TABLE `household_demographics` (`hd_demo_sk` bigint); +CREATE TABLE `item` (`i_item_sk` bigint); +ALTER TABLE `store_sales` ADD CONSTRAINT `pk_ss` PRIMARY KEY (`ss_item_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `item` ADD CONSTRAINT `pk_i` PRIMARY KEY (`i_item_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `store_sales` ADD CONSTRAINT `ss_i` FOREIGN KEY (`ss_item_sk`) REFERENCES `item`(`i_item_sk`) DISABLE NOVALIDATE RELY; + +EXPLAIN CBO +SELECT i_item_sk +FROM store_sales, household_demographics, item +WHERE ss_item_sk = i_item_sk; diff --git a/ql/src/test/queries/clientpositive/cbo_row_count_non_simple_filter.q b/ql/src/test/queries/clientpositive/cbo_row_count_non_simple_filter.q new file mode 100644 index 00000000000..6169370587e --- /dev/null +++ b/ql/src/test/queries/clientpositive/cbo_row_count_non_simple_filter.q @@ -0,0 +1,26 @@ +CREATE TABLE `store_sales` (`ss_item_sk` bigint, `ss_hdemo_sk` bigint); +CREATE TABLE `household_demographics` (`hd_demo_sk` bigint, `hd_income_band_sk` bigint); +CREATE TABLE `item` (`i_item_sk` bigint, `i_product_name` string); +CREATE TABLE `income_band`(`ib_income_band_sk` bigint); +CREATE TABLE `customer`(`c_current_hdemo_sk` bigint); +ALTER TABLE `store_sales` ADD CONSTRAINT `pk_ss` PRIMARY KEY (`ss_item_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `item` ADD CONSTRAINT `pk_i` PRIMARY KEY (`i_item_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `household_demographics` ADD CONSTRAINT `pk_hd` PRIMARY KEY (`hd_demo_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `income_band` ADD CONSTRAINT `pk_ib` PRIMARY KEY (`ib_income_band_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `store_sales` ADD CONSTRAINT `ss_i` FOREIGN KEY (`ss_item_sk`) REFERENCES `item`(`i_item_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `household_demographics` ADD CONSTRAINT `hd_ib` FOREIGN KEY (`hd_income_band_sk`) REFERENCES `income_band`(`ib_income_band_sk`) DISABLE NOVALIDATE RELY; +ALTER TABLE `customer` ADD CONSTRAINT `c_hd` FOREIGN KEY (`c_current_hdemo_sk`) REFERENCES `household_demographics`(`hd_demo_sk`) DISABLE NOVALIDATE RELY; + +EXPLAIN CBO +SELECT i_product_name, i_item_sk +FROM store_sales + ,customer + ,household_demographics hd1 + ,household_demographics hd2 + ,income_band ib2 + ,item +WHERE ss_hdemo_sk = hd1.hd_demo_sk AND + ss_item_sk = i_item_sk and + c_current_hdemo_sk = hd2.hd_demo_sk AND + hd2.hd_income_band_sk = ib2.ib_income_band_sk +GROUP BY i_product_name, i_item_sk; diff --git a/ql/src/test/results/clientpositive/llap/cbo_join_constraints.q.out b/ql/src/test/results/clientpositive/llap/cbo_join_constraints.q.out new file mode 100644 index 00000000000..73a009d199b --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/cbo_join_constraints.q.out @@ -0,0 +1,78 @@ +PREHOOK: query: CREATE TABLE `store_sales` (`ss_item_sk` bigint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: CREATE TABLE `store_sales` (`ss_item_sk` bigint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +PREHOOK: query: CREATE TABLE `household_demographics` (`hd_demo_sk` bigint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@household_demographics +POSTHOOK: query: CREATE TABLE `household_demographics` (`hd_demo_sk` bigint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@household_demographics +PREHOOK: query: CREATE TABLE `item` (`i_item_sk` bigint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@item +POSTHOOK: query: CREATE TABLE `item` (`i_item_sk` bigint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@item +PREHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `pk_ss` PRIMARY KEY (`ss_item_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@store_sales +PREHOOK: Output: default@store_sales +POSTHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `pk_ss` PRIMARY KEY (`ss_item_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@store_sales +POSTHOOK: Output: default@store_sales +PREHOOK: query: ALTER TABLE `item` ADD CONSTRAINT `pk_i` PRIMARY KEY (`i_item_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@item +PREHOOK: Output: default@item +POSTHOOK: query: ALTER TABLE `item` ADD CONSTRAINT `pk_i` PRIMARY KEY (`i_item_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@item +POSTHOOK: Output: default@item +PREHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `ss_i` FOREIGN KEY (`ss_item_sk`) REFERENCES `item`(`i_item_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@store_sales +PREHOOK: Output: default@store_sales +POSTHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `ss_i` FOREIGN KEY (`ss_item_sk`) REFERENCES `item`(`i_item_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@store_sales +POSTHOOK: Output: default@store_sales +Warning: Shuffle Join MERGEJOIN[29][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: EXPLAIN CBO +SELECT i_item_sk +FROM store_sales, household_demographics, item +WHERE ss_item_sk = i_item_sk +PREHOOK: type: QUERY +PREHOOK: Input: default@household_demographics +PREHOOK: Input: default@item +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN CBO +SELECT i_item_sk +FROM store_sales, household_demographics, item +WHERE ss_item_sk = i_item_sk +POSTHOOK: type: QUERY +POSTHOOK: Input: default@household_demographics +POSTHOOK: Input: default@item +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +CBO PLAN: +HiveProject(i_item_sk=[$2]) + HiveJoin(condition=[=($0, $2)], joinType=[inner], algorithm=[none], cost=[not available]) + HiveJoin(condition=[true], joinType=[inner], algorithm=[none], cost=[not available]) + HiveProject(ss_item_sk=[$0]) + HiveTableScan(table=[[default, store_sales]], table:alias=[store_sales]) + HiveProject(DUMMY=[0]) + HiveTableScan(table=[[default, household_demographics]], table:alias=[household_demographics]) + HiveProject(i_item_sk=[$0]) + HiveTableScan(table=[[default, item]], table:alias=[item]) + diff --git a/ql/src/test/results/clientpositive/llap/cbo_row_count_non_simple_filter.q.out b/ql/src/test/results/clientpositive/llap/cbo_row_count_non_simple_filter.q.out new file mode 100644 index 00000000000..3ef5aa196e4 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/cbo_row_count_non_simple_filter.q.out @@ -0,0 +1,159 @@ +PREHOOK: query: CREATE TABLE `store_sales` (`ss_item_sk` bigint, `ss_hdemo_sk` bigint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@store_sales +POSTHOOK: query: CREATE TABLE `store_sales` (`ss_item_sk` bigint, `ss_hdemo_sk` bigint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@store_sales +PREHOOK: query: CREATE TABLE `household_demographics` (`hd_demo_sk` bigint, `hd_income_band_sk` bigint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@household_demographics +POSTHOOK: query: CREATE TABLE `household_demographics` (`hd_demo_sk` bigint, `hd_income_band_sk` bigint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@household_demographics +PREHOOK: query: CREATE TABLE `item` (`i_item_sk` bigint, `i_product_name` string) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@item +POSTHOOK: query: CREATE TABLE `item` (`i_item_sk` bigint, `i_product_name` string) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@item +PREHOOK: query: CREATE TABLE `income_band`(`ib_income_band_sk` bigint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@income_band +POSTHOOK: query: CREATE TABLE `income_band`(`ib_income_band_sk` bigint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@income_band +PREHOOK: query: CREATE TABLE `customer`(`c_current_hdemo_sk` bigint) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@customer +POSTHOOK: query: CREATE TABLE `customer`(`c_current_hdemo_sk` bigint) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@customer +PREHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `pk_ss` PRIMARY KEY (`ss_item_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@store_sales +PREHOOK: Output: default@store_sales +POSTHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `pk_ss` PRIMARY KEY (`ss_item_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@store_sales +POSTHOOK: Output: default@store_sales +PREHOOK: query: ALTER TABLE `item` ADD CONSTRAINT `pk_i` PRIMARY KEY (`i_item_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@item +PREHOOK: Output: default@item +POSTHOOK: query: ALTER TABLE `item` ADD CONSTRAINT `pk_i` PRIMARY KEY (`i_item_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@item +POSTHOOK: Output: default@item +PREHOOK: query: ALTER TABLE `household_demographics` ADD CONSTRAINT `pk_hd` PRIMARY KEY (`hd_demo_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@household_demographics +PREHOOK: Output: default@household_demographics +POSTHOOK: query: ALTER TABLE `household_demographics` ADD CONSTRAINT `pk_hd` PRIMARY KEY (`hd_demo_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@household_demographics +POSTHOOK: Output: default@household_demographics +PREHOOK: query: ALTER TABLE `income_band` ADD CONSTRAINT `pk_ib` PRIMARY KEY (`ib_income_band_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@income_band +PREHOOK: Output: default@income_band +POSTHOOK: query: ALTER TABLE `income_band` ADD CONSTRAINT `pk_ib` PRIMARY KEY (`ib_income_band_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@income_band +POSTHOOK: Output: default@income_band +PREHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `ss_i` FOREIGN KEY (`ss_item_sk`) REFERENCES `item`(`i_item_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@store_sales +PREHOOK: Output: default@store_sales +POSTHOOK: query: ALTER TABLE `store_sales` ADD CONSTRAINT `ss_i` FOREIGN KEY (`ss_item_sk`) REFERENCES `item`(`i_item_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@store_sales +POSTHOOK: Output: default@store_sales +PREHOOK: query: ALTER TABLE `household_demographics` ADD CONSTRAINT `hd_ib` FOREIGN KEY (`hd_income_band_sk`) REFERENCES `income_band`(`ib_income_band_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@household_demographics +PREHOOK: Output: default@household_demographics +POSTHOOK: query: ALTER TABLE `household_demographics` ADD CONSTRAINT `hd_ib` FOREIGN KEY (`hd_income_band_sk`) REFERENCES `income_band`(`ib_income_band_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@household_demographics +POSTHOOK: Output: default@household_demographics +PREHOOK: query: ALTER TABLE `customer` ADD CONSTRAINT `c_hd` FOREIGN KEY (`c_current_hdemo_sk`) REFERENCES `household_demographics`(`hd_demo_sk`) DISABLE NOVALIDATE RELY +PREHOOK: type: ALTERTABLE_ADDCONSTRAINT +PREHOOK: Input: default@customer +PREHOOK: Output: default@customer +POSTHOOK: query: ALTER TABLE `customer` ADD CONSTRAINT `c_hd` FOREIGN KEY (`c_current_hdemo_sk`) REFERENCES `household_demographics`(`hd_demo_sk`) DISABLE NOVALIDATE RELY +POSTHOOK: type: ALTERTABLE_ADDCONSTRAINT +POSTHOOK: Input: default@customer +POSTHOOK: Output: default@customer +Warning: Shuffle Join MERGEJOIN[78][tables = [$hdt$_0, $hdt$_1]] in Stage 'Reducer 2' is a cross product +PREHOOK: query: EXPLAIN CBO +SELECT i_product_name, i_item_sk +FROM store_sales + ,customer + ,household_demographics hd1 + ,household_demographics hd2 + ,income_band ib2 + ,item +WHERE ss_hdemo_sk = hd1.hd_demo_sk AND + ss_item_sk = i_item_sk and + c_current_hdemo_sk = hd2.hd_demo_sk AND + hd2.hd_income_band_sk = ib2.ib_income_band_sk +GROUP BY i_product_name, i_item_sk +PREHOOK: type: QUERY +PREHOOK: Input: default@customer +PREHOOK: Input: default@household_demographics +PREHOOK: Input: default@income_band +PREHOOK: Input: default@item +PREHOOK: Input: default@store_sales +#### A masked pattern was here #### +POSTHOOK: query: EXPLAIN CBO +SELECT i_product_name, i_item_sk +FROM store_sales + ,customer + ,household_demographics hd1 + ,household_demographics hd2 + ,income_band ib2 + ,item +WHERE ss_hdemo_sk = hd1.hd_demo_sk AND + ss_item_sk = i_item_sk and + c_current_hdemo_sk = hd2.hd_demo_sk AND + hd2.hd_income_band_sk = ib2.ib_income_band_sk +GROUP BY i_product_name, i_item_sk +POSTHOOK: type: QUERY +POSTHOOK: Input: default@customer +POSTHOOK: Input: default@household_demographics +POSTHOOK: Input: default@income_band +POSTHOOK: Input: default@item +POSTHOOK: Input: default@store_sales +#### A masked pattern was here #### +CBO PLAN: +HiveProject(i_product_name=[$1], i_item_sk=[$0]) + HiveAggregate(group=[{0, 1}]) + HiveJoin(condition=[=($2, $6)], joinType=[inner], algorithm=[none], cost=[not available]) + HiveJoin(condition=[=($3, $0)], joinType=[inner], algorithm=[none], cost=[not available]) + HiveJoin(condition=[true], joinType=[inner], algorithm=[none], cost=[not available]) + HiveProject(i_item_sk=[$0], i_product_name=[$1]) + HiveTableScan(table=[[default, item]], table:alias=[item]) + HiveProject(c_current_hdemo_sk=[$0]) + HiveFilter(condition=[IS NOT NULL($0)]) + HiveTableScan(table=[[default, customer]], table:alias=[customer]) + HiveProject(ss_item_sk=[$0], ss_hdemo_sk=[$1], hd_demo_sk=[$2]) + HiveJoin(condition=[=($1, $2)], joinType=[inner], algorithm=[none], cost=[not available]) + HiveProject(ss_item_sk=[$0], ss_hdemo_sk=[$1]) + HiveFilter(condition=[IS NOT NULL($1)]) + HiveTableScan(table=[[default, store_sales]], table:alias=[store_sales]) + HiveProject(hd_demo_sk=[$0]) + HiveTableScan(table=[[default, household_demographics]], table:alias=[hd1]) + HiveProject(hd_demo_sk=[$0]) + HiveFilter(condition=[IS NOT NULL($1)]) + HiveTableScan(table=[[default, household_demographics]], table:alias=[hd2]) +