This is an automated email from the ASF dual-hosted git repository. krisztiankasa pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/master by this push: new 4c033b7ed16 HIVE-27558: HBase table query does not push BETWEEN predicate to storage layer (Dayakar M, reviewed by Krisztian Kasa) 4c033b7ed16 is described below commit 4c033b7ed16bc544cf1e6cb262208f05c380730e Author: Dayakar M <59791497+mdaya...@users.noreply.github.com> AuthorDate: Fri Sep 15 19:37:59 2023 +0530 HIVE-27558: HBase table query does not push BETWEEN predicate to storage layer (Dayakar M, reviewed by Krisztian Kasa) Co-authored-by: mdayakar <mdaya...@cloudera.com> --- .../hadoop/hive/hbase/HBaseStorageHandler.java | 44 +++++ .../test/queries/positive/hbase_between_pushdown.q | 28 +++ .../results/positive/hbase_between_pushdown.q.out | 217 +++++++++++++++++++++ 3 files changed, 289 insertions(+) diff --git a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStorageHandler.java b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStorageHandler.java index 24b2ec93933..4e1a5da725c 100644 --- a/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStorageHandler.java +++ b/hbase-handler/src/java/org/apache/hadoop/hive/hbase/HBaseStorageHandler.java @@ -22,6 +22,7 @@ import java.io.IOException; import java.net.URI; import java.net.URISyntaxException; import java.util.ArrayList; +import java.util.Arrays; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; @@ -54,6 +55,7 @@ import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; import org.apache.hadoop.hive.ql.plan.ExprNodeGenericFuncDesc; import org.apache.hadoop.hive.ql.plan.TableDesc; import org.apache.hadoop.hive.ql.security.authorization.HiveCustomStorageHandlerUtils; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDFBetween; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqual; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrGreaterThan; import org.apache.hadoop.hive.ql.udf.generic.GenericUDFOPEqualOrLessThan; @@ -427,6 +429,10 @@ public class HBaseStorageHandler extends DefaultStorageHandler ExprNodeDesc predicate) { ColumnMapping keyMapping = hBaseSerDe.getHBaseSerdeParam().getKeyColumnMapping(); ColumnMapping tsMapping = hBaseSerDe.getHBaseSerdeParam().getTimestampColumnMapping(); + + // converts 'BETWEEN' predicate to '<= AND >=' predicate + predicate = convertBetweenToLTOrEqualAndGTOrEqual(predicate); + IndexPredicateAnalyzer analyzer = HiveHBaseTableInputFormat.newIndexPredicateAnalyzer( keyMapping.columnName, keyMapping.isComparable(), tsMapping == null ? null : tsMapping.columnName); @@ -503,6 +509,19 @@ public class HBaseStorageHandler extends DefaultStorageHandler continue; } } + + // In HBase, Keys are sorted lexicographically so the byte representation of negative values comes after the positive values + // So don't pushdown the predicate if any constant value is a negative number. + // if first condition constant value is less than 0 then don't pushdown the predicate + if (((Number) searchConditions.get(0).getConstantDesc().getValue()).longValue() < 0 && scSize == 2) { + residualPredicate = extractResidualCondition(analyzer, searchConditions, residualPredicate); + continue; + } + // if second condition constant value is less than 0 then don't pushdown the predicate + if (scSize == 2 && ((Number) searchConditions.get(1).getConstantDesc().getValue()).longValue() < 0) { + residualPredicate = extractResidualCondition(analyzer, searchConditions, residualPredicate); + continue; + } } // This one can be pushed @@ -516,6 +535,31 @@ public class HBaseStorageHandler extends DefaultStorageHandler return decomposedPredicate; } + private static ExprNodeDesc convertBetweenToLTOrEqualAndGTOrEqual(ExprNodeDesc predicate) { + if (predicate instanceof ExprNodeGenericFuncDesc && ((ExprNodeGenericFuncDesc) predicate).getGenericUDF() instanceof GenericUDFBetween && "false".equalsIgnoreCase( + predicate.getChildren().get(0).getExprString())) { + try { + List<ExprNodeDesc> betweenInputs = predicate.getChildren(); + List<ExprNodeDesc> gtOrEqualInputs = Arrays.asList(betweenInputs.get(1), betweenInputs.get(2)); + List<ExprNodeDesc> ltOrEqualInputs = Arrays.asList(betweenInputs.get(1), betweenInputs.get(3)); + + ExprNodeGenericFuncDesc gtOrEqual = + ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo(">=").getGenericUDF(), ">=", + gtOrEqualInputs); + ExprNodeGenericFuncDesc ltOrEqual = + ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("<=").getGenericUDF(), "<=", + ltOrEqualInputs); + List<ExprNodeDesc> andInputs = Arrays.asList(ltOrEqual, gtOrEqual); + + predicate = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("and").getGenericUDF(), "and", + andInputs); + } catch (Exception se) { + LOG.warn("Unable to convert 'BETWEEN' predicate to '>= AND <=' predicate.", se); + } + } + return predicate; + } + private static ExprNodeGenericFuncDesc extractStorageHandlerCondition(IndexPredicateAnalyzer analyzer, List<IndexSearchCondition> searchConditions, ExprNodeGenericFuncDesc inputExpr) { if (inputExpr == null) { diff --git a/hbase-handler/src/test/queries/positive/hbase_between_pushdown.q b/hbase-handler/src/test/queries/positive/hbase_between_pushdown.q new file mode 100644 index 00000000000..a2a349be190 --- /dev/null +++ b/hbase-handler/src/test/queries/positive/hbase_between_pushdown.q @@ -0,0 +1,28 @@ +--! qt:dataset:src + +CREATE EXTERNAL TABLE hbase_pushdown(key int, value string) +STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' +WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key#binary,cf:string") +TBLPROPERTIES ("external.table.purge" = "true"); + +INSERT OVERWRITE TABLE hbase_pushdown SELECT * FROM src; +INSERT INTO hbase_pushdown VALUES(-10, 'NegativeValue10'),(-9, 'NegativeValue9'),(-5, 'NegativeValue5'),(-2, 'NegativeValue2'); + +explain select * from hbase_pushdown where key >= 90 and key <= 100; +select * from hbase_pushdown where key >= 90 and key <= 100; + +-- In HBase, Keys are sorted lexicographically so the byte representation of negative values comes after the positive values +-- So don't pushdown the predicate if any constant value is a negative number. +explain select * from hbase_pushdown where key >= -5 and key <= 5; +select * from hbase_pushdown where key >= -5 and key <= 5; + +-- Here the predicate can be pushed down even though value is negative as its not a range predicate. +explain select * from hbase_pushdown where key = -5; +select * from hbase_pushdown where key = -5; + + +set hive.optimize.ppd.storage=false; + +-- Now 'BETWEEN' predicate will not convert to '<= AND >=', also 'BETWEEN' predicate will not be pushed down +explain select * from hbase_pushdown where key >= 90 and key <= 100; +select * from hbase_pushdown where key >= 90 and key <= 100; \ No newline at end of file diff --git a/hbase-handler/src/test/results/positive/hbase_between_pushdown.q.out b/hbase-handler/src/test/results/positive/hbase_between_pushdown.q.out new file mode 100644 index 00000000000..6fe60f9c599 --- /dev/null +++ b/hbase-handler/src/test/results/positive/hbase_between_pushdown.q.out @@ -0,0 +1,217 @@ +PREHOOK: query: CREATE EXTERNAL TABLE hbase_pushdown(key int, value string) +STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' +WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key#binary,cf:string") +TBLPROPERTIES ("external.table.purge" = "true") +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@hbase_pushdown +POSTHOOK: query: CREATE EXTERNAL TABLE hbase_pushdown(key int, value string) +STORED BY 'org.apache.hadoop.hive.hbase.HBaseStorageHandler' +WITH SERDEPROPERTIES ("hbase.columns.mapping" = ":key#binary,cf:string") +TBLPROPERTIES ("external.table.purge" = "true") +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@hbase_pushdown +PREHOOK: query: INSERT OVERWRITE TABLE hbase_pushdown SELECT * FROM src +PREHOOK: type: QUERY +PREHOOK: Input: default@src +PREHOOK: Output: default@hbase_pushdown +POSTHOOK: query: INSERT OVERWRITE TABLE hbase_pushdown SELECT * FROM src +POSTHOOK: type: QUERY +POSTHOOK: Input: default@src +POSTHOOK: Output: default@hbase_pushdown +PREHOOK: query: INSERT INTO hbase_pushdown VALUES(-10, 'NegativeValue10'),(-9, 'NegativeValue9'),(-5, 'NegativeValue5'),(-2, 'NegativeValue2') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@hbase_pushdown +POSTHOOK: query: INSERT INTO hbase_pushdown VALUES(-10, 'NegativeValue10'),(-9, 'NegativeValue9'),(-5, 'NegativeValue5'),(-2, 'NegativeValue2') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@hbase_pushdown +PREHOOK: query: explain select * from hbase_pushdown where key >= 90 and key <= 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: explain select * from hbase_pushdown where key >= 90 and key <= 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: hbase_pushdown + filterExpr: ((key <= 100) and (key >= 90)) (type: boolean) + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + ListSink + +PREHOOK: query: select * from hbase_pushdown where key >= 90 and key <= 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: select * from hbase_pushdown where key >= 90 and key <= 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +90 val_90 +92 val_92 +95 val_95 +96 val_96 +97 val_97 +98 val_98 +100 val_100 +PREHOOK: query: explain select * from hbase_pushdown where key >= -5 and key <= 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: explain select * from hbase_pushdown where key >= -5 and key <= 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: hbase_pushdown + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: ((key <= 5) and (key >= -5)) (type: boolean) + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from hbase_pushdown where key >= -5 and key <= 5 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: select * from hbase_pushdown where key >= -5 and key <= 5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +0 val_0 +2 val_2 +4 val_4 +5 val_5 +-5 NegativeValue5 +-2 NegativeValue2 +PREHOOK: query: explain select * from hbase_pushdown where key = -5 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: explain select * from hbase_pushdown where key = -5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + TableScan + alias: hbase_pushdown + filterExpr: (key = -5) (type: boolean) + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: -5 (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 184 Basic stats: COMPLETE Column stats: NONE + ListSink + +PREHOOK: query: select * from hbase_pushdown where key = -5 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: select * from hbase_pushdown where key = -5 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +-5 NegativeValue5 +PREHOOK: query: explain select * from hbase_pushdown where key >= 90 and key <= 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: explain select * from hbase_pushdown where key >= 90 and key <= 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: hbase_pushdown + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + Filter Operator + predicate: key BETWEEN 90 AND 100 (type: boolean) + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: key (type: int), value (type: string) + outputColumnNames: _col0, _col1 + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + Statistics: Num rows: 1 Data size: 188 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + Execution mode: vectorized + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from hbase_pushdown where key >= 90 and key <= 100 +PREHOOK: type: QUERY +PREHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +POSTHOOK: query: select * from hbase_pushdown where key >= 90 and key <= 100 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@hbase_pushdown +#### A masked pattern was here #### +90 val_90 +92 val_92 +95 val_95 +96 val_96 +97 val_97 +98 val_98 +100 val_100