Repository: hive Updated Branches: refs/heads/master 39c858c77 -> b2338ff4e
HIVE-14199 Enable Bucket Pruning for ACID tables (Saket Saurabh via Eugene Koifman) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/b2338ff4 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/b2338ff4 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/b2338ff4 Branch: refs/heads/master Commit: b2338ff4ec6ce3b3bf3478b82d6e4b5d5ec8ca8e Parents: 39c858c Author: Eugene Koifman <ekoif...@hortonworks.com> Authored: Mon Aug 22 15:33:16 2016 -0700 Committer: Eugene Koifman <ekoif...@hortonworks.com> Committed: Mon Aug 22 15:33:16 2016 -0700 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 3 +- .../apache/hadoop/hive/ql/exec/Utilities.java | 10 ++ .../clientpositive/acid_bucket_pruning.q | 21 +++ .../tez/acid_bucket_pruning.q.out | 151 +++++++++++++++++++ 4 files changed, 184 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/b2338ff4/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 16c09da..662ae3e 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -391,7 +391,8 @@ minitez.query.files.shared=acid_globallimit.q,\ union_type_chk.q -minitez.query.files=acid_vectorization_missing_cols.q,\ +minitez.query.files=acid_bucket_pruning.q,\ + acid_vectorization_missing_cols.q,\ bucket_map_join_tez1.q,\ smb_cache.q,\ bucket_map_join_tez2.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/b2338ff4/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java index c97c335..a542dc4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/Utilities.java @@ -1683,6 +1683,16 @@ public final class Utilities { } return Integer.parseInt(m.group(2)); } + // Check to see if the bucketName matches the pattern "bucket_([0-9]+).*" + // This can happen in ACID cases when we have splits on delta files, where the filenames + // are of the form delta_x_y/bucket_a. + if (bucketName.startsWith(AcidUtils.BUCKET_PREFIX)) { + m = AcidUtils.BUCKET_DIGIT_PATTERN.matcher(bucketName); + if (m.find()) { + return Integer.parseInt(m.group()); + } + // Note that legacy bucket digit pattern are being ignored here. + } return -1; } http://git-wip-us.apache.org/repos/asf/hive/blob/b2338ff4/ql/src/test/queries/clientpositive/acid_bucket_pruning.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/acid_bucket_pruning.q b/ql/src/test/queries/clientpositive/acid_bucket_pruning.q new file mode 100644 index 0000000..24f8de1 --- /dev/null +++ b/ql/src/test/queries/clientpositive/acid_bucket_pruning.q @@ -0,0 +1,21 @@ +set hive.mapred.mode=nonstrict; +set hive.optimize.ppd=true; +set hive.optimize.index.filter=true; +set hive.tez.bucket.pruning=true; +set hive.explain.user=false; +set hive.fetch.task.conversion=none; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + +-- Bucket pruning only works for ACID when split-update (U=D+I) has been enabled for the table. +-- For e.g., this can be done by setting 'transactional_properties' = 'default'. +-- This also means that bucket pruning will not work for ACID tables with legacy behaviour. + +CREATE TABLE acidTblDefault(a INT) CLUSTERED BY(a) INTO 16 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true', 'transactional_properties'='default'); +INSERT INTO TABLE acidTblDefault SELECT cint FROM alltypesorc WHERE cint IS NOT NULL ORDER BY cint; +INSERT INTO TABLE acidTblDefault VALUES (1); + +-- Exactly one of the buckets should be selected out of the 16 buckets +-- by the following selection query. +EXPLAIN EXTENDED +SELECT * FROM acidTblDefault WHERE a = 1; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/b2338ff4/ql/src/test/results/clientpositive/tez/acid_bucket_pruning.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/acid_bucket_pruning.q.out b/ql/src/test/results/clientpositive/tez/acid_bucket_pruning.q.out new file mode 100644 index 0000000..e71bc12 --- /dev/null +++ b/ql/src/test/results/clientpositive/tez/acid_bucket_pruning.q.out @@ -0,0 +1,151 @@ +PREHOOK: query: -- Bucket pruning only works for ACID when split-update (U=D+I) has been enabled for the table. +-- For e.g., this can be done by setting 'transactional_properties' = 'default'. +-- This also means that bucket pruning will not work for ACID tables with legacy behaviour. + +CREATE TABLE acidTblDefault(a INT) CLUSTERED BY(a) INTO 16 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true', 'transactional_properties'='default') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acidTblDefault +POSTHOOK: query: -- Bucket pruning only works for ACID when split-update (U=D+I) has been enabled for the table. +-- For e.g., this can be done by setting 'transactional_properties' = 'default'. +-- This also means that bucket pruning will not work for ACID tables with legacy behaviour. + +CREATE TABLE acidTblDefault(a INT) CLUSTERED BY(a) INTO 16 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true', 'transactional_properties'='default') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acidTblDefault +PREHOOK: query: INSERT INTO TABLE acidTblDefault SELECT cint FROM alltypesorc WHERE cint IS NOT NULL ORDER BY cint +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acidtbldefault +POSTHOOK: query: INSERT INTO TABLE acidTblDefault SELECT cint FROM alltypesorc WHERE cint IS NOT NULL ORDER BY cint +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acidtbldefault +POSTHOOK: Lineage: acidtbldefault.a SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +PREHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1) +PREHOOK: type: QUERY +PREHOOK: Input: default@values__tmp__table__1 +PREHOOK: Output: default@acidtbldefault +POSTHOOK: query: INSERT INTO TABLE acidTblDefault VALUES (1) +POSTHOOK: type: QUERY +POSTHOOK: Input: default@values__tmp__table__1 +POSTHOOK: Output: default@acidtbldefault +POSTHOOK: Lineage: acidtbldefault.a EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +PREHOOK: query: -- Exactly one of the buckets should be selected out of the 16 buckets +-- by the following selection query. +EXPLAIN EXTENDED +SELECT * FROM acidTblDefault WHERE a = 1 +PREHOOK: type: QUERY +POSTHOOK: query: -- Exactly one of the buckets should be selected out of the 16 buckets +-- by the following selection query. +EXPLAIN EXTENDED +SELECT * FROM acidTblDefault WHERE a = 1 +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: acidtbldefault + filterExpr: (a = 1) (type: boolean) + buckets included: [1,] of 16 + Statistics: Num rows: 8983 Data size: 35932 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: (a = 1) (type: boolean) + Statistics: Num rows: 4491 Data size: 17964 Basic stats: COMPLETE Column stats: NONE + Select Operator + expressions: 1 (type: int) + outputColumnNames: _col0 + Statistics: Num rows: 4491 Data size: 17964 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 4491 Data size: 17964 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types int + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + Path -> Alias: +#### A masked pattern was here #### + Path -> Partition: +#### A masked pattern was here #### + Partition + base file name: acidtbldefault + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count 16 + bucket_field_name a + columns a + columns.comments + columns.types int +#### A masked pattern was here #### + name default.acidtbldefault + numFiles 17 + numRows 0 + rawDataSize 0 + serialization.ddl struct acidtbldefault { i32 a} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 35932 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count 16 + bucket_field_name a + columns a + columns.comments + columns.types int +#### A masked pattern was here #### + name default.acidtbldefault + numFiles 17 + numRows 0 + rawDataSize 0 + serialization.ddl struct acidtbldefault { i32 a} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 35932 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.acidtbldefault + name: default.acidtbldefault + Truncated Path -> Alias: + /acidtbldefault [acidtbldefault] + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink +