Repository: hive Updated Branches: refs/heads/master 111ed0964 -> e51f7c9d2
HIVE-18742: Vectorization acid/inputformat check should allow NullRowsInputFormat/OneNullRowInputFormat (Jason Dere, reviewed by Sergey Shelukhin) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/e51f7c9d Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/e51f7c9d Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/e51f7c9d Branch: refs/heads/master Commit: e51f7c9d277c8a1a7a289063b9bcf43ad6de8e99 Parents: 111ed09 Author: Jason Dere <jd...@hortonworks.com> Authored: Tue Feb 20 12:49:16 2018 -0800 Committer: Jason Dere <jd...@hortonworks.com> Committed: Tue Feb 20 12:49:16 2018 -0800 ---------------------------------------------------------------------- .../hive/ql/optimizer/physical/Vectorizer.java | 12 +- .../test/queries/clientpositive/acid_nullscan.q | 17 ++ .../results/clientpositive/acid_nullscan.q.out | 162 +++++++++++++++++++ 3 files changed, 190 insertions(+), 1 deletion(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/e51f7c9d/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 27b53b8..52ef2d3 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -101,6 +101,8 @@ import org.apache.hadoop.hive.ql.exec.vector.VectorizedSupport.Support; import org.apache.hadoop.hive.ql.exec.vector.expressions.IdentityExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.VectorExpression; import org.apache.hadoop.hive.ql.exec.vector.expressions.aggregates.VectorAggregateExpression; +import org.apache.hadoop.hive.ql.io.NullRowsInputFormat; +import org.apache.hadoop.hive.ql.io.OneNullRowInputFormat; import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat; import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatchCtx; import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; @@ -353,6 +355,14 @@ public class Vectorizer implements PhysicalPlanResolver { vectorDeserializeTextSupportSet.addAll(Arrays.asList(Support.values())); } + private static final Set<String> supportedAcidInputFormats = new TreeSet<String>(); + static { + supportedAcidInputFormats.add(OrcInputFormat.class.getName()); + // For metadataonly or empty rows optimizations, null/onerow input format can be selected. + supportedAcidInputFormats.add(NullRowsInputFormat.class.getName()); + supportedAcidInputFormats.add(OneNullRowInputFormat.class.getName()); + } + private BaseWork currentBaseWork; private Operator<? extends OperatorDesc> currentOperator; private Collection<Class<?>> vectorizedInputFormatExcludes; @@ -1201,7 +1211,7 @@ public class Vectorizer implements PhysicalPlanResolver { // Today, ACID tables are only ORC and that format is vectorizable. Verify these // assumptions. Preconditions.checkState(isInputFileFormatVectorized); - Preconditions.checkState(inputFileFormatClassName.equals(OrcInputFormat.class.getName())); + Preconditions.checkState(supportedAcidInputFormats.contains(inputFileFormatClassName)); if (!useVectorizedInputFileFormat) { enabledConditionsNotMetList.add("Vectorizing ACID tables requires " http://git-wip-us.apache.org/repos/asf/hive/blob/e51f7c9d/ql/src/test/queries/clientpositive/acid_nullscan.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/acid_nullscan.q b/ql/src/test/queries/clientpositive/acid_nullscan.q new file mode 100644 index 0000000..d048231 --- /dev/null +++ b/ql/src/test/queries/clientpositive/acid_nullscan.q @@ -0,0 +1,17 @@ + +set hive.mapred.mode=nonstrict; +set hive.support.concurrency=true; +set hive.txn.manager=org.apache.hadoop.hive.ql.lockmgr.DbTxnManager; + +set hive.exec.dynamic.partition.mode=nonstrict; +set hive.vectorized.execution.enabled=true; + +CREATE TABLE acid_vectorized(a INT, b STRING) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true'); +insert into table acid_vectorized select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10; +insert into table acid_vectorized values (1, 'bar'); + +explain extended +select sum(a) from acid_vectorized where false; + +select sum(a) from acid_vectorized where false; + http://git-wip-us.apache.org/repos/asf/hive/blob/e51f7c9d/ql/src/test/results/clientpositive/acid_nullscan.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/acid_nullscan.q.out b/ql/src/test/results/clientpositive/acid_nullscan.q.out new file mode 100644 index 0000000..7fcc831 --- /dev/null +++ b/ql/src/test/results/clientpositive/acid_nullscan.q.out @@ -0,0 +1,162 @@ +PREHOOK: query: CREATE TABLE acid_vectorized(a INT, b STRING) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: CREATE TABLE acid_vectorized(a INT, b STRING) CLUSTERED BY(a) INTO 2 BUCKETS STORED AS ORC TBLPROPERTIES ('transactional'='true') +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@acid_vectorized +PREHOOK: query: insert into table acid_vectorized select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: insert into table acid_vectorized select cint, cstring1 from alltypesorc where cint is not null order by cint limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@acid_vectorized +POSTHOOK: Lineage: acid_vectorized.a SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: acid_vectorized.b SIMPLE [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +PREHOOK: query: insert into table acid_vectorized values (1, 'bar') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@acid_vectorized +POSTHOOK: query: insert into table acid_vectorized values (1, 'bar') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@acid_vectorized +POSTHOOK: Lineage: acid_vectorized.a SCRIPT [] +POSTHOOK: Lineage: acid_vectorized.b SCRIPT [] +PREHOOK: query: explain extended +select sum(a) from acid_vectorized where false +PREHOOK: type: QUERY +POSTHOOK: query: explain extended +select sum(a) from acid_vectorized where false +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Map Reduce + Map Operator Tree: + TableScan + alias: acid_vectorized + Statistics: Num rows: 1 Data size: 24510 Basic stats: COMPLETE Column stats: NONE + GatherStats: false + Filter Operator + isSamplingPred: false + predicate: false (type: boolean) + Statistics: Num rows: 1 Data size: 24510 Basic stats: COMPLETE Column stats: NONE + Group By Operator + aggregations: sum(a) + mode: hash + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + null sort order: + sort order: + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + tag: -1 + value expressions: _col0 (type: bigint) + auto parallelism: false + Execution mode: vectorized + Path -> Alias: + nullscan://null/default.acid_vectorized/part_ [acid_vectorized] + Path -> Partition: + nullscan://null/default.acid_vectorized/part_ + Partition + input format: org.apache.hadoop.hive.ql.io.OneNullRowInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat + properties: + bucket_count 2 + bucket_field_name a + column.name.delimiter , + columns a,b + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.acid_vectorized + numFiles 3 + numRows 0 + rawDataSize 0 + serialization.ddl struct acid_vectorized { i32 a, string b} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.NullStructSerDe + totalSize 2451 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.serde2.NullStructSerDe + + input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat + properties: + bucket_count 2 + bucket_field_name a + column.name.delimiter , + columns a,b + columns.comments + columns.types int:string +#### A masked pattern was here #### + name default.acid_vectorized + numFiles 3 + numRows 0 + rawDataSize 0 + serialization.ddl struct acid_vectorized { i32 a, string b} + serialization.format 1 + serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde + totalSize 2451 + transactional true + transactional_properties default +#### A masked pattern was here #### + serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde + name: default.acid_vectorized + name: default.acid_vectorized + Truncated Path -> Alias: + nullscan://null/default.acid_vectorized/part_ [acid_vectorized] + Needs Tagging: false + Reduce Operator Tree: + Group By Operator + aggregations: sum(VALUE._col0) + mode: mergepartial + outputColumnNames: _col0 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + GlobalTableId: 0 +#### A masked pattern was here #### + NumFilesPerFileSink: 1 + Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: NONE +#### A masked pattern was here #### + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + properties: + columns _col0 + columns.types bigint + escape.delim \ + hive.serialization.extend.additional.nesting.levels true + serialization.escape.crlf true + serialization.format 1 + serialization.lib org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + TotalFiles: 1 + GatherStats: false + MultiFileSpray: false + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select sum(a) from acid_vectorized where false +PREHOOK: type: QUERY +PREHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +POSTHOOK: query: select sum(a) from acid_vectorized where false +POSTHOOK: type: QUERY +POSTHOOK: Input: default@acid_vectorized +#### A masked pattern was here #### +NULL