Repository: hive Updated Branches: refs/heads/branch-3 1cdd8c447 -> 698aa5d04
HIVE-19951: Vectorization: Need to disable encoded LLAP I/O for ORC when there is data type conversion (Schema Evolution) (Matt McCline, reviewed by Prasanth Jayachandran) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/698aa5d0 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/698aa5d0 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/698aa5d0 Branch: refs/heads/branch-3 Commit: 698aa5d048a7c3f79b3be527ee50694bc3a5fc70 Parents: 1cdd8c4 Author: Matt McCline <mmccl...@hortonworks.com> Authored: Mon Jul 2 08:40:39 2018 -0500 Committer: Matt McCline <mmccl...@hortonworks.com> Committed: Mon Jul 2 11:36:56 2018 -0500 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 1 + .../hive/llap/io/api/impl/LlapRecordReader.java | 64 +++++++ .../vector_llap_io_data_conversion.q | 19 ++ .../llap/vector_llap_io_data_conversion.q.out | 187 +++++++++++++++++++ 4 files changed, 271 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/698aa5d0/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index 09d8bfe..e677493 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -776,6 +776,7 @@ minillaplocal.query.files=\ vector_join_filters.q,\ vector_leftsemi_mapjoin.q,\ vector_like_2.q,\ + vector_llap_io_data_conversion.q,\ vector_llap_text_1.q,\ vector_mapjoin_reduce.q,\ vector_null_map.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/698aa5d0/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java ---------------------------------------------------------------------- diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java index 201c097..be748e9 100644 --- a/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java +++ b/llap-server/src/java/org/apache/hadoop/hive/llap/io/api/impl/LlapRecordReader.java @@ -289,8 +289,72 @@ class LlapRecordReader executor.submit(rp.getReadCallable()); } + private boolean hasSchemaEvolutionStringFamilyTruncateIssue(SchemaEvolution evolution) { + return hasStringFamilyTruncateTypeIssue(evolution, evolution.getReaderSchema()); + } + + // We recurse through the types. + private boolean hasStringFamilyTruncateTypeIssue(SchemaEvolution evolution, + TypeDescription readerType) { + TypeDescription fileType = evolution.getFileType(readerType); + if (fileType == null) { + return false; + } + switch (fileType.getCategory()) { + case BOOLEAN: + case BYTE: + case SHORT: + case INT: + case LONG: + case DOUBLE: + case FLOAT: + case STRING: + case TIMESTAMP: + case BINARY: + case DATE: + case DECIMAL: + // We are only looking for the CHAR/VARCHAR truncate issue. + return false; + case CHAR: + case VARCHAR: + if (readerType.getCategory().equals(TypeDescription.Category.CHAR) || + readerType.getCategory().equals(TypeDescription.Category.VARCHAR)) { + return (fileType.getMaxLength() > readerType.getMaxLength()); + } + return false; + case UNION: + case MAP: + case LIST: + case STRUCT: + { + List<TypeDescription> readerChildren = readerType.getChildren(); + final int childCount = readerChildren.size(); + for (int i = 0; i < childCount; ++i) { + if (hasStringFamilyTruncateTypeIssue(evolution, readerChildren.get(i))) { + return true; + } + } + } + return false; + default: + throw new IllegalArgumentException("Unknown type " + fileType); + } + } + private boolean checkOrcSchemaEvolution() { SchemaEvolution evolution = rp.getSchemaEvolution(); + + /* + * FUTURE: When SchemaEvolution.isOnlyImplicitConversion becomes available: + * 1) Replace the hasSchemaEvolutionStringFamilyTruncateIssue call with + * !isOnlyImplicitConversion. + * 2) Delete hasSchemaEvolutionStringFamilyTruncateIssue code. + */ + if (evolution.hasConversion() && hasSchemaEvolutionStringFamilyTruncateIssue(evolution)) { + + // We do not support data type conversion when reading encoded ORC data. + return false; + } // TODO: should this just use physical IDs? for (int i = 0; i < includes.getReaderLogicalColumnIds().size(); ++i) { int projectedColId = includes.getReaderLogicalColumnIds().get(i); http://git-wip-us.apache.org/repos/asf/hive/blob/698aa5d0/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q b/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q new file mode 100644 index 0000000..f40c4b9 --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_llap_io_data_conversion.q @@ -0,0 +1,19 @@ +--! qt:dataset:alltypesorc +set hive.explain.user=false; +SET hive.vectorized.execution.enabled=true; + +set hive.llap.io.enabled=true; +set hive.llap.io.encode.enabled=true; + +create table varchar_single_partition(vt varchar(10), vsi varchar(10), vi varchar(20), vb varchar(30), vf varchar(20),vd varchar(20),vs varchar(50)) + partitioned by(s varchar(50)) stored as orc; +insert into table varchar_single_partition partition(s='positive') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint>0 limit 10; +insert into table varchar_single_partition partition(s='negative') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint<0 limit 10; +alter table varchar_single_partition change column vs vs varchar(10); + +create table varchar_ctas_1 stored as orc as select vs, length(vs) as c1,reverse(vs) as c2 from varchar_single_partition where s='positive'; + +explain vectorization detail +select * from varchar_ctas_1 order by vs, c1, c2; + +select * from varchar_ctas_1 order by vs, c1, c2; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/698aa5d0/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out b/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out new file mode 100644 index 0000000..f503761 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/vector_llap_io_data_conversion.q.out @@ -0,0 +1,187 @@ +PREHOOK: query: create table varchar_single_partition(vt varchar(10), vsi varchar(10), vi varchar(20), vb varchar(30), vf varchar(20),vd varchar(20),vs varchar(50)) + partitioned by(s varchar(50)) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@varchar_single_partition +POSTHOOK: query: create table varchar_single_partition(vt varchar(10), vsi varchar(10), vi varchar(20), vb varchar(30), vf varchar(20),vd varchar(20),vs varchar(50)) + partitioned by(s varchar(50)) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@varchar_single_partition +PREHOOK: query: insert into table varchar_single_partition partition(s='positive') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint>0 limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@varchar_single_partition@s=positive +POSTHOOK: query: insert into table varchar_single_partition partition(s='positive') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint>0 limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@varchar_single_partition@s=positive +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vb EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cbigint, type:bigint, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vd EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cdouble, type:double, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vf EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cfloat, type:float, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vs EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vsi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:csmallint, type:smallint, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=positive).vt EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:ctinyint, type:tinyint, comment:null), ] +PREHOOK: query: insert into table varchar_single_partition partition(s='negative') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint<0 limit 10 +PREHOOK: type: QUERY +PREHOOK: Input: default@alltypesorc +PREHOOK: Output: default@varchar_single_partition@s=negative +POSTHOOK: query: insert into table varchar_single_partition partition(s='negative') select ctinyint,csmallint,cint,cbigint,cfloat,cdouble,cstring1 from alltypesorc where cint<0 limit 10 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@alltypesorc +POSTHOOK: Output: default@varchar_single_partition@s=negative +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vb EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cbigint, type:bigint, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vd EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cdouble, type:double, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vf EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cfloat, type:float, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cint, type:int, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vs EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:cstring1, type:string, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vsi EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:csmallint, type:smallint, comment:null), ] +POSTHOOK: Lineage: varchar_single_partition PARTITION(s=negative).vt EXPRESSION [(alltypesorc)alltypesorc.FieldSchema(name:ctinyint, type:tinyint, comment:null), ] +PREHOOK: query: alter table varchar_single_partition change column vs vs varchar(10) +PREHOOK: type: ALTERTABLE_RENAMECOL +PREHOOK: Input: default@varchar_single_partition +PREHOOK: Output: default@varchar_single_partition +POSTHOOK: query: alter table varchar_single_partition change column vs vs varchar(10) +POSTHOOK: type: ALTERTABLE_RENAMECOL +POSTHOOK: Input: default@varchar_single_partition +POSTHOOK: Output: default@varchar_single_partition +PREHOOK: query: create table varchar_ctas_1 stored as orc as select vs, length(vs) as c1,reverse(vs) as c2 from varchar_single_partition where s='positive' +PREHOOK: type: CREATETABLE_AS_SELECT +PREHOOK: Input: default@varchar_single_partition +PREHOOK: Input: default@varchar_single_partition@s=positive +PREHOOK: Output: database:default +PREHOOK: Output: default@varchar_ctas_1 +POSTHOOK: query: create table varchar_ctas_1 stored as orc as select vs, length(vs) as c1,reverse(vs) as c2 from varchar_single_partition where s='positive' +POSTHOOK: type: CREATETABLE_AS_SELECT +POSTHOOK: Input: default@varchar_single_partition +POSTHOOK: Input: default@varchar_single_partition@s=positive +POSTHOOK: Output: database:default +POSTHOOK: Output: default@varchar_ctas_1 +POSTHOOK: Lineage: varchar_ctas_1.c1 EXPRESSION [(varchar_single_partition)varchar_single_partition.FieldSchema(name:vs, type:varchar(10), comment:null), ] +POSTHOOK: Lineage: varchar_ctas_1.c2 EXPRESSION [(varchar_single_partition)varchar_single_partition.FieldSchema(name:vs, type:varchar(10), comment:null), ] +POSTHOOK: Lineage: varchar_ctas_1.vs SIMPLE [(varchar_single_partition)varchar_single_partition.FieldSchema(name:vs, type:varchar(10), comment:null), ] +PREHOOK: query: explain vectorization detail +select * from varchar_ctas_1 order by vs, c1, c2 +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +select * from varchar_ctas_1 order by vs, c1, c2 +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-1 is a root stage + Stage-0 depends on stages: Stage-1 + +STAGE PLANS: + Stage: Stage-1 + Tez +#### A masked pattern was here #### + Edges: + Reducer 2 <- Map 1 (SIMPLE_EDGE) +#### A masked pattern was here #### + Vertices: + Map 1 + Map Operator Tree: + TableScan + alias: varchar_ctas_1 + Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE + TableScan Vectorization: + native: true + vectorizationSchemaColumns: [0:vs:varchar(10), 1:c1:int, 2:c2:string, 3:ROW__ID:struct<writeid:bigint,bucketid:int,rowid:bigint>] + Select Operator + expressions: vs (type: varchar(10)), c1 (type: int), c2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1, 2] + Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE + Reduce Output Operator + key expressions: _col0 (type: varchar(10)), _col1 (type: int), _col2 (type: string) + sort order: +++ + Reduce Sink Vectorization: + className: VectorReduceSinkObjectHashOperator + keyColumnNums: [0, 1, 2] + native: true + nativeConditionsMet: hive.vectorized.execution.reducesink.new.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true, No PTF TopN IS true, No DISTINCT columns IS true, BinarySortableSerDe for keys IS true, LazyBinarySerDe for values IS true + valueColumnNums: [] + Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE + Execution mode: vectorized, llap + LLAP IO: all inputs + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vectorized.input.format IS true + inputFormatFeatureSupport: [DECIMAL_64] + featureSupportInUse: [DECIMAL_64] + inputFileFormats: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat + allNative: true + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 3 + includeColumns: [0, 1, 2] + dataColumns: vs:varchar(10), c1:int, c2:string + partitionColumnCount: 0 + scratchColumnTypeNames: [] + Reducer 2 + Execution mode: vectorized, llap + Reduce Vectorization: + enabled: true + enableConditionsMet: hive.vectorized.execution.reduce.enabled IS true, hive.execution.engine tez IN [tez, spark] IS true + reduceColumnNullOrder: aaa + reduceColumnSortOrder: +++ + allNative: false + usesVectorUDFAdaptor: false + vectorized: true + rowBatchContext: + dataColumnCount: 3 + dataColumns: KEY.reducesinkkey0:varchar(10), KEY.reducesinkkey1:int, KEY.reducesinkkey2:string + partitionColumnCount: 0 + scratchColumnTypeNames: [] + Reduce Operator Tree: + Select Operator + expressions: KEY.reducesinkkey0 (type: varchar(10)), KEY.reducesinkkey1 (type: int), KEY.reducesinkkey2 (type: string) + outputColumnNames: _col0, _col1, _col2 + Select Vectorization: + className: VectorSelectOperator + native: true + projectedOutputColumnNums: [0, 1, 2] + Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE + File Output Operator + compressed: false + File Sink Vectorization: + className: VectorFileSinkOperator + native: false + Statistics: Num rows: 10 Data size: 2820 Basic stats: COMPLETE Column stats: NONE + table: + input format: org.apache.hadoop.mapred.SequenceFileInputFormat + output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat + serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe + + Stage: Stage-0 + Fetch Operator + limit: -1 + Processor Tree: + ListSink + +PREHOOK: query: select * from varchar_ctas_1 order by vs, c1, c2 +PREHOOK: type: QUERY +PREHOOK: Input: default@varchar_ctas_1 +#### A masked pattern was here #### +POSTHOOK: query: select * from varchar_ctas_1 order by vs, c1, c2 +POSTHOOK: type: QUERY +POSTHOOK: Input: default@varchar_ctas_1 +#### A masked pattern was here #### +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc +cvLH6Eat2y 10 y2taE6HLvc