Repository: hive Updated Branches: refs/heads/master d6ce23d53 -> 1320d2b31
HIVE-18191: Vectorization: Add validation of TableScanOperator (gather statistics) back (Matt McCline, reviewed by Teddy Choi) Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1320d2b3 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1320d2b3 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1320d2b3 Branch: refs/heads/master Commit: 1320d2b31a358f319d8eeb4aa4f781d47bb3f4b4 Parents: d6ce23d Author: Matt McCline <mmccl...@hortonworks.com> Authored: Tue Dec 12 12:55:11 2017 -0600 Committer: Matt McCline <mmccl...@hortonworks.com> Committed: Tue Dec 12 12:55:11 2017 -0600 ---------------------------------------------------------------------- .../hadoop/hive/ql/exec/TableScanOperator.java | 10 +- .../hive/ql/optimizer/physical/Vectorizer.java | 7 +- .../hadoop/hive/ql/plan/TableScanDesc.java | 4 + .../clientpositive/vector_gather_stats.q | 26 +++++ .../clientpositive/tez/explainuser_3.q.out | 2 +- .../clientpositive/vector_gather_stats.q.out | 101 +++++++++++++++++++ 6 files changed, 143 insertions(+), 7 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java index c76026b..c0138f2 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java @@ -115,11 +115,7 @@ public class TableScanOperator extends Operator<TableScanDesc> implements @Override public void process(Object row, int tag) throws HiveException { if (rowLimit >= 0) { - if (row instanceof VectorizedRowBatch) { - // We need to check with 'instanceof' instead of just checking - // vectorized because the row can be a VectorizedRowBatch when - // FetchOptimizer kicks in even if the operator pipeline is not - // vectorized + if (vectorized) { VectorizedRowBatch batch = (VectorizedRowBatch) row; if (currCount >= rowLimit) { setDone(true); @@ -266,6 +262,10 @@ public class TableScanOperator extends Operator<TableScanDesc> implements currentStat = null; stats = new HashMap<String, Stat>(); + /* + * This TableScanDesc flag is strictly set by the Vectorizer class for vectorized MapWork + * vertices. + */ vectorized = conf.isVectorized(); } http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java index 6500682..8ce2c33 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java @@ -1513,7 +1513,7 @@ public class Vectorizer implements PhysicalPlanResolver { LOG.info("Examining input format to see if vectorization is enabled."); ImmutablePair<String,TableScanOperator> onlyOneTableScanPair = verifyOnlyOneTableScanOperator(mapWork); - if (onlyOneTableScanPair == null) { + if (onlyOneTableScanPair == null) { VectorizerReason notVectorizedReason = currentBaseWork.getNotVectorizedReason(); Preconditions.checkState(notVectorizedReason != null); mapWork.setVectorizationEnabledConditionsNotMet(Arrays.asList(new String[] {notVectorizedReason.toString()})); @@ -1638,6 +1638,11 @@ public class Vectorizer implements PhysicalPlanResolver { // Set "global" member indicating where to store "not vectorized" information if necessary. currentBaseWork = mapWork; + if (!validateTableScanOperator(tableScanOperator, mapWork)) { + + // The "not vectorized" information has been stored in the MapWork vertex. + return false; + } try { validateAndVectorizeMapOperators(tableScanOperator, isTezOrSpark, vectorTaskColumnInfo); } catch (VectorizerCannotVectorizeException e) { http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java index 4b7d2b4..8d966c7 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java @@ -495,6 +495,10 @@ public class TableScanDesc extends AbstractOperatorDesc implements IStatsGatherD return new TableScanOperatorExplainVectorization(this, vectorTableScanDesc); } + /* + * This TableScanDesc flag is strictly set by the Vectorizer class for vectorized MapWork + * vertices. + */ public void setVectorized(boolean vectorized) { this.vectorized = vectorized; } http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/test/queries/clientpositive/vector_gather_stats.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/vector_gather_stats.q b/ql/src/test/queries/clientpositive/vector_gather_stats.q new file mode 100644 index 0000000..deaca0e --- /dev/null +++ b/ql/src/test/queries/clientpositive/vector_gather_stats.q @@ -0,0 +1,26 @@ +set hive.vectorized.execution.enabled=true; + +-- HIVE-18191 + +create table cd +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +partitioned by +( + cd_education_status string +); +alter table cd add partition (cd_education_status='Primary'); +insert into table cd partition (cd_education_status='Primary') values (1, 'M', 'M', 500, 'Good', 0, 0, 0); + +explain vectorization detail +analyze table cd partition (cd_education_status) compute statistics; + +analyze table cd partition (cd_education_status) compute statistics; \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/test/results/clientpositive/tez/explainuser_3.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out index 8b1ee28..b299377 100644 --- a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out +++ b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out @@ -191,7 +191,7 @@ POSTHOOK: type: QUERY Stage-2 Stats Work{} Stage-0 - Map 1 vectorized + Map 1 TableScan [TS_0] (rows=500 width=10) default@src,src,Tbl:COMPLETE,Col:COMPLETE http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/test/results/clientpositive/vector_gather_stats.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/vector_gather_stats.q.out b/ql/src/test/results/clientpositive/vector_gather_stats.q.out new file mode 100644 index 0000000..cf9bc85 --- /dev/null +++ b/ql/src/test/results/clientpositive/vector_gather_stats.q.out @@ -0,0 +1,101 @@ +PREHOOK: query: create table cd +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +partitioned by +( + cd_education_status string +) +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@cd +POSTHOOK: query: create table cd +( + cd_demo_sk int, + cd_gender string, + cd_marital_status string, + cd_purchase_estimate int, + cd_credit_rating string, + cd_dep_count int, + cd_dep_employed_count int, + cd_dep_college_count int +) +partitioned by +( + cd_education_status string +) +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@cd +PREHOOK: query: alter table cd add partition (cd_education_status='Primary') +PREHOOK: type: ALTERTABLE_ADDPARTS +PREHOOK: Output: default@cd +POSTHOOK: query: alter table cd add partition (cd_education_status='Primary') +POSTHOOK: type: ALTERTABLE_ADDPARTS +POSTHOOK: Output: default@cd +POSTHOOK: Output: default@cd@cd_education_status=Primary +PREHOOK: query: insert into table cd partition (cd_education_status='Primary') values (1, 'M', 'M', 500, 'Good', 0, 0, 0) +PREHOOK: type: QUERY +PREHOOK: Output: default@cd@cd_education_status=Primary +POSTHOOK: query: insert into table cd partition (cd_education_status='Primary') values (1, 'M', 'M', 500, 'Good', 0, 0, 0) +POSTHOOK: type: QUERY +POSTHOOK: Output: default@cd@cd_education_status=Primary +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_credit_rating SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col5, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_demo_sk EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_dep_college_count EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col8, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_dep_count EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col6, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_dep_employed_count EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col7, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_gender SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_marital_status SIMPLE [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, type:string, comment:), ] +POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_purchase_estimate EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col4, type:string, comment:), ] +PREHOOK: query: explain vectorization detail +analyze table cd partition (cd_education_status) compute statistics +PREHOOK: type: QUERY +POSTHOOK: query: explain vectorization detail +analyze table cd partition (cd_education_status) compute statistics +POSTHOOK: type: QUERY +PLAN VECTORIZATION: + enabled: true + enabledConditionsMet: [hive.vectorized.execution.enabled IS true] + +STAGE DEPENDENCIES: + Stage-0 is a root stage + Stage-1 depends on stages: Stage-0 + +STAGE PLANS: + Stage: Stage-0 + Map Reduce + Map Operator Tree: + TableScan + alias: cd + Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column stats: NONE + Map Vectorization: + enabled: true + enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize IS true + inputFileFormats: org.apache.hadoop.mapred.TextInputFormat + notVectorizedReason: TABLESCAN operator: gather stats not supported + vectorized: false + + Stage: Stage-1 + Stats Work + Basic Stats Work: + +PREHOOK: query: analyze table cd partition (cd_education_status) compute statistics +PREHOOK: type: QUERY +PREHOOK: Input: default@cd +PREHOOK: Input: default@cd@cd_education_status=Primary +PREHOOK: Output: default@cd +PREHOOK: Output: default@cd@cd_education_status=Primary +POSTHOOK: query: analyze table cd partition (cd_education_status) compute statistics +POSTHOOK: type: QUERY +POSTHOOK: Input: default@cd +POSTHOOK: Input: default@cd@cd_education_status=Primary +POSTHOOK: Output: default@cd +POSTHOOK: Output: default@cd@cd_education_status=Primary