Repository: hive
Updated Branches:
  refs/heads/master d6ce23d53 -> 1320d2b31


HIVE-18191: Vectorization: Add validation of TableScanOperator (gather statistics) back (Matt McCline, reviewed by Teddy Choi)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/1320d2b3
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/1320d2b3
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/1320d2b3

Branch: refs/heads/master
Commit: 1320d2b31a358f319d8eeb4aa4f781d47bb3f4b4
Parents: d6ce23d
Author: Matt McCline <mmccl...@hortonworks.com>
Authored: Tue Dec 12 12:55:11 2017 -0600
Committer: Matt McCline <mmccl...@hortonworks.com>
Committed: Tue Dec 12 12:55:11 2017 -0600

----------------------------------------------------------------------
 .../hadoop/hive/ql/exec/TableScanOperator.java  |  10 +-
 .../hive/ql/optimizer/physical/Vectorizer.java  |   7 +-
 .../hadoop/hive/ql/plan/TableScanDesc.java      |   4 +
 .../clientpositive/vector_gather_stats.q        |  26 +++++
 .../clientpositive/tez/explainuser_3.q.out      |   2 +-
 .../clientpositive/vector_gather_stats.q.out    | 101 +++++++++++++++++++
 6 files changed, 143 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java 
b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
index c76026b..c0138f2 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/exec/TableScanOperator.java
@@ -115,11 +115,7 @@ public class TableScanOperator extends 
Operator<TableScanDesc> implements
   @Override
   public void process(Object row, int tag) throws HiveException {
     if (rowLimit >= 0) {
-      if (row instanceof VectorizedRowBatch) {
-        // We need to check with 'instanceof' instead of just checking
-        // vectorized because the row can be a VectorizedRowBatch when
-        // FetchOptimizer kicks in even if the operator pipeline is not
-        // vectorized
+      if (vectorized) {
         VectorizedRowBatch batch = (VectorizedRowBatch) row;
         if (currCount >= rowLimit) {
           setDone(true);
@@ -266,6 +262,10 @@ public class TableScanOperator extends 
Operator<TableScanDesc> implements
     currentStat = null;
     stats = new HashMap<String, Stat>();
 
+    /*
+     * This TableScanDesc flag is strictly set by the Vectorizer class for vectorized MapWork
+     * vertices.
+     */
     vectorized = conf.isVectorized();
   }
 

http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
----------------------------------------------------------------------
diff --git 
a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java 
b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
index 6500682..8ce2c33 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/physical/Vectorizer.java
@@ -1513,7 +1513,7 @@ public class Vectorizer implements PhysicalPlanResolver {
       LOG.info("Examining input format to see if vectorization is enabled.");
 
       ImmutablePair<String,TableScanOperator> onlyOneTableScanPair = 
verifyOnlyOneTableScanOperator(mapWork);
-      if (onlyOneTableScanPair ==  null) {
+      if (onlyOneTableScanPair == null) {
         VectorizerReason notVectorizedReason = 
currentBaseWork.getNotVectorizedReason();
         Preconditions.checkState(notVectorizedReason != null);
         mapWork.setVectorizationEnabledConditionsNotMet(Arrays.asList(new 
String[] {notVectorizedReason.toString()}));
@@ -1638,6 +1638,11 @@ public class Vectorizer implements PhysicalPlanResolver {
       // Set "global" member indicating where to store "not vectorized" information if necessary.
       currentBaseWork = mapWork;
 
+      if (!validateTableScanOperator(tableScanOperator, mapWork)) {
+
+        // The "not vectorized" information has been stored in the MapWork vertex.
+        return false;
+      }
       try {
         validateAndVectorizeMapOperators(tableScanOperator, isTezOrSpark, 
vectorTaskColumnInfo);
       } catch (VectorizerCannotVectorizeException e) {

http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java 
b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
index 4b7d2b4..8d966c7 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/plan/TableScanDesc.java
@@ -495,6 +495,10 @@ public class TableScanDesc extends AbstractOperatorDesc 
implements IStatsGatherD
     return new TableScanOperatorExplainVectorization(this, 
vectorTableScanDesc);
   }
 
+  /*
+   * This TableScanDesc flag is strictly set by the Vectorizer class for vectorized MapWork
+   * vertices.
+   */
   public void setVectorized(boolean vectorized) {
     this.vectorized = vectorized;
   }

http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/test/queries/clientpositive/vector_gather_stats.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/vector_gather_stats.q 
b/ql/src/test/queries/clientpositive/vector_gather_stats.q
new file mode 100644
index 0000000..deaca0e
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/vector_gather_stats.q
@@ -0,0 +1,26 @@
+set hive.vectorized.execution.enabled=true;
+
+-- HIVE-18191
+
+create table cd
+(
+    cd_demo_sk                int,
+    cd_gender                 string,
+    cd_marital_status         string,
+    cd_purchase_estimate      int,
+    cd_credit_rating          string,
+    cd_dep_count              int,
+    cd_dep_employed_count     int,
+    cd_dep_college_count      int
+)
+partitioned by
+(
+    cd_education_status       string
+);
+alter table cd add partition (cd_education_status='Primary');
+insert into table cd partition (cd_education_status='Primary') values (1, 'M', 
'M', 500, 'Good', 0, 0, 0);
+
+explain vectorization detail
+analyze table cd partition (cd_education_status) compute statistics;
+
+analyze table cd partition (cd_education_status) compute statistics;
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/test/results/clientpositive/tez/explainuser_3.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out 
b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out
index 8b1ee28..b299377 100644
--- a/ql/src/test/results/clientpositive/tez/explainuser_3.q.out
+++ b/ql/src/test/results/clientpositive/tez/explainuser_3.q.out
@@ -191,7 +191,7 @@ POSTHOOK: type: QUERY
 Stage-2
   Stats Work{}
     Stage-0
-      Map 1 vectorized
+      Map 1
       TableScan [TS_0] (rows=500 width=10)
         default@src,src,Tbl:COMPLETE,Col:COMPLETE
 

http://git-wip-us.apache.org/repos/asf/hive/blob/1320d2b3/ql/src/test/results/clientpositive/vector_gather_stats.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/vector_gather_stats.q.out 
b/ql/src/test/results/clientpositive/vector_gather_stats.q.out
new file mode 100644
index 0000000..cf9bc85
--- /dev/null
+++ b/ql/src/test/results/clientpositive/vector_gather_stats.q.out
@@ -0,0 +1,101 @@
+PREHOOK: query: create table cd
+(
+    cd_demo_sk                int,
+    cd_gender                 string,
+    cd_marital_status         string,
+    cd_purchase_estimate      int,
+    cd_credit_rating          string,
+    cd_dep_count              int,
+    cd_dep_employed_count     int,
+    cd_dep_college_count      int
+)
+partitioned by
+(
+    cd_education_status       string
+)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@cd
+POSTHOOK: query: create table cd
+(
+    cd_demo_sk                int,
+    cd_gender                 string,
+    cd_marital_status         string,
+    cd_purchase_estimate      int,
+    cd_credit_rating          string,
+    cd_dep_count              int,
+    cd_dep_employed_count     int,
+    cd_dep_college_count      int
+)
+partitioned by
+(
+    cd_education_status       string
+)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@cd
+PREHOOK: query: alter table cd add partition (cd_education_status='Primary')
+PREHOOK: type: ALTERTABLE_ADDPARTS
+PREHOOK: Output: default@cd
+POSTHOOK: query: alter table cd add partition (cd_education_status='Primary')
+POSTHOOK: type: ALTERTABLE_ADDPARTS
+POSTHOOK: Output: default@cd
+POSTHOOK: Output: default@cd@cd_education_status=Primary
+PREHOOK: query: insert into table cd partition (cd_education_status='Primary') 
values (1, 'M', 'M', 500, 'Good', 0, 0, 0)
+PREHOOK: type: QUERY
+PREHOOK: Output: default@cd@cd_education_status=Primary
+POSTHOOK: query: insert into table cd partition 
(cd_education_status='Primary') values (1, 'M', 'M', 500, 'Good', 0, 0, 0)
+POSTHOOK: type: QUERY
+POSTHOOK: Output: default@cd@cd_education_status=Primary
+POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_credit_rating 
SIMPLE 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col5, 
type:string, comment:), ]
+POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_demo_sk 
EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, 
type:string, comment:), ]
+POSTHOOK: Lineage: cd 
PARTITION(cd_education_status=Primary).cd_dep_college_count EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col8, 
type:string, comment:), ]
+POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_dep_count 
EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col6, 
type:string, comment:), ]
+POSTHOOK: Lineage: cd 
PARTITION(cd_education_status=Primary).cd_dep_employed_count EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col7, 
type:string, comment:), ]
+POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_gender SIMPLE 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col2, 
type:string, comment:), ]
+POSTHOOK: Lineage: cd PARTITION(cd_education_status=Primary).cd_marital_status 
SIMPLE 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col3, 
type:string, comment:), ]
+POSTHOOK: Lineage: cd 
PARTITION(cd_education_status=Primary).cd_purchase_estimate EXPRESSION 
[(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col4, 
type:string, comment:), ]
+PREHOOK: query: explain vectorization detail
+analyze table cd partition (cd_education_status) compute statistics
+PREHOOK: type: QUERY
+POSTHOOK: query: explain vectorization detail
+analyze table cd partition (cd_education_status) compute statistics
+POSTHOOK: type: QUERY
+PLAN VECTORIZATION:
+  enabled: true
+  enabledConditionsMet: [hive.vectorized.execution.enabled IS true]
+
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+  Stage-1 depends on stages: Stage-0
+
+STAGE PLANS:
+  Stage: Stage-0
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: cd
+            Statistics: Num rows: 1 Data size: 20 Basic stats: COMPLETE Column 
stats: NONE
+      Map Vectorization:
+          enabled: true
+          enabledConditionsMet: hive.vectorized.use.vector.serde.deserialize 
IS true
+          inputFileFormats: org.apache.hadoop.mapred.TextInputFormat
+          notVectorizedReason: TABLESCAN operator: gather stats not supported
+          vectorized: false
+
+  Stage: Stage-1
+    Stats Work
+      Basic Stats Work:
+
+PREHOOK: query: analyze table cd partition (cd_education_status) compute 
statistics
+PREHOOK: type: QUERY
+PREHOOK: Input: default@cd
+PREHOOK: Input: default@cd@cd_education_status=Primary
+PREHOOK: Output: default@cd
+PREHOOK: Output: default@cd@cd_education_status=Primary
+POSTHOOK: query: analyze table cd partition (cd_education_status) compute 
statistics
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@cd
+POSTHOOK: Input: default@cd@cd_education_status=Primary
+POSTHOOK: Output: default@cd
+POSTHOOK: Output: default@cd@cd_education_status=Primary

Reply via email to