HIVE-13358: Stats state is not captured correctly: turn off stats optimizer for sampled table (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/77474581
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/77474581
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/77474581

Branch: refs/heads/llap
Commit: 77474581df4016e3899a986e079513087a945674
Parents: 41a30b5
Author: Pengcheng Xiong <pxi...@apache.org>
Authored: Sun Mar 27 22:30:29 2016 -0700
Committer: Pengcheng Xiong <pxi...@apache.org>
Committed: Sun Mar 27 22:30:42 2016 -0700

----------------------------------------------------------------------
 .../hive/ql/optimizer/StatsOptimizer.java      |  14 +-
 .../sample_islocalmode_hook_use_metadata.q     |  48 ++++
 .../sample_islocalmode_hook_use_metadata.q.out | 230 +++++++++++++++++++
 3 files changed, 289 insertions(+), 3 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index 4091c0d..bc17fec 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -100,9 +100,12 @@ public class StatsOptimizer extends Transform {
 
   @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
-    if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery() ||
-        pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS() ||
-        pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()) {
+    if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery()
+        || pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS()
+        || pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()
+        // If getNameToSplitSample is not empty, at least one of the source
+        // tables is being sampled and we cannot optimize.
+        || !pctx.getNameToSplitSample().isEmpty()) {
       return pctx;
     }
 
@@ -251,6 +254,11 @@ public class StatsOptimizer extends Transform {
         // looks like a subq plan.
         return null;
       }
+      if (tsOp.getConf().getRowLimit() != -1) {
+        // Table is sampled. In some situations we really could leverage the
+        // row limit, but to be safe we do not use it for now.
+        return null;
+      }
       SelectOperator pselOp = (SelectOperator)stack.get(1);
       for(ExprNodeDesc desc : pselOp.getConf().getColList()) {
         if (!((desc instanceof ExprNodeColumnDesc) || (desc instanceof ExprNodeConstantDesc))) {
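The two guards above are easier to see in isolation. Below is a minimal, self-contained sketch (plain Java with simplified stand-in types, not Hive's actual ParseContext/TableScanOperator classes) of the decision this patch enforces: StatsOptimizer may answer simple aggregates directly from metastore statistics only when no source table is sampled, because stored stats always describe the whole table.

  import java.util.Collections;
  import java.util.Map;

  // Standalone sketch of the two new guards; names mirror the patch but the
  // parameters are simplified stand-ins for what Hive tracks internally.
  public class StatsShortcutGuard {

    static boolean canAnswerFromStats(Map<String, Double> nameToSplitSample, int rowLimit) {
      if (!nameToSplitSample.isEmpty()) {
        return false; // TABLESAMPLE(n PERCENT): a source table is split-sampled
      }
      if (rowLimit != -1) {
        return false; // TABLESAMPLE(n ROWS): the scan carries a per-split row limit
      }
      return true;    // stats describe the whole table, safe to use
    }

    public static void main(String[] args) {
      // select count(1) from sih_src
      System.out.println(canAnswerFromStats(Collections.emptyMap(), -1)); // true
      // select count(1) from sih_src tablesample(1 percent)
      System.out.println(canAnswerFromStats(Map.of("sih_src", 1.0), -1)); // false
      // select count(1) from sih_src tablesample(10 rows)
      System.out.println(canAnswerFromStats(Collections.emptyMap(), 10)); // false
    }
  }

The first check covers TABLESAMPLE(n PERCENT), which the planner records per table name; the second covers TABLESAMPLE(n ROWS), which is carried as a per-split row limit on the table scan.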
http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q b/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
new file mode 100644
index 0000000..ac915b5
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
@@ -0,0 +1,48 @@
+set hive.mapred.mode=nonstrict;
+set hive.exec.submitviachild=true;
+set hive.exec.submit.local.task.via.child=true;
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+set mapred.max.split.size=300;
+set mapred.min.split.size=300;
+set mapred.min.split.size.per.node=300;
+set mapred.min.split.size.per.rack=300;
+set hive.exec.mode.local.auto=true;
+set hive.merge.smallfiles.avgsize=1;
+set hive.compute.query.using.stats=true;
+
+-- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string);
+insert overwrite table sih_i_part partition (p='1') select key, value from src;
+insert overwrite table sih_i_part partition (p='2') select key+10000, value from src;
+insert overwrite table sih_i_part partition (p='3') select key+20000, value from src;
+create table sih_src as select key, value from sih_i_part order by key, value;
+create table sih_src2 as select key, value from sih_src order by key, value;
+
+set hive.exec.post.hooks = org.apache.hadoop.hive.ql.hooks.VerifyIsLocalModeHook;
+set mapreduce.framework.name=yarn;
+set mapreduce.jobtracker.address=localhost:58;
+set hive.sample.seednumber=7;
+
+-- Relaxing hive.exec.mode.local.auto.input.files.max=1.
+-- Hadoop20 will not generate more splits than there are files (one).
+-- Hadoop23 generates splits correctly (four), hence the max needs to be adjusted to ensure running in local mode.
+-- The default value, hive.exec.mode.local.auto.input.files.max=4, produces the expected behavior on Hadoop23.
+-- hive.sample.seednumber is required because Hadoop23 generates multiple splits and tablesample is non-repeatable without it.
+
+-- sample split, running locally limited by num tasks
+
+desc formatted sih_src;
+
+explain select count(1) from sih_src;
+
+select count(1) from sih_src;
+
+explain select count(1) from sih_src tablesample(1 percent);
+
+select count(1) from sih_src tablesample(1 percent);
+
+explain select count(1) from sih_src tablesample(10 rows);
+
+select count(1) from sih_src tablesample(10 rows);
\ No newline at end of file
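One non-obvious number in the expected output below is the 650 returned by select count(1) from sih_src tablesample(10 rows). A back-of-the-envelope model (an assumption about the split arithmetic, not Hive's actual CombineHiveInputFormat planner): with mapred.max.split.size=300 and a 19624-byte table, the input is carved into roughly 65 splits, and the Row Limit Per Split of 10 then yields about 650 rows.

  // Toy model of TABLESAMPLE(n ROWS) under tiny splits; the split count is an
  // approximation (integer division), not Hive's real split-planning logic.
  // Numbers come from the q.out below: totalSize=19624, numRows=1500.
  public class RowSampleEstimate {

    static long sampledRowCount(long totalBytes, long maxSplitBytes,
                                long rowsPerSplit, long totalRows) {
      long splits = totalBytes / maxSplitBytes; // 19624 / 300 = 65
      long sampled = splits * rowsPerSplit;     // 65 * 10 = 650
      return Math.min(sampled, totalRows);      // never more than the table holds
    }

    public static void main(String[] args) {
      System.out.println(sampledRowCount(19624, 300, 10, 1500)); // prints 650
    }
  }

This also shows why the metastore shortcut would be badly wrong here: the stored numRows (1500) has no relationship to what the sampled scan actually reads.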
http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out b/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
new file mode 100644
index 0000000..d268837
--- /dev/null
+++ b/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
@@ -0,0 +1,230 @@
+PREHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_i_part
+POSTHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_i_part
+PREHOOK: query: insert overwrite table sih_i_part partition (p='1') select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=1
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='1') select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=1
+POSTHOOK: Lineage: sih_i_part PARTITION(p=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table sih_i_part partition (p='2') select key+10000, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=2
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='2') select key+10000, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=2
+POSTHOOK: Lineage: sih_i_part PARTITION(p=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table sih_i_part partition (p='3') select key+20000, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=3
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='3') select key+20000, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=3
+POSTHOOK: Lineage: sih_i_part PARTITION(p=3).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=3).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: create table sih_src as select key, value from sih_i_part order by key, value
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sih_i_part
+PREHOOK: Input: default@sih_i_part@p=1
+PREHOOK: Input: default@sih_i_part@p=2
+PREHOOK: Input: default@sih_i_part@p=3
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_src
+POSTHOOK: query: create table sih_src as select key, value from sih_i_part order by key, value
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sih_i_part
+POSTHOOK: Input: default@sih_i_part@p=1
+POSTHOOK: Input: default@sih_i_part@p=2
+POSTHOOK: Input: default@sih_i_part@p=3
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_src
+POSTHOOK: Lineage: sih_src.key SIMPLE [(sih_i_part)sih_i_part.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: sih_src.value SIMPLE [(sih_i_part)sih_i_part.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: create table sih_src2 as select key, value from sih_src order by key, value
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sih_src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_src2
+POSTHOOK: query: create table sih_src2 as select key, value from sih_src order by key, value
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sih_src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_src2
+POSTHOOK: Lineage: sih_src2.key SIMPLE [(sih_src)sih_src.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: sih_src2.value SIMPLE [(sih_src)sih_src.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: -- Relaxing hive.exec.mode.local.auto.input.files.max=1.
+-- Hadoop20 will not generate more splits than there are files (one).
+-- Hadoop23 generates splits correctly (four), hence the max needs to be adjusted to ensure running in local mode.
+-- The default value, hive.exec.mode.local.auto.input.files.max=4, produces the expected behavior on Hadoop23.
+-- hive.sample.seednumber is required because Hadoop23 generates multiple splits and tablesample is non-repeatable without it.
+
+-- sample split, running locally limited by num tasks
+
+desc formatted sih_src
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@sih_src
+# col_name              data_type               comment
+
+key                     int
+value                   string
+
+# Detailed Table Information
+Database:               default
+#### A masked pattern was here ####
+Retention:              0
+#### A masked pattern was here ####
+Table Type:             MANAGED_TABLE
+Table Parameters:
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
+	numFiles            	1
+	numRows             	1500
+	rawDataSize         	18124
+	totalSize           	19624
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:          org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:            org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:           org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed:             No
+Num Buckets:            -1
+Bucket Columns:         []
+Sort Columns:           []
+Storage Desc Params:
+	serialization.format	1
+PREHOOK: query: explain select count(1) from sih_src
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1) from sih_src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+1500
+PREHOOK: query: explain select count(1) from sih_src tablesample(1 percent)
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: sih_src
+            Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+            Select Operator
+              Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+              Group By Operator
+                aggregations: count(1)
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1) from sih_src tablesample(1 percent)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+25
+PREHOOK: query: explain select count(1) from sih_src tablesample(10 rows)
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: sih_src
+            Row Limit Per Split: 10
+            Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+            Select Operator
+              Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+              Group By Operator
+                aggregations: count(1)
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1) from sih_src tablesample(10 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+650
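Tying the three runs together: the unsampled count(1) compiles to a Fetch-only Stage-0 answered from the metastore's numRows=1500, while both sampled variants fall back to a full Map Reduce stage and return computed counts (25 and 650). A toy sketch of that dispatch, with a hypothetical name (tryAnswerCountFromStats is illustrative, not a Hive API):

  import java.util.Optional;

  // Hypothetical illustration of the fixed behavior: consult the sampling
  // guard before short-circuiting count(1) to metastore statistics.
  public class CountFromStatsDemo {

    static Optional<Long> tryAnswerCountFromStats(long metastoreNumRows, boolean sampled) {
      // Before HIVE-13358 the sampled flag was not consulted, so a query like
      // "select count(1) from sih_src tablesample(1 percent)" could be
      // short-circuited to the full-table count (1500) instead of 25.
      return sampled ? Optional.empty() : Optional.of(metastoreNumRows);
    }

    public static void main(String[] args) {
      System.out.println(tryAnswerCountFromStats(1500, false)); // Optional[1500] -> Fetch-only plan
      System.out.println(tryAnswerCountFromStats(1500, true));  // Optional.empty -> run the job
    }
  }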