HIVE-13358: Stats state is not captured correctly: turn off stats optimizer for sampled table (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/77474581
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/77474581
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/77474581

Branch: refs/heads/llap
Commit: 77474581df4016e3899a986e079513087a945674
Parents: 41a30b5
Author: Pengcheng Xiong <pxi...@apache.org>
Authored: Sun Mar 27 22:30:29 2016 -0700
Committer: Pengcheng Xiong <pxi...@apache.org>
Committed: Sun Mar 27 22:30:42 2016 -0700

----------------------------------------------------------------------
 .../hive/ql/optimizer/StatsOptimizer.java      |  14 +-
 .../sample_islocalmode_hook_use_metadata.q     |  48 ++++
 .../sample_islocalmode_hook_use_metadata.q.out | 230 +++++++++++++++++++
 3 files changed, 289 insertions(+), 3 deletions(-)
----------------------------------------------------------------------

http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
index 4091c0d..bc17fec 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
@@ -100,9 +100,12 @@ public class StatsOptimizer extends Transform {
 
   @Override
   public ParseContext transform(ParseContext pctx) throws SemanticException {
-    if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery() ||
-        pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS() ||
-        pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()) {
+    if (pctx.getFetchTask() != null || !pctx.getQueryProperties().isQuery()
+        || pctx.getQueryProperties().isAnalyzeRewrite() || pctx.getQueryProperties().isCTAS()
+        || pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()
+        // If getNameToSplitSample is not empty, at least one of the source
+        // tables is being sampled and we cannot optimize.
+        || !pctx.getNameToSplitSample().isEmpty()) {
       return pctx;
     }
 
@@ -251,6 +254,11 @@ public class StatsOptimizer extends Transform {
         // looks like a subq plan.
         return null;
       }
+      if (tsOp.getConf().getRowLimit() != -1) {
+        // Table is sampled. In some situations we really could leverage the
+        // row limit, but to be safe we do not use it for now.
+        return null;
+      }
       SelectOperator pselOp = (SelectOperator)stack.get(1);
       for(ExprNodeDesc desc : pselOp.getConf().getColList()) {
         if (!((desc instanceof ExprNodeColumnDesc) || (desc instanceof ExprNodeConstantDesc))) {
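The two guards above are easier to see in isolation. Below is a minimal, self-contained sketch (plain Java with simplified stand-in types, not Hive's actual ParseContext/TableScanOperator classes) of the decision this patch enforces: StatsOptimizer may answer simple aggregates directly from metastore statistics only when no source table is sampled, because stored stats always describe the whole table.

  import java.util.Collections;
  import java.util.Map;

  // Standalone sketch of the two new guards; names mirror the patch but the
  // parameters are simplified stand-ins for what Hive tracks internally.
  public class StatsShortcutGuard {

    static boolean canAnswerFromStats(Map<String, Double> nameToSplitSample, int rowLimit) {
      if (!nameToSplitSample.isEmpty()) {
        return false; // TABLESAMPLE(n PERCENT): a source table is split-sampled
      }
      if (rowLimit != -1) {
        return false; // TABLESAMPLE(n ROWS): the scan carries a per-split row limit
      }
      return true;    // stats describe the whole table, safe to use
    }

    public static void main(String[] args) {
      // select count(1) from sih_src
      System.out.println(canAnswerFromStats(Collections.emptyMap(), -1)); // true
      // select count(1) from sih_src tablesample(1 percent)
      System.out.println(canAnswerFromStats(Map.of("sih_src", 1.0), -1)); // false
      // select count(1) from sih_src tablesample(10 rows)
      System.out.println(canAnswerFromStats(Collections.emptyMap(), 10)); // false
    }
  }

The first check covers TABLESAMPLE(n PERCENT), which the planner records per table name; the second covers TABLESAMPLE(n ROWS), which is carried as a per-split row limit on the table scan.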
http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q b/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
new file mode 100644
index 0000000..ac915b5
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/sample_islocalmode_hook_use_metadata.q
@@ -0,0 +1,48 @@
+set hive.mapred.mode=nonstrict;
+set hive.exec.submitviachild=true;
+set hive.exec.submit.local.task.via.child=true;
+set hive.input.format=org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
+set mapred.max.split.size=300;
+set mapred.min.split.size=300;
+set mapred.min.split.size.per.node=300;
+set mapred.min.split.size.per.rack=300;
+set hive.exec.mode.local.auto=true;
+set hive.merge.smallfiles.avgsize=1;
+set hive.compute.query.using.stats=true;
+
+-- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string);
+insert overwrite table sih_i_part partition (p='1') select key, value from src;
+insert overwrite table sih_i_part partition (p='2') select key+10000, value from src;
+insert overwrite table sih_i_part partition (p='3') select key+20000, value from src;
+create table sih_src as select key, value from sih_i_part order by key, value;
+create table sih_src2 as select key, value from sih_src order by key, value;
+
+set hive.exec.post.hooks = org.apache.hadoop.hive.ql.hooks.VerifyIsLocalModeHook;
+set mapreduce.framework.name=yarn;
+set mapreduce.jobtracker.address=localhost:58;
+set hive.sample.seednumber=7;
+
+-- Relaxing hive.exec.mode.local.auto.input.files.max=1.
+-- Hadoop20 will not generate more splits than there are files (one).
+-- Hadoop23 generates splits correctly (four), hence the max needs to be adjusted to ensure running in local mode.
+-- The default value, hive.exec.mode.local.auto.input.files.max=4, produces the expected behavior on Hadoop23.
+-- hive.sample.seednumber is required because Hadoop23 generates multiple splits and tablesample is non-repeatable without it.
+
+-- sample split, running locally limited by num tasks
+
+desc formatted sih_src;
+
+explain select count(1) from sih_src;
+
+select count(1) from sih_src;
+
+explain select count(1) from sih_src tablesample(1 percent);
+
+select count(1) from sih_src tablesample(1 percent);
+
+explain select count(1) from sih_src tablesample(10 rows);
+
+select count(1) from sih_src tablesample(10 rows);
\ No newline at end of file
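One non-obvious number in the expected output below is the 650 returned by select count(1) from sih_src tablesample(10 rows). A back-of-the-envelope model (an assumption about the split arithmetic, not Hive's actual CombineHiveInputFormat planner): with mapred.max.split.size=300 and a 19624-byte table, the input is carved into roughly 65 splits, and the Row Limit Per Split of 10 then yields about 650 rows.

  // Toy model of TABLESAMPLE(n ROWS) under tiny splits; the split count is an
  // approximation (integer division), not Hive's real split-planning logic.
  // Numbers come from the q.out below: totalSize=19624, numRows=1500.
  public class RowSampleEstimate {

    static long sampledRowCount(long totalBytes, long maxSplitBytes,
                                long rowsPerSplit, long totalRows) {
      long splits = totalBytes / maxSplitBytes; // 19624 / 300 = 65
      long sampled = splits * rowsPerSplit;     // 65 * 10 = 650
      return Math.min(sampled, totalRows);      // never more than the table holds
    }

    public static void main(String[] args) {
      System.out.println(sampledRowCount(19624, 300, 10, 1500)); // prints 650
    }
  }

This also shows why the metastore shortcut would be badly wrong here: the stored numRows (1500) has no relationship to what the sampled scan actually reads.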
http://git-wip-us.apache.org/repos/asf/hive/blob/77474581/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out b/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
new file mode 100644
index 0000000..d268837
--- /dev/null
+++ b/ql/src/test/results/clientpositive/sample_islocalmode_hook_use_metadata.q.out
@@ -0,0 +1,230 @@
+PREHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string)
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_i_part
+POSTHOOK: query: -- EXCLUDE_HADOOP_MAJOR_VERSIONS( 0.20S)
+
+-- create file inputs
+create table sih_i_part (key int, value string) partitioned by (p string)
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_i_part
+PREHOOK: query: insert overwrite table sih_i_part partition (p='1') select key, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=1
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='1') select key, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=1
+POSTHOOK: Lineage: sih_i_part PARTITION(p=1).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=1).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table sih_i_part partition (p='2') select key+10000, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=2
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='2') select key+10000, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=2
+POSTHOOK: Lineage: sih_i_part PARTITION(p=2).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=2).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: insert overwrite table sih_i_part partition (p='3') select key+20000, value from src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@src
+PREHOOK: Output: default@sih_i_part@p=3
+POSTHOOK: query: insert overwrite table sih_i_part partition (p='3') select key+20000, value from src
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@src
+POSTHOOK: Output: default@sih_i_part@p=3
+POSTHOOK: Lineage: sih_i_part PARTITION(p=3).key EXPRESSION [(src)src.FieldSchema(name:key, type:string, comment:default), ]
+POSTHOOK: Lineage: sih_i_part PARTITION(p=3).value SIMPLE [(src)src.FieldSchema(name:value, type:string, comment:default), ]
+PREHOOK: query: create table sih_src as select key, value from sih_i_part order by key, value
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sih_i_part
+PREHOOK: Input: default@sih_i_part@p=1
+PREHOOK: Input: default@sih_i_part@p=2
+PREHOOK: Input: default@sih_i_part@p=3
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_src
+POSTHOOK: query: create table sih_src as select key, value from sih_i_part order by key, value
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sih_i_part
+POSTHOOK: Input: default@sih_i_part@p=1
+POSTHOOK: Input: default@sih_i_part@p=2
+POSTHOOK: Input: default@sih_i_part@p=3
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_src
+POSTHOOK: Lineage: sih_src.key SIMPLE [(sih_i_part)sih_i_part.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: sih_src.value SIMPLE [(sih_i_part)sih_i_part.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: create table sih_src2 as select key, value from sih_src order by key, value
+PREHOOK: type: CREATETABLE_AS_SELECT
+PREHOOK: Input: default@sih_src
+PREHOOK: Output: database:default
+PREHOOK: Output: default@sih_src2
+POSTHOOK: query: create table sih_src2 as select key, value from sih_src order by key, value
+POSTHOOK: type: CREATETABLE_AS_SELECT
+POSTHOOK: Input: default@sih_src
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@sih_src2
+POSTHOOK: Lineage: sih_src2.key SIMPLE [(sih_src)sih_src.FieldSchema(name:key, type:int, comment:null), ]
+POSTHOOK: Lineage: sih_src2.value SIMPLE [(sih_src)sih_src.FieldSchema(name:value, type:string, comment:null), ]
+PREHOOK: query: -- Relaxing hive.exec.mode.local.auto.input.files.max=1.
+-- Hadoop20 will not generate more splits than there are files (one).
+-- Hadoop23 generates splits correctly (four), hence the max needs to be adjusted to ensure running in local mode.
+-- The default value, hive.exec.mode.local.auto.input.files.max=4, produces the expected behavior on Hadoop23.
+-- hive.sample.seednumber is required because Hadoop23 generates multiple splits and tablesample is non-repeatable without it.
+
+-- sample split, running locally limited by num tasks
+
+desc formatted sih_src
+PREHOOK: type: DESCTABLE
+PREHOOK: Input: default@sih_src
+# col_name              data_type               comment
+
+key                     int
+value                   string
+
+# Detailed Table Information
+Database:               default
+#### A masked pattern was here ####
+Retention:              0
+#### A masked pattern was here ####
+Table Type:             MANAGED_TABLE
+Table Parameters:
+	COLUMN_STATS_ACCURATE	{\"BASIC_STATS\":\"true\"}
+	numFiles            	1
+	numRows             	1500
+	rawDataSize         	18124
+	totalSize           	19624
+#### A masked pattern was here ####
+
+# Storage Information
+SerDe Library:          org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+InputFormat:            org.apache.hadoop.mapred.TextInputFormat
+OutputFormat:           org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat
+Compressed:             No
+Num Buckets:            -1
+Bucket Columns:         []
+Sort Columns:           []
+Storage Desc Params:
+	serialization.format	1
+PREHOOK: query: explain select count(1) from sih_src
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: 1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1) from sih_src
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+1500
+PREHOOK: query: explain select count(1) from sih_src tablesample(1 percent)
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: sih_src
+            Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+            Select Operator
+              Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+              Group By Operator
+                aggregations: count(1)
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1) from sih_src tablesample(1 percent)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+25
+PREHOOK: query: explain select count(1) from sih_src tablesample(10 rows)
+PREHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: sih_src
+            Row Limit Per Split: 10
+            Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+            Select Operator
+              Statistics: Num rows: 1500 Data size: 18124 Basic stats: COMPLETE Column stats: COMPLETE
+              Group By Operator
+                aggregations: count(1)
+                mode: hash
+                outputColumnNames: _col0
+                Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                Reduce Output Operator
+                  sort order: 
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: select count(1) from sih_src tablesample(10 rows)
+PREHOOK: type: QUERY
+PREHOOK: Input: default@sih_src
+#### A masked pattern was here ####
+650
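Tying the three runs together: the unsampled count(1) compiles to a Fetch-only Stage-0 answered from the metastore's numRows=1500, while both sampled variants fall back to a full Map Reduce stage and return computed counts (25 and 650). A toy sketch of that dispatch, with a hypothetical name (tryAnswerCountFromStats is illustrative, not a Hive API):

  import java.util.Optional;

  // Hypothetical illustration of the fixed behavior: consult the sampling
  // guard before short-circuiting count(1) to metastore statistics.
  public class CountFromStatsDemo {

    static Optional<Long> tryAnswerCountFromStats(long metastoreNumRows, boolean sampled) {
      // Before HIVE-13358 the sampled flag was not consulted, so a query like
      // "select count(1) from sih_src tablesample(1 percent)" could be
      // short-circuited to the full-table count (1500) instead of 25.
      return sampled ? Optional.empty() : Optional.of(metastoreNumRows);
    }

    public static void main(String[] args) {
      System.out.println(tryAnswerCountFromStats(1500, false)); // Optional[1500] -> Fetch-only plan
      System.out.println(tryAnswerCountFromStats(1500, true));  // Optional.empty -> run the job
    }
  }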