Repository: hive
Updated Branches:
  refs/heads/master 595398857 -> c53c9be71
HIVE-14773: NPE aggregating column statistics for date column in partitioned table (Pengcheng Xiong, reviewed by Gopal V)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/c53c9be7
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/c53c9be7
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/c53c9be7

Branch: refs/heads/master
Commit: c53c9be7181fc47bb5422473edbba1ad9ae81042
Parents: 5953988
Author: Pengcheng Xiong <pxi...@apache.org>
Authored: Wed Oct 5 10:40:53 2016 -0700
Committer: Pengcheng Xiong <pxi...@apache.org>
Committed: Wed Oct 5 10:41:00 2016 -0700

----------------------------------------------------------------------
 .../hive/metastore/IExtrapolatePartStatus.java  |   1 +
 .../hive/metastore/StatObjectConverter.java     |  29 ++
 .../extrapolate_part_stats_date.q               |  14 +
 .../extrapolate_part_stats_date.q.out           | 302 +++++++++++++++++++
 4 files changed, 346 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/c53c9be7/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java b/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java
index 4859cff..d0569fb 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/IExtrapolatePartStatus.java
@@ -39,6 +39,7 @@ public interface IExtrapolatePartStatus {
       put("int", new Integer[] { 0, 1, 6, 7, 12, 15 });
       put("smallint", new Integer[] { 0, 1, 6, 7, 12, 15 });
       put("tinyint", new Integer[] { 0, 1, 6, 7, 12, 15 });
+      put("date", new Integer[] { 0, 1, 6, 7, 12, 15 });
       put("timestamp", new Integer[] { 0, 1, 6, 7, 12, 15 });
       put("long", new Integer[] { 0, 1, 6, 7, 12, 15 });
       put("double", new Integer[] { 2, 3, 6, 7, 13, 15 });

http://git-wip-us.apache.org/repos/asf/hive/blob/c53c9be7/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
index e119dd8..b259dfa 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
@@ -579,6 +579,35 @@ public class StatObjectConverter {
         longStats.setNumDVs(lowerBound);
       }
       data.setLongStats(longStats);
+    } else if (colType.equals("date")) {
+      DateColumnStatsData dateStats = new DateColumnStatsData();
+      dateStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
+      if (lhigh != null) {
+        dateStats.setHighValue(new Date(MetaStoreDirectSql.extractSqlLong(lhigh)));
+      }
+      if (llow != null) {
+        dateStats.setLowValue(new Date(MetaStoreDirectSql.extractSqlLong(llow)));
+      }
+      long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
+      long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
+      if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null
+          && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) {
+        // We have estimation, lowerbound and higherbound. We use estimation if
+        // it is between lowerbound and higherbound.
+        long estimation = MetaStoreDirectSql
+            .extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql
+                .extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong));
+        if (estimation < lowerBound) {
+          dateStats.setNumDVs(lowerBound);
+        } else if (estimation > higherBound) {
+          dateStats.setNumDVs(higherBound);
+        } else {
+          dateStats.setNumDVs(estimation);
+        }
+      } else {
+        dateStats.setNumDVs(lowerBound);
+      }
+      data.setDateStats(dateStats);
     } else if (colType.equals("double") || colType.equals("float")) {
       DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
       doubleStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
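The new "date" branch above mirrors the existing long/timestamp handling: it converts the SQL-level high/low values into DateColumnStatsData (the metastore's Date type carries a days-since-epoch value) and, when the density function is enabled, clamps the estimated number of distinct values between the two bounds derived from the per-partition stats. A minimal standalone sketch of that clamp follows; the class, names, and sample values are illustrative, not Hive's actual API.

// Standalone sketch of the NDV clamp used in the new date branch; all
// names and values here are illustrative, not Hive's API.
public class NdvClampSketch {

  // Estimate NDV as (high - low) / avg, then keep it only if it lies
  // between lowerBound and higherBound; otherwise use the nearer bound.
  static long estimateNumDVs(long high, long low, double avg,
                             long lowerBound, long higherBound) {
    if (avg == 0.0) {
      return lowerBound; // fallback when density is unusable, as in the patch
    }
    long estimation = (long) ((high - low) / avg);
    if (estimation < lowerBound) {
      return lowerBound;
    } else if (estimation > higherBound) {
      return higherBound;
    }
    return estimation;
  }

  public static void main(String[] args) {
    // (200 - 100) / 25.0 = 4, which lies inside [2, 10], so 4 is kept.
    System.out.println(estimateNumDVs(200, 100, 25.0, 2, 10)); // prints 4
  }
}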
http://git-wip-us.apache.org/repos/asf/hive/blob/c53c9be7/ql/src/test/queries/clientpositive/extrapolate_part_stats_date.q
----------------------------------------------------------------------
diff --git a/ql/src/test/queries/clientpositive/extrapolate_part_stats_date.q b/ql/src/test/queries/clientpositive/extrapolate_part_stats_date.q
new file mode 100644
index 0000000..1f38a65
--- /dev/null
+++ b/ql/src/test/queries/clientpositive/extrapolate_part_stats_date.q
@@ -0,0 +1,14 @@
+set hive.exec.dynamic.partition.mode=nonstrict;
+set hive.stats.fetch.column.stats=true;
+
+create table date_dim (d_date date) partitioned by (d_date_sk bigint) stored as orc;
+insert into date_dim partition(d_date_sk=2416945) values('1905-04-09');
+insert into date_dim partition(d_date_sk=2416946) values('1905-04-10');
+insert into date_dim partition(d_date_sk=2416947) values('1905-04-11');
+analyze table date_dim partition(d_date_sk) compute statistics for columns;
+
+explain select count(*) from date_dim where d_date > date "1900-01-02" and d_date_sk= 2416945;
+
+insert into date_dim partition(d_date_sk=2416948) values('1905-04-12');
+
+explain extended select d_date from date_dim;
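The one-line IExtrapolatePartStatus change suggests where the NPE in the JIRA title came from: the type-to-index map had no "date" entry, so a lookup for a date column returned null and the caller dereferenced it. A toy reproduction of that suspected failure mode (hypothetical, not Hive's aggregation code):

import java.util.HashMap;
import java.util.Map;

// Toy reproduction of the suspected failure mode: looking up an
// unregistered column type returns null, and indexing into the result
// throws a NullPointerException.
public class MissingDateEntry {
  public static void main(String[] args) {
    Map<String, Integer[]> indexMap = new HashMap<>();
    indexMap.put("int", new Integer[] { 0, 1, 6, 7, 12, 15 });
    // Before HIVE-14773 there was no "date" entry here.
    Integer[] indexes = indexMap.get("date");
    System.out.println(indexes[0]); // NullPointerException: indexes is null
  }
}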
http://git-wip-us.apache.org/repos/asf/hive/blob/c53c9be7/ql/src/test/results/clientpositive/extrapolate_part_stats_date.q.out
----------------------------------------------------------------------
diff --git a/ql/src/test/results/clientpositive/extrapolate_part_stats_date.q.out b/ql/src/test/results/clientpositive/extrapolate_part_stats_date.q.out
new file mode 100644
index 0000000..1dce4df
--- /dev/null
+++ b/ql/src/test/results/clientpositive/extrapolate_part_stats_date.q.out
@@ -0,0 +1,302 @@
+PREHOOK: query: create table date_dim (d_date date) partitioned by (d_date_sk bigint) stored as orc
+PREHOOK: type: CREATETABLE
+PREHOOK: Output: database:default
+PREHOOK: Output: default@date_dim
+POSTHOOK: query: create table date_dim (d_date date) partitioned by (d_date_sk bigint) stored as orc
+POSTHOOK: type: CREATETABLE
+POSTHOOK: Output: database:default
+POSTHOOK: Output: default@date_dim
+PREHOOK: query: insert into date_dim partition(d_date_sk=2416945) values('1905-04-09')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__1
+PREHOOK: Output: default@date_dim@d_date_sk=2416945
+POSTHOOK: query: insert into date_dim partition(d_date_sk=2416945) values('1905-04-09')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__1
+POSTHOOK: Output: default@date_dim@d_date_sk=2416945
+POSTHOOK: Lineage: date_dim PARTITION(d_date_sk=2416945).d_date EXPRESSION [(values__tmp__table__1)values__tmp__table__1.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into date_dim partition(d_date_sk=2416946) values('1905-04-10')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__2
+PREHOOK: Output: default@date_dim@d_date_sk=2416946
+POSTHOOK: query: insert into date_dim partition(d_date_sk=2416946) values('1905-04-10')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__2
+POSTHOOK: Output: default@date_dim@d_date_sk=2416946
+POSTHOOK: Lineage: date_dim PARTITION(d_date_sk=2416946).d_date EXPRESSION [(values__tmp__table__2)values__tmp__table__2.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: insert into date_dim partition(d_date_sk=2416947) values('1905-04-11')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__3
+PREHOOK: Output: default@date_dim@d_date_sk=2416947
+POSTHOOK: query: insert into date_dim partition(d_date_sk=2416947) values('1905-04-11')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__3
+POSTHOOK: Output: default@date_dim@d_date_sk=2416947
+POSTHOOK: Lineage: date_dim PARTITION(d_date_sk=2416947).d_date EXPRESSION [(values__tmp__table__3)values__tmp__table__3.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: analyze table date_dim partition(d_date_sk) compute statistics for columns
+PREHOOK: type: QUERY
+PREHOOK: Input: default@date_dim
+PREHOOK: Input: default@date_dim@d_date_sk=2416945
+PREHOOK: Input: default@date_dim@d_date_sk=2416946
+PREHOOK: Input: default@date_dim@d_date_sk=2416947
+#### A masked pattern was here ####
+POSTHOOK: query: analyze table date_dim partition(d_date_sk) compute statistics for columns
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@date_dim
+POSTHOOK: Input: default@date_dim@d_date_sk=2416945
+POSTHOOK: Input: default@date_dim@d_date_sk=2416946
+POSTHOOK: Input: default@date_dim@d_date_sk=2416947
+#### A masked pattern was here ####
+PREHOOK: query: explain select count(*) from date_dim where d_date > date "1900-01-02" and d_date_sk= 2416945
+PREHOOK: type: QUERY
+POSTHOOK: query: explain select count(*) from date_dim where d_date > date "1900-01-02" and d_date_sk= 2416945
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-1 is a root stage
+  Stage-0 depends on stages: Stage-1
+
+STAGE PLANS:
+  Stage: Stage-1
+    Map Reduce
+      Map Operator Tree:
+          TableScan
+            alias: date_dim
+            Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE
+            Filter Operator
+              predicate: (d_date > 1900-01-02) (type: boolean)
+              Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE
+              Select Operator
+                Statistics: Num rows: 1 Data size: 56 Basic stats: COMPLETE Column stats: COMPLETE
+                Group By Operator
+                  aggregations: count()
+                  mode: hash
+                  outputColumnNames: _col0
+                  Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                  Reduce Output Operator
+                    sort order: 
+                    Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+                    value expressions: _col0 (type: bigint)
+      Reduce Operator Tree:
+        Group By Operator
+          aggregations: count(VALUE._col0)
+          mode: mergepartial
+          outputColumnNames: _col0
+          Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+          File Output Operator
+            compressed: false
+            Statistics: Num rows: 1 Data size: 8 Basic stats: COMPLETE Column stats: COMPLETE
+            table:
+                input format: org.apache.hadoop.mapred.SequenceFileInputFormat
+                output format: org.apache.hadoop.hive.ql.io.HiveSequenceFileOutputFormat
+                serde: org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe
+
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Processor Tree:
+        ListSink
+
+PREHOOK: query: insert into date_dim partition(d_date_sk=2416948) values('1905-04-12')
+PREHOOK: type: QUERY
+PREHOOK: Input: default@values__tmp__table__4
+PREHOOK: Output: default@date_dim@d_date_sk=2416948
+POSTHOOK: query: insert into date_dim partition(d_date_sk=2416948) values('1905-04-12')
+POSTHOOK: type: QUERY
+POSTHOOK: Input: default@values__tmp__table__4
+POSTHOOK: Output: default@date_dim@d_date_sk=2416948
+POSTHOOK: Lineage: date_dim PARTITION(d_date_sk=2416948).d_date EXPRESSION [(values__tmp__table__4)values__tmp__table__4.FieldSchema(name:tmp_values_col1, type:string, comment:), ]
+PREHOOK: query: explain extended select d_date from date_dim
+PREHOOK: type: QUERY
+POSTHOOK: query: explain extended select d_date from date_dim
+POSTHOOK: type: QUERY
+STAGE DEPENDENCIES:
+  Stage-0 is a root stage
+
+STAGE PLANS:
+  Stage: Stage-0
+    Fetch Operator
+      limit: -1
+      Partition Description:
+          Partition
+            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+            output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+            partition values:
+              d_date_sk 2416945
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"d_date":"true"}}
+              bucket_count -1
+              columns d_date
+              columns.comments 
+              columns.types date
+#### A masked pattern was here ####
+              name default.date_dim
+              numFiles 1
+              numRows 1
+              partition_columns d_date_sk
+              partition_columns.types bigint
+              rawDataSize 56
+              serialization.ddl struct date_dim { date d_date}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              totalSize 193
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+          
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              properties:
+                bucket_count -1
+                columns d_date
+                columns.comments 
+                columns.types date
+#### A masked pattern was here ####
+                name default.date_dim
+                partition_columns d_date_sk
+                partition_columns.types bigint
+                serialization.ddl struct date_dim { date d_date}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.date_dim
+            name: default.date_dim
+          Partition
+            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+            output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+            partition values:
+              d_date_sk 2416946
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"d_date":"true"}}
+              bucket_count -1
+              columns d_date
+              columns.comments 
+              columns.types date
+#### A masked pattern was here ####
+              name default.date_dim
+              numFiles 1
+              numRows 1
+              partition_columns d_date_sk
+              partition_columns.types bigint
+              rawDataSize 56
+              serialization.ddl struct date_dim { date d_date}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              totalSize 193
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+          
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              properties:
+                bucket_count -1
+                columns d_date
+                columns.comments 
+                columns.types date
+#### A masked pattern was here ####
+                name default.date_dim
+                partition_columns d_date_sk
+                partition_columns.types bigint
+                serialization.ddl struct date_dim { date d_date}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.date_dim
+            name: default.date_dim
+          Partition
+            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+            output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+            partition values:
+              d_date_sk 2416947
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true","COLUMN_STATS":{"d_date":"true"}}
+              bucket_count -1
+              columns d_date
+              columns.comments 
+              columns.types date
+#### A masked pattern was here ####
+              name default.date_dim
+              numFiles 1
+              numRows 1
+              partition_columns d_date_sk
+              partition_columns.types bigint
+              rawDataSize 56
+              serialization.ddl struct date_dim { date d_date}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              totalSize 193
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+          
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              properties:
+                bucket_count -1
+                columns d_date
+                columns.comments 
+                columns.types date
+#### A masked pattern was here ####
+                name default.date_dim
+                partition_columns d_date_sk
+                partition_columns.types bigint
+                serialization.ddl struct date_dim { date d_date}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.date_dim
+            name: default.date_dim
+          Partition
+            input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+            output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+            partition values:
+              d_date_sk 2416948
+            properties:
+              COLUMN_STATS_ACCURATE {"BASIC_STATS":"true"}
+              bucket_count -1
+              columns d_date
+              columns.comments 
+              columns.types date
+#### A masked pattern was here ####
+              name default.date_dim
+              numFiles 1
+              numRows 1
+              partition_columns d_date_sk
+              partition_columns.types bigint
+              rawDataSize 56
+              serialization.ddl struct date_dim { date d_date}
+              serialization.format 1
+              serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              totalSize 193
+#### A masked pattern was here ####
+            serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+          
+              input format: org.apache.hadoop.hive.ql.io.orc.OrcInputFormat
+              output format: org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat
+              properties:
+                bucket_count -1
+                columns d_date
+                columns.comments 
+                columns.types date
+#### A masked pattern was here ####
+                name default.date_dim
+                partition_columns d_date_sk
+                partition_columns.types bigint
+                serialization.ddl struct date_dim { date d_date}
+                serialization.format 1
+                serialization.lib org.apache.hadoop.hive.ql.io.orc.OrcSerde
+#### A masked pattern was here ####
+              serde: org.apache.hadoop.hive.ql.io.orc.OrcSerde
+              name: default.date_dim
+            name: default.date_dim
+      Processor Tree:
+        TableScan
+          alias: date_dim
+          Statistics: Num rows: 4 Data size: 224 Basic stats: COMPLETE Column stats: PARTIAL
+          GatherStats: false
+          Select Operator
+            expressions: d_date (type: date)
+            outputColumnNames: _col0
+            Statistics: Num rows: 4 Data size: 224 Basic stats: COMPLETE Column stats: PARTIAL
+            ListSink
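The final plan shows the extrapolation at work: only three of the four partitions have column statistics (the fourth was inserted after the analyze), so the scan reports "Column stats: PARTIAL" and its size is derived from the analyzed partitions (4 rows at 56 bytes rawDataSize each gives Data size: 224). A toy illustration of that arithmetic, not Hive's implementation:

// Toy illustration of extrapolating the data size when one partition
// lacks column stats; it reproduces the 224-byte figure in the plan above.
public class ExtrapolationSketch {
  public static void main(String[] args) {
    long[] rawDataSizeWithStats = { 56, 56, 56 }; // the analyzed partitions
    int totalPartitions = 4;                      // one partition not analyzed
    long sum = 0;
    for (long size : rawDataSizeWithStats) {
      sum += size;
    }
    double avgPerPartition = (double) sum / rawDataSizeWithStats.length;
    long extrapolated = (long) (avgPerPartition * totalPartitions);
    System.out.println(extrapolated); // prints 224
  }
}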