HIVE-19247 : StatsOptimizer: Missing stats fast-path for Date (Gopal V via Ashutosh Chauhan)
Signed-off-by: Ashutosh Chauhan <hashut...@apache.org> Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/34ced306 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/34ced306 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/34ced306 Branch: refs/heads/storage-branch-2.6 Commit: 34ced3062f0b5083049cf1c94aa6d5335ee923c7 Parents: 63923e7 Author: Gopal V <gop...@apache.org> Authored: Tue Apr 24 21:51:22 2018 -0700 Committer: Ashutosh Chauhan <hashut...@apache.org> Committed: Tue Apr 24 21:51:22 2018 -0700 ---------------------------------------------------------------------- .../test/resources/testconfiguration.properties | 3 +- .../hive/ql/optimizer/StatsOptimizer.java | 97 ++++++++++++++++++-- ql/src/test/queries/clientpositive/stats_date.q | 18 ++++ .../clientpositive/llap/stats_date.q.out | 80 ++++++++++++++++ 4 files changed, 189 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/34ced306/itests/src/test/resources/testconfiguration.properties ---------------------------------------------------------------------- diff --git a/itests/src/test/resources/testconfiguration.properties b/itests/src/test/resources/testconfiguration.properties index f32b431..2c1a76d 100644 --- a/itests/src/test/resources/testconfiguration.properties +++ b/itests/src/test/resources/testconfiguration.properties @@ -877,7 +877,8 @@ minillaplocal.query.files=\ unionDistinct_3.q,\ vectorized_join46.q,\ vectorized_multi_output_select.q,\ - partialdhj.q + partialdhj.q,\ + stats_date.q encrypted.query.files=encryption_join_unencrypted_tbl.q,\ encryption_insert_partition_static.q,\ http://git-wip-us.apache.org/repos/asf/hive/blob/34ced306/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java index d26a48b..a574372 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java @@ -17,6 +17,7 @@ */ package org.apache.hadoop.hive.ql.optimizer; +import java.sql.Date; import java.util.ArrayList; import java.util.Collection; import java.util.HashMap; @@ -30,6 +31,7 @@ import org.apache.hadoop.hive.common.StatsSetupConst; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.metastore.utils.MetaStoreUtils; @@ -72,6 +74,8 @@ import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFResolver; import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFSum; import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.io.DateWritable; +import org.apache.hadoop.hive.serde2.io.TimestampWritable; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; @@ -146,11 +150,12 @@ public class StatsOptimizer extends Transform { } enum StatType{ - Integeral, + Integer, Double, String, Boolean, Binary, + Date, Unsupported } @@ -163,7 +168,6 @@ public class StatsOptimizer extends Transform { Object cast(long longValue) { return (short)longValue; } }, TINYINT { @Override Object cast(long longValue) { return (byte)longValue; } }; - abstract Object cast(long longValue); } @@ -175,6 +179,13 @@ public class StatsOptimizer extends Transform { abstract Object cast(double doubleValue); } + + enum DateSubType { + DAYS {@Override + Object cast(long longValue) { return (new DateWritable((int)longValue)).get();} + }; + abstract Object cast(long longValue); + } enum GbyKeyType { NULL, CONSTANT, OTHER @@ -182,7 +193,7 @@ public class StatsOptimizer extends Transform { private StatType getType(String origType) { if (serdeConstants.IntegralTypes.contains(origType)) { - return StatType.Integeral; + return StatType.Integer; } else if (origType.equals(serdeConstants.DOUBLE_TYPE_NAME) || origType.equals(serdeConstants.FLOAT_TYPE_NAME)) { return StatType.Double; @@ -192,6 +203,8 @@ public class StatsOptimizer extends Transform { return StatType.Boolean; } else if (origType.equals(serdeConstants.STRING_TYPE_NAME)) { return StatType.String; + } else if (origType.equals(serdeConstants.DATE_TYPE_NAME)) { + return StatType.Date; } return StatType.Unsupported; } @@ -199,7 +212,7 @@ public class StatsOptimizer extends Transform { private Long getNullcountFor(StatType type, ColumnStatisticsData statData) { switch(type) { - case Integeral : + case Integer : return statData.getLongStats().getNumNulls(); case Double: return statData.getDoubleStats().getNumNulls(); @@ -209,6 +222,8 @@ public class StatsOptimizer extends Transform { return statData.getBooleanStats().getNumNulls(); case Binary: return statData.getBinaryStats().getNumNulls(); + case Date: + return statData.getDateStats().getNumNulls(); default: return null; } @@ -515,7 +530,7 @@ public class StatsOptimizer extends Transform { ColumnStatisticsData statData = stats.get(0).getStatsData(); String name = colDesc.getTypeString().toUpperCase(); switch (type) { - case Integeral: { + case Integer: { LongSubType subType = LongSubType.valueOf(name); LongColumnStatsData lstats = statData.getLongStats(); if (lstats.isSetHighValue()) { @@ -535,6 +550,15 @@ public class StatsOptimizer extends Transform { } break; } + case Date: { + DateColumnStatsData dstats = statData.getDateStats(); + if (dstats.isSetHighValue()) { + oneRow.add(DateSubType.DAYS.cast(dstats.getHighValue().getDaysSinceEpoch())); + } else { + oneRow.add(null); + } + break; + } default: // unsupported type Logger.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + @@ -546,7 +570,7 @@ public class StatsOptimizer extends Transform { tsOp.getConf().getAlias(), tsOp).getPartitions(); String name = colDesc.getTypeString().toUpperCase(); switch (type) { - case Integeral: { + case Integer: { LongSubType subType = LongSubType.valueOf(name); Long maxVal = null; @@ -598,6 +622,30 @@ public class StatsOptimizer extends Transform { } break; } + case Date: { + Long maxVal = null; + Collection<List<ColumnStatisticsObj>> result = + verifyAndGetPartColumnStats(hive, tbl, colName, parts); + if (result == null) { + return null; // logging inside + } + for (List<ColumnStatisticsObj> statObj : result) { + ColumnStatisticsData statData = validateSingleColStat(statObj); + if (statData == null) return null; + DateColumnStatsData dstats = statData.getDateStats(); + if (!dstats.isSetHighValue()) { + continue; + } + long curVal = dstats.getHighValue().getDaysSinceEpoch(); + maxVal = maxVal == null ? curVal : Math.max(maxVal, curVal); + } + if (maxVal != null) { + oneRow.add(DateSubType.DAYS.cast(maxVal)); + } else { + oneRow.add(null); + } + break; + } default: Logger.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + "metadata optimizer for column : " + colName); @@ -619,7 +667,7 @@ public class StatsOptimizer extends Transform { .get(0).getStatsData(); String name = colDesc.getTypeString().toUpperCase(); switch (type) { - case Integeral: { + case Integer: { LongSubType subType = LongSubType.valueOf(name); LongColumnStatsData lstats = statData.getLongStats(); if (lstats.isSetLowValue()) { @@ -639,6 +687,15 @@ public class StatsOptimizer extends Transform { } break; } + case Date: { + DateColumnStatsData dstats = statData.getDateStats(); + if (dstats.isSetLowValue()) { + oneRow.add(DateSubType.DAYS.cast(dstats.getLowValue().getDaysSinceEpoch())); + } else { + oneRow.add(null); + } + break; + } default: // unsupported type Logger.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + "metadata optimizer for column : " + colName); @@ -648,7 +705,7 @@ public class StatsOptimizer extends Transform { Set<Partition> parts = pctx.getPrunedPartitions(tsOp.getConf().getAlias(), tsOp).getPartitions(); String name = colDesc.getTypeString().toUpperCase(); switch(type) { - case Integeral: { + case Integer: { LongSubType subType = LongSubType.valueOf(name); Long minVal = null; @@ -700,6 +757,30 @@ public class StatsOptimizer extends Transform { } break; } + case Date: { + Long minVal = null; + Collection<List<ColumnStatisticsObj>> result = + verifyAndGetPartColumnStats(hive, tbl, colName, parts); + if (result == null) { + return null; // logging inside + } + for (List<ColumnStatisticsObj> statObj : result) { + ColumnStatisticsData statData = validateSingleColStat(statObj); + if (statData == null) return null; + DateColumnStatsData dstats = statData.getDateStats(); + if (!dstats.isSetLowValue()) { + continue; + } + long curVal = dstats.getLowValue().getDaysSinceEpoch(); + minVal = minVal == null ? curVal : Math.min(minVal, curVal); + } + if (minVal != null) { + oneRow.add(DateSubType.DAYS.cast(minVal)); + } else { + oneRow.add(null); + } + break; + } default: // unsupported type Logger.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + "metadata optimizer for column : " + colName); http://git-wip-us.apache.org/repos/asf/hive/blob/34ced306/ql/src/test/queries/clientpositive/stats_date.q ---------------------------------------------------------------------- diff --git a/ql/src/test/queries/clientpositive/stats_date.q b/ql/src/test/queries/clientpositive/stats_date.q new file mode 100644 index 0000000..da1ef58 --- /dev/null +++ b/ql/src/test/queries/clientpositive/stats_date.q @@ -0,0 +1,18 @@ + +create table foo(x date, y timestamp) stored as orc; + +insert into foo values('1999-01-01', '1999-01-01 00:00:01'), ('2018-01-01', '2018-01-01 23:23:59'); + +analyze table foo compute statistics for columns; + +set hive.compute.query.using.stats=true; + +set test.comment=All queries need to be just metadata fetch tasks + +explain select min(x) from foo; +explain select max(x) from foo; +explain select count(x) from foo; + +explain select count(x), max(x), min(x) from foo; + +select count(x), max(x), min(x) from foo; http://git-wip-us.apache.org/repos/asf/hive/blob/34ced306/ql/src/test/results/clientpositive/llap/stats_date.q.out ---------------------------------------------------------------------- diff --git a/ql/src/test/results/clientpositive/llap/stats_date.q.out b/ql/src/test/results/clientpositive/llap/stats_date.q.out new file mode 100644 index 0000000..3ccf400 --- /dev/null +++ b/ql/src/test/results/clientpositive/llap/stats_date.q.out @@ -0,0 +1,80 @@ +PREHOOK: query: create table foo(x date, y timestamp) stored as orc +PREHOOK: type: CREATETABLE +PREHOOK: Output: database:default +PREHOOK: Output: default@foo +POSTHOOK: query: create table foo(x date, y timestamp) stored as orc +POSTHOOK: type: CREATETABLE +POSTHOOK: Output: database:default +POSTHOOK: Output: default@foo +PREHOOK: query: insert into foo values('1999-01-01', '1999-01-01 00:00:01'), ('2018-01-01', '2018-01-01 23:23:59') +PREHOOK: type: QUERY +PREHOOK: Input: _dummy_database@_dummy_table +PREHOOK: Output: default@foo +POSTHOOK: query: insert into foo values('1999-01-01', '1999-01-01 00:00:01'), ('2018-01-01', '2018-01-01 23:23:59') +POSTHOOK: type: QUERY +POSTHOOK: Input: _dummy_database@_dummy_table +POSTHOOK: Output: default@foo +POSTHOOK: Lineage: foo.x SCRIPT [] +POSTHOOK: Lineage: foo.y SCRIPT [] +PREHOOK: query: analyze table foo compute statistics for columns +PREHOOK: type: QUERY +PREHOOK: Input: default@foo +PREHOOK: Output: default@foo +#### A masked pattern was here #### +POSTHOOK: query: analyze table foo compute statistics for columns +POSTHOOK: type: QUERY +POSTHOOK: Input: default@foo +POSTHOOK: Output: default@foo +#### A masked pattern was here #### +Warning: Value had a \n character in it. +PREHOOK: query: explain select max(x) from foo +PREHOOK: type: QUERY +POSTHOOK: query: explain select max(x) from foo +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: explain select count(x) from foo +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(x) from foo +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: explain select count(x), max(x), min(x) from foo +PREHOOK: type: QUERY +POSTHOOK: query: explain select count(x), max(x), min(x) from foo +POSTHOOK: type: QUERY +STAGE DEPENDENCIES: + Stage-0 is a root stage + +STAGE PLANS: + Stage: Stage-0 + Fetch Operator + limit: 1 + Processor Tree: + ListSink + +PREHOOK: query: select count(x), max(x), min(x) from foo +PREHOOK: type: QUERY +PREHOOK: Input: default@foo +#### A masked pattern was here #### +POSTHOOK: query: select count(x), max(x), min(x) from foo +POSTHOOK: type: QUERY +POSTHOOK: Input: default@foo +#### A masked pattern was here #### +2 2018-01-01 1999-01-01