Author: thejas Date: Thu Oct 31 21:22:02 2013 New Revision: 1537667 URL: http://svn.apache.org/r1537667 Log: HIVE-5483 : use metastore statistics to optimize max/min/etc. queries (Ashutosh Chauhan via Thejas Nair)
Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q hive/trunk/ql/src/test/results/clientpositive/metadata_only_queries.q.out hive/trunk/ql/src/test/results/clientpositive/stats_only_null.q.out Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java hive/trunk/conf/hive-default.xml.template hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java URL: http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java (original) +++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Thu Oct 31 21:22:02 2013 @@ -658,6 +658,8 @@ public class HiveConf extends Configurat HIVEFETCHTASKAGGR("hive.fetch.task.aggr", false), + HIVEOPTIMIZEMETADATAQUERIES("hive.compute.query.using.stats", false), + // Serde for FetchTask HIVEFETCHOUTPUTSERDE("hive.fetch.output.serde", "org.apache.hadoop.hive.serde2.DelimitedJSONSerDe"), Modified: hive/trunk/conf/hive-default.xml.template URL: http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/conf/hive-default.xml.template (original) +++ hive/trunk/conf/hive-default.xml.template Thu Oct 31 21:22:02 2013 @@ -2032,6 +2032,17 @@ </property> <property> + <name>hive.compute.query.using.stats</name> + <value>false</value> + <description> + When set to true hive will answer few queries like count(1) purely using stats + stored in metastore. For basic stats collection turn on the config hive.stats.autogather to true. + For more advanced stats collection need to run analyze table queries. + </description> +</property> + + +<property> <name>hive.metastore.schema.verification</name> <value>false</value> <description> Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java Thu Oct 31 21:22:02 2013 @@ -498,6 +498,13 @@ public class FetchOperator implements Se * Currently only used by FetchTask. **/ public boolean pushRow() throws IOException, HiveException { + if(work.getRowsComputedUsingStats() != null) { + for (List<Object> row : work.getRowsComputedUsingStats()) { + operator.process(row, 0); + } + operator.flush(); + return true; + } InspectableObject row = getNextRow(); if (row != null) { pushRow(row); @@ -609,6 +616,9 @@ public class FetchOperator implements Se * returns output ObjectInspector, never null */ public ObjectInspector getOutputObjectInspector() throws HiveException { + if(null != work.getStatRowOI()) { + return work.getStatRowOI(); + } try { if (work.isNotPartitioned()) { return getRowInspectorFromTable(work.getTblDesc()); Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java Thu Oct 31 21:22:02 2013 @@ -48,6 +48,8 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.FsShell; import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.common.FileUtils; +import org.apache.hadoop.hive.common.classification.InterfaceAudience.LimitedPrivate; +import org.apache.hadoop.hive.common.classification.InterfaceStability.Unstable; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.conf.HiveConf.ConfVars; import org.apache.hadoop.hive.metastore.HiveMetaException; @@ -1730,7 +1732,7 @@ private void constructOneLBLocationMap(F * @param tbl table for which partitions are needed * @return list of partition objects */ - public Set<Partition> getAllPartitionsForPruner(Table tbl) throws HiveException { + public Set<Partition> getAllPartitionsOf(Table tbl) throws HiveException { if (!tbl.isPartitioned()) { return Sets.newHashSet(new Partition(tbl)); } @@ -2405,21 +2407,13 @@ private void constructOneLBLocationMap(F HiveMetaStoreClient.class.getName()); } - /* - * This api just sets up a metastore client. This is used for - * pre-launching the metastore client so as to reduce latency - * within a single session. - */ - public void setupMSC() throws MetaException { - getMSC(); - } - /** - * * @return the metastore client for the current thread * @throws MetaException */ - private IMetaStoreClient getMSC() throws MetaException { + @LimitedPrivate(value = {"Hive"}) + @Unstable + public IMetaStoreClient getMSC() throws MetaException { if (metaStoreClient == null) { metaStoreClient = createMetaStoreClient(); } Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java Thu Oct 31 21:22:02 2013 @@ -111,6 +111,9 @@ public class Optimizer { if (HiveConf.getFloatVar(hiveConf, HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE) > 0) { transformations.add(new LimitPushdownOptimizer()); } + if(HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEOPTIMIZEMETADATAQUERIES)) { + transformations.add(new StatsOptimizer()); + } transformations.add(new SimpleFetchOptimizer()); // must be called last if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEFETCHTASKAGGR)) { Added: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java?rev=1537667&view=auto ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java (added) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java Thu Oct 31 21:22:02 2013 @@ -0,0 +1,431 @@ +package org.apache.hadoop.hive.ql.optimizer; + +import java.util.ArrayList; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Stack; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; +import org.apache.hadoop.hive.ql.exec.ColumnInfo; +import org.apache.hadoop.hive.ql.exec.Description; +import org.apache.hadoop.hive.ql.exec.FetchTask; +import org.apache.hadoop.hive.ql.exec.FileSinkOperator; +import org.apache.hadoop.hive.ql.exec.GroupByOperator; +import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator; +import org.apache.hadoop.hive.ql.exec.SelectOperator; +import org.apache.hadoop.hive.ql.exec.TableScanOperator; +import org.apache.hadoop.hive.ql.exec.TaskFactory; +import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker; +import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher; +import org.apache.hadoop.hive.ql.lib.Dispatcher; +import org.apache.hadoop.hive.ql.lib.GraphWalker; +import org.apache.hadoop.hive.ql.lib.Node; +import org.apache.hadoop.hive.ql.lib.NodeProcessor; +import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx; +import org.apache.hadoop.hive.ql.lib.Rule; +import org.apache.hadoop.hive.ql.lib.RuleRegExp; +import org.apache.hadoop.hive.ql.metadata.Hive; +import org.apache.hadoop.hive.ql.metadata.Partition; +import org.apache.hadoop.hive.ql.metadata.Table; +import org.apache.hadoop.hive.ql.parse.ParseContext; +import org.apache.hadoop.hive.ql.parse.SemanticException; +import org.apache.hadoop.hive.ql.plan.AggregationDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc; +import org.apache.hadoop.hive.ql.plan.ExprNodeDesc; +import org.apache.hadoop.hive.ql.plan.FetchWork; +import org.apache.hadoop.hive.ql.stats.StatsSetupConst; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax; +import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin; +import org.apache.hadoop.hive.serde.serdeConstants; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; +import org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; + +/** There is a set of queries which can be answered entirely from statistics stored in metastore. + * Examples of such queries are count(*), count(a), max(a), min(b) etc. Hive already collects + * these basic statistics for query planning purposes. These same statistics can be used to + * answer queries also. + * + * Optimizer looks at query plan to determine if it can answer query using statistics + * and than change the plan to answer query entirely using statistics stored in metastore. + */ +public class StatsOptimizer implements Transform { + + private static final Log Log = LogFactory.getLog(StatsOptimizer.class); + + @Override + public ParseContext transform(ParseContext pctx) throws SemanticException { + + if (pctx.getFetchTask() != null || !pctx.getQB().getIsQuery() || + pctx.getQB().isAnalyzeRewrite() || pctx.getQB().isCTAS() || + pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()) { + return pctx; + } + + String TS = TableScanOperator.getOperatorName() + "%"; + String GBY = GroupByOperator.getOperatorName() + "%"; + String RS = ReduceSinkOperator.getOperatorName() + "%"; + String SEL = SelectOperator.getOperatorName() + "%"; + String FS = FileSinkOperator.getOperatorName() + "%"; + + Map<Rule, NodeProcessor> opRules = new LinkedHashMap<Rule, NodeProcessor>(); + opRules.put(new RuleRegExp("R1", TS + SEL + GBY + RS + GBY + SEL + FS), + new MetaDataProcessor(pctx)); + + Dispatcher disp = new DefaultRuleDispatcher(null, opRules, null); + GraphWalker ogw = new DefaultGraphWalker(disp); + + ArrayList<Node> topNodes = new ArrayList<Node>(); + topNodes.addAll(pctx.getTopOps().values()); + ogw.startWalking(topNodes, null); + return pctx; + } + + private static class MetaDataProcessor implements NodeProcessor { + + private final ParseContext pctx; + + public MetaDataProcessor (ParseContext pctx) { + this.pctx = pctx; + } + + enum StatType{ + Integeral, + Double, + String, + Boolean, + Binary, + Unsupported + } + + private StatType getType(String origType) { + if (serdeConstants.IntegralTypes.contains(origType)) { + return StatType.Integeral; + } else if (origType.equals(serdeConstants.DOUBLE_TYPE_NAME) || + origType.equals(serdeConstants.FLOAT_TYPE_NAME)) { + return StatType.Double; + } else if (origType.equals(serdeConstants.BINARY_TYPE_NAME)) { + return StatType.Binary; + } else if (origType.equals(serdeConstants.BOOLEAN_TYPE_NAME)) { + return StatType.Boolean; + } else if (origType.equals(serdeConstants.STRING_TYPE_NAME)) { + return StatType.String; + } + return StatType.Unsupported; + } + + private Long getNullcountFor(StatType type, ColumnStatisticsData statData) { + + switch(type) { + case Integeral : + return statData.getLongStats().getNumNulls(); + case Double: + return statData.getDoubleStats().getNumNulls(); + case String: + return statData.getStringStats().getNumNulls(); + case Boolean: + return statData.getBooleanStats().getNumNulls(); + case Binary: + return statData.getBinaryStats().getNumNulls(); + default: + return null; + } + } + + @Override + public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx, + Object... nodeOutputs) throws SemanticException { + + // 1. Do few checks to determine eligibility of optimization + // 2. look at ExprNodeFuncGenericDesc in select list to see if its min, max, count etc. + // If it is + // 3. Connect to metastore and get the stats + // 4. Compose rows and add it in FetchWork + // 5. Delete GBY - RS - GBY - SEL from the pipeline. + + TableScanOperator tsOp = (TableScanOperator) stack.get(0); + if(tsOp.getParentOperators() != null && tsOp.getParentOperators().size() > 0) { + // looks like a subq plan. + return null; + } + SelectOperator selOp = (SelectOperator)tsOp.getChildren().get(0); + for(ExprNodeDesc desc : selOp.getConf().getColList()) { + if (!(desc instanceof ExprNodeColumnDesc)) { + // Probably an expression, cant handle that + return null; + } + } + // Since we have done an exact match on TS-SEL-GBY-RS-GBY-SEL-FS + // we need not to do any instanceof checks for following. + GroupByOperator gbyOp = (GroupByOperator)selOp.getChildren().get(0); + FileSinkOperator fsOp = (FileSinkOperator)(gbyOp.getChildren().get(0). + getChildren().get(0).getChildren().get(0).getChildren().get(0)); + if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) { + // looks like a subq plan. + return null; + } + List<AggregationDesc> aggrs = gbyOp.getConf().getAggregators(); + + Table tbl = pctx.getTopToTable().get(tsOp); + List<Object> oneRow = new ArrayList<Object>(); + List<ObjectInspector> ois = new ArrayList<ObjectInspector>(); + try{ + Hive hive = Hive.get(pctx.getConf()); + + for (AggregationDesc aggr : aggrs) { + if (aggr.getGenericUDAFName().equals(GenericUDAFCount.class.getAnnotation( + Description.class).name())) { + long rowCnt = 0; + if ((aggr.getParameters().isEmpty() || aggr.getParameters().get(0) instanceof + ExprNodeConstantDesc)) { + // Its either count (*) or count(1) case + if(tbl.isPartitioned()) { + for (Partition part : hive.getAllPartitionsOf(tbl)) { + long partRowCnt = Long.parseLong(part.getParameters() + .get(StatsSetupConst.ROW_COUNT)); + if (partRowCnt < 1) { + Log.debug("Partition doesn't have upto date stats " + part.getSpec()); + return null; + } + rowCnt += partRowCnt; + } + } else { // unpartitioned table + rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT)); + if (rowCnt < 1) { + // if rowCnt < 1 than its either empty table or table on which stats are not + // computed We assume the worse and don't attempt to optimize. + Log.debug("Table doesn't have upto date stats " + tbl.getTableName()); + return null; + } + } + } else { + // Its count(col) case + if (!(aggr.getParameters().get(0) instanceof ExprNodeColumnDesc)) { + // this is weird, we got expr or something in there, bail out + Log.debug("Unexpected expression : " + aggr.getParameters().get(0)); + return null; + } + ExprNodeColumnDesc desc = (ExprNodeColumnDesc)aggr.getParameters().get(0); + String colName = desc.getColumn(); + StatType type = getType(desc.getTypeString()); + if(!tbl.isPartitioned()) { + rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT)); + if (rowCnt < 1) { + Log.debug("Table doesn't have upto date stats " + tbl.getTableName()); + return null; + } + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics( + tbl.getDbName(),tbl.getTableName(),colName). + getStatsObjIterator().next().getStatsData(); + Long nullCnt = getNullcountFor(type, statData); + if (null == nullCnt) { + Log.debug("Unsupported type: " + desc.getTypeString() + " encountered in " + + "metadata optimizer for column : " + colName); + return null; + } else { + rowCnt -= nullCnt; + } + } else { + for (Partition part : hive.getAllPartitionsOf(tbl)) { + Long partRowCnt = Long.parseLong(part.getParameters() + .get(StatsSetupConst.ROW_COUNT)); + if (partRowCnt < 1) { + Log.debug("Partition doesn't have upto date stats " + part.getSpec()); + return null; + } + rowCnt += partRowCnt; + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics( + tbl.getDbName(), tbl.getTableName(),part.getName(), colName) + .getStatsObjIterator().next().getStatsData(); + Long nullCnt = getNullcountFor(type, statData); + if(nullCnt == null) { + Log.debug("Unsupported type: " + desc.getTypeString() + " encountered in " + + "metadata optimizer for column : " + colName); + return null; + } else { + rowCnt -= nullCnt; + } + } + } + } + oneRow.add(rowCnt); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG)); + } else if (aggr.getGenericUDAFName().equals(GenericUDAFMax.class.getAnnotation( + Description.class).name())) { + ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc)aggr.getParameters().get(0); + String colName = colDesc.getColumn(); + StatType type = getType(colDesc.getTypeString()); + if(!tbl.isPartitioned()) { + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics( + tbl.getDbName(),tbl.getTableName(),colName). + getStatsObjIterator().next().getStatsData(); + switch (type) { + case Integeral: + oneRow.add(statData.getLongStats().getHighValue()); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG)); + break; + case Double: + oneRow.add(statData.getDoubleStats().getHighValue()); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE)); + break; + default: + // unsupported type + Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + + "metadata optimizer for column : " + colName); + return null; + } + } else { + List<String> parts = hive.getMSC().listPartitionNames(tbl.getDbName(), + tbl.getTableName(), (short)-1); + switch(type) { + case Integeral: { + long maxVal = Long.MIN_VALUE; + for (String part : parts) { + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics( + tbl.getDbName(),tbl.getTableName(), part, colName). + getStatsObjIterator().next().getStatsData(); + maxVal = Math.max(maxVal,statData.getLongStats().getHighValue()); + } + oneRow.add(maxVal); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG)); + break; + } + case Double: { + double maxVal = Double.MIN_VALUE; + for (String part : parts) { + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics( + tbl.getDbName(),tbl.getTableName(), part, colName). + getStatsObjIterator().next().getStatsData(); + maxVal = Math.max(maxVal,statData.getDoubleStats().getHighValue()); + } + oneRow.add(maxVal); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE)); + break; + } + default: + Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + + "metadata optimizer for column : " + colName); + return null; + } + } + } else if (aggr.getGenericUDAFName().equals(GenericUDAFMin.class.getAnnotation( + Description.class).name())) { + ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc)aggr.getParameters().get(0); + String colName = colDesc.getColumn(); + StatType type = getType(colDesc.getTypeString()); + if (!tbl.isPartitioned()) { + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics( + tbl.getDbName(),tbl.getTableName(),colName). + getStatsObjIterator().next().getStatsData(); + switch (type) { + case Integeral: + oneRow.add(statData.getLongStats().getLowValue()); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG)); + break; + case Double: + oneRow.add(statData.getDoubleStats().getLowValue()); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE)); + break; + default: // unsupported type + Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + + "metadata optimizer for column : " + colName); + return null; + } + } else { + List<String> parts = hive.getMSC().listPartitionNames(tbl.getDbName(), + tbl.getTableName(), (short)-1); + switch(type) { + case Integeral: { + long minVal = Long.MAX_VALUE; + for (String part : parts) { + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics( + tbl.getDbName(),tbl.getTableName(), part, colName). + getStatsObjIterator().next().getStatsData(); + minVal = Math.min(minVal,statData.getLongStats().getLowValue()); + } + oneRow.add(minVal); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG)); + break; + } + case Double: { + double minVal = Double.MAX_VALUE; + for (String part : parts) { + //TODO: After HIVE-3777 use the property to figure out if following + // stats is fresh or not. + ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics( + tbl.getDbName(),tbl.getTableName(), part, colName). + getStatsObjIterator().next().getStatsData(); + minVal = Math.min(minVal,statData.getDoubleStats().getLowValue()); + } + oneRow.add(minVal); + ois.add(PrimitiveObjectInspectorFactory. + getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE)); + break; + } + default: // unsupported type + Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " + + "metadata optimizer for column : " + colName); + return null; + + } + } + } else { // Unsupported aggregation. + Log.debug("Unsupported aggregation for metadata optimizer: " + + aggr.getGenericUDAFName()); + return null; + } + } + } catch (Exception e) { + // this is best effort optimization, bail out in error conditions and + // try generate and execute slower plan + Log.debug("Failed to optimize using metadata optimizer", e); + return null; + } + + List<List<Object>> allRows = new ArrayList<List<Object>>(); + allRows.add(oneRow); + + List<String> colNames = new ArrayList<String>(); + for (ColumnInfo colInfo: gbyOp.getSchema().getSignature()) { + colNames.add(colInfo.getInternalName()); + } + StandardStructObjectInspector sOI = ObjectInspectorFactory. + getStandardStructObjectInspector(colNames, ois); + FetchWork fWork = new FetchWork(allRows, sOI); + FetchTask fTask = (FetchTask)TaskFactory.get(fWork, pctx.getConf()); + fWork.setLimit(allRows.size()); + pctx.setFetchTask(fTask); + + return null; + } + } +} Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java Thu Oct 31 21:22:02 2013 @@ -337,7 +337,7 @@ public class PartitionPruner implements private static Set<Partition> getAllPartitions(Table tab) throws HiveException { PerfLogger perfLogger = PerfLogger.getPerfLogger(); perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING); - Set<Partition> result = Hive.get().getAllPartitionsForPruner(tab); + Set<Partition> result = Hive.get().getAllPartitionsOf(tab); perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING); return result; } Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java Thu Oct 31 21:22:02 2013 @@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.hive.ql.exec.ListSinkOperator; import org.apache.hadoop.hive.ql.exec.Operator; import org.apache.hadoop.hive.ql.parse.SplitSample; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; /** * FetchWork. @@ -50,6 +51,9 @@ public class FetchWork implements Serial private SplitSample splitSample; + private transient List<List<Object>> rowsComputedFromStats; + private transient ObjectInspector statRowOI; + /** * Serialization Null Format for the serde used to fetch data. */ @@ -58,6 +62,19 @@ public class FetchWork implements Serial public FetchWork() { } + public FetchWork(List<List<Object>> rowsComputedFromStats,ObjectInspector statRowOI) { + this.rowsComputedFromStats = rowsComputedFromStats; + this.statRowOI = statRowOI; + } + + public ObjectInspector getStatRowOI() { + return statRowOI; + } + + public List<List<Object>> getRowsComputedUsingStats() { + return rowsComputedFromStats; + } + public FetchWork(String tblDir, TableDesc tblDesc) { this(tblDir, tblDesc, -1); } Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java?rev=1537667&r1=1537666&r2=1537667&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java Thu Oct 31 21:22:02 2013 @@ -33,8 +33,6 @@ import java.util.Map; import java.util.Set; import java.util.UUID; -import javax.security.auth.login.LoginException; - import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; import org.apache.commons.logging.Log; @@ -281,7 +279,7 @@ public class SessionState { // Get the following out of the way when you start the session these take a // while and should be done when we start up. try { - Hive.get(startSs.conf).setupMSC(); + Hive.get(startSs.conf).getMSC(); ShimLoader.getHadoopShims().getUGIForConf(startSs.conf); FileSystem.get(startSs.conf); } catch (Exception e) { @@ -289,7 +287,7 @@ public class SessionState { // that would cause ClassNoFoundException otherwise throw new RuntimeException(e); } - + try { startSs.authenticator = HiveUtils.getAuthenticator( startSs.getConf(),HiveConf.ConfVars.HIVE_AUTHENTICATOR_MANAGER); Added: hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q?rev=1537667&view=auto ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q (added) +++ hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q Thu Oct 31 21:22:02 2013 @@ -0,0 +1,75 @@ +set hive.compute.query.using.stats=true; +set hive.stats.autogather=true; +create table over10k( + t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + dec decimal, + bin binary) + row format delimited + fields terminated by '|'; + +load data local inpath '../data/files/over10k' into table over10k; + +create table stats_tbl( + t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + dec decimal, + bin binary); + +create table stats_tbl_part( + t tinyint, + si smallint, + i int, + b bigint, + f float, + d double, + bo boolean, + s string, + ts timestamp, + dec decimal, + bin binary) partitioned by (dt string); + + +insert overwrite table stats_tbl select * from over10k; + +insert into table stats_tbl_part partition (dt='2010') select * from over10k where t>0 and t<30; +insert into table stats_tbl_part partition (dt='2011') select * from over10k where t>30 and t<60; +insert into table stats_tbl_part partition (dt='2012') select * from over10k where t>60; + +explain +select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b) from stats_tbl; +explain +select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b) from stats_tbl_part; + +analyze table stats_tbl compute statistics for columns t,si,i,b,f,d,bo,s,bin; +analyze table stats_tbl_part partition(dt='2010') compute statistics for columns t,si,i,b,f,d,bo,s,bin; +analyze table stats_tbl_part partition(dt='2011') compute statistics for columns t,si,i,b,f,d,bo,s,bin; +analyze table stats_tbl_part partition(dt='2012') compute statistics for columns t,si,i,b,f,d,bo,s,bin; + +explain +select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl; +select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl; +explain +select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl_part; +select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl_part; + +explain select count(ts) from stats_tbl_part; + +drop table stats_tbl; +drop table stats_tbl_part; + +set hive.compute.query.using.stats=false; Added: hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q?rev=1537667&view=auto ============================================================================== --- hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q (added) +++ hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q Thu Oct 31 21:22:02 2013 @@ -0,0 +1,36 @@ +set hive.compute.query.using.stats=true; +set hive.stats.autogather=true; +CREATE TABLE temps_null(a double, b int, c STRING, d smallint) STORED AS TEXTFILE; + +CREATE TABLE stats_null(a double, b int, c STRING, d smallint) STORED AS TEXTFILE; + +CREATE TABLE stats_null_part(a double, b int, c STRING, d smallint) partitioned by (dt string) STORED AS TEXTFILE; + +LOAD DATA LOCAL INPATH '../data/files/null.txt' INTO TABLE temps_null; + +insert overwrite table stats_null select * from temps_null; +insert overwrite table stats_null_part partition(dt='2010') select * from temps_null where d <=5; + +insert overwrite table stats_null_part partition(dt='2011') select * from temps_null where d > 5; +explain +select count(*), count(a), count(b), count(c), count(d) from stats_null; +explain +select count(*), count(a), count(b), count(c), count(d) from stats_null_part; + + +analyze table stats_null compute statistics for columns a,b,c,d; +analyze table stats_null_part partition(dt='2010') compute statistics for columns a,b,c,d; +analyze table stats_null_part partition(dt='2011') compute statistics for columns a,b,c,d; + +explain +select count(*), count(a), count(b), count(c), count(d) from stats_null; +explain +select count(*), count(a), count(b), count(c), count(d) from stats_null_part; + + +select count(*), count(a), count(b), count(c), count(d) from stats_null; +select count(*), count(a), count(b), count(c), count(d) from stats_null_part; +drop table stats_null; +drop table stats_null_part; +drop table temps_null; +set hive.compute.query.using.stats=false;