org...

thejas Thu, 31 Oct 2013 14:23:25 -0700

Author: thejas
Date: Thu Oct 31 21:22:02 2013
New Revision: 1537667

URL: http://svn.apache.org/r1537667
Log:
HIVE-5483 : use metastore statistics to optimize max/min/etc. queries (Ashutosh 
Chauhan via Thejas Nair)


Added:
    
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
    hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q
    hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q
    hive/trunk/ql/src/test/results/clientpositive/metadata_only_queries.q.out
    hive/trunk/ql/src/test/results/clientpositive/stats_only_null.q.out
Modified:
    hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
    hive/trunk/conf/hive-default.xml.template
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
    
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java

Modified: hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java 
(original)
+++ hive/trunk/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java Thu 
Oct 31 21:22:02 2013
@@ -658,6 +658,8 @@ public class HiveConf extends Configurat
 
     HIVEFETCHTASKAGGR("hive.fetch.task.aggr", false),
 
+    HIVEOPTIMIZEMETADATAQUERIES("hive.compute.query.using.stats", false),
+
     // Serde for FetchTask
     HIVEFETCHOUTPUTSERDE("hive.fetch.output.serde", 
"org.apache.hadoop.hive.serde2.DelimitedJSONSerDe"),
 

Modified: hive/trunk/conf/hive-default.xml.template
URL: 
http://svn.apache.org/viewvc/hive/trunk/conf/hive-default.xml.template?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/conf/hive-default.xml.template (original)
+++ hive/trunk/conf/hive-default.xml.template Thu Oct 31 21:22:02 2013
@@ -2032,6 +2032,17 @@
 </property>
 
 <property>
+  <name>hive.compute.query.using.stats</name>
+  <value>false</value>
+  <description>
+  When set to true, Hive will answer a few queries, such as count(1), purely using stats
+  stored in the metastore. For basic stats collection, set the config hive.stats.autogather to true.
+  For more advanced stats collection, you need to run analyze table queries.
+  </description>
+</property>
+
+
+<property>
   <name>hive.metastore.schema.verification</name>
   <value>false</value>
    <description>

Modified: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/exec/FetchOperator.java 
Thu Oct 31 21:22:02 2013
@@ -498,6 +498,13 @@ public class FetchOperator implements Se
    * Currently only used by FetchTask.
    **/
   public boolean pushRow() throws IOException, HiveException {
+    if(work.getRowsComputedUsingStats() != null) {
+      for (List<Object> row : work.getRowsComputedUsingStats()) {
+        operator.process(row, 0);
+      }
+      operator.flush();
+      return true;
+    }
     InspectableObject row = getNextRow();
     if (row != null) {
       pushRow(row);
@@ -609,6 +616,9 @@ public class FetchOperator implements Se
    * returns output ObjectInspector, never null
    */
   public ObjectInspector getOutputObjectInspector() throws HiveException {
+    if(null != work.getStatRowOI()) {
+      return work.getStatRowOI();
+    }
     try {
       if (work.isNotPartitioned()) {
         return getRowInspectorFromTable(work.getTblDesc());

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/metadata/Hive.java Thu Oct 
31 21:22:02 2013
@@ -48,6 +48,8 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.FsShell;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.common.FileUtils;
+import 
org.apache.hadoop.hive.common.classification.InterfaceAudience.LimitedPrivate;
+import 
org.apache.hadoop.hive.common.classification.InterfaceStability.Unstable;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.conf.HiveConf.ConfVars;
 import org.apache.hadoop.hive.metastore.HiveMetaException;
@@ -1730,7 +1732,7 @@ private void constructOneLBLocationMap(F
    * @param tbl table for which partitions are needed
    * @return list of partition objects
    */
-  public Set<Partition> getAllPartitionsForPruner(Table tbl) throws 
HiveException {
+  public Set<Partition> getAllPartitionsOf(Table tbl) throws HiveException {
     if (!tbl.isPartitioned()) {
       return Sets.newHashSet(new Partition(tbl));
     }
@@ -2405,21 +2407,13 @@ private void constructOneLBLocationMap(F
         HiveMetaStoreClient.class.getName());
   }
 
-  /*
-   * This api just sets up a metastore client. This is used for
-   * pre-launching the metastore client so as to reduce latency
-   * within a single session.
-   */
-  public void setupMSC() throws MetaException {
-    getMSC();
-  }
-
   /**
-   *
    * @return the metastore client for the current thread
    * @throws MetaException
    */
-  private IMetaStoreClient getMSC() throws MetaException {
+  @LimitedPrivate(value = {"Hive"})
+  @Unstable
+  public IMetaStoreClient getMSC() throws MetaException {
     if (metaStoreClient == null) {
       metaStoreClient = createMetaStoreClient();
     }

Modified: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/Optimizer.java 
Thu Oct 31 21:22:02 2013
@@ -111,6 +111,9 @@ public class Optimizer {
     if (HiveConf.getFloatVar(hiveConf, 
HiveConf.ConfVars.HIVELIMITPUSHDOWNMEMORYUSAGE) > 0) {
       transformations.add(new LimitPushdownOptimizer());
     }
+    if(HiveConf.getBoolVar(hiveConf, 
HiveConf.ConfVars.HIVEOPTIMIZEMETADATAQUERIES)) {
+      transformations.add(new StatsOptimizer());
+    }
     transformations.add(new SimpleFetchOptimizer());  // must be called last
 
     if (HiveConf.getBoolVar(hiveConf, HiveConf.ConfVars.HIVEFETCHTASKAGGR)) {

Added: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java?rev=1537667&view=auto
==============================================================================
--- 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java 
(added)
+++ 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/StatsOptimizer.java 
Thu Oct 31 21:22:02 2013
@@ -0,0 +1,431 @@
+package org.apache.hadoop.hive.ql.optimizer;
+
+import java.util.ArrayList;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.ql.exec.ColumnInfo;
+import org.apache.hadoop.hive.ql.exec.Description;
+import org.apache.hadoop.hive.ql.exec.FetchTask;
+import org.apache.hadoop.hive.ql.exec.FileSinkOperator;
+import org.apache.hadoop.hive.ql.exec.GroupByOperator;
+import org.apache.hadoop.hive.ql.exec.ReduceSinkOperator;
+import org.apache.hadoop.hive.ql.exec.SelectOperator;
+import org.apache.hadoop.hive.ql.exec.TableScanOperator;
+import org.apache.hadoop.hive.ql.exec.TaskFactory;
+import org.apache.hadoop.hive.ql.lib.DefaultGraphWalker;
+import org.apache.hadoop.hive.ql.lib.DefaultRuleDispatcher;
+import org.apache.hadoop.hive.ql.lib.Dispatcher;
+import org.apache.hadoop.hive.ql.lib.GraphWalker;
+import org.apache.hadoop.hive.ql.lib.Node;
+import org.apache.hadoop.hive.ql.lib.NodeProcessor;
+import org.apache.hadoop.hive.ql.lib.NodeProcessorCtx;
+import org.apache.hadoop.hive.ql.lib.Rule;
+import org.apache.hadoop.hive.ql.lib.RuleRegExp;
+import org.apache.hadoop.hive.ql.metadata.Hive;
+import org.apache.hadoop.hive.ql.metadata.Partition;
+import org.apache.hadoop.hive.ql.metadata.Table;
+import org.apache.hadoop.hive.ql.parse.ParseContext;
+import org.apache.hadoop.hive.ql.parse.SemanticException;
+import org.apache.hadoop.hive.ql.plan.AggregationDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeConstantDesc;
+import org.apache.hadoop.hive.ql.plan.ExprNodeDesc;
+import org.apache.hadoop.hive.ql.plan.FetchWork;
+import org.apache.hadoop.hive.ql.stats.StatsSetupConst;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFCount;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMax;
+import org.apache.hadoop.hive.ql.udf.generic.GenericUDAFMin;
+import org.apache.hadoop.hive.serde.serdeConstants;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
+import 
org.apache.hadoop.hive.serde2.objectinspector.StandardStructObjectInspector;
+import 
org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
+
+/** There is a set of queries which can be answered entirely from statistics stored in metastore.
+ * Examples of such queries are count(*), count(a), max(a), min(b) etc. Hive already collects
+ * these basic statistics for query planning purposes. These same statistics can be used to
+ * answer queries also.
+ *
+ * Optimizer looks at query plan to determine if it can answer query using statistics
+ * and then changes the plan to answer query entirely using statistics stored in metastore.
+ */
+public class StatsOptimizer implements Transform {
+
+  private static final Log Log = LogFactory.getLog(StatsOptimizer.class);
+
+  /**
+   * Walks the operator tree looking for a TS-SEL-GBY-RS-GBY-SEL-FS pipeline and hands every
+   * match to a {@link MetaDataProcessor}.
+   *
+   * The parse context is returned untouched when a fetch task is already set, when the
+   * query block is not a query, is an analyze rewrite or a CTAS, when more than one load-file
+   * work exists, or when any load-table work exists.
+   *
+   * @param pctx the parse context of the statement being optimized
+   * @return the same parse context instance that was passed in
+   */
+  @Override
+  public ParseContext transform(ParseContext pctx) throws SemanticException {
+
+    // Eligibility guards, evaluated in the same order as a single short-circuit condition.
+    if (pctx.getFetchTask() != null || !pctx.getQB().getIsQuery()) {
+      return pctx;
+    }
+    if (pctx.getQB().isAnalyzeRewrite() || pctx.getQB().isCTAS()) {
+      return pctx;
+    }
+    if (pctx.getLoadFileWork().size() > 1 || !pctx.getLoadTableWork().isEmpty()) {
+      return pctx;
+    }
+
+    // Each operator name followed by '%' is one step of the rule expression.
+    String scan = TableScanOperator.getOperatorName() + "%";
+    String groupBy = GroupByOperator.getOperatorName() + "%";
+    String reduceSink = ReduceSinkOperator.getOperatorName() + "%";
+    String select = SelectOperator.getOperatorName() + "%";
+    String fileSink = FileSinkOperator.getOperatorName() + "%";
+    String pipeline = scan + select + groupBy + reduceSink + groupBy + select + fileSink;
+
+    Map<Rule, NodeProcessor> rules = new LinkedHashMap<Rule, NodeProcessor>();
+    rules.put(new RuleRegExp("R1", pipeline), new MetaDataProcessor(pctx));
+
+    GraphWalker walker = new DefaultGraphWalker(new DefaultRuleDispatcher(null, rules, null));
+
+    ArrayList<Node> roots = new ArrayList<Node>(pctx.getTopOps().values());
+    walker.startWalking(roots, null);
+    return pctx;
+  }
+
+  private static class MetaDataProcessor implements NodeProcessor {
+
+    private final ParseContext pctx;
+
+    public MetaDataProcessor (ParseContext pctx) {
+      this.pctx = pctx;
+    }
+
+    enum StatType{
+      Integeral,
+      Double,
+      String,
+      Boolean,
+      Binary,
+      Unsupported
+    }
+
+    /**
+     * Maps a column type name onto the StatType family used to pick the matching
+     * column-statistics accessor.
+     *
+     * @param origType the column type name to classify
+     * @return the matching StatType, or StatType.Unsupported when no family matches
+     */
+    private StatType getType(String origType) {
+      if (serdeConstants.IntegralTypes.contains(origType)) {
+        return StatType.Integeral;
+      }
+      // float and double share the double statistics family
+      boolean isFloatingPoint = origType.equals(serdeConstants.DOUBLE_TYPE_NAME)
+          || origType.equals(serdeConstants.FLOAT_TYPE_NAME);
+      if (isFloatingPoint) {
+        return StatType.Double;
+      }
+      if (origType.equals(serdeConstants.BINARY_TYPE_NAME)) {
+        return StatType.Binary;
+      }
+      if (origType.equals(serdeConstants.BOOLEAN_TYPE_NAME)) {
+        return StatType.Boolean;
+      }
+      if (origType.equals(serdeConstants.STRING_TYPE_NAME)) {
+        return StatType.String;
+      }
+      return StatType.Unsupported;
+    }
+
+    /**
+     * Reads the null count out of the statistics member that corresponds to the given type.
+     *
+     * @param type the statistics family of the column
+     * @param statData the column statistics holding one member per family
+     * @return the number of nulls, or null when the type has no supported statistics
+     */
+    private Long getNullcountFor(StatType type, ColumnStatisticsData statData) {
+      Long numNulls = null;
+      switch (type) {
+      case Integeral:
+        numNulls = statData.getLongStats().getNumNulls();
+        break;
+      case Double:
+        numNulls = statData.getDoubleStats().getNumNulls();
+        break;
+      case String:
+        numNulls = statData.getStringStats().getNumNulls();
+        break;
+      case Boolean:
+        numNulls = statData.getBooleanStats().getNumNulls();
+        break;
+      case Binary:
+        numNulls = statData.getBinaryStats().getNumNulls();
+        break;
+      default:
+        // unsupported family: leave the result as null
+        break;
+      }
+      return numNulls;
+    }
+
+    /**
+     * Tries to answer a matched TS-SEL-GBY-RS-GBY-SEL-FS pipeline purely from metastore
+     * statistics.
+     *
+     * When every aggregation is a count, max or min that can be derived from the stored row
+     * counts and column statistics, a single result row is built and installed on the parse
+     * context as a FetchTask. In every other case (sub-query shaped plan, expressions in the
+     * select list, grouping keys, unsupported aggregation or column type, non-positive row
+     * count, or any exception while reading statistics) the plan is left untouched.
+     *
+     * @param nd the node the rule fired on (not used directly)
+     * @param stack the operator stack of the walk; element 0 is cast to the table scan
+     * @param procCtx the processor context (unused)
+     * @param nodeOutputs outputs of previously processed nodes (unused)
+     * @return always null
+     * @throws SemanticException as declared by the overridden signature
+     */
+    @Override
+    public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procCtx,
+        Object... nodeOutputs) throws SemanticException {
+
+      // 1. Do few checks to determine eligibility of optimization
+      // 2. look at ExprNodeFuncGenericDesc in select list to see if its min, max, count etc.
+      //    If it is
+      // 3. Connect to metastore and get the stats
+      // 4. Compose rows and add it in FetchWork
+      // 5. Set the resulting FetchTask on the parse context.
+
+      TableScanOperator tsOp = (TableScanOperator) stack.get(0);
+      if(tsOp.getParentOperators() != null && tsOp.getParentOperators().size() > 0) {
+        // looks like a subq plan.
+        return null;
+      }
+      SelectOperator selOp = (SelectOperator)tsOp.getChildren().get(0);
+      for(ExprNodeDesc desc : selOp.getConf().getColList()) {
+        if (!(desc instanceof ExprNodeColumnDesc)) {
+          // Probably an expression, cant handle that
+          return null;
+        }
+      }
+      // Since we have done an exact match on TS-SEL-GBY-RS-GBY-SEL-FS
+      // we need not to do any instanceof checks for following.
+      GroupByOperator gbyOp = (GroupByOperator)selOp.getChildren().get(0);
+      if (gbyOp.getConf().getKeys() != null && !gbyOp.getConf().getKeys().isEmpty()) {
+        // Grouping keys mean per-group results, which table or partition level
+        // statistics cannot answer with the single row composed below.
+        Log.debug("Group by keys present, not optimizing using metadata optimizer");
+        return null;
+      }
+      FileSinkOperator fsOp = (FileSinkOperator)(gbyOp.getChildren().get(0).
+          getChildren().get(0).getChildren().get(0).getChildren().get(0));
+      if (fsOp.getChildOperators() != null && fsOp.getChildOperators().size() > 0) {
+        // looks like a subq plan.
+        return null;
+      }
+      List<AggregationDesc> aggrs = gbyOp.getConf().getAggregators();
+
+      Table tbl = pctx.getTopToTable().get(tsOp);
+      List<Object> oneRow = new ArrayList<Object>();
+      List<ObjectInspector> ois = new ArrayList<ObjectInspector>();
+      try{
+        Hive hive = Hive.get(pctx.getConf());
+
+        for (AggregationDesc aggr : aggrs) {
+          if (aggr.getGenericUDAFName().equals(GenericUDAFCount.class.getAnnotation(
+              Description.class).name())) {
+            long rowCnt = 0;
+            if ((aggr.getParameters().isEmpty() || aggr.getParameters().get(0) instanceof
+                ExprNodeConstantDesc)) {
+              // Its either count (*) or count(1) case
+              if(tbl.isPartitioned()) {
+                for (Partition part : hive.getAllPartitionsOf(tbl)) {
+                  long partRowCnt = Long.parseLong(part.getParameters()
+                    .get(StatsSetupConst.ROW_COUNT));
+                  if (partRowCnt < 1) {
+                    Log.debug("Partition doesn't have upto date stats " + part.getSpec());
+                    return null;
+                  }
+                  rowCnt += partRowCnt;
+                }
+              } else { // unpartitioned table
+                rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT));
+                if (rowCnt < 1) {
+                  // if rowCnt < 1 than its either empty table or table on which stats are not
+                  //  computed We assume the worse and don't attempt to optimize.
+                  Log.debug("Table doesn't have upto date stats " + tbl.getTableName());
+                  return null;
+                }
+              }
+            } else {
+              // Its count(col) case
+              if (!(aggr.getParameters().get(0) instanceof ExprNodeColumnDesc)) {
+                // this is weird, we got expr or something in there, bail out
+                Log.debug("Unexpected expression : " + aggr.getParameters().get(0));
+                return null;
+              }
+              ExprNodeColumnDesc desc = (ExprNodeColumnDesc)aggr.getParameters().get(0);
+              String colName = desc.getColumn();
+              StatType type = getType(desc.getTypeString());
+              if(!tbl.isPartitioned()) {
+                rowCnt = Long.parseLong(tbl.getProperty(StatsSetupConst.ROW_COUNT));
+                if (rowCnt < 1) {
+                  Log.debug("Table doesn't have upto date stats " + tbl.getTableName());
+                  return null;
+                }
+                //TODO: After HIVE-3777 use the property to figure out if following
+                // stats is fresh or not.
+                ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics(
+                    tbl.getDbName(),tbl.getTableName(),colName).
+                    getStatsObjIterator().next().getStatsData();
+                Long nullCnt = getNullcountFor(type, statData);
+                if (null == nullCnt) {
+                  Log.debug("Unsupported type: " + desc.getTypeString() + " encountered in " +
+                      "metadata optimizer for column : " + colName);
+                  return null;
+                } else {
+                  // count(col) = rows - nulls in col
+                  rowCnt -= nullCnt;
+                }
+              } else {
+                for (Partition part : hive.getAllPartitionsOf(tbl)) {
+                  long partRowCnt = Long.parseLong(part.getParameters()
+                    .get(StatsSetupConst.ROW_COUNT));
+                  if (partRowCnt < 1) {
+                    Log.debug("Partition doesn't have upto date stats " + part.getSpec());
+                    return null;
+                  }
+                  rowCnt += partRowCnt;
+                  //TODO: After HIVE-3777 use the property to figure out if following
+                  // stats is fresh or not.
+                  ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics(
+                      tbl.getDbName(), tbl.getTableName(),part.getName(), colName)
+                      .getStatsObjIterator().next().getStatsData();
+                  Long nullCnt = getNullcountFor(type, statData);
+                  if(nullCnt == null) {
+                    Log.debug("Unsupported type: " + desc.getTypeString() + " encountered in " +
+                        "metadata optimizer for column : " + colName);
+                    return null;
+                  } else {
+                    rowCnt -= nullCnt;
+                  }
+                }
+              }
+            }
+            oneRow.add(rowCnt);
+            ois.add(PrimitiveObjectInspectorFactory.
+                getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+          } else if (aggr.getGenericUDAFName().equals(GenericUDAFMax.class.getAnnotation(
+              Description.class).name())) {
+            ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc)aggr.getParameters().get(0);
+            String colName = colDesc.getColumn();
+            StatType type = getType(colDesc.getTypeString());
+            if(!tbl.isPartitioned()) {
+              //TODO: After HIVE-3777 use the property to figure out if following
+              // stats is fresh or not.
+              ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics(
+                  tbl.getDbName(),tbl.getTableName(),colName).
+                  getStatsObjIterator().next().getStatsData();
+              switch (type) {
+              case Integeral:
+                oneRow.add(statData.getLongStats().getHighValue());
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+                break;
+              case Double:
+                oneRow.add(statData.getDoubleStats().getHighValue());
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+                break;
+              default:
+                // unsupported type
+                Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+                    "metadata optimizer for column : " + colName);
+                return null;
+              }
+            } else {
+              List<String> parts = hive.getMSC().listPartitionNames(tbl.getDbName(),
+                  tbl.getTableName(), (short)-1);
+              switch(type) {
+              case Integeral: {
+                long maxVal = Long.MIN_VALUE;
+                for (String part : parts) {
+                  //TODO: After HIVE-3777 use the property to figure out if following
+                  // stats is fresh or not.
+                  ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics(
+                      tbl.getDbName(),tbl.getTableName(), part, colName).
+                      getStatsObjIterator().next().getStatsData();
+                  maxVal = Math.max(maxVal,statData.getLongStats().getHighValue());
+                }
+                oneRow.add(maxVal);
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+                break;
+              }
+              case Double: {
+                // Double.MIN_VALUE is the smallest POSITIVE double, so it cannot seed a
+                // maximum over negative values; negative infinity is the identity for max.
+                double maxVal = Double.NEGATIVE_INFINITY;
+                for (String part : parts) {
+                  //TODO: After HIVE-3777 use the property to figure out if following
+                  // stats is fresh or not.
+                  ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics(
+                      tbl.getDbName(),tbl.getTableName(), part, colName).
+                      getStatsObjIterator().next().getStatsData();
+                  maxVal = Math.max(maxVal,statData.getDoubleStats().getHighValue());
+                }
+                oneRow.add(maxVal);
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+                break;
+              }
+              default:
+                Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+                    "metadata optimizer for column : " + colName);
+                return null;
+              }
+            }
+          }  else if (aggr.getGenericUDAFName().equals(GenericUDAFMin.class.getAnnotation(
+              Description.class).name())) {
+            ExprNodeColumnDesc colDesc = (ExprNodeColumnDesc)aggr.getParameters().get(0);
+            String colName = colDesc.getColumn();
+            StatType type = getType(colDesc.getTypeString());
+            if (!tbl.isPartitioned()) {
+              //TODO: After HIVE-3777 use the property to figure out if following
+              // stats is fresh or not.
+              ColumnStatisticsData statData = hive.getMSC().getTableColumnStatistics(
+                  tbl.getDbName(),tbl.getTableName(),colName).
+                  getStatsObjIterator().next().getStatsData();
+              switch (type) {
+              case Integeral:
+                oneRow.add(statData.getLongStats().getLowValue());
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+                break;
+              case Double:
+                oneRow.add(statData.getDoubleStats().getLowValue());
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+                break;
+              default: // unsupported type
+                Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+                    "metadata optimizer for column : " + colName);
+                return null;
+              }
+            } else {
+              List<String> parts = hive.getMSC().listPartitionNames(tbl.getDbName(),
+                  tbl.getTableName(), (short)-1);
+              switch(type) {
+              case Integeral: {
+                long minVal = Long.MAX_VALUE;
+                for (String part : parts) {
+                  //TODO: After HIVE-3777 use the property to figure out if following
+                  // stats is fresh or not.
+                  ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics(
+                      tbl.getDbName(),tbl.getTableName(), part, colName).
+                      getStatsObjIterator().next().getStatsData();
+                  minVal = Math.min(minVal,statData.getLongStats().getLowValue());
+                }
+                oneRow.add(minVal);
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.LONG));
+                break;
+              }
+              case Double: {
+                double minVal = Double.MAX_VALUE;
+                for (String part : parts) {
+                  //TODO: After HIVE-3777 use the property to figure out if following
+                  // stats is fresh or not.
+                  ColumnStatisticsData statData = hive.getMSC().getPartitionColumnStatistics(
+                      tbl.getDbName(),tbl.getTableName(), part, colName).
+                      getStatsObjIterator().next().getStatsData();
+                  minVal = Math.min(minVal,statData.getDoubleStats().getLowValue());
+                }
+                oneRow.add(minVal);
+                ois.add(PrimitiveObjectInspectorFactory.
+                    getPrimitiveJavaObjectInspector(PrimitiveCategory.DOUBLE));
+                break;
+              }
+              default: // unsupported type
+                Log.debug("Unsupported type: " + colDesc.getTypeString() + " encountered in " +
+                    "metadata optimizer for column : " + colName);
+                return null;
+
+              }
+            }
+          } else { // Unsupported aggregation.
+            Log.debug("Unsupported aggregation for metadata optimizer: "
+                + aggr.getGenericUDAFName());
+            return null;
+          }
+        }
+      } catch (Exception e) {
+        // this is best effort optimization, bail out in error conditions and
+        // try generate and execute slower plan
+        Log.debug("Failed to optimize using metadata optimizer", e);
+        return null;
+      }
+
+      List<List<Object>> allRows = new ArrayList<List<Object>>();
+      allRows.add(oneRow);
+
+      // Output column names are taken from the schema of the first group-by operator.
+      List<String> colNames = new ArrayList<String>();
+      for (ColumnInfo colInfo: gbyOp.getSchema().getSignature()) {
+        colNames.add(colInfo.getInternalName());
+      }
+      StandardStructObjectInspector sOI = ObjectInspectorFactory.
+          getStandardStructObjectInspector(colNames, ois);
+      FetchWork fWork = new FetchWork(allRows, sOI);
+      FetchTask fTask = (FetchTask)TaskFactory.get(fWork, pctx.getConf());
+      fWork.setLimit(allRows.size());
+      pctx.setFetchTask(fTask);
+
+      return null;
+    }
+  }
+}

Modified: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
 (original)
+++ 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/optimizer/ppr/PartitionPruner.java
 Thu Oct 31 21:22:02 2013
@@ -337,7 +337,7 @@ public class PartitionPruner implements 
   private static Set<Partition> getAllPartitions(Table tab) throws 
HiveException {
     PerfLogger perfLogger = PerfLogger.getPerfLogger();
     perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
-    Set<Partition> result = Hive.get().getAllPartitionsForPruner(tab);
+    Set<Partition> result = Hive.get().getAllPartitionsOf(tab);
     perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.PARTITION_RETRIEVING);
     return result;
   }

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/plan/FetchWork.java Thu 
Oct 31 21:22:02 2013
@@ -27,6 +27,7 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.exec.ListSinkOperator;
 import org.apache.hadoop.hive.ql.exec.Operator;
 import org.apache.hadoop.hive.ql.parse.SplitSample;
+import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
 
 /**
  * FetchWork.
@@ -50,6 +51,9 @@ public class FetchWork implements Serial
 
   private SplitSample splitSample;
 
+  private transient List<List<Object>> rowsComputedFromStats;
+  private transient ObjectInspector statRowOI;
+
   /**
    * Serialization Null Format for the serde used to fetch data.
    */
@@ -58,6 +62,19 @@ public class FetchWork implements Serial
   public FetchWork() {
   }
 
+  public FetchWork(List<List<Object>> rowsComputedFromStats,ObjectInspector 
statRowOI) {
+    this.rowsComputedFromStats = rowsComputedFromStats;
+    this.statRowOI = statRowOI;
+  }
+
+  public ObjectInspector getStatRowOI() {
+    return statRowOI;
+  }
+
+  public List<List<Object>> getRowsComputedUsingStats() {
+    return rowsComputedFromStats;
+  }
+
   public FetchWork(String tblDir, TableDesc tblDesc) {
     this(tblDir, tblDesc, -1);
   }

Modified: 
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java?rev=1537667&r1=1537666&r2=1537667&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java 
(original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/session/SessionState.java 
Thu Oct 31 21:22:02 2013
@@ -33,8 +33,6 @@ import java.util.Map;
 import java.util.Set;
 import java.util.UUID;
 
-import javax.security.auth.login.LoginException;
-
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.commons.logging.Log;
@@ -281,7 +279,7 @@ public class SessionState {
     // Get the following out of the way when you start the session these take a
     // while and should be done when we start up.
     try {
-      Hive.get(startSs.conf).setupMSC();
+      Hive.get(startSs.conf).getMSC();
       ShimLoader.getHadoopShims().getUGIForConf(startSs.conf);
       FileSystem.get(startSs.conf);
     } catch (Exception e) {
@@ -289,7 +287,7 @@ public class SessionState {
       // that would cause ClassNoFoundException otherwise
       throw new RuntimeException(e);
     }
-    
+
     try {
       startSs.authenticator = HiveUtils.getAuthenticator(
           startSs.getConf(),HiveConf.ConfVars.HIVE_AUTHENTICATOR_MANAGER);

Added: hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q?rev=1537667&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/metadata_only_queries.q Thu Oct 31 21:22:02 2013
@@ -0,0 +1,75 @@
+set hive.compute.query.using.stats=true;
+set hive.stats.autogather=true;
+create table over10k(
+           t tinyint,
+           si smallint,
+           i int,
+           b bigint,
+           f float,
+           d double,
+           bo boolean,
+           s string,
+          ts timestamp, 
+           dec decimal,  
+           bin binary)
+       row format delimited
+       fields terminated by '|';
+
+load data local inpath '../data/files/over10k' into table over10k;
+
+create table stats_tbl(
+           t tinyint,
+           si smallint,
+           i int,
+           b bigint,
+           f float,
+           d double,
+           bo boolean,
+           s string,
+          ts timestamp, 
+           dec decimal,  
+           bin binary);
+
+create table stats_tbl_part(
+           t tinyint,
+           si smallint,
+           i int,
+           b bigint,
+           f float,
+           d double,
+           bo boolean,
+           s string,
+          ts timestamp, 
+           dec decimal,  
+           bin binary) partitioned by (dt string);
+
+
+insert overwrite table stats_tbl select * from over10k;
+
+insert into table stats_tbl_part partition (dt='2010') select * from over10k where t>0 and t<30;
+insert into table stats_tbl_part partition (dt='2011') select * from over10k where t>30 and t<60;
+insert into table stats_tbl_part partition (dt='2012') select * from over10k where t>60;
+
+explain 
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b) from stats_tbl;
+explain
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b) from stats_tbl_part;
+
+analyze table stats_tbl compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+analyze table stats_tbl_part partition(dt='2010') compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+analyze table stats_tbl_part partition(dt='2011') compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+analyze table stats_tbl_part partition(dt='2012') compute statistics for columns t,si,i,b,f,d,bo,s,bin;
+
+explain 
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl;
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl;
+explain 
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl_part;
+select count(*), count(1), count(s), count(bo), count(bin), count(si), max(i), min(b), max(f), min(d) from stats_tbl_part;
+
+explain select count(ts) from stats_tbl_part;
+
+drop table stats_tbl;
+drop table stats_tbl_part;
+
+set hive.compute.query.using.stats=false;

Added: hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q?rev=1537667&view=auto
==============================================================================
--- hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q (added)
+++ hive/trunk/ql/src/test/queries/clientpositive/stats_only_null.q Thu Oct 31 21:22:02 2013
@@ -0,0 +1,36 @@
+set hive.compute.query.using.stats=true;
+set hive.stats.autogather=true;
+CREATE TABLE temps_null(a double, b int, c STRING, d smallint) STORED AS TEXTFILE; 
+
+CREATE TABLE stats_null(a double, b int, c STRING, d smallint) STORED AS TEXTFILE; 
+
+CREATE TABLE stats_null_part(a double, b int, c STRING, d smallint) partitioned by (dt string) STORED AS TEXTFILE; 
+
+LOAD DATA LOCAL INPATH '../data/files/null.txt' INTO TABLE temps_null;
+
+insert overwrite table stats_null select * from temps_null;
+insert overwrite table stats_null_part partition(dt='2010') select * from temps_null where d <=5;
+
+insert overwrite table stats_null_part partition(dt='2011') select * from temps_null where d > 5;
+explain 
+select count(*), count(a), count(b), count(c), count(d) from stats_null;
+explain 
+select count(*), count(a), count(b), count(c), count(d) from stats_null_part;
+
+
+analyze table stats_null compute statistics for columns a,b,c,d;
+analyze table stats_null_part partition(dt='2010') compute statistics for columns a,b,c,d;
+analyze table stats_null_part partition(dt='2011') compute statistics for columns a,b,c,d;
+
+explain 
+select count(*), count(a), count(b), count(c), count(d) from stats_null;
+explain 
+select count(*), count(a), count(b), count(c), count(d) from stats_null_part;
+
+
+select count(*), count(a), count(b), count(c), count(d) from stats_null;
+select count(*), count(a), count(b), count(c), count(d) from stats_null_part;
+drop table stats_null;
+drop table stats_null_part;
+drop table temps_null;
+set hive.compute.query.using.stats=false;

svn commit: r1537667 [1/3] - in /hive/trunk: common/src/java/org/apache/hadoop/hive/conf/ conf/ ql/src/java/org/apache/hadoop/hive/ql/exec/ ql/src/java/org/apache/hadoop/hive/ql/metadata/ ql/src/java/org/apache/hadoop/hive/ql/optimizer/ ql/src/java/org...

Reply via email to