http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java
new file mode 100644
index 0000000..6fae3e5
--- /dev/null
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/DateColumnStatsAggregator.java
@@ -0,0 +1,414 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.Date;
+import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Aggregates the per-partition statistics of a DATE column into a single
+ * {@link ColumnStatisticsObj}. NDV bit vectors are merged when every partition
+ * provides a mergeable one; statistics of partitions without stats are
+ * extrapolated from the partitions that have them.
+ */
+public class DateColumnStatsAggregator extends ColumnStatsAggregator implements
+    IExtrapolatePartStatus {
+
+  private static final Logger LOG = LoggerFactory.getLogger(DateColumnStatsAggregator.class);
+
+  /**
+   * Aggregates the statistics of one column over a set of partitions.
+   *
+   * @param colName name of the column being aggregated
+   * @param partNames names of all partitions covered by the request
+   * @param css statistics of the partitions that have them, each holding
+   *          exactly one column
+   * @return the aggregated, or extrapolated, column statistics
+   * @throws MetaException if css is empty or an entry does not hold exactly
+   *           one column
+   */
+  @Override
+  public ColumnStatisticsObj aggregate(String colName, List<String> partNames,
+      List<ColumnStatistics> css) throws MetaException {
+    ColumnStatisticsObj statsObj = null;
+
+    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
+    // bitvectors
+    boolean doAllPartitionContainStats = partNames.size() == css.size();
+    LOG.debug("doAllPartitionContainStats for {} is {}", colName, doAllPartitionContainStats);
+    NumDistinctValueEstimator ndvEstimator = null;
+    String colType = null;
+    for (ColumnStatistics cs : css) {
+      if (cs.getStatsObjSize() != 1) {
+        throw new MetaException(
+            "The number of columns should be exactly one in aggrStats, but found "
+                + cs.getStatsObjSize());
+      }
+      ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+      if (statsObj == null) {
+        colType = cso.getColType();
+        statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
+            .getStatsData().getSetField());
+      }
+      if (!cso.getStatsData().getDateStats().isSetBitVectors()
+          || cso.getStatsData().getDateStats().getBitVectors().length() == 0) {
+        ndvEstimator = null;
+        break;
+      } else {
+        // check if all of the bit vectors can merge
+        NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(cso.getStatsData().getDateStats().getBitVectors());
+        if (ndvEstimator == null) {
+          ndvEstimator = estimator;
+        } else {
+          if (ndvEstimator.canMerge(estimator)) {
+            continue;
+          } else {
+            ndvEstimator = null;
+            break;
+          }
+        }
+      }
+    }
+    if (statsObj == null) {
+      // css was empty: fail with a descriptive error instead of a later NPE
+      throw new MetaException("No column statistics to aggregate for column " + colName);
+    }
+    if (ndvEstimator != null) {
+      ndvEstimator = NumDistinctValueEstimatorFactory
+          .getEmptyNumDistinctValueEstimator(ndvEstimator);
+    }
+    LOG.debug("all of the bit vectors can merge for {} is {}", colName, ndvEstimator != null);
+    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
+    if (doAllPartitionContainStats || css.size() < 2) {
+      DateColumnStatsData aggregateData = null;
+      long lowerBound = 0;
+      long higherBound = 0;
+      double densityAvgSum = 0.0;
+      for (ColumnStatistics cs : css) {
+        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+        DateColumnStatsData newData = cso.getStatsData().getDateStats();
+        lowerBound = Math.max(lowerBound, newData.getNumDVs());
+        higherBound += newData.getNumDVs();
+        if (useDensityFunctionForNDVEstimation) {
+          densityAvgSum += density(newData);
+        }
+        if (ndvEstimator != null) {
+          ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+              .getNumDistinctValueEstimator(newData.getBitVectors()));
+        }
+        if (aggregateData == null) {
+          aggregateData = newData.deepCopy();
+        } else {
+          aggregateData.setLowValue(min(aggregateData.getLowValue(), newData.getLowValue()));
+          aggregateData
+              .setHighValue(max(aggregateData.getHighValue(), newData.getHighValue()));
+          aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
+          aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
+        }
+      }
+      if (ndvEstimator != null) {
+        // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
+        // use uniform distribution assumption because we can merge bitvectors
+        // to get a good estimation.
+        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
+      } else {
+        long estimation;
+        if (useDensityFunctionForNDVEstimation) {
+          // We have estimation, lowerbound and higherbound. We use estimation
+          // if it is between lowerbound and higherbound.
+          double densityAvg = densityAvgSum / partNames.size();
+          // densityAvg may be 0 (e.g. every partition is single-valued); the double
+          // division then yields Infinity/NaN, which the clamp below bounds.
+          estimation = (long) (diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / densityAvg);
+          if (estimation < lowerBound) {
+            estimation = lowerBound;
+          } else if (estimation > higherBound) {
+            estimation = higherBound;
+          }
+        } else {
+          estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
+        }
+        aggregateData.setNumDVs(estimation);
+      }
+      columnStatisticsData.setDateStats(aggregateData);
+    } else {
+      // we need extrapolation
+      LOG.debug("start extrapolation for {}", colName);
+
+      Map<String, Integer> indexMap = new HashMap<String, Integer>();
+      for (int index = 0; index < partNames.size(); index++) {
+        indexMap.put(partNames.get(index), index);
+      }
+      Map<String, Double> adjustedIndexMap = new HashMap<String, Double>();
+      Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<String, ColumnStatisticsData>();
+      // while we scan the css, we also get the densityAvg, lowerbound and
+      // higherbound when useDensityFunctionForNDVEstimation is true.
+      double densityAvgSum = 0.0;
+      if (ndvEstimator == null) {
+        // if not every partition uses bitvector for ndv, we just fall back to
+        // the traditional extrapolation methods.
+        for (ColumnStatistics cs : css) {
+          String partName = cs.getStatsDesc().getPartName();
+          ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+          DateColumnStatsData newData = cso.getStatsData().getDateStats();
+          if (useDensityFunctionForNDVEstimation) {
+            densityAvgSum += density(newData);
+          }
+          adjustedIndexMap.put(partName, (double) indexMap.get(partName));
+          adjustedStatsMap.put(partName, cso.getStatsData());
+        }
+      } else {
+        // we first merge all the adjacent bitvectors that we could merge and
+        // derive new partition names and index.
+        StringBuilder pseudoPartName = new StringBuilder();
+        double pseudoIndexSum = 0;
+        int length = 0;
+        int curIndex = -1;
+        DateColumnStatsData aggregateData = null;
+        for (ColumnStatistics cs : css) {
+          String partName = cs.getStatsDesc().getPartName();
+          ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+          DateColumnStatsData newData = cso.getStatsData().getDateStats();
+          // newData.isSetBitVectors() should be true for sure because we
+          // already checked it before.
+          if (indexMap.get(partName) != curIndex) {
+            // There is bitvector, but it is not adjacent to the previous ones.
+            if (length > 0) {
+              // we have to set ndv
+              adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
+              aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
+              ColumnStatisticsData csd = new ColumnStatisticsData();
+              csd.setDateStats(aggregateData);
+              adjustedStatsMap.put(pseudoPartName.toString(), csd);
+              if (useDensityFunctionForNDVEstimation) {
+                densityAvgSum += density(aggregateData);
+              }
+              // reset everything
+              pseudoPartName = new StringBuilder();
+              pseudoIndexSum = 0;
+              length = 0;
+              ndvEstimator = NumDistinctValueEstimatorFactory
+                  .getEmptyNumDistinctValueEstimator(ndvEstimator);
+            }
+            aggregateData = null;
+          }
+          curIndex = indexMap.get(partName);
+          pseudoPartName.append(partName);
+          pseudoIndexSum += curIndex;
+          length++;
+          curIndex++;
+          if (aggregateData == null) {
+            aggregateData = newData.deepCopy();
+          } else {
+            aggregateData.setLowValue(min(aggregateData.getLowValue(), newData.getLowValue()));
+            aggregateData.setHighValue(max(aggregateData.getHighValue(), newData.getHighValue()));
+            aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
+          }
+          ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+              .getNumDistinctValueEstimator(newData.getBitVectors()));
+        }
+        if (length > 0) {
+          // we have to set ndv
+          adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
+          aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
+          ColumnStatisticsData csd = new ColumnStatisticsData();
+          csd.setDateStats(aggregateData);
+          adjustedStatsMap.put(pseudoPartName.toString(), csd);
+          if (useDensityFunctionForNDVEstimation) {
+            densityAvgSum += density(aggregateData);
+          }
+        }
+      }
+      extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap,
+          adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
+    }
+    statsObj.setStatsData(columnStatisticsData);
+    LOG.debug("Ndv estimatation for {} is {}", colName,
+        columnStatisticsData.getDateStats().getNumDVs());
+    return statsObj;
+  }
+
+  /** Returns the number of days from d2 to d1 (d1 - d2). */
+  private long diff(Date d1, Date d2) {
+    return d1.getDaysSinceEpoch() - d2.getDaysSinceEpoch();
+  }
+
+  /**
+   * Average gap, in days, between the distinct values of the given stats.
+   * Computed in floating point, and 0 when no distinct values are reported,
+   * so that it neither truncates nor divides by zero.
+   */
+  private double density(DateColumnStatsData data) {
+    long numDVs = data.getNumDVs();
+    if (numDVs == 0) {
+      return 0.0;
+    }
+    return (double) diff(data.getHighValue(), data.getLowValue()) / numDVs;
+  }
+
+  /** Returns d1 when it compares lower than d2, otherwise d2. */
+  private Date min(Date d1, Date d2) {
+    return d1.compareTo(d2) < 0 ? d1 : d2;
+  }
+
+  /** Returns d2 when d1 compares lower than d2, otherwise d1. */
+  private Date max(Date d1, Date d2) {
+    return d1.compareTo(d2) < 0 ? d2 : d1;
+  }
+
+  /**
+   * Extrapolates low/high value, null count and NDV to all partitions from
+   * the partitions (or merged partition groups) that have statistics.
+   *
+   * @param extrapolateData receives the extrapolated date statistics
+   * @param numParts total number of partitions
+   * @param numPartsWithStats number of partitions that have statistics
+   * @param adjustedIndexMap index of each entry of adjustedStatsMap
+   * @param adjustedStatsMap statistics keyed by (pseudo) partition name
+   * @param densityAvg average density, used for the NDV when enabled and non-zero
+   */
+  @Override
+  public void extrapolate(ColumnStatisticsData extrapolateData, int numParts,
+      int numPartsWithStats, Map<String, Double> adjustedIndexMap,
+      Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
+    int rightBorderInd = numParts;
+    DateColumnStatsData extrapolateDateData = new DateColumnStatsData();
+    Map<String, DateColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
+    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
+      extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDateStats());
+    }
+    List<Map.Entry<String, DateColumnStatsData>> list = new LinkedList<Map.Entry<String, DateColumnStatsData>>(
+        extractedAdjustedStatsMap.entrySet());
+    // get the lowValue
+    Collections.sort(list, new Comparator<Map.Entry<String, DateColumnStatsData>>() {
+      @Override
+      public int compare(Map.Entry<String, DateColumnStatsData> o1,
+          Map.Entry<String, DateColumnStatsData> o2) {
+        // Long.compare returns 0 for ties, as the Comparator contract requires
+        return Long.compare(o1.getValue().getLowValue().getDaysSinceEpoch(),
+            o2.getValue().getLowValue().getDaysSinceEpoch());
+      }
+    });
+    double minInd = adjustedIndexMap.get(list.get(0).getKey());
+    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
+    long lowValue = 0;
+    long min = list.get(0).getValue().getLowValue().getDaysSinceEpoch();
+    long max = list.get(list.size() - 1).getValue().getLowValue().getDaysSinceEpoch();
+    if (minInd == maxInd) {
+      lowValue = min;
+    } else if (minInd < maxInd) {
+      // left border is the min
+      lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd));
+    } else {
+      // right border is the min
+      lowValue = (long) (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
+    }
+
+    // get the highValue
+    Collections.sort(list, new Comparator<Map.Entry<String, DateColumnStatsData>>() {
+      @Override
+      public int compare(Map.Entry<String, DateColumnStatsData> o1,
+          Map.Entry<String, DateColumnStatsData> o2) {
+        return Long.compare(o1.getValue().getHighValue().getDaysSinceEpoch(),
+            o2.getValue().getHighValue().getDaysSinceEpoch());
+      }
+    });
+    minInd = adjustedIndexMap.get(list.get(0).getKey());
+    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
+    long highValue = 0;
+    min = list.get(0).getValue().getHighValue().getDaysSinceEpoch();
+    max = list.get(list.size() - 1).getValue().getHighValue().getDaysSinceEpoch();
+    if (minInd == maxInd) {
+      highValue = min;
+    } else if (minInd < maxInd) {
+      // right border is the max
+      highValue = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
+    } else {
+      // left border is the max
+      highValue = (long) (min + (max - min) * minInd / (minInd - maxInd));
+    }
+
+    // get the #nulls
+    long numNulls = 0;
+    for (Map.Entry<String, DateColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
+      numNulls += entry.getValue().getNumNulls();
+    }
+    // we scale up sumNulls based on the number of partitions
+    numNulls = numNulls * numParts / numPartsWithStats;
+
+    // get the ndv
+    long ndv = 0;
+    Collections.sort(list, new Comparator<Map.Entry<String, DateColumnStatsData>>() {
+      @Override
+      public int compare(Map.Entry<String, DateColumnStatsData> o1,
+          Map.Entry<String, DateColumnStatsData> o2) {
+        return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
+      }
+    });
+    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
+    long higherBound = 0;
+    for (Map.Entry<String, DateColumnStatsData> entry : list) {
+      higherBound += entry.getValue().getNumDVs();
+    }
+    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
+      ndv = (long) ((highValue - lowValue) / densityAvg);
+      if (ndv < lowerBound) {
+        ndv = lowerBound;
+      } else if (ndv > higherBound) {
+        ndv = higherBound;
+      }
+    } else {
+      minInd = adjustedIndexMap.get(list.get(0).getKey());
+      maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
+      min = list.get(0).getValue().getNumDVs();
+      max = list.get(list.size() - 1).getValue().getNumDVs();
+      if (minInd == maxInd) {
+        ndv = min;
+      } else if (minInd < maxInd) {
+        // right border is the max
+        ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
+      } else {
+        // left border is the max
+        ndv = (long) (min + (max - min) * minInd / (minInd - maxInd));
+      }
+    }
+    extrapolateDateData.setLowValue(new Date(lowValue));
+    extrapolateDateData.setHighValue(new Date(highValue));
+    extrapolateDateData.setNumNulls(numNulls);
+    extrapolateDateData.setNumDVs(ndv);
+    extrapolateData.setDateStats(extrapolateDateData);
+  }
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
new file mode 100644
index 0000000..2ea2fcc
--- /dev/null
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/columnstats/aggr/StringColumnStatsAggregator.java
@@ -0,0 +1,339 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.hadoop.hive.metastore.columnstats.aggr;
+
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator;
+import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory;
+import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
+import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
+import org.apache.hadoop.hive.metastore.api.MetaException;
+import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Aggregates the per-partition statistics of a STRING column into a single
+ * {@link ColumnStatisticsObj}. NDV bit vectors are merged when every partition
+ * provides a mergeable one; statistics of partitions without stats are
+ * extrapolated from the partitions that have them.
+ */
+public class StringColumnStatsAggregator extends ColumnStatsAggregator implements
+    IExtrapolatePartStatus {
+
+  private static final Logger LOG = LoggerFactory.getLogger(StringColumnStatsAggregator.class);
+
+  /**
+   * Aggregates the statistics of one column over a set of partitions.
+   *
+   * @param colName name of the column being aggregated
+   * @param partNames names of all partitions covered by the request
+   * @param css statistics of the partitions that have them, each holding
+   *          exactly one column
+   * @return the aggregated, or extrapolated, column statistics
+   * @throws MetaException if css is empty or an entry does not hold exactly
+   *           one column
+   */
+  @Override
+  public ColumnStatisticsObj aggregate(String colName, List<String> partNames,
+      List<ColumnStatistics> css) throws MetaException {
+    ColumnStatisticsObj statsObj = null;
+
+    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
+    // bitvectors. Only when both of the conditions are true, we merge bit
+    // vectors. Otherwise, just use the maximum function.
+    boolean doAllPartitionContainStats = partNames.size() == css.size();
+    LOG.debug("doAllPartitionContainStats for {} is {}", colName, doAllPartitionContainStats);
+    NumDistinctValueEstimator ndvEstimator = null;
+    String colType = null;
+    for (ColumnStatistics cs : css) {
+      if (cs.getStatsObjSize() != 1) {
+        throw new MetaException(
+            "The number of columns should be exactly one in aggrStats, but found "
+                + cs.getStatsObjSize());
+      }
+      ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+      if (statsObj == null) {
+        colType = cso.getColType();
+        statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso
+            .getStatsData().getSetField());
+      }
+      if (!cso.getStatsData().getStringStats().isSetBitVectors()
+          || cso.getStatsData().getStringStats().getBitVectors().length() == 0) {
+        ndvEstimator = null;
+        break;
+      } else {
+        // check if all of the bit vectors can merge
+        NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory
+            .getNumDistinctValueEstimator(cso.getStatsData().getStringStats().getBitVectors());
+        if (ndvEstimator == null) {
+          ndvEstimator = estimator;
+        } else {
+          if (ndvEstimator.canMerge(estimator)) {
+            continue;
+          } else {
+            ndvEstimator = null;
+            break;
+          }
+        }
+      }
+    }
+    if (statsObj == null) {
+      // css was empty: fail with a descriptive error instead of a later NPE
+      throw new MetaException("No column statistics to aggregate for column " + colName);
+    }
+    if (ndvEstimator != null) {
+      ndvEstimator = NumDistinctValueEstimatorFactory
+          .getEmptyNumDistinctValueEstimator(ndvEstimator);
+    }
+    LOG.debug("all of the bit vectors can merge for {} is {}", colName, ndvEstimator != null);
+    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
+    if (doAllPartitionContainStats || css.size() < 2) {
+      StringColumnStatsData aggregateData = null;
+      for (ColumnStatistics cs : css) {
+        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+        StringColumnStatsData newData = cso.getStatsData().getStringStats();
+        if (ndvEstimator != null) {
+          ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+              .getNumDistinctValueEstimator(newData.getBitVectors()));
+        }
+        if (aggregateData == null) {
+          aggregateData = newData.deepCopy();
+        } else {
+          aggregateData
+              .setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
+          aggregateData
+              .setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
+          aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
+          aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
+        }
+      }
+      if (ndvEstimator != null) {
+        // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
+        // use uniform distribution assumption because we can merge bitvectors
+        // to get a good estimation.
+        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
+      }
+      // otherwise aggregateData already has the ndv of the max of all
+      columnStatisticsData.setStringStats(aggregateData);
+    } else {
+      // we need extrapolation
+      LOG.debug("start extrapolation for {}", colName);
+
+      Map<String, Integer> indexMap = new HashMap<String, Integer>();
+      for (int index = 0; index < partNames.size(); index++) {
+        indexMap.put(partNames.get(index), index);
+      }
+      Map<String, Double> adjustedIndexMap = new HashMap<String, Double>();
+      Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<String, ColumnStatisticsData>();
+      if (ndvEstimator == null) {
+        // if not every partition uses bitvector for ndv, we just fall back to
+        // the traditional extrapolation methods.
+        for (ColumnStatistics cs : css) {
+          String partName = cs.getStatsDesc().getPartName();
+          ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+          adjustedIndexMap.put(partName, (double) indexMap.get(partName));
+          adjustedStatsMap.put(partName, cso.getStatsData());
+        }
+      } else {
+        // we first merge all the adjacent bitvectors that we could merge and
+        // derive new partition names and index.
+        StringBuilder pseudoPartName = new StringBuilder();
+        double pseudoIndexSum = 0;
+        int length = 0;
+        int curIndex = -1;
+        StringColumnStatsData aggregateData = null;
+        for (ColumnStatistics cs : css) {
+          String partName = cs.getStatsDesc().getPartName();
+          ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
+          StringColumnStatsData newData = cso.getStatsData().getStringStats();
+          // newData.isSetBitVectors() should be true for sure because we
+          // already checked it before.
+          if (indexMap.get(partName) != curIndex) {
+            // There is bitvector, but it is not adjacent to the previous ones.
+            if (length > 0) {
+              // we have to set ndv
+              adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
+              aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
+              ColumnStatisticsData csd = new ColumnStatisticsData();
+              csd.setStringStats(aggregateData);
+              adjustedStatsMap.put(pseudoPartName.toString(), csd);
+              // reset everything
+              pseudoPartName = new StringBuilder();
+              pseudoIndexSum = 0;
+              length = 0;
+              ndvEstimator = NumDistinctValueEstimatorFactory
+                  .getEmptyNumDistinctValueEstimator(ndvEstimator);
+            }
+            aggregateData = null;
+          }
+          curIndex = indexMap.get(partName);
+          pseudoPartName.append(partName);
+          pseudoIndexSum += curIndex;
+          length++;
+          curIndex++;
+          if (aggregateData == null) {
+            aggregateData = newData.deepCopy();
+          } else {
+            // use max, matching the aggregation done when all partitions have stats
+            aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(),
+                newData.getAvgColLen()));
+            aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(),
+                newData.getMaxColLen()));
+            aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
+          }
+          ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory
+              .getNumDistinctValueEstimator(newData.getBitVectors()));
+        }
+        if (length > 0) {
+          // we have to set ndv
+          adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
+          aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
+          ColumnStatisticsData csd = new ColumnStatisticsData();
+          csd.setStringStats(aggregateData);
+          adjustedStatsMap.put(pseudoPartName.toString(), csd);
+        }
+      }
+      extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap,
+          adjustedStatsMap, -1);
+    }
+    LOG.debug("Ndv estimatation for {} is {}", colName,
+        columnStatisticsData.getStringStats().getNumDVs());
+    statsObj.setStatsData(columnStatisticsData);
+    return statsObj;
+  }
+
+  /**
+   * Extrapolates average/max column length, null count and NDV to all
+   * partitions from the partitions (or merged partition groups) that have
+   * statistics.
+   *
+   * @param extrapolateData receives the extrapolated string statistics
+   * @param numParts total number of partitions
+   * @param numPartsWithStats number of partitions that have statistics
+   * @param adjustedIndexMap index of each entry of adjustedStatsMap
+   * @param adjustedStatsMap statistics keyed by (pseudo) partition name
+   * @param densityAvg not used for string columns
+   */
+  @Override
+  public void extrapolate(ColumnStatisticsData extrapolateData, int numParts,
+      int numPartsWithStats, Map<String, Double> adjustedIndexMap,
+      Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
+    int rightBorderInd = numParts;
+    StringColumnStatsData extrapolateStringData = new StringColumnStatsData();
+    Map<String, StringColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
+    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
+      extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getStringStats());
+    }
+    List<Map.Entry<String, StringColumnStatsData>> list = new LinkedList<Map.Entry<String, StringColumnStatsData>>(
+        extractedAdjustedStatsMap.entrySet());
+    // get the avgLen
+    Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() {
+      @Override
+      public int compare(Map.Entry<String, StringColumnStatsData> o1,
+          Map.Entry<String, StringColumnStatsData> o2) {
+        // Double.compare returns 0 for ties, as the Comparator contract requires
+        return Double.compare(o1.getValue().getAvgColLen(), o2.getValue().getAvgColLen());
+      }
+    });
+    double minInd = adjustedIndexMap.get(list.get(0).getKey());
+    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
+    double avgColLen = 0;
+    double min = list.get(0).getValue().getAvgColLen();
+    double max = list.get(list.size() - 1).getValue().getAvgColLen();
+    if (minInd == maxInd) {
+      avgColLen = min;
+    } else if (minInd < maxInd) {
+      // right border is the max
+      avgColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
+    } else {
+      // left border is the max
+      avgColLen = (min + (max - min) * minInd / (minInd - maxInd));
+    }
+
+    // get the maxLen
+    Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() {
+      @Override
+      public int compare(Map.Entry<String, StringColumnStatsData> o1,
+          Map.Entry<String, StringColumnStatsData> o2) {
+        return Long.compare(o1.getValue().getMaxColLen(), o2.getValue().getMaxColLen());
+      }
+    });
+    minInd = adjustedIndexMap.get(list.get(0).getKey());
+    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
+    double maxColLen = 0;
+    // interpolate between the max lengths (not the avg lengths) of the extremes
+    min = list.get(0).getValue().getMaxColLen();
+    max = list.get(list.size() - 1).getValue().getMaxColLen();
+    if (minInd == maxInd) {
+      maxColLen = min;
+    } else if (minInd < maxInd) {
+      // right border is the max
+      maxColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
+    } else {
+      // left border is the max
+      maxColLen = (min + (max - min) * minInd / (minInd - maxInd));
+    }
+
+    // get the #nulls
+    long numNulls = 0;
+    for (Map.Entry<String, StringColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
+      numNulls += entry.getValue().getNumNulls();
+    }
+    // we scale up sumNulls based on the number of partitions
+    numNulls = numNulls * numParts / numPartsWithStats;
+
+    // get the ndv
+    long ndv = 0;
+    Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() {
+      @Override
+      public int compare(Map.Entry<String, StringColumnStatsData> o1,
+          Map.Entry<String, StringColumnStatsData> o2) {
+        return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
+      }
+    });
+    minInd = adjustedIndexMap.get(list.get(0).getKey());
+    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
+    min = list.get(0).getValue().getNumDVs();
+    max = list.get(list.size() - 1).getValue().getNumDVs();
+    if (minInd == maxInd) {
+      ndv = (long) min;
+    } else if (minInd < maxInd) {
+      // right border is the max
+      ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
+    } else {
+      // left border is the max
+      ndv = (long) (min + (max - min) * minInd / (minInd - maxInd));
+    }
+    extrapolateStringData.setAvgColLen(avgColLen);
+    extrapolateStringData.setMaxColLen((long) maxColLen);
+    extrapolateStringData.setNumNulls(numNulls);
+    extrapolateStringData.setNumDVs(ndv);
+    extrapolateData.setStringStats(extrapolateStringData);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java index 0e11989..78a962a 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/StatsCache.java @@ -32,8 +32,8 @@ import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.api.AggrStats; import org.apache.hadoop.hive.metastore.api.ColumnStatistics; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; -import org.apache.hadoop.hive.metastore.hbase.stats.ColumnStatsAggregator; -import org.apache.hadoop.hive.metastore.hbase.stats.ColumnStatsAggregatorFactory; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator; +import
org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregatorFactory; import java.io.IOException; import java.security.MessageDigest; @@ -84,7 +84,10 @@ class StatsCache { .build(new CacheLoader<StatsCacheKey, AggrStats>() { @Override public AggrStats load(StatsCacheKey key) throws Exception { - boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION); + boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(conf, + HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION); + double ndvTuner = HiveConf.getFloatVar(conf, + HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER); HBaseReadWrite hrw = HBaseReadWrite.getInstance(); AggrStats aggrStats = hrw.getAggregatedStats(key.hashed); if (aggrStats == null) { @@ -100,7 +103,7 @@ class StatsCache { if (aggregator == null) { aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css.iterator() .next().getStatsObj().iterator().next().getStatsData().getSetField(), - useDensityFunctionForNDVEstimation); + useDensityFunctionForNDVEstimation, ndvTuner); } ColumnStatisticsObj statsObj = aggregator .aggregate(key.colName, key.partNames, css); http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java index d81d612..e6c836b 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BinaryColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.List; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java index e796df2..a34bc9f 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/BooleanColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.List; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java index 29a0539..a52e5e5 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.List; @@ -27,7 +27,7 @@ import org.apache.hadoop.hive.metastore.api.MetaException; public abstract class ColumnStatsAggregator { public boolean useDensityFunctionForNDVEstimation; - + public double ndvTuner; public abstract ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException; } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java index 568bf06..173e06f 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/ColumnStatsAggregatorFactory.java @@ -17,13 +17,14 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; +import org.apache.hadoop.hive.metastore.api.DateColumnStatsData; import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; @@ -34,7 +35,8 @@ public class ColumnStatsAggregatorFactory { private ColumnStatsAggregatorFactory() { } - public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, boolean useDensityFunctionForNDVEstimation) { + public static ColumnStatsAggregator getColumnStatsAggregator(_Fields type, + boolean useDensityFunctionForNDVEstimation, double ndvTuner) { ColumnStatsAggregator agg; switch (type) { case BOOLEAN_STATS: @@ -43,6 +45,9 @@ public class ColumnStatsAggregatorFactory { case LONG_STATS: agg = new LongColumnStatsAggregator(); break; + case DATE_STATS: + agg = new DateColumnStatsAggregator(); + break; case DOUBLE_STATS: agg = new DoubleColumnStatsAggregator(); break; @@ -59,6 +64,7 @@ public class ColumnStatsAggregatorFactory { throw new RuntimeException("Woh, bad. 
Unknown stats type " + type.toString()); } agg.useDensityFunctionForNDVEstimation = useDensityFunctionForNDVEstimation; + agg.ndvTuner = ndvTuner; return agg; } @@ -76,6 +82,10 @@ public class ColumnStatsAggregatorFactory { csd.setLongStats(new LongColumnStatsData()); break; + case DATE_STATS: + csd.setDateStats(new DateColumnStatsData()); + break; + case DOUBLE_STATS: csd.setDoubleStats(new DoubleColumnStatsData()); break; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java index 8eb64e0..5924c3e 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DecimalColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Collections; import java.util.Comparator; @@ -35,9 +35,13 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData; import org.apache.hadoop.hive.metastore.api.MetaException; import org.apache.hadoop.hive.metastore.hbase.HBaseUtils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implements IExtrapolatePartStatus { + + private static final Logger LOG = LoggerFactory.getLogger(DecimalColumnStatsAggregator.class); @Override public ColumnStatisticsObj aggregate(String colName, List<String> partNames, @@ -47,6 +51,7 @@ public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implemen // check if all the ColumnStatisticsObjs contain stats and all the ndv are // bitvectors boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.debug("doAllPartitionContainStats for " + colName + " is " + doAllPartitionContainStats); NumDistinctValueEstimator ndvEstimator = null; String colType = null; for (ColumnStatistics cs : css) { @@ -85,6 +90,7 @@ public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implemen ndvEstimator = NumDistinctValueEstimatorFactory .getEmptyNumDistinctValueEstimator(ndvEstimator); } + LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); if (doAllPartitionContainStats || css.size() < 2) { DecimalColumnStatsData aggregateData = null; @@ -94,12 +100,10 @@ public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implemen for (ColumnStatistics cs : css) { ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats(); - if 
(useDensityFunctionForNDVEstimation) { - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils - .getDoubleValue(newData.getLowValue())) / newData.getNumDVs(); - } + lowerBound = Math.max(lowerBound, newData.getNumDVs()); + higherBound += newData.getNumDVs(); + densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils + .getDoubleValue(newData.getLowValue())) / newData.getNumDVs(); if (ndvEstimator != null) { ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory .getNumDistinctValueEstimator(newData.getBitVectors())); @@ -129,28 +133,27 @@ public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implemen // to get a good estimation. aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); } else { + long estimation; if (useDensityFunctionForNDVEstimation) { // We have estimation, lowerbound and higherbound. We use estimation // if it is between lowerbound and higherbound. double densityAvg = densityAvgSum / partNames.size(); - long estimation = (long) ((HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils + estimation = (long) ((HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils .getDoubleValue(aggregateData.getLowValue())) / densityAvg); if (estimation < lowerBound) { - aggregateData.setNumDVs(lowerBound); + estimation = lowerBound; } else if (estimation > higherBound) { - aggregateData.setNumDVs(higherBound); - } else { - aggregateData.setNumDVs(estimation); + estimation = higherBound; } } else { - // Without useDensityFunctionForNDVEstimation, we just use the - // default one, which is the max of all the partitions and it is - // already done. 
+ estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); } + aggregateData.setNumDVs(estimation); } columnStatisticsData.setDecimalStats(aggregateData); } else { // we need extrapolation + LOG.debug("start extrapolation for " + colName); Map<String, Integer> indexMap = new HashMap<String, Integer>(); for (int index = 0; index < partNames.size(); index++) { indexMap.put(partNames.get(index), index); @@ -251,6 +254,8 @@ public class DecimalColumnStatsAggregator extends ColumnStatsAggregator implemen adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); } statsObj.setStatsData(columnStatisticsData); + LOG.debug("Ndv estimatation for " + colName + " is " + + columnStatisticsData.getDecimalStats().getNumDVs()); return statsObj; } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java index b6b8612..e55c412 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/DoubleColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Collections; import java.util.Comparator; @@ -33,10 +33,14 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData; import org.apache.hadoop.hive.metastore.api.MetaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implements IExtrapolatePartStatus { + private static final Logger LOG = LoggerFactory.getLogger(LongColumnStatsAggregator.class); + @Override public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException { @@ -45,6 +49,7 @@ public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implement // check if all the ColumnStatisticsObjs contain stats and all the ndv are // bitvectors boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.debug("doAllPartitionContainStats for " + colName + " is " + doAllPartitionContainStats); NumDistinctValueEstimator ndvEstimator = null; String colType = null; for (ColumnStatistics cs : css) { @@ -83,6 +88,7 @@ public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implement ndvEstimator = NumDistinctValueEstimatorFactory .getEmptyNumDistinctValueEstimator(ndvEstimator); } + LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); if (doAllPartitionContainStats || css.size() < 2) { DoubleColumnStatsData aggregateData = null; @@ -92,11 +98,9 @@ public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implement for (ColumnStatistics cs : css) { ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); DoubleColumnStatsData newData 
= cso.getStatsData().getDoubleStats(); - if (useDensityFunctionForNDVEstimation) { - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - } + lowerBound = Math.max(lowerBound, newData.getNumDVs()); + higherBound += newData.getNumDVs(); + densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); if (ndvEstimator != null) { ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory .getNumDistinctValueEstimator(newData.getBitVectors())); @@ -117,27 +121,26 @@ public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implement // to get a good estimation. aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); } else { + long estimation; if (useDensityFunctionForNDVEstimation) { // We have estimation, lowerbound and higherbound. We use estimation // if it is between lowerbound and higherbound. double densityAvg = densityAvgSum / partNames.size(); - long estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); + estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); if (estimation < lowerBound) { - aggregateData.setNumDVs(lowerBound); + estimation = lowerBound; } else if (estimation > higherBound) { - aggregateData.setNumDVs(higherBound); - } else { - aggregateData.setNumDVs(estimation); + estimation = higherBound; } } else { - // Without useDensityFunctionForNDVEstimation, we just use the - // default one, which is the max of all the partitions and it is - // already done. 
+ estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); } + aggregateData.setNumDVs(estimation); } columnStatisticsData.setDoubleStats(aggregateData); } else { // we need extrapolation + LOG.debug("start extrapolation for " + colName); Map<String, Integer> indexMap = new HashMap<String, Integer>(); for (int index = 0; index < partNames.size(); index++) { indexMap.put(partNames.get(index), index); @@ -225,6 +228,8 @@ public class DoubleColumnStatsAggregator extends ColumnStatsAggregator implement extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); } + LOG.debug("Ndv estimatation for " + colName + " is " + + columnStatisticsData.getDoubleStats().getNumDVs()); statsObj.setStatsData(columnStatisticsData); return statsObj; } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java index af75bce..acf679e 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/IExtrapolatePartStatus.java @@ -15,7 +15,7 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Map; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java index 2da6f60..2ee09f3 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/LongColumnStatsAggregator.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats; +package org.apache.hadoop.hive.metastore.columnstats.aggr; import java.util.Collections; import java.util.Comparator; @@ -33,10 +33,14 @@ import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.apache.hadoop.hive.metastore.api.LongColumnStatsData; import org.apache.hadoop.hive.metastore.api.MetaException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class LongColumnStatsAggregator extends ColumnStatsAggregator implements IExtrapolatePartStatus { + private static final Logger LOG = LoggerFactory.getLogger(LongColumnStatsAggregator.class); + @Override public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException { @@ -45,6 +49,7 @@ public class LongColumnStatsAggregator extends ColumnStatsAggregator implements // check if all the ColumnStatisticsObjs contain stats and all the ndv are // bitvectors boolean doAllPartitionContainStats = partNames.size() == css.size(); + LOG.debug("doAllPartitionContainStats for " + 
colName + " is " + doAllPartitionContainStats); NumDistinctValueEstimator ndvEstimator = null; String colType = null; for (ColumnStatistics cs : css) { @@ -83,6 +88,7 @@ public class LongColumnStatsAggregator extends ColumnStatsAggregator implements ndvEstimator = NumDistinctValueEstimatorFactory .getEmptyNumDistinctValueEstimator(ndvEstimator); } + LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null)); ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); if (doAllPartitionContainStats || css.size() < 2) { LongColumnStatsData aggregateData = null; @@ -92,11 +98,9 @@ public class LongColumnStatsAggregator extends ColumnStatsAggregator implements for (ColumnStatistics cs : css) { ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); LongColumnStatsData newData = cso.getStatsData().getLongStats(); - if (useDensityFunctionForNDVEstimation) { - lowerBound = Math.max(lowerBound, newData.getNumDVs()); - higherBound += newData.getNumDVs(); - densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); - } + lowerBound = Math.max(lowerBound, newData.getNumDVs()); + higherBound += newData.getNumDVs(); + densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs(); if (ndvEstimator != null) { ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory .getNumDistinctValueEstimator(newData.getBitVectors())); @@ -117,27 +121,27 @@ public class LongColumnStatsAggregator extends ColumnStatsAggregator implements // to get a good estimation. aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); } else { + long estimation; if (useDensityFunctionForNDVEstimation) { // We have estimation, lowerbound and higherbound. We use estimation // if it is between lowerbound and higherbound. 
double densityAvg = densityAvgSum / partNames.size(); - long estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); + estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg); if (estimation < lowerBound) { - aggregateData.setNumDVs(lowerBound); + estimation = lowerBound; } else if (estimation > higherBound) { - aggregateData.setNumDVs(higherBound); - } else { - aggregateData.setNumDVs(estimation); + estimation = higherBound; } } else { - // Without useDensityFunctionForNDVEstimation, we just use the - // default one, which is the max of all the partitions and it is - // already done. + estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner); } + aggregateData.setNumDVs(estimation); } columnStatisticsData.setLongStats(aggregateData); } else { // we need extrapolation + LOG.debug("start extrapolation for " + colName); + Map<String, Integer> indexMap = new HashMap<String, Integer>(); for (int index = 0; index < partNames.size(); index++) { indexMap.put(partNames.get(index), index); @@ -226,6 +230,8 @@ public class LongColumnStatsAggregator extends ColumnStatsAggregator implements adjustedStatsMap, densityAvgSum / adjustedStatsMap.size()); } statsObj.setStatsData(columnStatisticsData); + LOG.debug("Ndv estimatation for " + colName + " is " + + columnStatisticsData.getLongStats().getNumDVs()); return statsObj; } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java deleted file mode 100644 index 83c6c54..0000000 --- 
a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/StringColumnStatsAggregator.java +++ /dev/null @@ -1,122 +0,0 @@ -/** - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -package org.apache.hadoop.hive.metastore.hbase.stats; - -import java.util.List; - -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; -import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; -import org.apache.hadoop.hive.metastore.api.ColumnStatistics; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData; -import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; -import org.apache.hadoop.hive.metastore.api.MetaException; -import org.apache.hadoop.hive.metastore.api.StringColumnStatsData; - -public class StringColumnStatsAggregator extends ColumnStatsAggregator { - - @Override - public ColumnStatisticsObj aggregate(String colName, List<String> partNames, - List<ColumnStatistics> css) throws MetaException { - ColumnStatisticsObj statsObj = null; - - // check if all the ColumnStatisticsObjs contain stats and all the ndv are - // bitvectors. Only when both of the conditions are true, we merge bit - // vectors. Otherwise, just use the maximum function. 
- boolean doAllPartitionContainStats = partNames.size() == css.size(); - NumDistinctValueEstimator ndvEstimator = null; - String colType = null; - for (ColumnStatistics cs : css) { - if (cs.getStatsObjSize() != 1) { - throw new MetaException( - "The number of columns should be exactly one in aggrStats, but found " - + cs.getStatsObjSize()); - } - ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); - if (statsObj == null) { - colType = cso.getColType(); - statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso - .getStatsData().getSetField()); - } - if (!cso.getStatsData().getStringStats().isSetBitVectors() - || cso.getStatsData().getStringStats().getBitVectors().length() == 0) { - ndvEstimator = null; - break; - } else { - // check if all of the bit vectors can merge - NumDistinctValueEstimator estimator = NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(cso.getStatsData().getStringStats().getBitVectors()); - if (ndvEstimator == null) { - ndvEstimator = estimator; - } else { - if (ndvEstimator.canMerge(estimator)) { - continue; - } else { - ndvEstimator = null; - break; - } - } - } - } - if (ndvEstimator != null) { - ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator); - } - ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData(); - if (doAllPartitionContainStats && ndvEstimator!=null) { - StringColumnStatsData aggregateData = null; - for (ColumnStatistics cs : css) { - ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); - StringColumnStatsData newData = cso.getStatsData().getStringStats(); - ndvEstimator.mergeEstimators(NumDistinctValueEstimatorFactory - .getNumDistinctValueEstimator(newData.getBitVectors())); - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData - .setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen())); - aggregateData - 
.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - } - } - aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues()); - columnStatisticsData.setStringStats(aggregateData); - } else { - StringColumnStatsData aggregateData = null; - for (ColumnStatistics cs : css) { - ColumnStatisticsObj cso = cs.getStatsObjIterator().next(); - StringColumnStatsData newData = cso.getStatsData().getStringStats(); - if (aggregateData == null) { - aggregateData = newData.deepCopy(); - } else { - aggregateData - .setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen())); - aggregateData - .setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen())); - aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls()); - aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs())); - } - } - columnStatisticsData.setStringStats(aggregateData); - } - statsObj.setStatsData(columnStatisticsData); - return statsObj; - } - -} http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java index af0669e..4c2d1bc 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BinaryColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java index 33ff6a1..8e50153 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/BooleanColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java index d3051a2..474d4dd 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj; import org.slf4j.Logger; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java index c013ba5..0ce1847 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/ColumnStatsMergerFactory.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.commons.lang.StringUtils; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java index e899bfe..2542a00 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DateColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java index 4099ffc..4e8e129 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DecimalColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java index 1691fc9..4ef5c39 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/DoubleColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java index 361af35..acf7f03 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/LongColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. */ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java index 8e28f90..b3cd33c 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/hbase/stats/merge/StringColumnStatsMerger.java @@ -17,7 +17,7 @@ * under the License. 
*/ -package org.apache.hadoop.hive.metastore.hbase.stats.merge; +package org.apache.hadoop.hive.metastore.columnstats.merge; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java ---------------------------------------------------------------------- diff --git a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java index 2967a60..20129bb 100644 --- a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java +++ b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MPartitionColumnStatistics.java @@ -48,6 +48,7 @@ public class MPartitionColumnStatistics { private String decimalHighValue; private Long numNulls; private Long numDVs; + private byte[] bitVector; private Double avgColLen; private Long maxColLen; private Long numTrues; @@ -166,31 +167,35 @@ public class MPartitionColumnStatistics { this.numNulls = numNulls; } - public void setLongStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setLongStats(Long numNulls, Long numNDVs, byte[] bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } - public void setDoubleStats(Long numNulls, Long numNDVs, Double lowValue, Double highValue) { + public void setDoubleStats(Long numNulls, Long numNDVs, byte[] bitVector, Double lowValue, Double highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.doubleLowValue = lowValue; this.doubleHighValue = highValue; } public void setDecimalStats( - Long numNulls, Long numNDVs, String 
lowValue, String highValue) { + Long numNulls, Long numNDVs, byte[] bitVector, String lowValue, String highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.decimalLowValue = lowValue; this.decimalHighValue = highValue; } - public void setStringStats(Long numNulls, Long numNDVs, Long maxColLen, Double avgColLen) { + public void setStringStats(Long numNulls, Long numNDVs, byte[] bitVector, Long maxColLen, Double avgColLen) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.maxColLen = maxColLen; this.avgColLen = avgColLen; } @@ -201,9 +206,10 @@ public class MPartitionColumnStatistics { this.avgColLen = avgColLen; } - public void setDateStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setDateStats(Long numNulls, Long numNDVs, byte[] bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } @@ -255,4 +261,12 @@ public class MPartitionColumnStatistics { public void setDecimalHighValue(String decimalHighValue) { this.decimalHighValue = decimalHighValue; } + + public byte[] getBitVector() { + return bitVector; + } + + public void setBitVector(byte[] bitVector) { + this.bitVector = bitVector; + } } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java ---------------------------------------------------------------------- diff --git a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java index 132f7a1..6cfaca3 100644 --- a/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java +++ b/metastore/src/model/org/apache/hadoop/hive/metastore/model/MTableColumnStatistics.java @@ -46,6 +46,7 @@ public 
class MTableColumnStatistics { private String decimalHighValue; private Long numNulls; private Long numDVs; + private byte[] bitVector; private Double avgColLen; private Long maxColLen; private Long numTrues; @@ -156,31 +157,35 @@ public class MTableColumnStatistics { this.numNulls = numNulls; } - public void setLongStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setLongStats(Long numNulls, Long numNDVs, byte[] bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } - public void setDoubleStats(Long numNulls, Long numNDVs, Double lowValue, Double highValue) { + public void setDoubleStats(Long numNulls, Long numNDVs, byte[] bitVector, Double lowValue, Double highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.doubleLowValue = lowValue; this.doubleHighValue = highValue; } public void setDecimalStats( - Long numNulls, Long numNDVs, String lowValue, String highValue) { + Long numNulls, Long numNDVs, byte[] bitVector, String lowValue, String highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.decimalLowValue = lowValue; this.decimalHighValue = highValue; } - public void setStringStats(Long numNulls, Long numNDVs, Long maxColLen, Double avgColLen) { + public void setStringStats(Long numNulls, Long numNDVs, byte[] bitVector, Long maxColLen, Double avgColLen) { this.numNulls = numNulls; this.numDVs = numNDVs; + this.bitVector = bitVector; this.maxColLen = maxColLen; this.avgColLen = avgColLen; } @@ -191,9 +196,10 @@ public class MTableColumnStatistics { this.avgColLen = avgColLen; } - public void setDateStats(Long numNulls, Long numNDVs, Long lowValue, Long highValue) { + public void setDateStats(Long numNulls, Long numNDVs, byte[] bitVector, Long lowValue, Long highValue) { this.numNulls = numNulls; this.numDVs = numNDVs; 
+ this.bitVector = bitVector; this.longLowValue = lowValue; this.longHighValue = highValue; } @@ -246,4 +252,12 @@ public class MTableColumnStatistics { public void setDecimalHighValue(String decimalHighValue) { this.decimalHighValue = decimalHighValue; } + + public byte[] getBitVector() { + return bitVector; + } + + public void setBitVector(byte[] bitVector) { + this.bitVector = bitVector; + } } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/model/package.jdo ---------------------------------------------------------------------- diff --git a/metastore/src/model/package.jdo b/metastore/src/model/package.jdo index 9c4bc21..570fd44 100644 --- a/metastore/src/model/package.jdo +++ b/metastore/src/model/package.jdo @@ -879,6 +879,9 @@ <field name="numDVs"> <column name="NUM_DISTINCTS" jdbc-type="BIGINT" allows-null="true"/> </field> + <field name="bitVector"> + <column name="BIT_VECTOR" jdbc-type="BLOB" allows-null="true"/> + </field> <field name="avgColLen"> <column name="AVG_COL_LEN" jdbc-type="DOUBLE" allows-null="true"/> </field> @@ -943,6 +946,9 @@ <field name="numDVs"> <column name="NUM_DISTINCTS" jdbc-type="BIGINT" allows-null="true"/> </field> + <field name="bitVector"> + <column name="BIT_VECTOR" jdbc-type="BLOB" allows-null="true"/> + </field> <field name="avgColLen"> <column name="AVG_COL_LEN" jdbc-type="DOUBLE" allows-null="true"/> </field>