HIVE-16997: Extend object store to store and use bit vectors (Pengcheng Xiong, reviewed by Ashutosh Chauhan)
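At a high level, the change serializes each NDV estimator (FM sketch or HyperLogLog) to a Base64 string, stores it in the BIT_VECTOR column that the upgrade scripts below add to PART_COL_STATS, and reconstructs the right estimator through a factory that sniffs a two-byte 'FM' magic prefix. A minimal round-trip sketch against the APIs in this patch (addToEstimator is assumed from the NumDistinctValueEstimator interface, which this diff does not show):

    NumDistinctValueEstimator ndv = HyperLogLog.builder().build();
    for (long v = 0; v < 1000; v++) {
      ndv.addToEstimator(v);            // assumed interface method, not shown in this diff
    }
    String encoded = ndv.serialize();   // Base64 text; this is what lands in BIT_VECTOR
    NumDistinctValueEstimator restored =
        NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(encoded);
    long estimate = restored.estimateNumDistinctValues();

Because the serialized form is self-describing, code reading a stored bit vector never needs to know which algorithm produced it.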
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/f8b79fe6
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/f8b79fe6
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/f8b79fe6

Branch: refs/heads/master
Commit: f8b79fe6d136f348820ce81dc7a6883f1e70dcfc
Parents: 892841a
Author: Pengcheng Xiong <pxi...@apache.org>
Authored: Tue Jul 25 15:41:14 2017 -0700
Committer: Pengcheng Xiong <pxi...@apache.org>
Committed: Tue Jul 25 15:41:14 2017 -0700

----------------------------------------------------------------------
 .../apache/hadoop/hive/common/ndv/FMSketch.java |  117 +-
 .../ndv/NumDistinctValueEstimatorFactory.java   |   30 +-
 .../hive/common/ndv/fm/FMSketchUtils.java       |  133 ++++
 .../hadoop/hive/common/ndv/hll/HyperLogLog.java |    4 +-
 .../org/apache/hadoop/hive/conf/HiveConf.java   |    4 +-
 .../ndv/fm/TestFMSketchSerialization.java       |   97 +++
 data/conf/hive-site.xml                         |    4 +
 data/conf/llap/hive-site.xml                    |    4 +
 data/conf/perf-reg/hive-site.xml                |    5 +
 data/conf/tez/hive-site.xml                     |    5 +
 .../upgrade/derby/044-HIVE-16997.derby.sql      |    1 +
 .../upgrade/derby/hive-schema-3.0.0.derby.sql   |    2 +-
 .../derby/upgrade-2.3.0-to-3.0.0.derby.sql      |    1 +
 .../upgrade/mssql/029-HIVE-16997.mssql.sql      |    1 +
 .../upgrade/mssql/hive-schema-3.0.0.mssql.sql   |    1 +
 .../mssql/upgrade-2.3.0-to-3.0.0.mssql.sql      |    1 +
 .../upgrade/mysql/044-HIVE-16997.mysql.sql      |    1 +
 .../upgrade/mysql/hive-schema-3.0.0.mysql.sql   |    1 +
 .../mysql/upgrade-2.3.0-to-3.0.0.mysql.sql      |    1 +
 .../upgrade/oracle/044-HIVE-16997.oracle.sql    |    1 +
 .../upgrade/oracle/hive-schema-3.0.0.oracle.sql |    1 +
 .../oracle/upgrade-2.3.0-to-3.0.0.oracle.sql    |    1 +
 .../postgres/043-HIVE-16997.postgres.sql        |    1 +
 .../postgres/hive-schema-3.0.0.postgres.sql     |    1 +
 .../upgrade-2.3.0-to-3.0.0.postgres.sql         |    1 +
 .../hive/metastore/MetaStoreDirectSql.java      |   98 ++-
 .../hadoop/hive/metastore/MetaStoreUtils.java   |   49 +-
 .../hadoop/hive/metastore/ObjectStore.java      |   20 +-
 .../hive/metastore/StatObjectConverter.java     |   40 +-
 .../hive/metastore/cache/CachedStore.java       |   50 +-
 .../aggr/DateColumnStatsAggregator.java         |  358 +++++++++
 .../aggr/StringColumnStatsAggregator.java       |  303 ++++++++
 .../hadoop/hive/metastore/hbase/StatsCache.java |   11 +-
 .../stats/BinaryColumnStatsAggregator.java      |    2 +-
 .../stats/BooleanColumnStatsAggregator.java     |    2 +-
 .../hbase/stats/ColumnStatsAggregator.java      |    4 +-
 .../stats/ColumnStatsAggregatorFactory.java     |   14 +-
 .../stats/DecimalColumnStatsAggregator.java     |   35 +-
 .../stats/DoubleColumnStatsAggregator.java      |   33 +-
 .../hbase/stats/IExtrapolatePartStatus.java     |    2 +-
 .../hbase/stats/LongColumnStatsAggregator.java  |   34 +-
 .../stats/StringColumnStatsAggregator.java      |  122 ---
 .../stats/merge/BinaryColumnStatsMerger.java    |    2 +-
 .../stats/merge/BooleanColumnStatsMerger.java   |    2 +-
 .../hbase/stats/merge/ColumnStatsMerger.java    |    2 +-
 .../stats/merge/ColumnStatsMergerFactory.java   |    2 +-
 .../stats/merge/DateColumnStatsMerger.java      |    2 +-
 .../stats/merge/DecimalColumnStatsMerger.java   |    2 +-
 .../stats/merge/DoubleColumnStatsMerger.java    |    2 +-
 .../stats/merge/LongColumnStatsMerger.java      |    2 +-
 .../stats/merge/StringColumnStatsMerger.java    |    2 +-
 .../model/MPartitionColumnStatistics.java       |   24 +-
 .../metastore/model/MTableColumnStatistics.java |   24 +-
 metastore/src/model/package.jdo                 |    6 +
 .../hadoop/hive/metastore/TestOldSchema.java    |  229 ++++++
 .../hive/metastore/cache/TestCachedStore.java   |  156 ++++
 ...stHBaseAggregateStatsCacheWithBitVector.java |   13 +-
 .../TestHBaseAggregateStatsExtrapolation.java   |   11 +-
 .../TestHBaseAggregateStatsNDVUniformDist.java  |   15 +-
 .../org/apache/hadoop/hive/ql/exec/DDLTask.java |    2 +-
 .../formatting/MetaDataFormatUtils.java         |   19 +-
 .../hadoop/hive/ql/plan/ColStatistics.java      |    3 -
 .../hadoop/hive/ql/plan/DescTableDesc.java      |    4 +-
 .../ql/udf/generic/GenericUDAFComputeStats.java |    2 +-
 ...lter_table_update_status_disable_bitvector.q |  139 ++++
 ql/src/test/queries/clientpositive/bitvector.q  |    3 +
 ql/src/test/queries/clientpositive/fm-sketch.q  |   58 ++
 ql/src/test/queries/clientpositive/hll.q        |   11 +
 .../clientpositive/alterColumnStats.q.out       |   16 +-
 .../clientpositive/alterColumnStatsPart.q.out   |    6 +-
 .../alter_partition_update_status.q.out         |   40 +-
 .../alter_table_column_stats.q.out              |  764 +++++++++++--------
 .../alter_table_update_status.q.out             |  394 +++++-----
 ..._table_update_status_disable_bitvector.q.out |  708 +++++++++++++++++
 .../clientpositive/analyze_tbl_part.q.out       |   24 +-
 .../clientpositive/autoColumnStats_5.q.out      |   36 +-
 .../clientpositive/autoColumnStats_9.q.out      |   58 +-
 .../results/clientpositive/avro_decimal.q.out   |    9 +-
 .../clientpositive/avro_decimal_native.q.out    |    9 +-
 .../test/results/clientpositive/bitvector.q.out |   31 +
 .../test/results/clientpositive/char_udf1.q.out |    2 +-
 .../clientpositive/colstats_all_nulls.q.out     |   18 +-
 ...names_with_leading_and_trailing_spaces.q.out |   17 +-
 .../column_pruner_multiple_children.q.out       |   18 +-
 .../clientpositive/columnstats_partlvl.q.out    |   46 +-
 .../clientpositive/columnstats_partlvl_dp.q.out |   70 +-
 .../clientpositive/columnstats_tbllvl.q.out     |   78 +-
 .../results/clientpositive/compustat_avro.q.out |   16 +-
 .../clientpositive/compute_stats_date.q.out     |   20 +-
 .../clientpositive/compute_stats_decimal.q.out  |    2 +-
 .../clientpositive/compute_stats_double.q.out   |    2 +-
 .../clientpositive/compute_stats_long.q.out     |    2 +-
 .../clientpositive/compute_stats_string.q.out   |    2 +-
 .../confirm_initial_tbl_stats.q.out             |  223 ++++--
 .../results/clientpositive/decimal_stats.q.out  |    9 +-
 .../results/clientpositive/deleteAnalyze.q.out  |    8 +-
 .../clientpositive/describe_syntax.q.out        |   24 +-
 .../results/clientpositive/describe_table.q.out |   74 +-
 .../display_colstats_tbllvl.q.out               |  111 +--
 .../encrypted/encryption_move_tbl.q.out         |  116 ++-
 .../extrapolate_part_stats_full.q.out           |    7 +-
 .../extrapolate_part_stats_partial.q.out        |   22 +-
 .../test/results/clientpositive/fm-sketch.q.out |  333 ++++++++
 ql/src/test/results/clientpositive/hll.q.out    |  181 ++++-
 .../clientpositive/llap/autoColumnStats_2.q.out |  233 +++++-
 ...names_with_leading_and_trailing_spaces.q.out |   17 +-
 .../llap/columnstats_part_coltype.q.out         |  165 ++--
 .../clientpositive/llap/deleteAnalyze.q.out     |    8 +-
 .../extrapolate_part_stats_partial_ndv.q.out    |  154 ++--
 .../results/clientpositive/llap/llap_smb.q.out  |    4 +-
 .../clientpositive/llap/stats_only_null.q.out   |    7 +-
 .../clientpositive/llap/subquery_scalar.q.out   |   10 +-
 .../clientpositive/llap/varchar_udf1.q.out      |    2 +-
 .../clientpositive/llap/vector_udf1.q.out       |    2 +-
 .../clientpositive/partial_column_stats.q.out   |    8 +-
 .../partition_coltype_literals.q.out            |   48 +-
 .../reduceSinkDeDuplication_pRS_key_empty.q.out |    6 +-
 .../rename_external_partition_location.q.out    |   28 +-
 .../rename_table_update_column_stats.q.out      |  108 +--
 .../spark/avro_decimal_native.q.out             |    8 +-
 .../spark/spark_dynamic_partition_pruning.q.out |    8 +-
 .../clientpositive/spark/stats_only_null.q.out  |    6 +-
 .../clientpositive/stats_only_null.q.out        |    7 +-
 .../temp_table_display_colstats_tbllvl.q.out    |   99 ++-
 .../clientpositive/tez/explainanalyze_5.q.out   |    6 +-
.../clientpositive/tez/explainuser_3.q.out | 6 +- .../results/clientpositive/tunable_ndv.q.out | 68 +- 127 files changed, 5217 insertions(+), 1620 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java b/common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java index e20d299..160ce66 100644 --- a/common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java +++ b/common/src/java/org/apache/hadoop/hive/common/ndv/FMSketch.java @@ -15,22 +15,28 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.hadoop.hive.common.ndv; +package org.apache.hadoop.hive.common.ndv.fm; +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; import java.util.Random; import javolution.util.FastBitSet; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.apache.commons.lang.StringUtils; +import org.apache.commons.codec.binary.Base64; import org.apache.hadoop.hive.common.classification.InterfaceAudience; +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator; import org.apache.hadoop.hive.common.type.HiveDecimal; import org.apache.hadoop.hive.ql.util.JavaDataModel; public class FMSketch implements NumDistinctValueEstimator{ static final Logger LOG = LoggerFactory.getLogger(FMSketch.class.getName()); + public static final byte[] MAGIC = new byte[] { 'F', 'M' }; /* We want a,b,x to come from a finite field of size 0 to k, where k is a prime number. * 2^p - 1 is prime for p = 31. Hence bitvectorSize has to be 31. Pick k to be 2^p -1. @@ -38,7 +44,7 @@ public class FMSketch implements NumDistinctValueEstimator{ * independent. As a consequence, the hash values will not distribute uniformly from 0 to 2^p-1 * thus introducing errors in the estimates. */ - private static final int BIT_VECTOR_SIZE = 31; + public static final int BIT_VECTOR_SIZE = 31; // Refer to Flajolet-Martin'86 for the value of phi private static final double PHI = 0.77351; @@ -111,27 +117,6 @@ public class FMSketch implements NumDistinctValueEstimator{ } } - public FMSketch(String s, int numBitVectors) { - this.numBitVectors = numBitVectors; - FastBitSet bitVectorDeser[] = genBitSet(s, numBitVectors); - bitVector = new FastBitSet[numBitVectors]; - for(int i=0; i <numBitVectors; i++) { - bitVector[i] = new FastBitSet(BIT_VECTOR_SIZE); - bitVector[i].clear(); - bitVector[i].or(bitVectorDeser[i]); - } - - a = null; - b = null; - - aValue = null; - bValue = null; - } - - public FMSketch(String s) { - this(s, StringUtils.countMatches(s, "{")); - } - /** * Resets a distinctValueEstimator object to its original state. */ @@ -145,6 +130,10 @@ public class FMSketch implements NumDistinctValueEstimator{ return bitVector[index]; } + public FastBitSet setBitVector(FastBitSet fastBitSet, int index) { + return bitVector[index] = fastBitSet; + } + public int getnumBitVectors() { return numBitVectors; } @@ -168,67 +157,30 @@ public class FMSketch implements NumDistinctValueEstimator{ LOG.debug(t); } - /* Serializes a distinctValueEstimator object to Text for transport. 
- * - */ + @Override public String serialize() { - String s = new String(); - for(int i=0; i < numBitVectors; i++) { - s = s + (bitVector[i].toString()); + ByteArrayOutputStream bos = new ByteArrayOutputStream(); + // write bytes to bos ... + try { + FMSketchUtils.serializeFM(bos, this); + String result = Base64.encodeBase64String(bos.toByteArray()); + bos.close(); + return result; + } catch (IOException e) { + throw new RuntimeException(e); } - return s; } - /* Deserializes from string to FastBitSet; Creates a NumDistinctValueEstimator object and - * returns it. - */ - - private FastBitSet[] genBitSet(String s, int numBitVectors) { - FastBitSet[] b = new FastBitSet[numBitVectors]; - for (int j=0; j < numBitVectors; j++) { - b[j] = new FastBitSet(BIT_VECTOR_SIZE); - b[j].clear(); - } - - int vectorIndex =0; - - /* Parse input string to obtain the indexes that are set in the bitvector. - * When a toString() is called on a FastBitSet object to serialize it, the serialization - * adds { and } to the beginning and end of the return String. - * Skip "{", "}", ",", " " in the input string. - */ - for(int i=1; i < s.length()-1;) { - char c = s.charAt(i); - i = i + 1; - - // Move on to the next bit vector - if (c == '}') { - vectorIndex = vectorIndex + 1; - } - - // Encountered a numeric value; Extract out the entire number - if (c >= '0' && c <= '9') { - String t = new String(); - t = t + c; - c = s.charAt(i); - i = i + 1; - - while (c != ',' && c!= '}') { - t = t + c; - c = s.charAt(i); - i = i + 1; - } - - int bitIndex = Integer.parseInt(t); - assert(bitIndex >= 0); - assert(vectorIndex < numBitVectors); - b[vectorIndex].set(bitIndex); - if (c == '}') { - vectorIndex = vectorIndex + 1; - } - } + @Override + public NumDistinctValueEstimator deserialize(String s) { + InputStream is = new ByteArrayInputStream(Base64.decodeBase64(s)); + try { + NumDistinctValueEstimator n = FMSketchUtils.deserializeFM(is); + is.close(); + return n; + } catch (IOException e) { + throw new RuntimeException(e); } - return b; } private int generateHash(long v, int hashNum) { @@ -387,11 +339,6 @@ public class FMSketch implements NumDistinctValueEstimator{ return lengthFor(model, getnumBitVectors()); } - @Override - public NumDistinctValueEstimator deserialize(String s) { - return new FMSketch(s); - } - // the caller needs to guarantee that they are the same type based on numBitVectors @Override public void mergeEstimators(NumDistinctValueEstimator o) { http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java b/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java index e810ac5..6a29859 100644 --- a/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java +++ b/common/src/java/org/apache/hadoop/hive/common/ndv/NumDistinctValueEstimatorFactory.java @@ -19,6 +19,14 @@ package org.apache.hadoop.hive.common.ndv; +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +import org.apache.commons.codec.binary.Base64; +import org.apache.hadoop.hive.common.ndv.fm.FMSketch; +import org.apache.hadoop.hive.common.ndv.fm.FMSketchUtils; import org.apache.hadoop.hive.common.ndv.hll.HyperLogLog; public class NumDistinctValueEstimatorFactory { @@
-26,11 +34,25 @@ public class NumDistinctValueEstimatorFactory { private NumDistinctValueEstimatorFactory() { } + private static boolean isFMSketch(String s) throws IOException { + InputStream in = new ByteArrayInputStream(Base64.decodeBase64(s)); + byte[] magic = new byte[2]; + magic[0] = (byte) in.read(); + magic[1] = (byte) in.read(); + in.close(); + return Arrays.equals(magic, FMSketchUtils.MAGIC); + } + public static NumDistinctValueEstimator getNumDistinctValueEstimator(String s) { - if (s.startsWith("{")) { - return new FMSketch(s); - } else { - return HyperLogLog.builder().build().deserialize(s); + // Right now we assume only FM and HLL are available. + try { + if (isFMSketch(s)) { + return FMSketchUtils.deserializeFM(s); + } else { + return HyperLogLog.builder().build().deserialize(s); + } + } catch (IOException e) { + throw new RuntimeException(e); } } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/common/src/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java b/common/src/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java new file mode 100644 index 0000000..b6f7fdd --- /dev/null +++ b/common/src/java/org/apache/hadoop/hive/common/ndv/fm/FMSketchUtils.java @@ -0,0 +1,133 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.ndv.fm; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.Arrays; + +import javolution.util.FastBitSet; + +import org.apache.commons.codec.binary.Base64; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class FMSketchUtils { + + static final Logger LOG = LoggerFactory.getLogger(FMSketch.class.getName()); + public static final byte[] MAGIC = new byte[] { 'F', 'M' }; + + /* + * Serializes a distinctValueEstimator object to Text for transport. + * + * <b>4 byte header</b> is encoded like below 2 bytes - FM magic string to + * identify serialized stream 2 bytes - numbitvectors because + * BIT_VECTOR_SIZE=31, 4 bytes are enough to hold positions of 0-31 + */ + public static void serializeFM(OutputStream out, FMSketch fm) throws IOException { + out.write(MAGIC); + + // max of numBitVectors = 1024, 2 bytes is enough. 
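// Illustration (not part of the diff): the two count bytes written below are
// little-endian, and deserializeFM further down reverses them the same way:
//   int numBitVectors = (nbv[0] & 0xff) | ((nbv[1] & 0xff) << 8);
// e.g. 300 is written as {0x2C, 0x01} and read back as 0x2C | (0x01 << 8) = 300.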
+ byte[] nbv = new byte[2]; + nbv[0] = (byte) fm.getnumBitVectors(); + nbv[1] = (byte) (fm.getnumBitVectors() >>> 8); + + out.write(nbv); + + // original toString takes too much space + // we compress a fastbitset to 4 bytes + for (int i = 0; i < fm.getnumBitVectors(); i++) { + writeBitVector(out, fm.getBitVector(i)); + } + } + + // BIT_VECTOR_SIZE is 31, we can use 32 bits, i.e., 4 bytes to represent a + // FastBitSet, rather than using 31 integers. + private static void writeBitVector(OutputStream out, FastBitSet bit) throws IOException { + int num = 0; + for (int pos = 0; pos < FMSketch.BIT_VECTOR_SIZE; pos++) { + if (bit.get(pos)) { + num |= 1 << pos; + } + } + byte[] i = new byte[4]; + for (int j = 0; j < 4; j++) { + i[j] = (byte) ((num >>> (8 * j)) & 0xff); + } + out.write(i); + } + + /* + * Deserializes from string to FastBitSet; Creates a NumDistinctValueEstimator + * object and returns it. + */ + public static FMSketch deserializeFM(String s) throws IOException { + InputStream is = new ByteArrayInputStream(Base64.decodeBase64(s)); + try { + FMSketch sketch = deserializeFM(is); + is.close(); + return sketch; + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static FMSketch deserializeFM(InputStream in) throws IOException { + checkMagicString(in); + + byte[] nbv = new byte[2]; + nbv[0] = (byte) in.read(); + nbv[1] = (byte) in.read(); + + int numBitVectors = 0; + numBitVectors |= (nbv[0] & 0xff); + numBitVectors |= ((nbv[1] & 0xff) << 8); + + FMSketch sketch = new FMSketch(numBitVectors); + for (int n = 0; n < numBitVectors; n++) { + sketch.setBitVector(readBitVector(in), n); + } + return sketch; + } + + private static FastBitSet readBitVector(InputStream in) throws IOException { + FastBitSet fastBitSet = new FastBitSet(); + fastBitSet.clear(); + for (int i = 0; i < 4; i++) { + byte b = (byte) in.read(); + for (int j = 0; j < 8; j++) { + if ((b & (1 << j)) != 0) { + fastBitSet.set(j + 8 * i); + } + } + } + return fastBitSet; + } + + private static void checkMagicString(InputStream in) throws IOException { + byte[] magic = new byte[2]; + magic[0] = (byte) in.read(); + magic[1] = (byte) in.read(); + + if (!Arrays.equals(magic, MAGIC)) { + throw new IllegalArgumentException("The input stream is not an FMSketch stream."); + } + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java b/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java index d195546..182560a 100644 --- a/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java +++ b/common/src/java/org/apache/hadoop/hive/common/ndv/hll/HyperLogLog.java @@ -277,7 +277,9 @@ public class HyperLogLog implements NumDistinctValueEstimator{ } public long estimateNumDistinctValues() { - return count(); + // FMSketch treats the ndv of all nulls as 1 but HLL treats the ndv as 0. + // In order to get rid of the divide-by-zero problem, we follow FMSketch
+ return count() > 0 ? count() : 1; } public long count() { http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java ---------------------------------------------------------------------- diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index dd9ad71..05f6cc9 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -1737,7 +1737,9 @@ public class HiveConf extends Configuration { "Whether column accesses are tracked in the QueryPlan.\n" + "This is useful to identify how tables are accessed and to determine if there are wasted columns that can be trimmed."), HIVE_STATS_NDV_ALGO("hive.stats.ndv.algo", "hll", new PatternSet("hll", "fm"), - "hll and fm stand for HyperLogLog and FM-sketch, respectively for computing ndv."), + "hll and fm stand for HyperLogLog and FM-sketch, respectively, for computing ndv."), + HIVE_STATS_FETCH_BITVECTOR("hive.stats.fetch.bitvector", false, + "Whether we fetch the bitvector when we compute ndv. Users can turn it off if they want to use the old schema"), // standard error allowed for ndv estimates for FM-sketch. A lower value indicates higher accuracy and a // higher compute cost. HIVE_STATS_NDV_ERROR("hive.stats.ndv.error", (float)20.0, http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/common/src/test/org/apache/hadoop/hive/common/ndv/fm/TestFMSketchSerialization.java ---------------------------------------------------------------------- diff --git a/common/src/test/org/apache/hadoop/hive/common/ndv/fm/TestFMSketchSerialization.java b/common/src/test/org/apache/hadoop/hive/common/ndv/fm/TestFMSketchSerialization.java new file mode 100644 index 0000000..74fdf58 --- /dev/null +++ b/common/src/test/org/apache/hadoop/hive/common/ndv/fm/TestFMSketchSerialization.java @@ -0,0 +1,97 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.common.ndv.fm; + +import static org.junit.Assert.assertEquals; + +import java.io.IOException; + +import javolution.util.FastBitSet; + +import org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimatorFactory; +import org.junit.Test; + +public class TestFMSketchSerialization { + + private FastBitSet[] deserialize(String s, int numBitVectors) { + FastBitSet[] b = new FastBitSet[numBitVectors]; + for (int j = 0; j < numBitVectors; j++) { + b[j] = new FastBitSet(FMSketch.BIT_VECTOR_SIZE); + b[j].clear(); + } + + int vectorIndex = 0; + + /* + * Parse input string to obtain the indexes that are set in the bitvector.
+ * When a toString() is called on a FastBitSet object to serialize it, the + * serialization adds { and } to the beginning and end of the return String. + * Skip "{", "}", ",", " " in the input string. + */ + for (int i = 1; i < s.length() - 1;) { + char c = s.charAt(i); + i = i + 1; + + // Move on to the next bit vector + if (c == '}') { + vectorIndex = vectorIndex + 1; + } + + // Encountered a numeric value; Extract out the entire number + if (c >= '0' && c <= '9') { + String t = new String(); + t = t + c; + c = s.charAt(i); + i = i + 1; + + while (c != ',' && c != '}') { + t = t + c; + c = s.charAt(i); + i = i + 1; + } + + int bitIndex = Integer.parseInt(t); + assert (bitIndex >= 0); + assert (vectorIndex < numBitVectors); + b[vectorIndex].set(bitIndex); + if (c == '}') { + vectorIndex = vectorIndex + 1; + } + } + } + return b; + } + + @Test + public void testSerDe() throws IOException { + String bitVectors = "{0, 4, 5, 7}{0, 1}{0, 1, 2}{0, 1, 4}{0}{0, 2}{0, 3}{0, 2, 3, 4}{0, 1, 4}{0, 1}{0}{0, 1, 3, 8}{0, 2}{0, 2}{0, 9}{0, 1, 4}"; + FastBitSet[] fastBitSet = deserialize(bitVectors, 16); + FMSketch sketch = new FMSketch(16); + for (int i = 0; i < 16; i++) { + sketch.setBitVector(fastBitSet[i], i); + } + assertEquals(sketch.estimateNumDistinctValues(), 3); + String s = sketch.serialize(); + FMSketch newSketch = (FMSketch) NumDistinctValueEstimatorFactory + .getNumDistinctValueEstimator(s); + sketch.equals(newSketch); + assertEquals(newSketch.estimateNumDistinctValues(), 3); + assertEquals(newSketch.serialize(), s); + } + +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/data/conf/hive-site.xml ---------------------------------------------------------------------- diff --git a/data/conf/hive-site.xml b/data/conf/hive-site.xml index 62364fe..a205b8c 100644 --- a/data/conf/hive-site.xml +++ b/data/conf/hive-site.xml @@ -314,5 +314,9 @@ <value>true</value> </property> +<property> + <name>hive.stats.fetch.bitvector</name> + <value>true</value> +</property> </configuration> http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/data/conf/llap/hive-site.xml ---------------------------------------------------------------------- diff --git a/data/conf/llap/hive-site.xml b/data/conf/llap/hive-site.xml index cac5a3b..870b584 100644 --- a/data/conf/llap/hive-site.xml +++ b/data/conf/llap/hive-site.xml @@ -333,5 +333,9 @@ <value>4</value> <description> </description> </property> +<property> + <name>hive.stats.fetch.bitvector</name> + <value>true</value> +</property> </configuration> http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/data/conf/perf-reg/hive-site.xml ---------------------------------------------------------------------- diff --git a/data/conf/perf-reg/hive-site.xml b/data/conf/perf-reg/hive-site.xml index 012369f..331a500 100644 --- a/data/conf/perf-reg/hive-site.xml +++ b/data/conf/perf-reg/hive-site.xml @@ -282,4 +282,9 @@ <value>true</value> </property> +<property> + <name>hive.stats.fetch.bitvector</name> + <value>true</value> +</property> + </configuration> http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/data/conf/tez/hive-site.xml ---------------------------------------------------------------------- diff --git a/data/conf/tez/hive-site.xml b/data/conf/tez/hive-site.xml index 28abc2d..35e8c99 100644 --- a/data/conf/tez/hive-site.xml +++ b/data/conf/tez/hive-site.xml @@ -278,4 +278,9 @@ <value>true</value> </property> +<property> + <name>hive.stats.fetch.bitvector</name> + <value>true</value> +</property> + 
</configuration> http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/derby/044-HIVE-16997.derby.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/derby/044-HIVE-16997.derby.sql b/metastore/scripts/upgrade/derby/044-HIVE-16997.derby.sql new file mode 100644 index 0000000..2c2177b --- /dev/null +++ b/metastore/scripts/upgrade/derby/044-HIVE-16997.derby.sql @@ -0,0 +1 @@ +ALTER TABLE "APP"."PART_COL_STATS" ADD COLUMN "BIT_VECTOR" BLOB; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql b/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql index a9a5329..f4cbba6 100644 --- a/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql +++ b/metastore/scripts/upgrade/derby/hive-schema-3.0.0.derby.sql @@ -94,7 +94,7 @@ CREATE TABLE "APP"."MASTER_KEYS" ("KEY_ID" INTEGER NOT NULL generated always as CREATE TABLE "APP"."DELEGATION_TOKENS" ( "TOKEN_IDENT" VARCHAR(767) NOT NULL, "TOKEN" VARCHAR(767)); -CREATE TABLE "APP"."PART_COL_STATS"("DB_NAME" VARCHAR(128) NOT NULL,"TABLE_NAME" VARCHAR(256) NOT NULL, "PARTITION_NAME" VARCHAR(767) NOT NULL, "COLUMN_NAME" VARCHAR(767) NOT NULL, "COLUMN_TYPE" VARCHAR(128) NOT NULL, "LONG_LOW_VALUE" BIGINT, "LONG_HIGH_VALUE" BIGINT, "DOUBLE_LOW_VALUE" DOUBLE, "DOUBLE_HIGH_VALUE" DOUBLE, "BIG_DECIMAL_LOW_VALUE" VARCHAR(4000), "BIG_DECIMAL_HIGH_VALUE" VARCHAR(4000),"NUM_DISTINCTS" BIGINT, "NUM_NULLS" BIGINT NOT NULL, "AVG_COL_LEN" DOUBLE, "MAX_COL_LEN" BIGINT, "NUM_TRUES" BIGINT, "NUM_FALSES" BIGINT, "LAST_ANALYZED" BIGINT, "CS_ID" BIGINT NOT NULL, "PART_ID" BIGINT NOT NULL); +CREATE TABLE "APP"."PART_COL_STATS"("DB_NAME" VARCHAR(128) NOT NULL,"TABLE_NAME" VARCHAR(256) NOT NULL, "PARTITION_NAME" VARCHAR(767) NOT NULL, "COLUMN_NAME" VARCHAR(767) NOT NULL, "COLUMN_TYPE" VARCHAR(128) NOT NULL, "LONG_LOW_VALUE" BIGINT, "LONG_HIGH_VALUE" BIGINT, "DOUBLE_LOW_VALUE" DOUBLE, "DOUBLE_HIGH_VALUE" DOUBLE, "BIG_DECIMAL_LOW_VALUE" VARCHAR(4000), "BIG_DECIMAL_HIGH_VALUE" VARCHAR(4000),"NUM_DISTINCTS" BIGINT, "BIT_VECTOR" BLOB, "NUM_NULLS" BIGINT NOT NULL, "AVG_COL_LEN" DOUBLE, "MAX_COL_LEN" BIGINT, "NUM_TRUES" BIGINT, "NUM_FALSES" BIGINT, "LAST_ANALYZED" BIGINT, "CS_ID" BIGINT NOT NULL, "PART_ID" BIGINT NOT NULL); CREATE TABLE "APP"."VERSION" ("VER_ID" BIGINT NOT NULL, "SCHEMA_VERSION" VARCHAR(127) NOT NULL, "VERSION_COMMENT" VARCHAR(255)); http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/derby/upgrade-2.3.0-to-3.0.0.derby.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/derby/upgrade-2.3.0-to-3.0.0.derby.sql b/metastore/scripts/upgrade/derby/upgrade-2.3.0-to-3.0.0.derby.sql index 30513dc..01b6f90 100644 --- a/metastore/scripts/upgrade/derby/upgrade-2.3.0-to-3.0.0.derby.sql +++ b/metastore/scripts/upgrade/derby/upgrade-2.3.0-to-3.0.0.derby.sql @@ -2,5 +2,6 @@ RUN '041-HIVE-16556.derby.sql'; RUN '042-HIVE-16575.derby.sql'; RUN '043-HIVE-16922.derby.sql'; +RUN '044-HIVE-16997.derby.sql'; UPDATE "APP".VERSION SET SCHEMA_VERSION='3.0.0', VERSION_COMMENT='Hive release version 3.0.0' where VER_ID=1; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/mssql/029-HIVE-16997.mssql.sql ---------------------------------------------------------------------- 
diff --git a/metastore/scripts/upgrade/mssql/029-HIVE-16997.mssql.sql b/metastore/scripts/upgrade/mssql/029-HIVE-16997.mssql.sql new file mode 100644 index 0000000..1882c59 --- /dev/null +++ b/metastore/scripts/upgrade/mssql/029-HIVE-16997.mssql.sql @@ -0,0 +1 @@ +ALTER TABLE PART_COL_STATS ADD BIT_VECTOR VARBINARY(MAX); http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/mssql/hive-schema-3.0.0.mssql.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/mssql/hive-schema-3.0.0.mssql.sql b/metastore/scripts/upgrade/mssql/hive-schema-3.0.0.mssql.sql index 1cfe2d1..fa8fc6e 100644 --- a/metastore/scripts/upgrade/mssql/hive-schema-3.0.0.mssql.sql +++ b/metastore/scripts/upgrade/mssql/hive-schema-3.0.0.mssql.sql @@ -87,6 +87,7 @@ CREATE TABLE PART_COL_STATS LONG_LOW_VALUE bigint NULL, MAX_COL_LEN bigint NULL, NUM_DISTINCTS bigint NULL, + BIT_VECTOR varbinary(max) NULL, NUM_FALSES bigint NULL, NUM_NULLS bigint NOT NULL, NUM_TRUES bigint NULL, http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/mssql/upgrade-2.3.0-to-3.0.0.mssql.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/mssql/upgrade-2.3.0-to-3.0.0.mssql.sql b/metastore/scripts/upgrade/mssql/upgrade-2.3.0-to-3.0.0.mssql.sql index 5683254..21d62ae 100644 --- a/metastore/scripts/upgrade/mssql/upgrade-2.3.0-to-3.0.0.mssql.sql +++ b/metastore/scripts/upgrade/mssql/upgrade-2.3.0-to-3.0.0.mssql.sql @@ -3,6 +3,7 @@ SELECT 'Upgrading MetaStore schema from 2.3.0 to 3.0.0' AS MESSAGE; :r 026-HIVE-16556.mssql.sql :r 027-HIVE-16575.mssql.sql :r 028-HIVE-16922.mssql.sql +:r 029-HIVE-16997.mssql.sql UPDATE VERSION SET SCHEMA_VERSION='3.0.0', VERSION_COMMENT='Hive release version 3.0.0' where VER_ID=1; SELECT 'Finished upgrading MetaStore schema from 2.3.0 to 3.0.0' AS MESSAGE; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/mysql/044-HIVE-16997.mysql.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/mysql/044-HIVE-16997.mysql.sql b/metastore/scripts/upgrade/mysql/044-HIVE-16997.mysql.sql new file mode 100644 index 0000000..4954b2e --- /dev/null +++ b/metastore/scripts/upgrade/mysql/044-HIVE-16997.mysql.sql @@ -0,0 +1 @@ +ALTER TABLE PART_COL_STATS ADD COLUMN BIT_VECTOR BLOB; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql b/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql index 97d881f..31963d0 100644 --- a/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql +++ b/metastore/scripts/upgrade/mysql/hive-schema-3.0.0.mysql.sql @@ -690,6 +690,7 @@ CREATE TABLE IF NOT EXISTS `PART_COL_STATS` ( `BIG_DECIMAL_HIGH_VALUE` varchar(4000) CHARACTER SET latin1 COLLATE latin1_bin, `NUM_NULLS` bigint(20) NOT NULL, `NUM_DISTINCTS` bigint(20), + `BIT_VECTOR` blob, `AVG_COL_LEN` double(53,4), `MAX_COL_LEN` bigint(20), `NUM_TRUES` bigint(20), http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/mysql/upgrade-2.3.0-to-3.0.0.mysql.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/mysql/upgrade-2.3.0-to-3.0.0.mysql.sql 
b/metastore/scripts/upgrade/mysql/upgrade-2.3.0-to-3.0.0.mysql.sql index ba62939..9cd3a62 100644 --- a/metastore/scripts/upgrade/mysql/upgrade-2.3.0-to-3.0.0.mysql.sql +++ b/metastore/scripts/upgrade/mysql/upgrade-2.3.0-to-3.0.0.mysql.sql @@ -3,6 +3,7 @@ SELECT 'Upgrading MetaStore schema from 2.3.0 to 3.0.0' AS ' '; SOURCE 041-HIVE-16556.mysql.sql; SOURCE 042-HIVE-16575.mysql.sql; SOURCE 043-HIVE-16922.mysql.sql; +SOURCE 044-HIVE-16997.mysql.sql; UPDATE VERSION SET SCHEMA_VERSION='3.0.0', VERSION_COMMENT='Hive release version 3.0.0' where VER_ID=1; SELECT 'Finished upgrading MetaStore schema from 2.3.0 to 3.0.0' AS ' '; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/oracle/044-HIVE-16997.oracle.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/oracle/044-HIVE-16997.oracle.sql b/metastore/scripts/upgrade/oracle/044-HIVE-16997.oracle.sql new file mode 100644 index 0000000..44e5fa3 --- /dev/null +++ b/metastore/scripts/upgrade/oracle/044-HIVE-16997.oracle.sql @@ -0,0 +1 @@ +ALTER TABLE PART_COL_STATS ADD BIT_VECTOR BLOB NULL; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/oracle/hive-schema-3.0.0.oracle.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/oracle/hive-schema-3.0.0.oracle.sql b/metastore/scripts/upgrade/oracle/hive-schema-3.0.0.oracle.sql index 8fdb552..81e4208 100644 --- a/metastore/scripts/upgrade/oracle/hive-schema-3.0.0.oracle.sql +++ b/metastore/scripts/upgrade/oracle/hive-schema-3.0.0.oracle.sql @@ -515,6 +515,7 @@ CREATE TABLE PART_COL_STATS ( BIG_DECIMAL_HIGH_VALUE VARCHAR2(4000), NUM_NULLS NUMBER NOT NULL, NUM_DISTINCTS NUMBER, + BIT_VECTOR BLOB, AVG_COL_LEN NUMBER, MAX_COL_LEN NUMBER, NUM_TRUES NUMBER, http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/oracle/upgrade-2.3.0-to-3.0.0.oracle.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/oracle/upgrade-2.3.0-to-3.0.0.oracle.sql b/metastore/scripts/upgrade/oracle/upgrade-2.3.0-to-3.0.0.oracle.sql index 0a70d47..6a26649 100644 --- a/metastore/scripts/upgrade/oracle/upgrade-2.3.0-to-3.0.0.oracle.sql +++ b/metastore/scripts/upgrade/oracle/upgrade-2.3.0-to-3.0.0.oracle.sql @@ -3,6 +3,7 @@ SELECT 'Upgrading MetaStore schema from 2.3.0 to 3.0.0' AS Status from dual; @041-HIVE-16556.oracle.sql; @042-HIVE-16575.oracle.sql; @043-HIVE-16922.oracle.sql; +@044-HIVE-16997.oracle.sql; UPDATE VERSION SET SCHEMA_VERSION='3.0.0', VERSION_COMMENT='Hive release version 3.0.0' where VER_ID=1; SELECT 'Finished upgrading MetaStore schema from 2.3.0 to 3.0.0' AS Status from dual; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/postgres/043-HIVE-16997.postgres.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/postgres/043-HIVE-16997.postgres.sql b/metastore/scripts/upgrade/postgres/043-HIVE-16997.postgres.sql new file mode 100644 index 0000000..bee8c44 --- /dev/null +++ b/metastore/scripts/upgrade/postgres/043-HIVE-16997.postgres.sql @@ -0,0 +1 @@ +ALTER TABLE "PART_COL_STATS" ADD COLUMN "BIT_VECTOR" BYTEA; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/postgres/hive-schema-3.0.0.postgres.sql ---------------------------------------------------------------------- diff --git 
a/metastore/scripts/upgrade/postgres/hive-schema-3.0.0.postgres.sql b/metastore/scripts/upgrade/postgres/hive-schema-3.0.0.postgres.sql index 1cdeb6b..5cb5cb0 100644 --- a/metastore/scripts/upgrade/postgres/hive-schema-3.0.0.postgres.sql +++ b/metastore/scripts/upgrade/postgres/hive-schema-3.0.0.postgres.sql @@ -534,6 +534,7 @@ CREATE TABLE "PART_COL_STATS" ( "BIG_DECIMAL_HIGH_VALUE" character varying(4000) DEFAULT NULL::character varying, "NUM_NULLS" bigint NOT NULL, "NUM_DISTINCTS" bigint, + "BIT_VECTOR" bytea, "AVG_COL_LEN" double precision, "MAX_COL_LEN" bigint, "NUM_TRUES" bigint, http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/scripts/upgrade/postgres/upgrade-2.3.0-to-3.0.0.postgres.sql ---------------------------------------------------------------------- diff --git a/metastore/scripts/upgrade/postgres/upgrade-2.3.0-to-3.0.0.postgres.sql b/metastore/scripts/upgrade/postgres/upgrade-2.3.0-to-3.0.0.postgres.sql index c44dd06..ee5a673 100644 --- a/metastore/scripts/upgrade/postgres/upgrade-2.3.0-to-3.0.0.postgres.sql +++ b/metastore/scripts/upgrade/postgres/upgrade-2.3.0-to-3.0.0.postgres.sql @@ -3,6 +3,7 @@ SELECT 'Upgrading MetaStore schema from 2.3.0 to 3.0.0'; \i 040-HIVE-16556.postgres.sql; \i 041-HIVE-16575.postgres.sql; \i 042-HIVE-16922.postgres.sql; +\i 043-HIVE-16997.postgres.sql; UPDATE "VERSION" SET "SCHEMA_VERSION"='3.0.0', "VERSION_COMMENT"='Hive release version 3.0.0' where "VER_ID"=1; SELECT 'Finished upgrading MetaStore schema from 2.3.0 to 3.0.0'; http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java index a960b2d..73754ff 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreDirectSql.java @@ -21,6 +21,7 @@ package org.apache.hadoop.hive.metastore; import static org.apache.commons.lang.StringUtils.join; import static org.apache.commons.lang.StringUtils.repeat; +import java.sql.Blob; import java.sql.Clob; import java.sql.Connection; import java.sql.Statement; @@ -33,6 +34,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.TreeMap; import javax.jdo.PersistenceManager; @@ -64,6 +66,8 @@ import org.apache.hadoop.hive.metastore.api.SerDeInfo; import org.apache.hadoop.hive.metastore.api.SkewedInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregatorFactory; import org.apache.hadoop.hive.metastore.model.MConstraint; import org.apache.hadoop.hive.metastore.model.MDatabase; import org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics; @@ -941,6 +945,24 @@ class MetaStoreDirectSql { } } + static String extractSqlBlob(Object value) throws MetaException { + if (value == null) + return null; + if (value instanceof Blob) { + try { + // getBytes function says: pos the ordinal position of the first byte in + // the BLOB value to be extracted; the first byte is at position 1 + return new String(((Blob) value).getBytes(1, 
(int) ((Blob) value).length())); + } catch (SQLException e) { + throw new MetaException("Encountered an error while processing blob."); + } + } else { + // this may happen when enableBitVector is false + LOG.debug("Expected blob type but got " + value.getClass().getName()); + return null; + } + } + private static String trimCommaList(StringBuilder sb) { if (sb.length() > 0) { sb.setLength(sb.length() - 1); @@ -1221,12 +1243,12 @@ class MetaStoreDirectSql { * @throws MetaException */ public ColumnStatistics getTableStats(final String dbName, final String tableName, - List<String> colNames) throws MetaException { + List<String> colNames, boolean enableBitVector) throws MetaException { if (colNames == null || colNames.isEmpty()) { return null; } final boolean doTrace = LOG.isDebugEnabled(); - final String queryText0 = "select " + STATS_COLLIST + " from " + TAB_COL_STATS + " " + final String queryText0 = "select " + getStatsList(enableBitVector) + " from " + TAB_COL_STATS + " " + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\" in ("; Batchable<String, Object[]> b = new Batchable<String, Object[]>() { public List<Object[]> run(List<String> input) throws MetaException { @@ -1260,8 +1282,8 @@ class MetaStoreDirectSql { } public AggrStats aggrColStatsForPartitions(String dbName, String tableName, - List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation, double ndvTuner) - throws MetaException { + List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation, + double ndvTuner, boolean enableBitVector) throws MetaException { if (colNames.isEmpty() || partNames.isEmpty()) { LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval"); return new AggrStats(Collections.<ColumnStatisticsObj>emptyList(), 0); // Nothing to aggregate @@ -1295,7 +1317,7 @@ class MetaStoreDirectSql { // Read aggregated stats for one column colStatsAggrFromDB = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB, - partsFound, useDensityFunctionForNDVEstimation, ndvTuner); + partsFound, useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector); if (!colStatsAggrFromDB.isEmpty()) { ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0); colStatsList.add(colStatsAggr); @@ -1308,7 +1330,7 @@ class MetaStoreDirectSql { partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames); colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound, - useDensityFunctionForNDVEstimation, ndvTuner); + useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector); } LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " @@ -1371,14 +1393,14 @@ class MetaStoreDirectSql { private List<ColumnStatisticsObj> columnStatisticsObjForPartitions(final String dbName, final String tableName, final List<String> partNames, List<String> colNames, long partsFound, - final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException { + final boolean useDensityFunctionForNDVEstimation, final double ndvTuner, final boolean enableBitVector) throws MetaException { final boolean areAllPartsFound = (partsFound == partNames.size()); return runBatched(colNames, new Batchable<String, ColumnStatisticsObj>() { public List<ColumnStatisticsObj> run(final List<String> inputColNames) throws MetaException { return runBatched(partNames, new Batchable<String,
ColumnStatisticsObj>() { public List<ColumnStatisticsObj> run(List<String> inputPartNames) throws MetaException { return columnStatisticsObjForPartitionsBatch(dbName, tableName, inputPartNames, - inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner); + inputColNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector); } }); } @@ -1388,14 +1410,10 @@ class MetaStoreDirectSql { // Get aggregated column stats for a table per partition for all columns in the partition // This is primarily used to populate stats object when using CachedStore (Check CachedStore#prewarm) public Map<String, List<ColumnStatisticsObj>> getColStatsForTablePartitions(String dbName, - String tblName) throws MetaException { - String queryText = - "select \"PARTITION_NAME\", \"COLUMN_NAME\", \"COLUMN_TYPE\", \"LONG_LOW_VALUE\", " - + "\"LONG_HIGH_VALUE\", \"DOUBLE_LOW_VALUE\", \"DOUBLE_HIGH_VALUE\", " - + "\"BIG_DECIMAL_LOW_VALUE\", \"BIG_DECIMAL_HIGH_VALUE\", \"NUM_NULLS\", " - + "\"NUM_DISTINCTS\", \"AVG_COL_LEN\", \"MAX_COL_LEN\", \"NUM_TRUES\", \"NUM_FALSES\"" - + " from " + PART_COL_STATS + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" - + " order by \"PARTITION_NAME\""; + String tblName, boolean enableBitVector) throws MetaException { + String queryText = "select \"PARTITION_NAME\", " + getStatsList(enableBitVector) + " from " + + " " + PART_COL_STATS + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ?" + + " order by \"PARTITION_NAME\""; long start = 0; long end = 0; Query query = null; @@ -1446,6 +1464,28 @@ class MetaStoreDirectSql { /** Should be called with the list short enough to not trip up Oracle/etc. */ private List<ColumnStatisticsObj> columnStatisticsObjForPartitionsBatch(String dbName, String tableName, List<String> partNames, List<String> colNames, boolean areAllPartsFound, + boolean useDensityFunctionForNDVEstimation, double ndvTuner, boolean enableBitVector) throws MetaException { + if(enableBitVector) { + return aggrStatsUseJava(dbName, tableName, partNames, colNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner); + } + else { + return aggrStatsUseDB(dbName, tableName, partNames, colNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner); + } + } + + private List<ColumnStatisticsObj> aggrStatsUseJava(String dbName, String tableName, + List<String> partNames, List<String> colNames, boolean areAllPartsFound, + boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { + // 1. get all the stats for colNames in partNames; + List<ColumnStatistics> partStats = getPartitionStats(dbName, tableName, partNames, colNames, + true); + // 2. use util function to aggr stats + return MetaStoreUtils.aggrPartitionStats(partStats, dbName, tableName, partNames, colNames, + areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner); + } + + private List<ColumnStatisticsObj> aggrStatsUseDB(String dbName, + String tableName, List<String> partNames, List<String> colNames, boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { // TODO: all the extrapolation logic should be moved out of this class, // only mechanical data retrieval should remain here. 
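The aggrStatsUseJava/aggrStatsUseDB split above exists because NDV is not additive: distinct counts cannot simply be summed across partitions in SQL, so when bit vectors are enabled the raw sketches are fetched and unioned in the metastore instead. A sketch of the per-column union that the aggregators behind MetaStoreUtils.aggrPartitionStats effectively perform (perPartitionBitVectors is a hypothetical input, one Base64 string per partition; the factory and mergeEstimators are from this patch):

    NumDistinctValueEstimator merged = null;
    for (String bitVector : perPartitionBitVectors) {   // hypothetical input list
      NumDistinctValueEstimator e =
          NumDistinctValueEstimatorFactory.getNumDistinctValueEstimator(bitVector);
      if (merged == null) {
        merged = e;
      } else {
        merged.mergeEstimators(e);   // sketch union; caller must keep estimator types consistent
      }
    }
    long ndvAcrossPartitions = merged == null ? 0 : merged.estimateNumDistinctValues();

Unlike summing per-partition NUM_DISTINCTS values, the merged sketch does not double-count values that appear in more than one partition.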
@@ -1717,10 +1757,10 @@ class MetaStoreDirectSql { ColumnStatisticsData data = new ColumnStatisticsData(); ColumnStatisticsObj cso = new ColumnStatisticsObj((String)row[i++], (String)row[i++], data); Object llow = row[i++], lhigh = row[i++], dlow = row[i++], dhigh = row[i++], - declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++], + declow = row[i++], dechigh = row[i++], nulls = row[i++], dist = row[i++], bitVector = row[i++], avglen = row[i++], maxlen = row[i++], trues = row[i++], falses = row[i++]; StatObjectConverter.fillColumnStatisticsData(cso.getColType(), data, - llow, lhigh, dlow, dhigh, declow, dechigh, nulls, dist, avglen, maxlen, trues, falses); + llow, lhigh, dlow, dhigh, declow, dechigh, nulls, dist, bitVector, avglen, maxlen, trues, falses); return cso; } @@ -1753,14 +1793,14 @@ class MetaStoreDirectSql { } public List<ColumnStatistics> getPartitionStats(final String dbName, final String tableName, - final List<String> partNames, List<String> colNames) throws MetaException { + final List<String> partNames, List<String> colNames, boolean enableBitVector) throws MetaException { if (colNames.isEmpty() || partNames.isEmpty()) { return Collections.emptyList(); } final boolean doTrace = LOG.isDebugEnabled(); - final String queryText0 = "select \"PARTITION_NAME\", " + STATS_COLLIST + " from " - + " " + PART_COL_STATS + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\"" - + " in (%1$s) AND \"PARTITION_NAME\" in (%2$s) order by \"PARTITION_NAME\""; + final String queryText0 = "select \"PARTITION_NAME\", " + getStatsList(enableBitVector) + " from " + + " " + PART_COL_STATS + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\"" + + " in (%1$s) AND \"PARTITION_NAME\" in (%2$s) order by \"PARTITION_NAME\""; Batchable<String, Object[]> b = new Batchable<String, Object[]>() { public List<Object[]> run(final List<String> inputColNames) throws MetaException { Batchable<String, Object[]> b2 = new Batchable<String, Object[]>() { @@ -1812,11 +1852,13 @@ class MetaStoreDirectSql { } /** The common query part for table and partition stats */ - private static final String STATS_COLLIST = - "\"COLUMN_NAME\", \"COLUMN_TYPE\", \"LONG_LOW_VALUE\", \"LONG_HIGH_VALUE\", " - + "\"DOUBLE_LOW_VALUE\", \"DOUBLE_HIGH_VALUE\", \"BIG_DECIMAL_LOW_VALUE\", " - + "\"BIG_DECIMAL_HIGH_VALUE\", \"NUM_NULLS\", \"NUM_DISTINCTS\", \"AVG_COL_LEN\", " - + "\"MAX_COL_LEN\", \"NUM_TRUES\", \"NUM_FALSES\", \"LAST_ANALYZED\" "; + private final String getStatsList(boolean enableBitVector) { + return "\"COLUMN_NAME\", \"COLUMN_TYPE\", \"LONG_LOW_VALUE\", \"LONG_HIGH_VALUE\", " + + "\"DOUBLE_LOW_VALUE\", \"DOUBLE_HIGH_VALUE\", \"BIG_DECIMAL_LOW_VALUE\", " + + "\"BIG_DECIMAL_HIGH_VALUE\", \"NUM_NULLS\", \"NUM_DISTINCTS\", " + + (enableBitVector ? "\"BIT_VECTOR\", " : "\'\', ") + "\"AVG_COL_LEN\", " + + "\"MAX_COL_LEN\", \"NUM_TRUES\", \"NUM_FALSES\", \"LAST_ANALYZED\" "; + } private ColumnStatistics makeColumnStats( List<Object[]> list, ColumnStatisticsDesc csd, int offset) throws MetaException { @@ -1826,7 +1868,7 @@ class MetaStoreDirectSql { for (Object[] row : list) { // LastAnalyzed is stored per column but thrift has it per several; // get the lowest for now as nobody actually uses this field. 
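// Annotation (not in the diff): LAST_ANALYZED moves from index 14 to 15 below
// because getStatsList() now emits BIT_VECTOR (or an empty-string placeholder)
// between NUM_DISTINCTS and AVG_COL_LEN, shifting the later columns right by one.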
- Object laObj = row[offset + 14]; + Object laObj = row[offset + 15]; if (laObj != null && (!csd.isSetLastAnalyzed() || csd.getLastAnalyzed() > extractSqlLong(laObj))) { csd.setLastAnalyzed(extractSqlLong(laObj)); } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java index b52c94c..edfbf3a 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/MetaStoreUtils.java @@ -71,8 +71,10 @@ import org.apache.hadoop.hive.metastore.api.SerDeInfo; import org.apache.hadoop.hive.metastore.api.StorageDescriptor; import org.apache.hadoop.hive.metastore.api.Table; import org.apache.hadoop.hive.metastore.api.hive_metastoreConstants; -import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMerger; -import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMergerFactory; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator; +import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregatorFactory; +import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMerger; +import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMergerFactory; import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy; import org.apache.hadoop.hive.serde.serdeConstants; import org.apache.hadoop.hive.serde2.Deserializer; @@ -1936,7 +1938,7 @@ public class MetaStoreUtils { } return metaException; } - + public static List<String> getColumnNames(List<FieldSchema> schema) { List<String> cols = new ArrayList<>(schema.size()); for (FieldSchema fs : schema) { @@ -1945,4 +1947,45 @@ public class MetaStoreUtils { return cols; } + // given a list of partStats, this function will give you an aggr stats + public static List<ColumnStatisticsObj> aggrPartitionStats(List<ColumnStatistics> partStats, + String dbName, String tableName, List<String> partNames, List<String> colNames, + boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner) + throws MetaException { + // 1. group by the stats by colNames + // map the colName to List<ColumnStatistics> + Map<String, List<ColumnStatistics>> map = new HashMap<>(); + for (ColumnStatistics css : partStats) { + List<ColumnStatisticsObj> objs = css.getStatsObj(); + for (ColumnStatisticsObj obj : objs) { + List<ColumnStatisticsObj> singleObj = new ArrayList<>(); + singleObj.add(obj); + ColumnStatistics singleCS = new ColumnStatistics(css.getStatsDesc(), singleObj); + if (!map.containsKey(obj.getColName())) { + map.put(obj.getColName(), new ArrayList<ColumnStatistics>()); + } + map.get(obj.getColName()).add(singleCS); + } + } + return aggrPartitionStats(map,dbName,tableName,partNames,colNames,areAllPartsFound,useDensityFunctionForNDVEstimation, ndvTuner); + } + + public static List<ColumnStatisticsObj> aggrPartitionStats( + Map<String, List<ColumnStatistics>> map, String dbName, String tableName, + List<String> partNames, List<String> colNames, boolean areAllPartsFound, + boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException { + List<ColumnStatisticsObj> colStats = new ArrayList<>(); + // 2. 
aggr stats for each colName + // TODO: thread pool can be used to speed up the process + for (Entry<String, List<ColumnStatistics>> entry : map.entrySet()) { + List<ColumnStatistics> css = entry.getValue(); + ColumnStatsAggregator aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(css + .iterator().next().getStatsObj().iterator().next().getStatsData().getSetField(), + useDensityFunctionForNDVEstimation, ndvTuner); + ColumnStatisticsObj statsObj = aggregator.aggregate(entry.getKey(), partNames, css); + colStats.add(statsObj); + } + return colStats; + } + } http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java ---------------------------------------------------------------------- diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java index db4ec91..eea12291 100644 --- a/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java +++ b/metastore/src/java/org/apache/hadoop/hive/metastore/ObjectStore.java @@ -7193,11 +7193,13 @@ public class ObjectStore implements RawStore, Configurable { protected ColumnStatistics getTableColumnStatisticsInternal( String dbName, String tableName, final List<String> colNames, boolean allowSql, boolean allowJdo) throws MetaException, NoSuchObjectException { + final boolean enableBitVector = HiveConf.getBoolVar(getConf(), + HiveConf.ConfVars.HIVE_STATS_FETCH_BITVECTOR); return new GetStatHelper(HiveStringUtils.normalizeIdentifier(dbName), HiveStringUtils.normalizeIdentifier(tableName), allowSql, allowJdo) { @Override protected ColumnStatistics getSqlResult(GetHelper<ColumnStatistics> ctx) throws MetaException { - return directSql.getTableStats(dbName, tblName, colNames); + return directSql.getTableStats(dbName, tblName, colNames, enableBitVector); } @Override protected ColumnStatistics getJdoResult( @@ -7215,7 +7217,7 @@ public class ObjectStore implements RawStore, Configurable { if (desc.getLastAnalyzed() > mStat.getLastAnalyzed()) { desc.setLastAnalyzed(mStat.getLastAnalyzed()); } - statObjs.add(StatObjectConverter.getTableColumnStatisticsObj(mStat)); + statObjs.add(StatObjectConverter.getTableColumnStatisticsObj(mStat, enableBitVector)); Deadline.checkTimeout(); } return new ColumnStatistics(desc, statObjs); @@ -7236,11 +7238,13 @@ public class ObjectStore implements RawStore, Configurable { protected List<ColumnStatistics> getPartitionColumnStatisticsInternal( String dbName, String tableName, final List<String> partNames, final List<String> colNames, boolean allowSql, boolean allowJdo) throws MetaException, NoSuchObjectException { + final boolean enableBitVector = HiveConf.getBoolVar(getConf(), + HiveConf.ConfVars.HIVE_STATS_FETCH_BITVECTOR); return new GetListHelper<ColumnStatistics>(dbName, tableName, allowSql, allowJdo) { @Override protected List<ColumnStatistics> getSqlResult( GetHelper<List<ColumnStatistics>> ctx) throws MetaException { - return directSql.getPartitionStats(dbName, tblName, partNames, colNames); + return directSql.getPartitionStats(dbName, tblName, partNames, colNames, enableBitVector); } @Override protected List<ColumnStatistics> getJdoResult( @@ -7268,7 +7272,7 @@ public class ObjectStore implements RawStore, Configurable { csd = StatObjectConverter.getPartitionColumnStatisticsDesc(mStatsObj); curList = new ArrayList<ColumnStatisticsObj>(colNames.size()); } - curList.add(StatObjectConverter.getPartitionColumnStatisticsObj(mStatsObj)); + 
http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
index 2dc2804..d53ea4c 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/StatObjectConverter.java
@@ -76,6 +76,7 @@ public class StatObjectConverter {
       mColStats.setLongStats(
           longStats.isSetNumNulls() ? longStats.getNumNulls() : null,
           longStats.isSetNumDVs() ? longStats.getNumDVs() : null,
+          longStats.isSetBitVectors() ? longStats.getBitVectors().getBytes() : null,
          longStats.isSetLowValue() ? longStats.getLowValue() : null,
          longStats.isSetHighValue() ? longStats.getHighValue() : null);
     } else if (statsObj.getStatsData().isSetDoubleStats()) {
@@ -83,6 +84,7 @@
       DoubleColumnStatsData doubleStats = statsObj.getStatsData().getDoubleStats();
       mColStats.setDoubleStats(
           doubleStats.isSetNumNulls() ? doubleStats.getNumNulls() : null,
           doubleStats.isSetNumDVs() ? doubleStats.getNumDVs() : null,
+          doubleStats.isSetBitVectors() ? doubleStats.getBitVectors().getBytes() : null,
           doubleStats.isSetLowValue() ? doubleStats.getLowValue() : null,
           doubleStats.isSetHighValue() ? doubleStats.getHighValue() : null);
     } else if (statsObj.getStatsData().isSetDecimalStats()) {
@@ -92,12 +94,14 @@
       mColStats.setDecimalStats(
           decimalStats.isSetNumNulls() ? decimalStats.getNumNulls() : null,
           decimalStats.isSetNumDVs() ? decimalStats.getNumDVs() : null,
+          decimalStats.isSetBitVectors() ? decimalStats.getBitVectors().getBytes() : null,
           low, high);
     } else if (statsObj.getStatsData().isSetStringStats()) {
       StringColumnStatsData stringStats = statsObj.getStatsData().getStringStats();
       mColStats.setStringStats(
           stringStats.isSetNumNulls() ? stringStats.getNumNulls() : null,
           stringStats.isSetNumDVs() ? stringStats.getNumDVs() : null,
+          stringStats.isSetBitVectors() ? stringStats.getBitVectors().getBytes() : null,
           stringStats.isSetMaxColLen() ? stringStats.getMaxColLen() : null,
           stringStats.isSetAvgColLen() ? stringStats.getAvgColLen() : null);
     } else if (statsObj.getStatsData().isSetBinaryStats()) {
@@ -111,6 +115,7 @@
       mColStats.setDateStats(
           dateStats.isSetNumNulls() ? dateStats.getNumNulls() : null,
           dateStats.isSetNumDVs() ? dateStats.getNumDVs() : null,
+          dateStats.isSetBitVectors() ? dateStats.getBitVectors().getBytes() : null,
           dateStats.isSetLowValue() ? dateStats.getLowValue().getDaysSinceEpoch() : null,
           dateStats.isSetHighValue() ? dateStats.getHighValue().getDaysSinceEpoch() : null);
     }
@@ -146,6 +151,9 @@
     if (mStatsObj.getNumDVs() != null) {
       oldStatsObj.setNumDVs(mStatsObj.getNumDVs());
     }
+    if (mStatsObj.getBitVector() != null) {
+      oldStatsObj.setBitVector(mStatsObj.getBitVector());
+    }
     if (mStatsObj.getNumFalses() != null) {
       oldStatsObj.setNumFalses(mStatsObj.getNumFalses());
     }
@@ -188,6 +196,9 @@
     if (mStatsObj.getNumDVs() != null) {
       oldStatsObj.setNumDVs(mStatsObj.getNumDVs());
    }
+    if (mStatsObj.getBitVector() != null) {
+      oldStatsObj.setBitVector(mStatsObj.getBitVector());
+    }
     if (mStatsObj.getNumFalses() != null) {
       oldStatsObj.setNumFalses(mStatsObj.getNumFalses());
     }
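One detail worth noting in these hunks: the JDO model objects persist the sketch as byte[] (the new blob column added by the schema scripts in this commit), while the Thrift *ColumnStatsData types carry it as a Java String, so the converter round-trips with getBytes() here and with new String(byte[]) in the hunks below. A sketch of that round-trip; note both calls use the platform default charset, mirroring the patch:

    // Thrift String -> model byte[] on write; model byte[] -> Thrift String on read.
    String thriftBitVectors = longStats.getBitVectors();  // as received over Thrift
    byte[] stored = (thriftBitVectors == null) ? null : thriftBitVectors.getBytes();
    String fetched = (stored == null) ? null : new String(stored);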
@@ -200,7 +211,7 @@
   }

   public static ColumnStatisticsObj getTableColumnStatisticsObj(
-      MTableColumnStatistics mStatsObj) {
+      MTableColumnStatistics mStatsObj, boolean enableBitVector) {
     ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
     statsObj.setColType(mStatsObj.getColType());
     statsObj.setColName(mStatsObj.getColName());
@@ -220,6 +231,7 @@
       stringStats.setAvgColLen(mStatsObj.getAvgColLen());
       stringStats.setMaxColLen(mStatsObj.getMaxColLen());
       stringStats.setNumDVs(mStatsObj.getNumDVs());
+      stringStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setStringStats(stringStats);
     } else if (colType.equals("binary")) {
       BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
@@ -241,6 +253,7 @@
         longStats.setLowValue(longLowValue);
       }
       longStats.setNumDVs(mStatsObj.getNumDVs());
+      longStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setLongStats(longStats);
     } else if (colType.equals("double") || colType.equals("float")) {
       DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
@@ -254,6 +267,7 @@
         doubleStats.setLowValue(doubleLowValue);
       }
       doubleStats.setNumDVs(mStatsObj.getNumDVs());
+      doubleStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setDoubleStats(doubleStats);
     } else if (colType.startsWith("decimal")) {
       DecimalColumnStatsData decimalStats = new DecimalColumnStatsData();
@@ -267,6 +281,7 @@
         decimalStats.setLowValue(createThriftDecimal(decimalLowValue));
       }
       decimalStats.setNumDVs(mStatsObj.getNumDVs());
+      decimalStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setDecimalStats(decimalStats);
     } else if (colType.equals("date")) {
       DateColumnStatsData dateStats = new DateColumnStatsData();
@@ -280,6 +295,7 @@
         dateStats.setLowValue(new Date(lowValue));
       }
       dateStats.setNumDVs(mStatsObj.getNumDVs());
+      dateStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setDateStats(dateStats);
     }
     statsObj.setStatsData(colStatsData);
@@ -323,6 +339,7 @@
       mColStats.setLongStats(
           longStats.isSetNumNulls() ? longStats.getNumNulls() : null,
           longStats.isSetNumDVs() ? longStats.getNumDVs() : null,
+          longStats.isSetBitVectors() ? longStats.getBitVectors().getBytes() : null,
           longStats.isSetLowValue() ? longStats.getLowValue() : null,
           longStats.isSetHighValue() ? longStats.getHighValue() : null);
     } else if (statsObj.getStatsData().isSetDoubleStats()) {
@@ -330,6 +347,7 @@
       DoubleColumnStatsData doubleStats = statsObj.getStatsData().getDoubleStats();
       mColStats.setDoubleStats(
           doubleStats.isSetNumNulls() ? doubleStats.getNumNulls() : null,
           doubleStats.isSetNumDVs() ? doubleStats.getNumDVs() : null,
+          doubleStats.isSetBitVectors() ? doubleStats.getBitVectors().getBytes() : null,
           doubleStats.isSetLowValue() ? doubleStats.getLowValue() : null,
           doubleStats.isSetHighValue() ? doubleStats.getHighValue() : null);
     } else if (statsObj.getStatsData().isSetDecimalStats()) {
@@ -339,12 +357,14 @@
       mColStats.setDecimalStats(
           decimalStats.isSetNumNulls() ? decimalStats.getNumNulls() : null,
           decimalStats.isSetNumDVs() ? decimalStats.getNumDVs() : null,
+          decimalStats.isSetBitVectors() ? decimalStats.getBitVectors().getBytes() : null,
           low, high);
     } else if (statsObj.getStatsData().isSetStringStats()) {
       StringColumnStatsData stringStats = statsObj.getStatsData().getStringStats();
       mColStats.setStringStats(
           stringStats.isSetNumNulls() ? stringStats.getNumNulls() : null,
           stringStats.isSetNumDVs() ? stringStats.getNumDVs() : null,
+          stringStats.isSetBitVectors() ? stringStats.getBitVectors().getBytes() : null,
           stringStats.isSetMaxColLen() ? stringStats.getMaxColLen() : null,
           stringStats.isSetAvgColLen() ? stringStats.getAvgColLen() : null);
     } else if (statsObj.getStatsData().isSetBinaryStats()) {
@@ -358,6 +378,7 @@
       mColStats.setDateStats(
           dateStats.isSetNumNulls() ? dateStats.getNumNulls() : null,
           dateStats.isSetNumDVs() ? dateStats.getNumDVs() : null,
+          dateStats.isSetBitVectors() ? dateStats.getBitVectors().getBytes() : null,
           dateStats.isSetLowValue() ? dateStats.getLowValue().getDaysSinceEpoch() : null,
           dateStats.isSetHighValue() ? dateStats.getHighValue().getDaysSinceEpoch() : null);
     }
@@ -365,7 +386,7 @@
   }

   public static ColumnStatisticsObj getPartitionColumnStatisticsObj(
-      MPartitionColumnStatistics mStatsObj) {
+      MPartitionColumnStatistics mStatsObj, boolean enableBitVector) {
     ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
     statsObj.setColType(mStatsObj.getColType());
     statsObj.setColName(mStatsObj.getColName());
@@ -385,6 +406,7 @@
       stringStats.setAvgColLen(mStatsObj.getAvgColLen());
       stringStats.setMaxColLen(mStatsObj.getMaxColLen());
       stringStats.setNumDVs(mStatsObj.getNumDVs());
+      stringStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setStringStats(stringStats);
     } else if (colType.equals("binary")) {
       BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
@@ -404,6 +426,7 @@
         longStats.setLowValue(mStatsObj.getLongLowValue());
       }
       longStats.setNumDVs(mStatsObj.getNumDVs());
+      longStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setLongStats(longStats);
     } else if (colType.equals("double") || colType.equals("float")) {
       DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
@@ -415,6 +438,7 @@
         doubleStats.setLowValue(mStatsObj.getDoubleLowValue());
       }
       doubleStats.setNumDVs(mStatsObj.getNumDVs());
+      doubleStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setDoubleStats(doubleStats);
     } else if (colType.startsWith("decimal")) {
       DecimalColumnStatsData decimalStats = new DecimalColumnStatsData();
@@ -426,6 +450,7 @@
         decimalStats.setLowValue(createThriftDecimal(mStatsObj.getDecimalLowValue()));
       }
       decimalStats.setNumDVs(mStatsObj.getNumDVs());
+      decimalStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setDecimalStats(decimalStats);
     } else if (colType.equals("date")) {
       DateColumnStatsData dateStats = new DateColumnStatsData();
@@ -433,6 +458,7 @@
       dateStats.setHighValue(new Date(mStatsObj.getLongHighValue()));
       dateStats.setLowValue(new Date(mStatsObj.getLongLowValue()));
       dateStats.setNumDVs(mStatsObj.getNumDVs());
+      dateStats.setBitVectors((mStatsObj.getBitVector()==null||!enableBitVector)? null : new String(mStatsObj.getBitVector()));
       colStatsData.setDateStats(dateStats);
     }
     statsObj.setStatsData(colStatsData);
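The guard added for every column type above is the same expression each time. Purely as an illustration (the patch inlines it rather than introducing a helper), it is equivalent to:

    // Hypothetical helper: return the stored bit vectors as a String only when
    // they exist and fetching them is enabled; otherwise leave the field unset.
    private static String bitVectorsOrNull(byte[] bitVector, boolean enableBitVector) {
      return (bitVector == null || !enableBitVector) ? null : new String(bitVector);
    }

    // e.g. stringStats.setBitVectors(
    //          bitVectorsOrNull(mStatsObj.getBitVector(), enableBitVector));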
@@ -450,10 +476,10 @@
     return statsDesc;
   }

-  // SQL
+  // JAVA
   public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data,
       Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh,
-      Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses) throws MetaException {
+      Object nulls, Object dist, Object bitVector, Object avglen, Object maxlen, Object trues, Object falses) throws MetaException {
     colType = colType.toLowerCase();
     if (colType.equals("boolean")) {
       BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
@@ -468,6 +494,7 @@
       stringStats.setAvgColLen(MetaStoreDirectSql.extractSqlDouble(avglen));
       stringStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen));
       stringStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist));
+      stringStats.setBitVectors(MetaStoreDirectSql.extractSqlBlob(bitVector));
       data.setStringStats(stringStats);
     } else if (colType.equals("binary")) {
       BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
@@ -487,6 +514,7 @@
         longStats.setLowValue(MetaStoreDirectSql.extractSqlLong(llow));
       }
       longStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist));
+      longStats.setBitVectors(MetaStoreDirectSql.extractSqlBlob(bitVector));
       data.setLongStats(longStats);
     } else if (colType.equals("double") || colType.equals("float")) {
       DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
@@ -498,6 +526,7 @@
         doubleStats.setLowValue(MetaStoreDirectSql.extractSqlDouble(dlow));
       }
       doubleStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist));
+      doubleStats.setBitVectors(MetaStoreDirectSql.extractSqlBlob(bitVector));
       data.setDoubleStats(doubleStats);
     } else if (colType.startsWith("decimal")) {
       DecimalColumnStatsData decimalStats = new DecimalColumnStatsData();
@@ -509,6 +538,7 @@
         decimalStats.setLowValue(createThriftDecimal((String)declow));
       }
       decimalStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist));
+      decimalStats.setBitVectors(MetaStoreDirectSql.extractSqlBlob(bitVector));
       data.setDecimalStats(decimalStats);
     } else if (colType.equals("date")) {
       DateColumnStatsData dateStats = new DateColumnStatsData();
@@ -520,10 +550,12 @@
       dateStats.setLowValue(new Date(MetaStoreDirectSql.extractSqlLong(llow)));
     }
     dateStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist));
+    dateStats.setBitVectors(MetaStoreDirectSql.extractSqlBlob(bitVector));
     data.setDateStats(dateStats);
   }
 }

+  //DB
 public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data,
     Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh,
     Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses,
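The relabeled // JAVA overload now threads the raw bit-vector column of a direct-SQL result row through MetaStoreDirectSql.extractSqlBlob (defined in this commit's MetaStoreDirectSql changes), which normalizes whatever the JDBC driver returns. A hedged call-site sketch for a bigint column; the row values are fabricated and the blob is a stand-in for the stored sketch:

    // Illustrative only; argument order follows the overload's parameter list.
    byte[] hllBlob = new byte[0];  // stand-in for the bit-vector blob from the stats query
    ColumnStatisticsData data = new ColumnStatisticsData();
    StatObjectConverter.fillColumnStatisticsData("bigint", data,
        1L, 100L,      // llow, lhigh
        null, null,    // dlow, dhigh (unused for bigint)
        null, null,    // declow, dechigh
        0L, 42L,       // nulls, dist (NDV)
        hllBlob,       // bitVector
        null, null,    // avglen, maxlen (string/binary only)
        null, null);   // trues, falses (boolean only)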
http://git-wip-us.apache.org/repos/asf/hive/blob/f8b79fe6/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java
----------------------------------------------------------------------
diff --git a/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java b/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java
index cea94a0..fb98ccf 100644
--- a/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java
+++ b/metastore/src/java/org/apache/hadoop/hive/metastore/cache/CachedStore.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hive.metastore.cache;

 import java.nio.ByteBuffer;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
@@ -80,8 +81,8 @@ import org.apache.hadoop.hive.metastore.api.Type;
 import org.apache.hadoop.hive.metastore.api.UnknownDBException;
 import org.apache.hadoop.hive.metastore.api.UnknownPartitionException;
 import org.apache.hadoop.hive.metastore.api.UnknownTableException;
-import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMerger;
-import org.apache.hadoop.hive.metastore.hbase.stats.merge.ColumnStatsMergerFactory;
+import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMerger;
+import org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMergerFactory;
 import org.apache.hadoop.hive.metastore.partition.spec.PartitionSpecProxy;
 import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo;
 import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
@@ -92,6 +93,7 @@ import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.collect.Lists;

 // TODO filter->expr
 // TODO functionCache
@@ -1562,27 +1564,37 @@
   private ColumnStatisticsObj mergeColStatsForPartitions(String dbName, String tblName,
       List<String> partNames, String colName) throws MetaException {
-    ColumnStatisticsObj colStats = null;
+    final boolean useDensityFunctionForNDVEstimation = HiveConf.getBoolVar(getConf(),
+        HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_DENSITY_FUNCTION);
+    final double ndvTuner = HiveConf.getFloatVar(getConf(),
+        HiveConf.ConfVars.HIVE_METASTORE_STATS_NDV_TUNER);
+    Map<String, List<ColumnStatistics>> map = new HashMap<>();
+    List<ColumnStatistics> list = new ArrayList<>();
+    boolean areAllPartsFound = true;
     for (String partName : partNames) {
-      String colStatsCacheKey =
-          CacheUtils.buildKey(dbName, tblName, partNameToVals(partName), colName);
-      ColumnStatisticsObj colStatsForPart =
-          SharedCache.getCachedPartitionColStats(colStatsCacheKey);
-      if (colStatsForPart == null) {
-        // we don't have stats for all the partitions
-        // logic for extrapolation hasn't been added to CacheStore
-        // So stop now, and lets fallback to underlying RawStore
-        return null;
-      }
-      if (colStats == null) {
-        colStats = colStatsForPart;
+      String colStatsCacheKey = CacheUtils.buildKey(dbName, tblName, partNameToVals(partName),
+          colName);
+      List<ColumnStatisticsObj> singleObj = new ArrayList<>();
+      ColumnStatisticsObj colStatsForPart = SharedCache
+          .getCachedPartitionColStats(colStatsCacheKey);
+      if (colStatsForPart != null) {
+        singleObj.add(colStatsForPart);
+        ColumnStatisticsDesc css = new ColumnStatisticsDesc(false, dbName, tblName);
+        css.setPartName(partName);
+        list.add(new ColumnStatistics(css, singleObj));
       } else {
-        ColumnStatsMerger merger =
-            ColumnStatsMergerFactory.getColumnStatsMerger(colStats, colStatsForPart);
-        merger.merge(colStats, colStatsForPart);
+        areAllPartsFound = false;
       }
     }
-    return colStats;
+    map.put(colName, list);
+    List<String> colNames = new ArrayList<>();
+    colNames.add(colName);
+    // Note that enableBitVector does not apply here: each ColumnStatisticsObj
+    // already records whether its bit vector is set, so the aggregation logic
+    // handles both cases automatically.
+    return MetaStoreUtils
+        .aggrPartitionStats(map, dbName, tblName, partNames, colNames, areAllPartsFound,
+            useDensityFunctionForNDVEstimation, ndvTuner).iterator().next();
   }

   @Override
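With the rewrite above, CachedStore no longer merges partition stats pairwise through ColumnStatsMerger; it builds the same per-column map that MetaStoreUtils.aggrPartitionStats expects, so a cache miss for one partition merely flips areAllPartsFound instead of forcing a fallback to the underlying RawStore. A sketch of the shape it hands over (lookup() stands in for the SharedCache access; names are illustrative):

    // Illustrative shape: one map entry per column, one ColumnStatistics per
    // partition that had cached stats for that column.
    Map<String, List<ColumnStatistics>> map = new HashMap<>();
    List<ColumnStatistics> perPart = new ArrayList<>();
    for (String partName : partNames) {
      ColumnStatisticsObj cached = lookup(partName, "customer_id");  // hypothetical
      if (cached == null) {
        continue;  // recorded via areAllPartsFound = false rather than aborting
      }
      ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tblName);
      desc.setPartName(partName);
      perPart.add(new ColumnStatistics(desc, Collections.singletonList(cached)));
    }
    map.put("customer_id", perPart);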