HIVE-10592: ORC file dump in JSON format (Prasanth Jayachandran reviewed by Gopal V)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/80fb8913 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/80fb8913 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/80fb8913 Branch: refs/heads/beeline-cli Commit: 80fb8913196eef8e4125544c3138b0c73be267b7 Parents: 93995c8 Author: Prasanth Jayachandran <j.prasant...@gmail.com> Authored: Wed May 6 18:52:17 2015 -0700 Committer: Prasanth Jayachandran <j.prasant...@gmail.com> Committed: Wed May 6 18:52:17 2015 -0700 ---------------------------------------------------------------------- bin/ext/orcfiledump.sh | 9 +- .../hive/ql/io/orc/ColumnStatisticsImpl.java | 16 +- .../apache/hadoop/hive/ql/io/orc/FileDump.java | 91 +- .../hadoop/hive/ql/io/orc/JsonFileDump.java | 365 +++++ .../hadoop/hive/ql/io/orc/TestJsonFileDump.java | 138 ++ ql/src/test/resources/orc-file-dump.json | 1354 ++++++++++++++++++ 6 files changed, 1929 insertions(+), 44 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/80fb8913/bin/ext/orcfiledump.sh ---------------------------------------------------------------------- diff --git a/bin/ext/orcfiledump.sh b/bin/ext/orcfiledump.sh index 752e437..6139de2 100644 --- a/bin/ext/orcfiledump.sh +++ b/bin/ext/orcfiledump.sh @@ -23,5 +23,12 @@ orcfiledump () { } orcfiledump_help () { - echo "usage ./hive orcfiledump [-d] [--rowindex <col_ids>] <path_to_file>" + echo "usage ./hive orcfiledump [-h] [-j] [-p] [-t] [-d] [-r <col_ids>] <path_to_file>" + echo "" + echo " --json (-j) Print metadata in JSON format" + echo " --pretty (-p) Pretty print json metadata output" + echo " --timezone (-t) Print writer's time zone" + echo " --data (-d) Should the data be printed" + echo " --rowindex (-r) <_col_ids_> Comma separated list of column ids for which row index should be printed" + echo " --help (-h) Print help message" } http://git-wip-us.apache.org/repos/asf/hive/blob/80fb8913/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java index 7cfbd81..ffba3c6 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/ColumnStatisticsImpl.java @@ -699,12 +699,18 @@ class ColumnStatisticsImpl implements ColumnStatistics { @Override public Date getMinimum() { + if (minimum == null) { + return null; + } minDate.set(minimum); return minDate.get(); } @Override public Date getMaximum() { + if (maximum == null) { + return null; + } maxDate.set(maximum); return maxDate.get(); } @@ -793,14 +799,12 @@ class ColumnStatisticsImpl implements ColumnStatistics { @Override public Timestamp getMinimum() { - Timestamp minTimestamp = new Timestamp(minimum); - return minTimestamp; + return minimum == null ? null : new Timestamp(minimum); } @Override public Timestamp getMaximum() { - Timestamp maxTimestamp = new Timestamp(maximum); - return maxTimestamp; + return maximum == null ? null : new Timestamp(maximum); } @Override @@ -808,9 +812,9 @@ class ColumnStatisticsImpl implements ColumnStatistics { StringBuilder buf = new StringBuilder(super.toString()); if (getNumberOfValues() != 0) { buf.append(" min: "); - buf.append(minimum); + buf.append(getMinimum()); buf.append(" max: "); - buf.append(maximum); + buf.append(getMaximum()); } return buf.toString(); } http://git-wip-us.apache.org/repos/asf/hive/blob/80fb8913/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java index cd4db75..33c4cd8 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. @@ -50,10 +50,11 @@ import org.codehaus.jettison.json.JSONWriter; * A tool for printing out the file structure of ORC files. */ public final class FileDump { - private static final String UNKNOWN = "UNKNOWN"; + public static final String UNKNOWN = "UNKNOWN"; // not used - private FileDump() {} + private FileDump() { + } public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); @@ -69,21 +70,28 @@ public final class FileDump { } boolean dumpData = cli.hasOption('d'); - if (cli.hasOption("rowindex")) { - String[] colStrs = cli.getOptionValue("rowindex").split(","); + if (cli.hasOption("r")) { + String[] colStrs = cli.getOptionValue("r").split(","); rowIndexCols = new ArrayList<Integer>(colStrs.length); for (String colStr : colStrs) { rowIndexCols.add(Integer.parseInt(colStr)); } } - boolean printTimeZone = false; - if (cli.hasOption('t')) { - printTimeZone = true; - } + boolean printTimeZone = cli.hasOption('t'); + boolean jsonFormat = cli.hasOption('j'); String[] files = cli.getArgs(); - if (dumpData) printData(Arrays.asList(files), conf); - else printMetaData(Arrays.asList(files), conf, rowIndexCols, printTimeZone); + if (dumpData) { + printData(Arrays.asList(files), conf); + } else { + if (jsonFormat) { + boolean prettyPrint = cli.hasOption('p'); + JsonFileDump.printJsonMetaData(Arrays.asList(files), conf, rowIndexCols, prettyPrint, + printTimeZone); + } else { + printMetaData(Arrays.asList(files), conf, rowIndexCols, printTimeZone); + } + } } private static void printData(List<String> files, Configuration conf) throws IOException, @@ -100,7 +108,7 @@ public final class FileDump { Path path = new Path(filename); Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); System.out.println("File Version: " + reader.getFileVersion().getName() + - " with " + reader.getWriterVersion()); + " with " + reader.getWriterVersion()); RecordReaderImpl rows = (RecordReaderImpl) reader.rows(); System.out.println("Rows: " + reader.getNumberOfRows()); System.out.println("Compression: " + reader.getCompression()); @@ -121,7 +129,7 @@ public final class FileDump { ColumnStatistics[] stats = reader.getStatistics(); int colCount = stats.length; System.out.println("\nFile Statistics:"); - for(int i=0; i < stats.length; ++i) { + for (int i = 0; i < stats.length; ++i) { System.out.println(" Column " + i + ": " + stats[i].toString()); } System.out.println("\nStripes:"); @@ -140,7 +148,7 @@ public final class FileDump { System.out.println(" Stripe: " + stripe.toString()); } long sectionStart = stripeStart; - for(OrcProto.Stream section: footer.getStreamsList()) { + for (OrcProto.Stream section : footer.getStreamsList()) { String kind = section.hasKind() ? section.getKind().name() : UNKNOWN; System.out.println(" Stream: column " + section.getColumn() + " section " + kind + " start: " + sectionStart + @@ -270,7 +278,7 @@ public final class FileDump { return buf.toString(); } - private static long getTotalPaddingSize(Reader reader) throws IOException { + public static long getTotalPaddingSize(Reader reader) throws IOException { long paddedBytes = 0; List<org.apache.hadoop.hive.ql.io.orc.StripeInformation> stripes = reader.getStripes(); for (int i = 1; i < stripes.size(); i++) { @@ -307,21 +315,30 @@ public final class FileDump { .withArgName("comma separated list of column ids for which row index should be printed") .withDescription("Dump stats for column number(s)") .hasArg() - .create()); + .create('r')); + + result.addOption(OptionBuilder + .withLongOpt("json") + .withDescription("Print metadata in JSON format") + .create('j')); + result.addOption(OptionBuilder + .withLongOpt("pretty") + .withDescription("Pretty print json metadata output") + .create('p')); return result; } private static void printMap(JSONWriter writer, - Map<Object, Object> obj, - List<OrcProto.Type> types, - OrcProto.Type type + Map<Object, Object> obj, + List<OrcProto.Type> types, + OrcProto.Type type ) throws IOException, JSONException { writer.array(); int keyType = type.getSubtypes(0); int valueType = type.getSubtypes(1); - for(Map.Entry<Object,Object> item: obj.entrySet()) { + for (Map.Entry<Object, Object> item : obj.entrySet()) { writer.object(); writer.key("_key"); printObject(writer, item.getKey(), types, keyType); @@ -333,34 +350,34 @@ public final class FileDump { } private static void printList(JSONWriter writer, - List<Object> obj, - List<OrcProto.Type> types, - OrcProto.Type type + List<Object> obj, + List<OrcProto.Type> types, + OrcProto.Type type ) throws IOException, JSONException { int subtype = type.getSubtypes(0); writer.array(); - for(Object item: obj) { + for (Object item : obj) { printObject(writer, item, types, subtype); } writer.endArray(); } private static void printUnion(JSONWriter writer, - OrcUnion obj, - List<OrcProto.Type> types, - OrcProto.Type type + OrcUnion obj, + List<OrcProto.Type> types, + OrcProto.Type type ) throws IOException, JSONException { int subtype = type.getSubtypes(obj.getTag()); printObject(writer, obj.getObject(), types, subtype); } static void printStruct(JSONWriter writer, - OrcStruct obj, - List<OrcProto.Type> types, - OrcProto.Type type) throws IOException, JSONException { + OrcStruct obj, + List<OrcProto.Type> types, + OrcProto.Type type) throws IOException, JSONException { writer.object(); List<Integer> fieldTypes = type.getSubtypesList(); - for(int i=0; i < fieldTypes.size(); ++i) { + for (int i = 0; i < fieldTypes.size(); ++i) { writer.key(type.getFieldNames(i)); printObject(writer, obj.getFieldValue(i), types, fieldTypes.get(i)); } @@ -368,9 +385,9 @@ public final class FileDump { } static void printObject(JSONWriter writer, - Object obj, - List<OrcProto.Type> types, - int typeId) throws IOException, JSONException { + Object obj, + List<OrcProto.Type> types, + int typeId) throws IOException, JSONException { OrcProto.Type type = types.get(typeId); if (obj == null) { writer.value(null); @@ -417,7 +434,7 @@ public final class FileDump { } static void printJsonData(Configuration conf, - String filename) throws IOException, JSONException { + String filename) throws IOException, JSONException { Path path = new Path(filename); Reader reader = OrcFile.createReader(path.getFileSystem(conf), path); OutputStreamWriter out = new OutputStreamWriter(System.out, "UTF-8"); http://git-wip-us.apache.org/repos/asf/hive/blob/80fb8913/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java new file mode 100644 index 0000000..c33004e --- /dev/null +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java @@ -0,0 +1,365 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * <p/> + * http://www.apache.org/licenses/LICENSE-2.0 + * <p/> + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.hadoop.hive.ql.io.orc; + +import java.io.IOException; +import java.util.List; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.ql.io.filters.BloomFilterIO; +import org.codehaus.jettison.json.JSONException; +import org.codehaus.jettison.json.JSONObject; +import org.codehaus.jettison.json.JSONArray; +import org.codehaus.jettison.json.JSONStringer; +import org.codehaus.jettison.json.JSONWriter; + +/** + * File dump tool with json formatted output. + */ +public class JsonFileDump { + + public static void printJsonMetaData(List<String> files, Configuration conf, + List<Integer> rowIndexCols, boolean prettyPrint, boolean printTimeZone) throws JSONException, IOException { + JSONStringer writer = new JSONStringer(); + boolean multiFile = files.size() > 1; + if (multiFile) { + writer.array(); + } else { + writer.object(); + } + for (String filename : files) { + if (multiFile) { + writer.object(); + } + writer.key("fileName").value(filename); + Path path = new Path(filename); + Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); + writer.key("fileVersion").value(reader.getFileVersion().getName()); + writer.key("writerVersion").value(reader.getWriterVersion()); + RecordReaderImpl rows = (RecordReaderImpl) reader.rows(); + writer.key("numberOfRows").value(reader.getNumberOfRows()); + writer.key("compression").value(reader.getCompression()); + if (reader.getCompression() != CompressionKind.NONE) { + writer.key("compressionBufferSize").value(reader.getCompressionSize()); + } + writer.key("schemaString").value(reader.getObjectInspector().getTypeName()); + writer.key("schema").array(); + writeSchema(writer, reader.getTypes()); + writer.endArray(); + + writer.key("stripeStatistics").array(); + Metadata metadata = reader.getMetadata(); + for (int n = 0; n < metadata.getStripeStatistics().size(); n++) { + writer.object(); + writer.key("stripeNumber").value(n + 1); + StripeStatistics ss = metadata.getStripeStatistics().get(n); + writer.key("columnStatistics").array(); + for (int i = 0; i < ss.getColumnStatistics().length; i++) { + writer.object(); + writer.key("columnId").value(i); + writeColumnStatistics(writer, ss.getColumnStatistics()[i]); + writer.endObject(); + } + writer.endArray(); + writer.endObject(); + } + writer.endArray(); + + ColumnStatistics[] stats = reader.getStatistics(); + int colCount = stats.length; + writer.key("fileStatistics").array(); + for (int i = 0; i < stats.length; ++i) { + writer.object(); + writer.key("columnId").value(i); + writeColumnStatistics(writer, stats[i]); + writer.endObject(); + } + writer.endArray(); + + writer.key("stripes").array(); + int stripeIx = -1; + for (StripeInformation stripe : reader.getStripes()) { + ++stripeIx; + long stripeStart = stripe.getOffset(); + OrcProto.StripeFooter footer = rows.readStripeFooter(stripe); + writer.object(); // start of stripe information + writer.key("stripeNumber").value(stripeIx + 1); + writer.key("stripeInformation"); + writeStripeInformation(writer, stripe); + if (printTimeZone) { + writer.key("writerTimezone").value( + footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN); + } + long sectionStart = stripeStart; + + writer.key("streams").array(); + for (OrcProto.Stream section : footer.getStreamsList()) { + writer.object(); + String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN; + writer.key("columnId").value(section.getColumn()); + writer.key("section").value(kind); + writer.key("startOffset").value(sectionStart); + writer.key("length").value(section.getLength()); + sectionStart += section.getLength(); + writer.endObject(); + } + writer.endArray(); + + writer.key("encodings").array(); + for (int i = 0; i < footer.getColumnsCount(); ++i) { + writer.object(); + OrcProto.ColumnEncoding encoding = footer.getColumns(i); + writer.key("columnId").value(i); + writer.key("kind").value(encoding.getKind()); + if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY || + encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) { + writer.key("dictionarySize").value(encoding.getDictionarySize()); + } + writer.endObject(); + } + writer.endArray(); + + if (rowIndexCols != null && !rowIndexCols.isEmpty()) { + // include the columns that are specified, only if the columns are included, bloom filter + // will be read + boolean[] sargColumns = new boolean[colCount]; + for (int colIdx : rowIndexCols) { + sargColumns[colIdx] = true; + } + RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns); + writer.key("indexes").array(); + for (int col : rowIndexCols) { + writer.object(); + writer.key("columnId").value(col); + writeRowGroupIndexes(writer, col, indices.getRowGroupIndex()); + writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex()); + writer.endObject(); + } + writer.endArray(); + } + writer.endObject(); // end of stripe information + } + writer.endArray(); + + FileSystem fs = path.getFileSystem(conf); + long fileLen = fs.getContentSummary(path).getLength(); + long paddedBytes = FileDump.getTotalPaddingSize(reader); + // empty ORC file is ~45 bytes. Assumption here is file length always >0 + double percentPadding = ((double) paddedBytes / (double) fileLen) * 100; + writer.key("fileLength").value(fileLen); + writer.key("paddingLength").value(paddedBytes); + writer.key("paddingRatio").value(percentPadding); + rows.close(); + + writer.endObject(); + } + if (multiFile) { + writer.endArray(); + } + + if (prettyPrint) { + final String prettyJson; + if (multiFile) { + JSONArray jsonArray = new JSONArray(writer.toString()); + prettyJson = jsonArray.toString(2); + } else { + JSONObject jsonObject = new JSONObject(writer.toString()); + prettyJson = jsonObject.toString(2); + } + System.out.println(prettyJson); + } else { + System.out.println(writer.toString()); + } + } + + private static void writeSchema(JSONStringer writer, List<OrcProto.Type> types) + throws JSONException { + int i = 0; + for(OrcProto.Type type : types) { + writer.object(); + writer.key("columnId").value(i++); + writer.key("columnType").value(type.getKind()); + if (type.getFieldNamesCount() > 0) { + writer.key("childColumnNames").array(); + for (String field : type.getFieldNamesList()) { + writer.value(field); + } + writer.endArray(); + writer.key("childColumnIds").array(); + for (Integer colId : type.getSubtypesList()) { + writer.value(colId); + } + writer.endArray(); + } + if (type.hasPrecision()) { + writer.key("precision").value(type.getPrecision()); + } + + if (type.hasScale()) { + writer.key("scale").value(type.getScale()); + } + + if (type.hasMaximumLength()) { + writer.key("maxLength").value(type.getMaximumLength()); + } + writer.endObject(); + } + } + + private static void writeStripeInformation(JSONWriter writer, StripeInformation stripe) + throws JSONException { + writer.object(); + writer.key("offset").value(stripe.getOffset()); + writer.key("indexLength").value(stripe.getIndexLength()); + writer.key("dataLength").value(stripe.getDataLength()); + writer.key("footerLength").value(stripe.getFooterLength()); + writer.key("rowCount").value(stripe.getNumberOfRows()); + writer.endObject(); + } + + private static void writeColumnStatistics(JSONWriter writer, ColumnStatistics cs) + throws JSONException { + if (cs != null) { + writer.key("count").value(cs.getNumberOfValues()); + writer.key("hasNull").value(cs.hasNull()); + if (cs instanceof BinaryColumnStatistics) { + writer.key("totalLength").value(((BinaryColumnStatistics) cs).getSum()); + writer.key("type").value(OrcProto.Type.Kind.BINARY); + } else if (cs instanceof BooleanColumnStatistics) { + writer.key("trueCount").value(((BooleanColumnStatistics) cs).getTrueCount()); + writer.key("falseCount").value(((BooleanColumnStatistics) cs).getFalseCount()); + writer.key("type").value(OrcProto.Type.Kind.BOOLEAN); + } else if (cs instanceof IntegerColumnStatistics) { + writer.key("min").value(((IntegerColumnStatistics) cs).getMinimum()); + writer.key("max").value(((IntegerColumnStatistics) cs).getMaximum()); + if (((IntegerColumnStatistics) cs).isSumDefined()) { + writer.key("sum").value(((IntegerColumnStatistics) cs).getSum()); + } + writer.key("type").value(OrcProto.Type.Kind.LONG); + } else if (cs instanceof DoubleColumnStatistics) { + writer.key("min").value(((DoubleColumnStatistics) cs).getMinimum()); + writer.key("max").value(((DoubleColumnStatistics) cs).getMaximum()); + writer.key("sum").value(((DoubleColumnStatistics) cs).getSum()); + writer.key("type").value(OrcProto.Type.Kind.DOUBLE); + } else if (cs instanceof StringColumnStatistics) { + writer.key("min").value(((StringColumnStatistics) cs).getMinimum()); + writer.key("max").value(((StringColumnStatistics) cs).getMaximum()); + writer.key("totalLength").value(((StringColumnStatistics) cs).getSum()); + writer.key("type").value(OrcProto.Type.Kind.STRING); + } else if (cs instanceof DateColumnStatistics) { + if (((DateColumnStatistics) cs).getMaximum() != null) { + writer.key("min").value(((DateColumnStatistics) cs).getMinimum()); + writer.key("max").value(((DateColumnStatistics) cs).getMaximum()); + } + writer.key("type").value(OrcProto.Type.Kind.DATE); + } else if (cs instanceof TimestampColumnStatistics) { + if (((TimestampColumnStatistics) cs).getMaximum() != null) { + writer.key("min").value(((TimestampColumnStatistics) cs).getMinimum()); + writer.key("max").value(((TimestampColumnStatistics) cs).getMaximum()); + } + writer.key("type").value(OrcProto.Type.Kind.TIMESTAMP); + } else if (cs instanceof DecimalColumnStatistics) { + if (((DecimalColumnStatistics) cs).getMaximum() != null) { + writer.key("min").value(((DecimalColumnStatistics) cs).getMinimum()); + writer.key("max").value(((DecimalColumnStatistics) cs).getMaximum()); + writer.key("sum").value(((DecimalColumnStatistics) cs).getSum()); + } + writer.key("type").value(OrcProto.Type.Kind.DECIMAL); + } + } + } + + private static void writeBloomFilterIndexes(JSONWriter writer, int col, + OrcProto.BloomFilterIndex[] bloomFilterIndex) throws JSONException { + + BloomFilterIO stripeLevelBF = null; + if (bloomFilterIndex != null && bloomFilterIndex[col] != null) { + int entryIx = 0; + writer.key("bloomFilterIndexes").array(); + for (OrcProto.BloomFilter bf : bloomFilterIndex[col].getBloomFilterList()) { + writer.object(); + writer.key("entryId").value(entryIx++); + BloomFilterIO toMerge = new BloomFilterIO(bf); + writeBloomFilterStats(writer, toMerge); + if (stripeLevelBF == null) { + stripeLevelBF = toMerge; + } else { + stripeLevelBF.merge(toMerge); + } + writer.endObject(); + } + writer.endArray(); + } + if (stripeLevelBF != null) { + writer.key("stripeLevelBloomFilter"); + writer.object(); + writeBloomFilterStats(writer, stripeLevelBF); + writer.endObject(); + } + } + + private static void writeBloomFilterStats(JSONWriter writer, BloomFilterIO bf) + throws JSONException { + int bitCount = bf.getBitSize(); + int popCount = 0; + for (long l : bf.getBitSet()) { + popCount += Long.bitCount(l); + } + int k = bf.getNumHashFunctions(); + float loadFactor = (float) popCount / (float) bitCount; + float expectedFpp = (float) Math.pow(loadFactor, k); + writer.key("numHashFunctions").value(k); + writer.key("bitCount").value(bitCount); + writer.key("popCount").value(popCount); + writer.key("loadFactor").value(loadFactor); + writer.key("expectedFpp").value(expectedFpp); + } + + private static void writeRowGroupIndexes(JSONWriter writer, int col, + OrcProto.RowIndex[] rowGroupIndex) + throws JSONException { + + OrcProto.RowIndex index; + if (rowGroupIndex == null || (col >= rowGroupIndex.length) || + ((index = rowGroupIndex[col]) == null)) { + return; + } + + writer.key("rowGroupIndexes").array(); + for (int entryIx = 0; entryIx < index.getEntryCount(); ++entryIx) { + writer.object(); + writer.key("entryId").value(entryIx); + OrcProto.RowIndexEntry entry = index.getEntry(entryIx); + if (entry == null) { + continue; + } + OrcProto.ColumnStatistics colStats = entry.getStatistics(); + writeColumnStatistics(writer, ColumnStatisticsImpl.deserialize(colStats)); + writer.key("positions").array(); + for (int posIx = 0; posIx < entry.getPositionsCount(); ++posIx) { + writer.value(entry.getPositions(posIx)); + } + writer.endArray(); + writer.endObject(); + } + writer.endArray(); + } + +} http://git-wip-us.apache.org/repos/asf/hive/blob/80fb8913/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java ---------------------------------------------------------------------- diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java new file mode 100644 index 0000000..d17c528 --- /dev/null +++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestJsonFileDump.java @@ -0,0 +1,138 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.hadoop.hive.ql.io.orc; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNull; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileOutputStream; +import java.io.FileReader; +import java.io.PrintStream; +import java.util.Random; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory; +import org.apache.hive.common.util.HiveTestUtils; +import org.junit.Before; +import org.junit.Test; + +public class TestJsonFileDump { + + Path workDir = new Path(System.getProperty("test.tmp.dir")); + Configuration conf; + FileSystem fs; + Path testFilePath; + + @Before + public void openFileSystem () throws Exception { + conf = new Configuration(); + fs = FileSystem.getLocal(conf); + fs.setWorkingDirectory(workDir); + testFilePath = new Path("TestFileDump.testDump.orc"); + fs.delete(testFilePath, false); + } + + static class MyRecord { + int i; + long l; + String s; + MyRecord(int i, long l, String s) { + this.i = i; + this.l = l; + this.s = s; + } + } + + static void checkOutput(String expected, + String actual) throws Exception { + BufferedReader eStream = + new BufferedReader(new FileReader(HiveTestUtils.getFileFromClasspath(expected))); + BufferedReader aStream = + new BufferedReader(new FileReader(actual)); + String expectedLine = eStream.readLine(); + while (expectedLine != null) { + String actualLine = aStream.readLine(); + System.out.println("actual: " + actualLine); + System.out.println("expected: " + expectedLine); + assertEquals(expectedLine, actualLine); + expectedLine = eStream.readLine(); + } + assertNull(eStream.readLine()); + assertNull(aStream.readLine()); + } + + @Test + public void testJsonDump() throws Exception { + ObjectInspector inspector; + synchronized (TestOrcFile.class) { + inspector = ObjectInspectorFactory.getReflectionObjectInspector + (MyRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); + } + conf.set(HiveConf.ConfVars.HIVE_ORC_ENCODING_STRATEGY.varname, "COMPRESSION"); + OrcFile.WriterOptions options = OrcFile.writerOptions(conf) + .fileSystem(fs) + .inspector(inspector) + .stripeSize(100000) + .compress(CompressionKind.ZLIB) + .bufferSize(10000) + .rowIndexStride(1000) + .bloomFilterColumns("s"); + Writer writer = OrcFile.createWriter(testFilePath, options); + Random r1 = new Random(1); + String[] words = new String[]{"It", "was", "the", "best", "of", "times,", + "it", "was", "the", "worst", "of", "times,", "it", "was", "the", "age", + "of", "wisdom,", "it", "was", "the", "age", "of", "foolishness,", "it", + "was", "the", "epoch", "of", "belief,", "it", "was", "the", "epoch", + "of", "incredulity,", "it", "was", "the", "season", "of", "Light,", + "it", "was", "the", "season", "of", "Darkness,", "it", "was", "the", + "spring", "of", "hope,", "it", "was", "the", "winter", "of", "despair,", + "we", "had", "everything", "before", "us,", "we", "had", "nothing", + "before", "us,", "we", "were", "all", "going", "direct", "to", + "Heaven,", "we", "were", "all", "going", "direct", "the", "other", + "way"}; + for(int i=0; i < 21000; ++i) { + if (i % 100 == 0) { + writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), null)); + } else { + writer.addRow(new MyRecord(r1.nextInt(), r1.nextLong(), + words[r1.nextInt(words.length)])); + } + } + + writer.close(); + PrintStream origOut = System.out; + String outputFilename = "orc-file-dump.json"; + FileOutputStream myOut = new FileOutputStream(workDir + File.separator + outputFilename); + + // replace stdout and run command + System.setOut(new PrintStream(myOut)); + FileDump.main(new String[]{testFilePath.toString(), "-j", "-p", "--rowindex=3"}); + System.out.flush(); + System.setOut(origOut); + + + checkOutput(outputFilename, workDir + File.separator + outputFilename); + } +} http://git-wip-us.apache.org/repos/asf/hive/blob/80fb8913/ql/src/test/resources/orc-file-dump.json ---------------------------------------------------------------------- diff --git a/ql/src/test/resources/orc-file-dump.json b/ql/src/test/resources/orc-file-dump.json new file mode 100644 index 0000000..125a32e --- /dev/null +++ b/ql/src/test/resources/orc-file-dump.json @@ -0,0 +1,1354 @@ +{ + "fileName": "TestFileDump.testDump.orc", + "fileVersion": "0.12", + "writerVersion": "HIVE_8732", + "numberOfRows": 21000, + "compression": "ZLIB", + "compressionBufferSize": 10000, + "schemaString": "struct<i:int,l:bigint,s:string>", + "schema": [ + { + "columnId": 0, + "columnType": "STRUCT", + "childColumnNames": [ + "i", + "l", + "s" + ], + "childColumnIds": [ + 1, + 2, + 3 + ] + }, + { + "columnId": 1, + "columnType": "INT" + }, + { + "columnId": 2, + "columnType": "LONG" + }, + { + "columnId": 3, + "columnType": "STRING" + } + ], + "stripeStatistics": [ + { + "stripeNumber": 1, + "columnStatistics": [ + { + "columnId": 0, + "count": 5000, + "hasNull": false + }, + { + "columnId": 1, + "count": 5000, + "hasNull": false, + "min": -2147115959, + "max": 2145210552, + "sum": 50111854553, + "type": "LONG" + }, + { + "columnId": 2, + "count": 5000, + "hasNull": false, + "min": -9223180583305557329, + "max": 9221614132680747961, + "type": "LONG" + }, + { + "columnId": 3, + "count": 4950, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 19283, + "type": "STRING" + } + ] + }, + { + "stripeNumber": 2, + "columnStatistics": [ + { + "columnId": 0, + "count": 5000, + "hasNull": false + }, + { + "columnId": 1, + "count": 5000, + "hasNull": false, + "min": -2147390285, + "max": 2147224606, + "sum": -22290798217, + "type": "LONG" + }, + { + "columnId": 2, + "count": 5000, + "hasNull": false, + "min": -9219295160509160427, + "max": 9217571024994660020, + "type": "LONG" + }, + { + "columnId": 3, + "count": 4950, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 19397, + "type": "STRING" + } + ] + }, + { + "stripeNumber": 3, + "columnStatistics": [ + { + "columnId": 0, + "count": 5000, + "hasNull": false + }, + { + "columnId": 1, + "count": 5000, + "hasNull": false, + "min": -2146954065, + "max": 2146722468, + "sum": 20639652136, + "type": "LONG" + }, + { + "columnId": 2, + "count": 5000, + "hasNull": false, + "min": -9214076359988107846, + "max": 9222919052987871506, + "type": "LONG" + }, + { + "columnId": 3, + "count": 4950, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 19031, + "type": "STRING" + } + ] + }, + { + "stripeNumber": 4, + "columnStatistics": [ + { + "columnId": 0, + "count": 5000, + "hasNull": false + }, + { + "columnId": 1, + "count": 5000, + "hasNull": false, + "min": -2146969085, + "max": 2146025044, + "sum": -5156814387, + "type": "LONG" + }, + { + "columnId": 2, + "count": 5000, + "hasNull": false, + "min": -9222731174895935707, + "max": 9220625004936875965, + "type": "LONG" + }, + { + "columnId": 3, + "count": 4950, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 19459, + "type": "STRING" + } + ] + }, + { + "stripeNumber": 5, + "columnStatistics": [ + { + "columnId": 0, + "count": 1000, + "hasNull": false + }, + { + "columnId": 1, + "count": 1000, + "hasNull": false, + "min": -2144303438, + "max": 2127599049, + "sum": 62841564778, + "type": "LONG" + }, + { + "columnId": 2, + "count": 1000, + "hasNull": false, + "min": -9195133638801798919, + "max": 9218626063131504414, + "type": "LONG" + }, + { + "columnId": 3, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3963, + "type": "STRING" + } + ] + } + ], + "fileStatistics": [ + { + "columnId": 0, + "count": 21000, + "hasNull": false + }, + { + "columnId": 1, + "count": 21000, + "hasNull": false, + "min": -2147390285, + "max": 2147224606, + "sum": 106145458863, + "type": "LONG" + }, + { + "columnId": 2, + "count": 21000, + "hasNull": false, + "min": -9223180583305557329, + "max": 9222919052987871506, + "type": "LONG" + }, + { + "columnId": 3, + "count": 20790, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 81133, + "type": "STRING" + } + ], + "stripes": [ + { + "stripeNumber": 1, + "stripeInformation": { + "offset": 3, + "indexLength": 863, + "dataLength": 63749, + "footerLength": 103, + "rowCount": 5000 + }, + "streams": [ + { + "columnId": 0, + "section": "ROW_INDEX", + "startOffset": 3, + "length": 17 + }, + { + "columnId": 1, + "section": "ROW_INDEX", + "startOffset": 20, + "length": 165 + }, + { + "columnId": 2, + "section": "ROW_INDEX", + "startOffset": 185, + "length": 174 + }, + { + "columnId": 3, + "section": "ROW_INDEX", + "startOffset": 359, + "length": 103 + }, + { + "columnId": 3, + "section": "BLOOM_FILTER", + "startOffset": 462, + "length": 404 + }, + { + "columnId": 1, + "section": "DATA", + "startOffset": 866, + "length": 20029 + }, + { + "columnId": 2, + "section": "DATA", + "startOffset": 20895, + "length": 40035 + }, + { + "columnId": 3, + "section": "PRESENT", + "startOffset": 60930, + "length": 17 + }, + { + "columnId": 3, + "section": "DATA", + "startOffset": 60947, + "length": 3510 + }, + { + "columnId": 3, + "section": "LENGTH", + "startOffset": 64457, + "length": 25 + }, + { + "columnId": 3, + "section": "DICTIONARY_DATA", + "startOffset": 64482, + "length": 133 + } + ], + "encodings": [ + { + "columnId": 0, + "kind": "DIRECT" + }, + { + "columnId": 1, + "kind": "DIRECT_V2" + }, + { + "columnId": 2, + "kind": "DIRECT_V2" + }, + { + "columnId": 3, + "kind": "DICTIONARY_V2", + "dictionarySize": 35 + } + ], + "indexes": [{ + "columnId": 3, + "rowGroupIndexes": [ + { + "entryId": 0, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3873, + "type": "STRING", + "positions": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }, + { + "entryId": 1, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3861, + "type": "STRING", + "positions": [ + 0, + 38, + 12, + 0, + 0, + 736, + 23 + ] + }, + { + "entryId": 2, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3946, + "type": "STRING", + "positions": [ + 0, + 78, + 12, + 0, + 0, + 1473, + 43 + ] + }, + { + "entryId": 3, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3774, + "type": "STRING", + "positions": [ + 0, + 118, + 12, + 0, + 0, + 2067, + 261 + ] + }, + { + "entryId": 4, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3829, + "type": "STRING", + "positions": [ + 0, + 158, + 12, + 0, + 0, + 2992, + 35 + ] + } + ], + "bloomFilterIndexes": [ + { + "entryId": 0, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 1, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 2, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 3, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 4, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + ], + "stripeLevelBloomFilter": { + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + }] + }, + { + "stripeNumber": 2, + "stripeInformation": { + "offset": 64718, + "indexLength": 854, + "dataLength": 63742, + "footerLength": 103, + "rowCount": 5000 + }, + "streams": [ + { + "columnId": 0, + "section": "ROW_INDEX", + "startOffset": 64718, + "length": 17 + }, + { + "columnId": 1, + "section": "ROW_INDEX", + "startOffset": 64735, + "length": 164 + }, + { + "columnId": 2, + "section": "ROW_INDEX", + "startOffset": 64899, + "length": 169 + }, + { + "columnId": 3, + "section": "ROW_INDEX", + "startOffset": 65068, + "length": 100 + }, + { + "columnId": 3, + "section": "BLOOM_FILTER", + "startOffset": 65168, + "length": 404 + }, + { + "columnId": 1, + "section": "DATA", + "startOffset": 65572, + "length": 20029 + }, + { + "columnId": 2, + "section": "DATA", + "startOffset": 85601, + "length": 40035 + }, + { + "columnId": 3, + "section": "PRESENT", + "startOffset": 125636, + "length": 17 + }, + { + "columnId": 3, + "section": "DATA", + "startOffset": 125653, + "length": 3503 + }, + { + "columnId": 3, + "section": "LENGTH", + "startOffset": 129156, + "length": 25 + }, + { + "columnId": 3, + "section": "DICTIONARY_DATA", + "startOffset": 129181, + "length": 133 + } + ], + "encodings": [ + { + "columnId": 0, + "kind": "DIRECT" + }, + { + "columnId": 1, + "kind": "DIRECT_V2" + }, + { + "columnId": 2, + "kind": "DIRECT_V2" + }, + { + "columnId": 3, + "kind": "DICTIONARY_V2", + "dictionarySize": 35 + } + ], + "indexes": [{ + "columnId": 3, + "rowGroupIndexes": [ + { + "entryId": 0, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3946, + "type": "STRING", + "positions": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }, + { + "entryId": 1, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3836, + "type": "STRING", + "positions": [ + 0, + 38, + 12, + 0, + 0, + 746, + 11 + ] + }, + { + "entryId": 2, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3791, + "type": "STRING", + "positions": [ + 0, + 78, + 12, + 0, + 0, + 1430, + 95 + ] + }, + { + "entryId": 3, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3904, + "type": "STRING", + "positions": [ + 0, + 118, + 12, + 0, + 0, + 2239, + 23 + ] + }, + { + "entryId": 4, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3920, + "type": "STRING", + "positions": [ + 0, + 158, + 12, + 0, + 0, + 2994, + 17 + ] + } + ], + "bloomFilterIndexes": [ + { + "entryId": 0, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 1, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 2, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 3, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 4, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + ], + "stripeLevelBloomFilter": { + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + }] + }, + { + "stripeNumber": 3, + "stripeInformation": { + "offset": 129417, + "indexLength": 853, + "dataLength": 63749, + "footerLength": 103, + "rowCount": 5000 + }, + "streams": [ + { + "columnId": 0, + "section": "ROW_INDEX", + "startOffset": 129417, + "length": 17 + }, + { + "columnId": 1, + "section": "ROW_INDEX", + "startOffset": 129434, + "length": 160 + }, + { + "columnId": 2, + "section": "ROW_INDEX", + "startOffset": 129594, + "length": 170 + }, + { + "columnId": 3, + "section": "ROW_INDEX", + "startOffset": 129764, + "length": 102 + }, + { + "columnId": 3, + "section": "BLOOM_FILTER", + "startOffset": 129866, + "length": 404 + }, + { + "columnId": 1, + "section": "DATA", + "startOffset": 130270, + "length": 20029 + }, + { + "columnId": 2, + "section": "DATA", + "startOffset": 150299, + "length": 40035 + }, + { + "columnId": 3, + "section": "PRESENT", + "startOffset": 190334, + "length": 17 + }, + { + "columnId": 3, + "section": "DATA", + "startOffset": 190351, + "length": 3510 + }, + { + "columnId": 3, + "section": "LENGTH", + "startOffset": 193861, + "length": 25 + }, + { + "columnId": 3, + "section": "DICTIONARY_DATA", + "startOffset": 193886, + "length": 133 + } + ], + "encodings": [ + { + "columnId": 0, + "kind": "DIRECT" + }, + { + "columnId": 1, + "kind": "DIRECT_V2" + }, + { + "columnId": 2, + "kind": "DIRECT_V2" + }, + { + "columnId": 3, + "kind": "DICTIONARY_V2", + "dictionarySize": 35 + } + ], + "indexes": [{ + "columnId": 3, + "rowGroupIndexes": [ + { + "entryId": 0, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3829, + "type": "STRING", + "positions": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }, + { + "entryId": 1, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3853, + "type": "STRING", + "positions": [ + 0, + 38, + 12, + 0, + 0, + 698, + 74 + ] + }, + { + "entryId": 2, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3796, + "type": "STRING", + "positions": [ + 0, + 78, + 12, + 0, + 0, + 1483, + 39 + ] + }, + { + "entryId": 3, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3736, + "type": "STRING", + "positions": [ + 0, + 118, + 12, + 0, + 0, + 2148, + 155 + ] + }, + { + "entryId": 4, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3817, + "type": "STRING", + "positions": [ + 0, + 158, + 12, + 0, + 0, + 3018, + 8 + ] + } + ], + "bloomFilterIndexes": [ + { + "entryId": 0, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 1, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 2, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 3, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 4, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + ], + "stripeLevelBloomFilter": { + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + }] + }, + { + "stripeNumber": 4, + "stripeInformation": { + "offset": 194122, + "indexLength": 866, + "dataLength": 63735, + "footerLength": 103, + "rowCount": 5000 + }, + "streams": [ + { + "columnId": 0, + "section": "ROW_INDEX", + "startOffset": 194122, + "length": 17 + }, + { + "columnId": 1, + "section": "ROW_INDEX", + "startOffset": 194139, + "length": 164 + }, + { + "columnId": 2, + "section": "ROW_INDEX", + "startOffset": 194303, + "length": 174 + }, + { + "columnId": 3, + "section": "ROW_INDEX", + "startOffset": 194477, + "length": 107 + }, + { + "columnId": 3, + "section": "BLOOM_FILTER", + "startOffset": 194584, + "length": 404 + }, + { + "columnId": 1, + "section": "DATA", + "startOffset": 194988, + "length": 20029 + }, + { + "columnId": 2, + "section": "DATA", + "startOffset": 215017, + "length": 40035 + }, + { + "columnId": 3, + "section": "PRESENT", + "startOffset": 255052, + "length": 17 + }, + { + "columnId": 3, + "section": "DATA", + "startOffset": 255069, + "length": 3496 + }, + { + "columnId": 3, + "section": "LENGTH", + "startOffset": 258565, + "length": 25 + }, + { + "columnId": 3, + "section": "DICTIONARY_DATA", + "startOffset": 258590, + "length": 133 + } + ], + "encodings": [ + { + "columnId": 0, + "kind": "DIRECT" + }, + { + "columnId": 1, + "kind": "DIRECT_V2" + }, + { + "columnId": 2, + "kind": "DIRECT_V2" + }, + { + "columnId": 3, + "kind": "DICTIONARY_V2", + "dictionarySize": 35 + } + ], + "indexes": [{ + "columnId": 3, + "rowGroupIndexes": [ + { + "entryId": 0, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3959, + "type": "STRING", + "positions": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }, + { + "entryId": 1, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3816, + "type": "STRING", + "positions": [ + 0, + 38, + 12, + 0, + 0, + 495, + 338 + ] + }, + { + "entryId": 2, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3883, + "type": "STRING", + "positions": [ + 0, + 78, + 12, + 0, + 0, + 1449, + 71 + ] + }, + { + "entryId": 3, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3938, + "type": "STRING", + "positions": [ + 0, + 118, + 12, + 0, + 0, + 2207, + 59 + ] + }, + { + "entryId": 4, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3863, + "type": "STRING", + "positions": [ + 0, + 158, + 12, + 0, + 0, + 2838, + 223 + ] + } + ], + "bloomFilterIndexes": [ + { + "entryId": 0, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 1, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 2, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 3, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }, + { + "entryId": 4, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + ], + "stripeLevelBloomFilter": { + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + }] + }, + { + "stripeNumber": 5, + "stripeInformation": { + "offset": 258826, + "indexLength": 433, + "dataLength": 12940, + "footerLength": 95, + "rowCount": 1000 + }, + "streams": [ + { + "columnId": 0, + "section": "ROW_INDEX", + "startOffset": 258826, + "length": 12 + }, + { + "columnId": 1, + "section": "ROW_INDEX", + "startOffset": 258838, + "length": 38 + }, + { + "columnId": 2, + "section": "ROW_INDEX", + "startOffset": 258876, + "length": 41 + }, + { + "columnId": 3, + "section": "ROW_INDEX", + "startOffset": 258917, + "length": 41 + }, + { + "columnId": 3, + "section": "BLOOM_FILTER", + "startOffset": 258958, + "length": 301 + }, + { + "columnId": 1, + "section": "DATA", + "startOffset": 259259, + "length": 4007 + }, + { + "columnId": 2, + "section": "DATA", + "startOffset": 263266, + "length": 8007 + }, + { + "columnId": 3, + "section": "PRESENT", + "startOffset": 271273, + "length": 16 + }, + { + "columnId": 3, + "section": "DATA", + "startOffset": 271289, + "length": 752 + }, + { + "columnId": 3, + "section": "LENGTH", + "startOffset": 272041, + "length": 25 + }, + { + "columnId": 3, + "section": "DICTIONARY_DATA", + "startOffset": 272066, + "length": 133 + } + ], + "encodings": [ + { + "columnId": 0, + "kind": "DIRECT" + }, + { + "columnId": 1, + "kind": "DIRECT_V2" + }, + { + "columnId": 2, + "kind": "DIRECT_V2" + }, + { + "columnId": 3, + "kind": "DICTIONARY_V2", + "dictionarySize": 35 + } + ], + "indexes": [{ + "columnId": 3, + "rowGroupIndexes": [{ + "entryId": 0, + "count": 990, + "hasNull": true, + "min": "Darkness,", + "max": "worst", + "totalLength": 3963, + "type": "STRING", + "positions": [ + 0, + 0, + 0, + 0, + 0, + 0, + 0 + ] + }], + "bloomFilterIndexes": [{ + "entryId": 0, + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + }], + "stripeLevelBloomFilter": { + "numHashFunctions": 4, + "bitCount": 6272, + "popCount": 138, + "loadFactor": 0.022002551704645157, + "expectedFpp": 2.3436470542037569E-7 + } + }] + } + ], + "fileLength": 272842, + "paddingLength": 0, + "paddingRatio": 0 +}