HIVE-11523: org.apache.hadoop.hive.ql.io.orc.FileDump should handle errors (Prasanth Jayachandran reviewed by Ashutosh Chauhan)
Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/d84e393e
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/d84e393e
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/d84e393e

Branch: refs/heads/spark
Commit: d84e393ed66829fe0c8cc87254fef2a329b96163
Parents: a91e147
Author: Prasanth Jayachandran <j.prasant...@gmail.com>
Authored: Mon Oct 26 14:18:36 2015 -0500
Committer: Prasanth Jayachandran <j.prasant...@gmail.com>
Committed: Mon Oct 26 14:18:36 2015 -0500

----------------------------------------------------------------------
 .../apache/hadoop/hive/ql/io/orc/FileDump.java  | 193 +++++++++--------
 .../hadoop/hive/ql/io/orc/JsonFileDump.java     | 210 ++++++++++---------
 .../hadoop/hive/ql/io/orc/TestFileDump.java     |  50 -----
 3 files changed, 213 insertions(+), 240 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/d84e393e/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
index a1c5058..9c6538f 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
@@ -136,9 +136,16 @@ public final class FileDump {
   private static void printData(List<String> files, Configuration conf) throws IOException,
       JSONException {
     for (String file : files) {
-      printJsonData(conf, file);
-      if (files.size() > 1) {
-        System.out.println(Strings.repeat("=", 80) + "\n");
+      try {
+        printJsonData(conf, file);
+        if (files.size() > 1) {
+          System.out.println(Strings.repeat("=", 80) + "\n");
+        }
+      } catch (Exception e) {
+        System.err.println("Unable to dump data for file: " + file);
+        e.printStackTrace();
+        System.err.println(Strings.repeat("=", 80) + "\n");
+        continue;
       }
     }
   }
@@ -146,103 +153,111 @@ public final class FileDump {
   private static void printMetaData(List<String> files, Configuration conf,
       List<Integer> rowIndexCols, boolean printTimeZone) throws IOException {
     for (String filename : files) {
-      System.out.println("Structure for " + filename);
-      Path path = new Path(filename);
-      Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
-      System.out.println("File Version: " + reader.getFileVersion().getName() +
-          " with " + reader.getWriterVersion());
-      RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
-      System.out.println("Rows: " + reader.getNumberOfRows());
-      System.out.println("Compression: " + reader.getCompression());
-      if (reader.getCompression() != CompressionKind.NONE) {
-        System.out.println("Compression size: " + reader.getCompressionSize());
-      }
-      System.out.println("Type: " + reader.getObjectInspector().getTypeName());
-      System.out.println("\nStripe Statistics:");
-      List<StripeStatistics> stripeStats = reader.getStripeStatistics();
-      for (int n = 0; n < stripeStats.size(); n++) {
-        System.out.println("  Stripe " + (n + 1) + ":");
-        StripeStatistics ss = stripeStats.get(n);
-        for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
-          System.out.println("    Column " + i + ": " +
-              ss.getColumnStatistics()[i].toString());
+      try {
+        Path path = new Path(filename);
+        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+        System.out.println("Structure for " + filename);
+        System.out.println("File Version: " + reader.getFileVersion().getName() +
+            " with " + reader.getWriterVersion());
+        RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+        System.out.println("Rows: " + reader.getNumberOfRows());
+        System.out.println("Compression: " + reader.getCompression());
+        if (reader.getCompression() != CompressionKind.NONE) {
+          System.out.println("Compression size: " + reader.getCompressionSize());
         }
-      }
-      ColumnStatistics[] stats = reader.getStatistics();
-      int colCount = stats.length;
-      System.out.println("\nFile Statistics:");
-      for (int i = 0; i < stats.length; ++i) {
-        System.out.println("  Column " + i + ": " + stats[i].toString());
-      }
-      System.out.println("\nStripes:");
-      int stripeIx = -1;
-      for (StripeInformation stripe : reader.getStripes()) {
-        ++stripeIx;
-        long stripeStart = stripe.getOffset();
-        OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
-        if (printTimeZone) {
-          String tz = footer.getWriterTimezone();
-          if (tz == null || tz.isEmpty()) {
-            tz = UNKNOWN;
+        System.out.println("Type: " + reader.getObjectInspector().getTypeName());
+        System.out.println("\nStripe Statistics:");
+        List<StripeStatistics> stripeStats = reader.getStripeStatistics();
+        for (int n = 0; n < stripeStats.size(); n++) {
+          System.out.println("  Stripe " + (n + 1) + ":");
+          StripeStatistics ss = stripeStats.get(n);
+          for (int i = 0; i < ss.getColumnStatistics().length; ++i) {
+            System.out.println("    Column " + i + ": " +
+                ss.getColumnStatistics()[i].toString());
          }
-          System.out.println("  Stripe: " + stripe.toString() + " timezone: " + tz);
-        } else {
-          System.out.println("  Stripe: " + stripe.toString());
         }
-        long sectionStart = stripeStart;
-        for (OrcProto.Stream section : footer.getStreamsList()) {
-          String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
-          System.out.println("    Stream: column " + section.getColumn() +
-              " section " + kind + " start: " + sectionStart +
-              " length " + section.getLength());
-          sectionStart += section.getLength();
+        ColumnStatistics[] stats = reader.getStatistics();
+        int colCount = stats.length;
+        System.out.println("\nFile Statistics:");
+        for (int i = 0; i < stats.length; ++i) {
+          System.out.println("  Column " + i + ": " + stats[i].toString());
        }
-        for (int i = 0; i < footer.getColumnsCount(); ++i) {
-          OrcProto.ColumnEncoding encoding = footer.getColumns(i);
-          StringBuilder buf = new StringBuilder();
-          buf.append("    Encoding column ");
-          buf.append(i);
-          buf.append(": ");
-          buf.append(encoding.getKind());
-          if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
-              encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
-            buf.append("[");
-            buf.append(encoding.getDictionarySize());
-            buf.append("]");
+        System.out.println("\nStripes:");
+        int stripeIx = -1;
+        for (StripeInformation stripe : reader.getStripes()) {
+          ++stripeIx;
+          long stripeStart = stripe.getOffset();
+          OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+          if (printTimeZone) {
+            String tz = footer.getWriterTimezone();
+            if (tz == null || tz.isEmpty()) {
+              tz = UNKNOWN;
+            }
+            System.out.println("  Stripe: " + stripe.toString() + " timezone: " + tz);
+          } else {
+            System.out.println("  Stripe: " + stripe.toString());
          }
-          System.out.println(buf);
-        }
-        if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
-          // include the columns that are specified, only if the columns are included, bloom filter
-          // will be read
-          boolean[] sargColumns = new boolean[colCount];
-          for (int colIdx : rowIndexCols) {
-            sargColumns[colIdx] = true;
+          long sectionStart = stripeStart;
+          for (OrcProto.Stream section : footer.getStreamsList()) {
+            String kind = section.hasKind() ? section.getKind().name() : UNKNOWN;
+            System.out.println("    Stream: column " + section.getColumn() +
+                " section " + kind + " start: " + sectionStart +
+                " length " + section.getLength());
+            sectionStart += section.getLength();
          }
-          RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, null, null, sargColumns);
-          for (int col : rowIndexCols) {
+          for (int i = 0; i < footer.getColumnsCount(); ++i) {
+            OrcProto.ColumnEncoding encoding = footer.getColumns(i);
            StringBuilder buf = new StringBuilder();
-            String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
-            buf.append(rowIdxString);
-            String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
-            buf.append(bloomFilString);
+            buf.append("    Encoding column ");
+            buf.append(i);
+            buf.append(": ");
+            buf.append(encoding.getKind());
+            if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+                encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+              buf.append("[");
+              buf.append(encoding.getDictionarySize());
+              buf.append("]");
+            }
            System.out.println(buf);
          }
+          if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
+            // include the columns that are specified, only if the columns are included, bloom filter
+            // will be read
+            boolean[] sargColumns = new boolean[colCount];
+            for (int colIdx : rowIndexCols) {
+              sargColumns[colIdx] = true;
+            }
+            RecordReaderImpl.Index indices = rows
+                .readRowIndex(stripeIx, null, null, null, sargColumns);
+            for (int col : rowIndexCols) {
+              StringBuilder buf = new StringBuilder();
+              String rowIdxString = getFormattedRowIndices(col, indices.getRowGroupIndex());
+              buf.append(rowIdxString);
+              String bloomFilString = getFormattedBloomFilters(col, indices.getBloomFilterIndex());
+              buf.append(bloomFilString);
+              System.out.println(buf);
+            }
+          }
        }
-      }
-      FileSystem fs = path.getFileSystem(conf);
-      long fileLen = fs.getContentSummary(path).getLength();
-      long paddedBytes = getTotalPaddingSize(reader);
-      // empty ORC file is ~45 bytes. Assumption here is file length always >0
-      double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
-      DecimalFormat format = new DecimalFormat("##.##");
-      System.out.println("\nFile length: " + fileLen + " bytes");
-      System.out.println("Padding length: " + paddedBytes + " bytes");
-      System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
-      rows.close();
-      if (files.size() > 1) {
-        System.out.println(Strings.repeat("=", 80) + "\n");
+        FileSystem fs = path.getFileSystem(conf);
+        long fileLen = fs.getContentSummary(path).getLength();
+        long paddedBytes = getTotalPaddingSize(reader);
+        // empty ORC file is ~45 bytes. Assumption here is file length always >0
+        double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+        DecimalFormat format = new DecimalFormat("##.##");
+        System.out.println("\nFile length: " + fileLen + " bytes");
+        System.out.println("Padding length: " + paddedBytes + " bytes");
+        System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
+        rows.close();
+        if (files.size() > 1) {
+          System.out.println(Strings.repeat("=", 80) + "\n");
+        }
+      } catch (Exception e) {
+        System.err.println("Unable to dump metadata for file: " + filename);
+        e.printStackTrace();
+        System.err.println(Strings.repeat("=", 80) + "\n");
+        continue;
       }
     }
   }
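The change above applies one pattern to both printData() and printMetaData(): each file's dump runs inside its own try/catch, so a corrupt or unreadable file is reported to stderr and skipped instead of aborting the whole multi-file run. A minimal standalone sketch of that pattern follows; the class and file names are illustrative only, not part of the patch:

    import java.util.Arrays;
    import java.util.List;

    public class PerFileErrorHandlingSketch {

      // stand-in for the body of printJsonData()/printMetaData()
      static void dumpFile(String file) {
        if (file.contains("corrupt")) {
          throw new IllegalStateException("cannot read " + file);
        }
        System.out.println("dumped " + file);
      }

      public static void main(String[] args) {
        List<String> files = Arrays.asList("a.orc", "corrupt.orc", "b.orc");
        for (String file : files) {
          try {
            dumpFile(file);
          } catch (Exception e) {
            // report the failure and move on to the next file
            System.err.println("Unable to dump data for file: " + file);
            e.printStackTrace();
          }
        }
      }
    }

Catching Exception broadly is deliberate here: any per-file failure, whether an IOException or a runtime error from a malformed footer, should not prevent the remaining files from being dumped.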
http://git-wip-us.apache.org/repos/asf/hive/blob/d84e393e/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
index a438855..02e01b4 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/JsonFileDump.java
@@ -45,130 +45,138 @@ public class JsonFileDump {
       writer.object();
     }
     for (String filename : files) {
-      if (multiFile) {
-        writer.object();
-      }
-      writer.key("fileName").value(filename);
-      Path path = new Path(filename);
-      Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
-      writer.key("fileVersion").value(reader.getFileVersion().getName());
-      writer.key("writerVersion").value(reader.getWriterVersion());
-      RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
-      writer.key("numberOfRows").value(reader.getNumberOfRows());
-      writer.key("compression").value(reader.getCompression());
-      if (reader.getCompression() != CompressionKind.NONE) {
-        writer.key("compressionBufferSize").value(reader.getCompressionSize());
-      }
-      writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
-      writer.key("schema").array();
-      writeSchema(writer, reader.getTypes());
-      writer.endArray();
-
-      writer.key("stripeStatistics").array();
-      List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
-      for (int n = 0; n < stripeStatistics.size(); n++) {
-        writer.object();
-        writer.key("stripeNumber").value(n + 1);
-        StripeStatistics ss = stripeStatistics.get(n);
-        writer.key("columnStatistics").array();
-        for (int i = 0; i < ss.getColumnStatistics().length; i++) {
+      try {
+        if (multiFile) {
          writer.object();
-          writer.key("columnId").value(i);
-          writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
-          writer.endObject();
        }
-        writer.endArray();
-        writer.endObject();
-      }
-      writer.endArray();
-
-      ColumnStatistics[] stats = reader.getStatistics();
-      int colCount = stats.length;
-      writer.key("fileStatistics").array();
-      for (int i = 0; i < stats.length; ++i) {
-        writer.object();
-        writer.key("columnId").value(i);
-        writeColumnStatistics(writer, stats[i]);
-        writer.endObject();
-      }
-      writer.endArray();
-
-      writer.key("stripes").array();
-      int stripeIx = -1;
-      for (StripeInformation stripe : reader.getStripes()) {
-        ++stripeIx;
-        long stripeStart = stripe.getOffset();
-        OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
-        writer.object(); // start of stripe information
-        writer.key("stripeNumber").value(stripeIx + 1);
-        writer.key("stripeInformation");
-        writeStripeInformation(writer, stripe);
-        if (printTimeZone) {
-          writer.key("writerTimezone").value(
-              footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
+        writer.key("fileName").value(filename);
+        Path path = new Path(filename);
+        Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf));
+        writer.key("fileVersion").value(reader.getFileVersion().getName());
+        writer.key("writerVersion").value(reader.getWriterVersion());
+        RecordReaderImpl rows = (RecordReaderImpl) reader.rows();
+        writer.key("numberOfRows").value(reader.getNumberOfRows());
+        writer.key("compression").value(reader.getCompression());
+        if (reader.getCompression() != CompressionKind.NONE) {
+          writer.key("compressionBufferSize").value(reader.getCompressionSize());
        }
-        long sectionStart = stripeStart;
+        writer.key("schemaString").value(reader.getObjectInspector().getTypeName());
+        writer.key("schema").array();
+        writeSchema(writer, reader.getTypes());
+        writer.endArray();
 
-        writer.key("streams").array();
-        for (OrcProto.Stream section : footer.getStreamsList()) {
+        writer.key("stripeStatistics").array();
+        List<StripeStatistics> stripeStatistics = reader.getStripeStatistics();
+        for (int n = 0; n < stripeStatistics.size(); n++) {
          writer.object();
-          String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
-          writer.key("columnId").value(section.getColumn());
-          writer.key("section").value(kind);
-          writer.key("startOffset").value(sectionStart);
-          writer.key("length").value(section.getLength());
-          sectionStart += section.getLength();
+          writer.key("stripeNumber").value(n + 1);
+          StripeStatistics ss = stripeStatistics.get(n);
+          writer.key("columnStatistics").array();
+          for (int i = 0; i < ss.getColumnStatistics().length; i++) {
+            writer.object();
+            writer.key("columnId").value(i);
+            writeColumnStatistics(writer, ss.getColumnStatistics()[i]);
+            writer.endObject();
+          }
+          writer.endArray();
          writer.endObject();
        }
        writer.endArray();
 
-        writer.key("encodings").array();
-        for (int i = 0; i < footer.getColumnsCount(); ++i) {
+        ColumnStatistics[] stats = reader.getStatistics();
+        int colCount = stats.length;
+        writer.key("fileStatistics").array();
+        for (int i = 0; i < stats.length; ++i) {
          writer.object();
-          OrcProto.ColumnEncoding encoding = footer.getColumns(i);
          writer.key("columnId").value(i);
-          writer.key("kind").value(encoding.getKind());
-          if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
-              encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
-            writer.key("dictionarySize").value(encoding.getDictionarySize());
-          }
+          writeColumnStatistics(writer, stats[i]);
          writer.endObject();
        }
        writer.endArray();
 
-        if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
-          // include the columns that are specified, only if the columns are included, bloom filter
-          // will be read
-          boolean[] sargColumns = new boolean[colCount];
-          for (int colIdx : rowIndexCols) {
-            sargColumns[colIdx] = true;
+        writer.key("stripes").array();
+        int stripeIx = -1;
+        for (StripeInformation stripe : reader.getStripes()) {
+          ++stripeIx;
+          long stripeStart = stripe.getOffset();
+          OrcProto.StripeFooter footer = rows.readStripeFooter(stripe);
+          writer.object(); // start of stripe information
+          writer.key("stripeNumber").value(stripeIx + 1);
+          writer.key("stripeInformation");
+          writeStripeInformation(writer, stripe);
+          if (printTimeZone) {
+            writer.key("writerTimezone").value(
+                footer.hasWriterTimezone() ? footer.getWriterTimezone() : FileDump.UNKNOWN);
          }
-          RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
-          writer.key("indexes").array();
-          for (int col : rowIndexCols) {
+          long sectionStart = stripeStart;
+
+          writer.key("streams").array();
+          for (OrcProto.Stream section : footer.getStreamsList()) {
            writer.object();
-            writer.key("columnId").value(col);
-            writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
-            writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
+            String kind = section.hasKind() ? section.getKind().name() : FileDump.UNKNOWN;
+            writer.key("columnId").value(section.getColumn());
+            writer.key("section").value(kind);
+            writer.key("startOffset").value(sectionStart);
+            writer.key("length").value(section.getLength());
+            sectionStart += section.getLength();
            writer.endObject();
          }
          writer.endArray();
+
+          writer.key("encodings").array();
+          for (int i = 0; i < footer.getColumnsCount(); ++i) {
+            writer.object();
+            OrcProto.ColumnEncoding encoding = footer.getColumns(i);
+            writer.key("columnId").value(i);
+            writer.key("kind").value(encoding.getKind());
+            if (encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY ||
+                encoding.getKind() == OrcProto.ColumnEncoding.Kind.DICTIONARY_V2) {
+              writer.key("dictionarySize").value(encoding.getDictionarySize());
+            }
+            writer.endObject();
+          }
+          writer.endArray();
+
+          if (rowIndexCols != null && !rowIndexCols.isEmpty()) {
+            // include the columns that are specified, only if the columns are included, bloom filter
+            // will be read
+            boolean[] sargColumns = new boolean[colCount];
+            for (int colIdx : rowIndexCols) {
+              sargColumns[colIdx] = true;
+            }
+            RecordReaderImpl.Index indices = rows.readRowIndex(stripeIx, null, sargColumns);
+            writer.key("indexes").array();
+            for (int col : rowIndexCols) {
+              writer.object();
+              writer.key("columnId").value(col);
+              writeRowGroupIndexes(writer, col, indices.getRowGroupIndex());
+              writeBloomFilterIndexes(writer, col, indices.getBloomFilterIndex());
+              writer.endObject();
+            }
+            writer.endArray();
+          }
+          writer.endObject(); // end of stripe information
        }
-        writer.endObject(); // end of stripe information
-      }
-      writer.endArray();
+        writer.endArray();
 
-      FileSystem fs = path.getFileSystem(conf);
-      long fileLen = fs.getContentSummary(path).getLength();
-      long paddedBytes = FileDump.getTotalPaddingSize(reader);
-      // empty ORC file is ~45 bytes. Assumption here is file length always >0
-      double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
-      writer.key("fileLength").value(fileLen);
-      writer.key("paddingLength").value(paddedBytes);
-      writer.key("paddingRatio").value(percentPadding);
-      rows.close();
+        FileSystem fs = path.getFileSystem(conf);
+        long fileLen = fs.getContentSummary(path).getLength();
+        long paddedBytes = FileDump.getTotalPaddingSize(reader);
+        // empty ORC file is ~45 bytes. Assumption here is file length always >0
+        double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+        writer.key("fileLength").value(fileLen);
+        writer.key("paddingLength").value(paddedBytes);
+        writer.key("paddingRatio").value(percentPadding);
+        writer.key("status").value("OK");
+        rows.close();
 
-      writer.endObject();
+        writer.endObject();
+      } catch (Exception e) {
+        writer.key("status").value("FAILED");
+        System.err.println("Unable to dump data for file: " + filename);
+        e.printStackTrace();
+        throw e;
+      }
     }
     if (multiFile) {
       writer.endArray();
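JsonFileDump gets the same per-file try/catch, plus a machine-readable marker: each file entry now ends with "status": "OK" on success, and the catch branch records "status": "FAILED" before rethrowing, so a consumer of the JSON output can tell complete entries from aborted ones. A rough sketch of that pattern, assuming the org.json JSONStringer API (which the writer used above resembles); names and values here are illustrative:

    import org.json.JSONStringer;

    public class JsonStatusSketch {

      static String dumpOneFile(String filename, boolean simulateFailure) {
        JSONStringer writer = new JSONStringer();
        writer.object();
        writer.key("fileName").value(filename);
        try {
          if (simulateFailure) {
            throw new IllegalStateException("unreadable footer");
          }
          writer.key("fileLength").value(1024L);
          writer.key("status").value("OK");     // marks a complete entry
          writer.endObject();
        } catch (Exception e) {
          writer.key("status").value("FAILED"); // marks a truncated entry
          writer.endObject();
          // the patch rethrows here; this demo just falls through
        }
        return writer.toString();
      }

      public static void main(String[] args) {
        System.out.println(dumpOneFile("ok.orc", false));
        System.out.println(dumpOneFile("bad.orc", true));
      }
    }

Note the asymmetry with FileDump above: the plain-text dump skips the bad file and continues, while the JSON path records FAILED and rethrows, since a half-written JSON document cannot simply continue to the next entry.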
http://git-wip-us.apache.org/repos/asf/hive/blob/d84e393e/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
----------------------------------------------------------------------
diff --git a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
index 68d503e..40674ea 100644
--- a/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
+++ b/ql/src/test/org/apache/hadoop/hive/ql/io/orc/TestFileDump.java
@@ -26,8 +26,6 @@ import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.FileReader;
-import java.io.IOException;
-import java.io.OutputStream;
 import java.io.PrintStream;
 import java.sql.Date;
 import java.sql.Timestamp;
@@ -252,54 +250,6 @@ public class TestFileDump {
     assertEquals("{\"b\":false,\"bt\":20,\"s\":200,\"i\":2000,\"l\":20000,\"f\":8,\"d\":40,\"de\":\"2.2222\",\"t\":\"2014-11-25 18:02:44\",\"dt\":\"2014-09-28\",\"str\":\"abcd\",\"c\":\"world \",\"vc\":\"world\",\"m\":[{\"_key\":\"k3\",\"_value\":\"v3\"}],\"a\":[200,300],\"st\":{\"i\":20,\"s\":\"bar\"}}",
         lines[1]);
   }
 
-  @Test(expected = IOException.class)
-  public void testDataDumpThrowsIOException() throws Exception {
-    PrintStream origOut = System.out;
-    try {
-      ObjectInspector inspector;
-      synchronized (TestOrcFile.class) {
-        inspector = ObjectInspectorFactory.getReflectionObjectInspector
-            (AllTypesRecord.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
-      }
-      Writer writer = OrcFile.createWriter(fs, testFilePath, conf, inspector,
-          100000, CompressionKind.NONE, 10000, 1000);
-      Map<String, String> m = new HashMap<String, String>(2);
-      m.put("k1", "v1");
-      writer.addRow(new AllTypesRecord(
-          true,
-          (byte) 10,
-          (short) 100,
-          1000,
-          10000L,
-          4.0f,
-          20.0,
-          HiveDecimal.create("4.2222"),
-          new Timestamp(1416967764000L),
-          new Date(1416967764000L),
-          "string",
-          new HiveChar("hello", 5),
-          new HiveVarchar("hello", 10),
-          m,
-          Arrays.asList(100, 200),
-          new AllTypesRecord.Struct(10, "foo")));
-
-      writer.close();
-
-      OutputStream myOut = new OutputStream() {
-        @Override
-        public void write(int b) throws IOException {
-          throw new IOException();
-        }
-      };
-
-      // replace stdout and run command
-      System.setOut(new PrintStream(myOut));
-      FileDump.main(new String[]{testFilePath.toString(), "-d"});
-    } finally {
-      System.setOut(origOut);
-    }
-  }
-
   // Test that if the fraction of rows that have distinct strings is greater than the configured
   // threshold dictionary encoding is turned off. If dictionary encoding is turned off the length
   // of the dictionary stream for the column will be 0 in the ORC file dump.
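The removed test expected FileDump.main() to propagate an IOException raised by a broken stdout stream; with the per-file handlers above, dump errors no longer escape main(), so that expectation can no longer hold. A test in the same spirit could instead assert on the stderr report. A hypothetical sketch only — the message text matches the patch, but the missing-file scenario and assertion style are assumptions, not part of this commit:

    import java.io.ByteArrayOutputStream;
    import java.io.PrintStream;

    public class ErrorReportTestSketch {
      public static void main(String[] args) throws Exception {
        PrintStream origErr = System.err;
        ByteArrayOutputStream captured = new ByteArrayOutputStream();
        System.setErr(new PrintStream(captured));
        try {
          // a nonexistent path should now be reported and skipped, not thrown
          org.apache.hadoop.hive.ql.io.orc.FileDump.main(
              new String[]{"/tmp/does-not-exist.orc", "-d"});
        } finally {
          System.setErr(origErr);
        }
        String err = captured.toString();
        if (!err.contains("Unable to dump data for file:")) {
          throw new AssertionError("expected per-file error report, got: " + err);
        }
      }
    }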