Author: prasanthj
Date: Mon Jul 14 18:11:14 2014
New Revision: 1610475
URL: http://svn.apache.org/r1610475
Log:
HIVE-7243: Print padding information in ORC file dump (Prasanth J, reviewed by
Gunther Hagleitner)
Modified:
hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out
hive/trunk/ql/src/test/resources/orc-file-dump.out
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java?rev=1610475&r1=1610474&r2=1610475&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java Mon Jul 14 18:11:14 2014
@@ -20,7 +20,12 @@ package org.apache.hadoop.hive.ql.io.orc
import java.util.ArrayList;
import java.util.List;
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.List;
+
import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
@@ -147,7 +152,28 @@ public final class FileDump {
}
}
}
+
+ FileSystem fs = path.getFileSystem(conf);
+ long fileLen = fs.getContentSummary(path).getLength();
+ long paddedBytes = getTotalPaddingSize(reader);
+ // empty ORC file is ~45 bytes. Assumption here is file length always >0
+ double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+ DecimalFormat format = new DecimalFormat("##.##");
+ System.out.println("\nFile length: " + fileLen + " bytes");
+ System.out.println("Padding length: " + paddedBytes + " bytes");
+ System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
rows.close();
}
}
+
+ private static long getTotalPaddingSize(Reader reader) throws IOException {
+ long paddedBytes = 0;
+ List<org.apache.hadoop.hive.ql.io.orc.StripeInformation> stripes = reader.getStripes();
+ for (int i = 1; i < stripes.size(); i++) {
+ long prevStripeOffset = stripes.get(i - 1).getOffset();
+ long prevStripeLen = stripes.get(i - 1).getLength();
+ paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + prevStripeLen);
+ }
+ return paddedBytes;
+ }
}
Modified:
hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out?rev=1610475&r1=1610474&r2=1610475&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out Mon Jul 14 18:11:14 2014
@@ -103,3 +103,7 @@ Stripes:
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
Encoding column 3: DIRECT_V2
+
+File length: 1932446 bytes
+Padding length: 0 bytes
+Padding ratio: 0%
Modified: hive/trunk/ql/src/test/resources/orc-file-dump.out
URL:
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-dump.out?rev=1610475&r1=1610474&r2=1610475&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-dump.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-dump.out Mon Jul 14 18:11:14 2014
@@ -108,3 +108,7 @@ Stripes:
Encoding column 1: DIRECT_V2
Encoding column 2: DIRECT_V2
Encoding column 3: DICTIONARY_V2
+
+File length: 269529 bytes
+Padding length: 0 bytes
+Padding ratio: 0%