Author: prasanthj
Date: Mon Jul 14 18:11:14 2014
New Revision: 1610475

URL: http://svn.apache.org/r1610475
Log:
HIVE-7243: Print padding information in ORC file dump (Prasanth J, reviewed by 
Gunther Hagleitner)

Modified:
    hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
    hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out
    hive/trunk/ql/src/test/resources/orc-file-dump.out

Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java?rev=1610475&r1=1610474&r2=1610475&view=diff
==============================================================================
--- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java (original)
+++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/orc/FileDump.java Mon Jul 14 18:11:14 2014
@@ -20,7 +20,12 @@ package org.apache.hadoop.hive.ql.io.orc
 import java.util.ArrayList;
 import java.util.List;
 
+import java.io.IOException;
+import java.text.DecimalFormat;
+import java.util.List;
+
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndex;
 import org.apache.hadoop.hive.ql.io.orc.OrcProto.RowIndexEntry;
@@ -147,7 +152,28 @@ public final class FileDump {
           }
         }
       }
+
+      FileSystem fs = path.getFileSystem(conf);
+      long fileLen = fs.getContentSummary(path).getLength();
+      long paddedBytes = getTotalPaddingSize(reader);
+      // empty ORC file is ~45 bytes. Assumption here is file length always >0
+      double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
+      DecimalFormat format = new DecimalFormat("##.##");
+      System.out.println("\nFile length: " + fileLen + " bytes");
+      System.out.println("Padding length: " + paddedBytes + " bytes");
+      System.out.println("Padding ratio: " + format.format(percentPadding) + 
"%");
       rows.close();
     }
   }
+
+  private static long getTotalPaddingSize(Reader reader) throws IOException { // sums the byte gaps between consecutive stripes of reader
+    long paddedBytes = 0; // running total; stays 0 when the loop below never executes
+    List<org.apache.hadoop.hive.ql.io.orc.StripeInformation> stripes = 
reader.getStripes();
+    for (int i = 1; i < stripes.size(); i++) { // starts at 1 because each stripe is compared with its predecessor
+      long prevStripeOffset = stripes.get(i - 1).getOffset();
+      long prevStripeLen = stripes.get(i - 1).getLength();
+      paddedBytes += stripes.get(i).getOffset() - (prevStripeOffset + 
prevStripeLen); // gap = this stripe's offset minus (previous offset + previous length)
+    }
+    return paddedBytes; // with fewer than two stripes there are no gaps, so this is 0
+  }
 }

Modified: hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out?rev=1610475&r1=1610474&r2=1610475&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-dump-dictionary-threshold.out Mon Jul 14 18:11:14 2014
@@ -103,3 +103,7 @@ Stripes:
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
     Encoding column 3: DIRECT_V2
+
+File length: 1932446 bytes
+Padding length: 0 bytes
+Padding ratio: 0%

Modified: hive/trunk/ql/src/test/resources/orc-file-dump.out
URL: 
http://svn.apache.org/viewvc/hive/trunk/ql/src/test/resources/orc-file-dump.out?rev=1610475&r1=1610474&r2=1610475&view=diff
==============================================================================
--- hive/trunk/ql/src/test/resources/orc-file-dump.out (original)
+++ hive/trunk/ql/src/test/resources/orc-file-dump.out Mon Jul 14 18:11:14 2014
@@ -108,3 +108,7 @@ Stripes:
     Encoding column 1: DIRECT_V2
     Encoding column 2: DIRECT_V2
     Encoding column 3: DICTIONARY_V2
+
+File length: 269529 bytes
+Padding length: 0 bytes
+Padding ratio: 0%


Reply via email to