Author: cws Date: Mon May 6 19:24:34 2013 New Revision: 1479685 URL: http://svn.apache.org/r1479685 Log: HIVE-3957. Add pseudo-BNF grammar for RCFile to Javadoc (Mark Grover via cws)
Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java Modified: hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java URL: http://svn.apache.org/viewvc/hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java?rev=1479685&r1=1479684&r2=1479685&view=diff ============================================================================== --- hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java (original) +++ hive/trunk/ql/src/java/org/apache/hadoop/hive/ql/io/RCFile.java Mon May 6 19:24:34 2013 @@ -140,7 +140,200 @@ import org.apache.hadoop.util.Reflection * </ul> * </li> * </ul> + * <p> + * <pre> + * {@code + * The following is a pseudo-BNF grammar for RCFile. Comments are prefixed + * with dashes: * + * rcfile ::= + * <file-header> + * <rcfile-rowgroup>+ + * + * file-header ::= + * <file-version-header> + * <file-key-class-name> (only exists if version is seq6) + * <file-value-class-name> (only exists if version is seq6) + * <file-is-compressed> + * <file-is-block-compressed> (only exists if version is seq6) + * [<file-compression-codec-class>] + * <file-header-metadata> + * <file-sync-hash> + * + * -- The normative RCFile implementation included with Hive is actually + * -- based on a modified version of Hadoop's SequenceFile code. Some + * -- things which should have been modified were not, including the code + * -- that writes out the file version header. Consequently, RCFile and + * -- SequenceFile originally shared the same version header. A newer + * -- release has created a unique version string. + * + * file-version-header ::= Byte[4] {'S', 'E', 'Q', 6} + * | Byte[4] {'R', 'C', 'F', 1} + * + * -- The name of the Java class responsible for reading the key buffer + * -- component of the rowgroup. + * + * file-key-class-name ::= + * Text {"org.apache.hadoop.hive.ql.io.RCFile$KeyBuffer"} + * + * -- The name of the Java class responsible for reading the value buffer + * -- component of the rowgroup. 
+ * + * file-value-class-name ::= + * Text {"org.apache.hadoop.hive.ql.io.RCFile$ValueBuffer"} + * + * -- Boolean variable indicating whether or not the file uses compression + * -- for the key and column buffer sections. + * + * file-is-compressed ::= Byte[1] + * + * -- A boolean field indicating whether or not the file is block compressed. + * -- This field is *always* false. According to comments in the original + * -- RCFile implementation this field was retained for backwards + * -- compatibility with the SequenceFile format. + * + * file-is-block-compressed ::= Byte[1] {false} + * + * -- The Java class name of the compression codec iff <file-is-compressed> + * -- is true. The named class must implement + * -- org.apache.hadoop.io.compress.CompressionCodec. + * -- The expected value is org.apache.hadoop.io.compress.GzipCodec. + * + * file-compression-codec-class ::= Text + * + * -- A collection of key-value pairs defining metadata values for the + * -- file. The Map is serialized using standard JDK serialization, i.e. + * -- an Int corresponding to the number of key-value pairs, followed by + * -- Text key and value pairs. The following metadata properties are + * -- mandatory for all RCFiles: + * -- + * -- hive.io.rcfile.column.number: the number of columns in the RCFile + * + * file-header-metadata ::= Map<Text, Text> + * + * -- A 16 byte marker that is generated by the writer. This marker appears + * -- at regular intervals at the beginning of rowgroup-headers, and is + * -- intended to enable readers to skip over corrupted rowgroups. + * + * file-sync-hash ::= Byte[16] + * + * -- Each row group is split into three sections: a header, a set of + * -- key buffers, and a set of column buffers. The header section includes + * -- an optional sync hash, information about the size of the row group, and + * -- the total number of rows in the row group. 
Each key buffer + * -- consists of run-length encoding data which is used to decode + * -- the length and offsets of individual fields in the corresponding column + * -- buffer. + * + * rcfile-rowgroup ::= + * <rowgroup-header> + * <rowgroup-key-data> + * <rowgroup-column-buffers> + * + * rowgroup-header ::= + * [<rowgroup-sync-marker>, <rowgroup-sync-hash>] + * <rowgroup-record-length> + * <rowgroup-key-length> + * <rowgroup-compressed-key-length> + * + * -- rowgroup-key-data is compressed if the column data is compressed. + * rowgroup-key-data ::= + * <rowgroup-num-rows> + * <rowgroup-key-buffers> + * + * -- An integer (always -1) signaling the beginning of a sync-hash + * -- field. + * + * rowgroup-sync-marker ::= Int + * + * -- A 16 byte sync field. This must match the <file-sync-hash> value read + * -- in the file header. + * + * rowgroup-sync-hash ::= Byte[16] + * + * -- The record-length is the sum of the number of bytes used to store + * -- the key and column parts, i.e. it is the total length of the current + * -- rowgroup. + * + * rowgroup-record-length ::= Int + * + * -- Total length in bytes of the rowgroup's key sections. + * + * rowgroup-key-length ::= Int + * + * -- Total compressed length in bytes of the rowgroup's key sections. + * + * rowgroup-compressed-key-length ::= Int + * + * -- Number of rows in the current rowgroup. + * + * rowgroup-num-rows ::= VInt + * + * -- One or more column key buffers corresponding to each column + * -- in the RCFile. + * + * rowgroup-key-buffers ::= <rowgroup-key-buffer>+ + * + * -- Data in each column buffer is stored using a run-length + * -- encoding scheme that is intended to reduce the cost of + * -- repeated column field values. This mechanism is described + * -- in more detail in the following entries. 
+ * + * rowgroup-key-buffer ::= + * <column-buffer-length> + * <column-buffer-uncompressed-length> + * <column-key-buffer-length> + * <column-key-buffer> + * + * -- The serialized length on disk of the corresponding column buffer. + * + * column-buffer-length ::= VInt + * + * -- The uncompressed length of the corresponding column buffer. This + * -- is equivalent to column-buffer-length if the RCFile is not compressed. + * + * column-buffer-uncompressed-length ::= VInt + * + * -- The length in bytes of the current column key buffer + * + * column-key-buffer-length ::= VInt + * + * -- The column-key-buffer contains a sequence of serialized VInt values + * -- corresponding to the byte lengths of the serialized column fields + * -- in the corresponding rowgroup-column-buffer. For example, consider + * -- an integer column that contains the consecutive values 1, 2, 3, 44. + * -- The RCFile format stores these values as strings in the column buffer, + * -- e.g. "12344". The length of each column field is recorded in + * -- the column-key-buffer as a sequence of VInts: 1,1,1,2. However, + * -- if the same length occurs repeatedly, then we replace repeated + * -- run lengths with the complement (i.e. negative) of the number of + * -- repetitions, so 1,1,1,2 becomes 1,~2,2. + * + * column-key-buffer ::= Byte[column-key-buffer-length] + * + * rowgroup-column-buffers ::= <rowgroup-column-buffer>+ + * + * -- RCFile stores all column data as strings regardless of the + * -- underlying column type. The strings are neither length-prefixed nor + * -- null-terminated, and decoding them into individual fields requires + * -- the use of the run-length information contained in the corresponding + * -- column-key-buffer. + * + * rowgroup-column-buffer ::= Byte[column-buffer-length] + * + * Byte ::= An eight-bit byte + * + * VInt ::= Variable length integer. The high-order bit of each byte + * indicates whether more bytes remain to be read. 
The low-order seven + * bits are appended as increasingly more significant bits in the + * resulting integer value. + * + * Int ::= A four-byte integer in big-endian format. + * + * Text ::= VInt, Chars (Length prefixed UTF-8 characters) + * } + * </pre> + * </p> */ public class RCFile {