HIVE-11321. Move OrcFile.OrcTableProperties from OrcFile into OrcConf. (omalley reviewed by prasanthj)
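In practical terms, the table-property names that previously lived in the OrcFile.OrcTableProperties enum are now carried directly by the OrcConf constants, each of which also records the legacy hive.* configuration key and a default value. A minimal before/after sketch of a caller that only needs the property name (the accessors are the ones added in the diff below; the surrounding variables are illustrative only):

  // Before HIVE-11321: names came from a dedicated enum nested in OrcFile.
  String oldName = OrcFile.OrcTableProperties.COMPRESSION.getPropName(); // "orc.compress"

  // After HIVE-11321: the same name is the attribute of the OrcConf constant,
  // which also exposes the legacy Hive key and the built-in default.
  String newName = OrcConf.COMPRESS.getAttribute();     // "orc.compress"
  String hiveKey = OrcConf.COMPRESS.getHiveConfName();  // "hive.exec.orc.default.compress"
  Object dflt    = OrcConf.COMPRESS.getDefaultValue();  // "ZLIB"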
Project: http://git-wip-us.apache.org/repos/asf/hive/repo Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/cd2b4997 Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/cd2b4997 Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/cd2b4997 Branch: refs/heads/spark Commit: cd2b49970cec15c4a4b281301e28f71f25deba4d Parents: 0ead9fe Author: Owen O'Malley <omal...@apache.org> Authored: Tue Jul 28 13:02:07 2015 -0700 Committer: Owen O'Malley <omal...@apache.org> Committed: Tue Jul 28 13:02:07 2015 -0700 ---------------------------------------------------------------------- .../hive/hcatalog/mapreduce/SpecialCases.java | 8 +- .../apache/hadoop/hive/ql/io/orc/OrcConf.java | 129 ++++++++++++----- .../apache/hadoop/hive/ql/io/orc/OrcFile.java | 138 ++++++++----------- .../hadoop/hive/ql/io/orc/OrcOutputFormat.java | 67 +-------- 4 files changed, 153 insertions(+), 189 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/hive/blob/cd2b4997/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/SpecialCases.java ---------------------------------------------------------------------- diff --git a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/SpecialCases.java b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/SpecialCases.java index f38d53b..7da2ab0 100644 --- a/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/SpecialCases.java +++ b/hcatalog/core/src/main/java/org/apache/hive/hcatalog/mapreduce/SpecialCases.java @@ -24,7 +24,7 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.ql.io.RCFileOutputFormat; import org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat; -import org.apache.hadoop.hive.ql.io.orc.OrcFile; +import org.apache.hadoop.hive.ql.io.orc.OrcConf; import org.apache.hadoop.hive.ql.io.orc.OrcOutputFormat; import org.apache.hadoop.hive.serde2.avro.AvroSerDe; import org.apache.hadoop.hive.serde2.avro.AvroSerdeUtils; @@ -85,10 +85,10 @@ public class SpecialCases { // them to job properties, so that it will be available in jobconf at runtime // See HIVE-5504 for details Map<String, String> tableProps = jobInfo.getTableInfo().getTable().getParameters(); - for (OrcFile.OrcTableProperties property : OrcFile.OrcTableProperties.values()){ - String propName = property.getPropName(); + for (OrcConf property : OrcConf.values()){ + String propName = property.getAttribute(); if (tableProps.containsKey(propName)){ - jobProperties.put(propName,tableProps.get(propName)); + jobProperties.put(propName, tableProps.get(propName)); } } } else if (ofclass == AvroContainerOutputFormat.class) { http://git-wip-us.apache.org/repos/asf/hive/blob/cd2b4997/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcConf.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcConf.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcConf.java index aeb0ec1..81b822f 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcConf.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcConf.java @@ -20,41 +20,48 @@ package org.apache.hadoop.hive.ql.io.orc; import org.apache.hadoop.conf.Configuration; +import java.util.Properties; + /** * Define the configuration properties that Orc understands. 
*/ public enum OrcConf { - STRIPE_SIZE("hive.exec.orc.default.stripe.size", + STRIPE_SIZE("orc.stripe.size", "hive.exec.orc.default.stripe.size", 64L * 1024 * 1024, "Define the default ORC stripe size, in bytes."), - BLOCK_SIZE("hive.exec.orc.default.block.size", 256L * 1024 * 1024, + BLOCK_SIZE("orc.block.size", "hive.exec.orc.default.block.size", + 256L * 1024 * 1024, "Define the default file system block size for ORC files."), - ROW_INDEX_STRIDE("hive.exec.orc.default.row.index.stride", 10000, + ENABLE_INDEXES("orc.create.index", "orc.create.index", true, + "Should the ORC writer create indexes as part of the file."), + ROW_INDEX_STRIDE("orc.row.index.stride", + "hive.exec.orc.default.row.index.stride", 10000, "Define the default ORC index stride in number of rows. (Stride is the\n"+ " number of rows n index entry represents.)"), - BUFFER_SIZE("hive.exec.orc.default.buffer.size", 256 * 1024, - "Define the default ORC buffer size, in bytes."), - BLOCK_PADDING("hive.exec.orc.default.block.padding", true, - "Define the default block padding, which pads stripes to the HDFS\n" + - " block boundaries."), - COMPRESS("hive.exec.orc.default.compress", "ZLIB", + BUFFER_SIZE("orc.compress.size", "hive.exec.orc.default.buffer.size", + 256 * 1024, "Define the default ORC buffer size, in bytes."), + BLOCK_PADDING("orc.block.padding", "hive.exec.orc.default.block.padding", + true, + "Define whether stripes should be padded to the HDFS block boundaries."), + COMPRESS("orc.compress", "hive.exec.orc.default.compress", "ZLIB", "Define the default compression codec for ORC file"), - WRITE_FORMAT("hive.exec.orc.write.format", null, + WRITE_FORMAT("orc.write.format", "hive.exec.orc.write.format", "0.12", "Define the version of the file to write. Possible values are 0.11 and\n"+ " 0.12. If this parameter is not defined, ORC will use the run\n" + - " length encoding (RLE) introduced in Hive 0.12. Any value other\n" + - " than 0.11 results in the 0.12 encoding."), - ENCODING_STRATEGY("hive.exec.orc.encoding.strategy", "SPEED", + " length encoding (RLE) introduced in Hive 0.12."), + ENCODING_STRATEGY("orc.encoding.strategy", "hive.exec.orc.encoding.strategy", + "SPEED", "Define the encoding strategy to use while writing data. Changing this\n"+ "will only affect the light weight encoding for integers. This\n" + "flag will not change the compression level of higher level\n" + "compression codec (like ZLIB)."), - COMPRESSION_STRATEGY("hive.exec.orc.compression.strategy", "SPEED", + COMPRESSION_STRATEGY("orc.compression.strategy", + "hive.exec.orc.compression.strategy", "SPEED", "Define the compression strategy to use while writing data.\n" + "This changes the compression level of higher level compression\n" + "codec (like ZLIB)."), - BLOCK_PADDING_TOLERANCE("hive.exec.orc.block.padding.tolerance", - 0.05, + BLOCK_PADDING_TOLERANCE("orc.block.padding.tolerance", + "hive.exec.orc.block.padding.tolerance", 0.05, "Define the tolerance for block padding as a decimal fraction of\n" + "stripe size (for example, the default value 0.05 is 5% of the\n" + "stripe size). For the defaults of 64Mb ORC stripe and 256Mb HDFS\n" + @@ -64,35 +71,45 @@ public enum OrcConf { "3.2Mb, a new smaller stripe will be inserted to fit within that\n" + "space. 
This will make sure that no stripe written will cross block\n" + " boundaries and cause remote reads within a node local task."), - BLOOM_FILTER_FPP("orc.default.bloom.fpp", 0.05, + BLOOM_FILTER_FPP("orc.bloom.filter.fpp", "orc.default.bloom.fpp", 0.05, "Define the default false positive probability for bloom filters."), - USE_ZEROCOPY("hive.exec.orc.zerocopy", false, + USE_ZEROCOPY("orc.use.zerocopy", "hive.exec.orc.zerocopy", false, "Use zerocopy reads with ORC. (This requires Hadoop 2.3 or later.)"), - SKIP_CORRUPT_DATA("hive.exec.orc.skip.corrupt.data", false, + SKIP_CORRUPT_DATA("orc.skip.corrupt.data", "hive.exec.orc.skip.corrupt.data", + false, "If ORC reader encounters corrupt data, this value will be used to\n" + "determine whether to skip the corrupt data or throw exception.\n" + "The default behavior is to throw exception."), - MEMORY_POOL("hive.exec.orc.memory.pool", 0.5, + MEMORY_POOL("orc.memory.pool", "hive.exec.orc.memory.pool", 0.5, "Maximum fraction of heap that can be used by ORC file writers"), - DICTIONARY_KEY_SIZE_THRESHOLD("hive.exec.orc.dictionary.key.size.threshold", + DICTIONARY_KEY_SIZE_THRESHOLD("orc.dictionary.key.threshold", + "hive.exec.orc.dictionary.key.size.threshold", 0.8, - "If the number of keys in a dictionary is greater than this fraction\n" + - "of the total number of non-null rows, turn off dictionary\n" + - "encoding. Use 1 to always use dictionary encoding."), - ROW_INDEX_STRIDE_DICTIONARY_CHECK("hive.orc.row.index.stride.dictionary.check", + "If the number of distinct keys in a dictionary is greater than this\n" + + "fraction of the total number of non-null rows, turn off \n" + + "dictionary encoding. Use 1 to always use dictionary encoding."), + ROW_INDEX_STRIDE_DICTIONARY_CHECK("orc.dictionary.early.check", + "hive.orc.row.index.stride.dictionary.check", true, "If enabled dictionary check will happen after first row index stride\n" + "(default 10000 rows) else dictionary check will happen before\n" + "writing first stripe. 
In both cases, the decision to use\n" + "dictionary or not will be retained thereafter."), + BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns", "orc.bloom.filter.columns", + "", "List of columns to create bloom filters for when writing.") ; private final String attribute; + private final String hiveConfName; private final Object defaultValue; private final String description; - OrcConf(String attribute, Object defaultValue, String description) { + OrcConf(String attribute, + String hiveConfName, + Object defaultValue, + String description) { this.attribute = attribute; + this.hiveConfName = hiveConfName; this.defaultValue = defaultValue; this.description = description; } @@ -101,6 +118,10 @@ public enum OrcConf { return attribute; } + public String getHiveConfName() { + return hiveConfName; + } + public Object getDefaultValue() { return defaultValue; } @@ -109,26 +130,62 @@ public enum OrcConf { return description; } + private String lookupValue(Properties tbl, Configuration conf) { + String result = null; + if (tbl != null) { + result = tbl.getProperty(attribute); + } + if (result == null && conf != null) { + result = conf.get(attribute); + if (result == null) { + result = conf.get(hiveConfName); + } + } + return result; + } + + public long getLong(Properties tbl, Configuration conf) { + String value = lookupValue(tbl, conf); + if (value != null) { + return Long.parseLong(value); + } + return ((Number) defaultValue).longValue(); + } + public long getLong(Configuration conf) { - return conf.getLong(attribute, ((Number) defaultValue).longValue()); + return getLong(null, conf); + } + + public String getString(Properties tbl, Configuration conf) { + String value = lookupValue(tbl, conf); + return value == null ? (String) defaultValue : value; } public String getString(Configuration conf) { - return conf.get(attribute, (String) defaultValue); + return getString(null, conf); + } + + public boolean getBoolean(Properties tbl, Configuration conf) { + String value = lookupValue(tbl, conf); + if (value != null) { + return Boolean.parseBoolean(value); + } + return (Boolean) defaultValue; } public boolean getBoolean(Configuration conf) { - if (conf == null) { - return (Boolean) defaultValue; + return getBoolean(null, conf); + } + + public double getDouble(Properties tbl, Configuration conf) { + String value = lookupValue(tbl, conf); + if (value != null) { + return Double.parseDouble(value); } - return conf.getBoolean(attribute, (Boolean) defaultValue); + return ((Number) defaultValue).doubleValue(); } public double getDouble(Configuration conf) { - String str = conf.get(attribute); - if (str == null) { - return ((Number) defaultValue).doubleValue(); - } - return Double.parseDouble(str); + return getDouble(null, conf); } } http://git-wip-us.apache.org/repos/asf/hive/blob/cd2b4997/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java index 976a84b..2210769 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcFile.java @@ -19,6 +19,7 @@ package org.apache.hadoop.hive.ql.io.orc; import java.io.IOException; +import java.util.Properties; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FileSystem; @@ -47,7 +48,7 @@ public final class OrcFile { * prevent the new reader from reading ORC files generated by any released * 
version of Hive. */ - public static enum Version { + public enum Version { V_0_11("0.11", 0, 11), V_0_12("0.12", 0, 12); @@ -57,7 +58,7 @@ public final class OrcFile { private final int major; private final int minor; - private Version(String name, int major, int minor) { + Version(String name, int major, int minor) { this.name = name; this.major = major; this.minor = minor; @@ -99,7 +100,7 @@ public final class OrcFile { * For bugs in the writer, but the old readers already read the new data * correctly, bump this version instead of the Version. */ - public static enum WriterVersion { + public enum WriterVersion { ORIGINAL(0), HIVE_8732(1); // corrupted stripe/file maximum column statistics @@ -109,55 +110,17 @@ public final class OrcFile { return id; } - private WriterVersion(int id) { + WriterVersion(int id) { this.id = id; } } - public static enum EncodingStrategy { - SPEED, COMPRESSION; + public enum EncodingStrategy { + SPEED, COMPRESSION } - public static enum CompressionStrategy { - SPEED, COMPRESSION; - } - - // Note : these string definitions for table properties are deprecated, - // and retained only for backward compatibility, please do not add to - // them, add to OrcTableProperties below instead - @Deprecated public static final String COMPRESSION = "orc.compress"; - @Deprecated public static final String COMPRESSION_BLOCK_SIZE = "orc.compress.size"; - @Deprecated public static final String STRIPE_SIZE = "orc.stripe.size"; - @Deprecated public static final String ROW_INDEX_STRIDE = "orc.row.index.stride"; - @Deprecated public static final String ENABLE_INDEXES = "orc.create.index"; - @Deprecated public static final String BLOCK_PADDING = "orc.block.padding"; - - /** - * Enum container for all orc table properties. - * If introducing a new orc-specific table property, - * add it here. 
- */ - public static enum OrcTableProperties { - COMPRESSION("orc.compress"), - COMPRESSION_BLOCK_SIZE("orc.compress.size"), - STRIPE_SIZE("orc.stripe.size"), - BLOCK_SIZE("orc.block.size"), - ROW_INDEX_STRIDE("orc.row.index.stride"), - ENABLE_INDEXES("orc.create.index"), - BLOCK_PADDING("orc.block.padding"), - ENCODING_STRATEGY("orc.encoding.strategy"), - BLOOM_FILTER_COLUMNS("orc.bloom.filter.columns"), - BLOOM_FILTER_FPP("orc.bloom.filter.fpp"); - - private final String propName; - - OrcTableProperties(String propName) { - this.propName = propName; - } - - public String getPropName(){ - return this.propName; - } + public enum CompressionStrategy { + SPEED, COMPRESSION } // unused @@ -227,13 +190,13 @@ public final class OrcFile { return new ReaderImpl(path, options); } - public static interface WriterContext { + public interface WriterContext { Writer getWriter(); } - public static interface WriterCallback { - public void preStripeWrite(WriterContext context) throws IOException; - public void preFooterWrite(WriterContext context) throws IOException; + public interface WriterCallback { + void preStripeWrite(WriterContext context) throws IOException; + void preFooterWrite(WriterContext context) throws IOException; } /** @@ -258,40 +221,35 @@ public final class OrcFile { private String bloomFilterColumns; private double bloomFilterFpp; - WriterOptions(Configuration conf) { + WriterOptions(Properties tableProperties, Configuration conf) { configuration = conf; memoryManagerValue = getMemoryManager(conf); - stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(conf); - blockSizeValue = OrcConf.BLOCK_SIZE.getLong(conf); + stripeSizeValue = OrcConf.STRIPE_SIZE.getLong(tableProperties, conf); + blockSizeValue = OrcConf.BLOCK_SIZE.getLong(tableProperties, conf); rowIndexStrideValue = - (int) OrcConf.ROW_INDEX_STRIDE.getLong(conf); - bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(conf); - blockPaddingValue = OrcConf.BLOCK_PADDING.getBoolean(conf); + (int) OrcConf.ROW_INDEX_STRIDE.getLong(tableProperties, conf); + bufferSizeValue = (int) OrcConf.BUFFER_SIZE.getLong(tableProperties, + conf); + blockPaddingValue = + OrcConf.BLOCK_PADDING.getBoolean(tableProperties, conf); compressValue = - CompressionKind.valueOf(OrcConf.COMPRESS.getString(conf)); - String versionName = OrcConf.WRITE_FORMAT.getString(conf); - if (versionName == null) { - versionValue = Version.CURRENT; - } else { - versionValue = Version.byName(versionName); - } - String enString = OrcConf.ENCODING_STRATEGY.getString(conf); - if (enString == null) { - encodingStrategy = EncodingStrategy.SPEED; - } else { - encodingStrategy = EncodingStrategy.valueOf(enString); - } - - String compString = OrcConf.COMPRESSION_STRATEGY.getString(conf); - if (compString == null) { - compressionStrategy = CompressionStrategy.SPEED; - } else { - compressionStrategy = CompressionStrategy.valueOf(compString); - } + CompressionKind.valueOf(OrcConf.COMPRESS.getString(tableProperties, + conf)); + String versionName = OrcConf.WRITE_FORMAT.getString(tableProperties, + conf); + versionValue = Version.byName(versionName); + String enString = OrcConf.ENCODING_STRATEGY.getString(tableProperties, + conf); + encodingStrategy = EncodingStrategy.valueOf(enString); + + String compString = + OrcConf.COMPRESSION_STRATEGY.getString(tableProperties, conf); + compressionStrategy = CompressionStrategy.valueOf(compString); paddingTolerance = - OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(conf); - bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(conf); + 
OrcConf.BLOCK_PADDING_TOLERANCE.getDouble(tableProperties, conf); + bloomFilterFpp = OrcConf.BLOOM_FILTER_FPP.getDouble(tableProperties, + conf); } /** @@ -362,7 +320,7 @@ public final class OrcFile { /** * Sets the tolerance for block padding as a percentage of stripe size. */ - public WriterOptions paddingTolerance(float value) { + public WriterOptions paddingTolerance(double value) { paddingTolerance = value; return this; } @@ -378,7 +336,7 @@ public final class OrcFile { /** * Specify the false positive probability for bloom filter. * @param fpp - false positive probability - * @return + * @return this */ public WriterOptions bloomFilterFpp(double fpp) { bloomFilterFpp = fpp; @@ -413,7 +371,7 @@ public final class OrcFile { /** * Add a listener for when the stripe and file are about to be closed. * @param callback the object to be called when the stripe is closed - * @return + * @return this */ public WriterOptions callback(WriterCallback callback) { this.callback = callback; @@ -431,10 +389,24 @@ public final class OrcFile { } /** - * Create a default set of write options that can be modified. + * Create a set of writer options based on a configuration. + * @param conf the configuration to use for values + * @return A WriterOptions object that can be modified */ public static WriterOptions writerOptions(Configuration conf) { - return new WriterOptions(conf); + return new WriterOptions(null, conf); + } + + /** + * Create a set of write options based on a set of table properties and + * configuration. + * @param tableProperties the properties of the table + * @param conf the configuration of the query + * @return a WriterOptions object that can be modified + */ + public static WriterOptions writerOptions(Properties tableProperties, + Configuration conf) { + return new WriterOptions(tableProperties, conf); } /** http://git-wip-us.apache.org/repos/asf/hive/blob/cd2b4997/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java ---------------------------------------------------------------------- diff --git a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java index 8625ff1..ea4ebb4 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/io/orc/OrcOutputFormat.java @@ -114,73 +114,8 @@ public class OrcOutputFormat extends FileOutputFormat<NullWritable, OrcSerdeRow> } } - /** - * Helper method to get a parameter first from props if present, falling back to JobConf if not. - * Returns null if key is present in neither. - */ - private String getSettingFromPropsFallingBackToConf(String key, Properties props, JobConf conf){ - if ((props != null) && props.containsKey(key)){ - return props.getProperty(key); - } else if(conf != null) { - // If conf is not null, and the key is not present, Configuration.get() will - // return null for us. So, we don't have to check if it contains it. 
- return conf.get(key); - } else { - return null; - } - } - private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) { - OrcFile.WriterOptions options = OrcFile.writerOptions(conf); - String propVal ; - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.STRIPE_SIZE.getPropName(),props,conf)) != null){ - options.stripeSize(Long.parseLong(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.COMPRESSION.getPropName(),props,conf)) != null){ - options.compress(CompressionKind.valueOf(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.COMPRESSION_BLOCK_SIZE.getPropName(),props,conf)) != null){ - options.bufferSize(Integer.parseInt(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.ROW_INDEX_STRIDE.getPropName(),props,conf)) != null){ - options.rowIndexStride(Integer.parseInt(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.ENABLE_INDEXES.getPropName(),props,conf)) != null){ - if ("false".equalsIgnoreCase(propVal)) { - options.rowIndexStride(0); - } - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.BLOCK_PADDING.getPropName(),props,conf)) != null){ - options.blockPadding(Boolean.parseBoolean(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.ENCODING_STRATEGY.getPropName(),props,conf)) != null){ - options.encodingStrategy(EncodingStrategy.valueOf(propVal)); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.BLOOM_FILTER_COLUMNS.getPropName(), props, conf)) != null) { - options.bloomFilterColumns(propVal); - } - - if ((propVal = getSettingFromPropsFallingBackToConf( - OrcFile.OrcTableProperties.BLOOM_FILTER_FPP.getPropName(), props, conf)) != null) { - options.bloomFilterFpp(Double.parseDouble(propVal)); - } - - return options; + return OrcFile.writerOptions(props, conf); } @Override
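With the patch applied, OrcOutputFormat no longer copies each table property into writer options by hand; OrcFile.writerOptions(Properties, Configuration) and the OrcConf getters do the resolution. The lookup order implemented by OrcConf.lookupValue is: the table Properties under the orc.* attribute name, then the Configuration under the same orc.* key, then the legacy hive.* key, and finally the constant's built-in default. A rough usage sketch of the new API, assuming the classes as changed above (the OrcConfExample class name, the chosen values, and the standalone main are illustrative, not part of the commit):

  import java.util.Properties;

  import org.apache.hadoop.conf.Configuration;
  import org.apache.hadoop.hive.ql.io.orc.OrcConf;
  import org.apache.hadoop.hive.ql.io.orc.OrcFile;

  public class OrcConfExample {
    public static void main(String[] args) {
      Properties tableProps = new Properties();
      // Table-level override, keyed by the orc.* attribute name.
      tableProps.setProperty(OrcConf.COMPRESS.getAttribute(), "SNAPPY");

      Configuration conf = new Configuration();
      // A query-level setting under the legacy hive.* name still applies as a fallback.
      conf.setLong(OrcConf.STRIPE_SIZE.getHiveConfName(), 32L * 1024 * 1024);

      String codec = OrcConf.COMPRESS.getString(tableProps, conf);  // "SNAPPY": table property wins
      long stripe = OrcConf.STRIPE_SIZE.getLong(tableProps, conf);  // 32MB: falls back to the hive.* key
      long block = OrcConf.BLOCK_SIZE.getLong(tableProps, conf);    // 256MB: nothing set, enum default
      System.out.println(codec + " " + stripe + " " + block);

      // OrcOutputFormat.getOptions() now reduces to exactly this call.
      OrcFile.WriterOptions options = OrcFile.writerOptions(tableProps, conf);
    }
  }

Keeping both key names on each constant lets existing hive.exec.orc.* settings keep working while table properties and new callers standardize on the shorter orc.* names.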