This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc-format.git
The following commit(s) were added to refs/heads/main by this push:
new ca0112c ORC-FORMAT-2: Enforce `BASIC` rule (#5)
ca0112c is described below
commit ca0112c821c40649a5d7c61975b5b9c5e48cd99d
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Thu Dec 7 22:03:33 2023 -0800
ORC-FORMAT-2: Enforce `BASIC` rule (#5)
### What changes were proposed in this pull request?
This closes #2 .
### Why are the changes needed?
`BASIC` has many rules in addition to `MINIMAL`.
### How was this patch tested?
Pass the CIs.
---
src/main/proto/buf.yaml | 2 +-
src/main/proto/orc/proto/orc_proto.proto | 112 +++++++++++++++----------------
2 files changed, 57 insertions(+), 57 deletions(-)
diff --git a/src/main/proto/buf.yaml b/src/main/proto/buf.yaml
index 31b7729..47f6919 100644
--- a/src/main/proto/buf.yaml
+++ b/src/main/proto/buf.yaml
@@ -20,4 +20,4 @@ breaking:
- FILE
lint:
use:
- - MINIMAL
+ - BASIC
diff --git a/src/main/proto/orc/proto/orc_proto.proto b/src/main/proto/orc/proto/orc_proto.proto
index 45d7d2a..5ca432d 100644
--- a/src/main/proto/orc/proto/orc_proto.proto
+++ b/src/main/proto/orc/proto/orc_proto.proto
@@ -41,8 +41,8 @@ message StringStatistics {
optional sint64 sum = 3;
// If the minimum or maximum value was longer than 1024 bytes, store a lower or upper
// bound instead of the minimum or maximum values above.
- optional string lowerBound = 4;
- optional string upperBound = 5;
+ optional string lower_bound = 4;
+ optional string upper_bound = 5;
}
message BucketStatistics {
@@ -65,11 +65,11 @@ message TimestampStatistics {
// min,max values saved as milliseconds since epoch
optional sint64 minimum = 1;
optional sint64 maximum = 2;
- optional sint64 minimumUtc = 3;
- optional sint64 maximumUtc = 4;
+ optional sint64 minimum_utc = 3;
+ optional sint64 maximum_utc = 4;
// store the lower 6 TS digits for min/max to achieve nanosecond precision
- optional int32 minimumNanos = 5;
- optional int32 maximumNanos = 6;
+ optional int32 minimum_nanos = 5;
+ optional int32 maximum_nanos = 6;
}
message BinaryStatistics {
@@ -79,24 +79,24 @@ message BinaryStatistics {
// Statistics for list and map
message CollectionStatistics {
- optional uint64 minChildren = 1;
- optional uint64 maxChildren = 2;
- optional uint64 totalChildren = 3;
+ optional uint64 min_children = 1;
+ optional uint64 max_children = 2;
+ optional uint64 total_children = 3;
}
message ColumnStatistics {
- optional uint64 numberOfValues = 1;
- optional IntegerStatistics intStatistics = 2;
- optional DoubleStatistics doubleStatistics = 3;
- optional StringStatistics stringStatistics = 4;
- optional BucketStatistics bucketStatistics = 5;
- optional DecimalStatistics decimalStatistics = 6;
- optional DateStatistics dateStatistics = 7;
- optional BinaryStatistics binaryStatistics = 8;
- optional TimestampStatistics timestampStatistics = 9;
- optional bool hasNull = 10;
- optional uint64 bytesOnDisk = 11;
- optional CollectionStatistics collectionStatistics = 12;
+ optional uint64 number_of_values = 1;
+ optional IntegerStatistics int_statistics = 2;
+ optional DoubleStatistics double_statistics = 3;
+ optional StringStatistics string_statistics = 4;
+ optional BucketStatistics bucket_statistics = 5;
+ optional DecimalStatistics decimal_statistics = 6;
+ optional DateStatistics date_statistics = 7;
+ optional BinaryStatistics binary_statistics = 8;
+ optional TimestampStatistics timestamp_statistics = 9;
+ optional bool has_null = 10;
+ optional uint64 bytes_on_disk = 11;
+ optional CollectionStatistics collection_statistics = 12;
}
message RowIndexEntry {
@@ -109,13 +109,13 @@ message RowIndex {
}
message BloomFilter {
- optional uint32 numHashFunctions = 1;
+ optional uint32 num_hash_functions = 1;
repeated fixed64 bitset = 2;
optional bytes utf8bitset = 3;
}
message BloomFilterIndex {
- repeated BloomFilter bloomFilter = 1;
+ repeated BloomFilter bloom_filter = 1;
}
message Stream {
@@ -153,12 +153,12 @@ message ColumnEncoding {
DICTIONARY_V2 = 3;
}
optional Kind kind = 1;
- optional uint32 dictionarySize = 2;
+ optional uint32 dictionary_size = 2;
// The encoding of the bloom filters for this column:
// 0 or missing = none or original
// 1 = ORC-135 (utc for timestamps)
- optional uint32 bloomEncoding = 3;
+ optional uint32 bloom_encoding = 3;
}
message StripeEncryptionVariant {
@@ -178,7 +178,7 @@ message StripeEncryptionVariant {
message StripeFooter {
repeated Stream streams = 1;
repeated ColumnEncoding columns = 2;
- optional string writerTimezone = 3;
+ optional string writer_timezone = 3;
// one for each column encryption variant
repeated StripeEncryptionVariant encryption = 4;
}
@@ -219,8 +219,8 @@ message Type {
}
optional Kind kind = 1;
repeated uint32 subtypes = 2 [packed=true];
- repeated string fieldNames = 3;
- optional uint32 maximumLength = 4;
+ repeated string field_names = 3;
+ optional uint32 maximum_length = 4;
optional uint32 precision = 5;
optional uint32 scale = 6;
repeated StringPair attributes = 7;
@@ -230,13 +230,13 @@ message StripeInformation {
// the global file offset of the start of the stripe
optional uint64 offset = 1;
// the number of bytes of index
- optional uint64 indexLength = 2;
+ optional uint64 index_length = 2;
// the number of bytes of data
- optional uint64 dataLength = 3;
+ optional uint64 data_length = 3;
// the number of bytes in the stripe footer
- optional uint64 footerLength = 4;
+ optional uint64 footer_length = 4;
// the number of rows in this stripe
- optional uint64 numberOfRows = 5;
+ optional uint64 number_of_rows = 5;
// If this is present, the reader should use this value for the encryption
// stripe id for setting the encryption IV. Otherwise, the reader should
// use one larger than the previous stripe's encryptStripeId.
@@ -246,10 +246,10 @@ message StripeInformation {
// each of the input files will reset it to 1.
// Note that 1 was choosen, because protobuf v3 doesn't serialize
// primitive types that are the default (eg. 0).
- optional uint64 encryptStripeId = 6;
+ optional uint64 encrypt_stripe_id = 6;
// For each encryption variant, the new encrypted local key to use
// until we find a replacement.
- repeated bytes encryptedLocalKeys = 7;
+ repeated bytes encrypted_local_keys = 7;
}
message UserMetadataItem {
@@ -261,19 +261,19 @@ message UserMetadataItem {
// ColumnStatistics for each column.
// This message type is only used in ORC v0 and v1.
message StripeStatistics {
- repeated ColumnStatistics colStats = 1;
+ repeated ColumnStatistics col_stats = 1;
}
// This message type is only used in ORC v0 and v1.
message Metadata {
- repeated StripeStatistics stripeStats = 1;
+ repeated StripeStatistics stripe_stats = 1;
}
// In ORC v2 (and for encrypted columns in v1), each column has
// their column statistics written separately.
message ColumnarStripeStatistics {
// one value for each stripe in the file
- repeated ColumnStatistics colStats = 1;
+ repeated ColumnStatistics col_stats = 1;
}
enum EncryptionAlgorithm {
@@ -292,15 +292,15 @@ message DataMask {
// the kind of masking, which may include third party masks
optional string name = 1;
// parameters for the mask
- repeated string maskParameters = 2;
+ repeated string mask_parameters = 2;
// the unencrypted column roots this mask was applied to
repeated uint32 columns = 3 [packed = true];
}
// Information about the encryption keys.
message EncryptionKey {
- optional string keyName = 1;
- optional uint32 keyVersion = 2;
+ optional string key_name = 1;
+ optional uint32 key_version = 2;
optional EncryptionAlgorithm algorithm = 3;
}
@@ -313,11 +313,11 @@ message EncryptionVariant {
// an index into the Encryption.key list.
optional uint32 key = 2;
// the encrypted key for the file footer
- optional bytes encryptedKey = 3;
+ optional bytes encrypted_key = 3;
// the stripe statistics for this variant
- repeated Stream stripeStatistics = 4;
+ repeated Stream stripe_statistics = 4;
// encrypted file statistics as a FileStatistics
- optional bytes fileStatistics = 5;
+ optional bytes file_statistics = 5;
}
// Which KeyProvider encrypted the local keys.
@@ -340,7 +340,7 @@ message Encryption {
// they should get the unencrypted masked data.
repeated EncryptionVariant variants = 3;
// How are the local keys encrypted?
- optional KeyProviderKind keyProvider = 4;
+ optional KeyProviderKind key_provider = 4;
}
enum CalendarKind {
@@ -352,14 +352,14 @@ enum CalendarKind {
}
message Footer {
- optional uint64 headerLength = 1;
- optional uint64 contentLength = 2;
+ optional uint64 header_length = 1;
+ optional uint64 content_length = 2;
repeated StripeInformation stripes = 3;
repeated Type types = 4;
repeated UserMetadataItem metadata = 5;
- optional uint64 numberOfRows = 6;
+ optional uint64 number_of_rows = 6;
repeated ColumnStatistics statistics = 7;
- optional uint32 rowIndexStride = 8;
+ optional uint32 row_index_stride = 8;
// Each implementation that writes ORC files should register for a code
// 0 = ORC Java
@@ -377,7 +377,7 @@ message Footer {
// informative description about the version of the software that wrote
// the file. It is assumed to be within a given writer, so for example
// ORC 1.7.2 = "1.7.2". It may include suffixes, such as "-SNAPSHOT".
- optional string softwareVersion = 12;
+ optional string software_version = 12;
}
enum CompressionKind {
@@ -391,14 +391,14 @@ enum CompressionKind {
// Serialized length must be less that 255 bytes
message PostScript {
- optional uint64 footerLength = 1;
+ optional uint64 footer_length = 1;
optional CompressionKind compression = 2;
- optional uint64 compressionBlockSize = 3;
+ optional uint64 compression_block_size = 3;
// the version of the file format
// [0, 11] = Hive 0.11
// [0, 12] = Hive 0.12
repeated uint32 version = 4 [packed = true];
- optional uint64 metadataLength = 5;
+ optional uint64 metadata_length = 5;
// The version of the writer that wrote the file. This number is
// updated when we make fixes or large changes to the writer so that
@@ -436,10 +436,10 @@ message PostScript {
// Version of the CUDF writer:
// 6 = original
//
- optional uint32 writerVersion = 6;
+ optional uint32 writer_version = 6;
// the number of bytes in the encrypted stripe statistics
- optional uint64 stripeStatisticsLength = 7;
+ optional uint64 stripe_statistics_length = 7;
// Leave this last in the record
optional string magic = 8000;
@@ -450,6 +450,6 @@ message PostScript {
message FileTail {
optional PostScript postscript = 1;
optional Footer footer = 2;
- optional uint64 fileLength = 3;
- optional uint64 postscriptLength = 4;
+ optional uint64 file_length = 3;
+ optional uint64 postscript_length = 4;
}