Jiayi-Wang-db commented on code in PR #544: URL: https://github.com/apache/parquet-format/pull/544#discussion_r3045419957
########## src/main/flatbuf/parquet3.fbs: ########## @@ -0,0 +1,640 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +namespace parquet.format; + +// The FlatBuffers footer preserves the same information as the Thrift Parquet footer, +// while removing duplicated fields, unused details, and inefficient encodings that +// waste space and memory. +// It can currently be attached as a footer extension, and may fully replace the +// Thrift footer in the future. As of now, the Thrift footer is still required; +// this FlatBuffers footer is supplementary. +// +// Optimization notes: +// 1. Statistics use fixed-width integral types when possible; otherwise they are +// encoded as prefix + truncated suffix. SizeStatistics and Statistics.distinct_count +// are removed. +// 2. ColumnChunk file_path and file_offset are removed since they are unused. +// 3. ColumnMetaData.encoding_stats are removed and replaced by +// ColumnMetaData.is_fully_dict_encoded. +// 4. ColumnMetaData.path_in_schema is removed since it can be derived from the schema. +// 5. ConvertedType is fully dropped as it is superseded by LogicalType. +// 6. Offset and column indexes are removed since they are small and their offsets +// alone take comparable space. + +/** + * Types supported by Parquet. These types are intended to be used in combination + * with the encodings to control the on disk storage format. + * For example INT16 is not included as a type since a good encoding of INT32 + * would handle this. + */ +enum Type : byte { + BOOLEAN = 0, + INT32 = 1, + INT64 = 2, + INT96 = 3, // deprecated, new Parquet writers should not write data in INT96 + FLOAT = 4, + DOUBLE = 5, + BYTE_ARRAY = 6, + FIXED_LEN_BYTE_ARRAY = 7, +} + +/** + * Representation of Schemas + */ +enum FieldRepetitionType : byte { + /** This field is required (can not be null) and each row has exactly 1 value. */ + REQUIRED = 0, + + /** The field is optional (can be null) and each row has 0 or 1 values. */ + OPTIONAL = 1, + + /** The field is repeated and can contain 0 or more values */ + REPEATED = 2, +} + +/** + * Encodings supported by Parquet. Not all encodings are valid for all types. These + * enums are also used to specify the encoding of definition and repetition levels. + * See the accompanying doc for the details of the more complicated encodings. + * Note: Match the thrift enum values so that we can cast between them. + */ +enum Encoding : byte { + /** Default encoding. + * BOOLEAN - 1 bit per value. 0 is false; 1 is true. + * INT32 - 4 bytes per value. Stored as little-endian. + * INT64 - 8 bytes per value. Stored as little-endian. + * FLOAT - 4 bytes per value. IEEE. Stored as little-endian. + * DOUBLE - 8 bytes per value. IEEE. Stored as little-endian. + * BYTE_ARRAY - 4 byte length stored as little endian, followed by bytes. + * FIXED_LEN_BYTE_ARRAY - Just the bytes. + */ + PLAIN = 0, + + /** Group VarInt encoding for INT32/INT64. + * This encoding is deprecated. It was never used + */ + // GROUP_VAR_INT = 1, + + /** + * Deprecated: Dictionary encoding. The values in the dictionary are encoded in the Review Comment: Commented out PLAIN_DICTIONARY. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
