This is an automated email from the ASF dual-hosted git repository. gangwu pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/parquet-format.git
The following commit(s) were added to refs/heads/master by this push:
     new 38c108c  PARQUET-2473: Clarify records can not be split across v2 pages or PageIndex (#244)
38c108c is described below

commit 38c108c8ff24a432db40453b2f04493534c1d2cf
Author: Andrew Lamb <and...@nerdnetworks.org>
AuthorDate: Fri May 31 08:55:23 2024 -0400

    PARQUET-2473: Clarify records can not be split across v2 pages or PageIndex (#244)

    Co-authored-by: Ed Seidl <etse...@users.noreply.github.com>
---
 src/main/thrift/parquet.thrift | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index c928ad6..85e8887 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -578,7 +578,13 @@ enum BoundaryOrder {
 
 /** Data page header */
 struct DataPageHeader {
-  /** Number of values, including NULLs, in this data page. **/
+  /**
+   * Number of values, including NULLs, in this data page.
+   *
+   * If a OffsetIndex is present, a page must begin at a record
+   * boundary (repetition_level = 0). Otherwise, pages may begin
+   * within a record (repetition_level > 0).
+   **/
   1: required i32 num_values
 
   /** Encoding used for this data page **/
@@ -625,7 +631,11 @@ struct DataPageHeaderV2 {
   /** Number of NULL values, in this data page.
       Number of non-null = num_values - num_nulls which is also the number of values in the data section **/
   2: required i32 num_nulls
-  /** Number of rows in this data page. which means pages change on record boundaries (r = 0) **/
+  /**
+   * Number of rows in this data page. Every page must begin at a
+   * record boundary (repetition_level = 0): records must **not** be
+   * split across page boundaries when using V2 data pages.
+   **/
   3: required i32 num_rows
   /** Encoding used for data in this page **/
   4: required Encoding encoding
@@ -995,8 +1005,9 @@ struct PageLocation {
   2: required i32 compressed_page_size
 
   /**
-   * Index within the RowGroup of the first row of the page; this means pages
-   * change on record boundaries (r = 0).
+   * Index within the RowGroup of the first row of the page. When an
+   * OffsetIndex is present, pages must begin on record boundaries
+   * (repetition_level = 0).
    */
   3: required i64 first_row_index
 }
@@ -1190,4 +1201,3 @@ struct FileCryptoMetaData {
    *  and (possibly) columns **/
   2: optional binary key_metadata
 }
-