This is an automated email from the ASF dual-hosted git repository.

zivanfi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/parquet-mr.git
The following commit(s) were added to refs/heads/master by this push:
     new a69f2b3  PARQUET-1365: Don't write page level statistics (#549)
a69f2b3 is described below

commit a69f2b30cd3c581588977ea4c93a53989e9c031c
Author: Gabor Szadovszky <ga...@apache.org>
AuthorDate: Mon Nov 19 13:15:39 2018 +0100

    PARQUET-1365: Don't write page level statistics (#549)

    Page level statistics were never used in production and became pointless
    after adding column indexes.
---
 .../format/converter/ParquetMetadataConverter.java | 47 ++++++++++++++++------
 .../parquet/hadoop/ColumnChunkPageWriteStore.java  |  4 +-
 .../apache/parquet/hadoop/ParquetFileWriter.java   |  5 +--
 .../hadoop/TestColumnChunkPageWriteStore.java      |  1 -
 4 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
index 58ae503..b9c8996 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/format/converter/ParquetMetadataConverter.java
@@ -1334,12 +1334,13 @@ public class ParquetMetadataConverter {
     writePageHeader(newDataPageHeader(uncompressedSize,
         compressedSize,
         valueCount,
-        new org.apache.parquet.column.statistics.BooleanStatistics(),
         rlEncoding,
         dlEncoding,
         valuesEncoding), to);
   }
 
+  // Statistics are no longer saved in page headers
+  @Deprecated
   public void writeDataPageHeader(
       int uncompressedSize,
       int compressedSize,
@@ -1350,7 +1351,7 @@ public class ParquetMetadataConverter {
       org.apache.parquet.column.Encoding valuesEncoding,
       OutputStream to) throws IOException {
     writePageHeader(
-        newDataPageHeader(uncompressedSize, compressedSize, valueCount, statistics,
+        newDataPageHeader(uncompressedSize, compressedSize, valueCount,
            rlEncoding, dlEncoding, valuesEncoding), to);
   }
 
@@ -1358,7 +1359,6 @@ public class ParquetMetadataConverter {
   private PageHeader newDataPageHeader(
       int uncompressedSize, int compressedSize,
       int valueCount,
-      org.apache.parquet.column.statistics.Statistics statistics,
       org.apache.parquet.column.Encoding rlEncoding,
       org.apache.parquet.column.Encoding dlEncoding,
       org.apache.parquet.column.Encoding valuesEncoding) {
@@ -1369,12 +1369,11 @@ public class ParquetMetadataConverter {
         getEncoding(valuesEncoding),
         getEncoding(dlEncoding),
         getEncoding(rlEncoding)));
-    if (!statistics.isEmpty()) {
-      pageHeader.getData_page_header().setStatistics(toParquetStatistics(statistics));
-    }
     return pageHeader;
   }
 
+  // Statistics are no longer saved in page headers
+  @Deprecated
   public void writeDataPageV2Header(
       int uncompressedSize, int compressedSize,
       int valueCount, int nullCount, int rowCount,
@@ -1386,7 +1385,36 @@ public class ParquetMetadataConverter {
         newDataPageV2Header(
             uncompressedSize, compressedSize,
             valueCount, nullCount, rowCount,
-            statistics,
             dataEncoding,
             rlByteLength, dlByteLength), to);
   }
 
+  public void writeDataPageV1Header(
+      int uncompressedSize,
+      int compressedSize,
+      int valueCount,
+      org.apache.parquet.column.Encoding rlEncoding,
+      org.apache.parquet.column.Encoding dlEncoding,
+      org.apache.parquet.column.Encoding valuesEncoding,
+      OutputStream to) throws IOException {
+    writePageHeader(newDataPageHeader(uncompressedSize,
+        compressedSize,
+        valueCount,
+        rlEncoding,
+        dlEncoding,
+        valuesEncoding), to);
+  }
+
+  public void writeDataPageV2Header(
+      int uncompressedSize, int compressedSize,
+      int valueCount, int nullCount, int rowCount,
+      org.apache.parquet.column.Encoding dataEncoding,
+      int rlByteLength, int dlByteLength,
+      OutputStream to) throws IOException {
+    writePageHeader(
+        newDataPageV2Header(
+            uncompressedSize, compressedSize,
+            valueCount, nullCount, rowCount,
+            dataEncoding,
+            rlByteLength, dlByteLength), to);
+  }
+
@@ -1394,7 +1422,6 @@ public class ParquetMetadataConverter {
   private PageHeader newDataPageV2Header(
       int uncompressedSize, int compressedSize,
       int valueCount, int nullCount, int rowCount,
-      org.apache.parquet.column.statistics.Statistics<?> statistics,
       org.apache.parquet.column.Encoding dataEncoding,
       int rlByteLength, int dlByteLength) {
     // TODO: pageHeader.crc = ...;
@@ -1402,10 +1429,6 @@
         valueCount, nullCount, rowCount,
         getEncoding(dataEncoding),
         dlByteLength, rlByteLength);
-    if (!statistics.isEmpty()) {
-      dataPageHeaderV2.setStatistics(
-          toParquetStatistics(statistics));
-    }
     PageHeader pageHeader = new PageHeader(PageType.DATA_PAGE_V2, uncompressedSize, compressedSize);
     pageHeader.setData_page_header_v2(dataPageHeaderV2);
     return pageHeader;
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
index 85bdbdb..f87630b 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ColumnChunkPageWriteStore.java
@@ -119,11 +119,10 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
            + compressedSize);
      }
      tempOutputStream.reset();
-      parquetMetadataConverter.writeDataPageHeader(
+      parquetMetadataConverter.writeDataPageV1Header(
          (int)uncompressedSize,
          (int)compressedSize,
          valueCount,
-          statistics,
          rlEncoding,
          dlEncoding,
          valuesEncoding,
@@ -171,7 +170,6 @@ class ColumnChunkPageWriteStore implements PageWriteStore {
      parquetMetadataConverter.writeDataPageV2Header(
          uncompressedSize, compressedSize,
          valueCount, nullCount, rowCount,
-          statistics,
          dataEncoding,
          rlByteLength,
          dlByteLength,
diff --git a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
index a8cd686..20efe47 100644
--- a/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
+++ b/parquet-hadoop/src/main/java/org/apache/parquet/hadoop/ParquetFileWriter.java
@@ -433,7 +433,7 @@ public class ParquetFileWriter {
     long beforeHeader = out.getPos();
     LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
     int compressedPageSize = (int)bytes.size();
-    metadataConverter.writeDataPageHeader(
+    metadataConverter.writeDataPageV1Header(
        uncompressedPageSize, compressedPageSize,
        valueCount,
        rlEncoding,
@@ -518,10 +518,9 @@ public class ParquetFileWriter {
    }
    LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
    int compressedPageSize = (int) bytes.size();
-    metadataConverter.writeDataPageHeader(
+    metadataConverter.writeDataPageV1Header(
        uncompressedPageSize, compressedPageSize,
        valueCount,
-        statistics,
        rlEncoding,
        dlEncoding,
        valuesEncoding,
diff --git a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
index 9a27def..c353ee3 100644
--- a/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
+++ b/parquet-hadoop/src/test/java/org/apache/parquet/hadoop/TestColumnChunkPageWriteStore.java
@@ -189,7 +189,6 @@ public class TestColumnChunkPageWriteStore {
    assertEquals(r, intValue(page.getRepetitionLevels()));
    assertEquals(dataEncoding, page.getDataEncoding());
    assertEquals(v, intValue(page.getData()));
-    assertEquals(statistics.toString(), page.getStatistics().toString());

    // Checking column/offset indexes for the one page
    ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
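
For reference, a minimal usage sketch (not part of the commit): a caller that
previously passed a Statistics object to the now-deprecated
writeDataPageHeader(...) switches to the new writeDataPageV1Header(...)
overload introduced above, which takes no statistics argument. The class name,
sizes, counts, and encodings below are placeholders chosen for illustration.

    import java.io.ByteArrayOutputStream;
    import java.io.IOException;

    import org.apache.parquet.column.Encoding;
    import org.apache.parquet.format.converter.ParquetMetadataConverter;

    public class PageHeaderWriteSketch {
      public static void main(String[] args) throws IOException {
        ParquetMetadataConverter converter = new ParquetMetadataConverter();
        ByteArrayOutputStream to = new ByteArrayOutputStream();

        // Hypothetical page metrics for illustration only.
        int uncompressedSize = 1024;
        int compressedSize = 512;
        int valueCount = 100;

        // The statistics parameter is gone: per PARQUET-1365, min/max data
        // lives in the column indexes rather than in page headers.
        converter.writeDataPageV1Header(
            uncompressedSize,
            compressedSize,
            valueCount,
            Encoding.RLE,    // repetition levels
            Encoding.RLE,    // definition levels
            Encoding.PLAIN,  // values
            to);

        System.out.println("serialized page header bytes: " + to.size());
      }
    }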