kszucs commented on code in PR #45360:
URL: https://github.com/apache/arrow/pull/45360#discussion_r2028665323
##########
python/pyarrow/tests/parquet/test_parquet_writer.py:
##########
@@ -361,3 +361,82 @@ def test_parquet_writer_append_key_value_metadata(tempdir):
     assert metadata[b'key1'] == b'1'
     assert metadata[b'key2'] == b'2'
     assert metadata[b'key3'] == b'3'
+
+
+def test_parquet_content_defined_chunking(tempdir):
+    table = pa.table({'a': range(100_000)})
+
+    pq.write_table(table, tempdir / 'unchunked.parquet')
+    pq.write_table(table, tempdir / 'chunked-default.parquet',
+                   use_content_defined_chunking=True)
+    pq.write_table(table, tempdir / 'chunked-custom.parquet',
+                   use_content_defined_chunking={"min_chunk_size": 32_768,
+                                                 "max_chunk_size": 65_536})
+
+    # the data must be the same
+    unchunked = pq.read_table(tempdir / 'unchunked.parquet')
+    chunked_default = pq.read_table(tempdir / 'chunked-default.parquet')
+    chunked_custom = pq.read_table(tempdir / 'chunked-custom.parquet')
+    assert unchunked.equals(chunked_default)
+    assert unchunked.equals(chunked_custom)
+
+    # number of row groups and their sizes are not affected by content defined chunking
+    unchunked_metadata = pq.read_metadata(tempdir / 'unchunked.parquet')
+    chunked_default_metadata = pq.read_metadata(tempdir / 'chunked-default.parquet')
+    chunked_custom_metadata = pq.read_metadata(tempdir / 'chunked-custom.parquet')
+
+    assert unchunked_metadata.num_row_groups == chunked_default_metadata.num_row_groups
+    assert unchunked_metadata.num_row_groups == chunked_custom_metadata.num_row_groups
+
+    for i in range(unchunked_metadata.num_row_groups):
+        rg_unchunked = unchunked_metadata.row_group(i)
+        rg_chunked_default = chunked_default_metadata.row_group(i)
+        rg_chunked_custom = chunked_custom_metadata.row_group(i)
+        assert rg_unchunked.num_rows == rg_chunked_default.num_rows
+        assert rg_unchunked.num_rows == rg_chunked_custom.num_rows
+        # since PageReader is not exposed we cannot inspect the page sizes,
+        # so just check that the total byte size is different

Review Comment:
   Interestingly, this is only true if we use PLAIN encoding. With the default encoding the chunked Parquet files were actually smaller for this data sample, but the relationship flipped for bigger samples. Anyway, I changed the assertions and the test now uses PLAIN encoding.
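   For reference, a minimal sketch of what forcing PLAIN encoding could look like here — an illustration under stated assumptions, not the exact diff that was pushed: `write_plain` and the file names are hypothetical; `use_dictionary=False` and `column_encoding='PLAIN'` are existing `pq.write_table` options, and `use_content_defined_chunking` is the option added by this PR.

```python
import pyarrow as pa
import pyarrow.parquet as pq

def write_plain(table, path, **kwargs):
    # hypothetical helper: column_encoding only takes effect when
    # dictionary encoding is disabled, hence use_dictionary=False
    pq.write_table(table, path, use_dictionary=False,
                   column_encoding='PLAIN', **kwargs)

table = pa.table({'a': range(100_000)})
write_plain(table, 'unchunked.parquet')
write_plain(table, 'chunked.parquet', use_content_defined_chunking=True)

unchunked_md = pq.read_metadata('unchunked.parquet')
chunked_md = pq.read_metadata('chunked.parquet')
for i in range(unchunked_md.num_row_groups):
    # with PLAIN encoding the content-defined page boundaries change
    # the serialized size, so total_byte_size is expected to differ
    assert (unchunked_md.row_group(i).total_byte_size
            != chunked_md.row_group(i).total_byte_size)
```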