kszucs commented on code in PR #45360:
URL: https://github.com/apache/arrow/pull/45360#discussion_r2083091794
##########
cpp/src/parquet/properties.h:
##########
@@ -275,10 +305,38 @@ class PARQUET_EXPORT WriterProperties {
page_checksum_enabled_(properties.page_checksum_enabled()),
size_statistics_level_(properties.size_statistics_level()),
sorting_columns_(properties.sorting_columns()),
- default_column_properties_(properties.default_column_properties()) {}
+ default_column_properties_(properties.default_column_properties()),
+ content_defined_chunking_enabled_(
+ properties.content_defined_chunking_enabled()),
+ content_defined_chunking_options_(
+ properties.content_defined_chunking_options()) {}
virtual ~Builder() {}
+ /// \brief EXPERIMENTAL: Use content-defined page chunking for all columns.
+ ///
+ /// Optimize parquet files for content addressable storage (CAS) systems by writing
+ /// data pages according to content-defined chunk boundaries. This allows for more
+ /// efficient deduplication of data across files, hence more efficient network
+ /// transfers and storage. The chunking is based on a rolling hash algorithm that
+ /// identifies chunk boundaries based on the actual content of the data.
+ Builder* enable_content_defined_chunking() {
+ content_defined_chunking_enabled_ = true;
+ return this;
+ }
+
+ /// \brief EXPERIMENTAL: Disable content-defined page chunking for all columns.
+ Builder* disable_content_defined_chunking() {
+ content_defined_chunking_enabled_ = false;
+ return this;
+ }
+
+ /// \brief EXPERIMENTAL: Specify content-defined chunking options, see CdcOptions.
+ Builder* content_defined_chunking_options(const CdcOptions options) {
Review Comment:
Updated: the `options` parameter is now taken by const reference.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]